Commit cbe5b38b authored by Nitha Sagar Jayanna
Browse files

initial commit

parents
import requests
import re
import os,sys
from html.parser import HTMLParser
import config
#TableParser Code modified from: https://docs.python.org/3/library/html.parser.html
class TableParser(HTMLParser):
    """Collect the text found inside ``<td>`` cells of the HTML fed to it.

    The captured strings accumulate, in document order, in ``self.list``.
    """

    def __init__(self):
        super().__init__()
        # Texts captured so far, one entry per data chunk seen inside a cell.
        self.list = []
        # True from a <td> start tag until the next end tag of any kind.
        self.in_td = False

    def handle_starttag(self, tag, attrs):
        """Begin capturing as soon as a ``<td>`` opens."""
        if tag == 'td':
            self.in_td = True

    def handle_endtag(self, tag):
        """Stop capturing at any closing tag, not only ``</td>``."""
        self.in_td = False

    def handle_data(self, data):
        """Store *data* when it is encountered while capture is active."""
        if not self.in_td:
            return
        self.list.append(data)
#To access the source page of the site
def urlaccess(sites, timeout=60):
    """Return ``(filename, size)`` pairs scraped from the given listing pages.

    Each page is fetched and split into lines. Every line matching
    ``config.file_pattern`` is parsed with ``TableParser``; the first and
    third captured table cells are taken as the file name and its size.

    Args:
        sites: iterable of page URLs to fetch.
        timeout: seconds to wait for each HTTP request before giving up.

    Returns:
        A list of ``(filename, size)`` tuples, in page order.

    Raises:
        SystemExit: on any failure (network error, timeout, or a matching
            row with fewer than three cells), after printing the cause.
    """
    try:
        filelist = []
        file_pattern = config.file_pattern
        for url in sites:
            # Without a timeout an unresponsive server would block forever.
            r = requests.get(url, timeout=timeout)
            for row in r.text.split('\n'):
                if not re.search(file_pattern, row):
                    continue
                # A fresh parser per matching row keeps the cell indices
                # relative to that row only.
                p = TableParser()
                p.feed(row)
                filelist.append((p.list[0], p.list[2]))
        return filelist
    except Exception as e:
        # Exception rather than BaseException, so Ctrl-C is not reported as
        # a URL error; exit non-zero so callers can detect the failure.
        print("URL meta info access error", e)
        sys.exit(1)
{"router":
{
"rrc00":"rrc00",
"rrc01":"rrc01"
},
"baseURL": "http://data.ris.ripe.net/" ,
"start_date" : "2020-05-05",
"end_date" : "2020-06-10",
"log_file":"log.log",
"file_pattern":"(bview)\\.(\\d{8})\\.(\\d{4})\\.(gz)"
}
\ No newline at end of file
import re
import json
import os,sys
# Load the JSON settings; every value below comes from this file.
try:
    with open('config.json') as config_file:
        data = json.load(config_file)
except (OSError, ValueError) as e:
    # OSError: missing or unreadable file. ValueError: malformed JSON
    # (json.JSONDecodeError is a ValueError subclass).
    print("config file not available", e)
    sys.exit(1)

# NOTE: only the "rrc00" entry of the "router" mapping is read here.
router = data["router"]["rrc00"]
baseURL = data["baseURL"]
start_date = data["start_date"]
end_date = data["end_date"]
# The log file name is the router name followed by the configured name.
log_file = router + data["log_file"]
file_pattern = re.compile(data["file_pattern"])

# Make sure the log file exists so that later readers can open it.
try:
    if not os.path.exists(log_file):
        print("logfile not present, created:", log_file)
        # Open and immediately close: only the file's creation is wanted.
        with open(log_file, 'a+'):
            pass
except OSError as e:
    print(str(e))
    sys.exit(1)
import time, datetime, os
from datetime import datetime, timedelta
import config
import sys
def create(router, year, month):
    """Ensure the directory ``router/year/month`` exists and return its path.

    Missing intermediate directories are created; directories that already
    exist are left untouched.

    Args:
        router: top-level directory name.
        year: year sub-directory name (string).
        month: month sub-directory name (string).

    Returns:
        The joined path ``router/year/month``.

    Raises:
        SystemExit: if the directory cannot be created (for example when a
            path component exists as a regular file), after printing the
            error.
    """
    try:
        target = os.path.join(router, year, month)
        # exist_ok=True replaces the nested exists()/mkdir() checks and
        # removes the window between checking and creating.
        os.makedirs(target, exist_ok=True)
    except Exception as e:
        print(str(e))
        sys.exit(1)
    return target
\ No newline at end of file
from datetime import datetime, timedelta
import sys,os
from datetime import date
import logging
import config
import accessurl
import urllib.request
import dirCreate
import latestdate
import fileinput
import calendar
# Date window taken from the config module; the dispatch code at the end of
# this script treats an empty string as "not specified".
start, end = config.start_date, config.end_date
# Current date as a ``YYYY-MM-DD`` string.
today = date.today().isoformat()
# Router name and log file path, both resolved by the config module.
router, log_file = config.router, config.log_file
def clean_log(stripline):
    """Replace every occurrence of *stripline* in the log file by a newline.

    The whole file is read, edited in memory and written back in place.
    """
    with open(log_file, 'r+') as handle:
        contents = handle.read().replace(stripline, '\n')
        # Rewind and empty the file before writing the edited text back.
        handle.seek(0)
        handle.truncate(0)
        handle.write(contents)
'''
def clean_log(stripline):
with open(log_file, 'r+') as f:
t = f.read()
f.seek(0)
for line in t.split("\n"):
if not stripline in line:
f.write(line + "\n")
f.truncate()
'''
# The following function (remove_empty_lines) is modified from
# https://stackoverflow.com/questions/53358985/remove-n-with-strip-in-python
def remove_empty_lines():
    """Rewrite the log file without its blank (whitespace-only) lines."""
    with open(log_file) as source:
        kept = [row for row in source.readlines() if row.strip()]
    with open(log_file, 'w') as target:
        target.writelines(kept)
def _size_to_bytes(text):
    """Convert a listing size such as ``14M``, ``1.2G`` or ``512`` to bytes.

    NOTE(review): assumes sizes carry an optional single-letter K/M/G/T
    suffix with 1024-based units -- confirm against the actual listing page.

    Raises:
        ValueError: if the numeric part cannot be parsed.
    """
    units = {"K": 1024, "M": 1024 ** 2, "G": 1024 ** 3, "T": 1024 ** 4}
    text = text.strip()
    factor = units.get(text[-1:].upper())
    if factor is None:
        return float(text)
    return float(text[:-1]) * factor


def filecheck(file):
    """Tell whether *file* still has to be downloaded.

    The log is searched for the first line containing the file name. If
    none is found the file is new. If one is found, the size recorded there
    is compared with the size currently advertised by the site; a larger
    remote size means the stale entry is removed from the log and the file
    must be fetched again.

    Args:
        file: ``(filename, size)`` pair as produced by ``accessurl.urlaccess``.

    Returns:
        True when a download is required, False otherwise. False is also
        returned, after printing the error, when the log cannot be read or a
        size cannot be parsed.
    """
    try:
        stale_line = None
        with open(log_file, 'r') as read_obj:
            for line in read_obj:
                if file[0] not in line:
                    continue
                # split() instead of split(" ") tolerates padded sizes and
                # drops the trailing newline.
                logged_size = _size_to_bytes(line.split()[1])
                web_size = _size_to_bytes(file[1])
                if web_size <= logged_size:
                    return False
                stale_line = line
                break
        # The log is closed here, so clean_log() can safely rewrite it.
        if stale_line is not None:
            clean_log(stale_line)
        return True
    except Exception as e:
        print(str(e))
        return False
def _files_for_days(filelist, first_day, last_day):
    """Return the entries of *filelist* whose name contains a date in the
    inclusive range *first_day* .. *last_day* (as ``YYYYMMDD``), day by day."""
    span = (last_day - first_day).days
    stamps = [
        (first_day + timedelta(days=offset)).strftime("%Y%m%d")
        for offset in range(span + 1)
    ]
    return [entry for stamp in stamps for entry in filelist if stamp in entry[0]]


def _record_downloads(url, files, year, month):
    """Log every entry of *files* that filecheck() reports as needing a download."""
    for entry in files:
        if not filecheck(entry):
            continue
        downloadURL = url + "/" + entry[0]
        # Creates router/year/month on demand and returns its path.
        fileLoc = dirCreate.create(router, year, month)
        print("downloading")
        # NOTE(review): the transfer itself was disabled in the original
        # code, so the file is only recorded in the log. Re-enable with:
        # urllib.request.urlretrieve(downloadURL, fileLoc + "/" + entry[0])
        with open(log_file, 'a+') as logHandle:
            logHandle.write(entry[0] + " " + entry[1] + "\n")


def urlDownload(start, end):
    """Process every listed file dated between *start* and *end* inclusive.

    The range is walked month by month. For each month the listing page
    ``<baseURL><router>/<YYYY>.<MM>`` is fetched. The first and last months
    are restricted to the requested days, while months strictly in between
    are taken in full. Each selected file is checked against the log and,
    when needed, recorded as downloaded.

    Args:
        start: first day, ``YYYY-MM-DD``.
        end: last day, ``YYYY-MM-DD``.

    Raises:
        SystemExit: if a date cannot be parsed, if *start* is after *end*,
            or if either date lies in the future.
    """
    try:
        startDate = datetime.strptime(start, "%Y-%m-%d")
        endDate = datetime.strptime(end, "%Y-%m-%d")
        todayDate = datetime.strptime(today, "%Y-%m-%d")
    except (TypeError, ValueError) as e:
        print("Date error:", str(e))
        sys.exit(1)

    if startDate > endDate or startDate > todayDate or endDate > todayDate:
        print('Invalid dates or Future dates')
        sys.exit(1)

    # Number of month boundaries crossed; 0 means a single-month range.
    num_months = (endDate.year - startDate.year) * 12 + (endDate.month - startDate.month)
    year, month = startDate.year, startDate.month

    for index in range(num_months + 1):
        is_first = index == 0
        is_last = index == num_months
        yearText = str(year)
        monthText = "%02d" % month

        url = config.baseURL + router + "/" + yearText + "." + monthText
        filelist = accessurl.urlaccess([url])
        if len(filelist) == 0:
            # Messages are kept exactly as each original stage printed them.
            if num_months == 0:
                print("Error: No data Available")
            elif is_first or is_last:
                print("Download Error: No data available", url)
            else:
                print("No data availbale")

        if is_first or is_last:
            if is_first:
                firstDay = startDate
            else:
                firstDay = datetime(year, month, 1)
            if is_last:
                lastDay = endDate
            else:
                lastDay = datetime(year, month, calendar.monthrange(year, month)[1])
            selected = _files_for_days(filelist, firstDay, lastDay)
        else:
            # Whole months in the middle of the range need no day filter.
            selected = filelist

        _record_downloads(url, selected, yearText, monthText)

        # Advance to the next calendar month.
        month += 1
        if month > 12:
            month = 1
            year += 1
def _resolve_start(recent_result):
    """Pick the start date from the value returned by latestdate.recent().

    Returns the first day of the current month when the log holds no usable
    entry ("empty"), otherwise the most recent logged date.
    """
    if recent_result == "empty":
        # strftime always zero-pads the month. The original concatenated an
        # int for months 10-12 and crashed with TypeError.
        return date.today().strftime("%Y-%m-01")
    return recent_result[0]


# Choose the date window: configured values win; a missing start falls back
# to the latest date in the log, a missing end falls back to today.
if start != "" and end != "":
    urlDownload(start, end)
elif start != "":
    urlDownload(start, str(today))
else:
    startdate = latestdate.recent()
    if end != "":
        print(startdate)
        urlDownload(_resolve_start(startdate), end)
    else:
        urlDownload(_resolve_start(startdate), str(today))
# clean_log() leaves blank lines behind; tidy them up once at the end.
remove_empty_lines()
import re
import datetime
import config
import sys
# Lines of the download log, read once when this module is imported.
places = []
log_file = config.log_file
try:
    with open(log_file, 'r') as filehandle:
        places.extend(filehandle)
except Exception as e:
    # Best effort: an unreadable log just leaves `places` (partly) empty.
    # Exception rather than BaseException, so Ctrl-C and SystemExit still
    # propagate.
    print(str(e))
# Compiled file-name pattern shared with the config module.
date_pattern = config.file_pattern
def get_date(filename):
    """Return the date embedded in *filename*, or None if it does not match.

    The text is first checked against ``date_pattern``; the date is then
    read from the first eight characters of the second dot-separated field.
    """
    if date_pattern.search(filename) is None:
        return None
    stamp = filename.split(".")[1]
    year, month, day = (
        int(stamp[lo:hi]) for lo, hi in ((0, 4), (4, 6), (6, 8))
    )
    return datetime.date(year, month, day)
def recent():
    """Return the most recent date recorded in the log and its latest time.

    Returns:
        ``[date_string, max_time]`` where ``date_string`` is the newest
        ``YYYY-MM-DD`` date found in the log lines and ``max_time`` is the
        largest integer time field (third dot-separated part of the file
        name) among the entries for that date. The string ``"empty"`` is
        returned when no date can be extracted or a line cannot be parsed.
    """
    try:
        # max() raises ValueError when no line yields a date; handled below.
        last_date = max(d for d in map(get_date, places) if d is not None)
        last_dateStr = str(last_date)
        dateMax = last_dateStr.replace("-", "")

        maxTime = 0
        for line in places:
            if line.find(dateMax) > 0:
                name = line.split(' ')[0]
                maxTime = max(maxTime, int(name.split('.')[2]))
        return [last_dateStr, maxTime]
    except Exception:
        # Exception rather than BaseException, so Ctrl-C is not mistaken
        # for an empty log.
        print(log_file, "empty, default download of present month data")
        return "empty"
\ No newline at end of file
Markdown is supported
0% or .
You are about to add 0 people to the discussion. Proceed with caution.
Finish editing this message first!
Please register or sign in to comment