Install required packages

pip install -r requirements.txt

Run the scraper

python arxiv_scrapper.py
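The script keeps its state and output under data/, which it creates on the first run:

data/
  tmp/time.txt   (Unix timestamp of the previous run)
  pdf/           (downloaded PDFs)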
arxiv_scrapper.py:

import urllib.request
from bs4 import BeautifulSoup
from pathlib import Path
import datetime
import time
import os
# Track the time of the last run in data/tmp/time.txt so that only
# tweets posted since then are processed.
last_run = ''
temp_file = os.path.join('data', 'tmp', 'time.txt')
current_time = datetime.datetime.now()
print('Checking for required files')
req_file = Path(temp_file)
if req_file.is_file():
    print('File exists. Getting last run time')
    with open(temp_file, 'r') as f:
        last_run = datetime.datetime.fromtimestamp(int(f.read()))
    print(str(last_run))
    # Record the current run time for the next invocation
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    time_diff = current_time - last_run
else:
    print('Files do not exist. Creating files.')
    os.makedirs(os.path.join('data', 'tmp'))
    os.makedirs(os.path.join('data', 'pdf'))
    last_run = current_time
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    # On a first run, treat anything from the past 24 hours as new
    time_diff = datetime.timedelta(hours=24)
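# time_diff defines the freshness window: the gap since the previous run,
# or 24 hours on a first run. A tweet is downloaded only if it was posted
# within this window.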
print('Getting data\n--')
page_url = 'https://twitter.com/arXiv__ml'
page = urllib.request.urlopen(page_url)
soup = BeautifulSoup(page, 'html.parser')
tweets = soup.find_all('div', attrs={'class': 'content'})
for twt in tweets:
    time_stamp = twt.find_all('span', attrs={'class': '_timestamp'})
    # Process only tweets posted since the last run
    if time_diff > current_time - datetime.datetime.fromtimestamp(int(time_stamp[0]['data-time'])):
        print('New content found. Getting resources. This might take a while')
        try:
            # The tweet links to an arXiv abstract page; the anchor with
            # accesskey "f" on that page points to the PDF.
            anchor = twt.find_all('a', attrs={'class': 'twitter-timeline-link'})
            link = anchor[0]['title']
            redir_page = urllib.request.urlopen(link)
            redir_soup = BeautifulSoup(redir_page, 'html.parser')
            pdf_anchor = redir_soup.find_all('a', attrs={'accesskey': 'f'})
            save_loc = pdf_anchor[0]['href'][1:] + '.pdf'
            pdf_url = 'https://arxiv.org/' + save_loc
            print('Getting PDF. This might take a while too...')
            res = urllib.request.urlopen(pdf_url)
            with open(os.path.join('data', *save_loc.split('/')), 'wb') as pdf:
                pdf.write(res.read())
            print('Done. PDF saved in data/' + save_loc + '\n--')
        except Exception:
            print('Problem getting file. Moving to next file.\n--')
    else:
        # Tweets are listed newest first, so the first stale tweet ends the run
        print('No new content found. Maybe try again after a while. Quitting\n--')
        break
print('Bye')
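Downloads can fail mid-transfer, which is what the broad except above papers over. A small retry helper is one way to harden this; the sketch below is illustrative only (fetch_with_retry, its attempts and delay parameters, and the 30-second timeout are assumptions, not part of the script):

import time
import urllib.request

def fetch_with_retry(url, attempts=3, delay=5):
    # Retry transient network failures before giving up. urllib's
    # URLError subclasses OSError, so OSError catches it here.
    for i in range(attempts):
        try:
            return urllib.request.urlopen(url, timeout=30).read()
        except OSError:
            if i == attempts - 1:
                raise
            time.sleep(delay)

The loop's res = urllib.request.urlopen(pdf_url) / pdf.write(res.read()) pair could then become pdf.write(fetch_with_retry(pdf_url)).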
requirements.txt:

beautifulsoup4==4.7.1
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
idna==2.8
pathlib==1.0.1
requests==2.22.0
soupsieve==1.9.1
urllib3==1.25.3
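Because the script records when it last ran and fetches only tweets posted since then, it lends itself to periodic scheduling. One possibility, assuming a Unix cron environment (the repository path is a placeholder):

0 * * * * cd /path/to/repo && python arxiv_scrapper.py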