Install required packages

pip install -r requirements.txt

Run the scraper

python arxiv_scrapper.py
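The script keeps its state and output under data/, which it creates on the first run:

data/
  tmp/time.txt   (Unix timestamp of the previous run)
  pdf/           (downloaded PDFs)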
arxiv_scrapper.py:

import urllib.request
from bs4 import BeautifulSoup
from pathlib import Path
import datetime
import time
import os
# Track the time of the last run in data/tmp/time.txt so that only
# tweets posted since then are processed.
last_run = ''
temp_file = os.path.join('data', 'tmp', 'time.txt')
current_time = datetime.datetime.now()
print('Checking for required files')
req_file = Path(temp_file)
if req_file.is_file():
    print('File exists. Getting last run time')
    with open(temp_file, 'r') as f:
        last_run = datetime.datetime.fromtimestamp(int(f.read()))
    print(str(last_run))
    # Record the current run time for the next invocation
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    time_diff = current_time - last_run
else:
    print('Files do not exist. Creating files.')
    os.makedirs(os.path.join('data', 'tmp'))
    os.makedirs(os.path.join('data', 'pdf'))
    last_run = current_time
    with open(temp_file, 'w') as f:
        f.write(str(int(time.mktime(current_time.timetuple()))))
    # On a first run, treat anything from the past 24 hours as new
    time_diff = datetime.timedelta(hours=24)
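# time_diff defines the freshness window: the gap since the previous run,
# or 24 hours on a first run. A tweet is downloaded only if it was posted
# within this window.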
print('Getting data\n--')
page_url = 'https://twitter.com/arXiv__ml'
page = urllib.request.urlopen(page_url)
soup = BeautifulSoup(page, 'html.parser')
tweets = soup.find_all('div', attrs={'class': 'content'})
for twt in tweets:
    time_stamp = twt.find_all('span', attrs={'class': '_timestamp'})
    # Process only tweets posted since the last run
    if time_diff > current_time - datetime.datetime.fromtimestamp(int(time_stamp[0]['data-time'])):
        print('New content found. Getting resources. This might take a while')
        try:
            # The tweet links to an arXiv abstract page; the anchor with
            # accesskey "f" on that page points to the PDF.
            anchor = twt.find_all('a', attrs={'class': 'twitter-timeline-link'})
            link = anchor[0]['title']
            redir_page = urllib.request.urlopen(link)
            redir_soup = BeautifulSoup(redir_page, 'html.parser')
            pdf_anchor = redir_soup.find_all('a', attrs={'accesskey': 'f'})
            save_loc = pdf_anchor[0]['href'][1:] + '.pdf'
            pdf_url = 'https://arxiv.org/' + save_loc
            print('Getting PDF. This might take a while too...')
            res = urllib.request.urlopen(pdf_url)
            with open(os.path.join('data', *save_loc.split('/')), 'wb') as pdf:
                pdf.write(res.read())
            print('Done. PDF saved in data/' + save_loc + '\n--')
        except Exception:
            print('Problem getting file. Moving to next file.\n--')
    else:
        # Tweets are listed newest first, so the first stale tweet ends the run
        print('No new content found. Maybe try again after a while. Quitting\n--')
        break
print('Bye')
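Downloads can fail mid-transfer, which is what the broad except above papers over. A small retry helper is one way to harden this; the sketch below is illustrative only (fetch_with_retry, its attempts and delay parameters, and the 30-second timeout are assumptions, not part of the script):

import time
import urllib.request

def fetch_with_retry(url, attempts=3, delay=5):
    # Retry transient network failures before giving up. urllib's
    # URLError subclasses OSError, so OSError catches it here.
    for i in range(attempts):
        try:
            return urllib.request.urlopen(url, timeout=30).read()
        except OSError:
            if i == attempts - 1:
                raise
            time.sleep(delay)

The loop's res = urllib.request.urlopen(pdf_url) / pdf.write(res.read()) pair could then become pdf.write(fetch_with_retry(pdf_url)).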
requirements.txt:

beautifulsoup4==4.7.1
bs4==0.0.1
certifi==2019.6.16
chardet==3.0.4
idna==2.8
pathlib==1.0.1
requests==2.22.0
soupsieve==1.9.1
urllib3==1.25.3
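Because the script records when it last ran and fetches only tweets posted since then, it lends itself to periodic scheduling. One possibility, assuming a Unix cron environment (the repository path is a placeholder):

0 * * * * cd /path/to/repo && python arxiv_scrapper.py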