Last active: September 20, 2024 14:31
OASIS Doc Scraper
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

BASE_URL = 'https://docs.oasis-open.org/'
VISITED = []


def download_pdfs(pdf_links, download_folder='downloaded_pdfs'):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    for pdf_link in pdf_links:
        if pdf_link in VISITED:
            continue
        try:
            response = requests.get(pdf_link)
            response.raise_for_status()

            # Extract the file name from the URL
            parsed_url = urlparse(pdf_link)
            pdf_filename = os.path.basename(parsed_url.path)
            save_path = os.path.join(download_folder, pdf_filename)

            # Save the PDF to the local file system
            with open(save_path, 'wb') as f:
                f.write(response.content)

            print(f"Downloaded: {pdf_filename}")
            VISITED.append(pdf_link)
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdf_link}: {e}")
            time.sleep(1)


def scrape_pdfs(url, visited=None):
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)

    pdf_links = []
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the current page
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        full_url = urljoin(url, href)

        # if the full url is not part of the base url, skip it
        if not full_url.startswith(BASE_URL):
            continue

        # Recursively scrape directories (ignoring file types that are not PDFs or folders)
        if full_url.endswith('/'):
            print(f"Scraping: {full_url}")
            # Wait a second before doing next request
            time.sleep(1)
            pdf_links.extend(scrape_pdfs(full_url, visited))
        # If the link is a PDF, add it to the list
        elif full_url.lower().endswith('.pdf'):
            pdf_links.append(full_url)
            print(f"Found PDF: {full_url}")

    # Download the PDFs
    download_pdfs(pdf_links)
    return pdf_links


if __name__ == "__main__":
    # Start scraping from the base URL
    all_pdfs = scrape_pdfs(BASE_URL)
    print(f"Found {len(all_pdfs)} PDFs in total")