@starhound
Last active September 20, 2024 14:31
OASIS Doc Scraper
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

BASE_URL = 'https://docs.oasis-open.org/'
VISITED = []


def download_pdfs(pdf_links, download_folder='downloaded_pdfs'):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)
    for pdf_link in pdf_links:
        if pdf_link in VISITED:
            continue
        try:
            response = requests.get(pdf_link)
            response.raise_for_status()
            # Extract the file name from the URL
            parsed_url = urlparse(pdf_link)
            pdf_filename = os.path.basename(parsed_url.path)
            save_path = os.path.join(download_folder, pdf_filename)
            # Save the PDF to the local file system
            with open(save_path, 'wb') as f:
                f.write(response.content)
            print(f"Downloaded: {pdf_filename}")
            VISITED.append(pdf_link)
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdf_link}: {e}")
            time.sleep(1)


def scrape_pdfs(url, visited=None):
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)
    pdf_links = []
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return []
    soup = BeautifulSoup(response.text, 'html.parser')
    # Find all links on the current page
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        full_url = urljoin(url, href)
        # If the full URL is not part of the base URL, skip it
        if not full_url.startswith(BASE_URL):
            continue
        # Recursively scrape directories (ignoring file types that are not PDFs or folders)
        if full_url.endswith('/'):
            print(f"Scraping: {full_url}")
            # Wait a second before doing the next request
            time.sleep(1)
            pdf_links.extend(scrape_pdfs(full_url, visited))
        # If the link is a PDF, add it to the list
        elif full_url.lower().endswith('.pdf'):
            pdf_links.append(full_url)
            print(f"Found PDF: {full_url}")
    # Download the PDFs found on this page
    download_pdfs(pdf_links)
    return pdf_links


if __name__ == "__main__":
    # Start scraping from the base URL
    all_pdfs = scrape_pdfs(BASE_URL)
    print(f"Found {len(all_pdfs)} PDFs in total")