Last active: September 20, 2024 14:31
OASIS Doc Scraper
import os
import requests
from bs4 import BeautifulSoup
from urllib.parse import urljoin, urlparse
import time

BASE_URL = 'https://docs.oasis-open.org/'
VISITED = []


def download_pdfs(pdf_links, download_folder='downloaded_pdfs'):
    if not os.path.exists(download_folder):
        os.makedirs(download_folder)

    for pdf_link in pdf_links:
        if pdf_link in VISITED:
            continue
        try:
            response = requests.get(pdf_link)
            response.raise_for_status()

            # Extract the file name from the URL
            parsed_url = urlparse(pdf_link)
            pdf_filename = os.path.basename(parsed_url.path)
            save_path = os.path.join(download_folder, pdf_filename)

            # Save the PDF to the local file system
            with open(save_path, 'wb') as f:
                f.write(response.content)

            print(f"Downloaded: {pdf_filename}")
            VISITED.append(pdf_link)
            time.sleep(1)
        except requests.exceptions.RequestException as e:
            print(f"Failed to download {pdf_link}: {e}")
            time.sleep(1)


def scrape_pdfs(url, visited=None):
    if visited is None:
        visited = set()
    if url in visited:
        return []
    visited.add(url)

    pdf_links = []
    try:
        response = requests.get(url)
        response.raise_for_status()
    except requests.exceptions.RequestException as e:
        print(f"Failed to fetch {url}: {e}")
        return []

    soup = BeautifulSoup(response.text, 'html.parser')

    # Find all links on the current page
    for link in soup.find_all('a', href=True):
        href = link.get('href')
        full_url = urljoin(url, href)

        # if the full url is not part of the base url, skip it
        if not full_url.startswith(BASE_URL):
            continue

        # Recursively scrape directories (ignoring file types that are not PDFs or folders)
        if full_url.endswith('/'):
            print(f"Scraping: {full_url}")
            # Wait a second before doing next request
            time.sleep(1)
            pdf_links.extend(scrape_pdfs(full_url, visited))
        # If the link is a PDF, add it to the list
        elif full_url.lower().endswith('.pdf'):
            pdf_links.append(full_url)
            print(f"Found PDF: {full_url}")

    # Download the PDFs
    download_pdfs(pdf_links)
    return pdf_links


if __name__ == "__main__":
    # Start scraping from the base URL
    all_pdfs = scrape_pdfs(BASE_URL)
    print(f"Found {len(all_pdfs)} PDFs in total")