A simple Python script that uses BeautifulSoup to loop through the PDFs available from the session and download them. It now also renames each downloaded PDF using the title taken from the first page of the publication.
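Assuming Python 3 with the requests, beautifulsoup4, and PyPDF2 packages installed (for example via pip install requests beautifulsoup4 PyPDF2), the script below can be run directly; it downloads into a pdfs/ directory and then renames the files in place.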
#!/usr/bin/env python3
import time
import random
import os

import requests
from bs4 import BeautifulSoup
from PyPDF2 import PdfReader


def create_directory(directory):
    """Create a directory if it doesn't already exist."""
    if not os.path.exists(directory):
        os.makedirs(directory)


def sleep_random(min_seconds=15, max_seconds=45):
    """
    Sleep for a random amount of time between min_seconds and max_seconds.

    Args:
        min_seconds (int): Minimum sleep time in seconds.
        max_seconds (int): Maximum sleep time in seconds.
    """
    sleep_time = random.uniform(min_seconds, max_seconds)
    print(f"Sleeping for {sleep_time:.2f} seconds...")
    time.sleep(sleep_time)


def download_pdf(pdf_url, save_directory, headers=None):
    """Download a single PDF from pdf_url into save_directory."""
    try:
        response = requests.get(pdf_url, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses
        filename = pdf_url.split('/')[-1]
        if not filename.lower().endswith('.pdf'):
            filename += '.pdf'  # ACM PDF URLs end in the DOI suffix, not .pdf
        filepath = os.path.join(save_directory, filename)
        with open(filepath, 'wb') as pdf_file:
            pdf_file.write(response.content)
        print(f"Downloaded: {filename}")
        # Pause between downloads to avoid hammering the server
        sleep_random(20, 45)
    except Exception as e:
        print(f"Failed to download {pdf_url}: {e}")


def download_all_pdfs(page_url, save_directory="pdfs"):
    """Scrape page_url for PDF links and download each one."""
    headers = {
        "User-Agent": "Mozilla/5.0 (Windows NT 10.0; Win64; x64) AppleWebKit/537.36 (KHTML, like Gecko) Chrome/114.0.0.0 Safari/537.36"
    }
    try:
        # Request the main page
        response = requests.get(page_url, headers=headers)
        response.raise_for_status()  # Raise an HTTPError for bad responses

        # Parse the HTML content
        soup = BeautifulSoup(response.text, 'html.parser')

        # Find all PDF links (based on ACM's URL structure)
        pdf_links = []
        for link in soup.find_all('a', href=lambda href: href and 'pdf' in href):
            pdf_links.append(link['href'])

        # Ensure we have links to download
        if not pdf_links:
            print("No PDF links found on the page.")
            return

        # Create a directory to save PDFs
        create_directory(save_directory)

        # Download each PDF, sending the same headers as the page request
        for pdf_link in pdf_links:
            if not pdf_link.startswith('http'):
                pdf_link = f"https://dl.acm.org{pdf_link}"  # Construct full URL if relative
            download_pdf(pdf_link, save_directory, headers=headers)
    except Exception as e:
        print(f"An error occurred: {e}")


# Build one URL per table-of-contents heading on the proceedings page
base_url = "https://dl.acm.org/doi/proceedings/10.5555/3703596?tocHeading=heading"
urls = [f"{base_url}{i}" for i in range(1, 37)]  # headings 1 through 36

# Download the PDFs linked from each heading's page
for url in urls:
    download_all_pdfs(url)


def rename_pdfs_in_directory(directory_path):
    """
    Renames all PDF files in a directory based on the text from the first page.

    :param directory_path: Path to the directory containing PDF files
    """
    try:
        # List all files in the directory and keep only the PDFs
        files = os.listdir(directory_path)
        pdf_files = [f for f in files if f.lower().endswith('.pdf')]
        if not pdf_files:
            print("No PDF files found in the directory.")
            return

        for file_name in pdf_files:
            file_path = os.path.join(directory_path, file_name)
            try:
                # Read the PDF and extract text from the first page
                reader = PdfReader(file_path)
                if reader.pages:
                    text = reader.pages[0].extract_text()
                else:
                    print(f"No pages found in file: {file_name}. Skipping...")
                    continue
                if not text or not text.strip():
                    print(f"No text found on the first page of file: {file_name}. Skipping...")
                    continue

                # Use the first two lines of text as the title (titles often
                # wrap onto a second line); fall back to one line if needed
                lines = text.splitlines()
                title = "".join(lines[:2])

                # Sanitize the title to create a valid file name
                sanitized_title = "".join(c if c.isalnum() or c in " ._-" else "_" for c in title)

                # Create the new file name with the same extension
                new_file_name = f"{sanitized_title}.pdf"
                new_file_path = os.path.join(directory_path, new_file_name)

                # Rename the file, skipping if the target name is already taken
                if os.path.exists(new_file_path):
                    print(f"'{new_file_name}' already exists. Skipping '{file_name}'.")
                    continue
                os.rename(file_path, new_file_path)
                print(f"Renamed '{file_name}' to '{new_file_name}'.")
            except Exception as e:
                print(f"An error occurred while processing '{file_name}': {e}")
    except Exception as e:
        print(f"An error occurred: {e}")


# Rename the downloaded PDFs in the 'pdfs' directory
rename_pdfs_in_directory('pdfs')
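dl.acm.org can throttle or drop connections over a long run like this, so failed downloads are worth retrying. Below is a minimal sketch of a retry wrapper; the helper name get_with_retries and its parameters are illustrative, not part of the original script.

import time

import requests


def get_with_retries(url, headers=None, attempts=3, backoff=30):
    """Fetch url, retrying on failure with a linearly growing pause.

    (Hypothetical helper, not part of the original gist.)
    """
    for attempt in range(1, attempts + 1):
        try:
            response = requests.get(url, headers=headers, timeout=60)
            response.raise_for_status()
            return response
        except requests.RequestException as e:
            print(f"Attempt {attempt}/{attempts} failed for {url}: {e}")
            if attempt < attempts:
                time.sleep(backoff * attempt)
    return None

Swapping the bare requests.get calls in download_pdf and download_all_pdfs for get_with_retries (plus a check for a None return) would make the 36-heading run more resilient to transient failures.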