Crawling the Doctrine Web FFNP
#!/bin/bash

# Base URL of the site; the crawl is restricted to this domain
BASE_URL="https://www.doctrine.af.mil"
DOMAIN="doctrine.af.mil"

#OUTPUT_DIR="downloaded_pdfs"
OUTPUT_DIR="./"

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Temporary file for tracking visited links
VISITED_FILE=$(mktemp)
# Download every PDF linked from a given URL
download_pdfs() {
    local url="$1"
    echo "[INFO] Downloading PDFs from: $url"

    # Fetch the page and extract PDF links (relative and absolute)
    wget -q -O - "$url" | grep -oP '(?<=href=")[^"]+\.pdf' | while read -r pdf_link; do
        # If it's a site-relative URL, prepend the base URL to make it absolute
        if [[ "$pdf_link" =~ ^/ ]]; then
            pdf_link="$BASE_URL$pdf_link"
        fi

        # Extract the PDF filename from the URL
        pdf_filename=$(basename "$pdf_link")

        # Skip PDFs that already exist in the output directory
        if [[ -f "$OUTPUT_DIR/$pdf_filename" ]]; then
            echo "[INFO] Skipping (already downloaded): $pdf_filename"
        else
            echo "[INFO] Found PDF: $pdf_link"
            # Download the PDF into the output directory
            wget -q -P "$OUTPUT_DIR" "$pdf_link"
        fi
    done
}
# Crawl a page, download its PDFs, and follow in-scope links
crawl_page() {
    local url="$1"
    echo "[INFO] Crawling page: $url"

    # Fetch the HTML content of the page and extract all href values
    wget -q -O - "$url" | grep -oP '(?<=href=")[^"]+' | while read -r link; do
        # Only follow relative links into the doctrine sections of interest;
        # this also drops empty links, fragments, and absolute/external URLs
        if [[ ! "$link" =~ ^/(Doctrine-Publications|Operational-Level-Doctrine|Glossaries)/ ]]; then
            continue
        fi

        # Make the relative link absolute
        if [[ "$link" =~ ^/ ]]; then
            link="$BASE_URL$link"
        fi

        # Only crawl links that stay on the target domain (with or without www)
        if [[ "$link" == "https://$DOMAIN"* || "$link" == "https://www.$DOMAIN"* ]]; then
            # Skip links we've already visited
            if ! grep -Fxq "$link" "$VISITED_FILE"; then
                # Mark the link as visited
                echo "$link" >> "$VISITED_FILE"
                # Download PDFs from the page
                download_pdfs "$link"
                # If it's a page (not a PDF), crawl it further (do not follow PDF links)
                if [[ "$link" != *.pdf ]]; then
                    crawl_page "$link"
                fi
            fi
        fi
    done
}
# Step 1: Start crawling from the base URL and extract the section links (root+1)
echo "[INFO] Starting crawl from: $BASE_URL"

# Download the landing page's HTML and extract its links (including the <area> image-map links)
wget -q -O - "$BASE_URL" | grep -oP '(?<=href=")[^"]+' | while read -r link; do
    # Keep only the doctrine publication sections; drop empty links, fragments, and external URLs
    if [[ ! "$link" =~ ^/(Doctrine-Publications|Operational-Level-Doctrine|Glossaries)/ ]]; then
        continue
    fi

    # Convert the relative link to an absolute URL
    full_url="$BASE_URL$link"

    # Mark the link as visited and start crawling
    if ! grep -Fxq "$full_url" "$VISITED_FILE"; then
        echo "$full_url" >> "$VISITED_FILE"
        # Download PDFs from this page
        download_pdfs "$full_url"
        # Crawl further for other links on this page
        crawl_page "$full_url"
    fi
done

# Clean up the temporary visited-links file
rm -f "$VISITED_FILE"

echo "[INFO] Download complete. PDFs saved in the '$OUTPUT_DIR' directory."