@ActuallyFro
Created December 22, 2024 17:14
Crawling the Doctrine Web FFNP
#!/bin/bash
# Set the base URL and restrict crawling to this domain
BASE_URL="https://www.doctrine.af.mil"
DOMAIN="doctrine.af.mil"

#OUTPUT_DIR="downloaded_pdfs"
OUTPUT_DIR="./"

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Temporary file for tracking visited links
VISITED_FILE=$(mktemp)
# Function to download PDFs from a given URL
download_pdfs() {
    url=$1
    echo "[INFO] Downloading PDFs from: $url"

    # Download the page and extract PDF links (relative and absolute)
    wget -q -O - "$url" | grep -oP '(?<=href=")[^"]+\.pdf' | while read -r pdf_link; do
        # Ensure the PDF link is absolute: if it's relative, prepend the base URL
        if [[ "$pdf_link" =~ ^/ ]]; then
            pdf_link="$BASE_URL$pdf_link"
        fi

        # Extract the PDF filename from the URL
        pdf_filename=$(basename "$pdf_link")

        # Check if the PDF already exists in the output directory
        if [[ -f "$OUTPUT_DIR/$pdf_filename" ]]; then
            echo "[INFO] Skipping (already downloaded): $pdf_filename"
        else
            echo "[INFO] Found PDF: $pdf_link"
            # Download the PDF to the output directory
            wget -q -P "$OUTPUT_DIR" "$pdf_link"
        fi
    done
}
# Function to crawl a page and follow links
crawl_page() {
    url=$1
    echo "[INFO] Crawling page: $url"

    # Get the HTML content of the page and extract all links
    wget -q -O - "$url" | grep -oP '(?<=href=")[^"]+' | while read -r link; do
        # Skip empty links, fragments, absolute (external) URLs, and anything
        # outside the doctrine publication sections
        if [[ -z "$link" || "$link" =~ ^# || "$link" =~ "https://" || \
              ( ! "$link" =~ ^/Doctrine-Publications/ && ! "$link" =~ ^/Operational-Level-Doctrine/ && ! "$link" =~ ^/Glossaries/ ) ]]; then
            continue
        fi

        # Make the link absolute if it's relative
        if [[ "$link" =~ ^/ ]]; then
            link="$BASE_URL$link"
        fi

        # Check that the link stays within the same domain (pages are served
        # under www.$DOMAIN, so match either the base URL or the bare domain)
        if [[ "$link" == "$BASE_URL"* || "$link" == "https://$DOMAIN"* ]]; then
            # Skip if we've already visited this link
            if ! grep -Fxq "$link" "$VISITED_FILE"; then
                # Mark the link as visited
                echo "$link" >> "$VISITED_FILE"
                # Download PDFs from the page
                download_pdfs "$link"
                # If it's a page (not a PDF), crawl further (do not follow PDF links)
                if [[ "$link" != *.pdf ]]; then
                    crawl_page "$link"
                fi
            fi
        fi
    done
}
# Step 1: Start crawling from the base URL and extract its links (root+1)
echo "[INFO] Starting crawl from: $BASE_URL"

# Download the landing page's HTML and extract links (including those in <area> tags)
wget -q -O - "$BASE_URL" | grep -oP '(?<=href=")[^"]+' | while read -r link; do
    # Skip non-relevant links (we are only interested in doctrine publication links)
    if [[ -z "$link" || "$link" =~ ^# || "$link" =~ "https://" || \
          ( ! "$link" =~ ^/Doctrine-Publications/ && ! "$link" =~ ^/Operational-Level-Doctrine/ && ! "$link" =~ ^/Glossaries/ ) ]]; then
        continue
    fi

    # Convert the relative link to an absolute URL
    full_url="$BASE_URL$link"

    # Mark the link as visited and start crawling
    if ! grep -Fxq "$full_url" "$VISITED_FILE"; then
        echo "$full_url" >> "$VISITED_FILE"
        # Download PDFs from this page
        download_pdfs "$full_url"
        # Crawl further for other links on this page
        crawl_page "$full_url"
    fi
done

# Clean up the temporary visited file
rm -f "$VISITED_FILE"

echo "[INFO] Download complete. PDFs saved in the '$OUTPUT_DIR' directory."