Crawling the Doctrine Web FFNP
#!/bin/bash

# Base URL of the site; the crawl is restricted to this domain
BASE_URL="https://www.doctrine.af.mil"
DOMAIN="doctrine.af.mil"

#OUTPUT_DIR="downloaded_pdfs"
OUTPUT_DIR="./"

# Create the output directory if it doesn't exist
mkdir -p "$OUTPUT_DIR"

# Temporary file for tracking visited links
VISITED_FILE=$(mktemp)
# Download every PDF linked from a given URL
download_pdfs() {
    local url="$1"
    echo "[INFO] Downloading PDFs from: $url"

    # Fetch the page and extract PDF links (relative and absolute)
    wget -q -O - "$url" | grep -oP '(?<=href=")[^"]+\.pdf' | while read -r pdf_link; do
        # If it's a site-relative URL, prepend the base URL to make it absolute
        if [[ "$pdf_link" =~ ^/ ]]; then
            pdf_link="$BASE_URL$pdf_link"
        fi

        # Extract the PDF filename from the URL
        pdf_filename=$(basename "$pdf_link")

        # Skip PDFs that already exist in the output directory
        if [[ -f "$OUTPUT_DIR/$pdf_filename" ]]; then
            echo "[INFO] Skipping (already downloaded): $pdf_filename"
        else
            echo "[INFO] Found PDF: $pdf_link"
            # Download the PDF into the output directory
            wget -q -P "$OUTPUT_DIR" "$pdf_link"
        fi
    done
}
# Crawl a page, download its PDFs, and follow in-scope links
crawl_page() {
    local url="$1"
    echo "[INFO] Crawling page: $url"

    # Fetch the HTML content of the page and extract all href values
    wget -q -O - "$url" | grep -oP '(?<=href=")[^"]+' | while read -r link; do
        # Only follow relative links into the doctrine sections of interest;
        # this also drops empty links, fragments, and absolute/external URLs
        if [[ ! "$link" =~ ^/(Doctrine-Publications|Operational-Level-Doctrine|Glossaries)/ ]]; then
            continue
        fi

        # Make the relative link absolute
        if [[ "$link" =~ ^/ ]]; then
            link="$BASE_URL$link"
        fi

        # Only crawl links that stay on the target domain (with or without www)
        if [[ "$link" == "https://$DOMAIN"* || "$link" == "https://www.$DOMAIN"* ]]; then
            # Skip links we've already visited
            if ! grep -Fxq "$link" "$VISITED_FILE"; then
                # Mark the link as visited
                echo "$link" >> "$VISITED_FILE"
                # Download PDFs from the page
                download_pdfs "$link"
                # If it's a page (not a PDF), crawl it further (do not follow PDF links)
                if [[ "$link" != *.pdf ]]; then
                    crawl_page "$link"
                fi
            fi
        fi
    done
}
# Step 1: Start crawling from the base URL and extract the section links (root+1)
echo "[INFO] Starting crawl from: $BASE_URL"

# Download the landing page's HTML and extract its links (including the <area> image-map links)
wget -q -O - "$BASE_URL" | grep -oP '(?<=href=")[^"]+' | while read -r link; do
    # Keep only the doctrine publication sections; drop empty links, fragments, and external URLs
    if [[ ! "$link" =~ ^/(Doctrine-Publications|Operational-Level-Doctrine|Glossaries)/ ]]; then
        continue
    fi

    # Convert the relative link to an absolute URL
    full_url="$BASE_URL$link"

    # Mark the link as visited and start crawling
    if ! grep -Fxq "$full_url" "$VISITED_FILE"; then
        echo "$full_url" >> "$VISITED_FILE"
        # Download PDFs from this page
        download_pdfs "$full_url"
        # Crawl further for other links on this page
        crawl_page "$full_url"
    fi
done

# Clean up the temporary visited-links file
rm -f "$VISITED_FILE"

echo "[INFO] Download complete. PDFs saved in the '$OUTPUT_DIR' directory."