Created
January 11, 2017 12:47
-
-
Save hn-support/60016f4b7986ad2cce693bdf2f3501b4 to your computer and use it in GitHub Desktop.
A cache warmer in bash using curl
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash
# Cache warmer: requests every URL found in a sitemap file with curl and
# prints the connect, start-transfer and total timings of each request.
#
# Usage: <script> <sitemap.xml>

# Exactly one non-empty argument is required; anything else shows the usage.
# The usage path keeps its historical exit status of 0.
if [[ $# -ne 1 || -z "$1" ]]; then
  echo "Usage: $0 <sitemap.xml>"
  exit 0
fi

if [[ ! -f "$1" ]]; then
  echo "Sitemap file $1 not found! Exit!" >&2
  exit 1
fi

# Extract every ">http...<" value from the XML, one per output line. The match
# is non-greedy and repeated, so several URLs on one input line are all found.
# `read -r` keeps backslashes in a URL intact.
perl -ne 'while (/>(http.+?)</g) { print "$1\n"; }' <"$1" |
  while read -r line; do
    echo " Crawling $line "
    # The response body is discarded; only the -w timing summary is printed.
    # "$line" is quoted so the URL is never word-split or glob-expanded.
    curl -so /dev/null -w "%{time_connect} - %{time_starttransfer} - %{time_total} " "$line"
  done
Updated version with basic auth support and sitemap index support:
#!/bin/bash
# Cache warmer: crawls the URLs listed in a sitemap (or in the child sitemaps
# of a sitemap index) with curl, optionally sending HTTP basic auth.
#
# Usage: <script> <sitemap.xml> [--user <username>] [--pass <password>]
#
# Globals set by this section:
#   SITEMAP         path of the local sitemap file (first argument)
#   AUTH_USER       value given to --user, or empty
#   AUTH_PASS       value given to --pass, or empty
#   CURL_AUTH_OPTS  "-u user:pass" when both credentials are given, else empty

# A non-empty sitemap argument is mandatory; otherwise print the usage text.
# The usage path keeps its historical exit status of 0.
if [[ $# -lt 1 || -z "$1" ]]; then
  echo "Usage: $0 <sitemap.xml> [--user <username>] [--pass <password>]"
  echo ""
  echo "Options:"
  echo " --user <username> Username for basic auth"
  echo " --pass <password> Password for basic auth"
  exit 0
fi

SITEMAP="$1"
AUTH_USER=""
AUTH_PASS=""
CURL_AUTH_OPTS=""

# Parse optional auth arguments
shift
while [[ $# -gt 0 ]]; do
  case "$1" in
    --user | --pass)
      # `shift 2` fails WITHOUT shifting when only one argument is left, which
      # would make this loop spin forever; reject a value-less option instead.
      if [[ $# -lt 2 ]]; then
        echo "Error: $1 requires a value" >&2
        exit 1
      fi
      if [[ "$1" == "--user" ]]; then
        AUTH_USER="$2"
      else
        AUTH_PASS="$2"
      fi
      shift 2
      ;;
    *)
      echo "Unknown option: $1" >&2
      exit 1
      ;;
  esac
done

if [[ ! -f "$SITEMAP" ]]; then
  echo "Sitemap file $SITEMAP not found! Exit!" >&2
  exit 1
fi

# Build curl auth options if credentials provided. Supplying only one of the
# two is an error. NOTE: this string is expanded unquoted by the code that
# calls curl, so credentials containing whitespace or glob characters will be
# split/expanded there.
if [[ -n "$AUTH_USER" && -n "$AUTH_PASS" ]]; then
  CURL_AUTH_OPTS="-u $AUTH_USER:$AUTH_PASS"
elif [[ -n "$AUTH_USER" || -n "$AUTH_PASS" ]]; then
  echo "Error: Both --user and --pass must be provided for basic auth" >&2
  exit 1
fi
# Function to crawl URLs from a sitemap
crawl_sitemap() {
local sitemap_file="$1"
cat "$sitemap_file" | perl -ne 'while (/<loc>(http.+?)<\/loc>/g) { print "$1\n"; }' | while read line; do
echo " Crawling $line "
curl -so /dev/null -w "%{time_connect} - %{time_starttransfer} - %{time_total} " $CURL_AUTH_OPTS "$line"
done
}
#######################################
# Report whether a file looks like a sitemap index.
# Arguments: $1 - path of the file to inspect
# Returns:   grep's exit status: 0 when the text "<sitemapindex" occurs
#            somewhere in the file, non-zero otherwise
#######################################
is_sitemap_index() {
  local candidate="$1"
  local rc=0
  # Quiet search: only the exit status matters, nothing is printed on a match.
  grep -q "<sitemapindex" "$candidate" || rc=$?
  return "$rc"
}
# Dispatch on the sitemap type: a sitemap index has each child sitemap
# downloaded and crawled in turn; a regular sitemap is crawled directly.
if is_sitemap_index "$SITEMAP"; then
  echo "Detected sitemap index. Processing child sitemaps..."
  # Extract every <loc> value from the index and keep only those that look
  # like sitemap XML files. perl is used (as in crawl_sitemap) instead of
  # `grep -oP`, which is a GNU-only extension.
  perl -ne 'while (/<loc>([^<]+)<\/loc>/g) { print "$1\n"; }' <"$SITEMAP" |
    grep 'sitemap.*\.xml' |
    while read -r sitemap_url; do
      [[ -n "$sitemap_url" ]] || continue
      echo ""
      echo "Fetching sitemap: $sitemap_url"
      # Download the child sitemap to a temp file; skip this entry if no temp
      # file can be created rather than handing curl an empty -o target.
      if ! temp_sitemap=$(mktemp); then
        echo " Error: Failed to create a temporary file for $sitemap_url" >&2
        continue
      fi
      # CURL_AUTH_OPTS is intentionally unquoted: it holds zero or more
      # whitespace-separated curl options. -f makes HTTP errors fail the
      # command so an error page is never crawled as a sitemap.
      # shellcheck disable=SC2086
      if curl -sf $CURL_AUTH_OPTS "$sitemap_url" -o "$temp_sitemap"; then
        crawl_sitemap "$temp_sitemap"
      else
        echo " Error: Failed to download sitemap $sitemap_url" >&2
      fi
      # Single cleanup point for both outcomes.
      rm -f -- "$temp_sitemap"
    done
else
  echo "Processing regular sitemap..."
  crawl_sitemap "$SITEMAP"
fi
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment
Updated version that skips images when they are present in the sitemap (which is allowed):