Skip to content

Instantly share code, notes, and snippets.

@DonRichards
Created August 26, 2025 21:23
Show Gist options
  • Select an option

  • Save DonRichards/2fe9abdb375c56fabc3c8275c7b4f6fc to your computer and use it in GitHub Desktop.

Select an option

Save DonRichards/2fe9abdb375c56fabc3c8275c7b4f6fc to your computer and use it in GitHub Desktop.
Islandora Hash extraction
#!/bin/bash
# Usage:
#   chmod +x fedora_checksums.sh
#   ./fedora_checksums.sh
#
# Extracts Fedora checksums for files referenced in Drupal nodes.
# It must be run from within the Drupal container: the script connects to
# Fedora via http://fcrepo:8080 and uses Drush to query the Drupal database
# for file URIs.
# It creates a fedora_checksums.json file with node IDs and their
# corresponding Fedora file checksums.
#######################################
# Print the usage text for this script and terminate successfully.
# Arguments: none
# Outputs:   the usage text, one line at a time, on stdout
# Returns:   never returns to the caller; exits the shell with status 0
#######################################
show_help() {
  # Each array element is emitted as one output line; "" yields a blank line.
  local -a usage_lines=(
    "Usage: $0 [OPTIONS]"
    ""
    "Extract Fedora checksums for files referenced in Drupal nodes."
    ""
    "OPTIONS:"
    " -h, --help Show this help message"
    ""
    "IMPORTANT: This script must be run from within the Drupal container."
    "The script connects to Fedora via http://fcrepo:8080 and uses Drush"
    "to query the Drupal database for file URIs."
    ""
    "Output: Creates fedora_checksums.json with node IDs and their"
    " corresponding Fedora file checksums."
  )
  printf '%s\n' "${usage_lines[@]}"
  exit 0
}
#######################################
# Parse command line arguments.
# The only recognised option is -h/--help, which delegates to show_help
# (show_help exits the script). Anything else is rejected.
# Arguments: the script's positional parameters ("$@")
# Outputs:   on an unknown option, a two-line diagnostic on stderr
# Returns:   0 when there is nothing left to parse; exits the shell with
#            status 1 on an unknown option
#######################################
parse_args() {
  while (( $# > 0 )); do
    case "$1" in
      -h | --help)
        show_help
        ;;
      *)
        # Diagnostics belong on stderr so they never mix with real output.
        printf 'Unknown option: %s\n' "$1" >&2
        printf 'Use --help for usage information.\n' >&2
        exit 1
        ;;
    esac
    # Not reached today because every arm terminates the script; kept so the
    # loop still advances if a non-terminating option is added later.
    shift
  done
}
parse_args "$@"
# Pre-flight check: make sure the Fedora endpoint answers before doing any
# work. The host name fcrepo is used as the signal that the script is running
# in the expected container environment.
echo "🔍 Checking if running in a container..."
# Bounded timeouts keep the script from hanging when the host resolves but
# does not answer; the response body is discarded, only curl's status matters.
if ! curl -s --connect-timeout 10 --max-time 30 -o /dev/null http://fcrepo:8080; then
  echo "❌ Could not resolve http://fcrepo:8080. Are you running in a container?" >&2
  exit 1
fi
# Query Drupal (through Drush) for every node/file pair whose file lives in
# Fedora. Sets:
#   SQL_QUERY  - the SQL text that is executed
#   NODE_ROWS  - raw output of `drush sql:query`, read later as
#                tab-separated "nid<TAB>uri" lines
#   NODE_COUNT - number of data rows in NODE_ROWS
echo "🔍 Running Fedora checksum extraction script..."
echo "📦 Using Drupal SQL connection via Drush"
echo "📑 Running SQL query to fetch node IDs and file URIs..."
# Walks node -> media (field_media_of) -> file (field_media_file) and keeps
# only fedora:// URIs, excluding any URI containing "Extracted Text" or
# "FITS File".
SQL_QUERY="
SELECT n.nid, fm.uri
FROM node_field_data n
JOIN media__field_media_of mo ON mo.field_media_of_target_id = n.nid
JOIN media_field_data mfd ON mfd.mid = mo.entity_id
JOIN media__field_media_file mff ON mfd.mid = mff.entity_id
JOIN file_managed fm ON mff.field_media_file_target_id = fm.fid
WHERE fm.uri LIKE 'fedora://%'
AND fm.uri NOT LIKE '%Extracted Text%'
AND fm.uri NOT LIKE '%FITS File%';
"
echo "🐚 Executing query..."
if ! NODE_ROWS=$(drush sql:query "$SQL_QUERY"); then
  echo "❌ SQL query failed. Exiting." >&2
  exit 1
fi
echo "✅ SQL query successful."
# Count only lines whose first tab-separated field is a numeric node ID.
# A plain line count would report 1 for an empty result (the here-string /
# echo always supplies one line) and would include a header row if present.
NODE_COUNT=0
while IFS=$'\t' read -r _row_nid _; do
  if [[ "$_row_nid" =~ ^[0-9]+$ ]]; then
    NODE_COUNT=$((NODE_COUNT + 1))
  fi
done <<< "$NODE_ROWS"
unset _row_nid
echo "🔍 Number of results: $NODE_COUNT"
#######################################
# Escape a string so it can be placed between double quotes in JSON.
# Handles backslash, double quote, tab, carriage return and newline; other
# control characters are passed through unchanged.
# Arguments: $1 - raw string
# Outputs:   escaped string on stdout (no trailing newline)
#######################################
json_escape() {
  local s=$1
  s=${s//\\/\\\\} # must run first so later escapes are not doubled
  s=${s//\"/\\\"}
  s=${s//$'\t'/\\t}
  s=${s//$'\r'/\\r}
  s=${s//$'\n'/\\n}
  printf '%s' "$s"
}

#######################################
# Pull the sha-256 value out of a block of HTTP response headers.
# Header and algorithm names are matched case-insensitively, and the CR that
# terminates HTTP header lines is removed before matching so it cannot end
# up in the returned value.
# Arguments: $1 - response headers, one per line
# Outputs:   the text following "sha-256=" on stdout
# Returns:   0 if a non-empty sha-256 digest was found, 1 otherwise
#######################################
extract_sha256_digest() {
  local headers=$1 line
  local -r digest_re='^[Dd][Ii][Gg][Ee][Ss][Tt]:[[:space:]]*[Ss][Hh][Aa]-256=(.+)$'
  while IFS= read -r line; do
    line=${line%$'\r'}
    if [[ "$line" =~ $digest_re ]]; then
      printf '%s\n' "${BASH_REMATCH[1]}"
      return 0
    fi
  done <<< "$headers"
  return 1
}

#######################################
# Turn "nid<TAB>uri" rows into the JSON document written by this script.
# For every row with a numeric nid and a fedora:// URI, a HEAD request with
# "Want-Digest: sha-256" is sent to the matching Fedora REST URL; the digest
# (or the literal MISSING) is recorded under the node ID.
# Globals:   JSON_OUTPUT (written) - the finished JSON object
# Arguments: $1 - rows, one "nid<TAB>uri" pair per line
# Outputs:   progress on stdout, warnings on stderr
#
# NOTE(review): the Fedora path is passed to curl as-is; paths containing
# spaces or reserved characters may need percent-encoding - confirm.
# NOTE(review): several rows with the same nid produce repeated keys in the
# object, as before; confirm whether consumers depend on this layout.
#######################################
build_checksums_json() {
  local rows=$1
  local nid uri fedora_url headers checksum entry
  local json="{" sep=""

  while IFS=$'\t' read -r nid uri; do
    # Skip a header row (or blank line) if one is present.
    [[ "$nid" =~ ^[0-9]+$ ]] || continue

    if [[ "$uri" != fedora://* ]]; then
      echo "❓ Unknown URI scheme: $uri" >&2
      continue
    fi

    fedora_url="http://fcrepo:8080/fcrepo/rest/${uri#fedora://}"
    echo "🌐 Fetching checksum for $fedora_url"
    headers=$(curl -sI -H "Want-Digest: sha-256" "$fedora_url")
    if ! checksum=$(extract_sha256_digest "$headers"); then
      echo "⚠️ No Digest returned for $fedora_url" >&2
      checksum="MISSING"
    fi

    # Same entry layout as before, but with both values JSON-escaped.
    printf -v entry '"%s": {\n"fedora_uri": "%s",\n"checksum": "%s"\n}' \
      "$nid" "$(json_escape "$uri")" "$(json_escape "$checksum")"
    json+="${sep}${entry}"
    sep=","
  done <<< "$rows"

  JSON_OUTPUT="${json}}"
}

echo "📊 Processing rows..."
JSON_OUTPUT=""
build_checksums_json "${NODE_ROWS-}"
if ! printf '%s\n' "$JSON_OUTPUT" > fedora_checksums.json; then
  echo "❌ Could not write fedora_checksums.json" >&2
  exit 1
fi
echo "✅ Checksums saved to fedora_checksums.json"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment