Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save norandom/0adacfc2992d986ea2ad172d4b38f731 to your computer and use it in GitHub Desktop.

Select an option

Save norandom/0adacfc2992d986ea2ad172d4b38f731 to your computer and use it in GitHub Desktop.
Upload multiple batches of PDFs to RAGflow via Bash
#!/bin/bash
# Upload multiple batches of PDFs to a RAGflow instance: merge local *.pdf
# files in batches with pdftk, then POST each merged file to the RAGflow API.
#
# Dependency checks. The script needs jq (JSON parsing), curl (HTTP) and
# pdftk (PDF merging); the original only verified jq, so a missing pdftk or
# curl surfaced much later with a confusing error.
if ! command -v curl &> /dev/null; then
  echo "curl could not be found. Please install curl manually." >&2
  exit 1
fi
if ! command -v pdftk &> /dev/null; then
  # Warn only, so behavior on a jq-present system is otherwise unchanged;
  # merging will fail later if pdftk is really absent.
  echo "Warning: pdftk could not be found; PDF merging will fail. Please install pdftk." >&2
fi
# Check for jq and install if not found
if ! command -v jq &> /dev/null; then
  echo "jq could not be found, attempting to install."
  if [ -f /etc/os-release ]; then
    # os-release defines $ID (distro identifier) used to pick a package manager.
    . /etc/os-release
    if [ "$ID" = "ubuntu" ] || [ "$ID" = "debian" ]; then
      sudo apt-get update && sudo apt-get install -y jq
    elif [ "$ID" = "centos" ] || [ "$ID" = "rhel" ] || [ "$ID" = "fedora" ]; then
      sudo yum install -y jq
    elif [ "$ID" = "arch" ]; then
      sudo pacman -Sy --noconfirm jq
    else
      echo "Unsupported OS for automatic jq installation. Please install jq manually." >&2
      exit 1
    fi
  else
    echo "Could not detect OS for automatic jq installation. Please install jq manually." >&2
    exit 1
  fi
  # Re-check: the package-manager call above may have failed silently.
  if ! command -v jq &> /dev/null; then
    echo "jq installation failed. Exiting." >&2
    exit 1
  fi
  echo "jq installed successfully."
fi
# Configuration. Each value can now be overridden from the environment
# (e.g. RAGFLOW_API_KEY=... ./upload.sh); the literal below is only a default,
# which keeps the original behavior when nothing is exported.
RAGFLOW_BASE_URL="${RAGFLOW_BASE_URL:-http://1.2.3.4:9380}"
# SECURITY NOTE(review): avoid committing a real API key here — prefer
# supplying it via the environment or a secrets file.
RAGFLOW_API_KEY="${RAGFLOW_API_KEY:-ragflow-MyKey}"
DATASET_NAME="${DATASET_NAME:-MyKnowledge}" # Name of the dataset to upload to
DATASET_ID="" # This will be dynamically resolved
#######################################
# Resolve a RAGflow dataset's ID from its display name.
# Globals:   RAGFLOW_BASE_URL, RAGFLOW_API_KEY (read)
# Arguments: $1 - dataset name
# Outputs:   the dataset ID on stdout (empty if not found)
# Returns:   non-zero if the API request fails
#######################################
get_dataset_id() {
  local dataset_name="$1"
  local response dataset_id
  # Declaration is split from assignment so curl's exit status isn't
  # swallowed by 'local' (which always returns 0).
  response=$(curl -s -k -X GET "${RAGFLOW_BASE_URL}/api/v1/datasets" \
    -H "Authorization: Bearer ${RAGFLOW_API_KEY}") || return 1
  # Pass the name via --arg instead of interpolating it into the filter:
  # a name containing quotes or jq syntax can no longer break (or inject
  # into) the query.
  dataset_id=$(printf '%s' "$response" \
    | jq -r --arg name "$dataset_name" '.data[] | select(.name == $name) | .id')
  echo "$dataset_id"
}
# Turn the configured dataset name into its API ID; nothing below can work
# without it, so bail out immediately when the lookup comes back empty.
printf '%s\n' "Resolving ID for dataset: $DATASET_NAME"
DATASET_ID="$(get_dataset_id "$DATASET_NAME")"
if [ -z "$DATASET_ID" ]; then
  printf '%s\n' "Error: Could not resolve ID for dataset '$DATASET_NAME'. Exiting."
  exit 1
fi
printf '%s\n' "Resolved DATASET_ID: $DATASET_ID"
MAX_PDFS_PER_MERGE=50
# NOTE(review): a fixed, predictable path under /tmp is unsafe on shared
# hosts — consider mktemp -d instead.
TEMP_DIR="/tmp/ragflow_uploads"
# Create temporary directory if it doesn't exist
mkdir -p "$TEMP_DIR"
# Find all PDF files in the current directory. nullglob makes an unmatched
# *.pdf expand to nothing; without it the array would contain the literal
# string "*.pdf", the script would report "Found 1 PDF files", and pdftk
# would later fail on a nonexistent file.
shopt -s nullglob
PDF_FILES=(*.pdf)
shopt -u nullglob
TOTAL_PDFS=${#PDF_FILES[@]}
UPLOADED_COUNT=0
BATCH_NUM=0
echo "Found $TOTAL_PDFS PDF files."
# Loop through PDFs in batches: merge each batch into one PDF with pdftk,
# upload the merged PDF to RAGflow, then delete the merged file.
for (( i=0; i<TOTAL_PDFS; i+=MAX_PDFS_PER_MERGE )); do
  BATCH_NUM=$((BATCH_NUM + 1))
  BATCH_FILES=()
  for (( j=0; j<MAX_PDFS_PER_MERGE && (i+j)<TOTAL_PDFS; j++ )); do
    BATCH_FILES+=("${PDF_FILES[i+j]}")
  done
  MERGED_PDF_NAME="merged_batch_${BATCH_NUM}.pdf"
  MERGED_PDF_PATH="${TEMP_DIR}/${MERGED_PDF_NAME}"
  echo "--- Processing batch $BATCH_NUM: Merging ${#BATCH_FILES[@]} PDFs ---"
  echo "Files to merge: ${BATCH_FILES[*]}"
  # Merge PDFs with pdftk. Passing the array directly (instead of building
  # a command string and eval-ing it, as before) keeps filenames containing
  # spaces or shell metacharacters intact and removes a command-injection
  # vector via crafted filenames.
  echo "Executing: pdftk ${BATCH_FILES[*]} cat output $MERGED_PDF_PATH"
  if pdftk "${BATCH_FILES[@]}" cat output "$MERGED_PDF_PATH"; then
    echo "Successfully merged batch $BATCH_NUM to $MERGED_PDF_PATH"
    # Upload merged PDF to RAGflow; -w appends the HTTP status on its own
    # line so body and code can be split afterwards.
    echo "Uploading $MERGED_PDF_NAME to RAGFlow..."
    UPLOAD_RESPONSE=$(curl -k -s -w "\n%{http_code}" -X POST \
      "${RAGFLOW_BASE_URL}/api/v1/datasets/${DATASET_ID}/documents" \
      -H "Content-Type: multipart/form-data" \
      -H "Authorization: Bearer ${RAGFLOW_API_KEY}" \
      -F "file=@${MERGED_PDF_PATH}")
    HTTP_CODE=$(echo "$UPLOAD_RESPONSE" | tail -n1)
    RESPONSE_BODY=$(echo "$UPLOAD_RESPONSE" | sed '$d')
    if [ "$HTTP_CODE" -eq 200 ]; then
      echo "Upload successful for $MERGED_PDF_NAME. Response: $RESPONSE_BODY"
      UPLOADED_COUNT=$((UPLOADED_COUNT + ${#BATCH_FILES[@]}))
    else
      # Diagnostics go to stderr so stdout stays a clean progress log.
      echo "Upload failed for $MERGED_PDF_NAME. HTTP Code: $HTTP_CODE, Response: $RESPONSE_BODY" >&2
    fi
    # Clean up merged PDF regardless of upload outcome; -f avoids an error
    # if it vanished, -- guards against option-like names.
    rm -f -- "$MERGED_PDF_PATH"
  else
    echo "Error merging PDFs for batch $BATCH_NUM." >&2
  fi
done
# Final summary of the run.
printf '%s\n' "--- Upload process complete ---"
printf '%s\n' "Total PDFs processed: $TOTAL_PDFS"
printf '%s\n' "Total PDFs uploaded (in merged batches): $UPLOADED_COUNT"
printf '%s\n' "Temporary files cleaned up from $TEMP_DIR"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment