Last active
July 8, 2025 11:02
-
-
Save norandom/1dcc3c88d028ad10e4501a6d8cdf67c7 to your computer and use it in GitHub Desktop.
Upload a batch of PDFs to RAGFlow via Bash
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/bin/bash

# Make sure jq is on PATH. When it is missing, try the package manager that
# matches ID from /etc/os-release, and abort the script if jq still cannot be
# found afterwards.
if ! command -v jq &> /dev/null; then
  echo "jq could not be found, attempting to install."

  # Without os-release there is no way to pick a package manager.
  if [ ! -f /etc/os-release ]; then
    echo "Could not detect OS for automatic jq installation. Please install jq manually." >&2
    exit 1
  fi

  # Sourcing os-release defines ID (among other variables) in this shell.
  . /etc/os-release
  case "$ID" in
    ubuntu | debian)
      sudo apt-get update && sudo apt-get install -y jq
      ;;
    centos | rhel | fedora)
      sudo yum install -y jq
      ;;
    arch)
      sudo pacman -Sy --noconfirm jq
      ;;
    *)
      echo "Unsupported OS for automatic jq installation. Please install jq manually." >&2
      exit 1
      ;;
  esac

  # The installer's exit status is not inspected; probe for jq again instead.
  if ! command -v jq &> /dev/null; then
    echo "jq installation failed. Exiting." >&2
    exit 1
  fi
  echo "jq installed successfully."
fi
# Configuration
# Each setting can be supplied through an environment variable of the same
# name; the literal after ":-" is the fallback used when that variable is
# unset or empty. Prefer exporting RAGFLOW_API_KEY over editing the key into
# this file, so the secret does not end up in version control.
RAGFLOW_BASE_URL="${RAGFLOW_BASE_URL:-http://1.2.3.4:9380}"
RAGFLOW_API_KEY="${RAGFLOW_API_KEY:-ragflow-MyKewlKey}"
DATASET_NAME="${DATASET_NAME:-MyKnowledge}" # Name of the dataset to upload to
DATASET_ID=""                               # This will be dynamically resolved
# Function to get dataset ID by name
#
# Globals:   RAGFLOW_BASE_URL, RAGFLOW_API_KEY (read)
# Arguments: $1 - dataset name; compared for exact equality with each .name
# Outputs:   id of the first matching dataset on stdout; nothing if no match
# Returns:   curl's status when the request fails, otherwise jq's status
#
# NOTE(review): only the datasets contained in this single response are
# searched; if the server paginates the listing, a dataset on a later page
# would not be found - confirm against the API documentation.
get_dataset_id() {
  local dataset_name="$1"
  local response

  # Assigned separately from `local` so a curl failure is not masked.
  response=$(curl -s -k -X GET "${RAGFLOW_BASE_URL}/api/v1/datasets" \
    -H "Authorization: Bearer ${RAGFLOW_API_KEY}") || return

  # The name is handed to jq as a variable rather than spliced into the filter
  # text, so names containing quotes or backslashes cannot break or alter the
  # program. `.data[]?` tolerates a response whose .data is not an array, and
  # only the first match is printed so duplicate names cannot yield a
  # multi-line id.
  printf '%s' "$response" \
    | jq -r --arg name "$dataset_name" \
      '[.data[]? | select(.name? == $name) | .id] | .[0] // empty'
}
# Resolve DATASET_ID
# Looks up the id for DATASET_NAME and stops the script when nothing comes
# back, since the upload URL cannot be built without it.
echo "Resolving ID for dataset: $DATASET_NAME"
DATASET_ID=$(get_dataset_id "$DATASET_NAME")
if [ -z "$DATASET_ID" ]; then
  # Fatal errors go to stderr, matching the jq installation checks earlier in
  # this script, so they stay visible when stdout is redirected.
  echo "Error: Could not resolve ID for dataset '$DATASET_NAME'. Exiting." >&2
  exit 1
fi
echo "Resolved DATASET_ID: $DATASET_ID"
# Find all PDF files
#
# Fills the global PDF_FILES array with the regular files matching *.pdf in
# the current directory. When nothing matches, the glob is left as the literal
# string "*.pdf", so every candidate is checked with -f before it is kept.
_collect_pdf_files() {
  local candidate
  PDF_FILES=()
  for candidate in *.pdf; do
    if [[ -f "$candidate" ]]; then
      PDF_FILES+=("$candidate")
    fi
  done
}

# Prints the curl -F argument that uploads the file at path $1.
# curl treats ';' and ',' in an unquoted -F file name as option separators,
# so the path is wrapped in double quotes, with embedded backslashes and
# double quotes backslash-escaped as curl requires inside a quoted name.
_curl_form_file_arg() {
  local path=$1
  local bs='\' dq='"'
  path=${path//"$bs"/"$bs$bs"} # backslashes first, so added escapes survive
  path=${path//"$dq"/"$bs$dq"}
  printf 'file=@"%s"' "$path"
}

# Succeeds only when the HTTP status ($1) is 200 AND the JSON body ($2) has
# .code == 0. RAGFlow's HTTP API reports application-level failures in the
# body's "code" field, so HTTP 200 alone does not prove the upload worked.
_upload_succeeded() {
  local http_code=$1 body=$2
  [[ "$http_code" == "200" ]] || return 1
  jq -e '.code == 0' <<< "$body" > /dev/null 2>&1
}

_collect_pdf_files
TOTAL_PDFS=${#PDF_FILES[@]}
UPLOADED_COUNT=0
echo "Found $TOTAL_PDFS PDF files."

# Loop through and upload each PDF
for pdf_file in "${PDF_FILES[@]}"; do
  echo "--- Processing file: $pdf_file ---"

  # Upload PDF to RAGFlow
  echo "Uploading $pdf_file to RAGFlow..."
  # -w appends the HTTP status on a line of its own after the response body.
  UPLOAD_RESPONSE=$(curl -k -s -w "\n%{http_code}" -X POST \
    "${RAGFLOW_BASE_URL}/api/v1/datasets/${DATASET_ID}/documents" \
    -H "Content-Type: multipart/form-data" \
    -H "Authorization: Bearer ${RAGFLOW_API_KEY}" \
    -F "$(_curl_form_file_arg "$pdf_file")")

  # Last line is the status code; everything before it is the body.
  HTTP_CODE=${UPLOAD_RESPONSE##*$'\n'}
  RESPONSE_BODY=${UPLOAD_RESPONSE%$'\n'*}

  if _upload_succeeded "$HTTP_CODE" "$RESPONSE_BODY"; then
    echo "Upload successful for $pdf_file. Response: $RESPONSE_BODY"
    UPLOADED_COUNT=$((UPLOADED_COUNT + 1))
  else
    echo "Upload failed for $pdf_file. HTTP Code: $HTTP_CODE, Response: $RESPONSE_BODY" >&2
  fi
done

echo "--- Upload process complete ---"
echo "Total PDFs processed: $TOTAL_PDFS"
echo "Total PDFs uploaded: $UPLOADED_COUNT"
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment