Skip to content

Instantly share code, notes, and snippets.

@IdoBar
Last active December 1, 2025 05:35
Show Gist options
  • Select an option

  • Save IdoBar/326f866bf02476fbf96eb78e46a7fffb to your computer and use it in GitHub Desktop.

Select an option

Save IdoBar/326f866bf02476fbf96eb78e46a7fffb to your computer and use it in GitHub Desktop.
Fast, parallel download of NCBI BLAST databases with bash and aria2c
#!/usr/bin/env bash
# usage function
# Print the command-line help to stdout.
# Globals: progname (read) - name shown in the "Usage:" line.
usage() {
  # Only the first line needs the program name; everything else is static,
  # so it lives in a quoted heredoc that the shell does not expand.
  printf 'Usage: %s [--threads NUM] [--con NUM] [--db STR] [--outdir STR] [--remove STR] [--verbose] [--list] [--extract]\n' "$progname"
  cat <<'HELP_TEXT'
optional arguments:
-h, --help show this help message and exit
-t, --threads NUM number of threads to use (default: all available threads)
-c, --con NUM number of concurrent connections (default: 5)
-d, --db STR name of an NCBI database to download
-o, --outdir STR path of database (default: current folder)
-r, --remove STR remove existing database, archive and md5 files (options are "all", "md5", "tar", "md5tar", "none". default: "md5")
boolean flags (default to false):
-x, --extract uncompress database (will also remove *.tar.gz files)
-l, --list list available databases
-v, --verbose increase the verbosity of the bash script (can be specified up to twice)
HELP_TEXT
}
# initialize variables and parse the command line
#
# Sets: progname, verbose, list, extract, remove_str, con_str, threads_str,
#       db_str, outdir_str

# Print NUM limited to 16, the largest value aria2c accepts for
# -x/--max-connection-per-server. Anything that is not a plain non-negative
# integer is printed unchanged so aria2c can reject it with its own message.
cap_aria2_connections() {
  local requested=$1
  # 10# forces base 10 so a value such as "08" is not parsed as octal.
  if [[ "$requested" =~ ^[0-9]+$ ]] && (( 10#$requested > 16 )); then
    printf '%s\n' 16
  else
    printf '%s\n' "$requested"
  fi
}

# Print the absolute path of directory DIR. When DIR cannot be entered
# (missing, not a directory, no permission) it is printed unchanged.
absolute_dir() {
  local dir=$1 resolved
  # CDPATH is cleared so `cd` neither searches it nor prints the target.
  if resolved=$(CDPATH='' cd -- "$dir" >/dev/null 2>&1 && pwd); then
    printf '%s\n' "$resolved"
  else
    printf '%s\n' "$dir"
  fi
}

progname=$(basename -- "$0")
verbose=0
list=0
extract=0
remove_str="md5"
con_str=5
# The thread count is handed to aria2c as -x, so the default is capped at
# what aria2c accepts even on machines with more cores.
threads_str=$(cap_aria2_connections "$(nproc)")
db_str=
outdir_str=$(pwd)

# use getopt and store the output into $OPTS
# note the use of -o for the short options, --long for the long name options
# and a : for any option that takes a parameter
if ! OPTS=$(getopt -o "ht:c:d:o:r:xlv" --long "help,threads:,con:,db:,outdir:,remove:,extract,list,verbose" -n "$progname" -- "$@"); then
  echo "Error in command line arguments." >&2
  usage
  exit 1
fi
# getopt emits a shell-quoted argument string; eval re-parses that quoting so
# the positional parameters are replaced by the normalized options.
eval set -- "$OPTS"
while true; do
  # uncomment the next line to see how shift is working
  # echo "\$1:\"$1\" \$2:\"$2\""
  case "$1" in
    -h | --help ) usage; exit ;;
    -t | --threads ) threads_str="$2"; shift 2 ;;
    -c | --con ) con_str="$2"; shift 2 ;;
    -d | --db ) db_str="$2"; shift 2 ;;
    -o | --outdir ) outdir_str="$2"; shift 2 ;;
    -r | --remove ) remove_str="$2"; shift 2 ;;
    -x | --extract ) extract=1; shift ;;
    -l | --list ) list=1; shift ;;
    -v | --verbose ) verbose=$((verbose + 1)); shift ;;
    -- ) shift; break ;;
    * ) break ;;
  esac
done

# A user-supplied thread count above the aria2c limit would make every
# download fail, so it is lowered with a warning.
capped_threads=$(cap_aria2_connections "$threads_str")
if [[ "$capped_threads" != "$threads_str" ]]; then
  echo "aria2c accepts at most 16 connections per server; using 16 instead of $threads_str." >&2
  threads_str=$capped_threads
fi

# The script later changes into the output directory while still using paths
# built from $outdir_str; a relative path would then point to the wrong place.
outdir_str=$(absolute_dir "$outdir_str")
# From here on: abort on a failing command (-e), on use of an unset variable
# (-u), and when any stage of a pipeline fails (pipefail).
set -euo pipefail
# Two or more -v flags switch on command tracing and dump the parsed settings.
# -ge (rather than -eq) keeps tracing on when -v is given more than twice,
# instead of silently disabling all verbosity.
if [ "$verbose" -ge 2 ]; then
  set -x
  # print out all the parameters we read in
  cat <<EOM
con=$con_str
threads=$threads_str
db=$db_str
outdir=$outdir_str
extract=$extract
remove=$remove_str
verbose=$verbose
list=$list
EOM
fi
# print list of databases

# Reduce an HTML directory listing read from stdin to the sorted, unique
# database names: every "<name>[.<volume>].tar.gz" token is found and
# everything from the first dot onwards is dropped.
extract_db_names() {
  # The dots are escaped so only a literal ".tar.gz" suffix qualifies.
  grep -Eo '[a-zA-Z0-9_.]+\.tar\.gz' \
    | sed -E 's/\..+$//' \
    | sort -u
}

if [ "${list:-0}" -eq 1 ]; then
  curl -s https://ftp.ncbi.nih.gov/blast/db/ | extract_db_names
  exit 0
fi
DB=$db_str
# check if database exists
if [ -z "$DB" ]; then
  echo "No database name was given, please rerun the program with '--db NAME' (use the '--list' flag to see available databases)" >&2
  usage
  exit 1
fi
# The listing is downloaded completely before it is searched. Piping curl
# straight into `grep -q` lets grep exit at the first match while curl is
# still writing; curl then fails on the closed pipe and, with pipefail on,
# a successful match is reported as a failure.
db_listing=$(curl -s https://ftp.ncbi.nih.gov/blast/db/) || db_listing=""
# Matches "<DB>" followed by an optional volume suffix (digits, '_' or '.')
# and a literal ".tar.gz.md5".
if ! grep -Eq "href=\"${DB}[0-9_.]*\.tar\.gz\.md5" <<< "$db_listing"; then
  echo "Database not found, please check your internet connection and rerun the program with '--list' flag to see available databases" >&2
  usage
  exit 1
fi
# remove existing database, archive and md5 files
# --remove selects which "<outdir>/<DB>.*" files are deleted before the
# download; "none" skips the step and any other value is a usage error.
if [ "$remove_str" != "none" ]; then
  if [ "$verbose" -eq 1 ]; then echo "Removing existing database files..." >&2 ; fi
  # The directory and database name are quoted so spaces or glob characters in
  # them are taken literally; only the trailing pattern is left to expand.
  # ${var:?} aborts rather than expanding to "/.*" if either value is empty.
  # Deletion is best effort: -f ignores missing files and `|| true` keeps a
  # failed removal from stopping the script.
  case "$remove_str" in
    "all" ) rm -f -- "${outdir_str:?}/${DB:?}".* || true ;;
    "md5tar" ) rm -f -- "${outdir_str:?}/${DB:?}".*tar.gz* || true ;;
    "md5" ) rm -f -- "${outdir_str:?}/${DB:?}".*md5 || true ;;
    "tar" ) rm -f -- "${outdir_str:?}/${DB:?}".*tar.gz || true ;;
    * ) echo "Error in command line arguments (not a valid option for --remove)." >&2 ; usage; exit 1 ;;
  esac
fi
# Download the checksum files and the archives of database $DB into
# $outdir_str.
#
# Files written in $outdir_str:
#   <DB>.md5.files  URLs of the *.tar.gz.md5 files
#   <DB>.md5.tmp    "<md5><TAB><outdir>/<archive>" lines for `md5sum -c`
#   <DB>.files      URLs of the *.tar.gz archives
ncbi_url="https://ftp.ncbi.nih.gov/blast/db/"
md5_list="$outdir_str/$DB.md5.files"
md5_sums="$outdir_str/$DB.md5.tmp"
tar_list="$outdir_str/$DB.files"

# One download of the directory listing serves both URL lists.
db_listing=$(curl -s "$ncbi_url") || db_listing=""

# Write the URLs of all "<DB><volume>.<suffix>" entries of the listing to a file.
# Arguments: $1 - ERE for the suffix after the volume part (e.g. 'tar\.gz')
#            $2 - file to write
#            $3 - file type named in the error message
# Exits with status 1 if the file cannot be written or ends up empty.
write_url_list() {
  local suffix_re=$1 out_file=$2 label=$3
  # Probe writability first so a failure yields the readable message below.
  if ! true > "$out_file"; then
    echo "The file '$out_file' could not be created, please check that the folder exists and you have writing permissions to it." >&2
    exit 1
  fi
  # `|| true`: no match must reach the empty-file check below rather than
  # abort the script through pipefail.
  { grep -Eo "href=\"${DB}[0-9_.]*\.${suffix_re}" <<< "$db_listing" || true; } \
    | sed 's/href="//' \
    | sort -u \
    | gawk -v BASE="$ncbi_url" '{printf "%s%s\n", BASE, $1}' > "$out_file"
  if [ ! -s "$out_file" ]; then
    echo "No $label files were found for database '$DB', please rerun with '--list' flag to see available databases (and make sure you can access the NCBI FTP site)." >&2
    exit 1
  fi
}

write_url_list 'tar\.gz\.md5' "$md5_list" "md5"
aria2c --allow-overwrite -x "$threads_str" -j "$con_str" -i "$md5_list" -d "$outdir_str"

# Read each downloaded .md5 file (named by the last URL component) and emit
# its checksum followed by the archive path inside the output directory.
while IFS= read -r md5_url; do
  gawk -v OUTDIR="$outdir_str" '{printf "%s\t%s/%s\n", $1, OUTDIR, $NF}' \
    < "$outdir_str/${md5_url##*/}"
done < "$md5_list" > "$md5_sums"

# The shorter suffix also matches the ".tar.gz" prefix of the .md5 entries;
# those duplicates collapse in `sort -u`.
write_url_list 'tar\.gz' "$tar_list" "tar.gz"
aria2c --allow-overwrite -x "$threads_str" -j "$con_str" -i "$tar_list" -d "$outdir_str"
if [ "$verbose" -eq 1 ]; then echo "Testing files integrity..." >&2 ; fi
# The working directory becomes the output directory and stays so for the
# rest of the script.
cd "$outdir_str/" || { echo "Cannot change into folder '$outdir_str'." >&2 ; exit 1 ; }
# Verify the archives against the collected checksums and resume the download
# while verification fails. The number of retries is bounded so that a file
# no retry can repair does not keep the script looping forever.
max_download_retries=5
download_retry=0
until md5sum -c "$outdir_str/$DB.md5.tmp"; do
  download_retry=$((download_retry + 1))
  if [ "$download_retry" -gt "$max_download_retries" ]; then
    echo "MD5 checksum still failing after $max_download_retries repeated downloads, giving up." >&2
    exit 1
  fi
  if [ "$verbose" -eq 1 ]; then echo "MD5 checksum failed, repeating download..." >&2 ; fi
  aria2c -c -x "$threads_str" -j "$con_str" -i "$outdir_str/$DB.files" -d "$outdir_str"
done
# With --extract: unpack the archives, delete downloaded files as selected by
# --remove, then validate the database with blastdbcheck. Without it, only
# point the user (at verbosity 1) to the downloaded archives.
if [ "$extract" -eq 1 ]; then
  if [ "$verbose" -eq 1 ]; then echo "Extracting '$DB.*.tar.gz' files..." >&2 ; fi
  # Map every URL of the download list to the local archive path and unpack
  # the archives in parallel; tar extracts into the current directory.
  gawk -F "/" -v OUTDIR="$outdir_str" '{printf "%s/%s\n", OUTDIR, $NF}' \
    < "$outdir_str/$DB.files" \
    | parallel "tar xzf {}"
  # remove existing database, archive and md5 files
  if [ "$remove_str" != "none" ]; then
    if [ "$verbose" -eq 1 ]; then echo "Removing downloaded '$outdir_str/$DB.*.tar.gz' files..." >&2 ; fi
    # Best-effort deletion; ${var:?} prevents the glob from being built on an
    # empty prefix.
    case "$remove_str" in
      "all" | "md5tar" ) rm -f -- "${outdir_str:?}/${DB:?}".*tar.gz* || true ;;
      "md5" ) rm -f -- "${outdir_str:?}/${DB:?}".*md5 || true ;;
      "tar" ) rm -f -- "${outdir_str:?}/${DB:?}".*tar.gz || true ;;
      * ) echo "Error in command line arguments (not a valid option for --remove)." >&2 ; usage; exit 1 ;;
    esac
  fi
  # Check integrity of the extracted files
  if [ "$verbose" -eq 1 ]; then echo "Checking the integrity of '$outdir_str/$DB'..." >&2 ; fi
  # Multi-volume databases ship an alias file (.pal/.nal). Single-volume ones
  # have none, so their index file (.pin/.nin) is accepted as well.
  db_found=0
  for db_ext in pal nal pin nin; do
    if [ -f "$outdir_str/$DB.$db_ext" ]; then
      db_found=1
      break
    fi
  done
  if [ "$db_found" -eq 1 ]; then
    if blastdbcheck -db "$outdir_str/$DB" ; then
      echo "'$DB' database (in folder $outdir_str) is ready for use in BLAST!" >&2
    else
      echo "'$DB' database (in folder $outdir_str) could not be extracted properly, please check that you have writing permissions and enough disk space and try again..." >&2
      if [ "$verbose" -eq 1 ]; then echo "Removing temporary files..." >&2 ; fi
      rm -- "$outdir_str/$DB.files" "$outdir_str/$DB".md5.*
      exit 1
    fi
  else
    echo "'$DB' database (in folder $outdir_str) could not be extracted properly, please check that you have writing permissions and enough disk space and try again..." >&2
    exit 1
  fi
else
  if [ "$verbose" -eq 1 ]; then echo "'$DB' database archive is downloaded to '$outdir_str/$DB.*.tar.gz'. Please uncompress the files before use in BLAST!" >&2 ; fi
fi
# Delete the URL lists and the checksum work files, then finish successfully.
if [ "$verbose" -eq 1 ]; then echo "Removing temporary files..." >&2 ; fi
# Directory and database name are quoted so they are taken literally; only
# the trailing ".md5.*" pattern is expanded by the shell.
rm -- "$outdir_str/$DB.files" "$outdir_str/$DB".md5.*
if [ "$verbose" -eq 1 ]; then echo "Done!" >&2 ; fi
exit 0
# Conda recipe that installs the download script as `download-blast-db`.
package:
  name: download-blast-db
  version: "1.0.1"

source:
  url: https://gist.githubusercontent.com/IdoBar/326f866bf02476fbf96eb78e46a7fffb/raw/download_blast_db.sh
  sha256: fee275a34f8cfd36b8f87e5dcf7cfe2c9ea75ea72639ccb66248472d06272f48

build:
  number: 0
  # The build only copies the script into the environment's bin directory.
  script: |
    mkdir -p "$PREFIX/bin"
    cp download_blast_db.sh "$PREFIX/bin/download-blast-db"
    chmod +x "$PREFIX/bin/download-blast-db"

requirements:
  build:
    - blast
    - parallel
    - aria2
  run:
    - blast
    - parallel
    - aria2
    # The script also invokes gawk and curl directly.
    - gawk
    - curl

about:
  home: https://gist.github.com/IdoBar/326f866bf02476fbf96eb78e46a7fffb
  summary: "Fast and parallel download of NCBI BLAST databases using bash, parallel and aria2c."
  license: "MIT"

extra:
  recipe-maintainers:
    - idobar
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment