Last active
December 1, 2025 05:35
-
-
Save IdoBar/326f866bf02476fbf96eb78e46a7fffb to your computer and use it in GitHub Desktop.
Fast and parallel download of NCBI Blast databases with bash and aria2c
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
#!/usr/bin/env bash

#######################################
# Print the command-line help text.
# Globals:
#   progname (read) - script name shown on the "Usage:" line; when it is
#                     unset or empty the basename of $0 is used instead, so
#                     the function is safe to call under `set -u`.
# Arguments:
#   None
# Outputs:
#   Writes the help text to stdout.
#######################################
usage() {
  # Unquoted delimiter on purpose: the program name must be expanded.
  cat << HEREDOC
Usage: ${progname:-${0##*/}} [--threads NUM] [--con NUM] [--db STR] [--outdir STR] [--remove STR] [--verbose] [--list] [--extract]

optional arguments:
  -h, --help           show this help message and exit
  -t, --threads NUM    number of threads to use (default: all available threads); passed to 'aria2c -x', which accepts 1-16
  -c, --con NUM        number of concurrent connections (default: 5); passed to 'aria2c -j'
  -d, --db STR         name of an NCBI database to download
  -o, --outdir STR     path of database (default: current folder)
  -r, --remove STR     remove existing database, archive and md5 files (options are "all", "md5", "tar", "md5tar", "none". default: "md5")

boolean flags (default to false):
  -x, --extract        uncompress database (will also remove *.tar.gz files)
  -l, --list           list available databases
  -v, --verbose        increase the verbosity of the bash script (can be specified up to twice)
HEREDOC
}
# ---------------------------------------------------------------------------
# Defaults and command-line parsing.
#
# Sets: progname, verbose, list, extract, remove_str, con_str, threads_str,
#       db_str, outdir_str. On return the positional parameters hold only
#       the non-option arguments.
# ---------------------------------------------------------------------------

# initialize variables
progname=${0##*/} # basename of $0; safe for paths containing spaces
verbose=0
list=0
extract=0
remove_str="md5"
con_str=5
# default to every available core; fall back gracefully where nproc is missing
threads_str=$(nproc 2>/dev/null || getconf _NPROCESSORS_ONLN 2>/dev/null || echo 1)
db_str=
outdir_str=$(pwd)

# use getopt and store the output into $OPTS
# note the use of -o for the short options, --long for the long name options
# and a : for any option that takes a parameter
if ! OPTS=$(getopt -o "ht:c:d:o:r:xlv" \
  --long "help,threads:,con:,db:,outdir:,remove:,extract,list,verbose" \
  -n "$progname" -- "$@"); then
  echo "Error in command line arguments." >&2
  usage
  exit 1
fi

# getopt emits a shell-quoted, normalised argument list; eval is the
# documented way to load it back into the positional parameters.
eval set -- "$OPTS"

while [ "$#" -gt 0 ]; do
  # uncomment the next line to see how shift is working
  # echo "\$1:\"$1\" \$2:\"$2\""
  case "$1" in
    -h | --help)
      usage
      exit
      ;;
    -t | --threads)
      threads_str="$2"
      shift 2
      ;;
    -c | --con)
      con_str="$2"
      shift 2
      ;;
    -d | --db)
      db_str="$2"
      shift 2
      ;;
    -o | --outdir)
      outdir_str="$2"
      shift 2
      ;;
    -r | --remove)
      remove_str="$2"
      shift 2
      ;;
    -x | --extract)
      extract=1
      shift
      ;;
    -l | --list)
      list=1
      shift
      ;;
    -v | --verbose)
      verbose=$((verbose + 1))
      shift
      ;;
    --)
      shift
      break
      ;;
    *)
      break
      ;;
  esac
done
# ---------------------------------------------------------------------------
# Main flow.
#
# Reads the option variables set by the parsing section (verbose, list,
# extract, remove_str, con_str, threads_str, db_str, outdir_str) and then:
#   1. optionally lists the databases offered by the NCBI index and exits;
#   2. validates the options and resolves the output folder to an absolute
#      path (the script later changes directory into it);
#   3. downloads the database's *.tar.gz.md5 files and *.tar.gz archives
#      with aria2c;
#   4. verifies the archives with md5sum, re-downloading only the files that
#      failed, a bounded number of times;
#   5. optionally extracts the archives and runs blastdbcheck.
# Temporary list files are removed on every exit path.
# ---------------------------------------------------------------------------
set -euo pipefail

readonly BLAST_DB_URL="https://ftp.ncbi.nih.gov/blast/db/"
# aria2c rejects -x (--max-connection-per-server) values outside 1..16
readonly ARIA2_MAX_CONN_PER_SERVER=16
# number of checksum passes before the script gives up
readonly MAX_VERIFY_ATTEMPTS=3

# Print an error message to stderr and abort.
die() {
  printf '%s\n' "$*" >&2
  exit 1
}

# Print a progress message to stderr when at least one -v was given.
log() {
  if [ "$verbose" -ge 1 ]; then
    printf '%s\n' "$*" >&2
  fi
}

# Abort unless every named program is available on PATH.
require_tools() {
  local tool
  for tool in "$@"; do
    command -v "$tool" > /dev/null 2>&1 \
      || die "Required program '$tool' was not found in PATH."
  done
}

if [ "$verbose" -ge 2 ]; then
  set -x
  # print out all the parameters we read in
  cat << EOM
con=$con_str
threads=$threads_str
db=$db_str
outdir=$outdir_str
extract=$extract
remove=$remove_str
verbose=$verbose
list=$list
EOM
fi

require_tools curl

# print list of databases
if [ "$list" -eq 1 ]; then
  listing=$(curl -fsS "$BLAST_DB_URL") \
    || die "Could not retrieve the database list from $BLAST_DB_URL, please check your internet connection."
  grep -Eo '[a-zA-Z0-9_.]+\.tar\.gz' <<< "$listing" | sed -E 's/\..+$//' | sort -u
  exit 0
fi

# --- validate options before touching the network or the disk --------------
DB=$db_str
db_name_re='^[A-Za-z0-9_.-]+$'
if [ -z "$DB" ]; then
  echo "No database name was given, please use '--db STR' (or '--list' to see available databases)." >&2
  usage
  exit 1
fi
# The name is interpolated into a regular expression and into file globs.
[[ "$DB" =~ $db_name_re ]] \
  || die "Invalid database name '$DB', please rerun with '--list' flag to see available databases."

case "$remove_str" in
  all | md5 | tar | md5tar | none) ;;
  *)
    echo "Error in command line arguments (not a valid option for --remove)." >&2
    usage
    exit 1
    ;;
esac

case "$threads_str" in
  '' | *[!0-9]*) die "--threads expects a positive integer (got '$threads_str')." ;;
esac
case "$con_str" in
  '' | *[!0-9]*) die "--con expects a positive integer (got '$con_str')." ;;
esac

# Clamp the per-download connection count to what aria2c accepts; the
# default (all cores) is larger than 16 on big machines.
aria_conn=$threads_str
if [ "$aria_conn" -gt "$ARIA2_MAX_CONN_PER_SERVER" ]; then
  aria_conn=$ARIA2_MAX_CONN_PER_SERVER
elif [ "$aria_conn" -lt 1 ]; then
  aria_conn=1
fi

# Resolve to an absolute path: the script changes directory into the output
# folder later, which would invalidate a relative --outdir.
if [ ! -d "$outdir_str" ] || [ ! -w "$outdir_str" ]; then
  die "Cannot write to '$outdir_str', please check that the folder exists and you have writing permissions to it."
fi
outdir=$(CDPATH='' cd -- "$outdir_str" && pwd) \
  || die "Cannot access folder '$outdir_str'."

require_tools gawk aria2c md5sum
if [ "$extract" -eq 1 ]; then
  require_tools parallel tar blastdbcheck
fi

# --- check that the database exists (the index is fetched only once) -------
listing=$(curl -fsS "$BLAST_DB_URL") \
  || die "Could not reach $BLAST_DB_URL, please check your internet connection."
db_re="${DB//./\\.}" # dots in the name must match literally
if ! grep -Eq "href=\"${db_re}[0-9_.]*\.tar\.gz\.md5" <<< "$listing"; then
  echo "Database not found, please check your internet connection and rerun the program with '--list' flag to see available databases" >&2
  usage
  exit 1
fi

# remove existing database, archive and md5 files
if [ "$remove_str" != "none" ]; then
  log "Removing existing database files..."
  # best effort: nothing to remove is not an error
  case "$remove_str" in
    all) rm -f -- "$outdir/$DB".* || true ;;
    md5tar) rm -f -- "$outdir/$DB".*tar.gz* || true ;;
    md5) rm -f -- "$outdir/$DB".*md5 || true ;;
    tar) rm -f -- "$outdir/$DB".*tar.gz || true ;;
  esac
fi

# --- temporary list files, removed on every exit path ----------------------
md5_urls="$outdir/$DB.md5.files" # URLs of the *.tar.gz.md5 files
md5_list="$outdir/$DB.md5.tmp"   # "<hash>  <absolute archive path>" lines
tar_urls="$outdir/$DB.files"     # URLs of the *.tar.gz archives

cleanup() {
  log "Removing temporary files..."
  rm -f -- "$tar_urls" "$md5_urls" "$md5_list"
}
trap cleanup EXIT

# --- download the md5 files ------------------------------------------------
{ grep -Eo "href=\"${db_re}[0-9_.]*\.tar\.gz\.md5" <<< "$listing" || true; } \
  | sed 's/^href="//' \
  | sort -u \
  | gawk -v base="$BLAST_DB_URL" '{ printf "%s%s\n", base, $1 }' > "$md5_urls"
if [ ! -s "$md5_urls" ]; then
  die "No md5 files were found for database '$DB', please rerun with '--list' flag to see available databases (and make sure you can access the NCBI FTP site)."
fi

aria2c --allow-overwrite=true -x "$aria_conn" -j "$con_str" -i "$md5_urls" -d "$outdir"

# Merge the downloaded md5 files into one checksum list whose file names are
# absolute paths, so md5sum -c works regardless of the current directory.
md5_files=()
while IFS= read -r url; do
  md5_files+=("$outdir/${url##*/}")
done < "$md5_urls"
OUTDIR="$outdir" gawk '{ printf "%s  %s/%s\n", $1, ENVIRON["OUTDIR"], $NF }' \
  "${md5_files[@]}" > "$md5_list"

# --- download the archives -------------------------------------------------
# No closing quote in the pattern: md5 hrefs contribute the same prefix and
# sort -u collapses the duplicates.
{ grep -Eo "href=\"${db_re}[0-9_.]*\.tar\.gz" <<< "$listing" || true; } \
  | sed 's/^href="//' \
  | sort -u \
  | gawk -v base="$BLAST_DB_URL" '{ printf "%s%s\n", base, $1 }' > "$tar_urls"
if [ ! -s "$tar_urls" ]; then
  die "No tar.gz files were found for database '$DB', please rerun with '--list' flag to see available databases (and make sure you can access the NCBI FTP site)."
fi

# A failed transfer is not fatal here: the checksum pass below decides which
# files have to be fetched again.
aria2c --allow-overwrite=true -x "$aria_conn" -j "$con_str" -i "$tar_urls" -d "$outdir" \
  || echo "Warning: aria2c reported download errors, verifying checksums..." >&2

# --- verify, re-downloading only the files that failed ---------------------
log "Testing files integrity..."
cd -- "$outdir"
attempt=1
while :; do
  verify_status=0
  # LC_ALL=C keeps the "FAILED" marker untranslated so it can be parsed
  verify_report=$(LC_ALL=C md5sum -c "$md5_list") || verify_status=$?
  printf '%s\n' "$verify_report"
  if [ "$verify_status" -eq 0 ]; then
    break
  fi
  if [ "$attempt" -ge "$MAX_VERIFY_ATTEMPTS" ]; then
    die "MD5 checksum still failing after $MAX_VERIFY_ATTEMPTS attempts, giving up."
  fi
  attempt=$((attempt + 1))
  log "MD5 checksum failed, repeating download..."
  retry_urls=$(printf '%s\n' "$verify_report" \
    | sed -n "s|^.*/\([^/]*\): FAILED.*\$|${BLAST_DB_URL}\1|p")
  if [ -z "$retry_urls" ]; then
    die "md5sum failed without naming a file, cannot retry."
  fi
  # --allow-overwrite replaces a corrupt but complete file, which
  # 'aria2c -c' would have treated as already finished.
  printf '%s\n' "$retry_urls" \
    | aria2c --allow-overwrite=true -x "$aria_conn" -j "$con_str" -d "$outdir" -i - \
    || echo "Warning: aria2c reported download errors, verifying checksums..." >&2
done

if [ "$extract" -eq 1 ]; then
  log "Extracting '$DB.*.tar.gz' files..."
  # the current directory is $outdir, so bare archive names are enough
  sed 's|.*/||' "$tar_urls" | parallel "tar xzf {}"

  # remove existing database, archive and md5 files
  if [ "$remove_str" != "none" ]; then
    log "Removing downloaded '$outdir/$DB.*.tar.gz' files..."
    case "$remove_str" in
      all | md5tar) rm -f -- "$outdir/$DB".*tar.gz* || true ;;
      md5) rm -f -- "$outdir/$DB".*md5 || true ;;
      tar) rm -f -- "$outdir/$DB".*tar.gz || true ;;
    esac
  fi

  # Check integrity of the extracted files
  log "Checking the integrity of '$outdir/$DB'..."
  # Multi-volume databases ship an alias file (.pal/.nal); single-volume
  # ones only have the index file (.pin/.nin).
  db_found=0
  for ext in pal nal pin nin; do
    if [ -f "$outdir/$DB.$ext" ]; then
      db_found=1
      break
    fi
  done
  if [ "$db_found" -eq 1 ] && blastdbcheck -db "$outdir/$DB"; then
    echo "'$DB' database (in folder $outdir) is ready for use in BLAST!" >&2
  else
    echo "'$DB' database (in folder $outdir) could not be extracted properly, please check that you have writing permissions and enough disk space and try again..." >&2
    exit 1
  fi
else
  log "'$DB' database archive is downloaded to '$outdir/$DB.*.tar.gz'. Please uncompress the files before use in BLAST!"
fi

cleanup
trap - EXIT
log "Done!"
exit 0
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
# conda recipe that installs the download_blast_db.sh script as the
# `download-blast-db` command.
package:
  name: download-blast-db
  version: "1.0.1"

source:
  url: https://gist.githubusercontent.com/IdoBar/326f866bf02476fbf96eb78e46a7fffb/raw/download_blast_db.sh
  # NOTE(review): the URL above has no revision pinned, so this checksum must
  # be refreshed whenever the gist changes.
  sha256: fee275a34f8cfd36b8f87e5dcf7cfe2c9ea75ea72639ccb66248472d06272f48

build:
  number: 0
  # the package is a single shell script: copy it into bin/ and make it executable
  script: |
    mkdir -p $PREFIX/bin
    cp download_blast_db.sh $PREFIX/bin/download-blast-db
    chmod +x $PREFIX/bin/download-blast-db

requirements:
  build:
    - blast
    - parallel
    - aria2
  run:
    - blast
    - parallel
    - aria2
    # the script also calls curl and gawk directly
    - curl
    - gawk

about:
  home: https://gist.github.com/IdoBar/326f866bf02476fbf96eb78e46a7fffb
  summary: "Fast and parallel download of NCBI BLAST databases using bash, parallel and aria2c."
  license: "MIT"

extra:
  recipe-maintainers:
    - idobar
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment