-
-
Save DannyArends/04d87f5590090dfe0dc6b42e5e1bbe15 to your computer and use it in GitHub Desktop.
| # Add yourself to the sudo group | |
| su - | |
| usermod -aG sudo danny | |
| exit | |
| # Install the virtual box guest additions | |
| cd /media/cdrom0 | |
| sudo sh ./VBoxLinuxAdditions.run | |
| # Install R and deps | |
| sudo apt install r-base | |
| sudo nano /etc/apt/sources.list | |
| sudo apt install libssl-dev | |
| sudo apt install libxml2-dev | |
| sudo apt install libcurl4-openssl-dev | |
| # Install R packages | |
| sudo R | |
| if (!require("BiocManager", quietly = TRUE)) | |
| install.packages("BiocManager") | |
| BiocManager::install("GenomicFeatures") | |
| BiocManager::install("preprocessCore") | |
| q("no") | |
| # Install Trimmomatic | |
| sudo apt install git | |
| sudo apt install ant | |
| mkdir software | |
| cd software | |
| git clone https://github.com/usadellab/Trimmomatic.git | |
| cd Trimmomatic | |
| ant | |
| # Install STAR | |
| git clone https://github.com/alexdobin/STAR.git | |
| cd STAR/source | |
| # [UPDATE 2024] - Checkout an older version of STAR | |
| git checkout ee50484 | |
| make | |
| # Install picard | |
| git clone https://github.com/broadinstitute/picard.git | |
| cd picard | |
| # [UPDATED 2024] - Checkout an older version of PICARD | |
| git checkout 5db8017 | |
| ./gradlew shadowJar | |
| # Install HTSlib / samtools / bcftools | |
| sudo apt install autoconf | |
| git clone https://github.com/samtools/htslib.git | |
| git clone https://github.com/samtools/samtools.git | |
| git clone https://github.com/samtools/bcftools.git | |
| # Install HTSlib | |
| cd htslib | |
| git submodule update --init --recursive | |
| autoreconf -i | |
| ./configure | |
| make | |
| # Install samtools | |
| cd samtools | |
| autoheader | |
| autoconf -Wno-syntax | |
| ./configure | |
| make | |
| # Install bcftools | |
| cd bcftools | |
| autoheader | |
| autoconf -Wno-syntax | |
| ./configure | |
| make | |
| # GATK install | |
| wget https://github.com/broadinstitute/gatk/releases/download/4.2.6.1/gatk-4.2.6.1.zip | |
| unzip gatk-4.2.6.1.zip | |
| # SRA | |
| wget https://ftp-trace.ncbi.nlm.nih.gov/sra/sdk/3.0.0/sratoolkit.3.0.0-centos_linux64-cloud.tar.gz | |
| mkdir sratoolkit | |
| tar -xzf sratoolkit.3.0.0-centos_linux64-cloud.tar.gz –C sratoolkit | |
| ./sratoolkit/usr/local/ncbi/sra-tools/bin/vdb-config --interactive | |
| # Make symbolic links | |
| mkdir bin | |
| cd bin | |
| ln -s /home/danny/software/STAR/source/STAR STAR | |
| ln -s /home/danny/software/htslib/bgzip bgzip | |
| ln -s /home/danny/software/samtools/samtools samtools | |
| ln -s /home/danny/software/bcftools/bcftools bcftools | |
| ln -s /home/danny/software/sratoolkit/usr/local/ncbi/sra-tools/bin/fasterq-dump fasterq-dump | |
| #Update the bashrc file | |
| nano ~/.bashrc | |
| # Add at the end: | |
| export PATH="$HOME/bin:$PATH" | |
| # Additional link: | |
| ln -s /home/danny/software/htslib/tabix tabix | |
| # Additional R package: | |
| sudo R | |
| install.packages("ggplot2") | |
| install.packages("gplots") | |
| install.packages("gsalib") | |
| q("no") | |
| wget https://data.broadinstitute.org/igv/projects/downloads/2.14/IGV_Linux_2.14.1_WithJava.zip | |
| unzip IGV_Linux_2.14.1_WithJava.zip |
| # | |
| # Download Saccharomyces Cerevisiae genome | |
| # copyright (c) 2022 - Danny Arends | |
| # | |
| uri <- "ftp.ensembl.org/pub/release-108/fasta/saccharomyces_cerevisiae/dna/" | |
| base <- "Saccharomyces_cerevisiae.R64-1-1.dna.chromosome." | |
| chrs <- c(as.character(as.roman(seq(1:16))), "Mito") | |
| # Download | |
| for(chr in chrs){ | |
| fname <- paste0(base, chr, ".fa.gz") | |
| # Download command | |
| cmd <- paste0("wget ", uri, fname) | |
| #cat(cmd, "\n") | |
| system(cmd) | |
| } | |
| # Create an empty the file | |
| cat("", file = "Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
| for(chr in chrs){ | |
| fname <- paste0(base, chr, ".fa.gz") | |
| # Extract and merge into a fast file | |
| cmd <- paste0("zcat ", fname, " >> Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
| #cat(cmd, "\n") | |
| system(cmd) | |
| } | |
| # Compress the fasta file using bgzip (keep original) | |
| cmd <- paste0("bgzip -k Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa") | |
| #cat(cmd, "\n") | |
| system(cmd) | |
| # Delete the chromosomes | |
| for(chr in chrs){ | |
| fname <- paste0(base, chr, ".fa.gz") | |
| # Extract and merge into a fast file | |
| cmd <- paste0("rm ", fname) | |
| #cat(cmd, "\n") | |
| system(cmd) | |
| } |
| # Get the reference transcriptome and GTF for STAR | |
| wget http://ftp.ensembl.org/pub/release-108/gtf/saccharomyces_cerevisiae/Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
| gunzip -k Saccharomyces_cerevisiae.R64-1-1.108.gtf.gz | |
| wget http://ftp.ensembl.org/pub/release-108/variation/vcf/saccharomyces_cerevisiae/saccharomyces_cerevisiae.vcf.gz | |
| # Index the genome using samtools | |
| samtools faidx Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz | |
| # Generate genome/transcriptome index using STAR | |
| STAR --runThreadN 2 --runMode genomeGenerate \ | |
| --genomeDir ~/genome/STAR \ | |
| --genomeSAindexNbases 10 \ | |
| --sjdbGTFfile ~/genome/Saccharomyces_cerevisiae.R64-1-1.108.gtf \ | |
| --genomeFastaFiles ~/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa | |
| # Get the reference SNPs and index using tabix | |
| tabix saccharomyces_cerevisiae.vcf.gz | |
| # Index the genome using picard | |
| java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar CreateSequenceDictionary \ | |
| -R Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz |
| # | |
| # Align SRA reads to the Saccharomyces Cerevisiae genome | |
| # copyright (c) 2022 - Danny Arends | |
| # | |
| execute <- function(x, outputfile = NA, intern = FALSE, quitOnError = FALSE){ | |
| if(!is.na(outputfile) && file.exists(outputfile)){ | |
| cat("Output for step exists, skipping this step\n"); | |
| invisible("") | |
| } | |
| cat("----", x, "\n"); res <- system(x, intern = intern); cat(">>>>", res[1], "\n") | |
| if(res[1] >= 1){ | |
| cat("Error external process did not finish\n\n"); | |
| if(quitOnError) q("no") | |
| } | |
| } | |
| input.dir <- "/home/danny/data/raw" | |
| input.base <- "SRR13978643" #Get from the command line | |
| output.dir <- paste0("/home/danny/data/output/", input.base,".aln") | |
| genome.path <- "/home/danny/genome/STAR" | |
| ref.fa.gz <- "/home/danny/genome/Saccharomyces_cerevisiae.R64-1-1.dna.primary_assembly.fa.gz" | |
| ref.snps <- "/home/danny/genome/saccharomyces_cerevisiae.vcf.gz" | |
| # Create an output folder | |
| if(!file.exists(input.dir)){ dir.create(input.dir, recursive = TRUE) } | |
| if(!file.exists(output.dir)){ dir.create(output.dir, recursive = TRUE) } | |
| # STEP 0 - SRA Download and Compress | |
| setwd(input.dir) | |
| execute(paste0("fasterq-dump -p --split-files ", input.base), paste0(input.base, "_1.fastq")) | |
| execute(paste0("bgzip ", input.base, "_1.fastq"), paste0(input.base, "_1.fastq.gz")) | |
| execute(paste0("bgzip ", input.base, "_2.fastq"), paste0(input.base, "_2.fastq.gz")) | |
| # STEP 1 - READ Trimming | |
| trim.files <- c( | |
| paste0(input.dir, "/", input.base,"_1.fastq.gz"), | |
| paste0(input.dir, "/", input.base,"_2.fastq.gz"), | |
| paste0(output.dir, "/", input.base,"_1.P.fastq.gz"), | |
| paste0(output.dir, "/", input.base,"_1.U.fastq.gz"), | |
| paste0(output.dir, "/", input.base,"_2.P.fastq.gz"), | |
| paste0(output.dir, "/", input.base,"_2.U.fastq.gz") | |
| ) | |
| trim.path <- "/home/danny/software/Trimmomatic" | |
| trim.exec <- paste0("java -jar ", trim.path, "/dist/jar/trimmomatic-0.40-rc1.jar") | |
| trim.opts <- paste0("ILLUMINACLIP:",trim.path,"/adapters/TruSeq3-PE-2.fa:2:30:10") | |
| trim.opts <- paste0(trim.opts, " LEADING:3 TRAILING:3 SLIDINGWINDOW:4:15 MINLEN:36") | |
| trim.cmd <- paste0(trim.exec, " PE ", paste0(trim.files, collapse=" "), " ", trim.opts) | |
| execute(trim.cmd, trim.files[3]) | |
| # STEP 1.1 - UNZIP for STAR | |
| execute(paste0("gunzip -k ", trim.files[3]), gsub(".fastq.gz", ".fastq", trim.files[3])) | |
| execute(paste0("gunzip -k ", trim.files[5]), gsub(".fastq.gz", ".fastq", trim.files[5])) | |
| files.in <- gsub(".fastq.gz", ".fastq", trim.files[c(3,5)]) | |
| # STEP 2 - Alignment using STAR | |
| star.outbase <- paste0(output.dir, "/", input.base) | |
| star.bam <- paste0(star.outbase, "Aligned.sortedByCoord.out.bam") | |
| star.exec <- "STAR --runMode alignReads" | |
| star.opts <- paste0("--genomeDir ", genome.path, " --outSAMtype BAM SortedByCoordinate") | |
| star.in <- paste0("--readFilesIn ", paste0(files.in, collapse=" ")) | |
| star.out <- paste0("--outFileNamePrefix ", star.outbase) | |
| star.cmd <- paste0(star.exec, " ", star.in, " ", star.opts, " ", star.out) | |
| execute(star.cmd, star.bam) | |
| # STEP 2.1 - Create a samtools index | |
| execute(paste0("samtools index ", star.bam), paste0(star.bam, ".bai")) | |
| # STEP 2.2 - Create mapping and coverage statistics | |
| execute(paste0("samtools flagstats ", star.bam)) | |
| execute(paste0("samtools coverage ", star.bam)) | |
| #STEP 3 - Remove duplicate reads using picard tools | |
| p.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.out.bam") | |
| metrics.out <- paste0(star.outbase, "_metrics.txt") | |
| p.exec <- "java -Xmx4g -jar /home/danny/software/picard/build/libs/picard.jar" | |
| p.in <- paste0("-I ", star.bam) | |
| p.out <- paste0("-O ", p.bam, " -M ", metrics.out) | |
| p.opts <- paste0("--REMOVE_DUPLICATES true") | |
| p.cmd <- paste0(p.exec, " MarkDuplicates ", p.opts," ", p.in, " ", p.out) | |
| execute(p.cmd, p.bam) | |
| # STEP 3.1 - Create a samtools index | |
| execute(paste0("samtools index ", p.bam), paste0(p.bam, ".bai")) | |
| # STEP 3.2 - Create mapping and coverage statistics | |
| execute(paste0("samtools flagstats ", p.bam)) | |
| execute(paste0("samtools coverage ", p.bam)) | |
| # STEP 4 - Add read group (1) and sample run, library, and name | |
| rg.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.out.bam") | |
| rg.opts <- paste0("-PL ILLUMINA -PU run -LB ", gsub("SRR", "", input.base), " -SM ", input.base) | |
| p.cmd <- paste0(p.exec, " AddOrReplaceReadGroups -I ", p.bam, " -O ", rg.bam, " ", rg.opts) | |
| execute(p.cmd) | |
| # STEP 4.1 - Create a samtools index | |
| execute(paste0("samtools index ", rg.bam), paste0(rg.bam, ".bai")) | |
| # STEP 5 - GATK prep | |
| gatk.exec <- "java -Xmx4g -jar /home/danny/software/gatk-4.2.6.1/gatk-package-4.2.6.1-local.jar" | |
| gatk.opts <- paste0("-R ", ref.fa.gz, " --known-sites ", ref.snps) | |
| # STEP 5.1 - GATK BaseRecalibrator | |
| gatk.cov1 <- paste0(star.outbase, "_cov1.txt") | |
| gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", rg.bam, " -O ", gatk.cov1) | |
| execute(gatk.cmd, gatk.cov1) | |
| # STEP 5.2 - GATK ApplyBQSR | |
| recal.bam <- paste0(star.outbase, "Aligned.sortedByCoord.RD.RG.RC.out.bam") | |
| gatk.cmd <- paste0(gatk.exec, " ApplyBQSR -R ", ref.fa.gz, " -bqsr ", gatk.cov1, " -I ", rg.bam, " -O ", recal.bam) | |
| execute(gatk.cmd, recal.bam) | |
| # STEP 5.3 - GATK BaseRecalibrator | |
| gatk.cov2 <- paste0(star.outbase, "_cov2.txt") | |
| gatk.cmd <- paste0(gatk.exec, " BaseRecalibrator ", gatk.opts, " -I ", recal.bam, " -O ", gatk.cov2) | |
| execute(gatk.cmd, gatk.cov2) | |
| # STEP 5.4 - GATK AnalyzeCovariates | |
| recal.plot <- paste0(star.outbase, "AnalyzeCovariates.pdf") | |
| gatk.cmd <- paste0(gatk.exec, " AnalyzeCovariates -before ", gatk.cov1, " -after ", gatk.cov2, " -plots ", recal.plot) | |
| execute(gatk.cmd) | |
| # STEP 6 - Index the recalibrated bam files | |
| execute(paste0("samtools index ", recal.bam), paste0(recal.bam, ".bai")) | |
| # STEP 6.1 - Create mapping and coverage statistics | |
| execute(paste0("samtools flagstats ", recal.bam)) | |
| execute(paste0("samtools coverage ", recal.bam)) | |
| q("no") |
Hi Danny,
I am currently a master's student in bioinformatics and have watched all your RNA-seq analysis videos. I found them incredibly insightful and greatly appreciate the effort you put into them. I have a question regarding RNA-seq data analysis: if you have any Linux command codes related to the analysis, could you please upload them to your GitHub repository? I would like to practice on my Linux machine.
Hi Danny, I am currently a master's student in bioinformatics and have watched all your RNA-seq analysis videos. I found them incredibly insightful and greatly appreciate the effort you put into them. I have a question regarding RNA-seq data analysis: if you have any Linux command codes related to the analysis, could you please upload them to your GitHub repository? I would like to practice on my Linux machine.
Hey @aberakenea, the shell commands generated by R, are specific to Linux (the sh bash). That is why we run them in a Debian virtual machine under Windows. The scripts should work without much modification on e.g. Ubuntu or CentOS as long as the shell used to run them is the bash / sh shell.
Hello,
when I do:
git clone https://github.com/broadinstitute/picard.gitI see:
then I do:
cd picard/git checkout 5db8017and afterwards see:
then I do:
./gradlew shadowJarand see:
How to fix this?
Thank you