allgenesconsidered · November 22, 2017 17:29
diff --git a/1_bwa_mem_example_COMMENTED.sh b/1_bwa_mem_example_COMMENTED.sh
 #!/bin/bash

 cd /data/home/molvera/wgs-patient-data # Change the directory to my project directory

 REF=/data/home/molvera/opt/hg38/Homo_sapiens_assembly38.fasta # Create a variable assigning REF to the absolute location 
 # of my .fasta reference file. 

 bwa index $REF # The first part of BWA, which indexes your reference file. You only need to do this once per fasta reference.

 for samp in "BT_P1_S2" "BT_P3_S9" "BT_P6_S7" "BT_P7_S6" "BT_P8_S3" # For loop through my 5 samples. 
 do # This is the start of the loop, analogous to '{' in R. 

 echo "[NEW SAMPLE: $samp]" # The echo command just prints the input to the screen. This is a common debuging tactic, since
 # some bash software does not report which sample it's handling during an error message or during normal outputs. It's nice
 # to have incase only one sample has problems. You can also include 'echo's inbetween bash commands to debug specific
 # programs. 

 rg_string='@RG\tID:'$samp'\.1\tSM:'$samp'\tPL:illumina\tLB:1\tPU:1' # I've gone ahead and taken out the string costruction
 # in bwa mem and placed it in the variable rg_string. For my convention, I like having global variables (ie variables
 # avalible to anywhere in the progam) in all caps (like REF) and I reserve lowercase for local variables (ie ones that are
 # only accessible in specific parts of the program, such as the 'for loop' we're in now). 
 #
 # Anyways, there is a difference between single and double quotes in bash, see this:
 # https://stackoverflow.com/questions/6697753/difference-between-single-and-double-quotes-in-bash
 # What that basically means is that we cannot call variables in between single quotes, but we can for double quotes. We need
 # to use single quotes, because we are specifically telling the stirng to make tabs (\t) between each variable (the two
 # capital letters, followed by the colon). So I've taken the variable names out of the quotes, and bash will be smart enough
 # to concatinate them together. So during my first pass through the loop, 'rg_string' should look like this:
 # '@RG\tID:BT_P1_S2.1\tSM:BT_P1_S2\tPL:illumina\tLB:1\tPU:1', which will produce a read group line like this:
 # @RG  ID:BT_P1_S2.1 SM:BT_P1_S2 PL:illumina LB:1  PU:1.

 bwa mem -M -R $rg_string $REF ./fastq/$samp\_R1_001.fastq.gz ./fastq/$samp\_R2_001.fastq.gz > ./sam/$samp\.sam
 # Here is the meat of the script. This takes in the indexed REF fasta, and one or two fastq files (or fastq.gz), and
 # maps them to the refeence. -M is done to improve output compatability with Picard, which is a tool we will use to detect
 # PCR duplicates (which there shouldn't be any), and -R assigns the readgroup string to each read.
 #
 # What about the '>' at the end. A lot of bash programs have something called STDOUT (standard out), which tells the 
 # software to return the output to the command line unless otherwise commanded. This is important if you just want 
 # to take the output of one program and give it immediately to another program to process (this is called a pipe, 
 # symbol "|", as in pipeline). You can also take STDOUT and write it to a file, which is what '>' does. So we are
 # taking the output of bwa mem and writing it to a new file called './sam/$samp\.sam'.

 done # This is the end of the loop, analogous to '}' in R.
	#!/bin/bash

	cd /data/home/molvera/wgs-patient-data # Change the directory to my project directory

	REF=/data/home/molvera/opt/hg38/Homo_sapiens_assembly38.fasta # Create a variable assigning REF to the absolute location
	# of my .fasta reference file.

	bwa index $REF # The first part of BWA, which indexes your reference file. You only need to do this once per fasta reference.

	for samp in "BT_P1_S2" "BT_P3_S9" "BT_P6_S7" "BT_P7_S6" "BT_P8_S3" # For loop through my 5 samples.
	do # This is the start of the loop, analogous to '{' in R.

	echo "[NEW SAMPLE: $samp]" # The echo command just prints the input to the screen. This is a common debuging tactic, since
	# some bash software does not report which sample it's handling during an error message or during normal outputs. It's nice
	# to have incase only one sample has problems. You can also include 'echo's inbetween bash commands to debug specific
	# programs.

	rg_string='@RG\tID:'$samp'\.1\tSM:'$samp'\tPL:illumina\tLB:1\tPU:1' # I've gone ahead and taken out the string costruction
	# in bwa mem and placed it in the variable rg_string. For my convention, I like having global variables (ie variables
	# avalible to anywhere in the progam) in all caps (like REF) and I reserve lowercase for local variables (ie ones that are
	# only accessible in specific parts of the program, such as the 'for loop' we're in now).
	#
	# Anyways, there is a difference between single and double quotes in bash, see this:
	# https://stackoverflow.com/questions/6697753/difference-between-single-and-double-quotes-in-bash
	# What that basically means is that we cannot call variables in between single quotes, but we can for double quotes. We need
	# to use single quotes, because we are specifically telling the stirng to make tabs (\t) between each variable (the two
	# capital letters, followed by the colon). So I've taken the variable names out of the quotes, and bash will be smart enough
	# to concatinate them together. So during my first pass through the loop, 'rg_string' should look like this:
	# '@RG\tID:BT_P1_S2.1\tSM:BT_P1_S2\tPL:illumina\tLB:1\tPU:1', which will produce a read group line like this:
	# @RG ID:BT_P1_S2.1 SM:BT_P1_S2 PL:illumina LB:1 PU:1.

	bwa mem -M -R $rg_string $REF ./fastq/$samp\_R1_001.fastq.gz ./fastq/$samp\_R2_001.fastq.gz > ./sam/$samp\.sam
	# Here is the meat of the script. This takes in the indexed REF fasta, and one or two fastq files (or fastq.gz), and
	# maps them to the refeence. -M is done to improve output compatability with Picard, which is a tool we will use to detect
	# PCR duplicates (which there shouldn't be any), and -R assigns the readgroup string to each read.
	#
	# What about the '>' at the end. A lot of bash programs have something called STDOUT (standard out), which tells the
	# software to return the output to the command line unless otherwise commanded. This is important if you just want
	# to take the output of one program and give it immediately to another program to process (this is called a pipe,
	# symbol "\|", as in pipeline). You can also take STDOUT and write it to a file, which is what '>' does. So we are
	# taking the output of bwa mem and writing it to a new file called './sam/$samp\.sam'.

	done # This is the end of the loop, analogous to '}' in R.
No results found