Skip to content

Instantly share code, notes, and snippets.

@Zepeng-Mu
Last active February 20, 2025 01:55
Show Gist options
  • Select an option

  • Save Zepeng-Mu/a071538be8f72451f7046074a55eeb42 to your computer and use it in GitHub Desktop.

Select an option

Save Zepeng-Mu/a071538be8f72451f7046074a55eeb42 to your computer and use it in GitHub Desktop.
Run GLIMPSE on scATAC-seq data
## Run GLIMPSE on scATAC bam files
import pandas as pd
import os
smp = os.listdir("/project/yangili1/zpmu/covid19/data/cellranger_out21")
SNPDIR = "/project/yangili1/zpmu/covid19/data/hornet_snp"
chrm = sorted([str(x) for x in range(1, 23)])
rule all:
input:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/merged_chr1-22.merged.phased.rename.vcf.gz",
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/merged_chr1-22.merged.phased.rename.filtered.vcf.gz",
rule chunk:
input:
vcf="/project/yangili1/zpmu/covid19/data/glimpse/ref/ALL.chr{chrm}.sites.vcf.gz",
output:
"/project/yangili1/zpmu/covid19/data/glimpse/chunks/chunks.chr{chrm}.txt",
resources:
runtime=60,
mem_mb=20000,
cpus_per_task=8,
shell:
"GLIMPSE_chunk_static --thread 4 --input {input.vcf} --region {wildcards.chrm} "
"--window-size 2000000 --buffer-size 200000 --output {output}"
rule reheader_bam_chr:
input:
bam="/project/yangili1/zpmu/covid19/data/cellranger_out21/{smp}/outs/possorted_bam.hornet.final.bam",
output:
bam=temp(
"/project/yangili1/zpmu/covid19/data/cellranger_out21/{smp}/outs/possorted_bam_nodup_renameChr.bam"
),
resources:
runtime=120,
mem_mb=12000,
cpus_per_task=2,
shell:
"samtools view -h {input} | tr -d chr | samtools view -bh - > {output}; "
"samtools index {output}"
rule compute_GL:
input:
bam=rules.reheader_bam_chr.output.bam,
vcf="/project/yangili1/zpmu/covid19/data/glimpse/ref/ALL.chr{chrm}.sites.vcf.gz",
tsv="/project/yangili1/zpmu/covid19/data/glimpse/ref/ALL.chr{chrm}.sites.tsv.gz",
output:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/mpileup/{smp}.chr{chrm}.vcf.gz",
resources:
runtime=60,
mem_mb=10000,
cpus_per_task=15,
shell:
"bcftools mpileup --threads 14 "
"-f /project/yangili1/zpmu/index/Homo_sapiens.GRCh37.dna.primary_assembly.fa.gz "
"-I -E -a 'FORMAT/DP' -r {wildcards.chrm} --max-depth 1000 "
"-T {input.vcf} {input.bam} -Ou | "
"bcftools view -i 'FORMAT/DP<=15' -Ou | "
"bcftools call -Aim -C alleles "
"-T {input.tsv} -Oz -o {output}; bcftools index -f {output}"
rule merge_GL:
input:
expand(
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/mpileup/{smp}.chr{{chrm}}.vcf.gz",
smp=smp,
),
output:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/mpileup/merged_chr{chrm}.vcf.gz",
resources:
runtime=60,
mem_mb=16000,
cpus_per_task=1,
shell:
"ls {input} > list_{wildcards.chrm}.txt; "
"bcftools merge -m none -Oz -o {output} -l list_{wildcards.chrm}.txt; "
"bcftools index -f {output}; rm list_{wildcards.chrm}.txt"
rule impute:
input:
vcf=rules.merge_GL.output,
ref="/project/yangili1/zpmu/covid19/data/glimpse/ref/ALL.chr{chrm}.bcf",
gmap="/project/yangili1/zpmu/1000GP_Phase3/genetic_map_chr{chrm}_combined_b37.txt",
chunk=rules.chunk.output,
output:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/out/merged_chr{chrm}.done.txt",
resources:
runtime=500,
mem_mb=36000,
cpus_per_task=20,
shell:
"while IFS='' read -r LINE || [ -n \"${{LINE}}\" ]; do "
"printf -v ID '%02d' $(echo ${{LINE}} | cut -d' ' -f1); "
"IRG=$(echo ${{LINE}} | cut -d' ' -f3); "
"ORG=$(echo ${{LINE}} | cut -d' ' -f4); "
"GLIMPSE_phase_static --thread 16 "
"--input {input.vcf} --reference {input.ref} --map {input.gmap} "
"--input-region ${{IRG}} --output-region ${{ORG}} "
"--output /project/yangili1/zpmu/covid19/data/glimpse_scATAC/out/merged_chr{wildcards.chrm}.${{ID}}.bcf; "
"bcftools index -f /project/yangili1/zpmu/covid19/data/glimpse_scATAC/out/merged_chr{wildcards.chrm}.${{ID}}.bcf; "
"done < {input.chunk}; "
"echo DONE > {output}"
rule ligate:
input:
rules.impute.output,
output:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/ligated/merged_chr{chrm}.merged.vcf.gz",
resources:
runtime=60,
mem_mb=20000,
cpus_per_task=5,
shell:
"ls /project/yangili1/zpmu/covid19/data/glimpse_scATAC/out/merged_chr{wildcards.chrm}.*.bcf > "
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/ligated/merged.list.chr{wildcards.chrm}.txt; "
"GLIMPSE_ligate_static --thread 3 "
"--input /project/yangili1/zpmu/covid19/data/glimpse_scATAC/ligated/merged.list.chr{wildcards.chrm}.txt "
"--output {output}; tabix -p vcf {output}"
rule eagle:
input:
vcf=rules.ligate.output,
hap="/home/zepengmu/tools/Eagle_v2.4.1/tables/genetic_map_hg19_withX.txt.gz",
ref="/project/yangili1/zpmu/covid19/data/glimpse/ref/ALL.chr{chrm}.bcf",
output:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/phased/merged_chr{chrm}.merged.phased.vcf.gz",
resources:
runtime=120,
mem_mb=36000,
cpus_per_task=6,
params:
outputPrefix="/project/yangili1/zpmu/covid19/data/glimpse_scATAC/phased/merged_chr{chrm}.merged.phased",
shell:
"eagle --geneticMapFile {input.hap} --vcfRef {input.ref} "
"--vcfTarget {input.vcf} --outPrefix {params.outputPrefix} "
"--numThreads {resources.cpus_per_task} --vcfOutFormat z; "
"bcftools index -f {output} --thread {resources.cpus_per_task}"
rule concat:
input:
expand(
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/phased/merged_chr{chrm}.merged.phased.vcf.gz",
chrm=chrm,
),
output:
"/project/yangili1/zpmu/covid19/data/glimpse_scATAC/merged_chr1-22.merged.phased.vcf.gz",
resources:
runtime=60,
mem_mb=16000,
cpus_per_task=3,
shell:
"bcftools concat --threads 2 -Oz -o {output} {input}; tabix -p vcf {output}"
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment