explodecomputer · December 5, 2025 07:58
diff --git a/bmi-alcohol-ancestry.r b/bmi-alcohol-ancestry.r
 library(dplyr)
 library(data.table)
 library(seqminer)
 library(stringr)

 dat <- fread("/mnt/project/analysis/bmi-alcohol-ancestry/phen_extract.csv")
 str(dat)

 anc <- fread("/mnt/project/data/ancestry/ukb_InferredAncestry.txt")
 str(anc)

 sampleinfo_colnames <- stringr::str_replace_all(
  tolower(c(
    "x1",
    "x2",
    "genotyping.array",
    "Batch",
    "Plate.Name",
    "Well",
    "Cluster.CR",
    "dQC",
    "Internal.Pico..ng.uL.",
    "Submitted.Gender",
    "Inferred.Gender",
    "X.intensity",
    "Y.intensity",
    "Submitted.Plate.Name",
    "Submitted.Well",
    "sample.qc.missing.rate",
    "heterozygosity",
    "heterozygosity.pc.corrected",
    "het.missing.outliers",
    "putative.sex.chromosome.aneuploidy",
    "in.kinship.table",
    "excluded.from.kinship.inference",
    "excess.relatives",
    "in.white.British.ancestry.subset",
    "used.in.pca.calculation",
    paste("pc", 1:40, sep = ""),
    "in.Phasing.Input.chr1_22",
    "in.Phasing.Input.chrX",
    "in.Phasing.Input.chrXY"
  )),
  "[[:punct:]]", "_"
 )

 sampleinfo <- fread("/mnt/project/Bulk/Genotype Results/Genotype calls/ukb_sqc_v2.txt")
 colnames(sampleinfo) <- sampleinfo_colnames
 str(sampleinfo)

 bg.ref.file <- "/mnt/project/Bulk/Imputation/Imputation from genotype (TOPmed)/ukb21007_c4_b0_v1.bgen"
 file.exists(bg.ref.file)
 bg.range <- "chr4:99318162-99318162"
 geno.mat <- seqminer::readBGENToMatrixByRange(bg.ref.file, bg.range)

 samplefile <- fread("/mnt/project/Bulk/Imputation/Imputation from genotype (TOPmed)/ukb21007_c4_b0_v1.sample", skip=2, header=FALSE)
 str(samplefile)

 str(geno.mat)
 dim(geno.mat[[1]])
 str(geno.mat[[1]])

 # Combine everything
 # genotype
 # phenotypes
 # ancestry
 # PCs

 table(samplefile$V1 %in% sampleinfo$x2)
 table(samplefile$V1 %in% dat$eid)
 table(samplefile$V1 %in% anc$IID)

 dim(sampleinfo)
 dim(samplefile)


 str(dat)

 d <- inner_join(dat, anc %>% select(IID, PC1, PC2, ANC=Anc_1st), by=c("eid"="IID"))
 g <- tibble(eid=samplefile$V2, geno=drop(t(geno.mat[[1]])))
 d <- inner_join(d, g)
 str(d)


 summary(lm(p21001_i0 ~ geno + PC1 + PC2, d))
 summary(lm(p21001_i0 ~ geno + PC1 + PC2, d))
 summary(lm(geno ~ p1558_i0 + PC1 + PC2, d))

 d <- mutate(d, 
            alcohol = case_when(
              p1558_i0 == "Never" ~ 0, 
              p1558_i0 == "Special occasions only" ~ 1, 
              p1558_i0 == "One to three times a month" ~ 2, 
              p1558_i0 == "Once or twice a week" ~ 3,
              p1558_i0 == "Three or four times a week" ~ 4,
              p1558_i0 == "Daily or almost daily" ~ 5,
              TRUE ~ NA
            ))
 table(d$alcohol)
 table(d$alcohol, d$ANC)

 summary(lm(p21001_i0 ~ geno * ANC, d))
 summary(lm(p21001_i0 ~ geno * alcohol, d %>% filter(ANC == "EUR")))

 summary(lm(p21001_i0 ~ geno * alcohol + geno * ANC, d))
 summary(lm(p21001_i0 ~ geno * alcohol, d %>% filter(ANC == "SAS")))

 summary(lm(alcohol ~ geno, d %>% filter(ANC == "EUR")))
 summary(lm(alcohol ~ geno, d %>% filter(ANC == "EUR")))
 summary(lm(p21001_i0 ~ geno, d %>% filter(ANC == "EUR")))
 summary(lm(p21001_i0 ~ geno + alcohol, d %>% filter(ANC == "EUR")))
	library(dplyr)
	library(data.table)
	library(seqminer)
	library(stringr)

	dat <- fread("/mnt/project/analysis/bmi-alcohol-ancestry/phen_extract.csv")
	str(dat)

	anc <- fread("/mnt/project/data/ancestry/ukb_InferredAncestry.txt")
	str(anc)

	sampleinfo_colnames <- stringr::str_replace_all(
	tolower(c(
	"x1",
	"x2",
	"genotyping.array",
	"Batch",
	"Plate.Name",
	"Well",
	"Cluster.CR",
	"dQC",
	"Internal.Pico..ng.uL.",
	"Submitted.Gender",
	"Inferred.Gender",
	"X.intensity",
	"Y.intensity",
	"Submitted.Plate.Name",
	"Submitted.Well",
	"sample.qc.missing.rate",
	"heterozygosity",
	"heterozygosity.pc.corrected",
	"het.missing.outliers",
	"putative.sex.chromosome.aneuploidy",
	"in.kinship.table",
	"excluded.from.kinship.inference",
	"excess.relatives",
	"in.white.British.ancestry.subset",
	"used.in.pca.calculation",
	paste("pc", 1:40, sep = ""),
	"in.Phasing.Input.chr1_22",
	"in.Phasing.Input.chrX",
	"in.Phasing.Input.chrXY"
	)),
	"[[:punct:]]", "_"
	)

	sampleinfo <- fread("/mnt/project/Bulk/Genotype Results/Genotype calls/ukb_sqc_v2.txt")
	colnames(sampleinfo) <- sampleinfo_colnames
	str(sampleinfo)

	bg.ref.file <- "/mnt/project/Bulk/Imputation/Imputation from genotype (TOPmed)/ukb21007_c4_b0_v1.bgen"
	file.exists(bg.ref.file)
	bg.range <- "chr4:99318162-99318162"
	geno.mat <- seqminer::readBGENToMatrixByRange(bg.ref.file, bg.range)

	samplefile <- fread("/mnt/project/Bulk/Imputation/Imputation from genotype (TOPmed)/ukb21007_c4_b0_v1.sample", skip=2, header=FALSE)
	str(samplefile)

	str(geno.mat)
	dim(geno.mat[[1]])
	str(geno.mat[[1]])

	# Combine everything
	# genotype
	# phenotypes
	# ancestry
	# PCs

	table(samplefile$V1 %in% sampleinfo$x2)
	table(samplefile$V1 %in% dat$eid)
	table(samplefile$V1 %in% anc$IID)

	dim(sampleinfo)
	dim(samplefile)


	str(dat)

	d <- inner_join(dat, anc %>% select(IID, PC1, PC2, ANC=Anc_1st), by=c("eid"="IID"))
	g <- tibble(eid=samplefile$V2, geno=drop(t(geno.mat[[1]])))
	d <- inner_join(d, g)
	str(d)


	summary(lm(p21001_i0 ~ geno + PC1 + PC2, d))
	summary(lm(p21001_i0 ~ geno + PC1 + PC2, d))
	summary(lm(geno ~ p1558_i0 + PC1 + PC2, d))

	d <- mutate(d,
	alcohol = case_when(
	p1558_i0 == "Never" ~ 0,
	p1558_i0 == "Special occasions only" ~ 1,
	p1558_i0 == "One to three times a month" ~ 2,
	p1558_i0 == "Once or twice a week" ~ 3,
	p1558_i0 == "Three or four times a week" ~ 4,
	p1558_i0 == "Daily or almost daily" ~ 5,
	TRUE ~ NA
	))
	table(d$alcohol)
	table(d$alcohol, d$ANC)

	summary(lm(p21001_i0 ~ geno * ANC, d))
	summary(lm(p21001_i0 ~ geno * alcohol, d %>% filter(ANC == "EUR")))

	summary(lm(p21001_i0 ~ geno * alcohol + geno * ANC, d))
	summary(lm(p21001_i0 ~ geno * alcohol, d %>% filter(ANC == "SAS")))

	summary(lm(alcohol ~ geno, d %>% filter(ANC == "EUR")))
	summary(lm(alcohol ~ geno, d %>% filter(ANC == "EUR")))
	summary(lm(p21001_i0 ~ geno, d %>% filter(ANC == "EUR")))
	summary(lm(p21001_i0 ~ geno + alcohol, d %>% filter(ANC == "EUR")))
No results found