Last active
April 3, 2023 01:33
-
-
Save y9c/90da3e67ca6a31b73db7e7c7b6d77322 to your computer and use it in GitHub Desktop.
alternative polyadenylation (APA)
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| source("./utils_APA.R") | |
| ## How ot use? | |
| ## | |
| ## Change the path of tsv and gtf file | |
| ## `chr_name` is the column name of chromosome in the tsv file, default is "Chromosome" | |
| ## `pos_name` is the column name of position in the tsv file, default is "End" | |
| ## `strand_name` is the column name of strand in the tsv file, default is "Strand" | |
| tsv <- "./test_sites.tsv" | |
| gtf <- "~/reference/feature/Homo_sapiens.GRCh38.genome.gtf" | |
| parse_PAS(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand") |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| library(dplyr) | |
| library(GenomicFeatures) | |
| library(APAlyzer) | |
| .annotatePASRegion <- function(pasdb, dbreigon, dfTXGeneMapping, flag) { | |
| x <- unlist(dbreigon) | |
| x$IndexID <- 1:length(x) | |
| x$transcript_id <- names(x) | |
| names(x) <- x$IndexID | |
| acc2sym <- data.frame(names(x), x$transcript_id) | |
| colnames(acc2sym) <- c("IndexID", "transcript_id") | |
| acc2sym <- merge(acc2sym, dfTXGeneMapping, on = "transcript_id", all.x = TRUE) | |
| x <- x[acc2sym$IndexID] | |
| names(x) <- acc2sym$gene_id | |
| dbreigon <- split(x, names(x)) | |
| dbreigon <- GenomicRanges::reduce(dbreigon) | |
| olp <- findOverlaps(pasdb, dbreigon) | |
| if (length(olp) > 0) { | |
| hit <- pasdb[queryHits(olp), ] | |
| hit$LOCATION <- flag | |
| hit$PASid <- paste(seqnames(hit), start(hit), strand(hit), sep = ":") | |
| hit <- as.data.frame(hit, row.names=1:length(hit)) | |
| } else { | |
| hit <- "NoHit" | |
| } | |
| return(hit) | |
| } | |
| .annotatePAS_V2 <- function(pasdb, TXDB, dfTXGeneMapping) { | |
| ## a.5UTR | |
| fiveUTRs <- fiveUTRsByTranscript(TXDB, use.names = TRUE) | |
| fiveUTRhit <- .annotatePASRegion(pasdb, fiveUTRs, dfTXGeneMapping, "fiveUTR") | |
| ## b.intron | |
| introns <- intronsByTranscript(TXDB, use.names = TRUE) | |
| intronhit <- .annotatePASRegion(pasdb, introns, dfTXGeneMapping, "intron") | |
| ## c.3UTR | |
| threeUTRs <- threeUTRsByTranscript(TXDB, use.names = TRUE) | |
| threeUTRhit <- .annotatePASRegion(pasdb, threeUTRs, dfTXGeneMapping, "threeUTR") | |
| ## d.CDS | |
| cds <- cdsBy(TXDB, by = "tx", use.names = TRUE) | |
| cdshit <- .annotatePASRegion(pasdb, cds, dfTXGeneMapping, "coding") | |
| ## e.all Exons | |
| exon <- exonsBy(TXDB, by = "tx", use.names = TRUE) | |
| exonhit <- .annotatePASRegion(pasdb, exon, dfTXGeneMapping, "Exon") | |
| ## Creat header | |
| dbhit <- as.data.frame(pasdb[1]) | |
| dbhit$LOCATION <- "NONE" | |
| dbhit$PASid <- "NONE" | |
| for (hit in list(fiveUTRhit, intronhit, threeUTRhit, cdshit, exonhit)) { | |
| if (class(hit) == "data.frame") { | |
| dbhit <- rbind(dbhit, hit) | |
| } | |
| } | |
| dbhit <- dbhit[dbhit$PASid != "NONE", ] | |
| finaldf <- dbhit[, c("PASid", "LOCATION", "gene_id")] | |
| colnames(finaldf)[3] <- "GENEID" | |
| finaldf$uniID1 <- paste(finaldf$PASid, finaldf$GENEID, finaldf$LOCATION, sep = ":") | |
| finaldf$uniID2 <- paste(finaldf$PASid, finaldf$GENEID, sep = ":") | |
| finaldf <- finaldf[!duplicated(finaldf), ] | |
| finaldf$ORDERID <- 10 | |
| if (nrow(finaldf[which(finaldf$LOCATION == "fiveUTR"), ]) > 0) { | |
| finaldf[which(finaldf$LOCATION == "fiveUTR"), ]$ORDERID <- 1 | |
| } | |
| if (nrow(finaldf[which(finaldf$LOCATION == "intron"), ]) > 0) { | |
| finaldf[which(finaldf$LOCATION == "intron"), ]$ORDERID <- 2 | |
| } | |
| if (nrow(finaldf[which(finaldf$LOCATION == "coding"), ]) > 0) { | |
| finaldf[which(finaldf$LOCATION == "coding"), ]$ORDERID <- 3 | |
| } | |
| if (nrow(finaldf[which(finaldf$LOCATION == "threeUTR"), ]) > 0) { | |
| finaldf[which(finaldf$LOCATION == "threeUTR"), ]$ORDERID <- 4 | |
| } | |
| finaldf <- finaldf[with(finaldf, order(uniID2, ORDERID)), ] | |
| finaldf <- finaldf[!duplicated(finaldf$uniID2), ] | |
| return(finaldf) | |
| } | |
| .GTF2refUTRraw <- function(TXDB, finaldf) { | |
| ####### CDS end ############ | |
| cds <- cdsBy(TXDB, "gene") | |
| x <- unlist(cds) | |
| cds1 <- split(x, names(x)) | |
| test <- as.data.frame(cds1) | |
| index <- which(test$end - test$start > 1) | |
| allcds <- test[index, ] | |
| indexP <- which(allcds$strand == "+") | |
| indexN <- which(allcds$strand == "-") | |
| allcds$cdsend <- "" | |
| allcds[indexP, ]$cdsend <- allcds[indexP, ]$end | |
| allcds[indexN, ]$cdsend <- allcds[indexN, ]$start | |
| MAX <- aggregate(cdsend ~ group_name, allcds, max) | |
| MIN <- aggregate(cdsend ~ group_name, allcds, min) | |
| names(MAX)[2] <- "max" | |
| names(MIN)[2] <- "min" | |
| maxmin <- merge(x = MAX, y = MIN, by = "group_name") | |
| allcdssub <- allcds[, c("group_name", "strand")] | |
| allcdssub <- allcdssub[!duplicated(allcdssub$group_name), ] | |
| allcdsminmax <- merge(x = allcdssub, y = maxmin, by = "group_name") | |
| indexP <- which(allcdsminmax$strand == "+") | |
| indexN <- which(allcdsminmax$strand == "-") | |
| allcdsminmax$cdsend <- "" | |
| allcdsminmax[indexP, ]$cdsend <- allcdsminmax[indexP, ]$max | |
| allcdsminmax[indexN, ]$cdsend <- allcdsminmax[indexN, ]$min | |
| names(allcdsminmax)[1] <- "GENEID" | |
| names(allcdsminmax)[2] <- "Strand" | |
| allcdsminmax <- allcdsminmax[, c("GENEID", "Strand", "cdsend")] | |
| ##### 3UTR PAS ###### | |
| df3UTR <- finaldf[which(finaldf$LOCATION == "threeUTR"), ] | |
| df3UTR <- merge(df3UTR, allcdsminmax[, c("GENEID", "cdsend")], by = c("GENEID")) | |
| ## Filter 3UTR PAS by CDS | |
| df3UTR <- df3UTR[which(((df3UTR$strand == "+") & (df3UTR$PAS > df3UTR$cdsend)) | ((df3UTR$strand == "-") & (df3UTR$PAS < df3UTR$cdsend))), ] | |
| df3UTR_P <- df3UTR[which(df3UTR$strand == "+"), ] | |
| df3UTR_P <- df3UTR_P[with(df3UTR_P, order(GENEID, PAS)), ] | |
| df3UTR_N <- df3UTR[which(df3UTR$strand == "-"), ] | |
| df3UTR_N <- df3UTR_N[with(df3UTR_N, order(GENEID, -PAS)), ] | |
| df3UTR <- rbind(df3UTR_P, df3UTR_N) | |
| df3UTR_count <- df3UTR %>% | |
| group_by(GENEID) %>% | |
| mutate(col = 1:n()) %>% ## create featureID | |
| mutate(count = n()) | |
| df3UTR_count <- data.frame(df3UTR_count) | |
| df3UTR_count$pA_type <- "M" | |
| if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]) > 0) { | |
| df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]$pA_type <- "S" | |
| } | |
| if (nrow(df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]) > 0) { | |
| df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]$pA_type <- "L" | |
| } | |
| if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]) > 0) { | |
| df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]$pA_type <- "F" | |
| } | |
| dfF <- df3UTR_count[df3UTR_count$pA_type == "F", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")] | |
| dfL <- df3UTR_count[df3UTR_count$pA_type == "L", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")] | |
| colnames(dfF) <- c("GENEID", "Chrom", "Strand", "First", "gene_symbol") | |
| colnames(dfL) <- c("GENEID", "Chrom", "Strand", "Last", "gene_symbol") | |
| dfFL <- merge(dfF, dfL, by = c("GENEID", "Chrom", "Strand", "gene_symbol")) | |
| dfFL <- dfFL[!duplicated(dfFL$GENEID), ] | |
| dfFLCDE <- merge(dfFL, allcdsminmax, by = c("GENEID", "Strand")) | |
| dfFLCDE <- dfFLCDE[which(((dfFLCDE$Strand == "+") & (dfFLCDE$First > dfFLCDE$cdsend)) | ((dfFLCDE$Strand == "-") & (dfFLCDE$First < dfFLCDE$cdsend))), ] | |
| dfFLCDE$First <- as.integer(dfFLCDE$First) | |
| dfFLCDE$cdsend <- as.integer(dfFLCDE$cdsend) | |
| dfFLCDE$distance <- dfFLCDE$First - dfFLCDE$cdsend | |
| dfFLCDE[which(dfFLCDE$Strand == "-"), ]$distance <- dfFLCDE[which(dfFLCDE$Strand == "-"), ]$cdsend - dfFLCDE[which(dfFLCDE$Strand == "-"), ]$First | |
| index1 <- which(dfFLCDE$Strand == "+" & dfFLCDE$distance < 100) | |
| dfFLCDE[index1, ]$cdsend <- dfFLCDE[index1, ]$cdsend - (100 - dfFLCDE[index1, ]$distance) | |
| index2 <- which(dfFLCDE$Strand == "-" & dfFLCDE$distance < 100) | |
| dfFLCDE[index2, ]$cdsend <- dfFLCDE[index2, ]$cdsend + (100 - dfFLCDE[index2, ]$distance) | |
| refUTRraw <- dfFLCDE[, c("gene_symbol", "Chrom", "Strand", "First", "Last", "cdsend")] | |
| refUTRraw <- refUTRraw[!is.na(refUTRraw$gene_symbol), ] | |
| return(refUTRraw) | |
| } | |
| .GTF2IPA <- function(EDB, TXDB, finaldf) { | |
| ############ intron and SS ################ | |
| introns <- intronsByTranscript(TXDB, use.names = TRUE) | |
| dfintrons <- as.data.frame(introns) | |
| colnames(dfintrons) <- c("group", "tx_id", "Chrom", "Start", "End", "width", "Strand") | |
| # tx2gene <- mcols(transcripts(EDB, columns=c("tx_id", "gene_id",'gene_name'))) | |
| # colnames(tx2gene)=c("tx_id", "GENEID",'gene_symbol') | |
| tx2gene <- as.data.frame(EDB[which(EDB$type == "transcript"), ])[, c("transcript_id", "gene_id", "gene_name")] | |
| colnames(tx2gene) <- c("tx_id", "GENEID", "gene_symbol") | |
| tx2gene <- tx2gene[!duplicated(tx2gene), ] | |
| dfintrons <- merge(dfintrons, tx2gene, by = "tx_id") | |
| dfintrons <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Start", "End", "Strand", "width")] | |
| dfintrons <- dfintrons[!duplicated(dfintrons), ] | |
| ## ANNO 5SS and 3SS## | |
| dfintrons$SS5 <- dfintrons$Start | |
| dfintrons$SS3 <- dfintrons$End | |
| dfintrons[which(dfintrons$Strand == "-"), ]$SS5 <- dfintrons[which(dfintrons$Strand == "-"), ]$End | |
| dfintrons[which(dfintrons$Strand == "-"), ]$SS3 <- dfintrons[which(dfintrons$Strand == "-"), ]$Start | |
| ## Combine 5SS and 3SS## | |
| dfSS5 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS5")] | |
| dfSS3 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS3")] | |
| comname <- c("GENEID", "gene_symbol", "Chrom", "Strand", "upstreamSS") | |
| colnames(dfSS5) <- comname | |
| colnames(dfSS3) <- comname | |
| dfSS5$type <- "SS5" | |
| dfSS3$type <- "SS3" | |
| dfupSS <- rbind(dfSS5, dfSS3) | |
| dfupSS$SSID <- paste0(dfupSS$GENEID, dfupSS$gene_symbol, dfupSS$Chrom, dfupSS$Strand, dfupSS$upstreamSS) | |
| dfupSS <- dfupSS[!duplicated(dfupSS$SSID), ] | |
| ############ build condidate downstream SS ################## | |
| dfdnSS3 <- dfSS3 | |
| dfdnSS3$SSID <- paste0(dfdnSS3$GENEID, dfdnSS3$gene_symbol, dfdnSS3$Chrom, dfdnSS3$Strand, dfdnSS3$upstreamSS) | |
| dfdnSS3 <- dfdnSS3[!duplicated(dfdnSS3$SSID), ] | |
| colnames(dfdnSS3) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "downstreamSS", "type", "SSID") | |
| # dfdnSS3$Chrom=paste0('chr',dfdnSS3$Chrom) | |
| # dfupSS$Chrom=paste0('chr',dfupSS$Chrom) | |
| dfupSS <- dfupSS[!is.na(dfupSS$gene_symbol), ] | |
| dfdnSS3 <- dfdnSS3[!is.na(dfdnSS3$gene_symbol), ] | |
| ######### IPA and SS ############ | |
| dfIPA <- finaldf[(finaldf$LOCATION == "intron"), ] | |
| dfIPA <- dfIPA[, c("PASid", "Chr", "PAS", "strand", "GENEID", "gene_name")] | |
| colnames(dfIPA) <- c("PASid", "Chrom", "Pos", "Strand", "GENEID", "gene_symbol") | |
| dfIPA <- dfIPA[!duplicated(dfIPA), ] | |
| dfIPA <- dfIPA[!is.na(dfIPA$gene_symbol), ] | |
| DFIPASS <- list(dfIPA, dfupSS, dfdnSS3) | |
| names(DFIPASS) <- c("dfIPA", "dfupSS", "dfdnSS3") | |
| return(DFIPASS) | |
| } | |
| .GTF2LE <- function(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX) { | |
| dfIPAXXX <- as.data.frame(dfIPAXXX) | |
| dfupSSXXX <- as.data.frame(dfupSSXXX) | |
| dfdnSS3XXX <- as.data.frame(dfdnSS3XXX) | |
| ############ Hit upstream SS ################## | |
| dfHIT_up <- merge(dfIPAXXX, dfupSSXXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand")) | |
| dfHIT_up$dis <- dfHIT_up$Pos - dfHIT_up$upstreamSS | |
| dfHIT_up[which(dfHIT_up$Strand == "-"), ]$dis <- dfHIT_up[which(dfHIT_up$Strand == "-"), ]$upstreamSS - dfHIT_up[which(dfHIT_up$Strand == "-"), ]$Pos | |
| dfHIT_up$HITID <- paste0(dfHIT_up$GENEID, dfHIT_up$gene_symbol, dfHIT_up$PASid) | |
| dfHIT_up <- dfHIT_up[which(dfHIT_up$dis > 100), ] | |
| dfHIT_up <- dfHIT_up[with(dfHIT_up, order(HITID, dis)), ] | |
| dfHIT_up <- dfHIT_up[!duplicated(dfHIT_up$HITID), ] | |
| dfHIT_up <- dfHIT_up[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "dis")] | |
| colnames(dfHIT_up) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "upstream_dis") | |
| ############ Hit downstream SS ################## | |
| dfHIT_dn <- merge(dfIPAXXX, dfdnSS3XXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand")) | |
| dfHIT_dn$dis <- dfHIT_dn$downstreamSS - dfHIT_dn$Pos | |
| dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$dis <- dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$Pos - dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$downstreamSS | |
| dfHIT_dn$HITID <- paste0(dfHIT_dn$GENEID, dfHIT_dn$gene_symbol, dfHIT_dn$PASid) | |
| dfHIT_dn <- dfHIT_dn[which(dfHIT_dn$dis > 100), ] | |
| dfHIT_dn <- dfHIT_dn[with(dfHIT_dn, order(HITID, dis)), ] | |
| dfHIT_dn <- dfHIT_dn[!duplicated(dfHIT_dn$HITID), ] | |
| dfHIT_dn <- dfHIT_dn[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "dis")] | |
| colnames(dfHIT_dn) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "downstream_dis") | |
| ############ combine IPA SS ################## | |
| dfHIT_combine <- merge(dfHIT_up, dfHIT_dn, by = c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos")) | |
| dfHIT_combine <- dfHIT_combine[!duplicated(dfHIT_combine), ] | |
| dfHIT_combine$IPA_up_start <- dfHIT_combine$upstreamSS | |
| dfHIT_combine$IPA_up_end <- dfHIT_combine$Pos | |
| dfHIT_combine$IPA_dn_start <- dfHIT_combine$Pos | |
| dfHIT_combine$IPA_dn_end <- dfHIT_combine$downstreamSS | |
| dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos | |
| dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$upstreamSS | |
| dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$downstreamSS | |
| dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos | |
| dfIPAREF <- dfHIT_combine[, c("PASid", "GENEID", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")] | |
| dfIPAREF <- as.data.frame(dfIPAREF) | |
| dfIPAREF <- dfIPAREF[!duplicated(dfIPAREF), ] | |
| dfIPAsim <- dfHIT_combine[, c("PASid", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")] | |
| dfIPAsim <- as.data.frame(dfIPAsim) | |
| dfIPAsim <- dfIPAsim[!duplicated(dfIPAsim), ] | |
| dfIPAsim <- dfIPAsim[!is.na(dfIPAsim$gene_symbol), ] | |
| ############ TES ############## | |
| dfgenic <- as.data.frame(genes(TXDB)) | |
| dfgenic <- dfgenic[, c(6, 1:3, 5)] | |
| names(dfgenic) <- c("GENEID", "Chr", "Start", "End", "Strand") | |
| # find the TES | |
| dfMinMAX <- dfgenic | |
| dfMinMAX$TES <- "" | |
| indexN <- which(dfMinMAX$Strand == "-") | |
| indexP <- which(dfMinMAX$Strand == "+") | |
| dfMinMAX[indexP, ]$TES <- dfMinMAX[indexP, ]$End | |
| dfMinMAX[indexN, ]$TES <- dfMinMAX[indexN, ]$Start | |
| dfTES <- dfMinMAX[, c("GENEID", "Chr", "TES", "Strand")] | |
| dfTES <- dfTES[!duplicated(dfTES), ] | |
| ### I exon### | |
| exondb <- exonsBy(TXDB, by = "gene") | |
| exondb <- GenomicRanges::reduce(exondb) | |
| exonsdf <- as.data.frame(exondb) | |
| exonsdfP <- exonsdf[which(exonsdf$strand == "+"), ] | |
| exonsdfN <- exonsdf[which(exonsdf$strand == "-"), ] | |
| exonsdfN <- exonsdfN[with(exonsdfN, order(group_name, -start)), ] | |
| exonsdf <- rbind(exonsdfP, exonsdfN) | |
| exonsdf <- exonsdf[, -c(1)] | |
| data_EXrename <- exonsdf %>% | |
| group_by(group_name) %>% | |
| mutate(col = 1:n()) %>% ## create exonID | |
| mutate(count = n()) | |
| exonsdf <- data.frame(data_EXrename) | |
| names(exonsdf)[7:8] <- c("EXid", "EXcount") | |
| dfLexon <- exonsdf[which(exonsdf$EXid == exonsdf$EXcount), ] | |
| dfLexon$LEstart <- dfLexon$start | |
| dfLexon[which(dfLexon$strand == "-"), ]$LEstart <- dfLexon[which(dfLexon$strand == "-"), ]$end | |
| dfLexon <- dfLexon[, c("group_name", "seqnames", "strand", "LEstart")] | |
| colnames(dfLexon) <- c("GENEID", "Chr", "Strand", "LEstart") | |
| dfLE <- merge(dfLexon, dfTES, by = c("GENEID", "Chr", "Strand")) | |
| dfLE <- dfLE[!duplicated(dfLE), ] | |
| dfIPAused <- dfIPAREF[, c("GENEID", "gene_symbol")] | |
| dfIPAused <- dfIPAused[!duplicated(dfIPAused), ] | |
| dfLE <- merge(dfLE, dfIPAused, by = "GENEID") | |
| dfLE <- dfLE[!duplicated(dfLE), ] | |
| # dfLE$Chr=paste0('chr',dfLE$Chr) | |
| dfLE <- dfLE[, c("gene_symbol", "Chr", "Strand", "LEstart", "TES")] | |
| colnames(dfLE) <- c("gene_symbol", "Chrom", "Strand", "LEstart", "TES") | |
| dfLE <- dfLE[!is.na(dfLE$gene_symbol), ] | |
| dfLE_dfIPA <- list(dfIPAsim, dfLE) | |
| names(dfLE_dfIPA) <- c("dfIPAsim", "dfLE") | |
| return(dfLE_dfIPA) | |
| } | |
| .get_input_pos <- function(x, y) { | |
| y2 <- y | |
| start(y2) <- start(x) | |
| end(y2) <- end(x) | |
| return(y2) | |
| } | |
| parse_PAS <- function(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand", remove_suffix = FALSE) { | |
| # parse PAS sites | |
| print("Read PAS sites from input tsv file") | |
| df_input <- read.table(tsv, sep = "\t", header = TRUE) | |
| if (remove_suffix) { | |
| chr_col <- sub("chr", "", df_input[[chr_name]]) | |
| } else { | |
| chr_col <- df_input[[chr_name]] | |
| } | |
| gr_input <- GRanges( | |
| seqnames = chr_col, | |
| ranges = IRanges(start = df_input[[pos_name]], end = df_input[[pos_name]]), | |
| strand = df_input[[strand_name]] | |
| ) | |
| print("Read gene model from gtf file") | |
| EDB <- rtracklayer::import(gtf) | |
| TXDB <- makeTxDbFromGRanges(EDB) | |
| print("Annotate PAS sites with gene model") | |
| ol <- findOverlaps(gr_input, EDB, ignore.strand = FALSE, type = "within", select = "all") | |
| gr_annot <- unlist(mendoapply(.get_input_pos, split(gr_input[queryHits(ol)], 1:length(ol)), y = split(EDB[subjectHits(ol)], 1:length(ol)))) | |
| dfPAS <- as.data.frame(gr_annot[which(gr_annot$type == "transcript"), ][, c("transcript_id", "gene_id", "gene_name")]) | |
| dfPAS$PAS <- dfPAS$start | |
| dfTXGeneMapping <- dfPAS[, c("transcript_id", "gene_id")] | |
| dfTXGeneMapping <- dfTXGeneMapping[!duplicated(dfTXGeneMapping), ] | |
| dfTXGeneMapping <- dfTXGeneMapping[!is.na(dfTXGeneMapping$transcript_id), ] | |
| pasdb <- makeGRangesFromDataFrame(dfPAS, keep.extra.columns = TRUE) | |
| finaldf <- .annotatePAS_V2(pasdb, TXDB, dfTXGeneMapping) | |
| dfPAS$Chr <- dfPAS$seqnames | |
| dfPAS$PASid <- paste(dfPAS$seqnames, dfPAS$PAS, dfPAS$strand, sep = ":") | |
| dfinfo <- dfPAS[, c("PASid", "Chr", "strand", "PAS", "gene_id", "gene_name")] | |
| colnames(dfinfo) <- c("PASid", "Chr", "strand", "PAS", "GENEID", "gene_name") | |
| finaldf <- merge(finaldf, dfinfo, by = c("PASid", "GENEID")) | |
| # output PAS objects | |
| print("Extracting and filtering 3'UTR PASs") | |
| refUTRraw <- .GTF2refUTRraw(TXDB, finaldf) | |
| print("Extracting IPAs") | |
| dfIPAALL <- .GTF2IPA(EDB, TXDB, finaldf) | |
| print("Extracting 3' last exons") | |
| dfIPAXXX <- dfIPAALL$dfIPA | |
| dfupSSXXX <- dfIPAALL$dfupSS | |
| dfdnSS3XXX <- dfIPAALL$dfdnSS3 | |
| dfLE_dfIPA <- .GTF2LE(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX) | |
| print("Finalizing references") | |
| PASREF <- list(refUTRraw, dfLE_dfIPA$dfIPAsim, dfLE_dfIPA$dfLE) | |
| names(PASREF) <- c("refUTRraw", "dfIPA", "dfLE") | |
| return(PASREF) | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment