y9c · April 3, 2023 01:33
diff --git a/test_APA.R b/test_APA.R
 source("./utils_APA.R")


 ## How ot use?
 ##
 ## Change the path of tsv and gtf file
 ## `chr_name` is the column name of chromosome in the tsv file, default is "Chromosome"
 ## `pos_name` is the column name of position in the tsv file, default is "End"
 ## `strand_name` is the column name of strand in the tsv file, default is "Strand"
 tsv <- "./test_sites.tsv"
 gtf <- "~/reference/feature/Homo_sapiens.GRCh38.genome.gtf"
 parse_PAS(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand")
diff --git a/utils_APA.R b/utils_APA.R
 library(dplyr)
 library(GenomicFeatures)
 library(APAlyzer)

 .annotatePASRegion <- function(pasdb, dbreigon, dfTXGeneMapping, flag) {
  x <- unlist(dbreigon)
  x$IndexID <- 1:length(x)
  x$transcript_id <- names(x)
  names(x) <- x$IndexID
  acc2sym <- data.frame(names(x), x$transcript_id)
  colnames(acc2sym) <- c("IndexID", "transcript_id")
  acc2sym <- merge(acc2sym, dfTXGeneMapping, on = "transcript_id", all.x = TRUE)
  x <- x[acc2sym$IndexID]
  names(x) <- acc2sym$gene_id
  dbreigon <- split(x, names(x))
  dbreigon <- GenomicRanges::reduce(dbreigon)
  olp <- findOverlaps(pasdb, dbreigon)
  if (length(olp) > 0) {
    hit <- pasdb[queryHits(olp), ]
    hit$LOCATION <- flag
    hit$PASid <- paste(seqnames(hit), start(hit), strand(hit), sep = ":")
    hit <- as.data.frame(hit, row.names=1:length(hit))
  } else {
    hit <- "NoHit"
  }
  return(hit)
 }

 .annotatePAS_V2 <- function(pasdb, TXDB, dfTXGeneMapping) {
  ## a.5UTR
  fiveUTRs <- fiveUTRsByTranscript(TXDB, use.names = TRUE)
  fiveUTRhit <- .annotatePASRegion(pasdb, fiveUTRs, dfTXGeneMapping, "fiveUTR")

  ## b.intron
  introns <- intronsByTranscript(TXDB, use.names = TRUE)
  intronhit <- .annotatePASRegion(pasdb, introns, dfTXGeneMapping, "intron")

  ## c.3UTR
  threeUTRs <- threeUTRsByTranscript(TXDB, use.names = TRUE)
  threeUTRhit <- .annotatePASRegion(pasdb, threeUTRs, dfTXGeneMapping, "threeUTR")

  ## d.CDS
  cds <- cdsBy(TXDB, by = "tx", use.names = TRUE)
  cdshit <- .annotatePASRegion(pasdb, cds, dfTXGeneMapping, "coding")

  ## e.all Exons
  exon <- exonsBy(TXDB, by = "tx", use.names = TRUE)
  exonhit <- .annotatePASRegion(pasdb, exon, dfTXGeneMapping, "Exon")

  ## Creat header
  dbhit <- as.data.frame(pasdb[1])
  dbhit$LOCATION <- "NONE"
  dbhit$PASid <- "NONE"
  for (hit in list(fiveUTRhit, intronhit, threeUTRhit, cdshit, exonhit)) {
    if (class(hit) == "data.frame") {
      dbhit <- rbind(dbhit, hit)
    }
  }
  dbhit <- dbhit[dbhit$PASid != "NONE", ]
  finaldf <- dbhit[, c("PASid", "LOCATION", "gene_id")]
  colnames(finaldf)[3] <- "GENEID"
  finaldf$uniID1 <- paste(finaldf$PASid, finaldf$GENEID, finaldf$LOCATION, sep = ":")
  finaldf$uniID2 <- paste(finaldf$PASid, finaldf$GENEID, sep = ":")
  finaldf <- finaldf[!duplicated(finaldf), ]
  finaldf$ORDERID <- 10
  if (nrow(finaldf[which(finaldf$LOCATION == "fiveUTR"), ]) > 0) {
    finaldf[which(finaldf$LOCATION == "fiveUTR"), ]$ORDERID <- 1
  }

  if (nrow(finaldf[which(finaldf$LOCATION == "intron"), ]) > 0) {
    finaldf[which(finaldf$LOCATION == "intron"), ]$ORDERID <- 2
  }

  if (nrow(finaldf[which(finaldf$LOCATION == "coding"), ]) > 0) {
    finaldf[which(finaldf$LOCATION == "coding"), ]$ORDERID <- 3
  }

  if (nrow(finaldf[which(finaldf$LOCATION == "threeUTR"), ]) > 0) {
    finaldf[which(finaldf$LOCATION == "threeUTR"), ]$ORDERID <- 4
  }
  finaldf <- finaldf[with(finaldf, order(uniID2, ORDERID)), ]
  finaldf <- finaldf[!duplicated(finaldf$uniID2), ]
  return(finaldf)
 }


 .GTF2refUTRraw <- function(TXDB, finaldf) {
  ####### CDS end ############
  cds <- cdsBy(TXDB, "gene")
  x <- unlist(cds)
  cds1 <- split(x, names(x))
  test <- as.data.frame(cds1)
  index <- which(test$end - test$start > 1)
  allcds <- test[index, ]
  indexP <- which(allcds$strand == "+")
  indexN <- which(allcds$strand == "-")
  allcds$cdsend <- ""
  allcds[indexP, ]$cdsend <- allcds[indexP, ]$end
  allcds[indexN, ]$cdsend <- allcds[indexN, ]$start
  MAX <- aggregate(cdsend ~ group_name, allcds, max)
  MIN <- aggregate(cdsend ~ group_name, allcds, min)
  names(MAX)[2] <- "max"
  names(MIN)[2] <- "min"
  maxmin <- merge(x = MAX, y = MIN, by = "group_name")
  allcdssub <- allcds[, c("group_name", "strand")]
  allcdssub <- allcdssub[!duplicated(allcdssub$group_name), ]
  allcdsminmax <- merge(x = allcdssub, y = maxmin, by = "group_name")
  indexP <- which(allcdsminmax$strand == "+")
  indexN <- which(allcdsminmax$strand == "-")
  allcdsminmax$cdsend <- ""
  allcdsminmax[indexP, ]$cdsend <- allcdsminmax[indexP, ]$max
  allcdsminmax[indexN, ]$cdsend <- allcdsminmax[indexN, ]$min
  names(allcdsminmax)[1] <- "GENEID"
  names(allcdsminmax)[2] <- "Strand"
  allcdsminmax <- allcdsminmax[, c("GENEID", "Strand", "cdsend")]


  ##### 3UTR PAS ######
  df3UTR <- finaldf[which(finaldf$LOCATION == "threeUTR"), ]
  df3UTR <- merge(df3UTR, allcdsminmax[, c("GENEID", "cdsend")], by = c("GENEID"))
  ## Filter 3UTR PAS by CDS
  df3UTR <- df3UTR[which(((df3UTR$strand == "+") & (df3UTR$PAS > df3UTR$cdsend)) | ((df3UTR$strand == "-") & (df3UTR$PAS < df3UTR$cdsend))), ]

  df3UTR_P <- df3UTR[which(df3UTR$strand == "+"), ]
  df3UTR_P <- df3UTR_P[with(df3UTR_P, order(GENEID, PAS)), ]
  df3UTR_N <- df3UTR[which(df3UTR$strand == "-"), ]
  df3UTR_N <- df3UTR_N[with(df3UTR_N, order(GENEID, -PAS)), ]
  df3UTR <- rbind(df3UTR_P, df3UTR_N)
  df3UTR_count <- df3UTR %>%
    group_by(GENEID) %>%
    mutate(col = 1:n()) %>% ## create featureID
    mutate(count = n())
  df3UTR_count <- data.frame(df3UTR_count)
  df3UTR_count$pA_type <- "M"
  if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]) > 0) {
    df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]$pA_type <- "S"
  }

  if (nrow(df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]) > 0) {
    df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]$pA_type <- "L"
  }

  if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]) > 0) {
    df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]$pA_type <- "F"
  }


  dfF <- df3UTR_count[df3UTR_count$pA_type == "F", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")]
  dfL <- df3UTR_count[df3UTR_count$pA_type == "L", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")]
  colnames(dfF) <- c("GENEID", "Chrom", "Strand", "First", "gene_symbol")
  colnames(dfL) <- c("GENEID", "Chrom", "Strand", "Last", "gene_symbol")
  dfFL <- merge(dfF, dfL, by = c("GENEID", "Chrom", "Strand", "gene_symbol"))
  dfFL <- dfFL[!duplicated(dfFL$GENEID), ]

  dfFLCDE <- merge(dfFL, allcdsminmax, by = c("GENEID", "Strand"))
  dfFLCDE <- dfFLCDE[which(((dfFLCDE$Strand == "+") & (dfFLCDE$First > dfFLCDE$cdsend)) | ((dfFLCDE$Strand == "-") & (dfFLCDE$First < dfFLCDE$cdsend))), ]
  dfFLCDE$First <- as.integer(dfFLCDE$First)
  dfFLCDE$cdsend <- as.integer(dfFLCDE$cdsend)
  dfFLCDE$distance <- dfFLCDE$First - dfFLCDE$cdsend
  dfFLCDE[which(dfFLCDE$Strand == "-"), ]$distance <- dfFLCDE[which(dfFLCDE$Strand == "-"), ]$cdsend - dfFLCDE[which(dfFLCDE$Strand == "-"), ]$First

  index1 <- which(dfFLCDE$Strand == "+" & dfFLCDE$distance < 100)
  dfFLCDE[index1, ]$cdsend <- dfFLCDE[index1, ]$cdsend - (100 - dfFLCDE[index1, ]$distance)
  index2 <- which(dfFLCDE$Strand == "-" & dfFLCDE$distance < 100)
  dfFLCDE[index2, ]$cdsend <- dfFLCDE[index2, ]$cdsend + (100 - dfFLCDE[index2, ]$distance)
  refUTRraw <- dfFLCDE[, c("gene_symbol", "Chrom", "Strand", "First", "Last", "cdsend")]
  refUTRraw <- refUTRraw[!is.na(refUTRraw$gene_symbol), ]
  return(refUTRraw)
 }


 .GTF2IPA <- function(EDB, TXDB, finaldf) {
  ############ intron and SS ################
  introns <- intronsByTranscript(TXDB, use.names = TRUE)
  dfintrons <- as.data.frame(introns)
  colnames(dfintrons) <- c("group", "tx_id", "Chrom", "Start", "End", "width", "Strand")
  # tx2gene <- mcols(transcripts(EDB, columns=c("tx_id", "gene_id",'gene_name')))
  # colnames(tx2gene)=c("tx_id", "GENEID",'gene_symbol')
  tx2gene <- as.data.frame(EDB[which(EDB$type == "transcript"), ])[, c("transcript_id", "gene_id", "gene_name")]
  colnames(tx2gene) <- c("tx_id", "GENEID", "gene_symbol")
  tx2gene <- tx2gene[!duplicated(tx2gene), ]
  dfintrons <- merge(dfintrons, tx2gene, by = "tx_id")
  dfintrons <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Start", "End", "Strand", "width")]
  dfintrons <- dfintrons[!duplicated(dfintrons), ]

  ## ANNO 5SS and 3SS##
  dfintrons$SS5 <- dfintrons$Start
  dfintrons$SS3 <- dfintrons$End
  dfintrons[which(dfintrons$Strand == "-"), ]$SS5 <- dfintrons[which(dfintrons$Strand == "-"), ]$End
  dfintrons[which(dfintrons$Strand == "-"), ]$SS3 <- dfintrons[which(dfintrons$Strand == "-"), ]$Start

  ## Combine 5SS and 3SS##
  dfSS5 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS5")]
  dfSS3 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS3")]
  comname <- c("GENEID", "gene_symbol", "Chrom", "Strand", "upstreamSS")
  colnames(dfSS5) <- comname
  colnames(dfSS3) <- comname
  dfSS5$type <- "SS5"
  dfSS3$type <- "SS3"
  dfupSS <- rbind(dfSS5, dfSS3)
  dfupSS$SSID <- paste0(dfupSS$GENEID, dfupSS$gene_symbol, dfupSS$Chrom, dfupSS$Strand, dfupSS$upstreamSS)
  dfupSS <- dfupSS[!duplicated(dfupSS$SSID), ]
  ############ build condidate downstream SS ##################
  dfdnSS3 <- dfSS3
  dfdnSS3$SSID <- paste0(dfdnSS3$GENEID, dfdnSS3$gene_symbol, dfdnSS3$Chrom, dfdnSS3$Strand, dfdnSS3$upstreamSS)
  dfdnSS3 <- dfdnSS3[!duplicated(dfdnSS3$SSID), ]
  colnames(dfdnSS3) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "downstreamSS", "type", "SSID")
  # dfdnSS3$Chrom=paste0('chr',dfdnSS3$Chrom)
  # dfupSS$Chrom=paste0('chr',dfupSS$Chrom)
  dfupSS <- dfupSS[!is.na(dfupSS$gene_symbol), ]
  dfdnSS3 <- dfdnSS3[!is.na(dfdnSS3$gene_symbol), ]

  ######### IPA and SS ############
  dfIPA <- finaldf[(finaldf$LOCATION == "intron"), ]
  dfIPA <- dfIPA[, c("PASid", "Chr", "PAS", "strand", "GENEID", "gene_name")]
  colnames(dfIPA) <- c("PASid", "Chrom", "Pos", "Strand", "GENEID", "gene_symbol")
  dfIPA <- dfIPA[!duplicated(dfIPA), ]
  dfIPA <- dfIPA[!is.na(dfIPA$gene_symbol), ]
  DFIPASS <- list(dfIPA, dfupSS, dfdnSS3)
  names(DFIPASS) <- c("dfIPA", "dfupSS", "dfdnSS3")
  return(DFIPASS)
 }



 .GTF2LE <- function(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX) {
  dfIPAXXX <- as.data.frame(dfIPAXXX)
  dfupSSXXX <- as.data.frame(dfupSSXXX)
  dfdnSS3XXX <- as.data.frame(dfdnSS3XXX)
  ############ Hit upstream SS ##################
  dfHIT_up <- merge(dfIPAXXX, dfupSSXXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand"))
  dfHIT_up$dis <- dfHIT_up$Pos - dfHIT_up$upstreamSS
  dfHIT_up[which(dfHIT_up$Strand == "-"), ]$dis <- dfHIT_up[which(dfHIT_up$Strand == "-"), ]$upstreamSS - dfHIT_up[which(dfHIT_up$Strand == "-"), ]$Pos
  dfHIT_up$HITID <- paste0(dfHIT_up$GENEID, dfHIT_up$gene_symbol, dfHIT_up$PASid)
  dfHIT_up <- dfHIT_up[which(dfHIT_up$dis > 100), ]
  dfHIT_up <- dfHIT_up[with(dfHIT_up, order(HITID, dis)), ]
  dfHIT_up <- dfHIT_up[!duplicated(dfHIT_up$HITID), ]

  dfHIT_up <- dfHIT_up[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "dis")]
  colnames(dfHIT_up) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "upstream_dis")

  ############ Hit downstream SS ##################
  dfHIT_dn <- merge(dfIPAXXX, dfdnSS3XXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand"))
  dfHIT_dn$dis <- dfHIT_dn$downstreamSS - dfHIT_dn$Pos
  dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$dis <- dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$Pos - dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$downstreamSS
  dfHIT_dn$HITID <- paste0(dfHIT_dn$GENEID, dfHIT_dn$gene_symbol, dfHIT_dn$PASid)
  dfHIT_dn <- dfHIT_dn[which(dfHIT_dn$dis > 100), ]
  dfHIT_dn <- dfHIT_dn[with(dfHIT_dn, order(HITID, dis)), ]
  dfHIT_dn <- dfHIT_dn[!duplicated(dfHIT_dn$HITID), ]

  dfHIT_dn <- dfHIT_dn[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "dis")]
  colnames(dfHIT_dn) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "downstream_dis")

  ############ combine IPA SS ##################
  dfHIT_combine <- merge(dfHIT_up, dfHIT_dn, by = c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos"))
  dfHIT_combine <- dfHIT_combine[!duplicated(dfHIT_combine), ]
  dfHIT_combine$IPA_up_start <- dfHIT_combine$upstreamSS
  dfHIT_combine$IPA_up_end <- dfHIT_combine$Pos
  dfHIT_combine$IPA_dn_start <- dfHIT_combine$Pos
  dfHIT_combine$IPA_dn_end <- dfHIT_combine$downstreamSS

  dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos
  dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$upstreamSS
  dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$downstreamSS
  dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos

  dfIPAREF <- dfHIT_combine[, c("PASid", "GENEID", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")]
  dfIPAREF <- as.data.frame(dfIPAREF)
  dfIPAREF <- dfIPAREF[!duplicated(dfIPAREF), ]

  dfIPAsim <- dfHIT_combine[, c("PASid", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")]
  dfIPAsim <- as.data.frame(dfIPAsim)
  dfIPAsim <- dfIPAsim[!duplicated(dfIPAsim), ]
  dfIPAsim <- dfIPAsim[!is.na(dfIPAsim$gene_symbol), ]

  ############ TES ##############
  dfgenic <- as.data.frame(genes(TXDB))
  dfgenic <- dfgenic[, c(6, 1:3, 5)]
  names(dfgenic) <- c("GENEID", "Chr", "Start", "End", "Strand")

  # find the TES
  dfMinMAX <- dfgenic
  dfMinMAX$TES <- ""
  indexN <- which(dfMinMAX$Strand == "-")
  indexP <- which(dfMinMAX$Strand == "+")
  dfMinMAX[indexP, ]$TES <- dfMinMAX[indexP, ]$End
  dfMinMAX[indexN, ]$TES <- dfMinMAX[indexN, ]$Start
  dfTES <- dfMinMAX[, c("GENEID", "Chr", "TES", "Strand")]
  dfTES <- dfTES[!duplicated(dfTES), ]

  ### I exon###
  exondb <- exonsBy(TXDB, by = "gene")
  exondb <- GenomicRanges::reduce(exondb)
  exonsdf <- as.data.frame(exondb)
  exonsdfP <- exonsdf[which(exonsdf$strand == "+"), ]
  exonsdfN <- exonsdf[which(exonsdf$strand == "-"), ]
  exonsdfN <- exonsdfN[with(exonsdfN, order(group_name, -start)), ]
  exonsdf <- rbind(exonsdfP, exonsdfN)
  exonsdf <- exonsdf[, -c(1)]

  data_EXrename <- exonsdf %>%
    group_by(group_name) %>%
    mutate(col = 1:n()) %>% ## create exonID
    mutate(count = n())
  exonsdf <- data.frame(data_EXrename)
  names(exonsdf)[7:8] <- c("EXid", "EXcount")

  dfLexon <- exonsdf[which(exonsdf$EXid == exonsdf$EXcount), ]
  dfLexon$LEstart <- dfLexon$start
  dfLexon[which(dfLexon$strand == "-"), ]$LEstart <- dfLexon[which(dfLexon$strand == "-"), ]$end
  dfLexon <- dfLexon[, c("group_name", "seqnames", "strand", "LEstart")]
  colnames(dfLexon) <- c("GENEID", "Chr", "Strand", "LEstart")
  dfLE <- merge(dfLexon, dfTES, by = c("GENEID", "Chr", "Strand"))
  dfLE <- dfLE[!duplicated(dfLE), ]

  dfIPAused <- dfIPAREF[, c("GENEID", "gene_symbol")]
  dfIPAused <- dfIPAused[!duplicated(dfIPAused), ]

  dfLE <- merge(dfLE, dfIPAused, by = "GENEID")
  dfLE <- dfLE[!duplicated(dfLE), ]
  # dfLE$Chr=paste0('chr',dfLE$Chr)
  dfLE <- dfLE[, c("gene_symbol", "Chr", "Strand", "LEstart", "TES")]
  colnames(dfLE) <- c("gene_symbol", "Chrom", "Strand", "LEstart", "TES")
  dfLE <- dfLE[!is.na(dfLE$gene_symbol), ]
  dfLE_dfIPA <- list(dfIPAsim, dfLE)
  names(dfLE_dfIPA) <- c("dfIPAsim", "dfLE")
  return(dfLE_dfIPA)
 }



 .get_input_pos <- function(x, y) {
  y2 <- y
  start(y2) <- start(x)
  end(y2) <- end(x)
  return(y2)
 }

 parse_PAS <- function(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand", remove_suffix = FALSE) {
  # parse PAS sites
  print("Read PAS sites from input tsv file")
  df_input <- read.table(tsv, sep = "\t", header = TRUE)
  if (remove_suffix) {
    chr_col <- sub("chr", "", df_input[[chr_name]])
  } else {
    chr_col <- df_input[[chr_name]]
  }
  gr_input <- GRanges(
    seqnames = chr_col,
    ranges = IRanges(start = df_input[[pos_name]], end = df_input[[pos_name]]),
    strand = df_input[[strand_name]]
  )

  print("Read gene model from gtf file")
  EDB <- rtracklayer::import(gtf)
  TXDB <- makeTxDbFromGRanges(EDB)

  print("Annotate PAS sites with gene model")
  ol <- findOverlaps(gr_input, EDB, ignore.strand = FALSE, type = "within", select = "all")
  gr_annot <- unlist(mendoapply(.get_input_pos, split(gr_input[queryHits(ol)], 1:length(ol)), y = split(EDB[subjectHits(ol)], 1:length(ol))))
  dfPAS <- as.data.frame(gr_annot[which(gr_annot$type == "transcript"), ][, c("transcript_id", "gene_id", "gene_name")])
  dfPAS$PAS <- dfPAS$start
  dfTXGeneMapping <- dfPAS[, c("transcript_id", "gene_id")]
  dfTXGeneMapping <- dfTXGeneMapping[!duplicated(dfTXGeneMapping), ]
  dfTXGeneMapping <- dfTXGeneMapping[!is.na(dfTXGeneMapping$transcript_id), ]
  pasdb <- makeGRangesFromDataFrame(dfPAS, keep.extra.columns = TRUE)
  finaldf <- .annotatePAS_V2(pasdb, TXDB, dfTXGeneMapping)
  dfPAS$Chr <- dfPAS$seqnames
  dfPAS$PASid <- paste(dfPAS$seqnames, dfPAS$PAS, dfPAS$strand, sep = ":")
  dfinfo <- dfPAS[, c("PASid", "Chr", "strand", "PAS", "gene_id", "gene_name")]
  colnames(dfinfo) <- c("PASid", "Chr", "strand", "PAS", "GENEID", "gene_name")
  finaldf <- merge(finaldf, dfinfo, by = c("PASid", "GENEID"))

  # output PAS objects
  print("Extracting and filtering 3'UTR PASs")
  refUTRraw <- .GTF2refUTRraw(TXDB, finaldf)

  print("Extracting IPAs")
  dfIPAALL <- .GTF2IPA(EDB, TXDB, finaldf)

  print("Extracting 3' last exons")
  dfIPAXXX <- dfIPAALL$dfIPA
  dfupSSXXX <- dfIPAALL$dfupSS
  dfdnSS3XXX <- dfIPAALL$dfdnSS3
  dfLE_dfIPA <- .GTF2LE(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX)
  print("Finalizing references")
  PASREF <- list(refUTRraw, dfLE_dfIPA$dfIPAsim, dfLE_dfIPA$dfLE)
  names(PASREF) <- c("refUTRraw", "dfIPA", "dfLE")
  return(PASREF)
 }
	source("./utils_APA.R")


	## How ot use?
	##
	## Change the path of tsv and gtf file
	## `chr_name` is the column name of chromosome in the tsv file, default is "Chromosome"
	## `pos_name` is the column name of position in the tsv file, default is "End"
	## `strand_name` is the column name of strand in the tsv file, default is "Strand"
	tsv <- "./test_sites.tsv"
	gtf <- "~/reference/feature/Homo_sapiens.GRCh38.genome.gtf"
	parse_PAS(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand")
	library(dplyr)
	library(GenomicFeatures)
	library(APAlyzer)

	.annotatePASRegion <- function(pasdb, dbreigon, dfTXGeneMapping, flag) {
	x <- unlist(dbreigon)
	x$IndexID <- 1:length(x)
	x$transcript_id <- names(x)
	names(x) <- x$IndexID
	acc2sym <- data.frame(names(x), x$transcript_id)
	colnames(acc2sym) <- c("IndexID", "transcript_id")
	acc2sym <- merge(acc2sym, dfTXGeneMapping, on = "transcript_id", all.x = TRUE)
	x <- x[acc2sym$IndexID]
	names(x) <- acc2sym$gene_id
	dbreigon <- split(x, names(x))
	dbreigon <- GenomicRanges::reduce(dbreigon)
	olp <- findOverlaps(pasdb, dbreigon)
	if (length(olp) > 0) {
	hit <- pasdb[queryHits(olp), ]
	hit$LOCATION <- flag
	hit$PASid <- paste(seqnames(hit), start(hit), strand(hit), sep = ":")
	hit <- as.data.frame(hit, row.names=1:length(hit))
	} else {
	hit <- "NoHit"
	}
	return(hit)
	}

	.annotatePAS_V2 <- function(pasdb, TXDB, dfTXGeneMapping) {
	## a.5UTR
	fiveUTRs <- fiveUTRsByTranscript(TXDB, use.names = TRUE)
	fiveUTRhit <- .annotatePASRegion(pasdb, fiveUTRs, dfTXGeneMapping, "fiveUTR")

	## b.intron
	introns <- intronsByTranscript(TXDB, use.names = TRUE)
	intronhit <- .annotatePASRegion(pasdb, introns, dfTXGeneMapping, "intron")

	## c.3UTR
	threeUTRs <- threeUTRsByTranscript(TXDB, use.names = TRUE)
	threeUTRhit <- .annotatePASRegion(pasdb, threeUTRs, dfTXGeneMapping, "threeUTR")

	## d.CDS
	cds <- cdsBy(TXDB, by = "tx", use.names = TRUE)
	cdshit <- .annotatePASRegion(pasdb, cds, dfTXGeneMapping, "coding")

	## e.all Exons
	exon <- exonsBy(TXDB, by = "tx", use.names = TRUE)
	exonhit <- .annotatePASRegion(pasdb, exon, dfTXGeneMapping, "Exon")

	## Creat header
	dbhit <- as.data.frame(pasdb[1])
	dbhit$LOCATION <- "NONE"
	dbhit$PASid <- "NONE"
	for (hit in list(fiveUTRhit, intronhit, threeUTRhit, cdshit, exonhit)) {
	if (class(hit) == "data.frame") {
	dbhit <- rbind(dbhit, hit)
	}
	}
	dbhit <- dbhit[dbhit$PASid != "NONE", ]
	finaldf <- dbhit[, c("PASid", "LOCATION", "gene_id")]
	colnames(finaldf)[3] <- "GENEID"
	finaldf$uniID1 <- paste(finaldf$PASid, finaldf$GENEID, finaldf$LOCATION, sep = ":")
	finaldf$uniID2 <- paste(finaldf$PASid, finaldf$GENEID, sep = ":")
	finaldf <- finaldf[!duplicated(finaldf), ]
	finaldf$ORDERID <- 10
	if (nrow(finaldf[which(finaldf$LOCATION == "fiveUTR"), ]) > 0) {
	finaldf[which(finaldf$LOCATION == "fiveUTR"), ]$ORDERID <- 1
	}

	if (nrow(finaldf[which(finaldf$LOCATION == "intron"), ]) > 0) {
	finaldf[which(finaldf$LOCATION == "intron"), ]$ORDERID <- 2
	}

	if (nrow(finaldf[which(finaldf$LOCATION == "coding"), ]) > 0) {
	finaldf[which(finaldf$LOCATION == "coding"), ]$ORDERID <- 3
	}

	if (nrow(finaldf[which(finaldf$LOCATION == "threeUTR"), ]) > 0) {
	finaldf[which(finaldf$LOCATION == "threeUTR"), ]$ORDERID <- 4
	}
	finaldf <- finaldf[with(finaldf, order(uniID2, ORDERID)), ]
	finaldf <- finaldf[!duplicated(finaldf$uniID2), ]
	return(finaldf)
	}


	.GTF2refUTRraw <- function(TXDB, finaldf) {
	####### CDS end ############
	cds <- cdsBy(TXDB, "gene")
	x <- unlist(cds)
	cds1 <- split(x, names(x))
	test <- as.data.frame(cds1)
	index <- which(test$end - test$start > 1)
	allcds <- test[index, ]
	indexP <- which(allcds$strand == "+")
	indexN <- which(allcds$strand == "-")
	allcds$cdsend <- ""
	allcds[indexP, ]$cdsend <- allcds[indexP, ]$end
	allcds[indexN, ]$cdsend <- allcds[indexN, ]$start
	MAX <- aggregate(cdsend ~ group_name, allcds, max)
	MIN <- aggregate(cdsend ~ group_name, allcds, min)
	names(MAX)[2] <- "max"
	names(MIN)[2] <- "min"
	maxmin <- merge(x = MAX, y = MIN, by = "group_name")
	allcdssub <- allcds[, c("group_name", "strand")]
	allcdssub <- allcdssub[!duplicated(allcdssub$group_name), ]
	allcdsminmax <- merge(x = allcdssub, y = maxmin, by = "group_name")
	indexP <- which(allcdsminmax$strand == "+")
	indexN <- which(allcdsminmax$strand == "-")
	allcdsminmax$cdsend <- ""
	allcdsminmax[indexP, ]$cdsend <- allcdsminmax[indexP, ]$max
	allcdsminmax[indexN, ]$cdsend <- allcdsminmax[indexN, ]$min
	names(allcdsminmax)[1] <- "GENEID"
	names(allcdsminmax)[2] <- "Strand"
	allcdsminmax <- allcdsminmax[, c("GENEID", "Strand", "cdsend")]


	##### 3UTR PAS ######
	df3UTR <- finaldf[which(finaldf$LOCATION == "threeUTR"), ]
	df3UTR <- merge(df3UTR, allcdsminmax[, c("GENEID", "cdsend")], by = c("GENEID"))
	## Filter 3UTR PAS by CDS
	df3UTR <- df3UTR[which(((df3UTR$strand == "+") & (df3UTR$PAS > df3UTR$cdsend)) \| ((df3UTR$strand == "-") & (df3UTR$PAS < df3UTR$cdsend))), ]

	df3UTR_P <- df3UTR[which(df3UTR$strand == "+"), ]
	df3UTR_P <- df3UTR_P[with(df3UTR_P, order(GENEID, PAS)), ]
	df3UTR_N <- df3UTR[which(df3UTR$strand == "-"), ]
	df3UTR_N <- df3UTR_N[with(df3UTR_N, order(GENEID, -PAS)), ]
	df3UTR <- rbind(df3UTR_P, df3UTR_N)
	df3UTR_count <- df3UTR %>%
	group_by(GENEID) %>%
	mutate(col = 1:n()) %>% ## create featureID
	mutate(count = n())
	df3UTR_count <- data.frame(df3UTR_count)
	df3UTR_count$pA_type <- "M"
	if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]) > 0) {
	df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count == 1), ]$pA_type <- "S"
	}

	if (nrow(df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]) > 0) {
	df3UTR_count[which(df3UTR_count$col == df3UTR_count$count & df3UTR_count$count > 1), ]$pA_type <- "L"
	}

	if (nrow(df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]) > 0) {
	df3UTR_count[which(df3UTR_count$col == 1 & df3UTR_count$count > 1), ]$pA_type <- "F"
	}


	dfF <- df3UTR_count[df3UTR_count$pA_type == "F", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")]
	dfL <- df3UTR_count[df3UTR_count$pA_type == "L", ][, c("GENEID", "Chr", "strand", "PAS", "gene_name")]
	colnames(dfF) <- c("GENEID", "Chrom", "Strand", "First", "gene_symbol")
	colnames(dfL) <- c("GENEID", "Chrom", "Strand", "Last", "gene_symbol")
	dfFL <- merge(dfF, dfL, by = c("GENEID", "Chrom", "Strand", "gene_symbol"))
	dfFL <- dfFL[!duplicated(dfFL$GENEID), ]

	dfFLCDE <- merge(dfFL, allcdsminmax, by = c("GENEID", "Strand"))
	dfFLCDE <- dfFLCDE[which(((dfFLCDE$Strand == "+") & (dfFLCDE$First > dfFLCDE$cdsend)) \| ((dfFLCDE$Strand == "-") & (dfFLCDE$First < dfFLCDE$cdsend))), ]
	dfFLCDE$First <- as.integer(dfFLCDE$First)
	dfFLCDE$cdsend <- as.integer(dfFLCDE$cdsend)
	dfFLCDE$distance <- dfFLCDE$First - dfFLCDE$cdsend
	dfFLCDE[which(dfFLCDE$Strand == "-"), ]$distance <- dfFLCDE[which(dfFLCDE$Strand == "-"), ]$cdsend - dfFLCDE[which(dfFLCDE$Strand == "-"), ]$First

	index1 <- which(dfFLCDE$Strand == "+" & dfFLCDE$distance < 100)
	dfFLCDE[index1, ]$cdsend <- dfFLCDE[index1, ]$cdsend - (100 - dfFLCDE[index1, ]$distance)
	index2 <- which(dfFLCDE$Strand == "-" & dfFLCDE$distance < 100)
	dfFLCDE[index2, ]$cdsend <- dfFLCDE[index2, ]$cdsend + (100 - dfFLCDE[index2, ]$distance)
	refUTRraw <- dfFLCDE[, c("gene_symbol", "Chrom", "Strand", "First", "Last", "cdsend")]
	refUTRraw <- refUTRraw[!is.na(refUTRraw$gene_symbol), ]
	return(refUTRraw)
	}


	.GTF2IPA <- function(EDB, TXDB, finaldf) {
	############ intron and SS ################
	introns <- intronsByTranscript(TXDB, use.names = TRUE)
	dfintrons <- as.data.frame(introns)
	colnames(dfintrons) <- c("group", "tx_id", "Chrom", "Start", "End", "width", "Strand")
	# tx2gene <- mcols(transcripts(EDB, columns=c("tx_id", "gene_id",'gene_name')))
	# colnames(tx2gene)=c("tx_id", "GENEID",'gene_symbol')
	tx2gene <- as.data.frame(EDB[which(EDB$type == "transcript"), ])[, c("transcript_id", "gene_id", "gene_name")]
	colnames(tx2gene) <- c("tx_id", "GENEID", "gene_symbol")
	tx2gene <- tx2gene[!duplicated(tx2gene), ]
	dfintrons <- merge(dfintrons, tx2gene, by = "tx_id")
	dfintrons <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Start", "End", "Strand", "width")]
	dfintrons <- dfintrons[!duplicated(dfintrons), ]

	## ANNO 5SS and 3SS##
	dfintrons$SS5 <- dfintrons$Start
	dfintrons$SS3 <- dfintrons$End
	dfintrons[which(dfintrons$Strand == "-"), ]$SS5 <- dfintrons[which(dfintrons$Strand == "-"), ]$End
	dfintrons[which(dfintrons$Strand == "-"), ]$SS3 <- dfintrons[which(dfintrons$Strand == "-"), ]$Start

	## Combine 5SS and 3SS##
	dfSS5 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS5")]
	dfSS3 <- dfintrons[, c("GENEID", "gene_symbol", "Chrom", "Strand", "SS3")]
	comname <- c("GENEID", "gene_symbol", "Chrom", "Strand", "upstreamSS")
	colnames(dfSS5) <- comname
	colnames(dfSS3) <- comname
	dfSS5$type <- "SS5"
	dfSS3$type <- "SS3"
	dfupSS <- rbind(dfSS5, dfSS3)
	dfupSS$SSID <- paste0(dfupSS$GENEID, dfupSS$gene_symbol, dfupSS$Chrom, dfupSS$Strand, dfupSS$upstreamSS)
	dfupSS <- dfupSS[!duplicated(dfupSS$SSID), ]
	############ build condidate downstream SS ##################
	dfdnSS3 <- dfSS3
	dfdnSS3$SSID <- paste0(dfdnSS3$GENEID, dfdnSS3$gene_symbol, dfdnSS3$Chrom, dfdnSS3$Strand, dfdnSS3$upstreamSS)
	dfdnSS3 <- dfdnSS3[!duplicated(dfdnSS3$SSID), ]
	colnames(dfdnSS3) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "downstreamSS", "type", "SSID")
	# dfdnSS3$Chrom=paste0('chr',dfdnSS3$Chrom)
	# dfupSS$Chrom=paste0('chr',dfupSS$Chrom)
	dfupSS <- dfupSS[!is.na(dfupSS$gene_symbol), ]
	dfdnSS3 <- dfdnSS3[!is.na(dfdnSS3$gene_symbol), ]

	######### IPA and SS ############
	dfIPA <- finaldf[(finaldf$LOCATION == "intron"), ]
	dfIPA <- dfIPA[, c("PASid", "Chr", "PAS", "strand", "GENEID", "gene_name")]
	colnames(dfIPA) <- c("PASid", "Chrom", "Pos", "Strand", "GENEID", "gene_symbol")
	dfIPA <- dfIPA[!duplicated(dfIPA), ]
	dfIPA <- dfIPA[!is.na(dfIPA$gene_symbol), ]
	DFIPASS <- list(dfIPA, dfupSS, dfdnSS3)
	names(DFIPASS) <- c("dfIPA", "dfupSS", "dfdnSS3")
	return(DFIPASS)
	}



	.GTF2LE <- function(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX) {
	dfIPAXXX <- as.data.frame(dfIPAXXX)
	dfupSSXXX <- as.data.frame(dfupSSXXX)
	dfdnSS3XXX <- as.data.frame(dfdnSS3XXX)
	############ Hit upstream SS ##################
	dfHIT_up <- merge(dfIPAXXX, dfupSSXXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand"))
	dfHIT_up$dis <- dfHIT_up$Pos - dfHIT_up$upstreamSS
	dfHIT_up[which(dfHIT_up$Strand == "-"), ]$dis <- dfHIT_up[which(dfHIT_up$Strand == "-"), ]$upstreamSS - dfHIT_up[which(dfHIT_up$Strand == "-"), ]$Pos
	dfHIT_up$HITID <- paste0(dfHIT_up$GENEID, dfHIT_up$gene_symbol, dfHIT_up$PASid)
	dfHIT_up <- dfHIT_up[which(dfHIT_up$dis > 100), ]
	dfHIT_up <- dfHIT_up[with(dfHIT_up, order(HITID, dis)), ]
	dfHIT_up <- dfHIT_up[!duplicated(dfHIT_up$HITID), ]

	dfHIT_up <- dfHIT_up[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "dis")]
	colnames(dfHIT_up) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "upstreamSS", "type", "upstream_dis")

	############ Hit downstream SS ##################
	dfHIT_dn <- merge(dfIPAXXX, dfdnSS3XXX, by = c("GENEID", "gene_symbol", "Chrom", "Strand"))
	dfHIT_dn$dis <- dfHIT_dn$downstreamSS - dfHIT_dn$Pos
	dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$dis <- dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$Pos - dfHIT_dn[which(dfHIT_dn$Strand == "-"), ]$downstreamSS
	dfHIT_dn$HITID <- paste0(dfHIT_dn$GENEID, dfHIT_dn$gene_symbol, dfHIT_dn$PASid)
	dfHIT_dn <- dfHIT_dn[which(dfHIT_dn$dis > 100), ]
	dfHIT_dn <- dfHIT_dn[with(dfHIT_dn, order(HITID, dis)), ]
	dfHIT_dn <- dfHIT_dn[!duplicated(dfHIT_dn$HITID), ]

	dfHIT_dn <- dfHIT_dn[, c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "dis")]
	colnames(dfHIT_dn) <- c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos", "downstreamSS", "downstream_dis")

	############ combine IPA SS ##################
	dfHIT_combine <- merge(dfHIT_up, dfHIT_dn, by = c("GENEID", "gene_symbol", "Chrom", "Strand", "PASid", "Pos"))
	dfHIT_combine <- dfHIT_combine[!duplicated(dfHIT_combine), ]
	dfHIT_combine$IPA_up_start <- dfHIT_combine$upstreamSS
	dfHIT_combine$IPA_up_end <- dfHIT_combine$Pos
	dfHIT_combine$IPA_dn_start <- dfHIT_combine$Pos
	dfHIT_combine$IPA_dn_end <- dfHIT_combine$downstreamSS

	dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos
	dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_up_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$upstreamSS
	dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_start <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$downstreamSS
	dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$IPA_dn_end <- dfHIT_combine[which(dfHIT_combine$Strand == "-"), ]$Pos

	dfIPAREF <- dfHIT_combine[, c("PASid", "GENEID", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")]
	dfIPAREF <- as.data.frame(dfIPAREF)
	dfIPAREF <- dfIPAREF[!duplicated(dfIPAREF), ]

	dfIPAsim <- dfHIT_combine[, c("PASid", "gene_symbol", "Chrom", "Strand", "Pos", "upstreamSS", "downstreamSS")]
	dfIPAsim <- as.data.frame(dfIPAsim)
	dfIPAsim <- dfIPAsim[!duplicated(dfIPAsim), ]
	dfIPAsim <- dfIPAsim[!is.na(dfIPAsim$gene_symbol), ]

	############ TES ##############
	dfgenic <- as.data.frame(genes(TXDB))
	dfgenic <- dfgenic[, c(6, 1:3, 5)]
	names(dfgenic) <- c("GENEID", "Chr", "Start", "End", "Strand")

	# find the TES
	dfMinMAX <- dfgenic
	dfMinMAX$TES <- ""
	indexN <- which(dfMinMAX$Strand == "-")
	indexP <- which(dfMinMAX$Strand == "+")
	dfMinMAX[indexP, ]$TES <- dfMinMAX[indexP, ]$End
	dfMinMAX[indexN, ]$TES <- dfMinMAX[indexN, ]$Start
	dfTES <- dfMinMAX[, c("GENEID", "Chr", "TES", "Strand")]
	dfTES <- dfTES[!duplicated(dfTES), ]

	### I exon###
	exondb <- exonsBy(TXDB, by = "gene")
	exondb <- GenomicRanges::reduce(exondb)
	exonsdf <- as.data.frame(exondb)
	exonsdfP <- exonsdf[which(exonsdf$strand == "+"), ]
	exonsdfN <- exonsdf[which(exonsdf$strand == "-"), ]
	exonsdfN <- exonsdfN[with(exonsdfN, order(group_name, -start)), ]
	exonsdf <- rbind(exonsdfP, exonsdfN)
	exonsdf <- exonsdf[, -c(1)]

	data_EXrename <- exonsdf %>%
	group_by(group_name) %>%
	mutate(col = 1:n()) %>% ## create exonID
	mutate(count = n())
	exonsdf <- data.frame(data_EXrename)
	names(exonsdf)[7:8] <- c("EXid", "EXcount")

	dfLexon <- exonsdf[which(exonsdf$EXid == exonsdf$EXcount), ]
	dfLexon$LEstart <- dfLexon$start
	dfLexon[which(dfLexon$strand == "-"), ]$LEstart <- dfLexon[which(dfLexon$strand == "-"), ]$end
	dfLexon <- dfLexon[, c("group_name", "seqnames", "strand", "LEstart")]
	colnames(dfLexon) <- c("GENEID", "Chr", "Strand", "LEstart")
	dfLE <- merge(dfLexon, dfTES, by = c("GENEID", "Chr", "Strand"))
	dfLE <- dfLE[!duplicated(dfLE), ]

	dfIPAused <- dfIPAREF[, c("GENEID", "gene_symbol")]
	dfIPAused <- dfIPAused[!duplicated(dfIPAused), ]

	dfLE <- merge(dfLE, dfIPAused, by = "GENEID")
	dfLE <- dfLE[!duplicated(dfLE), ]
	# dfLE$Chr=paste0('chr',dfLE$Chr)
	dfLE <- dfLE[, c("gene_symbol", "Chr", "Strand", "LEstart", "TES")]
	colnames(dfLE) <- c("gene_symbol", "Chrom", "Strand", "LEstart", "TES")
	dfLE <- dfLE[!is.na(dfLE$gene_symbol), ]
	dfLE_dfIPA <- list(dfIPAsim, dfLE)
	names(dfLE_dfIPA) <- c("dfIPAsim", "dfLE")
	return(dfLE_dfIPA)
	}



	.get_input_pos <- function(x, y) {
	y2 <- y
	start(y2) <- start(x)
	end(y2) <- end(x)
	return(y2)
	}

	parse_PAS <- function(tsv, gtf, chr_name = "Chromosome", pos_name = "End", strand_name = "Strand", remove_suffix = FALSE) {
	# parse PAS sites
	print("Read PAS sites from input tsv file")
	df_input <- read.table(tsv, sep = "\t", header = TRUE)
	if (remove_suffix) {
	chr_col <- sub("chr", "", df_input[[chr_name]])
	} else {
	chr_col <- df_input[[chr_name]]
	}
	gr_input <- GRanges(
	seqnames = chr_col,
	ranges = IRanges(start = df_input[[pos_name]], end = df_input[[pos_name]]),
	strand = df_input[[strand_name]]
	)

	print("Read gene model from gtf file")
	EDB <- rtracklayer::import(gtf)
	TXDB <- makeTxDbFromGRanges(EDB)

	print("Annotate PAS sites with gene model")
	ol <- findOverlaps(gr_input, EDB, ignore.strand = FALSE, type = "within", select = "all")
	gr_annot <- unlist(mendoapply(.get_input_pos, split(gr_input[queryHits(ol)], 1:length(ol)), y = split(EDB[subjectHits(ol)], 1:length(ol))))
	dfPAS <- as.data.frame(gr_annot[which(gr_annot$type == "transcript"), ][, c("transcript_id", "gene_id", "gene_name")])
	dfPAS$PAS <- dfPAS$start
	dfTXGeneMapping <- dfPAS[, c("transcript_id", "gene_id")]
	dfTXGeneMapping <- dfTXGeneMapping[!duplicated(dfTXGeneMapping), ]
	dfTXGeneMapping <- dfTXGeneMapping[!is.na(dfTXGeneMapping$transcript_id), ]
	pasdb <- makeGRangesFromDataFrame(dfPAS, keep.extra.columns = TRUE)
	finaldf <- .annotatePAS_V2(pasdb, TXDB, dfTXGeneMapping)
	dfPAS$Chr <- dfPAS$seqnames
	dfPAS$PASid <- paste(dfPAS$seqnames, dfPAS$PAS, dfPAS$strand, sep = ":")
	dfinfo <- dfPAS[, c("PASid", "Chr", "strand", "PAS", "gene_id", "gene_name")]
	colnames(dfinfo) <- c("PASid", "Chr", "strand", "PAS", "GENEID", "gene_name")
	finaldf <- merge(finaldf, dfinfo, by = c("PASid", "GENEID"))

	# output PAS objects
	print("Extracting and filtering 3'UTR PASs")
	refUTRraw <- .GTF2refUTRraw(TXDB, finaldf)

	print("Extracting IPAs")
	dfIPAALL <- .GTF2IPA(EDB, TXDB, finaldf)

	print("Extracting 3' last exons")
	dfIPAXXX <- dfIPAALL$dfIPA
	dfupSSXXX <- dfIPAALL$dfupSS
	dfdnSS3XXX <- dfIPAALL$dfdnSS3
	dfLE_dfIPA <- .GTF2LE(TXDB, dfIPAXXX, dfupSSXXX, dfdnSS3XXX)
	print("Finalizing references")
	PASREF <- list(refUTRraw, dfLE_dfIPA$dfIPAsim, dfLE_dfIPA$dfLE)
	names(PASREF) <- c("refUTRraw", "dfIPA", "dfLE")
	return(PASREF)
	}