Last active
October 13, 2015 08:48
-
-
Save radaniba/4170466 to your computer and use it in GitHub Desktop.
Here is a script for gene set enrichment over KEGG, the perl script serves as automation for the analysis of several gene set files inside a directory, the R script does the analysis itself using bioconductor GOStats package. Look inside the perl script
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| sink("log",append=FALSE,type=c("output","message"),split=FALSE) | |
| library(GOstats) | |
| library(EMA) | |
| library(fdrtool) | |
| library(org.Pt.eg.db) | |
| Args <- commandArgs(); | |
| UNIV <- Args[6] | |
| #GL <- Args[5] | |
| DIR <- Args[5] | |
| PVCUT <- Args[7] | |
| FDRcut <- Args[8] | |
| sprintf("%s","%s","PROCESSING PLEASE WAIT...","\n") | |
| ALLFILES <- list.files(toString(DIR)) | |
| #,pattern="^H11_" | |
| i<-1 | |
| universe <- read.table(toString(UNIV),sep="\t",header=FALSE) | |
| while(i <= length(ALLFILES)){ | |
| inGenes <- read.table (paste(toString(DIR),"/",toString(ALLFILES[i]),sep="",collapse=NULL), sep="\t", header=FALSE) | |
| OUTFILE <- paste(toString(DIR),"/",toString(ALLFILES[i]),".ENR",sep="",collapse=NULL) | |
| selected <- unique(inGenes$V1) | |
| paramK <- new("KEGGHyperGParams", geneIds=selected, universeGeneIds=universe, annotation="org.Hs.eg.db", pvalueCutoff=as.numeric(PVCUT), testDirection="over") | |
| hypK <- hyperGTest(paramK) | |
| sumTableK <- summary(hypK) | |
| # subset the output table to get the columns of interest | |
| # (GO ID, GO description, p-value) | |
| # (KEGG ID, KEGG description, p-value) | |
| outK <- subset(sumTableK, select=c(1,7,2)) | |
| #cat(outK$KEGGID) | |
| # retrieve input genes associated witsourceh each KEGG identifier | |
| # use the org.Hs.eg data mapping to get KEGG terms for each ID | |
| if(length(outK$KEGGID)>=2){ | |
| keggMaps <- lapply(outK$KEGGID, function(x) unlist(mget(x, org.Hs.egPATH2EG))) | |
| # subset the selected genes based on those in the mappings | |
| keggSelected <- lapply(keggMaps, function(x) selected[selected %in% x]) | |
| # join together with a semicolon to make up the last column | |
| outK$inGenes <- unlist(lapply(keggSelected, function(x) paste(x, collapse=";"))) | |
| #adjusted pvalues here we use FDR-BF | |
| pvalsK <- outK$Pvalue | |
| adjpvalsK <- multiple.correction(pvalsK,typeFDR="FDR-BH") | |
| fdr = fdrtool(adjpvalsK, statistic="pvalue",plot=FALSE) | |
| # Add the adjusted Pvalues to the data frame containing the results | |
| outK[,5]<-as.data.frame(adjpvalsK) | |
| outK[,6]<-as.data.frame(fdr$qval) | |
| # write the final data table as a tab separated file | |
| write.table(outK, file=OUTFILE, sep="\t", row.names=FALSE) | |
| debugging_line <- paste(as.character(ALLFILES[i]),"... Done",sep=" ") | |
| cat(debugging_line) | |
| } | |
| i<-i+1 | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| #!/usr/bin/perl | |
| use strict; | |
| use warnings; | |
| my $num_args = $#ARGV + 1; | |
| if ($num_args != 4 ) { | |
| print "usage: perl BulkEnrichment.pl <Folder containing the Gene List> <Universe> <pvalue_cutoff> <FDR_cutoff> \n"; | |
| exit; | |
| } | |
| my $dir = $ARGV[0]; | |
| my $universe=$ARGV[1]; | |
| my $pvalue_cutoff = $ARGV[2]; | |
| my $FDR_cutoff = $ARGV[3]; | |
| my $Enrichment_CMD = "R --slave --vanilla --args $dir $universe $pvalue_cutoff $FDR_cutoff < Enrichment.R > LOG"; | |
| system($Enrichment_CMD); | |
| exit 0; |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment