##########################################################
## OncoMerge: oncoMerge.py                              ##
##  ______     ______     __  __                        ##
## /\  __ \   /\  ___\   /\ \/\ \                       ##
## \ \  __ \  \ \___  \  \ \ \_\ \                      ##
##  \ \_\ \_\  \/\_____\  \ \_____\                     ##
##   \/_/\/_/   \/_____/   \/_____/                     ##
## @Developed by: Plaisier Lab                          ##
##   (https://plaisierlab.engineering.asu.edu/)         ##
##   Arizona State University                           ##
##   242 ISTB1, 550 E Orange St                         ##
##   Tempe, AZ 85281                                    ##
## @github: https://github.com/plaisier-lab/OncoMerge   ##
## @Author: Chris Plaisier, Rob Schultz                 ##
## @License: GNU GPLv3                                  ##
##                                                      ##
## If this program is used in your analysis please      ##
## mention who built it. Thanks. :-)                    ##
##########################################################
#####################
## Import packages ##
#####################
import json
import argparse
import pandas as pd
import numpy as np
from statsmodels.stats.multitest import multipletests
import itertools
import sys
import os
from scipy.stats import hypergeom
import re
#Racheal
##################################
## Read in command line options ##
##################################
parser = argparse.ArgumentParser(description='OncoMerge merges patient Protein Affecting Mutations (PAMs) and Copy Number Alterations (CNAs) into a unified mutation matrix.')
parser.add_argument('-cf', '--config_path', help='Path to JSON encoded configuration file, overrides command line parameters', type = str)
parser.add_argument('-gp', '--gistic_path', help='Path to GISTIC output folder', type = str)
parser.add_argument('-dp', '--del_path', help='Path to GISTIC deletion file (default = del_genes.conf_99.txt)', type = str, default = 'del_genes.conf_99.txt')
parser.add_argument('-ap', '--amp_path', help='Path to the GISTIC amplification file (default = amp_genes.conf_99.txt)', type = str, default = 'amp_genes.conf_99.txt')
parser.add_argument('-gdp', '--gene_data_path', help='Path to the GISTIC gene data file (default = all_data_by_genes.txt)', type = str, default = 'all_data_by_genes.txt')
parser.add_argument('-cfp', '--conversion_file_path', help='Supply hand annotated file to convert gene symbols to Entrez IDs (default does not import hand annotated conversion file and instead uses conversion embedded in GISTIC output files).', type = str)
parser.add_argument('-ln', '--label_name', help='Label for Entrez ID column in GISTIC gene data file (default = \'Gene ID\')', type = str, default = 'Gene ID')
parser.add_argument('-tp', '--thresh_path', help='Path to the GISTIC all_thresholded file (default = all_thresholded.by_genes.txt)', type = str, default = 'all_thresholded.by_genes.txt')
parser.add_argument('-smp', '--som_mut_path', help='Path to the somatic mutations file (CSV matrix where columns are patients and genes are rows) [0 = not mutated, and 1 = mutated]', type = str)
parser.add_argument('-mscv', '--mutsig2cv_path', help='Path to a MutSig2CV output file', type = str)
parser.add_argument('-op', '--output_path', help='Path you would like to output OncoMerged files (default = current directory)', type = str, default = '.')
parser.add_argument('-mmf', '--min_mut_freq', help='Minimum frequency of mutation (range = 0-1; default = 0.05)', type = float, default = 0.05)
parser.add_argument('-pq', '--perm_qv', help='Permuted p-value FDR BH corrected cutoff (default = 0.1)', type = float, default = 0.1)
parser.add_argument('-sp', '--save_permutation', help='Run and save out permutation analysis to be used for comparability in another OncoMerge run (default off)', action='store_true')
parser.add_argument('-lp', '--load_permutation', help='Do not run permutation analysis and instead load permutation analysis from a previous run (default off)', type = str, default = None)
parser.add_argument('-mcr', '--min_coincidence_rate', help='Minimum coincidence rate for the shallow coincidence filter (default = 0.001)', type = float, default = 0.001)
parser.add_argument('-lg', '--min_loci_genes', help='Minimum number of genes in loci to apply shallow coincidence filter (default = 2)', type = int, default = 2)
args = parser.parse_args()
#######################
## Define parameters ##
#######################
params = args.__dict__
if args.config_path:
    with open(args.config_path, "r") as cfg:
        tmp = json.loads(cfg.read())
        for i in tmp:
            params[i] = tmp[i]
if (not params['gistic_path']) or (not params['som_mut_path']) or (not params['mutsig2cv_path']) or (params['save_permutation'] and not params['load_permutation']==None):
    parser.print_help()
    sys.exit(1)
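# Example invocation (a minimal sketch; the file and folder names below are
# hypothetical and only illustrate the expected inputs):
#
#   python oncoMerge.py -gp gistic_output/ -smp somatic_mutations_matrix.csv \
#       -mscv mutsig2cv_output.txt -op oncomerge_output/
#
# Alternatively, -cf/--config_path can point to a JSON file whose keys mirror the
# argparse destinations above. Note that the pathway files loaded further down
# ('gopath' and 'keggpath') have no command line flags and can only be supplied
# through this config file, e.g. (hypothetical values):
#
#   {
#       "gistic_path": "gistic_output",
#       "som_mut_path": "somatic_mutations_matrix.csv",
#       "mutsig2cv_path": "mutsig2cv_output.txt",
#       "output_path": "oncomerge_output",
#       "gopath": "go_sets.csv",
#       "keggpath": "kegg_sets.csv"
#   }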
##################
## Load up data ##
##################
print('Loading data...')
# Create conversion series for gene symbol to Entrez ID
if not params['conversion_file_path']:
    n1 = pd.read_csv(params['gistic_path']+'/'+params['gene_data_path'],index_col=0,sep='\t',usecols=[0,1])
    n1.index = [i.split('|')[0] for i in n1.index]
    n1 = n1.drop_duplicates()
    n1 = n1[params['label_name']]
else:
    n1 = pd.read_csv(params['conversion_file_path'],index_col=0)[params['label_name']].apply(int)
# Load up significantly mutated genes
mutSig2CV = pd.read_csv(params['mutsig2cv_path'],index_col=1)
mutSig2CV = mutSig2CV.loc[mutSig2CV.index.map(lambda x: x in n1.index)]
mutSig2CV.index = mutSig2CV.index.map(lambda x: n1.loc[x])
mutSig2CV = mutSig2CV.loc[~mutSig2CV.index.duplicated(keep='first')]
sigPAMs = list(mutSig2CV.index[mutSig2CV['q']<=0.05])
print('sigpams1')
print(sigPAMs)
print('sigpams done')
# Get list of significantly CNA amplified genes
ampLoci = {}
ampLoci_qv = {}
amp1 = pd.read_csv(params['gistic_path']+'/'+params['amp_path'],index_col=0,sep='\t')
for col1 in amp1.columns:
    if float(amp1[col1]['residual q value'])<=0.05 and not (col1[0]=='X' or col1[0]=='Y'):
        ampLoci[col1] = list(set([n1.loc[i] for i in [i.lstrip('[').rstrip(']').split('|')[0] for i in list(amp1[col1].dropna()[3:])] if i in n1.index and n1.loc[i]>0]))
        ampLoci_qv[col1] = float(amp1[col1]['residual q value'])
print('amp1')
print(amp1)
print('amp1 done')
print(ampLoci)
# Get list of significantly CNA deleted genes
delLoci = {}
delLoci_qv = {}
del1 = pd.read_csv(params['gistic_path']+'/'+params['del_path'],index_col=0,sep='\t')
for col1 in del1.columns:
    if float(del1[col1]['residual q value'])<=0.05 and not (col1[0]=='X' or col1[0]=='Y'):
        delLoci[col1] = list(set([n1.loc[i] for i in [i.lstrip('[').rstrip(']').split('|')[0] for i in list(del1[col1].dropna()[3:])] if i in n1.index and n1.loc[i]>0]))
        delLoci_qv[col1] = float(del1[col1]['residual q value'])
print('delLoci')
print(delLoci)
# Set up background gene numbers for gold-standard enrichment analysis
backgrounds = {'Activating':[], 'LossOfFunction':[], 'Aggregate':[]}
actTmp = [gene for locus in ampLoci for gene in ampLoci[locus]]
backgrounds['Activating'] += actTmp
backgrounds['Aggregate'] += actTmp
delTmp = [gene for locus in delLoci for gene in delLoci[locus]]
backgrounds['LossOfFunction'] += delTmp
backgrounds['Aggregate'] += delTmp
backgrounds['Aggregate'] += sigPAMs
# Load GO and KEGG pathway gene sets (file paths must be provided as 'gopath' and 'keggpath' in the JSON config file)
godf=pd.read_csv(params['gopath'],sep=',', index_col=0, dtype=object, header=None)
keggdf=pd.read_csv(params['keggpath'],sep=',', index_col=0, dtype=object, header=None)
keggdf = keggdf.iloc[1:]
| print("go pathway") | |
| print(godf) | |
| print("go pathway done1") | |
| godict_int=dict(zip(godf.index,[[int(j) for j in godf.loc[i][2].split(' ')] for i in godf.index])) | |
| kegg_int=dict(zip(keggdf.index,[[int(j) for j in keggdf.loc[i][2].split(' ')] for i in keggdf.index])) | |
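# Assumed layout of the two pathway files (inferred from the parsing above, not
# from a documented format): column 0 is the pathway identifier (used as the
# index), column 1 a label, and column 2 a space-separated list of Entrez gene
# IDs, e.g. a row might look like
#   0006915,apoptotic process,596 598 637
# The KEGG file is assumed to carry one header row, which is dropped above.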
#godict_int = {key: value for key, value in godict_int.items() if key.isnumeric()}
#godict_string=(zip(godf.index,[godf.loc[i][2].split(' ') for i in godf.index]))
# Load up enrichment sets
"""enrichmentSets = {}
if 'enrichment_sets' in params:
    for set1 in params['enrichment_sets']:
        enSet1 = pd.read_csv(params['enrichment_sets'][set1], header=0)
        enrichmentSets[set1] = {}
        enrichmentSets[set1]['Aggregate'] = {'Activating': [], 'LossOfFunction': []}
        for cancer in set(enSet1['Cancer']):
            enrichmentSets[set1][cancer] = {}
            for type1 in ['Activating', 'LossOfFunction']:
                enrichmentSets[set1][cancer][type1] = list(enSet1.loc[(enSet1['Cancer']==cancer) & (enSet1['Type']==type1),'Entrez'])
                enrichmentSets[set1]['Aggregate'][type1] += list(enSet1.loc[(enSet1['Cancer']==cancer) & (enSet1['Type']==type1),'Entrez'])
"""
# Load up somatically mutated genes
somMuts = pd.read_csv(params['som_mut_path'],index_col=0,header=0)
if not somMuts.index.dtype=='int64':
    somMuts = somMuts.loc[somMuts.index.map(lambda x: x in n1.index)]
    somMuts.index = somMuts.index.map(lambda x: n1.loc[x])
somMuts = somMuts.loc[~somMuts.index.duplicated(keep='first')]
# Read in gistic all_data_by_genes file
with open(params['gistic_path']+'/'+params['thresh_path'],'r') as inFile:
    tmp = inFile.readline().strip().split('\t')
    numCols1 = len(tmp)
d1 = pd.read_csv(params['gistic_path']+'/'+params['thresh_path'],index_col=0,sep='\t').drop(tmp[1], axis = 1)
d1.columns = [i[:12] for i in d1.columns]
d1 = d1.loc[d1.index.map(lambda x: x.split('|')[0] in n1.index)]
d1.index = d1.index.map(lambda x: n1.loc[x.split('|')[0]])
d1.index.name = 'Locus ID'
# Removing sex chromosomes (issues in CNA analysis) from d1
lociThresh = d1['Cytoband']
include = []
for i in lociThresh:
    if not(i[0]=='X' or i[0]=='Y'):
        include.append(True)
    else:
        include.append(False)
d1 = d1.loc[lociThresh[include].index].drop('Cytoband', axis = 1)
# Make sure somMuts and gistic have same samples
somMuts = somMuts[list(set(d1.columns).intersection(somMuts.columns))]
d1 = d1[list(set(d1.columns).intersection(somMuts.columns))]
# Get rid of duplicated rows
d1 = d1[~d1.index.duplicated(keep='first')]
## Fill out summary matrix
summaryMatrix = pd.DataFrame(
    index = list(set([gene for locus in ampLoci.values() for gene in locus] + [gene for locus in delLoci.values() for gene in locus] + list(somMuts.index))),
    columns = ['Symbol', 'PAM_freq', 'MutSig2CV_qvalue', 'CNA_freq', 'CNA_locus', 'CNA_type', 'GISTIC_residual_q_value', 'Act_freq', 'Act_shallow_coincidence_rate', 'LoF_freq', 'LoF_shallow_coincidence_rate', 'OM_type_selected', 'OM_empirical_p_value', 'OM_empirical_q_value', 'Genes_in_locus', 'Final_mutation_type', 'Final_freq', 'Delta_over_PAM'])
# Add gene symbols
toMatch = [i for i in summaryMatrix.index if i in n1.values]
summaryMatrix.loc[toMatch,'Symbol'] = [n1.index[n1==i][0] for i in toMatch]
# Add MutSig2CV q-values
toMatch = [i for i in summaryMatrix.index if i in mutSig2CV.index]
summaryMatrix.loc[toMatch,'MutSig2CV_qvalue'] = mutSig2CV.loc[toMatch,'q']
# Add CNA amp locus and GISTIC q-value
for locus1 in ampLoci:
    for gene1 in ampLoci[locus1]:
        summaryMatrix.loc[gene1,'CNA_type'] = 'Amp'
        summaryMatrix.loc[gene1,'CNA_locus'] = locus1
        summaryMatrix.loc[gene1,'GISTIC_residual_q_value'] = ampLoci_qv[locus1]
# Add CNA del locus and GISTIC q-value
for locus1 in delLoci:
    for gene1 in delLoci[locus1]:
        summaryMatrix.loc[gene1,'CNA_type'] = 'Del'
        summaryMatrix.loc[gene1,'CNA_locus'] = locus1
        summaryMatrix.loc[gene1,'GISTIC_residual_q_value'] = delLoci_qv[locus1]
# Load enrichment
# Print out some useful information
print('\tSize of CNA matrix: '+str(d1.shape))
print('\tSize of somatic mutation matrix: '+str(somMuts.shape))
print('Finished loading data.')
# Make the output directory if it doesn't exist already
if not os.path.exists(params['output_path']):
    os.mkdir(params['output_path'])
#######################
## Begin oncoMerging ##
#######################
# Cutoff somatic mutations based on the minimum mutation frequency (mf)
freq1 = somMuts.sum(axis=1)/len(list(somMuts.columns))
somMutPoint = freq1[freq1>=params['min_mut_freq']].index
# Precompute positive and negative dichotomized matrices
posdicot = (lambda x: 1 if x>=2 else 0)
posD1 = d1.applymap(posdicot)
summaryMatrix.loc[list(summaryMatrix[summaryMatrix['CNA_type']=='Amp'].index),'CNA_freq'] = posD1.loc[list(summaryMatrix[summaryMatrix['CNA_type']=='Amp'].index)].mean(axis=1)
negdicot = (lambda x: 1 if x<=(-2) else 0)
negD1 = d1.applymap(negdicot)
summaryMatrix.loc[list(summaryMatrix[summaryMatrix['CNA_type']=='Del'].index),'CNA_freq'] = negD1.loc[list(summaryMatrix[summaryMatrix['CNA_type']=='Del'].index)].mean(axis=1)
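# GISTIC's all_thresholded.by_genes values are coded -2..2, so the dichotomized
# matrices above keep only deep events: posD1 flags high-level amplifications
# (threshold value >= 2) and negD1 flags deep deletions (threshold value <= -2),
# each as a 0/1 patient-by-gene matrix.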
print('Finished precomputing dichotomized matrices.')
# Intersect pathway gene sets with the mutation matrices and build aggregate pathway rows
#pos_intersect = pd.merge(posD1, somMuts, how='inner', right_index=True, left_index=True indicator=True])
#neg_intersect = pd.merge(negD1, somMuts, how='inner', on=['user_id'])
# Aggregate deep deletions (negD1) over GO terms: a term is called altered in a patient if any member gene is altered
newgodict= {}
for x in godict_int.keys():
    tmp1=list(set(godict_int[x]).intersection(negD1.index))
    if len(tmp1) > 2:
        newgodict[x]=tmp1
aggnegD1=pd.DataFrame(columns=negD1.columns)
for x in newgodict.keys():
    aggnegD1.loc[x] = negD1.loc[newgodict[x]].max(axis=0)
negD1 = pd.concat([negD1,aggnegD1])
# Aggregate somatic point mutations (somMuts) over GO terms
newgodict_sm= {}
for x in godict_int.keys():
    tmp1=list(set(godict_int[x]).intersection(somMuts.index))
    if len(tmp1) > 2:
        newgodict_sm[x]=tmp1
aggsomMuts = pd.DataFrame(columns=somMuts.columns)
for x in newgodict_sm.keys():
    aggsomMuts.loc[x] = somMuts.loc[newgodict_sm[x]].max(axis=0)
somMuts = pd.concat([somMuts,aggsomMuts])
# Aggregate deep amplifications (posD1) over GO terms
newgodictpos= {}
for x in godict_int.keys():
    tmp1=list(set(godict_int[x]).intersection(posD1.index))
    if len(tmp1) > 2:
        newgodictpos[x]=tmp1
aggposD1 = pd.DataFrame(columns=posD1.columns)
for x in newgodictpos.keys():
    aggposD1.loc[x] = posD1.loc[newgodictpos[x]].max(axis=0)
posD1 = pd.concat([posD1,aggposD1])
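# The three blocks above repeat one pattern: every GO term with more than two
# member genes present in a matrix gets an aggregate row whose per-patient value
# is the max (a logical OR) over its member genes. A minimal sketch of a helper
# capturing that pattern (hypothetical, shown for clarity only; the blocks above
# are left as written):
#
#   def aggregate_pathways(mat, pathway_dict, min_genes=3):
#       """Append one OR-aggregated row per pathway with >= min_genes members in mat."""
#       agg = pd.DataFrame(columns=mat.columns)
#       for pw, genes in pathway_dict.items():
#           members = list(set(genes).intersection(mat.index))
#           if len(members) >= min_genes:
#               agg.loc[pw] = mat.loc[members].max(axis=0)
#       return pd.concat([mat, agg])
#
#   # e.g.: negD1 = aggregate_pathways(negD1, godict_int)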
# Merge loci for deletions and amplifications
lociCNAgenes = {}
lociCNA = pd.DataFrame(columns=d1.columns)
print('Combining amplification loci...')
for loci1 in ampLoci:
    # Get matrix of CNAs for genes in loci
    dt = posD1.loc[ampLoci[loci1]]
    # Get unique rows
    dedup = dt.drop_duplicates(keep='first')
    # Get genes which match and add to output dictionaries
    for i in range(len(dedup.index)):
        cnaName = loci1+'_'+str(i)+'_CNAamp'
        lociCNA.loc[cnaName] = dedup.iloc[i]
        lociCNAgenes[cnaName] = [j for j in dt.index if dedup.iloc[i].equals(dt.loc[j])]
print('Combining deletion loci...')
for loci1 in delLoci:
    # Get matrix of CNAs for genes in loci
    dt = negD1.loc[delLoci[loci1]]
    # Get unique rows
    dedup = dt.drop_duplicates(keep='first')
    # Get genes which match and add to output dictionaries
    for i in range(len(dedup.index)):
        cnaName = loci1+'_'+str(i)+'_CNAdel'
        lociCNA.loc[cnaName] = dedup.iloc[i]
        lociCNAgenes[cnaName] = [j for j in dt.index if dedup.iloc[i].equals(dt.loc[j])]
# Calculate shallow coincidence
shallowCoincidence = {'Act':{},'LoF':{}}
totalPats = len(d1.columns)
# Activating shallow coincidence
for loci1 in ampLoci:
    for ampGene in ampLoci[loci1]:
        if ampGene in somMuts.index and ampGene in d1.index:
            coincPats = set(d1.columns[list(d1.loc[ampGene].ge(1))]).intersection(somMuts.columns[list(somMuts.loc[ampGene]==1)])
            shallowCoincidence['Act'][ampGene] = float(len(coincPats))/float(totalPats)
            summaryMatrix.loc[ampGene, 'Act_shallow_coincidence_rate'] = float(len(coincPats))/float(totalPats)
# Loss of Function shallow coincidence
for locus1 in delLoci:
    for delGene in delLoci[locus1]:
        if delGene in somMuts.index and delGene in d1.index:
            coincPats = set(d1.columns[list(d1.loc[delGene].le(-1))]).intersection(somMuts.columns[list(somMuts.loc[delGene]==1)])
            shallowCoincidence['LoF'][delGene] = float(len(coincPats))/float(totalPats)
            summaryMatrix.loc[delGene, 'LoF_shallow_coincidence_rate'] = float(len(coincPats))/float(totalPats)
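# Shallow coincidence rate = fraction of all patients in which a gene carries
# both a somatic point mutation and at least a shallow copy number change in the
# matching direction (GISTIC value >= 1 for amplified genes, <= -1 for deleted
# genes). It is used below to filter likely passengers out of multi-gene loci.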
# Make combined matrix
# LoF = deletions + somatic point mutations
# Act = amplifications + somatic point mutations
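# In the loops below the merge is done per gene by element-wise addition of the
# 0/1 somatic mutation row and the 0/1 CNA row, then recoding 2 -> 1, i.e. a
# logical OR across patients. For example, PAM = [1,0,0,1] and CNAdel = [0,0,1,1]
# gives LoF = [1,0,1,1].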
print('Starting somatic mutations...')
pamLofAct = {}
freq = {}
for s1 in somMutPoint:
    if s1>0:
        if not str(s1) in pamLofAct:
            pamLofAct[str(s1)] = {}
        tmpSom = somMuts.loc[s1]
        # If potential PAM, store PAM
        if not (str(s1)+'_PAM' in pamLofAct[str(s1)] or sum(tmpSom)==0):
            pamLofAct[str(s1)][str(s1)+'_PAM'] = tmpSom
        if (s1 in negD1.index and s1 in posD1.index):
            tmpNeg = negD1.loc[s1]
            tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
            tmpLoF[tmpLoF==2] = 1
            tmpPos = posD1.loc[s1]
            tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
            tmpAct[tmpAct==2] = 1
            if not s1 in freq:
                freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
        else:
            if not s1 in freq:
                freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':0,'CNAamp':0,'LoF':0,'Act':0}
###
print('Starting pathways...')
for s1 in set(godict_int.keys()).intersection(somMuts.index):
    if not str(s1) in pamLofAct:
        pamLofAct[str(s1)] = {}
    tmpSom = somMuts.loc[s1]
    # If potential PAM, store PAM
    if not (str(s1)+'_PAM' in pamLofAct[str(s1)] or sum(tmpSom)==0):
        pamLofAct[str(s1)][str(s1)+'_PAM'] = tmpSom
    if (s1 in negD1.index and s1 in posD1.index):
        tmpNeg = negD1.loc[s1]
        tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
        tmpLoF[tmpLoF==2] = 1
        tmpPos = posD1.loc[s1]
        tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
        tmpAct[tmpAct==2] = 1
        if not s1 in freq:
            freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
    else:
        if not s1 in freq:
            freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':0,'CNAamp':0,'LoF':0,'Act':0}
print('Starting amplifications...')
for loci1 in ampLoci:
    for s1 in set(ampLoci[loci1]).intersection(somMuts.index):
        if s1>0:
            # If potential Act
            if s1 in somMuts.index and s1 in posD1.index:
                tmpSom = somMuts.loc[s1]
                tmpNeg = negD1.loc[s1]
                tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
                tmpLoF[tmpLoF==2] = 1
                tmpPos = posD1.loc[s1]
                tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
                tmpAct[tmpAct==2] = 1
                if not s1 in freq:
                    freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
                # Store Act
                if not str(s1) in pamLofAct:
                    pamLofAct[str(s1)] = {}
                if not (str(s1)+'_Act' in pamLofAct[str(s1)] or tmpAct.equals(tmpSom) or tmpAct.equals(tmpPos)):
                    pamLofAct[str(s1)][str(s1)+'_Act'] = tmpAct
print('Starting deletions...')
for loci1 in delLoci:
    for s1 in set(delLoci[loci1]).intersection(somMuts.index):
        if s1>0:
            # If potential LoF
            if s1 in somMuts.index and s1 in negD1.index:
                tmpSom = somMuts.loc[s1]
                tmpNeg = negD1.loc[s1]
                tmpLoF = tmpSom.add(tmpNeg)[tmpNeg.index]
                tmpLoF[tmpLoF==2] = 1
                tmpPos = posD1.loc[s1]
                tmpAct = tmpSom.add(tmpPos)[tmpPos.index]
                tmpAct[tmpAct==2] = 1
                if not s1 in freq:
                    freq[str(s1)] = {'PAM':tmpSom.mean(),'CNAdel':tmpNeg.mean(),'CNAamp':tmpPos.mean(),'LoF':tmpLoF.mean(),'Act':tmpAct.mean()}
                # Store LoF
                if not str(s1) in pamLofAct:
                    pamLofAct[str(s1)] = {}
                if not (str(s1)+'_LoF' in pamLofAct[str(s1)] or tmpLoF.equals(tmpSom) or tmpLoF.equals(tmpNeg)):
                    pamLofAct[str(s1)][str(s1)+'_LoF'] = tmpLoF
# Decide which mutations will be tested in permutation analysis
print('Screening for frequency...')
keepPAM = []
keepers = {}
calcSig = []
for s1 in pamLofAct:
    if s1 in freq:
        freqPAM = freq[s1]['PAM']
        summaryMatrix.loc[int(s1), 'PAM_freq'] = freq[s1]['PAM']
        freqPos = freq[s1]['CNAamp']
        freqNeg = freq[s1]['CNAdel']
        if summaryMatrix.loc[int(s1),'CNA_type']=='Del':
            summaryMatrix.loc[int(s1), 'CNA_freq'] = freq[s1]['CNAdel']
        elif summaryMatrix.loc[int(s1),'CNA_type']=='Amp':
            summaryMatrix.loc[int(s1), 'CNA_freq'] = freq[s1]['CNAamp']
        freqAct = freq[s1]['Act']
        summaryMatrix.loc[int(s1), 'Act_freq'] = freq[s1]['Act']
        freqLoF = freq[s1]['LoF']
        summaryMatrix.loc[int(s1), 'LoF_freq'] = freq[s1]['LoF']
        if freqLoF>=0.05 or freqAct>=0.05 or freqPAM>=0.05:
            name1 = 'Unknown'
            if sum(n1.isin([int(s1)]))==1:
                name1 = n1.index[n1==int(s1)][0]
            print('\t'+''.join([str(i) for i in [name1+' ('+str(s1),') - FreqPAM: ', round(freqPAM,3), ' | FreqNeg: ', round(freqNeg,3), ' | FreqLoF: ', round(freqLoF,3), ' | FreqPos: ', round(freqPos,3),' | FreqAct: ', round(freqAct,3)]]))
        if freqPAM>0 and freqPAM>=params['min_mut_freq'] and int(s1) in somMutPoint and int(s1) in sigPAMs:
            keepers[str(s1)+'_PAM'] = pamLofAct[str(s1)][str(s1)+'_PAM']
            keepPAM.append(str(s1)+'_PAM')
            summaryMatrix.loc[int(s1),'OM_type_selected'] = 'PAM'
        if str(s1)+'_Act' in pamLofAct[str(s1)] and freqAct>freqPAM and freqAct>=params['min_mut_freq']:
            keepers[str(s1)+'_Act'] = pamLofAct[str(s1)][str(s1)+'_Act']
            calcSig.append(str(s1)+'_Act')
            summaryMatrix.loc[int(s1),'OM_type_selected'] = 'Act'
        if str(s1)+'_LoF' in pamLofAct[str(s1)] and freqLoF>freqPAM and freqLoF>=params['min_mut_freq']:
            keepers[str(s1)+'_LoF'] = pamLofAct[str(s1)][str(s1)+'_LoF']
            calcSig.append(str(s1)+'_LoF')
            summaryMatrix.loc[int(s1),'OM_type_selected'] = 'LoF'
##################################
## Conduct permutation analysis ##
##################################
print('Permutation analysis...')
numPermutes = 1000
# Permute to get frequency
def singlePermute(somMutsMF, somCNAsMF):
    tmp1 = pd.Series(np.random.permutation(somMutsMF), index=somMutsMF.index)
    tmp2 = pd.Series(np.random.permutation(somCNAsMF), index=somCNAsMF.index)
    subset1 = list(set(somMutsMF.index).intersection(somCNAsMF.index))  # use a list (not a set) so it can be passed to .loc
    return list(tmp1.loc[subset1]+tmp2.loc[subset1])
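# Null model used below: the per-gene somatic mutation frequencies and per-gene
# CNA frequencies are each shuffled across genes independently and then summed
# gene-wise, giving one null "merged" frequency per shared gene. Repeating this
# numPermutes times builds the null distributions permMF_neg (for LoF) and
# permMF_pos (for Act).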
# If not loading up from previous run
permMF_neg = []
permMF_pos = []
if params['load_permutation']==None:
    # Deletions
    print('\tPermuting deletions...')
    somMutsMF = somMuts.transpose().mean()
    somCNAsMF = negD1.transpose().mean()
    for i in range(numPermutes):
        permMF_neg += singlePermute(somMutsMF, somCNAsMF)
    # Amplifications
    print('\tPermuting amplifications...')
    somMutsMF = somMuts.transpose().mean()
    somCNAsMF = posD1.transpose().mean()
    for i in range(numPermutes):
        permMF_pos += singlePermute(somMutsMF, somCNAsMF)
    # Change precision so that it is the same as what will be written out
    # - Fixes bug where precision change leads to different behavior between freshly run (then saved) and loaded permutation data
    permMF_neg = [float(str(i)) for i in permMF_neg]
    permMF_pos = [float(str(i)) for i in permMF_pos]
    # If requested to save out permutations
    if params['save_permutation']==True:
        print('\tSaving permutations...')
        with open(params['output_path']+'/oncomerge_delPerm.txt','w') as delPermFile, open(params['output_path']+'/oncomerge_ampPerm.txt','w') as ampPermFile:
            delPermFile.write('\n'.join([str(i) for i in permMF_neg]))
            ampPermFile.write('\n'.join([str(i) for i in permMF_pos]))
# Load up permutations from previous run
elif not params['load_permutation']==None:
    print('\tLoading previous permutations...')
    with open(params['load_permutation']+'/oncomerge_delPerm.txt','r') as delPermFile, open(params['load_permutation']+'/oncomerge_ampPerm.txt','r') as ampPermFile:
        permMF_neg = [float(i.strip()) for i in delPermFile.readlines()]
        permMF_pos = [float(i.strip()) for i in ampPermFile.readlines()]
# Write Permutation Analysis file Lof_Act_sig
lofActSig = pd.DataFrame(columns = ['Symbol', 'Type','Freq','Emp.p_value'], index = calcSig)
for sig1 in calcSig:
    if sig1.find('LoF')>0:
        lofActSig['Symbol'].loc[sig1] = n1.index[n1==int(sig1.rstrip('_LoF'))][0]
        lofActSig['Type'].loc[sig1] = 'LoF'
        lofActSig['Freq'].loc[sig1] = freq[sig1.rstrip('_LoF')]['LoF']
    elif sig1.find('Act')>0:
        lofActSig['Symbol'].loc[sig1] = n1.index[n1==int(sig1.rstrip('_Act'))][0]
        lofActSig['Type'].loc[sig1] = 'Act'
        lofActSig['Freq'].loc[sig1] = freq[sig1.rstrip('_Act')]['Act']
permdict1 = {}
permdict1['LoF'] = {}
permdict1['Act'] = {}
oMfreqs = [freq[sig1[:-4]][sig1.split('_')[1]] for sig1 in calcSig]
for f in set(oMfreqs):
    permdict1['LoF'][f] = float(len([i for i in permMF_neg if i >= f]))/len(permMF_neg)
    permdict1['Act'][f] = float(len([i for i in permMF_pos if i >= f]))/len(permMF_pos)
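# Empirical p-value for an observed merged frequency f: the fraction of permuted
# null frequencies that are >= f, computed once per distinct f and cached in
# permdict1. E.g. if 50 of 100,000 permuted values reach f, the empirical
# p-value is 5e-4.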
for sig1 in calcSig:
    if sig1.find('LoF')>0:
        lofActSig.loc[sig1, 'Emp.p_value'] = permdict1['LoF'][freq[sig1.rstrip('_LoF')]['LoF']]
        summaryMatrix.loc[int(sig1.rstrip('_LoF')),'OM_empirical_p_value'] = permdict1['LoF'][freq[sig1.rstrip('_LoF')]['LoF']]
    elif sig1.find('Act')>0:
        lofActSig.loc[sig1, 'Emp.p_value'] = permdict1['Act'][freq[sig1.rstrip('_Act')]['Act']]
        summaryMatrix.loc[int(sig1.rstrip('_Act')),'OM_empirical_p_value'] = permdict1['Act'][freq[sig1.rstrip('_Act')]['Act']]
if len(lofActSig)>0:
    lofActSig['q_value'] = multipletests(lofActSig['Emp.p_value'], 0.05, method='fdr_bh')[1]
    summaryMatrix.loc[[int(i.split('_')[0]) for i in lofActSig.index],'OM_empirical_q_value'] = list(lofActSig['q_value'])
    lofActSig.sort_values('q_value').to_csv(params['output_path']+'/oncoMerge_ActLofPermPV.csv')
    # Screen out LoF and Act that don't meet significance cutoffs
    keepLofAct0 = list(lofActSig.index[lofActSig['q_value']<=params['perm_qv']])
else:
    lofActSig.to_csv(params['output_path']+'/oncoMerge_ActLofPermPV.csv')
    keepLofAct0 = []
# Filter passenger mutations from keepLofAct using Shallow Coincidence
def FindLoci(inp, ampLoci, delLoci):
    gene1, mutType = inp.split('_')
    if mutType == 'Act':
        LociSet = ampLoci
    if mutType == 'LoF':
        LociSet = delLoci
    ret = []
    for locus1 in LociSet.keys():
        if int(gene1) in LociSet[locus1]:
            ret.append(locus1)
    return ret
LofActLoci_dict = {}
for mut in keepLofAct0:
    mutLoci1 = FindLoci(mut, ampLoci, delLoci)
    for locus1 in mutLoci1:
        if locus1 not in LofActLoci_dict.keys():
            LofActLoci_dict[locus1] = []
        LofActLoci_dict[locus1].append(mut)
keepLofAct1 = []
for locus1 in LofActLoci_dict.keys():
    if len(LofActLoci_dict[locus1]) < params['min_loci_genes']:
        keepLofAct1 += LofActLoci_dict[locus1]
        for mut in LofActLoci_dict[locus1]:
            gene1, mutType1 = mut.split('_')
            summaryMatrix.loc[int(gene1), 'Final_mutation_type'] = mutType1
            summaryMatrix.loc[int(gene1), 'Final_freq'] = summaryMatrix.loc[int(gene1), mutType1+'_freq']
            summaryMatrix.loc[int(gene1), 'Delta_over_PAM'] = summaryMatrix.loc[int(gene1), mutType1+'_freq'] - summaryMatrix.loc[int(gene1), 'PAM_freq']
            summaryMatrix.loc[int(gene1), 'Genes_in_locus'] = len(LofActLoci_dict[locus1])
    else:
        for mut in LofActLoci_dict[locus1]:
            gene1, mutType1 = mut.split('_')
            summaryMatrix.loc[int(gene1), 'Genes_in_locus'] = len(LofActLoci_dict[locus1])
            if shallowCoincidence[mutType1][int(gene1)] < params['min_coincidence_rate']:
                print(mut + ' filtered with shallow coincidence = ' + str(shallowCoincidence[mutType1][int(gene1)]))
            else:
                keepLofAct1.append(mut)
                summaryMatrix.loc[int(gene1), 'Final_mutation_type'] = mutType1
                summaryMatrix.loc[int(gene1), 'Final_freq'] = summaryMatrix.loc[int(gene1), mutType1+'_freq']
                summaryMatrix.loc[int(gene1), 'Delta_over_PAM'] = summaryMatrix.loc[int(gene1), mutType1+'_freq'] - summaryMatrix.loc[int(gene1), 'PAM_freq']
                summaryMatrix.loc[int(gene1), 'Genes_in_locus'] = len(LofActLoci_dict[locus1])
keepLofAct = list(set(keepLofAct1))
# Screen out PAMs that are LoF/Act
newKeepPAM = []
for pam1 in keepPAM:
    found = 0
    tmp1 = pam1.split('_')[0]
    for lofAct in keepLofAct:
        if tmp1==lofAct.split('_')[0]:
            found = 1
    if found==0:
        newKeepPAM.append(pam1)
        summaryMatrix.loc[int(tmp1), 'Final_mutation_type'] = 'PAM'
        summaryMatrix.loc[int(tmp1), 'Final_freq'] = summaryMatrix.loc[int(tmp1), 'PAM_freq']
        summaryMatrix.loc[int(tmp1), 'Delta_over_PAM'] = float(0)
## Screen out loci that have a representative gene
# Mutations that are at or above minimum mutation frequency cutoff
highFreqLoci = lociCNA[lociCNA.mean(axis=1)>=params['min_mut_freq']]
# Figure out what loci are explained by current Act or LoF genes
explainedLoc = []
for locus1 in highFreqLoci.index:
    genesInLocus = [i for i in lociCNAgenes[locus1] if (str(i)+'_Act' in keepLofAct or str(i)+'_LoF' in keepLofAct)]
    if len(genesInLocus)>0:
        explainedLoc.append(locus1.split('_')[0])
# Screen out all other loci in that region
keepLoc = []
for locus1 in highFreqLoci.index:
    if not locus1.split('_')[0] in explainedLoc:
        keepLoc.append(locus1)
keepLoc_dict = {}
for locus1 in set(['_'.join([i.split('_')[0],i.split('_')[-1]]) for i in keepLoc]):
    locus2 = locus1.split('_')[0]
    if locus2 in ampLoci.keys():
        keepLoc_dict[locus1] = posD1.loc[ampLoci[locus2]]
    if locus2 in delLoci.keys():
        keepLoc_dict[locus1] = negD1.loc[delLoci[locus2]]
def mode2(pat1col):
    tmpser = pat1col.mode()
    if len(tmpser)>1:
        tmp10 = 1
    else:
        tmp10 = tmpser.iloc[0]
    return tmp10
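# mode2 returns the most common CNA call among the genes of a locus for one
# patient; when there is a tie (more than one modal value) the patient is called
# altered (1), which biases ambiguous loci toward inclusion.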
# Use most common value to determine the mutational profile of a loci
keepLoc_df = pd.DataFrame(columns = d1.columns)
tmpMode = pd.Series(index = d1.columns)
for locus1 in keepLoc_dict.keys():
    for pat1 in keepLoc_dict[locus1].columns:
        tmpMode.loc[pat1] = mode2(keepLoc_dict[locus1][pat1])
    if tmpMode.mean()>=0.05:
        keepLoc_df.loc[locus1] = tmpMode
keepLoc_df = keepLoc_df.applymap(int)
####################################
## Compile OncoMerge output files ##
####################################
finalMutFile = pd.concat([pd.DataFrame(keepers).transpose().loc[newKeepPAM].sort_index(), pd.DataFrame(keepers).transpose().loc[keepLofAct].sort_index(), keepLoc_df.sort_index()])
# Rename all loci with only one gene
ind_list = finalMutFile.index.tolist()
for locus1 in keepLoc_df.index:
    splitUp = locus1.split('_')
    if splitUp[-1]=='CNAamp' and len(ampLoci[splitUp[0]])==1:
        idx = ind_list.index(locus1)
        ind_list[idx] = str(ampLoci[splitUp[0]][0]) + '_CNAamp'
        summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Final_mutation_type'] = 'CNAamp'
        summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Final_freq'] = summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'CNA_freq']
        summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'Delta_over_PAM'] = summaryMatrix.loc[int(ampLoci[splitUp[0]][0]), 'CNA_freq']
    if splitUp[-1]=='CNAdel' and len(delLoci[splitUp[0]])==1:
        idx = ind_list.index(locus1)
        ind_list[idx] = str(delLoci[splitUp[0]][0]) + '_CNAdel'
        summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Final_mutation_type'] = 'CNAdel'
        summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Final_freq'] = summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'CNA_freq']
        summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'Delta_over_PAM'] = summaryMatrix.loc[int(delLoci[splitUp[0]][0]), 'CNA_freq']
finalMutFile.index = ind_list
# Write out final mutation matrix to csv file
finalMutFile.to_csv(params['output_path']+'/oncoMerge_mergedMuts.csv')
# Write out summary matrix to csv file
summaryMatrix.to_csv(params['output_path']+'/oncoMerge_summaryMatrix.csv')
## Write out loci information
# Prepare for writing out
writeLoci = ['Locus_name,Genes']
for locus1 in keepLoc_df.index:
    splitUp = locus1.split('_')
    if splitUp[-1]=='CNAamp':
        writeLoci.append(locus1+','+' '.join([str(i) for i in ampLoci[splitUp[0]]]))
    if splitUp[-1]=='CNAdel':
        writeLoci.append(locus1+','+' '.join([str(i) for i in delLoci[splitUp[0]]]))
# Write out loci file
with open(params['output_path']+'/oncoMerge_CNA_loci.csv','w') as outFile:
    outFile.write('\n'.join(writeLoci))
## Write out information for hypergeometric analysis
# Can be used to load in gene information from final mutation file:
# [[int(j.split('_')[0]),j.split('_')[1]] for j in finalMutFile.index[[not i for i in list(finalMutFile.index.str.contains('p|q'))]]]
# Write out background information for hypergeometric analysis
backgrounds = {i:[int(j) for j in backgrounds[i]] for i in backgrounds}
with open(params['output_path']+'/background.json', 'w') as outFile:
    json.dump(backgrounds, outFile)
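# A minimal sketch of how background.json and the merged mutation matrix could
# feed a hypergeometric enrichment test using the scipy.stats.hypergeom import
# above (the gold-standard gene set is hypothetical and not part of this script):
#
#   goldStandard = set([...])  # e.g. known activating driver genes (Entrez IDs)
#   hits = set(int(j.split('_')[0]) for j in finalMutFile.index
#              if not ('p' in j or 'q' in j))  # gene-level OncoMerge calls only
#   M = len(set(backgrounds['Aggregate']))  # background size
#   n = len(goldStandard.intersection(backgrounds['Aggregate']))  # gold-standard genes in background
#   N = len(hits)  # number of draws
#   k = len(hits.intersection(goldStandard))  # observed overlap
#   pval = hypergeom.sf(k-1, M, n, N)  # P(X >= k)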
print('Done.')