Last active
May 22, 2018 15:39
-
-
Save ssstrike/3bec39f584998aca8a3472ff840d5d47 to your computer and use it in GitHub Desktop.
input csv of gene expression, creates histogram for p and r values.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Apr 07 21:10:33 2018 | |
| @author: Fuzzy | |
| """ | |
| import math | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| import networkx as nx | |
| from scipy import stats | |
| #open json file for geneNetwork | |
| import json | |
| with open('geneNetork74.json', 'r') as f: | |
| geneNetwork = json.load(f) | |
| #creates dataframe and removes rows Unnamed 615-638 (they are empty) | |
| df = pd.read_csv('tfExp.csv',header=0,index_col=0) | |
| #ALL | |
| allrList = [] | |
| allpList = [] | |
| allcorrectPList = [] | |
| alllogpList = [] | |
| allkey1List = [] #regulator list | |
| allkey2List = [] #target list | |
| #iterate through all ids. | |
| for i in range(len(df.index.values)): | |
| row1 = df.loc[df.index.values[i]] | |
| #iterate through all ids for second comparison. | |
| for j in range(i+1, len(df.index.values)): | |
| row2 = df.loc[df.index.values[j]] | |
| #person r function | |
| p = stats.pearsonr(row1,row2) | |
| allkey1List.append(df.index.values[i]) | |
| allkey2List.append(df.index.values[j]) | |
| allrList.append(p[0]) | |
| allpList.append(p[1]) | |
| #logarithmic transformation. | |
| for x in allpList: | |
| if not x==0: | |
| alllogpList.append(-math.log10(x)) | |
| else: | |
| alllogpList.append(302) | |
| #FDR calculating the corrected pList. | |
| size = len(allpList) | |
| rankValue = dict(zip(np.argsort(allpList),range(len(allpList)))) | |
| allcorrectPList = allpList | |
| for i in range(len(allpList)): | |
| allcorrectPList[i] = allpList[i]*(float(len(allpList))/(rankValue[i]+1)) | |
| print stats.skew(allrList,nan_policy='omit') | |
| print stats.skewtest(allrList,nan_policy='omit') | |
| #lists for various p and r values and their associated TF gene | |
| rList = [] | |
| pList = [] | |
| correctPList = [] | |
| logpList = [] | |
| key1List = []#regulator list | |
| key2List = []#target list | |
| #iterate through all ids. | |
| for i in range(len(df.index.values)): | |
| if(str(df.index.values[i]) in geneNetwork): #filter out unwanted IDs | |
| row1 = df.loc[df.index.values[i]] | |
| #iterate through all ids for second comparison. | |
| for j in range(i+1, len(df.index.values)): | |
| if (str(df.index.values[j]) in geneNetwork[str(df.index.values[i])]):#filter again | |
| row2 = df.loc[df.index.values[j]] | |
| #person r function | |
| p = stats.pearsonr(row1,row2) | |
| key1List.append(df.index.values[i]) | |
| key2List.append(df.index.values[j]) | |
| rList.append(p[0]) | |
| pList.append(p[1]) | |
| #logarithmic transformation. | |
| for x in pList: | |
| logpList.append(-math.log10(x)) | |
| #FDR calculating the corrected pList. | |
| size = len(pList) | |
| rankValue = dict(zip(np.argsort(pList),range(len(pList)))) | |
| correctPList = pList | |
| for i in range(len(pList)): | |
| correctPList[i] = pList[i]*(float(len(pList))/(rankValue[i]+1)) | |
| print stats.skew(rList) | |
| print stats.skewtest(rList) | |
| #create and show histogram of r values from -1 to 1 | |
| plt.hist(rList,bins='auto',range=(-1,1),density=True,alpha=0.5,label='Filtered') | |
| plt.hist([i for i in allrList if not math.isnan(i)],bins='auto',range=(-1,1),density=True,alpha=0.5,label='All') | |
| plt.title("r hist with auto bins") | |
| plt.show() | |
| #create and show histogram of p values | |
| plt.hist(logpList,bins='auto') #range=(0,1)) | |
| plt.title("p-value hist with auto bins") | |
| plt.show() | |
| plt.hist([-math.log10(i) for i in correctPList],bins='auto') #range=(0,1)) | |
| plt.title("p-value hist with auto bins") | |
| plt.show() | |
| #plot size of list according to rvalue | |
| numList = [] | |
| xList = [] | |
| x=1 | |
| while x>0.3: | |
| numList.append(len([i for i in rList if abs(i)>=x])) | |
| xList.append(x) | |
| x-=0.01 | |
| plt.plot(xList,numList) | |
| #pull out corrolative interations | |
| r2List = [] | |
| indexList = [] | |
| nameList = [] | |
| r2List = [i for i in rList if abs(i)>=0.5]# and rList.index(i)<0.05] | |
| for r in r2List: | |
| try: | |
| indexList.append(rList.index(r)) | |
| except ValueError: | |
| indexList.append(rList.index((-r))) | |
| tf2tfLink = {} | |
| tf2tfNet = {} | |
| for position in indexList: | |
| reg = key1List[position] | |
| targ = key2List[position] | |
| tf2tfNet[reg] = [] | |
| tf2tfNet[reg].append(targ) | |
| if not reg in tf2tfLink: | |
| tf2tfLink[reg] = [] | |
| if not targ in tf2tfLink[reg]: | |
| tf2tfLink[reg].append(targ) | |
| #form picture using networkx | |
| G = nx.from_dict_of_lists(tf2tfNet) | |
| #G = nx.Graph(tf2tfNet) | |
| nx.draw_shell(G,with_labels=True,node_size=600)#, pos=nx.circular_layout(G) | |
| plt.show() | |
| #for translation | |
| name2Entrez = {} | |
| with open('gene2entrezId.CSV','r') as inFile: | |
| while 1: | |
| inLine = inFile.readline() | |
| if not inLine: | |
| break | |
| split = inLine.strip().split(',') | |
| name2Entrez[split[1]] = split[0] | |
| ''' error | |
| #write subNetwork .sif file | |
| writeSub = [] | |
| interact = 'r2t' | |
| for key in tf2tfNet: | |
| for gene in tf2tfNet[key]: | |
| if key in name2Entrez: | |
| key1 = name2Entrez[key] | |
| else: | |
| key1 = key | |
| if gene in name2Entrez: | |
| gene1 = name2Entrez[gene] | |
| else: | |
| gene1 = gene | |
| writeSub.append(key1+'\t'+interact+'\t'+gene1) | |
| with open('tf2tfNet.sif','w') as outFile: | |
| outFile.write('\n'.join(writeSub)) | |
| ''' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment