ssstrike · May 22, 2018 15:39
diff --git a/pearsonrGeneExpression.py b/pearsonrGeneExpression.py
 # -*- coding: utf-8 -*-
 """
 Created on Sat Apr 07 21:10:33 2018

 @author: Fuzzy
 """
 import math
 import pandas as pd
 import numpy as np
 import matplotlib.pyplot as plt
 import networkx as nx
 from scipy import stats

 #open json file for geneNetwork
 import json
 with open('geneNetork74.json', 'r') as f:
    geneNetwork = json.load(f)

 #creates dataframe and removes rows Unnamed 615-638 (they are empty)
 df = pd.read_csv('tfExp.csv',header=0,index_col=0)

 #ALL
 allrList = []
 allpList = []
 allcorrectPList = []
 alllogpList = []
 allkey1List = [] #regulator list
 allkey2List = [] #target list
 #iterate through all ids.
 for i in range(len(df.index.values)):
    row1 = df.loc[df.index.values[i]]
    #iterate through all ids for second comparison.
    for j in range(i+1, len(df.index.values)):
        row2 = df.loc[df.index.values[j]]
        #person r function
        p = stats.pearsonr(row1,row2)
        allkey1List.append(df.index.values[i])
        allkey2List.append(df.index.values[j])
        allrList.append(p[0])
        allpList.append(p[1])

 #logarithmic transformation.
 for x in allpList:
    if not x==0:
        alllogpList.append(-math.log10(x))
    else:
        alllogpList.append(302)

 #FDR calculating the corrected pList.
 size = len(allpList)
 rankValue = dict(zip(np.argsort(allpList),range(len(allpList))))
 allcorrectPList = allpList
 for i in range(len(allpList)):
    allcorrectPList[i] = allpList[i]*(float(len(allpList))/(rankValue[i]+1))
    
 print stats.skew(allrList,nan_policy='omit')
 print stats.skewtest(allrList,nan_policy='omit')

 #lists for various p and r values and their associated TF gene
 rList = []
 pList = []
 correctPList = []
 logpList = []
 key1List = []#regulator list
 key2List = []#target list
 #iterate through all ids.
 for i in range(len(df.index.values)):
    if(str(df.index.values[i]) in geneNetwork): #filter out unwanted IDs
        row1 = df.loc[df.index.values[i]]
        #iterate through all ids for second comparison.
        for j in range(i+1, len(df.index.values)):
            if (str(df.index.values[j]) in geneNetwork[str(df.index.values[i])]):#filter again
                row2 = df.loc[df.index.values[j]]
                #person r function
                p = stats.pearsonr(row1,row2)
                key1List.append(df.index.values[i])
                key2List.append(df.index.values[j])
                rList.append(p[0])
                pList.append(p[1])
  
 #logarithmic transformation.              
 for x in pList:
    logpList.append(-math.log10(x))

 #FDR calculating the corrected pList.
 size = len(pList)
 rankValue = dict(zip(np.argsort(pList),range(len(pList))))
 correctPList = pList
 for i in range(len(pList)):
    correctPList[i] = pList[i]*(float(len(pList))/(rankValue[i]+1))

 print stats.skew(rList)
 print stats.skewtest(rList)

 #create and show histogram of r values from -1 to 1
 plt.hist(rList,bins='auto',range=(-1,1),density=True,alpha=0.5,label='Filtered')
 plt.hist([i for i in allrList if not math.isnan(i)],bins='auto',range=(-1,1),density=True,alpha=0.5,label='All')
 plt.title("r hist with auto bins")
 plt.show()
 #create and show histogram of p values
 plt.hist(logpList,bins='auto') #range=(0,1))
 plt.title("p-value hist with auto bins")
 plt.show()

 plt.hist([-math.log10(i) for i in correctPList],bins='auto') #range=(0,1))
 plt.title("p-value hist with auto bins")
 plt.show()

 #plot size of list according to rvalue
 numList = []
 xList = []
 x=1
 while x>0.3:
    numList.append(len([i for i in rList if abs(i)>=x]))
    xList.append(x)
    x-=0.01
 plt.plot(xList,numList)

 #pull out corrolative interations
 r2List = []
 indexList = []
 nameList = []    
 r2List = [i for i in rList if abs(i)>=0.5]# and rList.index(i)<0.05]

 for r in r2List:
    try:
        indexList.append(rList.index(r))
    except ValueError:
        indexList.append(rList.index((-r)))
    
 tf2tfLink = {}
 tf2tfNet = {}
 for position in indexList:
    reg = key1List[position]
    targ = key2List[position]
    tf2tfNet[reg] = []
    tf2tfNet[reg].append(targ)
    if not reg in tf2tfLink:
        tf2tfLink[reg] = []
    if not targ in tf2tfLink[reg]:
        tf2tfLink[reg].append(targ)
        
 #form picture using networkx 
 G = nx.from_dict_of_lists(tf2tfNet)
 #G = nx.Graph(tf2tfNet) 
 nx.draw_shell(G,with_labels=True,node_size=600)#, pos=nx.circular_layout(G)
 plt.show()

 #for translation
 name2Entrez = {}
 with open('gene2entrezId.CSV','r') as inFile:
    while 1:
        inLine = inFile.readline()
        if not inLine:
            break
        split = inLine.strip().split(',')
        name2Entrez[split[1]] = split[0]

 ''' error
 #write subNetwork .sif file
 writeSub = []
 interact = 'r2t'
 for key in tf2tfNet:
    for gene in tf2tfNet[key]:
        if key in name2Entrez:
            key1 = name2Entrez[key]
        else:
            key1 = key
        if gene in name2Entrez:
            gene1 = name2Entrez[gene]
        else:
            gene1 = gene
        writeSub.append(key1+'\t'+interact+'\t'+gene1)
 with open('tf2tfNet.sif','w') as outFile:
    outFile.write('\n'.join(writeSub))
 '''
	# -- coding: utf-8 --
	"""
	Created on Sat Apr 07 21:10:33 2018

	@author: Fuzzy
	"""
	import math
	import pandas as pd
	import numpy as np
	import matplotlib.pyplot as plt
	import networkx as nx
	from scipy import stats

	#open json file for geneNetwork
	import json
	with open('geneNetork74.json', 'r') as f:
	geneNetwork = json.load(f)

	#creates dataframe and removes rows Unnamed 615-638 (they are empty)
	df = pd.read_csv('tfExp.csv',header=0,index_col=0)

	#ALL
	allrList = []
	allpList = []
	allcorrectPList = []
	alllogpList = []
	allkey1List = [] #regulator list
	allkey2List = [] #target list
	#iterate through all ids.
	for i in range(len(df.index.values)):
	row1 = df.loc[df.index.values[i]]
	#iterate through all ids for second comparison.
	for j in range(i+1, len(df.index.values)):
	row2 = df.loc[df.index.values[j]]
	#person r function
	p = stats.pearsonr(row1,row2)
	allkey1List.append(df.index.values[i])
	allkey2List.append(df.index.values[j])
	allrList.append(p[0])
	allpList.append(p[1])

	#logarithmic transformation.
	for x in allpList:
	if not x==0:
	alllogpList.append(-math.log10(x))
	else:
	alllogpList.append(302)

	#FDR calculating the corrected pList.
	size = len(allpList)
	rankValue = dict(zip(np.argsort(allpList),range(len(allpList))))
	allcorrectPList = allpList
	for i in range(len(allpList)):
	allcorrectPList[i] = allpList[i]*(float(len(allpList))/(rankValue[i]+1))

	print stats.skew(allrList,nan_policy='omit')
	print stats.skewtest(allrList,nan_policy='omit')

	#lists for various p and r values and their associated TF gene
	rList = []
	pList = []
	correctPList = []
	logpList = []
	key1List = []#regulator list
	key2List = []#target list
	#iterate through all ids.
	for i in range(len(df.index.values)):
	if(str(df.index.values[i]) in geneNetwork): #filter out unwanted IDs
	row1 = df.loc[df.index.values[i]]
	#iterate through all ids for second comparison.
	for j in range(i+1, len(df.index.values)):
	if (str(df.index.values[j]) in geneNetwork[str(df.index.values[i])]):#filter again
	row2 = df.loc[df.index.values[j]]
	#person r function
	p = stats.pearsonr(row1,row2)
	key1List.append(df.index.values[i])
	key2List.append(df.index.values[j])
	rList.append(p[0])
	pList.append(p[1])

	#logarithmic transformation.
	for x in pList:
	logpList.append(-math.log10(x))

	#FDR calculating the corrected pList.
	size = len(pList)
	rankValue = dict(zip(np.argsort(pList),range(len(pList))))
	correctPList = pList
	for i in range(len(pList)):
	correctPList[i] = pList[i]*(float(len(pList))/(rankValue[i]+1))

	print stats.skew(rList)
	print stats.skewtest(rList)

	#create and show histogram of r values from -1 to 1
	plt.hist(rList,bins='auto',range=(-1,1),density=True,alpha=0.5,label='Filtered')
	plt.hist([i for i in allrList if not math.isnan(i)],bins='auto',range=(-1,1),density=True,alpha=0.5,label='All')
	plt.title("r hist with auto bins")
	plt.show()
	#create and show histogram of p values
	plt.hist(logpList,bins='auto') #range=(0,1))
	plt.title("p-value hist with auto bins")
	plt.show()

	plt.hist([-math.log10(i) for i in correctPList],bins='auto') #range=(0,1))
	plt.title("p-value hist with auto bins")
	plt.show()

	#plot size of list according to rvalue
	numList = []
	xList = []
	x=1
	while x>0.3:
	numList.append(len([i for i in rList if abs(i)>=x]))
	xList.append(x)
	x-=0.01
	plt.plot(xList,numList)

	#pull out corrolative interations
	r2List = []
	indexList = []
	nameList = []
	r2List = [i for i in rList if abs(i)>=0.5]# and rList.index(i)<0.05]

	for r in r2List:
	try:
	indexList.append(rList.index(r))
	except ValueError:
	indexList.append(rList.index((-r)))

	tf2tfLink = {}
	tf2tfNet = {}
	for position in indexList:
	reg = key1List[position]
	targ = key2List[position]
	tf2tfNet[reg] = []
	tf2tfNet[reg].append(targ)
	if not reg in tf2tfLink:
	tf2tfLink[reg] = []
	if not targ in tf2tfLink[reg]:
	tf2tfLink[reg].append(targ)

	#form picture using networkx
	G = nx.from_dict_of_lists(tf2tfNet)
	#G = nx.Graph(tf2tfNet)
	nx.draw_shell(G,with_labels=True,node_size=600)#, pos=nx.circular_layout(G)
	plt.show()

	#for translation
	name2Entrez = {}
	with open('gene2entrezId.CSV','r') as inFile:
	while 1:
	inLine = inFile.readline()
	if not inLine:
	break
	split = inLine.strip().split(',')
	name2Entrez[split[1]] = split[0]

	''' error
	#write subNetwork .sif file
	writeSub = []
	interact = 'r2t'
	for key in tf2tfNet:
	for gene in tf2tfNet[key]:
	if key in name2Entrez:
	key1 = name2Entrez[key]
	else:
	key1 = key
	if gene in name2Entrez:
	gene1 = name2Entrez[gene]
	else:
	gene1 = gene
	writeSub.append(key1+'\t'+interact+'\t'+gene1)
	with open('tf2tfNet.sif','w') as outFile:
	outFile.write('\n'.join(writeSub))
	'''
No results found