Skip to content

Instantly share code, notes, and snippets.

Show Gist options
  • Select an option

  • Save cplaisier/54a68aab3924e1dca816362896fee346 to your computer and use it in GitHub Desktop.

Select an option

Save cplaisier/54a68aab3924e1dca816362896fee346 to your computer and use it in GitHub Desktop.
Takes a CSV of gene expression as input and creates histograms of the Pearson r and p values.
# -*- coding: utf-8 -*-
"""
Created on Sat Apr 07 21:10:33 2018
@author: Fuzzy
"""
import math
import pandas as pd
import numpy as np
import matplotlib.pyplot as plt
from scipy import stats
# Load the expression table: the first CSV row supplies the column labels
# and the first column supplies the row labels (the IDs that are matched
# against tfStartId further down).
# NOTE: nothing is dropped here; the clean-up of the empty
# 'Unnamed: 615' .. 'Unnamed: 638' columns below is commented out.
df = pd.read_csv(
    'tfExp.csv',
    header=0,
    index_col=0,
)
#count = 615
#deletes empty columns.
#while count <639:
# del df['Unnamed: ' + str(count)]
# count += 1
#creates the id2Index dict. Use for determining location of ID's in df.
#id2Index = {}
#indexList = list(df.index.get_values())
#idList = list(df['Unnamed: 0'])
#count = 0
#for i in indexList:
# id2Index[idList[i]] = i
#only use rows that are filled.
#while count < 767:
# id2Index[idList[count]] = indexList[count]
# count += 1
# IDs (as strings) of the rows that take part in the pairwise comparison.
# Index labels are converted with str() before being matched against this list.
tfStartId = [
    '430', '1052', '1053', '1385', '84699', '9586', '1871', '1874',
    '144455', '79733', '1960', '1997', '2002', '2004', '80712', '2114',
    '2115', '2120', '51513', '2551', '2623', '2624', '2625', '9421',
    '3232', '10320', '3659', '3662', '3670', '91464', '3726', '10661',
    '11278', '128209', '10365', '9314', '1316', '51176', '9935', '23269',
    '4602', '4774', '4790', '7025', '9480', '5468', '5914', '5916',
    '3516', '5971', '864', '6257', '4093', '6659', '6660', '6662',
    '25803', '347853', '30009', '9496', '6929', '6925', '8463', '7022',
    '29842', '10155', '6935', '132625', '23051', '85416', '7707', '7764',
    '23528', '201516',
]
# Short list for quick test runs:
#tfStartId = ['430', '1052', '1053', '1385', '84699', '9586', '1871', '1874']
# Result accumulators; each name is bound to its own, initially empty, list.
rList = []         # Pearson r of each compared pair
pList = []         # p-value of each compared pair
key1List = []      # first ID of each compared pair
key2List = []      # second ID of each compared pair
logpList = []      # -log10 of the p-values
correctPList = []  # FDR-corrected p-values
# Pearson correlation between every ordered pair of distinct selected rows.
#
# The ID filter is applied once, against a set, instead of re-testing list
# membership for every index label inside both loops; each selected row is
# also fetched from the frame only once.
tfIdSet = set(tfStartId)
selectedRows = [
    (key, df.loc[key])
    for key in df.index.values
    if str(key) in tfIdSet  # filter out unwanted IDs
]

# NOTE: both (a, b) and (b, a) are visited, so every unordered pair
# contributes two identical (r, p) entries to the result lists.
for key, row1 in selectedRows:
    for key2, row2 in selectedRows:
        if key2 == key:
            continue  # never correlate a row with itself
        # pearsonr yields the correlation coefficient and its p-value.
        rValue, pValue = stats.pearsonr(row1, row2)
        key1List.append(key)
        key2List.append(key2)
        rList.append(rValue)
        pList.append(pValue)
# Sort the p-values in place (ascending).
pList.sort()
# Logarithmic transformation: -log10(p) for every p-value.
# math.log10(0) raises ValueError, and a p-value can underflow to exactly
# 0.0, so zeros are clamped to the smallest positive normal float first.
# The p-value is passed to max() first so that a NaN stays NaN.
smallestPositive = np.finfo(float).tiny
logpList.extend(-math.log10(max(x, smallestPositive)) for x in pList)
logpList.sort()
# Benjamini-Hochberg FDR adjustment of the p-values in pList.
#
# Each p-value is scaled by size / rank (rank 1 = smallest p-value). The
# scaled values are then made non-decreasing in rank by taking the running
# minimum from the largest p-value downwards, and capped at 1 so that the
# result is still a valid p-value.
#
# The result is written to a new list, in the same order as pList; pList
# itself is left untouched (the previous `correctPList = pList` made both
# names refer to one list, so the raw p-values were overwritten).
size = len(pList)
if size:
    pValues = np.asarray(pList, dtype=float)
    order = np.argsort(pValues)
    scaled = pValues[order] * size / np.arange(1, size + 1)
    # fmin ignores NaN while accumulating, so a NaN p-value (sorted last)
    # does not wipe out every other adjusted value.
    monotone = np.fmin.accumulate(scaled[::-1])[::-1]
    adjusted = np.empty(size, dtype=float)
    # minimum (unlike fmin) keeps NaN entries as NaN instead of turning them into 1.
    adjusted[order] = np.minimum(monotone, 1.0)
    correctPList = adjusted.tolist()
else:
    correctPList = []
# Report the sample skewness of the r values, followed by the result of
# scipy's skewness test on them. The parenthesised single-argument form
# prints the same text on Python 2 and is valid syntax on Python 3.
print(stats.skew(rList))
print(stats.skewtest(rList))
# Histogram of the r values over the full correlation range [-1, 1].
plt.hist(rList, bins='auto', range=(-1, 1))
plt.title("r hist with auto bins")
plt.show()

# Histogram of the -log10 transformed raw p-values.
#binList = [0,0.01,0.02,0.03,0.04,0.05]#custom bin range.
plt.hist(logpList, bins='auto')  # range=(0,1))
plt.title("p-value hist with auto bins")
plt.show()

# Histogram of the -log10 transformed FDR-corrected p-values.
# math.log10(0) raises ValueError, so exact zeros are clamped to the smallest
# positive normal float; the value is passed to max() first so NaN stays NaN.
smallestPositive = np.finfo(float).tiny
plt.hist(
    [-math.log10(max(i, smallestPositive)) for i in correctPList],
    bins='auto',
)  # range=(0,1))
# Distinct title so this plot cannot be mistaken for the raw p-value one.
plt.title("corrected p-value hist with auto bins")
plt.show()
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment