-
-
Save cplaisier/54a68aab3924e1dca816362896fee346 to your computer and use it in GitHub Desktop.
Reads a CSV of gene expression values and plots histograms of the pairwise Pearson r values and p-values.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| # -*- coding: utf-8 -*- | |
| """ | |
| Created on Sat Apr 07 21:10:33 2018 | |
| @author: Fuzzy | |
| """ | |
| import math | |
| import pandas as pd | |
| import numpy as np | |
| import matplotlib.pyplot as plt | |
| from scipy import stats | |
# Transcription-factor IDs to keep; they are compared against str(index value)
# of the expression table, so they are stored as strings.
tfStartId = [
    '430', '1052', '1053', '1385', '84699', '9586', '1871', '1874',
    '144455', '79733', '1960', '1997', '2002', '2004', '80712', '2114',
    '2115', '2120', '51513', '2551', '2623', '2624', '2625', '9421',
    '3232', '10320', '3659', '3662', '3670', '91464', '3726', '10661',
    '11278', '128209', '10365', '9314', '1316', '51176', '9935', '23269',
    '4602', '4774', '4790', '7025', '9480', '5468', '5914', '5916',
    '3516', '5971', '864', '6257', '4093', '6659', '6660', '6662',
    '25803', '347853', '30009', '9496', '6929', '6925', '8463', '7022',
    '29842', '10155', '6935', '132625', '23051', '85416', '7707', '7764',
    '23528', '201516',
]


def pairwisePearson(expr, wantedIds):
    """Correlate every ordered pair of selected rows of a data frame.

    Parameters:
        expr: pandas DataFrame whose index holds the row IDs.
        wantedIds: iterable of ID strings; a row is selected when
            str(index value) is one of them.

    Returns:
        (key1List, key2List, rList, pList): four parallel lists holding, for
        each ordered pair of distinct selected rows, the two index keys, the
        Pearson r and its p-value, in index order.
    """
    wanted = set(wantedIds)  # O(1) membership instead of scanning a list
    # Filter once instead of re-testing every key inside the nested loop.
    selected = [key for key in expr.index.values if str(key) in wanted]

    keys1, keys2, rValues, pValues = [], [], [], []
    for key in selected:
        row1 = expr.loc[key]
        for key2 in selected:
            if key2 == key:
                continue
            # NOTE(review): both (a, b) and (b, a) are evaluated, so every
            # unordered pair is counted twice downstream -- confirm intended.
            result = stats.pearsonr(row1, expr.loc[key2])
            keys1.append(key)
            keys2.append(key2)
            rValues.append(result[0])
            pValues.append(result[1])
    return keys1, keys2, rValues, pValues


def negLog10(pValues):
    """Return [-log10(p) for p in pValues] as a list of floats.

    A p-value of exactly 0.0 (it can underflow for perfectly correlated
    rows) is floored at the smallest positive normal float, because
    math.log10(0) raises ValueError.
    """
    floor = np.finfo(float).tiny
    return [-math.log10(max(p, floor)) for p in pValues]


def benjaminiHochberg(pValues):
    """Return Benjamini-Hochberg (FDR) adjusted p-values.

    Parameters:
        pValues: sequence of raw p-values, in any order.

    Returns:
        A new list of adjusted p-values in the same order as the input.
        The input is not modified.  Each value is min(1, min over all ranks
        j >= rank(p) of p_(j) * m / j), with m = len(pValues).
    """
    p = np.asarray(pValues, dtype=float)
    m = p.size
    if m == 0:
        return []
    # Stable sort so tied p-values keep their input order.
    order = np.argsort(p, kind='mergesort')
    scaled = p[order] * (float(m) / np.arange(1, m + 1))
    # Step-up rule: an adjusted value may not exceed that of any larger
    # p-value.  fmin (not minimum) keeps a NaN from propagating to the rest.
    adjusted = np.fmin.accumulate(scaled[::-1])[::-1]
    adjusted = np.minimum(adjusted, 1.0)  # a p-value cannot exceed 1
    result = np.empty(m, dtype=float)
    result[order] = adjusted
    return result.tolist()


if __name__ == "__main__":
    # Load the expression table; the first CSV column becomes the index
    # (presumably one gene ID per row -- verify against the input file).
    df = pd.read_csv('tfExp.csv',header=0,index_col=0)

    # r and p values for every ordered pair of transcription-factor rows.
    key1List, key2List, rList, pList = pairwisePearson(df, tfStartId)

    # NOTE: after this sort pList no longer lines up with key1List,
    # key2List and rList; it is only used for the histograms below.
    pList.sort()

    # logarithmic transformation.
    logpList = negLog10(pList)
    logpList.sort()

    # FDR: corrected p-values, built as a new list so pList stays intact.
    correctPList = benjaminiHochberg(pList)

    print(stats.skew(rList))
    print(stats.skewtest(rList))

    # create and show histogram of r values from -1 to 1
    plt.hist(rList,bins='auto',range=(-1,1))
    plt.title("r hist with auto bins")
    plt.show()

    # create and show histogram of -log10(p) values
    plt.hist(logpList,bins='auto')
    plt.title("p-value hist with auto bins")
    plt.show()

    # create and show histogram of -log10(FDR-corrected p) values
    plt.hist(negLog10(correctPList),bins='auto')
    plt.title("FDR-corrected p-value hist with auto bins")
    plt.show()
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment