Skip to content

Instantly share code, notes, and snippets.

@cplaisier
Forked from Dbyrum/task1part2.ipynb
Last active July 25, 2018 15:48
Show Gist options
  • Select an option

  • Save cplaisier/a26da2b2b9da8fcff9c4bcf33a0862db to your computer and use it in GitHub Desktop.

Select an option

Save cplaisier/a26da2b2b9da8fcff9c4bcf33a0862db to your computer and use it in GitHub Desktop.
adding a new file to the task in order to pull new information and create another network
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"74\n"
]
}
],
"source": [
"# Import libraries up front\n",
"import json\n",
"\n",
"# From Table S13 in Plaisier et al., Cell Systems 2016\n",
"# These are Entrez IDs (https://www.ncbi.nlm.nih.gov/pmc/articles/PMC3013746/)\n",
"input = ['430', '1052', '1053', '1385', '84699', '9586', '1871', '1874', '144455', '79733', '1960', '1997', '2002', '2004', '80712', '2114', '2115', '2120', '51513', '2551', '2623', '2624', '2625', '9421', '3232', '10320', '3659', '3662', '3670', '91464', '3726', '10661', '11278', '128209', '10365', '9314', '1316', '51176', '9935', '23269', '4602', '4774', '4790', '7025', '9480', '5468', '5914', '5916', '3516', '5971', '864', '6257', '4093', '6659', '6660', '6662', '25803', '347853', '30009', '9496', '6929', '6925', '8463', '7022', '29842', '10155', '6935', '132625', '23051', '85416', '7707', '7764', '23528', '201516']\n",
"print(len(input))\n",
"# Loading JSON file\n",
"# https://www.safaribooksonline.com/library/view/python-cookbook-3rd/9781449357337/ch06s02.html\n",
"# Example:\n",
"# import json\n",
"#\n",
"# # Reading data back\n",
"# with open('data.json', 'r') as f:\n",
"# data = json.load(f)\n",
"\n",
"# Reading TF regulator to TF target gene relationships into Python\n",
"# The json library we import takes care of most of the work\n",
"with open('tfbsDb_plus_and_minus_5000_entrez.json', 'r') as f:\n",
" tfbsDb = json.load(f)"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['SOX10_HMG_full_dimeric_16_1', 'V_AP2ALPHA_01_M00469', 'V_SIX6_01_M01345', 'Pitx1.1', 'ELF1_ETS_full_monomeric_12_1']\n"
]
}
],
"source": [
"# Example set of keys in tfbsDb, they are Motif IDs (http://jaspar.genereg.net/search?q=Homo%20sapiens&collection=CORE&tax_group=vertebrates)\n",
"print(list(tfbsDb.keys())[0:5])"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['10', '100131211', '100288797', '100302736', '10057']\n"
]
}
],
"source": [
"# Example set of values under a specific Motif ID, they are Entrez IDs\n",
"print(tfbsDb[list(tfbsDb.keys())[1]][0:5])"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"1185\n"
]
}
],
"source": [
"print(len(tfbsDb[list(tfbsDb.keys())[0]]))"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {},
"outputs": [],
"source": [
"Family2Id = {}\n",
"Id2Family = {}\n",
"with open('id_conversion/tfFamilies.csv','r', encoding='iso-8859-1') as openFile: # opening file \n",
" header = openFile.readline().strip().split(',') # reading in first line of file as header\n",
" #print(header)\n",
" while 1:\n",
" inLine = openFile.readline()\n",
" if not inLine:\n",
" break\n",
" strip = inLine.strip().split(',')\n",
" strip2 = strip[2].split(' ') # strip2 to separate Entrez ID values \n",
" Family2Id[strip[0]] = strip2 # adding keys and values to Family2Id dictionary from file\n",
" \n",
" for IdList in strip2:\n",
" Id2Family[IdList] = []\n",
" Id2Family[IdList] = strip[0]\n",
"\n",
" \n",
"#print (Id2Family.keys()) \n",
"#print (Family2Id.values())"
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"['Motif Name', 'Gene Symbol', 'Entrez ID']\n"
]
}
],
"source": [
"# Read in humanTFs file\n",
" \n",
"id2motif = {}\n",
"motif2id = {}\n",
"with open('id_conversion/humanTFs_All.csv','r') as inFile:\n",
" # Use the readline() function to read in a single line\n",
" # strip() gets rid of the newline character at the end of the line\n",
" # split(',') splits up the line into columns based on commas\n",
" header = inFile.readline().strip().split(',')\n",
" print (header)\n",
" while 1:\n",
" inLine = inFile.readline()\n",
" if not inLine:\n",
" break\n",
" split = inLine.strip().split(',') \n",
" \n",
" # Fill out the id2motif dictionary (key = Entrez ID, value = list of Motif Names)\n",
" # if split[2]\n",
" \n",
" if not split[2] in id2motif:\n",
" id2motif[split[2]] = []\n",
" id2motif[split[2]].append(split[0]) \n",
" # Fill out the motif2id dictionary (key = Motif Name, value = Entrez ID)\n",
" motif2id[split[0]]=split[2]\n",
" \n",
" \n",
" \n",
"\n",
"#print(len(motif2id))\n",
"#print(len(id2motif.keys()))\n",
"#print(id2motif)"
]
},
{
"cell_type": "code",
"execution_count": 8,
"metadata": {
"scrolled": true
},
"outputs": [],
"source": [
"\n",
"## Build a TF regulator to TF target gene network (constrained to TFs within the input list).\n",
"## This will require mapping from:\n",
"## 1. Input list of potential TF regulator Entrez Gene IDs (input) \n",
"## 2. List of Motif IDs for an Entrez Gene ID in the input list (either id2motif or motif2id)\n",
"## 3. TF target genes that are Entrez Gene IDs that are the values under a specific Motif ID in tfbsDb\n",
"## 4. Restrict TF target genes to only those in the input list\n",
"## 5. Add a new entry to the tfNetwork dictionary with the TF regulator as the key and its restricted list of TF target genes as the values\n",
"tfNetwork = {}\n",
"tfFamilies = {}\n",
"\n",
"for eachTfReg in input: # visit each Entrez Gene ID in the input list as a candidate TF regulator\n",
" if eachTfReg in id2motif:\n",
" for eachMotif in id2motif[eachTfReg]: # loop over every Motif ID mapped to this TF regulator\n",
" if eachMotif in tfbsDb:\n",
" targets = tfbsDb[eachMotif] # target gene Entrez IDs listed in tfbsDb under this Motif ID\n",
" for eachTarget in targets:\n",
" if not eachTfReg in tfNetwork:\n",
" tfNetwork[eachTfReg] = []\n",
" if eachTarget in input and not eachTarget in tfNetwork[eachTfReg]:\n",
" tfNetwork[eachTfReg].append(eachTarget)\n",
" else:\n",
" for eachFamily in Family2Id:\n",
" if eachTfReg in Family2Id[eachFamily]:\n",
" for eachId in Family2Id[eachFamily]:\n",
" if eachId in id2motif:\n",
" for eachMotif in id2motif[eachId]: # loop over every Motif ID mapped to this family member (eachId)\n",
" if eachMotif in tfbsDb:\n",
" targets = tfbsDb[eachMotif] # target gene Entrez IDs listed in tfbsDb under this Motif ID\n",
" for eachTarget in targets:\n",
" if not eachTfReg in tfNetwork:\n",
" tfNetwork[eachTfReg] = []\n",
" if eachTarget in input and not eachTarget in tfNetwork[eachTfReg]:\n",
" tfNetwork[eachTfReg].append(eachTarget)\n",
" \n"
]
},
{
"cell_type": "code",
"execution_count": 30,
"metadata": {},
"outputs": [
{
"data": {
"image/png": "\n",
"text/plain": [
"<matplotlib.figure.Figure at 0x24774482860>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"import networkx as nx\n",
"import matplotlib.pyplot as plt\n",
"\n",
"netConnections = []\n",
"for TFreg in tfNetwork:\n",
" for TFtarg in tfNetwork[TFreg]:\n",
" netConnections.append((TFreg,TFtarg))\n",
"\n",
" \n",
"#print(netConnections)\n",
"\n",
"G = nx.DiGraph()\n",
"G.add_edges_from(netConnections)\n",
"#print(G)\n",
"pos = nx.spring_layout(G)\n",
"nx.draw_networkx_nodes(G, pos, cmap=plt.get_cmap('jet'), node_size = 500)\n",
"nx.draw_networkx_labels(G, pos)\n",
"nx.draw_networkx_edges(G, pos)\n",
"plt.show()\n"
]
},
{
"cell_type": "code",
"execution_count": 111,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"12\n",
"['84699', '9421', '3726', '128209', '10365', '1316', '5971', '4093', '347853', '8463', '23051', '85416']\n"
]
}
],
"source": [
"l1 = []\n",
"for eachTfReg in input:\n",
" if not eachTfReg in tfNetwork:\n",
" #print (eachTfReg)\n",
" l1.append(eachTfReg)\n",
" \n",
"print(len(l1))\n",
"print(l1)"
]
},
{
"cell_type": "code",
"execution_count": 118,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"True"
]
},
"execution_count": 118,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'84699' in Family2Id['CREB-3-like factors']"
]
},
{
"cell_type": "code",
"execution_count": 103,
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"False"
]
},
"execution_count": 103,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"'142' in tfbsDb['Ascl2.1']"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.6.4"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment