Skip to content

Instantly share code, notes, and snippets.

@cplaisier
Created January 28, 2021 22:23
Show Gist options
  • Select an option

  • Save cplaisier/1a9a2a28307d3e96c32cd2d0f04a8b5d to your computer and use it in GitHub Desktop.

Select an option

Save cplaisier/1a9a2a28307d3e96c32cd2d0f04a8b5d to your computer and use it in GitHub Desktop.
## Docker instance for analysis
# docker run -it -v "/home/bgasvoda/Pia:/files" cplaisier/scrna_seq_velocity
# Commands to run on Docker instance before starting
# cd /files
# pip3 install kneed
# python3
import pandas as pd
import numpy as np
import json
import seaborn as sns
import matplotlib.pyplot as plt
from matplotlib.backends.backend_pdf import PdfPages
d1 = pd.read_csv('Achilles_gene_effect.csv', header=0, index_col=0)
# Identifying the most highly variable genes
d1var = d1.var(axis=0)
d1top = d1var.sort_values(ascending=False).head(n=5000)
valid_cols = [c for c in d1 if c in d1top]
d1 = d1[valid_cols]
# Calculating correlation
c1 = d1.corr(method='pearson')
c1.to_csv('Achilles_gene_effect_PearsonCorrelation_highly_variable.csv')
c1full = pd.read_csv('Achilles_gene_effect_PearsonCorrelation_8_14_2020.csv', header=0, index_col=0)
c1 = pd.read_csv('Achilles_gene_effect_PearsonCorrelation_highly_variable.csv', header=0, index_col=0)
# Relabel index and columns by gene symbol only
tmp1 = [i.split(' ')[0] for i in c1.columns]
c1.index = tmp1
c1.columns = tmp1
# Load gene sets of interest
with open('geneLists_10_31_2019.json','r') as inFile:
geneLists = json.load(inFile)
sns.clustermap(c1.loc[geneLists['RNAPol'],geneLists['RNAPol']])
plt.show()
#colors1 = ['pale red','white','medium green']
with PdfPages('clustermaps_correlation_8_14_2020.pdf') as pdf:
for list1 in geneLists.keys():
tmp2 = [i for i in geneLists[list1] if i in c1.index]
sns.clustermap(c1.loc[tmp2,tmp2], cmap=sns.color_palette("RdBu_r", 7), standard_scale=None, z_score=None, vmin=-1, vmax=1)
plt.title(list1)
#plt.show()
pdf.savefig()
# Converting correlation into distance
corr1 = c1
corr1 = corr1.mask(corr1 < 0, 0)
dist1 = 1-corr1
#############
# Clustering - kmeans - Full Achilles data
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
d1full = pd.read_csv('Achilles_gene_effect.csv', header=0, index_col=0)
# skipping scaling
tmp1 = [i.split(' ')[0] for i in d1full.columns]
d1full.columns = tmp1
scaled_features = d1full.dropna().to_numpy()
kmeans_kwargs = {"init": "random","n_init": 10,"max_iter": 300,"random_state": 42,}
sse = []
for k in range(1, 31):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(scaled_features)
sse.append(kmeans.inertia_)
kl = KneeLocator(range(1, 31), sse, curve="convex", direction="decreasing")
plt.plot(range(1, 31), sse)
plt.xticks(range(1, 31))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.savefig("figures/kmeans_full_achilles_clusters_elbowplot.pdf")
plt.close()
k = kl.elbow #from elbow plot
kmeans = KMeans(n_clusters=k,init="random",n_init=10,max_iter=300,random_state=42)
kmeans.fit(scaled_features)
kmeans.inertia_
kmeans.cluster_centers_
kmeans.n_iter_
kmeans.labels_
dict1 = dict(zip(d1.index, kmeans.labels_))
clusterids = pd.DataFrame()
clusterids['gene'] = d1full.index
clusterids['cluster'] = clusterids['gene'].map(dict1)
clusterids.to_csv("clusterdata_achilles_full.csv")
#############
# Clustering - kmeans - Full Coorelation data
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
c1full = pd.read_csv('Achilles_gene_effect_PearsonCorrelation_8_14_2020.csv', header=0, index_col=0)
# skipping scaling
tmp1 = [i.split(' ')[0] for i in c1full.columns]
c1full.index = tmp1
c1full.columns = tmp1
scaled_features = c1full.dropna().to_numpy()
kmeans_kwargs = {"init": "random","n_init": 10,"max_iter": 300,"random_state": 42,}
sse = []
for k in range(1, 11):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(scaled_features)
sse.append(kmeans.inertia_)
kl = KneeLocator(range(1, 21), sse, curve="convex", direction="decreasing")
plt.plot(range(1, 21), sse)
plt.xticks(range(1, 21))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.savefig("figures/kmeans_full_correlation_clusters_elbowplot.pdf")
plt.close()
k = kl.elbow #from elbow plot
kmeans = KMeans(n_clusters=k,init="random",n_init=10,max_iter=300,random_state=42)
kmeans.fit(scaled_features)
kmeans.inertia_
kmeans.cluster_centers_
kmeans.n_iter_
kmeans.labels_
dict1 = zip(c1full.index, kmeans.labels_)
clusterids = pd.DataFrame()
clusterids['gene'] = c1full.index
clusterids['cluster'] = clusterids['gene'].map(dict1)
clusterids.to_csv("clusterdata_correlation_full.csv")
#############
# Clustering - kmeans - Highly variable Coorelation data
import matplotlib.pyplot as plt
from kneed import KneeLocator
from sklearn.datasets import make_blobs
from sklearn.cluster import KMeans
from sklearn.metrics import silhouette_score
from sklearn.preprocessing import StandardScaler
c1 = pd.read_csv('Achilles_gene_effect_PearsonCorrelation_highly_variable.csv', header=0, index_col=0)
# skipping scaling
tmp1 = [i.split(' ')[0] for i in c1.columns]
c1.index = tmp1
c1.columns = tmp1
scaled_features = c1.dropna().to_numpy()
kmeans_kwargs = {"init": "random","n_init": 10,"max_iter": 300,"random_state": 42,}
sse = []
for k in range(1, 31):
kmeans = KMeans(n_clusters=k, **kmeans_kwargs)
kmeans.fit(scaled_features)
sse.append(kmeans.inertia_)
kl = KneeLocator(range(1, 31), sse, curve="convex", direction="decreasing")
plt.plot(range(1, 31), sse)
plt.xticks(range(1, 31))
plt.xlabel("Number of Clusters")
plt.ylabel("SSE")
plt.savefig("figures/kmeans_highlyvariable_correlation_clusters_elbowplot.pdf")
plt.close()
k = kl.elbow #from elbow plot
kmeans = KMeans(n_clusters=k,init="random",n_init=10,max_iter=300,random_state=42)
kmeans.fit(scaled_features)
kmeans.inertia_
kmeans.cluster_centers_
kmeans.n_iter_
kmeans.labels_
dict1 = dict(zip(c1.index, kmeans.labels_))
clusterids = pd.DataFrame()
clusterids['gene'] = c1.index
clusterids['cluster'] = clusterids['gene'].map(dict1)
clusterids.to_csv("clusterdata_correlation_highlyvariable.csv")
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment