Skip to content

Instantly share code, notes, and snippets.

@spartonia
Last active August 29, 2015 14:19
Show Gist options
  • Select an option

  • Save spartonia/6983cca449ed0324c516 to your computer and use it in GitHub Desktop.

Select an option

Save spartonia/6983cca449ed0324c516 to your computer and use it in GitHub Desktop.
WattyClustering
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"import numpy as np \n",
"from sklearn.cluster import MeanShift, estimate_bandwidth\n",
"import pandas as pd \n",
"import matplotlib.pyplot as plt\n",
"from matplotlib.mlab import PCA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Location of the input table, relative to the notebook's working directory.\n",
"path = 'data.csv'\n",
"\n",
"# Load the CSV into a pandas DataFrame.\n",
"X = pd.read_csv(path)"
]
},
{
"cell_type": "code",
"execution_count": 3,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Run a principal component analysis on the loaded data.\n",
"result = PCA(X)"
]
},
{
"cell_type": "code",
"execution_count": 4,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Plot the results of the PCA for inspection: the fraction of variance\n",
"# carried by each component (bars) and the running total (line).\n",
"cum = np.cumsum(result.fracs)\n",
"\n",
"# Derive the x positions from the PCA result instead of hard-coding 50,\n",
"# so the plot works for any number of input columns.\n",
"components = np.arange(len(result.fracs))\n",
"\n",
"plt.bar(components, result.fracs, color='#88aa33', label='components')\n",
"plt.plot(components, cum, 'bo-', label='cumulative')\n",
"plt.xlabel('principal component')\n",
"plt.ylabel('fraction of variance')\n",
"plt.legend()\n",
"plt.show()\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"The plot shows that the top 20 principal components account for almost all of the variance in the data, so we keep only the first 20 columns of the projected data.\n"
]
},
{
"cell_type": "code",
"execution_count": 5,
"metadata": {
"collapsed": false
},
"outputs": [],
"source": [
"# Keep only the first 20 columns of the PCA-projected data.\n",
"data = result.Y[:, :20]"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Since we don't know how many classes we have, we use mean-shift clustering, which determines the number of clusters automatically. "
]
},
{
"cell_type": "code",
"execution_count": 6,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"number of estimated clusters : 45\n"
]
}
],
"source": [
"# Cluster the reduced data with MeanShift.\n",
"# The kernel bandwidth is estimated from the data itself; parameter tuning\n",
"# is required here, for 'quantile' in particular.\n",
"bandwidth = estimate_bandwidth(data, n_samples=30, quantile=0.10)\n",
"\n",
"# fit() returns the estimator, so construction and fitting can be chained.\n",
"ms = MeanShift(bandwidth=bandwidth).fit(data)\n",
"labels = ms.labels_\n",
"\n",
"# Each distinct label corresponds to one cluster.\n",
"labels_unique = np.unique(labels)\n",
"n_clusters_ = labels_unique.size\n",
"\n",
"print(\"number of estimated clusters : %d\" % n_clusters_)\n"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"Plotting the results based on the number of clusters. This gives us better insight into the data and helps with parameter tuning. "
]
},
{
"cell_type": "code",
"execution_count": 7,
"metadata": {
"collapsed": false
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"a\t=\t6\n",
"b\t=\t7\n",
"a*b\t=\t42\n",
"n\t=\t39\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"C:\\Python27\\Lib\\site-packages\\matplotlib\\axes\\_subplots.py:69: MatplotlibDeprecationWarning: The use of 0 (which ends up being the _last_ sub-plot) is deprecated in 1.4 and will raise an error in 1.5\n",
" mplDeprecation)\n"
]
}
],
"source": [
"# Plot every cluster in its own subplot.\n",
"# The grid is sized from the number of clusters: `a` rows by `b` columns,\n",
"# chosen so that a * b >= n_clusters_.\n",
"a = np.floor(n_clusters_**0.5).astype(int)\n",
"b = np.ceil(1.*n_clusters_/a).astype(int)\n",
"print(\"a\\t=\\t%d\\nb\\t=\\t%d\\na*b\\t=\\t%d\\nn\\t=\\t%d\" % (a, b, a*b, n_clusters_))\n",
"\n",
"fig = plt.figure(figsize=(2.*b, 2.*a))\n",
"\n",
"# Shared y-limits so the subplots are directly comparable.\n",
"ymax = np.ceil(np.amax(data)).astype(int)\n",
"ymin = np.floor(np.amin(data)).astype(int)\n",
"\n",
"# The x positions are identical for every row, so build them once.\n",
"x_positions = np.arange(data.shape[1])\n",
"\n",
"for c_label in range(n_clusters_):\n",
"    # Subplot indices are 1-based; passing 0 is deprecated in matplotlib 1.4\n",
"    # and raises an error from 1.5 on, so shift the cluster label by one.\n",
"    ax = fig.add_subplot(a, b, c_label + 1)\n",
"    lbls = data[labels == c_label]\n",
"    for row in lbls:\n",
"        ax.plot(x_positions, row)\n",
"    ax.set_ylim([ymin, ymax])\n",
"    ax.set_title(\"cluster = %d\" % c_label)\n",
"\n",
"fig.suptitle(\"n_clusters = %d\" % n_clusters_)\n",
"fig.set_tight_layout(True)\n",
"plt.show()"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"collapsed": true
},
"outputs": [],
"source": []
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 2",
"language": "python",
"name": "python2"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 2
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython2",
"version": "2.7.8"
}
},
"nbformat": 4,
"nbformat_minor": 0
}
@spartonia
Copy link
Author

uploaded

Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment