spartonia · August 29, 2015 14:19 · spartonia · Apr 20, 2015
diff --git a/WattyClustering.ipynb b/WattyClustering.ipynb
 {
 "cells": [
  {
   "cell_type": "code",
   "execution_count": 1,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "import numpy as np \n",
    "from sklearn.cluster import MeanShift, estimate_bandwidth\n",
    "import pandas as pd \n",
    "import matplotlib.pyplot as plt\n",
    "from matplotlib.mlab import PCA"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 2,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "path = 'data.csv'\n",
    "X = pd.read_csv(path) "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 3,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Perform a PCA on data \n",
    "result = PCA(X) \n",
    "    "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 4,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "# Plot the results of PCA for inspection.\n",
    "# result.fracs holds the per-component variance fractions; size the x axis\n",
    "# from it instead of assuming the data has exactly 50 components.\n",
    "fracs = result.fracs\n",
    "component_idx = np.arange(len(fracs))\n",
    "cum = np.cumsum(fracs)\n",
    "plt.bar(component_idx, fracs, color='#88aa33', label='components')\n",
    "plt.plot(component_idx, cum, 'bo-', label='cumulative')\n",
    "plt.legend()\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "We can see from the plot that the top 20 PCA components account for almost all of the variance in the data, so we keep only the first 20 columns of the projected data.\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 5,
   "metadata": {
    "collapsed": false
   },
   "outputs": [],
   "source": [
    "data = result.Y [:,:20]"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Since we don't know how many classes we have, we are going to use mean-shift clustering, which is able to automatically determine the number of classes. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 6,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "number of estimated clusters : 45\n"
     ]
    }
   ],
   "source": [
    "# Compute clustering with MeanShift\n",
    "# Estimate the kernel bandwidth automatically with estimate_bandwidth.\n",
    "# Parameter tuning is required here, especially for 'quantile'.\n",
    "bandwidth = estimate_bandwidth(data, quantile=0.10, n_samples=30)\n",
    "\n",
    "ms = MeanShift(bandwidth=bandwidth)\n",
    "ms.fit(data)\n",
    "labels = ms.labels_\n",
    "# print labels \n",
    "\n",
    "labels_unique = np.unique(labels)\n",
    "n_clusters_ = len(labels_unique)\n",
    "\n",
    "print(\"number of estimated clusters : %d\" % n_clusters_)\n"
   ]
  },
  {
   "cell_type": "markdown",
   "metadata": {},
   "source": [
    "Plotting the results, one subplot per cluster. This gives us better insight into the data and helps with parameter tuning. "
   ]
  },
  {
   "cell_type": "code",
   "execution_count": 7,
   "metadata": {
    "collapsed": false
   },
   "outputs": [
    {
     "name": "stdout",
     "output_type": "stream",
     "text": [
      "a\t=\t6\n",
      "b\t=\t7\n",
      "a*b\t=\t42\n",
      "n\t=\t39\n"
     ]
    },
    {
     "name": "stderr",
     "output_type": "stream",
     "text": [
      "C:\\Python27\\Lib\\site-packages\\matplotlib\\axes\\_subplots.py:69: MatplotlibDeprecationWarning: The use of 0 (which ends up being the _last_ sub-plot) is deprecated in 1.4 and will raise an error in 1.5\n",
      "  mplDeprecation)\n"
     ]
    }
   ],
   "source": [
    "# Plot every cluster in its own subplot.\n",
    "# The grid is sized from the number of clusters: `a` rows by `b` columns,\n",
    "# where b = ceil(n_clusters_ / a), so a*b >= n_clusters_.\n",
    "\n",
    "a = np.floor(n_clusters_**0.5).astype(int)\n",
    "b = np.ceil(1.*n_clusters_/a).astype(int)\n",
    "print(\"a\\t=\\t%d\\nb\\t=\\t%d\\na*b\\t=\\t%d\\nn\\t=\\t%d\" % (a,b,a*b,n_clusters_))\n",
    "\n",
    "fig = plt.figure(figsize=(2.*b,2.*a))\n",
    "\n",
    "# Shared y-limits so the subplots are directly comparable.\n",
    "ymax = np.ceil(np.amax(data)).astype(int)\n",
    "ymin = np.floor(np.amin(data)).astype(int)\n",
    "\n",
    "for c_label in range(n_clusters_):\n",
    "    # Subplot positions are 1-based; index 0 is deprecated and ends up\n",
    "    # addressing the last subplot, so shift the 0-based label by one.\n",
    "    ax = fig.add_subplot(a, b, c_label + 1)\n",
    "    members = data[labels==c_label]\n",
    "    for row in members:\n",
    "        ax.plot(range(members.shape[1]), row)\n",
    "    # Limits and title belong to the subplot, so set them once per cluster\n",
    "    # rather than once per plotted row.\n",
    "    ax.set_ylim([ymin, ymax])\n",
    "    ax.set_title(\"cluster = %d\" % c_label)\n",
    "\n",
    "fig.suptitle(\"n_clusters = %d\" % n_clusters_)\n",
    "fig.set_tight_layout(True)\n",
    "plt.show()\n"
   ]
  },
  {
   "cell_type": "code",
   "execution_count": null,
   "metadata": {
    "collapsed": true
   },
   "outputs": [],
   "source": []
  }
 ],
 "metadata": {
  "kernelspec": {
   "display_name": "Python 2",
   "language": "python",
   "name": "python2"
  },
  "language_info": {
   "codemirror_mode": {
    "name": "ipython",
    "version": 2
   },
   "file_extension": ".py",
   "mimetype": "text/x-python",
   "name": "python",
   "nbconvert_exporter": "python",
   "pygments_lexer": "ipython2",
   "version": "2.7.8"
  }
 },
 "nbformat": 4,
 "nbformat_minor": 0
 }
	{
	"cells": [
	{
	"cell_type": "code",
	"execution_count": 1,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"import numpy as np \n",
	"from sklearn.cluster import MeanShift, estimate_bandwidth\n",
	"import pandas as pd \n",
	"import matplotlib.pyplot as plt\n",
	"from matplotlib.mlab import PCA"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 2,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"path = 'data.csv'\n",
	"X = pd.read_csv(path) "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 3,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Perform a PCA on data \n",
	"result = PCA(X) \n",
	" "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 4,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"# Plot the results of PCA for inspection.\n",
	"# result.fracs holds the per-component variance fractions; size the x axis\n",
	"# from it instead of assuming the data has exactly 50 components.\n",
	"fracs = result.fracs\n",
	"component_idx = np.arange(len(fracs))\n",
	"cum = np.cumsum(fracs)\n",
	"plt.bar(component_idx, fracs, color='#88aa33', label='components')\n",
	"plt.plot(component_idx, cum, 'bo-', label='cumulative')\n",
	"plt.legend()\n",
	"plt.show()\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"We can see from the plot that the top 20 PCA components account for almost all of the variance in the data, so we keep only the first 20 columns of the projected data.\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": 5,
	"metadata": {
	"collapsed": false
	},
	"outputs": [],
	"source": [
	"data = result.Y [:,:20]"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Since we don't know how many classes we have, we are going to use mean-shift clustering, which is able to automatically determine the number of classes. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 6,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"number of estimated clusters : 45\n"
	]
	}
	],
	"source": [
	"# Compute clustering with MeanShift\n",
	"# Estimate the kernel bandwidth automatically with estimate_bandwidth.\n",
	"# Parameter tuning is required here, especially for 'quantile'.\n",
	"bandwidth = estimate_bandwidth(data, quantile=0.10, n_samples=30)\n",
	"\n",
	"ms = MeanShift(bandwidth=bandwidth)\n",
	"ms.fit(data)\n",
	"labels = ms.labels_\n",
	"# print labels \n",
	"\n",
	"labels_unique = np.unique(labels)\n",
	"n_clusters_ = len(labels_unique)\n",
	"\n",
	"print(\"number of estimated clusters : %d\" % n_clusters_)\n"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {},
	"source": [
	"Plotting the results, one subplot per cluster. This gives us better insight into the data and helps with parameter tuning. "
	]
	},
	{
	"cell_type": "code",
	"execution_count": 7,
	"metadata": {
	"collapsed": false
	},
	"outputs": [
	{
	"name": "stdout",
	"output_type": "stream",
	"text": [
	"a\t=\t6\n",
	"b\t=\t7\n",
	"a*b\t=\t42\n",
	"n\t=\t39\n"
	]
	},
	{
	"name": "stderr",
	"output_type": "stream",
	"text": [
	"C:\\Python27\\Lib\\site-packages\\matplotlib\\axes\\_subplots.py:69: MatplotlibDeprecationWarning: The use of 0 (which ends up being the _last_ sub-plot) is deprecated in 1.4 and will raise an error in 1.5\n",
	" mplDeprecation)\n"
	]
	}
	],
	"source": [
	"# Plot every cluster in its own subplot.\n",
	"# The grid is sized from the number of clusters: `a` rows by `b` columns,\n",
	"# where b = ceil(n_clusters_ / a), so a*b >= n_clusters_.\n",
	"\n",
	"a = np.floor(n_clusters_**0.5).astype(int)\n",
	"b = np.ceil(1.*n_clusters_/a).astype(int)\n",
	"print(\"a\\t=\\t%d\\nb\\t=\\t%d\\na*b\\t=\\t%d\\nn\\t=\\t%d\" % (a,b,a*b,n_clusters_))\n",
	"\n",
	"fig = plt.figure(figsize=(2.*b,2.*a))\n",
	"\n",
	"# Shared y-limits so the subplots are directly comparable.\n",
	"ymax = np.ceil(np.amax(data)).astype(int)\n",
	"ymin = np.floor(np.amin(data)).astype(int)\n",
	"\n",
	"for c_label in range(n_clusters_):\n",
	"    # Subplot positions are 1-based; index 0 is deprecated and ends up\n",
	"    # addressing the last subplot, so shift the 0-based label by one.\n",
	"    ax = fig.add_subplot(a, b, c_label + 1)\n",
	"    members = data[labels==c_label]\n",
	"    for row in members:\n",
	"        ax.plot(range(members.shape[1]), row)\n",
	"    # Limits and title belong to the subplot, so set them once per cluster\n",
	"    # rather than once per plotted row.\n",
	"    ax.set_ylim([ymin, ymax])\n",
	"    ax.set_title(\"cluster = %d\" % c_label)\n",
	"\n",
	"fig.suptitle(\"n_clusters = %d\" % n_clusters_)\n",
	"fig.set_tight_layout(True)\n",
	"plt.show()\n"
	]
	},
	{
	"cell_type": "code",
	"execution_count": null,
	"metadata": {
	"collapsed": true
	},
	"outputs": [],
	"source": []
	}
	],
	"metadata": {
	"kernelspec": {
	"display_name": "Python 2",
	"language": "python",
	"name": "python2"
	},
	"language_info": {
	"codemirror_mode": {
	"name": "ipython",
	"version": 2
	},
	"file_extension": ".py",
	"mimetype": "text/x-python",
	"name": "python",
	"nbconvert_exporter": "python",
	"pygments_lexer": "ipython2",
	"version": "2.7.8"
	}
	},
	"nbformat": 4,
	"nbformat_minor": 0
	}
No results found