Skip to content

Instantly share code, notes, and snippets.

@gangtao
Created March 11, 2019 18:23
Show Gist options
  • Select an option

  • Save gangtao/e2e7200a5370dbdc71840ff73eeef123 to your computer and use it in GitHub Desktop.

Select an option

Save gangtao/e2e7200a5370dbdc71840ff73eeef123 to your computer and use it in GitHub Desktop.
Gensim embedding sample
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### create an embedding model using gensim"
]
},
{
"cell_type": "code",
"execution_count": 1,
"metadata": {},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"Word2Vec(vocab=16, size=100, alpha=0.025)\n",
"['fred', 'is', 'a', 'software', 'engineer', 'xander', 'also', 'can', 'write', 'program', 'knows', 'lot', 'of', 'machine', 'learnnig', 'algorithms']\n",
"[-5.5893237e-04 1.7668052e-03 1.5800461e-03 -2.9155423e-03\n",
" -4.8828649e-04 -1.5046246e-03 2.4005286e-03 -3.3505422e-03\n",
" 2.3780200e-03 -1.3460566e-03 -4.1362685e-03 -4.2379056e-03\n",
" 5.4847426e-04 4.1570985e-03 -2.1742571e-03 2.5030107e-03\n",
" 3.5490937e-04 4.9885712e-03 -4.2925519e-03 -2.6955188e-03\n",
" 2.2162867e-04 3.3585289e-03 3.4013640e-03 -1.0012955e-03\n",
" -1.6629590e-03 -1.8072047e-03 -1.8717305e-04 -5.0179119e-04\n",
" 4.7619347e-03 1.8071961e-03 3.4890659e-03 -3.0099638e-03\n",
" 4.9460814e-03 -2.1814685e-03 3.0371203e-04 1.2496420e-03\n",
" 2.5786117e-03 4.9241213e-03 2.6873725e-03 -4.6886608e-04\n",
" 3.0186286e-03 -9.6760318e-04 -4.6525244e-03 2.4885244e-03\n",
" -3.2226990e-03 1.9304224e-03 -2.2795333e-03 4.6114167e-03\n",
" -1.1173520e-03 -2.2942331e-03 -2.7623768e-03 6.2199228e-04\n",
" 3.7770690e-03 2.5790252e-04 4.7200592e-03 2.1978549e-03\n",
" 1.8990934e-03 2.1505095e-03 4.9980511e-03 -2.9854581e-03\n",
" 8.1377748e-06 4.4536749e-03 9.3884580e-04 2.2312454e-03\n",
" -1.0586825e-04 -2.1196983e-03 2.0551309e-03 2.4646961e-03\n",
" 2.6276743e-03 -2.0470645e-04 4.9548398e-04 -2.5170366e-03\n",
" 1.3753168e-03 3.8361412e-03 3.1737713e-03 -4.0052771e-03\n",
" 3.9899144e-03 3.3078534e-03 -1.5132531e-03 -3.7289991e-03\n",
" -3.3434983e-03 3.6548586e-03 1.7128733e-03 1.7006679e-03\n",
" -3.5741429e-03 4.5237918e-03 -1.0361242e-03 2.5291061e-03\n",
" -2.5380929e-03 -2.2884402e-03 -3.5806587e-03 4.5312424e-03\n",
" -3.9801579e-03 1.8670280e-03 3.7243459e-03 -1.0268994e-03\n",
" 4.5204922e-03 -4.8972399e-04 3.1113774e-03 4.0464094e-03]\n",
"Word2Vec(vocab=16, size=100, alpha=0.025)\n"
]
},
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:15: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" from ipykernel import kernelapp as app\n"
]
}
],
"source": [
"from gensim.models import Word2Vec\n",
"\n",
"# Toy training corpus: one list of tokens per sentence.\n",
"sentences = [\n",
"    ['fred', 'is', 'a', 'software', 'engineer'],\n",
"    ['xander', 'is', 'also', 'a', 'software', 'engineer'],\n",
"    ['fred', 'can', 'write', 'software', 'program'],\n",
"    ['xander', 'knows', 'a', 'lot', 'of', 'machine', 'learnnig', 'algorithms'],\n",
"]\n",
"\n",
"# Train the model; min_count=1 keeps words that occur only once in the corpus.\n",
"model = Word2Vec(sentences, min_count=1)\n",
"# Summarize the trained model.\n",
"print(model)\n",
"\n",
"# Summarize the vocabulary the model learned.\n",
"words = list(model.wv.vocab)\n",
"print(words)\n",
"\n",
"# Access the vector for one word. Go through model.wv: indexing the Word2Vec\n",
"# object directly is deprecated and slated for removal in gensim 4.0.0.\n",
"print(model.wv['fred'])\n",
"\n",
"# Save the model to disk, then load it back to confirm the round trip.\n",
"model.save('model.bin')\n",
"new_model = Word2Vec.load('model.bin')\n",
"print(new_model)"
]
},
{
"cell_type": "markdown",
"metadata": {},
"source": [
"### visualize embedding result using PCA"
]
},
{
"cell_type": "code",
"execution_count": 2,
"metadata": {},
"outputs": [
{
"name": "stderr",
"output_type": "stream",
"text": [
"/opt/conda/lib/python3.7/site-packages/ipykernel_launcher.py:8: DeprecationWarning: Call to deprecated `__getitem__` (Method will be removed in 4.0.0, use self.wv.__getitem__() instead).\n",
" \n"
]
},
{
"data": {
"image/png": "\n",
"text/plain": [
"<Figure size 432x288 with 1 Axes>"
]
},
"metadata": {},
"output_type": "display_data"
}
],
"source": [
"%matplotlib inline\n",
"import pandas as pd\n",
"import seaborn as sns\n",
"import matplotlib.pyplot as plt\n",
"from sklearn.decomposition import PCA\n",
"\n",
"sns.set()\n",
"\n",
"# Collect the vocabulary once so the rows of X and the plot labels share one ordering.\n",
"words = list(model.wv.vocab)\n",
"# Stack one embedding per word. Go through model.wv: indexing the Word2Vec\n",
"# object directly is deprecated and slated for removal in gensim 4.0.0.\n",
"X = model.wv[words]\n",
"\n",
"# Fit a 2D PCA model to the vectors.\n",
"pca = PCA(n_components=2)\n",
"result = pca.fit_transform(X)\n",
"dataset = pd.DataFrame(data=result, columns=['x', 'y'])\n",
"\n",
"# Scatter the projected points and label each one with its word.\n",
"ax = sns.scatterplot(x=\"x\", y=\"y\", data=dataset)\n",
"for i, word in enumerate(words):\n",
"    ax.annotate(word, xy=(result[i, 0], result[i, 1]))"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.7.0"
}
},
"nbformat": 4,
"nbformat_minor": 2
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment