Skip to content

Instantly share code, notes, and snippets.

@skyer9
Created May 4, 2019 13:18
Show Gist options
  • Select an option

  • Save skyer9/ed004b397e1569715e26ac5a6a692259 to your computer and use it in GitHub Desktop.

Select an option

Save skyer9/ed004b397e1569715e26ac5a6a692259 to your computer and use it in GitHub Desktop.
word_embedding.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "word_embedding.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/skyer9/ed004b397e1569715e26ac5a6a692259/word_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hRjnz03pH1Ge",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "a91b147e-b359-4636-be55-962dc9a29071"
},
"source": [
"from google.colab import drive\n",
"drive.mount('/content/drive', force_remount=True)"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KiGi259EIKtW",
"colab_type": "code",
"colab": {}
},
"source": [
"!cp \"/content/drive/My Drive/for_voca.csv\" \"for_voca.csv\""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dKbFBjBDIgR7",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 359
},
"outputId": "3dabda54-48fb-4934-a862-458199e5c124"
},
"source": [
"import pandas as pd\n",
"\n",
"filepath = 'for_voca.csv'\n",
"\n",
"df = pd.read_csv(filepath, names=['itemname'], sep='\\t', header=0)\n",
"df.dropna()\n",
"\n",
"df.head(n=10)"
],
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>itemname</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>์Šคํƒฌํ•‘ ๋งคํŠธ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>FRESH A simply mannish jacket_(1166989)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>์ด์–ดํฐ๋งˆ๊ฒŒ ๊ณ ๋ฌดํ•ธ๋“œํฐ๊ณ ๋ฆฌ 4๊ฐœ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>์—์‹œ์•™ ์ ๋ณด์˜์ž ์ „์šฉ ํ”Œ๋ ˆ์ด ์•™์ฅฌ ๋ฆฌ๋“œ์ค„_(566781)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>ํ’€ ์Šคํ†ค ํ๋น… ์ผ€์ด์Šค ์•„์ดํฐ์‹œ๋ฆฌ์ฆˆ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[์‚์‚]์‚ฌ๊ฐ์†๊ฑฐ์šธ_(894685)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>์ ค๋ฆฌ_์—ด๋งค_์•„๋ชฌ๋“œ๋‚˜๋ฌด(๋ธ”๋ฃจ)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[๋…์„œ์‹ค์ฑ…์ƒ์šฉ] ํ„ฐ์น˜ LED์Šคํƒ ๋“œ</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค ์—„๋งˆ์•„๋น  (์šฉ๋ˆ๋ด‰ํˆฌ)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>ํˆฌ์นด๋…ธ Microfiber Second Skin 11์ธ์น˜ ์ดˆ๊ทน์„ธ์‚ฌ ํŒŒ์šฐ์น˜(BFMS-...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" itemname\n",
"0 ์Šคํƒฌํ•‘ ๋งคํŠธ\n",
"1 FRESH A simply mannish jacket_(1166989)\n",
"2 ์ด์–ดํฐ๋งˆ๊ฒŒ ๊ณ ๋ฌดํ•ธ๋“œํฐ๊ณ ๋ฆฌ 4๊ฐœ\n",
"3 ์—์‹œ์•™ ์ ๋ณด์˜์ž ์ „์šฉ ํ”Œ๋ ˆ์ด ์•™์ฅฌ ๋ฆฌ๋“œ์ค„_(566781)\n",
"4 ํ’€ ์Šคํ†ค ํ๋น… ์ผ€์ด์Šค ์•„์ดํฐ์‹œ๋ฆฌ์ฆˆ\n",
"5 [์‚์‚]์‚ฌ๊ฐ์†๊ฑฐ์šธ_(894685)\n",
"6 ์ ค๋ฆฌ_์—ด๋งค_์•„๋ชฌ๋“œ๋‚˜๋ฌด(๋ธ”๋ฃจ)\n",
"7 [๋…์„œ์‹ค์ฑ…์ƒ์šฉ] ํ„ฐ์น˜ LED์Šคํƒ ๋“œ\n",
"8 ๊ฐ์‚ฌํ•ฉ๋‹ˆ๋‹ค ์—„๋งˆ์•„๋น  (์šฉ๋ˆ๋ด‰ํˆฌ)\n",
"9 ํˆฌ์นด๋…ธ Microfiber Second Skin 11์ธ์น˜ ์ดˆ๊ทน์„ธ์‚ฌ ํŒŒ์šฐ์น˜(BFMS-..."
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "x5ddu70kJg4e",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"outputId": "16ae304f-871e-497d-8da7-1bdabc72f7b4"
},
"source": [
"import nltk\n",
"\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ayk-er1XIpBp",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"outputId": "714cd673-4400-4e3c-e67a-304446ab8df5"
},
"source": [
"import os\n",
"import numpy as np\n",
"import warnings\n",
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
"from gensim.models import Word2Vec\n",
"\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"\n",
"# ์ƒํ’ˆ๋ช… ์ •์ œ\n",
"org_corpus = df.iloc[:,0]\n",
"org_corpus = org_corpus.str.lower()\n",
"org_corpus = org_corpus.str.replace(r'[^a-z0-9ใ„ฑ-ใ…Ž๊ฐ€-ํžฃ\\s-]', ' ', regex=True)\n",
"org_corpus = org_corpus.str.replace(' - ', ' ', regex=False)\n",
"org_corpus = org_corpus.str.replace(' -', ' ', regex=False)\n",
"org_corpus = org_corpus.str.replace('- ', ' ', regex=False)\n",
"org_corpus = org_corpus.str.replace(r'[ ]+', ' ', regex=True)\n",
"org_corpus = org_corpus.str.strip()\n",
"\n",
"corpus = []\n",
"for line in org_corpus:\n",
" if type(line) is not float:\n",
" corpus.append(line)\n",
"print('corpus size : {}'.format(len(corpus)))\n",
"\n",
"result = []\n",
"result = [word_tokenize(sentence) for sentence in corpus]\n",
"print('data created.')\n",
"print(result[:3])\n",
"\n",
"print('start learning...')\n",
"model = Word2Vec(sentences=result, \n",
" size=50, \n",
" window=5, \n",
" min_count=5, \n",
" workers=4, \n",
" sg=1)\n",
"print('finished.')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"corpus size : 112011\n",
"data created.\n",
"[['์Šคํƒฌํ•‘', '๋งคํŠธ'], ['fresh', 'a', 'simply', 'mannish', 'jacket', '1166989'], ['์ด์–ดํฐ๋งˆ๊ฒŒ', '๊ณ ๋ฌดํ•ธ๋“œํฐ๊ณ ๋ฆฌ', '4๊ฐœ']]\n",
"start learning...\n",
"finished.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "xCM1Owsh3lhF",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "3056be22-593a-43fb-c063-dbf2dba8190c"
},
"source": [
"a = model.wv.most_similar(\"์•„์ดํฐ\")\n",
"print(a)"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"[('์•„์ดํฐ7', 0.95871901512146), ('์•„์ดํฐ8', 0.9415971636772156), ('7ํ”Œ๋Ÿฌ์Šค', 0.9414889216423035), ('8ํ”Œ๋Ÿฌ์Šค', 0.9387949705123901), ('์•„์ดํฐx', 0.9375072121620178), ('์•„์ดํฐ6', 0.9372150301933289), ('ํผ์ŠคํŠธํด๋ž˜์Šค', 0.9353760480880737), ('์•„์ดํฐ5', 0.9324446320533752), ('6ํ”Œ๋Ÿฌ์Šค', 0.9278486371040344), ('5s', 0.9267042875289917)]\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment