Created
May 4, 2019 13:18
-
-
Save skyer9/ed004b397e1569715e26ac5a6a692259 to your computer and use it in GitHub Desktop.
word_embedding.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "word_embedding.ipynb", | |
| "version": "0.3.2", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/skyer9/ed004b397e1569715e26ac5a6a692259/word_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "hRjnz03pH1Ge", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "a91b147e-b359-4636-be55-962dc9a29071" | |
| }, | |
| "source": [ | |
| "from google.colab import drive\n", | |
| "drive.mount('/content/drive', force_remount=True)" | |
| ], | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Mounted at /content/drive\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "KiGi259EIKtW", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "!cp \"/content/drive/My Drive/for_voca.csv\" \"for_voca.csv\"" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "dKbFBjBDIgR7", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 359 | |
| }, | |
| "outputId": "3dabda54-48fb-4934-a862-458199e5c124" | |
| }, | |
| "source": [ | |
| "import pandas as pd\n", | |
| "\n", | |
| "filepath = 'for_voca.csv'\n", | |
| "\n", | |
| "df = pd.read_csv(filepath, names=['itemname'], sep='\\t', header=0)\n", | |
| "df.dropna()\n", | |
| "\n", | |
| "df.head(n=10)" | |
| ], | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>itemname</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>์คํฌํ ๋งคํธ</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>FRESH A simply mannish jacket_(1166989)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>์ด์ดํฐ๋ง๊ฒ ๊ณ ๋ฌดํธ๋ํฐ๊ณ ๋ฆฌ 4๊ฐ</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>์์์ ์ ๋ณด์์ ์ ์ฉ ํ๋ ์ด ์์ฅฌ ๋ฆฌ๋์ค_(566781)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>ํ ์คํค ํ๋น ์ผ์ด์ค ์์ดํฐ์๋ฆฌ์ฆ</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>[์์]์ฌ๊ฐ์๊ฑฐ์ธ_(894685)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>์ ค๋ฆฌ_์ด๋งค_์๋ชฌ๋๋๋ฌด(๋ธ๋ฃจ)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>[๋ ์์ค์ฑ ์์ฉ] ํฐ์น LED์คํ ๋</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>๊ฐ์ฌํฉ๋๋ค ์๋ง์๋น (์ฉ๋๋ดํฌ)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>ํฌ์นด๋ ธ Microfiber Second Skin 11์ธ์น ์ด๊ทน์ธ์ฌ ํ์ฐ์น(BFMS-...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " itemname\n", | |
| "0 ์คํฌํ ๋งคํธ\n", | |
| "1 FRESH A simply mannish jacket_(1166989)\n", | |
| "2 ์ด์ดํฐ๋ง๊ฒ ๊ณ ๋ฌดํธ๋ํฐ๊ณ ๋ฆฌ 4๊ฐ\n", | |
| "3 ์์์ ์ ๋ณด์์ ์ ์ฉ ํ๋ ์ด ์์ฅฌ ๋ฆฌ๋์ค_(566781)\n", | |
| "4 ํ ์คํค ํ๋น ์ผ์ด์ค ์์ดํฐ์๋ฆฌ์ฆ\n", | |
| "5 [์์]์ฌ๊ฐ์๊ฑฐ์ธ_(894685)\n", | |
| "6 ์ ค๋ฆฌ_์ด๋งค_์๋ชฌ๋๋๋ฌด(๋ธ๋ฃจ)\n", | |
| "7 [๋ ์์ค์ฑ ์์ฉ] ํฐ์น LED์คํ ๋\n", | |
| "8 ๊ฐ์ฌํฉ๋๋ค ์๋ง์๋น (์ฉ๋๋ดํฌ)\n", | |
| "9 ํฌ์นด๋ ธ Microfiber Second Skin 11์ธ์น ์ด๊ทน์ธ์ฌ ํ์ฐ์น(BFMS-..." | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 3 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "x5ddu70kJg4e", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 102 | |
| }, | |
| "outputId": "16ae304f-871e-497d-8da7-1bdabc72f7b4" | |
| }, | |
| "source": [ | |
| "import nltk\n", | |
| "\n", | |
| "nltk.download('stopwords')\n", | |
| "nltk.download('punkt')" | |
| ], | |
| "execution_count": 4, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", | |
| "[nltk_data] Package stopwords is already up-to-date!\n", | |
| "[nltk_data] Downloading package punkt to /root/nltk_data...\n", | |
| "[nltk_data] Package punkt is already up-to-date!\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 4 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Ayk-er1XIpBp", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 102 | |
| }, | |
| "outputId": "714cd673-4400-4e3c-e67a-304446ab8df5" | |
| }, | |
| "source": [ | |
| "import os\n", | |
| "import numpy as np\n", | |
| "import warnings\n", | |
| "from nltk.tokenize import word_tokenize, sent_tokenize\n", | |
| "from gensim.models import Word2Vec\n", | |
| "\n", | |
| "warnings.simplefilter(action='ignore', category=FutureWarning)\n", | |
| "\n", | |
| "# ์ํ๋ช ์ ์ \n", | |
| "org_corpus = df.iloc[:,0]\n", | |
| "org_corpus = org_corpus.str.lower()\n", | |
| "org_corpus = org_corpus.str.replace(r'[^a-z0-9ใฑ-ใ ๊ฐ-ํฃ\\s-]', ' ', regex=True)\n", | |
| "org_corpus = org_corpus.str.replace(' - ', ' ', regex=False)\n", | |
| "org_corpus = org_corpus.str.replace(' -', ' ', regex=False)\n", | |
| "org_corpus = org_corpus.str.replace('- ', ' ', regex=False)\n", | |
| "org_corpus = org_corpus.str.replace(r'[ ]+', ' ', regex=True)\n", | |
| "org_corpus = org_corpus.str.strip()\n", | |
| "\n", | |
| "corpus = []\n", | |
| "for line in org_corpus:\n", | |
| " if type(line) is not float:\n", | |
| " corpus.append(line)\n", | |
| "print('corpus size : {}'.format(len(corpus)))\n", | |
| "\n", | |
| "result = []\n", | |
| "result = [word_tokenize(sentence) for sentence in corpus]\n", | |
| "print('data created.')\n", | |
| "print(result[:3])\n", | |
| "\n", | |
| "print('start learning...')\n", | |
| "model = Word2Vec(sentences=result, \n", | |
| " size=50, \n", | |
| " window=5, \n", | |
| " min_count=5, \n", | |
| " workers=4, \n", | |
| " sg=1)\n", | |
| "print('finished.')" | |
| ], | |
| "execution_count": 5, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "corpus size : 112011\n", | |
| "data created.\n", | |
| "[['์คํฌํ', '๋งคํธ'], ['fresh', 'a', 'simply', 'mannish', 'jacket', '1166989'], ['์ด์ดํฐ๋ง๊ฒ', '๊ณ ๋ฌดํธ๋ํฐ๊ณ ๋ฆฌ', '4๊ฐ']]\n", | |
| "start learning...\n", | |
| "finished.\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "xCM1Owsh3lhF", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 54 | |
| }, | |
| "outputId": "3056be22-593a-43fb-c063-dbf2dba8190c" | |
| }, | |
| "source": [ | |
| "a = model.wv.most_similar(\"์์ดํฐ\")\n", | |
| "print(a)" | |
| ], | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "[('์์ดํฐ7', 0.95871901512146), ('์์ดํฐ8', 0.9415971636772156), ('7ํ๋ฌ์ค', 0.9414889216423035), ('8ํ๋ฌ์ค', 0.9387949705123901), ('์์ดํฐx', 0.9375072121620178), ('์์ดํฐ6', 0.9372150301933289), ('ํผ์คํธํด๋์ค', 0.9353760480880737), ('์์ดํฐ5', 0.9324446320533752), ('6ํ๋ฌ์ค', 0.9278486371040344), ('5s', 0.9267042875289917)]\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment