Created
May 4, 2019 13:18
-
-
Save skyer9/ed004b397e1569715e26ac5a6a692259 to your computer and use it in GitHub Desktop.
word_embedding.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "word_embedding.ipynb", | |
| "version": "0.3.2", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/skyer9/ed004b397e1569715e26ac5a6a692259/word_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "hRjnz03pH1Ge", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 34 | |
| }, | |
| "outputId": "a91b147e-b359-4636-be55-962dc9a29071" | |
| }, | |
| "source": [ | |
| "from google.colab import drive\n", | |
| "drive.mount('/content/drive', force_remount=True)" | |
| ], | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Mounted at /content/drive\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "KiGi259EIKtW", | |
| "colab_type": "code", | |
| "colab": {} | |
| }, | |
| "source": [ | |
| "# Copy the vocabulary CSV from the mounted Drive folder into the current working directory.\n", | |
| "!cp \"/content/drive/My Drive/for_voca.csv\" \"for_voca.csv\"" | |
| ], | |
| "execution_count": 0, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "dKbFBjBDIgR7", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 359 | |
| }, | |
| "outputId": "3dabda54-48fb-4934-a862-458199e5c124" | |
| }, | |
| "source": [ | |
| "import pandas as pd\n", | |
| "\n", | |
| "filepath = 'for_voca.csv'\n", | |
| "\n", | |
| "df = pd.read_csv(filepath, names=['itemname'], sep='\\t', header=0)\n", | |
| "df.dropna()\n", | |
| "\n", | |
| "df.head(n=10)" | |
| ], | |
| "execution_count": 3, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>itemname</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>스탬핑 매트</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>FRESH A simply mannish jacket_(1166989)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>이어폰마게 고무핸드폰고리 4개</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>풀 스톤 큐빅 케이스 아이폰시리즈</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>[삐삐]사각손거울_(894685)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>젤리_열매_아몬드나무(블루)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>[독서실책상용] 터치 LED스탠드</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>감사합니다 엄마아빠 (용돈봉투)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " itemname\n", | |
| "0 스탬핑 매트\n", | |
| "1 FRESH A simply mannish jacket_(1166989)\n", | |
| "2 이어폰마게 고무핸드폰고리 4개\n", | |
| "3 에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)\n", | |
| "4 풀 스톤 큐빅 케이스 아이폰시리즈\n", | |
| "5 [삐삐]사각손거울_(894685)\n", | |
| "6 젤리_열매_아몬드나무(블루)\n", | |
| "7 [독서실책상용] 터치 LED스탠드\n", | |
| "8 감사합니다 엄마아빠 (용돈봉투)\n", | |
| "9 투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-..." | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 3 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "x5ddu70kJg4e", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 102 | |
| }, | |
| "outputId": "16ae304f-871e-497d-8da7-1bdabc72f7b4" | |
| }, | |
| "source": [ | |
| "import nltk\n", | |
| "\n", | |
| "nltk.download('stopwords')\n", | |
| "nltk.download('punkt')" | |
| ], | |
| "execution_count": 4, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "[nltk_data] Downloading package stopwords to /root/nltk_data...\n", | |
| "[nltk_data] Package stopwords is already up-to-date!\n", | |
| "[nltk_data] Downloading package punkt to /root/nltk_data...\n", | |
| "[nltk_data] Package punkt is already up-to-date!\n" | |
| ], | |
| "name": "stdout" | |
| }, | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 4 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "Ayk-er1XIpBp", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 102 | |
| }, | |
| "outputId": "714cd673-4400-4e3c-e67a-304446ab8df5" | |
| }, | |
| "source": [ | |
| "import os\n", | |
| "import numpy as np\n", | |
| "import warnings\n", | |
| "from nltk.tokenize import word_tokenize, sent_tokenize\n", | |
| "from gensim.models import Word2Vec\n", | |
| "\n", | |
| "warnings.simplefilter(action='ignore', category=FutureWarning)\n", | |
| "\n", | |
| "# 상품명 정제\n", | |
| "org_corpus = df.iloc[:,0]\n", | |
| "org_corpus = org_corpus.str.lower()\n", | |
| "org_corpus = org_corpus.str.replace(r'[^a-z0-9ㄱ-ㅎ가-힣\\s-]', ' ', regex=True)\n", | |
| "org_corpus = org_corpus.str.replace(' - ', ' ', regex=False)\n", | |
| "org_corpus = org_corpus.str.replace(' -', ' ', regex=False)\n", | |
| "org_corpus = org_corpus.str.replace('- ', ' ', regex=False)\n", | |
| "org_corpus = org_corpus.str.replace(r'[ ]+', ' ', regex=True)\n", | |
| "org_corpus = org_corpus.str.strip()\n", | |
| "\n", | |
| "corpus = []\n", | |
| "for line in org_corpus:\n", | |
| " if type(line) is not float:\n", | |
| " corpus.append(line)\n", | |
| "print('corpus size : {}'.format(len(corpus)))\n", | |
| "\n", | |
| "result = []\n", | |
| "result = [word_tokenize(sentence) for sentence in corpus]\n", | |
| "print('data created.')\n", | |
| "print(result[:3])\n", | |
| "\n", | |
| "print('start learning...')\n", | |
| "model = Word2Vec(sentences=result, \n", | |
| " size=50, \n", | |
| " window=5, \n", | |
| " min_count=5, \n", | |
| " workers=4, \n", | |
| " sg=1)\n", | |
| "print('finished.')" | |
| ], | |
| "execution_count": 5, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "corpus size : 112011\n", | |
| "data created.\n", | |
| "[['스탬핑', '매트'], ['fresh', 'a', 'simply', 'mannish', 'jacket', '1166989'], ['이어폰마게', '고무핸드폰고리', '4개']]\n", | |
| "start learning...\n", | |
| "finished.\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "xCM1Owsh3lhF", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 54 | |
| }, | |
| "outputId": "3056be22-593a-43fb-c063-dbf2dba8190c" | |
| }, | |
| "source": [ | |
| "a = model.wv.most_similar(\"아이폰\")\n", | |
| "print(a)" | |
| ], | |
| "execution_count": 6, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "[('아이폰7', 0.95871901512146), ('아이폰8', 0.9415971636772156), ('7플러스', 0.9414889216423035), ('8플러스', 0.9387949705123901), ('아이폰x', 0.9375072121620178), ('아이폰6', 0.9372150301933289), ('퍼스트클래스', 0.9353760480880737), ('아이폰5', 0.9324446320533752), ('6플러스', 0.9278486371040344), ('5s', 0.9267042875289917)]\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment