skyer9 · May 4, 2019 13:18
diff --git a/word_embedding.ipynb b/word_embedding.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "word_embedding.ipynb",
      "version": "0.3.2",
      "provenance": [],
      "collapsed_sections": [],
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/skyer9/ed004b397e1569715e26ac5a6a692259/word_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "hRjnz03pH1Ge",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 34
        },
        "outputId": "a91b147e-b359-4636-be55-962dc9a29071"
      },
      "source": [
        "from google.colab import drive\n",
        "drive.mount('/content/drive', force_remount=True)"
      ],
      "execution_count": 1,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Mounted at /content/drive\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "KiGi259EIKtW",
        "colab_type": "code",
        "colab": {}
      },
      "source": [
        "!cp \"/content/drive/My Drive/for_voca.csv\" \"for_voca.csv\""
      ],
      "execution_count": 0,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "dKbFBjBDIgR7",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 359
        },
        "outputId": "3dabda54-48fb-4934-a862-458199e5c124"
      },
      "source": [
        "import pandas as pd\n",
        "\n",
        "filepath = 'for_voca.csv'\n",
        "\n",
        "# header=0 together with names= replaces the file's own header row with 'itemname'.\n",
        "# dropna() returns a new frame, so its result has to be kept; calling it as a bare\n",
        "# statement leaves df unchanged.\n",
        "df = pd.read_csv(filepath, names=['itemname'], sep='\\t', header=0).dropna()\n",
        "\n",
        "df.head(n=10)"
      ],
      "execution_count": 3,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>itemname</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>스탬핑 매트</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>FRESH A simply mannish jacket_(1166989)</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>이어폰마게 고무핸드폰고리 4개</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>풀 스톤 큐빅 케이스 아이폰시리즈</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>5</th>\n",
              "      <td>[삐삐]사각손거울_(894685)</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>6</th>\n",
              "      <td>젤리_열매_아몬드나무(블루)</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>7</th>\n",
              "      <td>[독서실책상용] 터치 LED스탠드</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>8</th>\n",
              "      <td>감사합니다 엄마아빠 (용돈봉투)</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9</th>\n",
              "      <td>투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "                                            itemname\n",
              "0                                             스탬핑 매트\n",
              "1            FRESH A simply mannish jacket_(1166989)\n",
              "2                                   이어폰마게 고무핸드폰고리 4개\n",
              "3                    에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)\n",
              "4                                 풀 스톤 큐빅 케이스 아이폰시리즈\n",
              "5                                 [삐삐]사각손거울_(894685)\n",
              "6                                    젤리_열매_아몬드나무(블루)\n",
              "7                                 [독서실책상용] 터치 LED스탠드\n",
              "8                                  감사합니다 엄마아빠 (용돈봉투)\n",
              "9  투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-..."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 3
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "x5ddu70kJg4e",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        },
        "outputId": "16ae304f-871e-497d-8da7-1bdabc72f7b4"
      },
      "source": [
        "import nltk\n",
        "\n",
        "nltk.download('stopwords')\n",
        "nltk.download('punkt')"
      ],
      "execution_count": 4,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
            "[nltk_data]   Package stopwords is already up-to-date!\n",
            "[nltk_data] Downloading package punkt to /root/nltk_data...\n",
            "[nltk_data]   Package punkt is already up-to-date!\n"
          ],
          "name": "stdout"
        },
        {
          "output_type": "execute_result",
          "data": {
            "text/plain": [
              "True"
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 4
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "Ayk-er1XIpBp",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 102
        },
        "outputId": "714cd673-4400-4e3c-e67a-304446ab8df5"
      },
      "source": [
        "import os\n",
        "import numpy as np\n",
        "import warnings\n",
        "from nltk.tokenize import word_tokenize, sent_tokenize\n",
        "from gensim.models import Word2Vec\n",
        "\n",
        "# Silence FutureWarning messages for the rest of the session.\n",
        "warnings.simplefilter(action='ignore', category=FutureWarning)\n",
        "\n",
        "# Clean up the product names: lowercase, keep only ASCII letters, digits,\n",
        "# Hangul, whitespace and hyphens, then drop hyphens that touch a space.\n",
        "org_corpus = df.iloc[:, 0].str.lower()\n",
        "org_corpus = org_corpus.str.replace(r'[^a-z0-9ㄱ-ㅎ가-힣\\s-]', ' ', regex=True)\n",
        "for dangling_hyphen in (' - ', ' -', '- '):\n",
        "    org_corpus = org_corpus.str.replace(dangling_hyphen, ' ', regex=False)\n",
        "org_corpus = org_corpus.str.replace(r'[ ]+', ' ', regex=True).str.strip()\n",
        "\n",
        "# Missing item names pass through the .str accessors as NaN; drop them\n",
        "# so that only real strings reach the tokenizer.\n",
        "corpus = org_corpus.dropna().tolist()\n",
        "print('corpus size : {}'.format(len(corpus)))\n",
        "\n",
        "# One token list per product name; these are the training sentences.\n",
        "result = [word_tokenize(sentence) for sentence in corpus]\n",
        "print('data created.')\n",
        "print(result[:3])\n",
        "\n",
        "# Skip-gram (sg=1) Word2Vec with 50-dimensional vectors.\n",
        "# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` - confirm the installed version.\n",
        "print('start learning...')\n",
        "model = Word2Vec(sentences=result,\n",
        "                 size=50,\n",
        "                 window=5,\n",
        "                 min_count=5,\n",
        "                 workers=4,\n",
        "                 sg=1)\n",
        "print('finished.')"
      ],
      "execution_count": 5,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "corpus size : 112011\n",
            "data created.\n",
            "[['스탬핑', '매트'], ['fresh', 'a', 'simply', 'mannish', 'jacket', '1166989'], ['이어폰마게', '고무핸드폰고리', '4개']]\n",
            "start learning...\n",
            "finished.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "xCM1Owsh3lhF",
        "colab_type": "code",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 54
        },
        "outputId": "3056be22-593a-43fb-c063-dbf2dba8190c"
      },
      "source": [
        "a = model.wv.most_similar(\"아이폰\")\n",
        "print(a)"
      ],
      "execution_count": 6,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "[('아이폰7', 0.95871901512146), ('아이폰8', 0.9415971636772156), ('7플러스', 0.9414889216423035), ('8플러스', 0.9387949705123901), ('아이폰x', 0.9375072121620178), ('아이폰6', 0.9372150301933289), ('퍼스트클래스', 0.9353760480880737), ('아이폰5', 0.9324446320533752), ('6플러스', 0.9278486371040344), ('5s', 0.9267042875289917)]\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "word_embedding.ipynb",
	"version": "0.3.2",
	"provenance": [],
	"collapsed_sections": [],
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/skyer9/ed004b397e1569715e26ac5a6a692259/word_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "hRjnz03pH1Ge",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 34
	},
	"outputId": "a91b147e-b359-4636-be55-962dc9a29071"
	},
	"source": [
	"from google.colab import drive\n",
	"drive.mount('/content/drive', force_remount=True)"
	],
	"execution_count": 1,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Mounted at /content/drive\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "KiGi259EIKtW",
	"colab_type": "code",
	"colab": {}
	},
	"source": [
	"!cp \"/content/drive/My Drive/for_voca.csv\" \"for_voca.csv\""
	],
	"execution_count": 0,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "dKbFBjBDIgR7",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 359
	},
	"outputId": "3dabda54-48fb-4934-a862-458199e5c124"
	},
	"source": [
	"import pandas as pd\n",
	"\n",
	"filepath = 'for_voca.csv'\n",
	"\n",
	"# header=0 together with names= replaces the file's own header row with 'itemname'.\n",
	"# dropna() returns a new frame, so its result has to be kept; calling it as a bare\n",
	"# statement leaves df unchanged.\n",
	"df = pd.read_csv(filepath, names=['itemname'], sep='\\t', header=0).dropna()\n",
	"\n",
	"df.head(n=10)"
	],
	"execution_count": 3,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>itemname</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>스탬핑 매트</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>FRESH A simply mannish jacket_(1166989)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>이어폰마게 고무핸드폰고리 4개</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>풀 스톤 큐빅 케이스 아이폰시리즈</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>5</th>\n",
	" <td>[삐삐]사각손거울_(894685)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>6</th>\n",
	" <td>젤리_열매_아몬드나무(블루)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>7</th>\n",
	" <td>[독서실책상용] 터치 LED스탠드</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>8</th>\n",
	" <td>감사합니다 엄마아빠 (용돈봉투)</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9</th>\n",
	" <td>투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" itemname\n",
	"0 스탬핑 매트\n",
	"1 FRESH A simply mannish jacket_(1166989)\n",
	"2 이어폰마게 고무핸드폰고리 4개\n",
	"3 에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)\n",
	"4 풀 스톤 큐빅 케이스 아이폰시리즈\n",
	"5 [삐삐]사각손거울_(894685)\n",
	"6 젤리_열매_아몬드나무(블루)\n",
	"7 [독서실책상용] 터치 LED스탠드\n",
	"8 감사합니다 엄마아빠 (용돈봉투)\n",
	"9 투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-..."
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 3
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "x5ddu70kJg4e",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 102
	},
	"outputId": "16ae304f-871e-497d-8da7-1bdabc72f7b4"
	},
	"source": [
	"import nltk\n",
	"\n",
	"nltk.download('stopwords')\n",
	"nltk.download('punkt')"
	],
	"execution_count": 4,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
	"[nltk_data] Package stopwords is already up-to-date!\n",
	"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
	"[nltk_data] Package punkt is already up-to-date!\n"
	],
	"name": "stdout"
	},
	{
	"output_type": "execute_result",
	"data": {
	"text/plain": [
	"True"
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 4
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "Ayk-er1XIpBp",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 102
	},
	"outputId": "714cd673-4400-4e3c-e67a-304446ab8df5"
	},
	"source": [
	"import os\n",
	"import numpy as np\n",
	"import warnings\n",
	"from nltk.tokenize import word_tokenize, sent_tokenize\n",
	"from gensim.models import Word2Vec\n",
	"\n",
	"# Silence FutureWarning messages for the rest of the session.\n",
	"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
	"\n",
	"# Clean up the product names: lowercase, keep only ASCII letters, digits,\n",
	"# Hangul, whitespace and hyphens, then drop hyphens that touch a space.\n",
	"org_corpus = df.iloc[:, 0].str.lower()\n",
	"org_corpus = org_corpus.str.replace(r'[^a-z0-9ㄱ-ㅎ가-힣\\s-]', ' ', regex=True)\n",
	"for dangling_hyphen in (' - ', ' -', '- '):\n",
	"    org_corpus = org_corpus.str.replace(dangling_hyphen, ' ', regex=False)\n",
	"org_corpus = org_corpus.str.replace(r'[ ]+', ' ', regex=True).str.strip()\n",
	"\n",
	"# Missing item names pass through the .str accessors as NaN; drop them\n",
	"# so that only real strings reach the tokenizer.\n",
	"corpus = org_corpus.dropna().tolist()\n",
	"print('corpus size : {}'.format(len(corpus)))\n",
	"\n",
	"# One token list per product name; these are the training sentences.\n",
	"result = [word_tokenize(sentence) for sentence in corpus]\n",
	"print('data created.')\n",
	"print(result[:3])\n",
	"\n",
	"# Skip-gram (sg=1) Word2Vec with 50-dimensional vectors.\n",
	"# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size` - confirm the installed version.\n",
	"print('start learning...')\n",
	"model = Word2Vec(sentences=result,\n",
	"                 size=50,\n",
	"                 window=5,\n",
	"                 min_count=5,\n",
	"                 workers=4,\n",
	"                 sg=1)\n",
	"print('finished.')"
	],
	"execution_count": 5,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"corpus size : 112011\n",
	"data created.\n",
	"[['스탬핑', '매트'], ['fresh', 'a', 'simply', 'mannish', 'jacket', '1166989'], ['이어폰마게', '고무핸드폰고리', '4개']]\n",
	"start learning...\n",
	"finished.\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "xCM1Owsh3lhF",
	"colab_type": "code",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 54
	},
	"outputId": "3056be22-593a-43fb-c063-dbf2dba8190c"
	},
	"source": [
	"a = model.wv.most_similar(\"아이폰\")\n",
	"print(a)"
	],
	"execution_count": 6,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"[('아이폰7', 0.95871901512146), ('아이폰8', 0.9415971636772156), ('7플러스', 0.9414889216423035), ('8플러스', 0.9387949705123901), ('아이폰x', 0.9375072121620178), ('아이폰6', 0.9372150301933289), ('퍼스트클래스', 0.9353760480880737), ('아이폰5', 0.9324446320533752), ('6플러스', 0.9278486371040344), ('5s', 0.9267042875289917)]\n"
	],
	"name": "stdout"
	}
	]
	}
	]
	}
No results found