Skip to content

Instantly share code, notes, and snippets.

@skyer9
Created May 4, 2019 13:18
Show Gist options
  • Select an option

  • Save skyer9/ed004b397e1569715e26ac5a6a692259 to your computer and use it in GitHub Desktop.

Select an option

Save skyer9/ed004b397e1569715e26ac5a6a692259 to your computer and use it in GitHub Desktop.
word_embedding.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "word_embedding.ipynb",
"version": "0.3.2",
"provenance": [],
"collapsed_sections": [],
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/skyer9/ed004b397e1569715e26ac5a6a692259/word_embedding.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"metadata": {
"id": "hRjnz03pH1Ge",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 34
},
"outputId": "a91b147e-b359-4636-be55-962dc9a29071"
},
"source": [
"# Mount Google Drive inside the Colab runtime so files stored there can be read.\n",
"from google.colab import drive\n",
"\n",
"mount_point = '/content/drive'\n",
"drive.mount(mount_point, force_remount=True)"
],
"execution_count": 1,
"outputs": [
{
"output_type": "stream",
"text": [
"Mounted at /content/drive\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "KiGi259EIKtW",
"colab_type": "code",
"colab": {}
},
"source": [
"# Copy the source CSV from the mounted Drive into the current working directory.\n",
"!cp \"/content/drive/My Drive/for_voca.csv\" \"for_voca.csv\""
],
"execution_count": 0,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "dKbFBjBDIgR7",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 359
},
"outputId": "3dabda54-48fb-4934-a862-458199e5c124"
},
"source": [
"import pandas as pd\n",
"\n",
"filepath = 'for_voca.csv'\n",
"\n",
"# Tab-separated file read into a single 'itemname' column; header=0 together with\n",
"# `names` replaces the file's own header row with that column name.\n",
"df = pd.read_csv(filepath, names=['itemname'], sep='\\t', header=0)\n",
"# dropna() returns a new frame rather than modifying in place, so the result\n",
"# must be assigned for the missing rows to actually be removed.\n",
"df = df.dropna()\n",
"\n",
"df.head(n=10)"
],
"execution_count": 3,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>itemname</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>스탬핑 매트</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>FRESH A simply mannish jacket_(1166989)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>이어폰마게 고무핸드폰고리 4개</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>풀 스톤 큐빅 케이스 아이폰시리즈</td>\n",
" </tr>\n",
" <tr>\n",
" <th>5</th>\n",
" <td>[삐삐]사각손거울_(894685)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>6</th>\n",
" <td>젤리_열매_아몬드나무(블루)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>7</th>\n",
" <td>[독서실책상용] 터치 LED스탠드</td>\n",
" </tr>\n",
" <tr>\n",
" <th>8</th>\n",
" <td>감사합니다 엄마아빠 (용돈봉투)</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9</th>\n",
" <td>투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" itemname\n",
"0 스탬핑 매트\n",
"1 FRESH A simply mannish jacket_(1166989)\n",
"2 이어폰마게 고무핸드폰고리 4개\n",
"3 에시앙 점보의자 전용 플레이 앙쥬 리드줄_(566781)\n",
"4 풀 스톤 큐빅 케이스 아이폰시리즈\n",
"5 [삐삐]사각손거울_(894685)\n",
"6 젤리_열매_아몬드나무(블루)\n",
"7 [독서실책상용] 터치 LED스탠드\n",
"8 감사합니다 엄마아빠 (용돈봉투)\n",
"9 투카노 Microfiber Second Skin 11인치 초극세사 파우치(BFMS-..."
]
},
"metadata": {
"tags": []
},
"execution_count": 3
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "x5ddu70kJg4e",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"outputId": "16ae304f-871e-497d-8da7-1bdabc72f7b4"
},
"source": [
"import nltk\n",
"\n",
"# Fetch the NLTK data packages; each call prints its status, and the value of the\n",
"# last call is shown as the cell result.\n",
"nltk.download('stopwords')\n",
"nltk.download('punkt')"
],
"execution_count": 4,
"outputs": [
{
"output_type": "stream",
"text": [
"[nltk_data] Downloading package stopwords to /root/nltk_data...\n",
"[nltk_data] Package stopwords is already up-to-date!\n",
"[nltk_data] Downloading package punkt to /root/nltk_data...\n",
"[nltk_data] Package punkt is already up-to-date!\n"
],
"name": "stdout"
},
{
"output_type": "execute_result",
"data": {
"text/plain": [
"True"
]
},
"metadata": {
"tags": []
},
"execution_count": 4
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "Ayk-er1XIpBp",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 102
},
"outputId": "714cd673-4400-4e3c-e67a-304446ab8df5"
},
"source": [
"import os\n",
"import numpy as np\n",
"import warnings\n",
"from nltk.tokenize import word_tokenize, sent_tokenize\n",
"from gensim.models import Word2Vec\n",
"\n",
"warnings.simplefilter(action='ignore', category=FutureWarning)\n",
"\n",
"# Clean the product names: lowercase, replace every character that is not a latin\n",
"# letter, digit, Hangul (ㄱ-ㅎ, 가-힣), whitespace or hyphen with a space, replace\n",
"# hyphens that touch a space with a space, collapse runs of spaces, and trim.\n",
"org_corpus = (\n",
"    df.iloc[:, 0]\n",
"    .str.lower()\n",
"    .str.replace(r'[^a-z0-9ㄱ-ㅎ가-힣\\s-]', ' ', regex=True)\n",
"    .str.replace(' - ', ' ', regex=False)\n",
"    .str.replace(' -', ' ', regex=False)\n",
"    .str.replace('- ', ' ', regex=False)\n",
"    .str.replace(r'[ ]+', ' ', regex=True)\n",
"    .str.strip()\n",
")\n",
"\n",
"# Missing names pass through the .str pipeline as NaN; drop them explicitly\n",
"# instead of filtering on the Python type of each element.\n",
"corpus = org_corpus.dropna().tolist()\n",
"print('corpus size : {}'.format(len(corpus)))\n",
"\n",
"# One token list per product name.\n",
"result = [word_tokenize(sentence) for sentence in corpus]\n",
"print('data created.')\n",
"print(result[:3])\n",
"\n",
"# Train 50-dimensional skip-gram (sg=1) embeddings; words seen fewer than\n",
"# 5 times (min_count) are ignored.\n",
"# NOTE(review): gensim >= 4.0 renamed `size` to `vector_size`; keep `size` only\n",
"# while the runtime provides gensim 3.x - confirm the installed version.\n",
"print('start learning...')\n",
"model = Word2Vec(sentences=result,\n",
"                 size=50,\n",
"                 window=5,\n",
"                 min_count=5,\n",
"                 workers=4,\n",
"                 sg=1)\n",
"print('finished.')"
],
"execution_count": 5,
"outputs": [
{
"output_type": "stream",
"text": [
"corpus size : 112011\n",
"data created.\n",
"[['스탬핑', '매트'], ['fresh', 'a', 'simply', 'mannish', 'jacket', '1166989'], ['이어폰마게', '고무핸드폰고리', '4개']]\n",
"start learning...\n",
"finished.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "xCM1Owsh3lhF",
"colab_type": "code",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 54
},
"outputId": "3056be22-593a-43fb-c063-dbf2dba8190c"
},
"source": [
"# Look up the words closest to the query word in the trained embedding space.\n",
"query = \"아이폰\"\n",
"similar_words = model.wv.most_similar(query)\n",
"print(similar_words)"
],
"execution_count": 6,
"outputs": [
{
"output_type": "stream",
"text": [
"[('아이폰7', 0.95871901512146), ('아이폰8', 0.9415971636772156), ('7플러스', 0.9414889216423035), ('8플러스', 0.9387949705123901), ('아이폰x', 0.9375072121620178), ('아이폰6', 0.9372150301933289), ('퍼스트클래스', 0.9353760480880737), ('아이폰5', 0.9324446320533752), ('6플러스', 0.9278486371040344), ('5s', 0.9267042875289917)]\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment