Last active
October 18, 2020 10:20
-
-
Save r-sajal/634b00d2ed686a95bed22d8ce3fb1317 to your computer and use it in GitHub Desktop.
bert-hugging-github.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "bert-hugging-github.ipynb", | |
| "provenance": [], | |
| "collapsed_sections": [], | |
| "authorship_tag": "ABX9TyMtP1bCQSz3OS71NgQ+ei/D", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/r-sajal/634b00d2ed686a95bed22d8ce3fb1317/bert-hugging-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "LA0hl35VADdQ" | |
| }, | |
| "source": [ | |
| "# **Model 1: Hugging Face**" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "MRCO2oXR_6zM" | |
| }, | |
| "source": [ | |
| "# importing general libraries\n", | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "import sklearn\n", | |
| "import warnings, gc\n", | |
| "warnings.filterwarnings(\"ignore\")\n", | |
| "\n", | |
| "# Tensorflow\n", | |
| "import tensorflow as tf\n", | |
| "\n", | |
| "from sklearn.preprocessing import LabelEncoder\n", | |
| "from sklearn.model_selection import train_test_split" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "cNEfWfRzAW58" | |
| }, | |
| "source": [ | |
| "# importing data\n", | |
| "train_sm_df = pd.read_json(\"/content/drive/My Drive/train_extra.json\")\n", | |
| "test_df=pd.read_json(\"/content/drive/My Drive/embold_test.json\")" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "1Xk6CZIal_7-", | |
| "outputId": "01ec22d3-71e3-4092-81a4-9616da448627", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 195 | |
| } | |
| }, | |
| "source": [ | |
| "# Modifying data according to hugging face input\n", | |
| "#Input Format -- id label alpha(throw-away) text\n", | |
| "# remove newline characters to make more robust\n", | |
| "\n", | |
| "df_bert = pd.DataFrame({\n", | |
| " 'id':range(len(train_sm_df)),\n", | |
| " 'label':train_sm_df['label'],\n", | |
| " 'alpha':['a']*train_sm_df.shape[0],\n", | |
| " 'text': train_sm_df['text'].replace(r'\\n', ' ', regex=True)\n", | |
| "})\n", | |
| "\n", | |
| "df_bert.columns = range(df_bert.shape[1])\n", | |
| "\n", | |
| "# Splitting training data file into *train* and *dev*\n", | |
| "df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.3)\n", | |
| "\n", | |
| "df_bert_train.head()" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>0</th>\n", | |
| " <th>1</th>\n", | |
| " <th>2</th>\n", | |
| " <th>3</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>62292</th>\n", | |
| " <td>62292</td>\n", | |
| " <td>0</td>\n", | |
| " <td>a</td>\n", | |
| " <td>abort on file changed warning - nvim --versio...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>54956</th>\n", | |
| " <td>54956</td>\n", | |
| " <td>1</td>\n", | |
| " <td>a</td>\n", | |
| " <td>add test directory for vscraper idea\\r \\r add ...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>68768</th>\n", | |
| " <td>68768</td>\n", | |
| " <td>0</td>\n", | |
| " <td>a</td>\n", | |
| " <td>torch.cuda.current_device is always 0 at bac...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9172</th>\n", | |
| " <td>9172</td>\n", | |
| " <td>0</td>\n", | |
| " <td>a</td>\n", | |
| " <td>escape orm for sql injections need to rewrite ...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>35190</th>\n", | |
| " <td>35190</td>\n", | |
| " <td>0</td>\n", | |
| " <td>a</td>\n", | |
| " <td>routeutils getdistancetostep not measuring cor...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " 0 1 2 3\n", | |
| "62292 62292 0 a abort on file changed warning - nvim --versio...\n", | |
| "54956 54956 1 a add test directory for vscraper idea\\r \\r add ...\n", | |
| "68768 68768 0 a torch.cuda.current_device is always 0 at bac...\n", | |
| "9172 9172 0 a escape orm for sql injections need to rewrite ...\n", | |
| "35190 35190 0 a routeutils getdistancetostep not measuring cor..." | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 8 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "q-5WINRgnFmX", | |
| "outputId": "42e18873-ce58-4246-b0f0-acc5439cf445", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 195 | |
| } | |
| }, | |
| "source": [ | |
| "# Modifying Test Data according to format above\n", | |
| "\n", | |
| "df_bert_test = pd.DataFrame({\n", | |
| " 'id':range(len(test_df)),\n", | |
| " 'text': test_df['text'].replace(r'\\n', ' ', regex=True)\n", | |
| "})\n", | |
| "df_bert_test.columns = range(df_bert_test.shape[1])\n", | |
| "df_bert_test.head()" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>0</th>\n", | |
| " <th>1</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0</td>\n", | |
| " <td>config question path-specific environment var...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>crash indien vol de simulator crasht als hij v...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2</td>\n", | |
| " <td>unable to mine rocks sarkasmo starting today, ...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>3</td>\n", | |
| " <td>not all whitelists are processed create follow...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>4</td>\n", | |
| " <td>add ctx menu for idafree 70 and idafree 5 asso...</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " 0 1\n", | |
| "0 0 config question path-specific environment var...\n", | |
| "1 1 crash indien vol de simulator crasht als hij v...\n", | |
| "2 2 unable to mine rocks sarkasmo starting today, ...\n", | |
| "3 3 not all whitelists are processed create follow...\n", | |
| "4 4 add ctx menu for idafree 70 and idafree 5 asso..." | |
| ] | |
| }, | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "execution_count": 9 | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "rQZCaCtrnpvd" | |
| }, | |
| "source": [ | |
| "# making directory to store input/output data\n", | |
| "mkdir data\n", | |
| "mkdir bert_output" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "ulpVWTJ7ncgN" | |
| }, | |
| "source": [ | |
| "# Saving dataframes to .tsv format as required by BERT\n", | |
| "df_bert_train.to_csv('data/train.tsv', sep='\\t', index=False, header=False)\n", | |
| "df_bert_dev.to_csv('data/dev.tsv', sep='\\t', index=False, header=False)\n", | |
| "df_bert_test.to_csv('data/test.tsv', sep='\\t', index=False, header=False)" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "G32xu4_qn8om", | |
| "outputId": "2344b41b-a5e2-476e-a0cc-bea473b3deaa", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 101 | |
| } | |
| }, | |
| "source": [ | |
| "# Downloading the BERT source code (provides bert/run_classifier.py)\n", | |
| "\n", | |
| "!git clone https://github.com/google-research/bert.git" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Cloning into 'bert'...\n", | |
| "remote: Enumerating objects: 340, done.\u001b[K\n", | |
| "remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340\u001b[K\n", | |
| "Receiving objects: 100% (340/340), 317.85 KiB | 7.39 MiB/s, done.\n", | |
| "Resolving deltas: 100% (185/185), done.\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "r0lwzb6Mn-L2", | |
| "outputId": "24145488-d187-432d-9015-5731ed1319c8", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 336 | |
| } | |
| }, | |
| "source": [ | |
| "# unpacking the pre-trained model \n", | |
| "!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n", | |
| "!unzip the file\n", | |
| "!unzip cased_L-12_H-768_A-12.zip" | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "--2020-10-17 01:27:32-- https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n", | |
| "Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.213.128, 173.194.214.128, 173.194.216.128, ...\n", | |
| "Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.213.128|:443... connected.\n", | |
| "HTTP request sent, awaiting response... 200 OK\n", | |
| "Length: 404261442 (386M) [application/zip]\n", | |
| "Saving to: ‘cased_L-12_H-768_A-12.zip’\n", | |
| "\n", | |
| "cased_L-12_H-768_A- 100%[===================>] 385.53M 160MB/s in 2.4s \n", | |
| "\n", | |
| "2020-10-17 01:27:34 (160 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]\n", | |
| "\n", | |
| "unzip: cannot find or open the, the.zip or the.ZIP.\n", | |
| "Archive: cased_L-12_H-768_A-12.zip\n", | |
| " creating: cased_L-12_H-768_A-12/\n", | |
| " inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta \n", | |
| " inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001 \n", | |
| " inflating: cased_L-12_H-768_A-12/vocab.txt \n", | |
| " inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index \n", | |
| " inflating: cased_L-12_H-768_A-12/bert_config.json \n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "kLurvKQasUwm" | |
| }, | |
| "source": [ | |
| "# please install this version to make model work fine \n", | |
| "# with latest version it give error for some people\n", | |
| "\n", | |
| "pip install tensorflow==1.15.2" | |
| ], | |
| "execution_count": null, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "vjG3oNsUnuYk", | |
| "outputId": "842b4510-7896-41bb-c007-0d4380547a4e", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 1000 | |
| } | |
| }, | |
| "source": [ | |
| "# running the command line commands\n", | |
| "# parameters that can be tuned --> \n", | |
| "# max_seq_length , learning_rate , train_batch_size , num_train_epochs\n", | |
| "\n", | |
| "!python bert/run_classifier.py \\\n", | |
| "--task_name=cola \\\n", | |
| "--do_train=true \\\n", | |
| "--do_eval=true \\\n", | |
| "--do_predict=true \\\n", | |
| "--data_dir=./data/ \\\n", | |
| "--vocab_file=./cased_L-12_H-768_A-12/vocab.txt \\\n", | |
| "--bert_config_file=./cased_L-12_H-768_A-12/bert_config.json \\\n", | |
| "--init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt \\\n", | |
| "--max_seq_length=100 \\\n", | |
| "--train_batch_size=8 \\\n", | |
| "--learning_rate=1e-3 \\\n", | |
| "--num_train_epochs=1.0 \\\n", | |
| "--do_lower_case=False \\\n", | |
| "--output_dir=./bert_output/ \\\n", | |
| "--save_checkpoints_steps=9999999 " | |
| ], | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n", | |
| "I1017 09:27:11.383001 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0875963\n", | |
| "INFO:tensorflow:examples/sec: 0.700771\n", | |
| "I1017 09:27:11.383543 139782014523264 tpu_estimator.py:2308] examples/sec: 0.700771\n", | |
| "INFO:tensorflow:global_step/sec: 0.0877112\n", | |
| "I1017 09:27:22.784008 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0877112\n", | |
| "INFO:tensorflow:examples/sec: 0.70169\n", | |
| "I1017 09:27:22.784525 139782014523264 tpu_estimator.py:2308] examples/sec: 0.70169\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment