r-sajal · October 18, 2020 10:20
diff --git a/bert-hugging-github.ipynb b/bert-hugging-github.ipynb
 {
  "nbformat": 4,
  "nbformat_minor": 0,
  "metadata": {
    "colab": {
      "name": "bert-hugging-github.ipynb",
      "provenance": [],
      "collapsed_sections": [],
      "authorship_tag": "ABX9TyMtP1bCQSz3OS71NgQ+ei/D",
      "include_colab_link": true
    },
    "kernelspec": {
      "name": "python3",
      "display_name": "Python 3"
    }
  },
  "cells": [
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "view-in-github",
        "colab_type": "text"
      },
      "source": [
        "<a href=\"https://colab.research.google.com/gist/r-sajal/634b00d2ed686a95bed22d8ce3fb1317/bert-hugging-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
      ]
    },
    {
      "cell_type": "markdown",
      "metadata": {
        "id": "LA0hl35VADdQ"
      },
      "source": [
        "# **Model 1: Hugging Face**"
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "MRCO2oXR_6zM"
      },
      "source": [
        "# importing general libraries\n",
        "import pandas as pd\n",
        "import numpy as np\n",
        "import sklearn\n",
        "import warnings, gc\n",
        "warnings.filterwarnings(\"ignore\")\n",
        "\n",
        "# Tensorflow\n",
        "import tensorflow as tf\n",
        "\n",
        "from sklearn.preprocessing import LabelEncoder\n",
        "from sklearn.model_selection import train_test_split"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "cNEfWfRzAW58"
      },
      "source": [
        "# importing data\n",
        "train_sm_df = pd.read_json(\"/content/drive/My Drive/train_extra.json\")\n",
        "test_df=pd.read_json(\"/content/drive/My Drive/embold_test.json\")"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "1Xk6CZIal_7-",
        "outputId": "01ec22d3-71e3-4092-81a4-9616da448627",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 195
        }
      },
      "source": [
        "# Modifying data according to hugging face input\n",
        "#Input Format  --  id  label  alpha(throw-away)  text\n",
        "# remove newline characters to make more robust\n",
        "\n",
        "df_bert = pd.DataFrame({\n",
        "    'id':range(len(train_sm_df)),\n",
        "    'label':train_sm_df['label'],\n",
        "    'alpha':['a']*train_sm_df.shape[0],\n",
        "    'text': train_sm_df['text'].replace(r'\\n', ' ', regex=True)\n",
        "})\n",
        "\n",
        "df_bert.columns = range(df_bert.shape[1])\n",
        "\n",
        "# Splitting training data file into *train* and *dev*\n",
        "df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.3)\n",
        "\n",
        "df_bert_train.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>1</th>\n",
              "      <th>2</th>\n",
              "      <th>3</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>62292</th>\n",
              "      <td>62292</td>\n",
              "      <td>0</td>\n",
              "      <td>a</td>\n",
              "      <td>abort on file changed warning -  nvim --versio...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>54956</th>\n",
              "      <td>54956</td>\n",
              "      <td>1</td>\n",
              "      <td>a</td>\n",
              "      <td>add test directory for vscraper idea\\r \\r add ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>68768</th>\n",
              "      <td>68768</td>\n",
              "      <td>0</td>\n",
              "      <td>a</td>\n",
              "      <td>torch.cuda.current_device   is always 0 at bac...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>9172</th>\n",
              "      <td>9172</td>\n",
              "      <td>0</td>\n",
              "      <td>a</td>\n",
              "      <td>escape orm for sql injections need to rewrite ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>35190</th>\n",
              "      <td>35190</td>\n",
              "      <td>0</td>\n",
              "      <td>a</td>\n",
              "      <td>routeutils getdistancetostep not measuring cor...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "           0  1  2                                                  3\n",
              "62292  62292  0  a  abort on file changed warning -  nvim --versio...\n",
              "54956  54956  1  a  add test directory for vscraper idea\\r \\r add ...\n",
              "68768  68768  0  a  torch.cuda.current_device   is always 0 at bac...\n",
              "9172    9172  0  a  escape orm for sql injections need to rewrite ...\n",
              "35190  35190  0  a  routeutils getdistancetostep not measuring cor..."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 8
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "q-5WINRgnFmX",
        "outputId": "42e18873-ce58-4246-b0f0-acc5439cf445",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 195
        }
      },
      "source": [
        "# Modifying Test Data according to format above\n",
        "\n",
        "df_bert_test = pd.DataFrame({\n",
        "    'id':range(len(test_df)),\n",
        "    'text': test_df['text'].replace(r'\\n', ' ', regex=True)\n",
        "})\n",
        "df_bert_test.columns = range(df_bert_test.shape[1])\n",
        "df_bert_test.head()"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "execute_result",
          "data": {
            "text/html": [
              "<div>\n",
              "<style scoped>\n",
              "    .dataframe tbody tr th:only-of-type {\n",
              "        vertical-align: middle;\n",
              "    }\n",
              "\n",
              "    .dataframe tbody tr th {\n",
              "        vertical-align: top;\n",
              "    }\n",
              "\n",
              "    .dataframe thead th {\n",
              "        text-align: right;\n",
              "    }\n",
              "</style>\n",
              "<table border=\"1\" class=\"dataframe\">\n",
              "  <thead>\n",
              "    <tr style=\"text-align: right;\">\n",
              "      <th></th>\n",
              "      <th>0</th>\n",
              "      <th>1</th>\n",
              "    </tr>\n",
              "  </thead>\n",
              "  <tbody>\n",
              "    <tr>\n",
              "      <th>0</th>\n",
              "      <td>0</td>\n",
              "      <td>config question  path-specific environment var...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>1</th>\n",
              "      <td>1</td>\n",
              "      <td>crash indien vol de simulator crasht als hij v...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>2</th>\n",
              "      <td>2</td>\n",
              "      <td>unable to mine rocks sarkasmo starting today, ...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>3</th>\n",
              "      <td>3</td>\n",
              "      <td>not all whitelists are processed create follow...</td>\n",
              "    </tr>\n",
              "    <tr>\n",
              "      <th>4</th>\n",
              "      <td>4</td>\n",
              "      <td>add ctx menu for idafree 70 and idafree 5 asso...</td>\n",
              "    </tr>\n",
              "  </tbody>\n",
              "</table>\n",
              "</div>"
            ],
            "text/plain": [
              "   0                                                  1\n",
              "0  0  config question  path-specific environment var...\n",
              "1  1  crash indien vol de simulator crasht als hij v...\n",
              "2  2  unable to mine rocks sarkasmo starting today, ...\n",
              "3  3  not all whitelists are processed create follow...\n",
              "4  4  add ctx menu for idafree 70 and idafree 5 asso..."
            ]
          },
          "metadata": {
            "tags": []
          },
          "execution_count": 9
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "rQZCaCtrnpvd"
      },
      "source": [
        "# making directories to store input/output data\n",
        "mkdir data\n",
        "mkdir bert_output"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "ulpVWTJ7ncgN"
      },
      "source": [
        "# Saving dataframes to .tsv format as required by BERT\n",
        "df_bert_train.to_csv('data/train.tsv', sep='\\t', index=False, header=False)\n",
        "df_bert_dev.to_csv('data/dev.tsv', sep='\\t', index=False, header=False)\n",
        "df_bert_test.to_csv('data/test.tsv', sep='\\t', index=False, header=False)"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "G32xu4_qn8om",
        "outputId": "2344b41b-a5e2-476e-a0cc-bea473b3deaa",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 101
        }
      },
      "source": [
        "# Cloning the BERT source code repository\n",
        "\n",
        "!git clone https://github.com/google-research/bert.git"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "Cloning into 'bert'...\n",
            "remote: Enumerating objects: 340, done.\u001b[K\n",
            "remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340\u001b[K\n",
            "Receiving objects: 100% (340/340), 317.85 KiB | 7.39 MiB/s, done.\n",
            "Resolving deltas: 100% (185/185), done.\n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "r0lwzb6Mn-L2",
        "outputId": "24145488-d187-432d-9015-5731ed1319c8",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 336
        }
      },
      "source": [
        "# unpacking the pre-trained model \n",
        "!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
        "# unzip the file\n",
        "!unzip cased_L-12_H-768_A-12.zip"
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "--2020-10-17 01:27:32--  https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
            "Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.213.128, 173.194.214.128, 173.194.216.128, ...\n",
            "Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.213.128|:443... connected.\n",
            "HTTP request sent, awaiting response... 200 OK\n",
            "Length: 404261442 (386M) [application/zip]\n",
            "Saving to: ‘cased_L-12_H-768_A-12.zip’\n",
            "\n",
            "cased_L-12_H-768_A- 100%[===================>] 385.53M   160MB/s    in 2.4s    \n",
            "\n",
            "2020-10-17 01:27:34 (160 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]\n",
            "\n",
            "unzip:  cannot find or open the, the.zip or the.ZIP.\n",
            "Archive:  cased_L-12_H-768_A-12.zip\n",
            "   creating: cased_L-12_H-768_A-12/\n",
            "  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta  \n",
            "  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001  \n",
            "  inflating: cased_L-12_H-768_A-12/vocab.txt  \n",
            "  inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index  \n",
            "  inflating: cased_L-12_H-768_A-12/bert_config.json  \n"
          ],
          "name": "stdout"
        }
      ]
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "kLurvKQasUwm"
      },
      "source": [
        "# Install this pinned TensorFlow version so the model works correctly;\n",
        "# the latest version gives errors for some people.\n",
        "\n",
        "pip install tensorflow==1.15.2"
      ],
      "execution_count": null,
      "outputs": []
    },
    {
      "cell_type": "code",
      "metadata": {
        "id": "vjG3oNsUnuYk",
        "outputId": "842b4510-7896-41bb-c007-0d4380547a4e",
        "colab": {
          "base_uri": "https://localhost:8080/",
          "height": 1000
        }
      },
      "source": [
        "# running the command line commands\n",
        "# parameters that can be tuned --> \n",
        "# max_seq_length , learning_rate , train_batch_size , num_train_epochs\n",
        "\n",
        "!python bert/run_classifier.py \\\n",
        "--task_name=cola \\\n",
        "--do_train=true \\\n",
        "--do_eval=true \\\n",
        "--do_predict=true \\\n",
        "--data_dir=./data/ \\\n",
        "--vocab_file=./cased_L-12_H-768_A-12/vocab.txt \\\n",
        "--bert_config_file=./cased_L-12_H-768_A-12/bert_config.json \\\n",
        "--init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt \\\n",
        "--max_seq_length=100 \\\n",
        "--train_batch_size=8 \\\n",
        "--learning_rate=1e-3 \\\n",
        "--num_train_epochs=1.0 \\\n",
        "--do_lower_case=False \\\n",
        "--output_dir=./bert_output/ \\\n",
        "--save_checkpoints_steps=9999999 "
      ],
      "execution_count": null,
      "outputs": [
        {
          "output_type": "stream",
          "text": [
            "\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n",
            "I1017 09:27:11.383001 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0875963\n",
            "INFO:tensorflow:examples/sec: 0.700771\n",
            "I1017 09:27:11.383543 139782014523264 tpu_estimator.py:2308] examples/sec: 0.700771\n",
            "INFO:tensorflow:global_step/sec: 0.0877112\n",
            "I1017 09:27:22.784008 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0877112\n",
            "INFO:tensorflow:examples/sec: 0.70169\n",
            "I1017 09:27:22.784525 139782014523264 tpu_estimator.py:2308] examples/sec: 0.70169\n"
          ],
          "name": "stdout"
        }
      ]
    }
  ]
 }
	{
	"nbformat": 4,
	"nbformat_minor": 0,
	"metadata": {
	"colab": {
	"name": "bert-hugging-github.ipynb",
	"provenance": [],
	"collapsed_sections": [],
	"authorship_tag": "ABX9TyMtP1bCQSz3OS71NgQ+ei/D",
	"include_colab_link": true
	},
	"kernelspec": {
	"name": "python3",
	"display_name": "Python 3"
	}
	},
	"cells": [
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "view-in-github",
	"colab_type": "text"
	},
	"source": [
	"<a href=\"https://colab.research.google.com/gist/r-sajal/634b00d2ed686a95bed22d8ce3fb1317/bert-hugging-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
	]
	},
	{
	"cell_type": "markdown",
	"metadata": {
	"id": "LA0hl35VADdQ"
	},
	"source": [
	"# Model 1: Hugging Face"
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "MRCO2oXR_6zM"
	},
	"source": [
	"# importing general libraries\n",
	"import pandas as pd\n",
	"import numpy as np\n",
	"import sklearn\n",
	"import warnings, gc\n",
	"warnings.filterwarnings(\"ignore\")\n",
	"\n",
	"# Tensorflow\n",
	"import tensorflow as tf\n",
	"\n",
	"from sklearn.preprocessing import LabelEncoder\n",
	"from sklearn.model_selection import train_test_split"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "cNEfWfRzAW58"
	},
	"source": [
	"# importing data\n",
	"train_sm_df = pd.read_json(\"/content/drive/My Drive/train_extra.json\")\n",
	"test_df=pd.read_json(\"/content/drive/My Drive/embold_test.json\")"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "1Xk6CZIal_7-",
	"outputId": "01ec22d3-71e3-4092-81a4-9616da448627",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 195
	}
	},
	"source": [
	"# Modifying data according to hugging face input\n",
	"#Input Format -- id label alpha(throw-away) text\n",
	"# remove newline characters to make more robust\n",
	"\n",
	"df_bert = pd.DataFrame({\n",
	" 'id':range(len(train_sm_df)),\n",
	" 'label':train_sm_df['label'],\n",
	" 'alpha':['a']*train_sm_df.shape[0],\n",
	" 'text': train_sm_df['text'].replace(r'\\n', ' ', regex=True)\n",
	"})\n",
	"\n",
	"df_bert.columns = range(df_bert.shape[1])\n",
	"\n",
	"# Splitting training data file into train and dev\n",
	"df_bert_train, df_bert_dev = train_test_split(df_bert, test_size=0.3)\n",
	"\n",
	"df_bert_train.head()"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" <th>2</th>\n",
	" <th>3</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>62292</th>\n",
	" <td>62292</td>\n",
	" <td>0</td>\n",
	" <td>a</td>\n",
	" <td>abort on file changed warning - nvim --versio...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>54956</th>\n",
	" <td>54956</td>\n",
	" <td>1</td>\n",
	" <td>a</td>\n",
	" <td>add test directory for vscraper idea\\r \\r add ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>68768</th>\n",
	" <td>68768</td>\n",
	" <td>0</td>\n",
	" <td>a</td>\n",
	" <td>torch.cuda.current_device is always 0 at bac...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>9172</th>\n",
	" <td>9172</td>\n",
	" <td>0</td>\n",
	" <td>a</td>\n",
	" <td>escape orm for sql injections need to rewrite ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>35190</th>\n",
	" <td>35190</td>\n",
	" <td>0</td>\n",
	" <td>a</td>\n",
	" <td>routeutils getdistancetostep not measuring cor...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1 2 3\n",
	"62292 62292 0 a abort on file changed warning - nvim --versio...\n",
	"54956 54956 1 a add test directory for vscraper idea\\r \\r add ...\n",
	"68768 68768 0 a torch.cuda.current_device is always 0 at bac...\n",
	"9172 9172 0 a escape orm for sql injections need to rewrite ...\n",
	"35190 35190 0 a routeutils getdistancetostep not measuring cor..."
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 8
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "q-5WINRgnFmX",
	"outputId": "42e18873-ce58-4246-b0f0-acc5439cf445",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 195
	}
	},
	"source": [
	"# Modifying Test Data according to format above\n",
	"\n",
	"df_bert_test = pd.DataFrame({\n",
	" 'id':range(len(test_df)),\n",
	" 'text': test_df['text'].replace(r'\\n', ' ', regex=True)\n",
	"})\n",
	"df_bert_test.columns = range(df_bert_test.shape[1])\n",
	"df_bert_test.head()"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "execute_result",
	"data": {
	"text/html": [
	"<div>\n",
	"<style scoped>\n",
	" .dataframe tbody tr th:only-of-type {\n",
	" vertical-align: middle;\n",
	" }\n",
	"\n",
	" .dataframe tbody tr th {\n",
	" vertical-align: top;\n",
	" }\n",
	"\n",
	" .dataframe thead th {\n",
	" text-align: right;\n",
	" }\n",
	"</style>\n",
	"<table border=\"1\" class=\"dataframe\">\n",
	" <thead>\n",
	" <tr style=\"text-align: right;\">\n",
	" <th></th>\n",
	" <th>0</th>\n",
	" <th>1</th>\n",
	" </tr>\n",
	" </thead>\n",
	" <tbody>\n",
	" <tr>\n",
	" <th>0</th>\n",
	" <td>0</td>\n",
	" <td>config question path-specific environment var...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>1</th>\n",
	" <td>1</td>\n",
	" <td>crash indien vol de simulator crasht als hij v...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>2</th>\n",
	" <td>2</td>\n",
	" <td>unable to mine rocks sarkasmo starting today, ...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>3</th>\n",
	" <td>3</td>\n",
	" <td>not all whitelists are processed create follow...</td>\n",
	" </tr>\n",
	" <tr>\n",
	" <th>4</th>\n",
	" <td>4</td>\n",
	" <td>add ctx menu for idafree 70 and idafree 5 asso...</td>\n",
	" </tr>\n",
	" </tbody>\n",
	"</table>\n",
	"</div>"
	],
	"text/plain": [
	" 0 1\n",
	"0 0 config question path-specific environment var...\n",
	"1 1 crash indien vol de simulator crasht als hij v...\n",
	"2 2 unable to mine rocks sarkasmo starting today, ...\n",
	"3 3 not all whitelists are processed create follow...\n",
	"4 4 add ctx menu for idafree 70 and idafree 5 asso..."
	]
	},
	"metadata": {
	"tags": []
	},
	"execution_count": 9
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "rQZCaCtrnpvd"
	},
	"source": [
	"# making directories to store input/output data\n",
	"mkdir data\n",
	"mkdir bert_output"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "ulpVWTJ7ncgN"
	},
	"source": [
	"# Saving dataframes to .tsv format as required by BERT\n",
	"df_bert_train.to_csv('data/train.tsv', sep='\\t', index=False, header=False)\n",
	"df_bert_dev.to_csv('data/dev.tsv', sep='\\t', index=False, header=False)\n",
	"df_bert_test.to_csv('data/test.tsv', sep='\\t', index=False, header=False)"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "G32xu4_qn8om",
	"outputId": "2344b41b-a5e2-476e-a0cc-bea473b3deaa",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 101
	}
	},
	"source": [
	"# Cloning the BERT source code repository\n",
	"\n",
	"!git clone https://github.com/google-research/bert.git"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"Cloning into 'bert'...\n",
	"remote: Enumerating objects: 340, done.\u001b[K\n",
	"remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340\u001b[K\n",
	"Receiving objects: 100% (340/340), 317.85 KiB | 7.39 MiB/s, done.\n",
	"Resolving deltas: 100% (185/185), done.\n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "r0lwzb6Mn-L2",
	"outputId": "24145488-d187-432d-9015-5731ed1319c8",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 336
	}
	},
	"source": [
	"# unpacking the pre-trained model \n",
	"!wget https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
	"# unzip the file\n",
	"!unzip cased_L-12_H-768_A-12.zip"
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"--2020-10-17 01:27:32-- https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
	"Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.213.128, 173.194.214.128, 173.194.216.128, ...\n",
	"Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.213.128|:443... connected.\n",
	"HTTP request sent, awaiting response... 200 OK\n",
	"Length: 404261442 (386M) [application/zip]\n",
	"Saving to: ‘cased_L-12_H-768_A-12.zip’\n",
	"\n",
	"cased_L-12_H-768_A- 100%[===================>] 385.53M 160MB/s in 2.4s \n",
	"\n",
	"2020-10-17 01:27:34 (160 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]\n",
	"\n",
	"unzip: cannot find or open the, the.zip or the.ZIP.\n",
	"Archive: cased_L-12_H-768_A-12.zip\n",
	" creating: cased_L-12_H-768_A-12/\n",
	" inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta \n",
	" inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001 \n",
	" inflating: cased_L-12_H-768_A-12/vocab.txt \n",
	" inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index \n",
	" inflating: cased_L-12_H-768_A-12/bert_config.json \n"
	],
	"name": "stdout"
	}
	]
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "kLurvKQasUwm"
	},
	"source": [
	"# Install this pinned TensorFlow version so the model works correctly;\n",
	"# the latest version gives errors for some people.\n",
	"\n",
	"pip install tensorflow==1.15.2"
	],
	"execution_count": null,
	"outputs": []
	},
	{
	"cell_type": "code",
	"metadata": {
	"id": "vjG3oNsUnuYk",
	"outputId": "842b4510-7896-41bb-c007-0d4380547a4e",
	"colab": {
	"base_uri": "https://localhost:8080/",
	"height": 1000
	}
	},
	"source": [
	"# running the command line commands\n",
	"# parameters that can be tuned --> \n",
	"# max_seq_length , learning_rate , train_batch_size , num_train_epochs\n",
	"\n",
	"!python bert/run_classifier.py \\\n",
	"--task_name=cola \\\n",
	"--do_train=true \\\n",
	"--do_eval=true \\\n",
	"--do_predict=true \\\n",
	"--data_dir=./data/ \\\n",
	"--vocab_file=./cased_L-12_H-768_A-12/vocab.txt \\\n",
	"--bert_config_file=./cased_L-12_H-768_A-12/bert_config.json \\\n",
	"--init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt \\\n",
	"--max_seq_length=100 \\\n",
	"--train_batch_size=8 \\\n",
	"--learning_rate=1e-3 \\\n",
	"--num_train_epochs=1.0 \\\n",
	"--do_lower_case=False \\\n",
	"--output_dir=./bert_output/ \\\n",
	"--save_checkpoints_steps=9999999 "
	],
	"execution_count": null,
	"outputs": [
	{
	"output_type": "stream",
	"text": [
	"\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n",
	"I1017 09:27:11.383001 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0875963\n",
	"INFO:tensorflow:examples/sec: 0.700771\n",
	"I1017 09:27:11.383543 139782014523264 tpu_estimator.py:2308] examples/sec: 0.700771\n",
	"INFO:tensorflow:global_step/sec: 0.0877112\n",
	"I1017 09:27:22.784008 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0877112\n",
	"INFO:tensorflow:examples/sec: 0.70169\n",
	"I1017 09:27:22.784525 139782014523264 tpu_estimator.py:2308] examples/sec: 0.70169\n"
	],
	"name": "stdout"
	}
	]
	}
	]
	}
No results found