Skip to content

Instantly share code, notes, and snippets.

@r-sajal
Last active October 18, 2020 10:20
Show Gist options
  • Select an option

  • Save r-sajal/634b00d2ed686a95bed22d8ce3fb1317 to your computer and use it in GitHub Desktop.

Select an option

Save r-sajal/634b00d2ed686a95bed22d8ce3fb1317 to your computer and use it in GitHub Desktop.
bert-hugging-github.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"name": "bert-hugging-github.ipynb",
"provenance": [],
"collapsed_sections": [],
"authorship_tag": "ABX9TyMtP1bCQSz3OS71NgQ+ei/D",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/r-sajal/634b00d2ed686a95bed22d8ce3fb1317/bert-hugging-github.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "markdown",
"metadata": {
"id": "LA0hl35VADdQ"
},
"source": [
"# **Model 1: BERT fine-tuning with google-research/bert (`run_classifier.py`)**"
]
},
{
"cell_type": "code",
"metadata": {
"id": "MRCO2oXR_6zM"
},
"source": [
"# importing general libraries\n",
"import pandas as pd\n",
"import numpy as np\n",
"import sklearn\n",
"import warnings, gc\n",
"warnings.filterwarnings(\"ignore\")\n",
"\n",
"# Tensorflow\n",
"import tensorflow as tf\n",
"\n",
"from sklearn.preprocessing import LabelEncoder\n",
"from sklearn.model_selection import train_test_split"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "cNEfWfRzAW58"
},
"source": [
"# Load the raw training and test sets from their JSON files.\n",
"# NOTE(review): the paths assume Google Drive is already mounted at\n",
"# /content/drive -- no cell in this notebook mounts it; confirm.\n",
"train_sm_df = pd.read_json(\n",
"    \"/content/drive/My Drive/train_extra.json\"\n",
")\n",
"test_df = pd.read_json(\n",
"    \"/content/drive/My Drive/embold_test.json\"\n",
")"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "1Xk6CZIal_7-",
"outputId": "01ec22d3-71e3-4092-81a4-9616da448627",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
}
},
"source": [
"# Reshape the training data into the 4-column layout used for the train/dev\n",
"# TSV files fed to bert/run_classifier.py:\n",
"#     id, label, alpha (throw-away filler), text\n",
"\n",
"# Fixed seed so the train/dev split is the same on every Restart & Run All.\n",
"SPLIT_SEED = 42\n",
"\n",
"# Tabs, carriage returns and newlines inside the text would corrupt the\n",
"# tab-separated rows written later, so each run of them is collapsed into a\n",
"# single space. (Matching only the newline character left carriage returns\n",
"# behind in the text.)\n",
"df_bert = pd.DataFrame({\n",
"    'id': range(len(train_sm_df)),\n",
"    'label': train_sm_df['label'],\n",
"    'alpha': ['a'] * train_sm_df.shape[0],\n",
"    'text': train_sm_df['text'].replace(r'[\\t\\r\\n]+', ' ', regex=True)\n",
"})\n",
"\n",
"# The train/dev files are written without a header row, so plain positional\n",
"# column labels (0..3) are sufficient.\n",
"df_bert.columns = range(df_bert.shape[1])\n",
"\n",
"# Hold out 30% of the labelled rows as the *dev* set; the rest is *train*.\n",
"df_bert_train, df_bert_dev = train_test_split(\n",
"    df_bert, test_size=0.3, random_state=SPLIT_SEED\n",
")\n",
"\n",
"df_bert_train.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" <th>2</th>\n",
" <th>3</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>62292</th>\n",
" <td>62292</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" <td>abort on file changed warning - nvim --versio...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>54956</th>\n",
" <td>54956</td>\n",
" <td>1</td>\n",
" <td>a</td>\n",
" <td>add test directory for vscraper idea\\r \\r add ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>68768</th>\n",
" <td>68768</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" <td>torch.cuda.current_device is always 0 at bac...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>9172</th>\n",
" <td>9172</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" <td>escape orm for sql injections need to rewrite ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>35190</th>\n",
" <td>35190</td>\n",
" <td>0</td>\n",
" <td>a</td>\n",
" <td>routeutils getdistancetostep not measuring cor...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1 2 3\n",
"62292 62292 0 a abort on file changed warning - nvim --versio...\n",
"54956 54956 1 a add test directory for vscraper idea\\r \\r add ...\n",
"68768 68768 0 a torch.cuda.current_device is always 0 at bac...\n",
"9172 9172 0 a escape orm for sql injections need to rewrite ...\n",
"35190 35190 0 a routeutils getdistancetostep not measuring cor..."
]
},
"metadata": {
"tags": []
},
"execution_count": 8
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "q-5WINRgnFmX",
"outputId": "42e18873-ce58-4246-b0f0-acc5439cf445",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 195
}
},
"source": [
"# Reshape the test data into the 2-column layout (id, text) written to test.tsv.\n",
"\n",
"# Tabs, carriage returns and newlines inside the text would corrupt the\n",
"# tab-separated rows, so each run of them is collapsed into a single space.\n",
"df_bert_test = pd.DataFrame({\n",
"    'id': range(len(test_df)),\n",
"    'text': test_df['text'].replace(r'[\\t\\r\\n]+', ' ', regex=True)\n",
"})\n",
"\n",
"# Positional column labels (0, 1), matching the convention of the training frame.\n",
"df_bert_test.columns = range(df_bert_test.shape[1])\n",
"df_bert_test.head()"
],
"execution_count": null,
"outputs": [
{
"output_type": "execute_result",
"data": {
"text/html": [
"<div>\n",
"<style scoped>\n",
" .dataframe tbody tr th:only-of-type {\n",
" vertical-align: middle;\n",
" }\n",
"\n",
" .dataframe tbody tr th {\n",
" vertical-align: top;\n",
" }\n",
"\n",
" .dataframe thead th {\n",
" text-align: right;\n",
" }\n",
"</style>\n",
"<table border=\"1\" class=\"dataframe\">\n",
" <thead>\n",
" <tr style=\"text-align: right;\">\n",
" <th></th>\n",
" <th>0</th>\n",
" <th>1</th>\n",
" </tr>\n",
" </thead>\n",
" <tbody>\n",
" <tr>\n",
" <th>0</th>\n",
" <td>0</td>\n",
" <td>config question path-specific environment var...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>1</th>\n",
" <td>1</td>\n",
" <td>crash indien vol de simulator crasht als hij v...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>2</th>\n",
" <td>2</td>\n",
" <td>unable to mine rocks sarkasmo starting today, ...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>3</th>\n",
" <td>3</td>\n",
" <td>not all whitelists are processed create follow...</td>\n",
" </tr>\n",
" <tr>\n",
" <th>4</th>\n",
" <td>4</td>\n",
" <td>add ctx menu for idafree 70 and idafree 5 asso...</td>\n",
" </tr>\n",
" </tbody>\n",
"</table>\n",
"</div>"
],
"text/plain": [
" 0 1\n",
"0 0 config question path-specific environment var...\n",
"1 1 crash indien vol de simulator crasht als hij v...\n",
"2 2 unable to mine rocks sarkasmo starting today, ...\n",
"3 3 not all whitelists are processed create follow...\n",
"4 4 add ctx menu for idafree 70 and idafree 5 asso..."
]
},
"metadata": {
"tags": []
},
"execution_count": 9
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "rQZCaCtrnpvd"
},
"source": [
"# Create the directories that hold the TSV inputs (data/) and the\n",
"# fine-tuning outputs (bert_output/).\n",
"# -p keeps the cell re-runnable: mkdir does not fail if they already exist.\n",
"!mkdir -p data bert_output"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "ulpVWTJ7ncgN"
},
"source": [
"# Save the dataframes as tab-separated files, the format read by\n",
"# bert/run_classifier.py. The train and dev files carry no header row.\n",
"df_bert_train.to_csv('data/train.tsv', sep='\\t', index=False, header=False)\n",
"df_bert_dev.to_csv('data/dev.tsv', sep='\\t', index=False, header=False)\n",
"\n",
"# The CoLA processor selected with --task_name=cola treats the first line of\n",
"# test.tsv as a header and skips it, so a header row must be written here.\n",
"# Without one, the first test example is silently dropped and the predictions\n",
"# come out one row short of the test set.\n",
"df_bert_test.to_csv('data/test.tsv', sep='\\t', index=False, header=True)"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "G32xu4_qn8om",
"outputId": "2344b41b-a5e2-476e-a0cc-bea473b3deaa",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 101
}
},
"source": [
"# Fetch the reference BERT code; it provides bert/run_classifier.py, which a\n",
"# later cell runs. This is the code only -- the pre-trained weights are\n",
"# downloaded in a separate cell.\n",
"# The clone is skipped when bert/ already exists so the cell can be re-run.\n",
"\n",
"!test -d bert || git clone https://github.com/google-research/bert.git"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"Cloning into 'bert'...\n",
"remote: Enumerating objects: 340, done.\u001b[K\n",
"remote: Total 340 (delta 0), reused 0 (delta 0), pack-reused 340\u001b[K\n",
"Receiving objects: 100% (340/340), 317.85 KiB | 7.39 MiB/s, done.\n",
"Resolving deltas: 100% (185/185), done.\n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "r0lwzb6Mn-L2",
"outputId": "24145488-d187-432d-9015-5731ed1319c8",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 336
}
},
"source": [
"# Download the pre-trained cased BERT checkpoint archive.\n",
"# -nc: skip the download if the archive is already present.\n",
"!wget -nc https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
"# Unzip the file. (This note used to start with '!' and was executed as a\n",
"# shell command, which failed.)\n",
"# -n: never overwrite existing files, so a re-run does not stop at a prompt.\n",
"!unzip -n cased_L-12_H-768_A-12.zip"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"--2020-10-17 01:27:32-- https://storage.googleapis.com/bert_models/2018_10_18/cased_L-12_H-768_A-12.zip\n",
"Resolving storage.googleapis.com (storage.googleapis.com)... 173.194.213.128, 173.194.214.128, 173.194.216.128, ...\n",
"Connecting to storage.googleapis.com (storage.googleapis.com)|173.194.213.128|:443... connected.\n",
"HTTP request sent, awaiting response... 200 OK\n",
"Length: 404261442 (386M) [application/zip]\n",
"Saving to: ‘cased_L-12_H-768_A-12.zip’\n",
"\n",
"cased_L-12_H-768_A- 100%[===================>] 385.53M 160MB/s in 2.4s \n",
"\n",
"2020-10-17 01:27:34 (160 MB/s) - ‘cased_L-12_H-768_A-12.zip’ saved [404261442/404261442]\n",
"\n",
"unzip: cannot find or open the, the.zip or the.ZIP.\n",
"Archive: cased_L-12_H-768_A-12.zip\n",
" creating: cased_L-12_H-768_A-12/\n",
" inflating: cased_L-12_H-768_A-12/bert_model.ckpt.meta \n",
" inflating: cased_L-12_H-768_A-12/bert_model.ckpt.data-00000-of-00001 \n",
" inflating: cased_L-12_H-768_A-12/vocab.txt \n",
" inflating: cased_L-12_H-768_A-12/bert_model.ckpt.index \n",
" inflating: cased_L-12_H-768_A-12/bert_config.json \n"
],
"name": "stdout"
}
]
},
{
"cell_type": "code",
"metadata": {
"id": "kLurvKQasUwm"
},
"source": [
"# Please install this pinned TensorFlow version to make the model work;\n",
"# with the latest version it gives errors for some people.\n",
"# %pip is used explicitly (instead of relying on automagic for a bare `pip`)\n",
"# so the package is installed into the environment of the running kernel.\n",
"\n",
"%pip install tensorflow==1.15.2"
],
"execution_count": null,
"outputs": []
},
{
"cell_type": "code",
"metadata": {
"id": "vjG3oNsUnuYk",
"outputId": "842b4510-7896-41bb-c007-0d4380547a4e",
"colab": {
"base_uri": "https://localhost:8080/",
"height": 1000
}
},
"source": [
"# Train, evaluate and predict with bert/run_classifier.py, reading the TSV\n",
"# files from ./data/ and writing results to ./bert_output/.\n",
"# Parameters that can be tuned -->\n",
"# max_seq_length, learning_rate, train_batch_size, num_train_epochs\n",
"#\n",
"# learning_rate: the BERT authors recommend 2e-5 to 5e-5 for fine-tuning; the\n",
"# earlier value of 1e-3 is far outside that range and risks a diverging run.\n",
"# do_lower_case=False because the checkpoint used here is the cased model.\n",
"# NOTE(review): the cola task only accepts the labels \"0\" and \"1\" -- confirm\n",
"# that the training labels are binary.\n",
"\n",
"!python bert/run_classifier.py \\\n",
"--task_name=cola \\\n",
"--do_train=true \\\n",
"--do_eval=true \\\n",
"--do_predict=true \\\n",
"--data_dir=./data/ \\\n",
"--vocab_file=./cased_L-12_H-768_A-12/vocab.txt \\\n",
"--bert_config_file=./cased_L-12_H-768_A-12/bert_config.json \\\n",
"--init_checkpoint=./cased_L-12_H-768_A-12/bert_model.ckpt \\\n",
"--max_seq_length=100 \\\n",
"--train_batch_size=8 \\\n",
"--learning_rate=2e-5 \\\n",
"--num_train_epochs=1.0 \\\n",
"--do_lower_case=False \\\n",
"--output_dir=./bert_output/ \\\n",
"--save_checkpoints_steps=9999999"
],
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"text": [
"\u001b[1;30;43mStreaming output truncated to the last 5000 lines.\u001b[0m\n",
"I1017 09:27:11.383001 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0875963\n",
"INFO:tensorflow:examples/sec: 0.700771\n",
"I1017 09:27:11.383543 139782014523264 tpu_estimator.py:2308] examples/sec: 0.700771\n",
"INFO:tensorflow:global_step/sec: 0.0877112\n",
"I1017 09:27:22.784008 139782014523264 tpu_estimator.py:2307] global_step/sec: 0.0877112\n",
"INFO:tensorflow:examples/sec: 0.70169\n",
"I1017 09:27:22.784525 139782014523264 tpu_estimator.py:2308] examples/sec: 0.70169\n"
],
"name": "stdout"
}
]
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment