Created
September 12, 2020 10:29
-
-
Save chetanambi/5832d8ca53a5eb5b5db29653e3ba33d9 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "name": "paraphrase_mining.ipynb", | |
| "provenance": [] | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "3rQqCncd1jPI", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 793 | |
| }, | |
| "outputId": "52a92d40-956d-4c3f-833c-18db6b090392" | |
| }, | |
| "source": [ | |
| "!pip install sentence_transformers" | |
| ], | |
| "execution_count": 1, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "Collecting sentence_transformers\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/42/74/49848e9bb64482a7e5f475cc66da5de759077817ede36f8812060ebcaba6/sentence-transformers-0.3.6.tar.gz (62kB)\n", | |
| "\u001b[K |████████████████████████████████| 71kB 2.1MB/s \n", | |
| "\u001b[?25hCollecting transformers<3.2.0,>=3.1.0\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/ae/05/c8c55b600308dc04e95100dc8ad8a244dd800fe75dfafcf1d6348c6f6209/transformers-3.1.0-py3-none-any.whl (884kB)\n", | |
| "\u001b[K |████████████████████████████████| 890kB 7.1MB/s \n", | |
| "\u001b[?25hRequirement already satisfied: tqdm in /usr/local/lib/python3.6/dist-packages (from sentence_transformers) (4.41.1)\n", | |
| "Requirement already satisfied: torch>=1.2.0 in /usr/local/lib/python3.6/dist-packages (from sentence_transformers) (1.6.0+cu101)\n", | |
| "Requirement already satisfied: numpy in /usr/local/lib/python3.6/dist-packages (from sentence_transformers) (1.18.5)\n", | |
| "Requirement already satisfied: scikit-learn in /usr/local/lib/python3.6/dist-packages (from sentence_transformers) (0.22.2.post1)\n", | |
| "Requirement already satisfied: scipy in /usr/local/lib/python3.6/dist-packages (from sentence_transformers) (1.4.1)\n", | |
| "Requirement already satisfied: nltk in /usr/local/lib/python3.6/dist-packages (from sentence_transformers) (3.2.5)\n", | |
| "Requirement already satisfied: requests in /usr/local/lib/python3.6/dist-packages (from transformers<3.2.0,>=3.1.0->sentence_transformers) (2.23.0)\n", | |
| "Requirement already satisfied: packaging in /usr/local/lib/python3.6/dist-packages (from transformers<3.2.0,>=3.1.0->sentence_transformers) (20.4)\n", | |
| "Requirement already satisfied: filelock in /usr/local/lib/python3.6/dist-packages (from transformers<3.2.0,>=3.1.0->sentence_transformers) (3.0.12)\n", | |
| "Requirement already satisfied: dataclasses; python_version < \"3.7\" in /usr/local/lib/python3.6/dist-packages (from transformers<3.2.0,>=3.1.0->sentence_transformers) (0.7)\n", | |
| "Requirement already satisfied: regex!=2019.12.17 in /usr/local/lib/python3.6/dist-packages (from transformers<3.2.0,>=3.1.0->sentence_transformers) (2019.12.20)\n", | |
| "Collecting sentencepiece!=0.1.92\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/d4/a4/d0a884c4300004a78cca907a6ff9a5e9fe4f090f5d95ab341c53d28cbc58/sentencepiece-0.1.91-cp36-cp36m-manylinux1_x86_64.whl (1.1MB)\n", | |
| "\u001b[K |████████████████████████████████| 1.1MB 15.9MB/s \n", | |
| "\u001b[?25hCollecting tokenizers==0.8.1.rc2\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/80/83/8b9fccb9e48eeb575ee19179e2bdde0ee9a1904f97de5f02d19016b8804f/tokenizers-0.8.1rc2-cp36-cp36m-manylinux1_x86_64.whl (3.0MB)\n", | |
| "\u001b[K |████████████████████████████████| 3.0MB 29.0MB/s \n", | |
| "\u001b[?25hCollecting sacremoses\n", | |
| "\u001b[?25l Downloading https://files.pythonhosted.org/packages/7d/34/09d19aff26edcc8eb2a01bed8e98f13a1537005d31e95233fd48216eed10/sacremoses-0.0.43.tar.gz (883kB)\n", | |
| "\u001b[K |████████████████████████████████| 890kB 36.9MB/s \n", | |
| "\u001b[?25hRequirement already satisfied: future in /usr/local/lib/python3.6/dist-packages (from torch>=1.2.0->sentence_transformers) (0.16.0)\n", | |
| "Requirement already satisfied: joblib>=0.11 in /usr/local/lib/python3.6/dist-packages (from scikit-learn->sentence_transformers) (0.16.0)\n", | |
| "Requirement already satisfied: six in /usr/local/lib/python3.6/dist-packages (from nltk->sentence_transformers) (1.15.0)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.6/dist-packages (from requests->transformers<3.2.0,>=3.1.0->sentence_transformers) (2020.6.20)\n", | |
| "Requirement already satisfied: idna<3,>=2.5 in /usr/local/lib/python3.6/dist-packages (from requests->transformers<3.2.0,>=3.1.0->sentence_transformers) (2.10)\n", | |
| "Requirement already satisfied: chardet<4,>=3.0.2 in /usr/local/lib/python3.6/dist-packages (from requests->transformers<3.2.0,>=3.1.0->sentence_transformers) (3.0.4)\n", | |
| "Requirement already satisfied: urllib3!=1.25.0,!=1.25.1,<1.26,>=1.21.1 in /usr/local/lib/python3.6/dist-packages (from requests->transformers<3.2.0,>=3.1.0->sentence_transformers) (1.24.3)\n", | |
| "Requirement already satisfied: pyparsing>=2.0.2 in /usr/local/lib/python3.6/dist-packages (from packaging->transformers<3.2.0,>=3.1.0->sentence_transformers) (2.4.7)\n", | |
| "Requirement already satisfied: click in /usr/local/lib/python3.6/dist-packages (from sacremoses->transformers<3.2.0,>=3.1.0->sentence_transformers) (7.1.2)\n", | |
| "Building wheels for collected packages: sentence-transformers, sacremoses\n", | |
| " Building wheel for sentence-transformers (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Created wheel for sentence-transformers: filename=sentence_transformers-0.3.6-cp36-none-any.whl size=101182 sha256=80f7e93a0b9ce6b5d0b4d4f1437275921922b5cb9f793c5685f6db398d43515a\n", | |
| " Stored in directory: /root/.cache/pip/wheels/6f/3f/75/c0c4b3ef5dfbf8806d37b8dc661861772aba2f7aa419c85a9b\n", | |
| " Building wheel for sacremoses (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Created wheel for sacremoses: filename=sacremoses-0.0.43-cp36-none-any.whl size=893257 sha256=523ff6eecefafb556e97b222d26639e3840052a14ffd072ae0e322147ad1291b\n", | |
| " Stored in directory: /root/.cache/pip/wheels/29/3c/fd/7ce5c3f0666dab31a50123635e6fb5e19ceb42ce38d4e58f45\n", | |
| "Successfully built sentence-transformers sacremoses\n", | |
| "Installing collected packages: sentencepiece, tokenizers, sacremoses, transformers, sentence-transformers\n", | |
| "Successfully installed sacremoses-0.0.43 sentence-transformers-0.3.6 sentencepiece-0.1.91 tokenizers-0.8.1rc2 transformers-3.1.0\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "metadata": { | |
| "id": "A1rFGQWk1nVX", | |
| "colab_type": "code", | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 269 | |
| }, | |
| "outputId": "d915a41f-0dd1-4f15-b1a9-fb9f46da680e" | |
| }, | |
| "source": [ | |
| "from sentence_transformers import SentenceTransformer, util\n", | |
| "\n", | |
| "model = SentenceTransformer('distilbert-base-nli-stsb-mean-tokens')\n", | |
| "\n", | |
| "# Single list of sentences - Possibly tens of thousands of sentences\n", | |
| "sentences = [\"Don't aggrandize what genuinely transpired.\",\n", | |
| " \"Don't overrate what really happened.\",\n", | |
| " 'This picture is truly beautiful.',\n", | |
| " 'This picture is genuinely comely.',\n", | |
| " \"I am thinking positively about the future.\",\n", | |
| " \"I am being optimistic about the future.\"]\n", | |
| "\n", | |
| "paraphrases = util.paraphrase_mining(model, sentences)\n", | |
| "\n", | |
| "for paraphrase in paraphrases:\n", | |
| " score, i, j = paraphrase\n", | |
| " print(\"{} \\t\\t {} \\t\\t Score: {:.4f}\".format(sentences[i], sentences[j], score))" | |
| ], | |
| "execution_count": 2, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "100%|██████████| 245M/245M [00:08<00:00, 27.9MB/s]\n" | |
| ], | |
| "name": "stderr" | |
| }, | |
| { | |
| "output_type": "stream", | |
| "text": [ | |
| "I am thinking positively about the future. \t\t I am being optimistic about the future. \t\t Score: 0.8899\n", | |
| "Don't aggrandize what genuinely transpired. \t\t Don't overrate what really happened. \t\t Score: 0.8064\n", | |
| "This picture is truly beautiful. \t\t This picture is genuinely comely. \t\t Score: 0.6323\n", | |
| "This picture is genuinely comely. \t\t I am thinking positively about the future. \t\t Score: 0.4090\n", | |
| "This picture is genuinely comely. \t\t I am being optimistic about the future. \t\t Score: 0.4048\n", | |
| "This picture is truly beautiful. \t\t I am being optimistic about the future. \t\t Score: 0.3942\n", | |
| "This picture is truly beautiful. \t\t I am thinking positively about the future. \t\t Score: 0.3844\n", | |
| "Don't aggrandize what genuinely transpired. \t\t I am thinking positively about the future. \t\t Score: 0.3574\n", | |
| "Don't aggrandize what genuinely transpired. \t\t I am being optimistic about the future. \t\t Score: 0.3456\n", | |
| "Don't overrate what really happened. \t\t I am thinking positively about the future. \t\t Score: 0.3440\n", | |
| "Don't overrate what really happened. \t\t I am being optimistic about the future. \t\t Score: 0.3439\n", | |
| "Don't aggrandize what genuinely transpired. \t\t This picture is genuinely comely. \t\t Score: 0.3308\n", | |
| "Don't overrate what really happened. \t\t This picture is genuinely comely. \t\t Score: 0.2427\n", | |
| "This picture is truly beautiful. \t\t Don't overrate what really happened. \t\t Score: 0.2281\n" | |
| ], | |
| "name": "stdout" | |
| } | |
| ] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment