Created
August 5, 2024 12:10
-
-
Save mikk-c/8d811233e2d40f1a4db8182eaae5d74b to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "62b3af7e-9a21-40e4-bfba-da5cc717048d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "import networkx as nx\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "from gensim.models import Word2Vec\n", | |
| "from sklearn.manifold import TSNE\n", | |
| "from sklearn.cluster import KMeans\n", | |
| "from sklearn.metrics import normalized_mutual_info_score\n", | |
| "from sklearn.metrics import roc_curve, auc" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "a23491b6-ba8a-4340-9ec7-add6516be5eb", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "H = nx.read_edgelist(\"1/data.txt\", create_using = nx.Graph(), delimiter = \"\\t\", nodetype = int)\n", | |
| "G = nx.Graph()\n", | |
| "G.add_nodes_from(sorted(H.nodes))\n", | |
| "G.add_edges_from(H.edges)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "6bb19658-4972-432c-a01f-d32e7d4cfda5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "rndwalks = list(nx.generate_random_paths(G, 10000, path_length = 6))\n", | |
| "model = Word2Vec(sentences = rndwalks, vector_size = 32, min_count = 1, workers = 8)\n", | |
| "\n", | |
| "nodemap = [None] * len(G.nodes)\n", | |
| "for k in model.wv.key_to_index:\n", | |
| " nodemap[k] = model.wv.key_to_index[k]\n", | |
| "\n", | |
| "reducer = TSNE(n_components = 2, init = \"pca\")\n", | |
| "embeddings = reducer.fit_transform(model.wv.vectors[nodemap])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "ba4ca500-c6d2-48b2-a7c2-9823c8bb3079", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "new_edges = set()\n", | |
| "with open(\"4/newedges.txt\", 'r') as f:\n", | |
| " for line in f:\n", | |
| " fields = line.strip().split('\\t')\n", | |
| " new_edges.add((int(fields[0]), int(fields[1])))\n", | |
| "\n", | |
| "nodes = list(G.nodes)\n", | |
| "old_edges = set(G.edges)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "2a2b274b-da77-4ba3-a57c-a3432f927bee", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>n1</th>\n", | |
| " <th>n2</th>\n", | |
| " <th>score</th>\n", | |
| " <th>is_old</th>\n", | |
| " <th>is_new</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>279.262146</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>261.757599</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>3</td>\n", | |
| " <td>-36.844929</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>0</td>\n", | |
| " <td>4</td>\n", | |
| " <td>31.864422</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>0</td>\n", | |
| " <td>5</td>\n", | |
| " <td>195.267639</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>499490</th>\n", | |
| " <td>995</td>\n", | |
| " <td>996</td>\n", | |
| " <td>1613.641479</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>499492</th>\n", | |
| " <td>995</td>\n", | |
| " <td>998</td>\n", | |
| " <td>-531.797241</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>499493</th>\n", | |
| " <td>995</td>\n", | |
| " <td>999</td>\n", | |
| " <td>-73.205505</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>499495</th>\n", | |
| " <td>996</td>\n", | |
| " <td>998</td>\n", | |
| " <td>-539.313416</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>499497</th>\n", | |
| " <td>997</td>\n", | |
| " <td>998</td>\n", | |
| " <td>-532.058105</td>\n", | |
| " <td>False</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>495404 rows × 5 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " n1 n2 score is_old is_new\n", | |
| "0 0 1 279.262146 False False\n", | |
| "1 0 2 261.757599 False False\n", | |
| "2 0 3 -36.844929 False False\n", | |
| "3 0 4 31.864422 False False\n", | |
| "4 0 5 195.267639 False False\n", | |
| "... ... ... ... ... ...\n", | |
| "499490 995 996 1613.641479 False False\n", | |
| "499492 995 998 -531.797241 False False\n", | |
| "499493 995 999 -73.205505 False False\n", | |
| "499495 996 998 -539.313416 False False\n", | |
| "499497 997 998 -532.058105 False False\n", | |
| "\n", | |
| "[495404 rows x 5 columns]" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "linkpred = []\n", | |
| "for i in range(len(nodes) - 1):\n", | |
| " for j in range(i + 1, len(nodes)):\n", | |
| " is_old = (nodes[i], nodes[j]) in old_edges or (nodes[j], nodes[i]) in old_edges\n", | |
| " is_new = (nodes[i], nodes[j]) in new_edges or (nodes[j], nodes[i]) in new_edges\n", | |
| " linkpred.append((nodes[i], nodes[j], embeddings[i].T.dot(embeddings[j]), is_old, is_new))\n", | |
| "\n", | |
| "linkpred = pd.DataFrame(data = linkpred, columns = (\"n1\", \"n2\", \"score\", \"is_old\", \"is_new\"))\n", | |
| "linkpred = linkpred[~linkpred[\"is_old\"]]\n", | |
| "\n", | |
| "linkpred" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "b3321c52-4b22-496e-b72b-e5383c5f5850", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>src</th>\n", | |
| " <th>trg</th>\n", | |
| " <th>score</th>\n", | |
| " <th>is_new</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>0</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>0</td>\n", | |
| " <td>2</td>\n", | |
| " <td>25</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>0</td>\n", | |
| " <td>3</td>\n", | |
| " <td>25</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>0</td>\n", | |
| " <td>4</td>\n", | |
| " <td>25</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>0</td>\n", | |
| " <td>5</td>\n", | |
| " <td>25</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>495399</th>\n", | |
| " <td>995</td>\n", | |
| " <td>996</td>\n", | |
| " <td>4891</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>495400</th>\n", | |
| " <td>995</td>\n", | |
| " <td>998</td>\n", | |
| " <td>6700</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>495401</th>\n", | |
| " <td>995</td>\n", | |
| " <td>999</td>\n", | |
| " <td>17822</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>495402</th>\n", | |
| " <td>996</td>\n", | |
| " <td>998</td>\n", | |
| " <td>7300</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>495403</th>\n", | |
| " <td>997</td>\n", | |
| " <td>998</td>\n", | |
| " <td>8000</td>\n", | |
| " <td>False</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>495404 rows × 4 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " src trg score is_new\n", | |
| "0 0 1 25 False\n", | |
| "1 0 2 25 False\n", | |
| "2 0 3 25 False\n", | |
| "3 0 4 25 False\n", | |
| "4 0 5 25 False\n", | |
| "... ... ... ... ...\n", | |
| "495399 995 996 4891 False\n", | |
| "495400 995 998 6700 False\n", | |
| "495401 995 999 17822 False\n", | |
| "495402 996 998 7300 False\n", | |
| "495403 997 998 8000 False\n", | |
| "\n", | |
| "[495404 rows x 4 columns]" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "pa = pd.DataFrame(list(nx.preferential_attachment(G)), columns = (\"src\", \"trg\", \"score\"))\n", | |
| "ja = pd.DataFrame(list(nx.jaccard_coefficient(G)), columns = (\"src\", \"trg\", \"score\"))\n", | |
| "aa = pd.DataFrame(list(nx.adamic_adar_index(G)), columns = (\"src\", \"trg\", \"score\"))\n", | |
| "ra = pd.DataFrame(list(nx.resource_allocation_index(G)), columns = (\"src\", \"trg\", \"score\"))\n", | |
| "\n", | |
| "pa[\"is_new\"] = pa.apply(lambda x: (x[\"src\"], x[\"trg\"]) in new_edges or (x[\"trg\"], x[\"src\"]) in new_edges, axis = 1)\n", | |
| "ja[\"is_new\"] = ja.apply(lambda x: (x[\"src\"], x[\"trg\"]) in new_edges or (x[\"trg\"], x[\"src\"]) in new_edges, axis = 1)\n", | |
| "aa[\"is_new\"] = aa.apply(lambda x: (x[\"src\"], x[\"trg\"]) in new_edges or (x[\"trg\"], x[\"src\"]) in new_edges, axis = 1)\n", | |
| "ra[\"is_new\"] = ra.apply(lambda x: (x[\"src\"], x[\"trg\"]) in new_edges or (x[\"trg\"], x[\"src\"]) in new_edges, axis = 1)\n", | |
| "\n", | |
| "pa" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "53905729-18f0-4493-aa1f-7fc973e32b51", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "N2V's AUC: 0.7979\n", | |
| "PA's AUC: 0.6697\n", | |
| "JA's AUC: 0.6396\n", | |
| "AA's AUC: 0.6661\n", | |
| "RA's AUC: 0.6657\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "fpr_tsne, tpr_tsne, thresholds = roc_curve(linkpred[\"is_new\"], linkpred[\"score\"])\n", | |
| "fpr_pa, tpr_pa, thresholds = roc_curve(pa[\"is_new\"], pa[\"score\"])\n", | |
| "fpr_ja, tpr_ja, thresholds = roc_curve(ja[\"is_new\"], ja[\"score\"])\n", | |
| "fpr_aa, tpr_aa, thresholds = roc_curve(aa[\"is_new\"], aa[\"score\"])\n", | |
| "fpr_ra, tpr_ra, thresholds = roc_curve(ra[\"is_new\"], ra[\"score\"])\n", | |
| "\n", | |
| "print(\"N2V's AUC: %1.4f\" % auc(fpr_tsne, tpr_tsne))\n", | |
| "print(\"PA's AUC: %1.4f\" % auc(fpr_pa, tpr_pa))\n", | |
| "print(\"JA's AUC: %1.4f\" % auc(fpr_ja, tpr_ja))\n", | |
| "print(\"AA's AUC: %1.4f\" % auc(fpr_aa, tpr_aa))\n", | |
| "print(\"RA's AUC: %1.4f\" % auc(fpr_ra, tpr_ra))" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment