Created
August 5, 2024 12:08
-
-
Save mikk-c/533380efd8ca05149f3bebf6e5c5f6f0 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "0351f420-48f4-4b80-9187-eb4b8bc149b4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import networkx as nx\n", | |
| "from gensim.models import Word2Vec\n", | |
| "from sklearn.manifold import TSNE\n", | |
| "from sklearn.cluster import KMeans\n", | |
| "from sklearn.metrics import normalized_mutual_info_score" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "16f3caf7-48bc-406f-8833-17a24debbbee", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "H = nx.read_edgelist(\"1/data.txt\", create_using = nx.Graph(), delimiter = \"\\t\", nodetype = int)\n", | |
| "G = nx.Graph()\n", | |
| "G.add_nodes_from(sorted(H.nodes))\n", | |
| "G.add_edges_from(H.edges)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "715e1fed-4e9f-468e-97bd-97761f9a772a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([[ 12.677454 , -21.878696 ],\n", | |
| " [ 6.382581 , -7.3105035],\n", | |
| " [ 6.96561 , -14.762308 ],\n", | |
| " ...,\n", | |
| " [ 38.403862 , 22.484592 ],\n", | |
| " [ 21.083256 , -28.476921 ],\n", | |
| " [-36.36369 , -10.843703 ]], dtype=float32)" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "rndwalks = list(nx.generate_random_paths(G, 10000, path_length = 6))\n", | |
| "model = Word2Vec(sentences = rndwalks, vector_size = 32, min_count = 1, workers = 8)\n", | |
| "\n", | |
| "nodemap = [None] * len(G.nodes)\n", | |
| "for k in model.wv.key_to_index:\n", | |
| " nodemap[k] = model.wv.key_to_index[k]\n", | |
| "\n", | |
| "reducer = TSNE(n_components = 2, init = \"pca\")\n", | |
| "embeddings = reducer.fit_transform(model.wv.vectors[nodemap])\n", | |
| "\n", | |
| "embeddings" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "9b127f46-9660-44ee-a3de-9a3fb6077c92", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([2, 2, 2, 0, 1, 2, 3, 2, 0, 3, 2, 3, 1, 1, 2, 2, 3, 2, 2, 1, 1, 1,\n", | |
| " 3, 1, 1, 2, 0, 1, 2, 1, 2, 3, 2, 2, 3, 2, 2, 3, 0, 2, 1, 3, 3, 1,\n", | |
| " 0, 3, 3, 1, 2, 3, 0, 3, 0, 2, 3, 3, 0, 0, 3, 3, 0, 1, 0, 0, 2, 1,\n", | |
| " 1, 3, 0, 2, 0, 1, 0, 3, 1, 2, 2, 2, 0, 3, 3, 0, 2, 0, 3, 3, 1, 1,\n", | |
| " 3, 3, 3, 1, 2, 0, 3, 3, 2, 0, 2, 1, 2, 1, 0, 3, 2, 1, 1, 1, 3, 0,\n", | |
| " 2, 1, 1, 0, 2, 3, 0, 0, 0, 3, 2, 3, 2, 0, 2, 3, 3, 0, 0, 0, 2, 2,\n", | |
| " 1, 0, 0, 1, 1, 1, 1, 2, 1, 2, 0, 2, 2, 0, 1, 1, 3, 0, 3, 2, 3, 2,\n", | |
| " 0, 1, 1, 3, 0, 1, 2, 3, 3, 1, 3, 2, 2, 1, 3, 3, 2, 3, 3, 2, 3, 2,\n", | |
| " 2, 3, 1, 2, 3, 2, 1, 3, 3, 2, 2, 2, 3, 2, 3, 1, 3, 0, 0, 3, 3, 0,\n", | |
| " 2, 1, 3, 3, 3, 2, 0, 1, 2, 2, 0, 3, 2, 1, 1, 2, 2, 1, 3, 3, 2, 3,\n", | |
| " 2, 1, 3, 2, 1, 1, 3, 0, 1, 0, 0, 1, 2, 1, 2, 1, 1, 3, 0, 1, 3, 1,\n", | |
| " 1, 2, 3, 3, 1, 2, 2, 2, 0, 2, 0, 3, 0, 0, 0, 0, 0, 3, 0, 2, 1, 1,\n", | |
| " 3, 3, 1, 1, 3, 0, 0, 3, 0, 1, 1, 1, 2, 1, 1, 1, 2, 3, 2, 1, 1, 3,\n", | |
| " 3, 3, 2, 3, 1, 0, 0, 1, 0, 3, 2, 0, 3, 3, 1, 2, 3, 3, 1, 2, 2, 0,\n", | |
| " 3, 0, 1, 0, 3, 3, 0, 0, 1, 3, 1, 0, 3, 2, 2, 2, 1, 3, 1, 0, 0, 2,\n", | |
| " 1, 1, 2, 1, 3, 0, 1, 2, 1, 0, 1, 1, 0, 1, 2, 1, 1, 1, 0, 3, 2, 0,\n", | |
| " 3, 2, 3, 2, 1, 1, 2, 1, 0, 2, 3, 0, 1, 3, 0, 2, 1, 2, 2, 2, 0, 2,\n", | |
| " 1, 1, 3, 3, 0, 3, 2, 3, 3, 1, 2, 3, 2, 2, 2, 1, 2, 3, 0, 2, 3, 2,\n", | |
| " 1, 0, 1, 0, 1, 3, 1, 1, 3, 0, 2, 0, 0, 2, 0, 2, 1, 0, 0, 1, 3, 3,\n", | |
| " 0, 1, 1, 3, 0, 1, 0, 2, 0, 0, 3, 1, 0, 2, 3, 3, 1, 3, 0, 0, 1, 3,\n", | |
| " 0, 0, 2, 0, 1, 1, 2, 3, 3, 2, 2, 0, 2, 1, 3, 3, 0, 3, 1, 2, 3, 3,\n", | |
| " 0, 1, 0, 2, 0, 0, 3, 1, 3, 2, 1, 1, 1, 0, 1, 2, 2, 0, 0, 0, 3, 0,\n", | |
| " 1, 1, 2, 0, 2, 0, 1, 3, 1, 3, 1, 3, 0, 0, 0, 3, 0, 1, 2, 1, 0, 3,\n", | |
| " 2, 3, 2, 0, 1, 2, 1, 3, 3, 2, 2, 0, 0, 3, 2, 2, 1, 2, 1, 3, 2, 0,\n", | |
| " 1, 3, 1, 1, 0, 2, 2, 1, 2, 1, 3, 0, 1, 2, 3, 1, 0, 3, 0, 3, 3, 2,\n", | |
| " 3, 0, 0, 1, 3, 3, 0, 2, 0, 1, 0, 0, 3, 1, 2, 1, 3, 0, 1, 1, 2, 2,\n", | |
| " 2, 1, 0, 2, 0, 3, 2, 1, 0, 3, 1, 1, 2, 3, 0, 0, 0, 3, 0, 3, 0, 2,\n", | |
| " 3, 2, 2, 2, 1, 3, 1, 3, 0, 3, 1, 3, 2, 2, 0, 0, 3, 1, 1, 1, 1, 2,\n", | |
| " 0, 2, 3, 0, 1, 1, 1, 0, 2, 1, 2, 2, 2, 0, 3, 0, 2, 1, 1, 3, 0, 0,\n", | |
| " 3, 3, 1, 3, 1, 1, 1, 3, 3, 2, 1, 0, 0, 1, 3, 0, 1, 3, 0, 2, 3, 1,\n", | |
| " 2, 0, 1, 3, 0, 0, 0, 1, 2, 3, 0, 1, 1, 1, 2, 2, 1, 2, 1, 3, 0, 3,\n", | |
| " 1, 2, 0, 0, 0, 3, 2, 2, 3, 0, 1, 1, 2, 1, 1, 0, 0, 1, 1, 2, 0, 0,\n", | |
| " 2, 2, 2, 3, 2, 0, 2, 1, 2, 1, 3, 1, 2, 2, 1, 2, 1, 1, 1, 3, 3, 2,\n", | |
| " 1, 2, 0, 2, 3, 1, 1, 3, 0, 3, 1, 2, 1, 3, 2, 3, 0, 0, 0, 3, 2, 2,\n", | |
| " 2, 1, 1, 0, 1, 1, 2, 0, 1, 2, 1, 1, 0, 0, 2, 1, 2, 3, 0, 3, 1, 3,\n", | |
| " 1, 2, 3, 2, 1, 1, 2, 3, 3, 2, 0, 3, 1, 0, 0, 3, 1, 2, 1, 1, 3, 1,\n", | |
| " 1, 3, 1, 1, 2, 1, 3, 0, 1, 3, 3, 0, 2, 3, 0, 1, 0, 3, 0, 1, 3, 0,\n", | |
| " 3, 2, 1, 2, 2, 1, 2, 3, 2, 0, 2, 3, 1, 3, 1, 2, 0, 1, 0, 3, 3, 1,\n", | |
| " 0, 1, 3, 1, 2, 1, 3, 2, 1, 1, 0, 2, 1, 1, 3, 2, 3, 1, 0, 2, 3, 3,\n", | |
| " 1, 2, 1, 1, 2, 3, 1, 1, 3, 0, 3, 0, 3, 1, 1, 1, 2, 0, 1, 0, 0, 3,\n", | |
| " 1, 1, 1, 1, 1, 2, 1, 1, 3, 2, 2, 1, 3, 2, 2, 1, 2, 1, 1, 0, 0, 2,\n", | |
| " 2, 0, 1, 0, 0, 1, 1, 2, 0, 3, 1, 0, 0, 1, 2, 2, 3, 0, 1, 1, 0, 1,\n", | |
| " 3, 3, 1, 1, 0, 3, 3, 1, 1, 2, 2, 3, 2, 0, 2, 3, 0, 2, 2, 1, 3, 2,\n", | |
| " 0, 3, 3, 3, 1, 0, 1, 2, 3, 2, 2, 2, 1, 2, 1, 2, 1, 2, 0, 3, 3, 3,\n", | |
| " 0, 1, 1, 2, 1, 1, 2, 2, 1, 3, 2, 0, 1, 0, 3, 0, 3, 2, 1, 1, 0, 1,\n", | |
| " 0, 1, 1, 1, 3, 0, 0, 0, 2, 1], dtype=int32)" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "reducer = KMeans(n_clusters = 4)\n", | |
| "clusters = reducer.fit(embeddings).labels_\n", | |
| "\n", | |
| "clusters" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "f40c6d77-cb8c-4473-ad77-bc8103fe4622", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.9352377213494278" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "nodes = list(G.nodes)\n", | |
| "\n", | |
| "ground_truth = {}\n", | |
| "with open(\"1/nodes.txt\", 'r') as f:\n", | |
| " for line in f:\n", | |
| " fields = line.strip().split('\\t')\n", | |
| " ground_truth[int(fields[0])] = int(fields[1])\n", | |
| "\n", | |
| "ground_truth = [ground_truth[i] for i in nodes]\n", | |
| "\n", | |
| "normalized_mutual_info_score(clusters, ground_truth)" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.3" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment