Created
August 5, 2024 11:50
-
-
Save tazarov/a5f0d1165234d0a181b139376184b40b to your computer and use it in GitHub Desktop.
Reproduces a bug in HNSW when replace_delete=True
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "# Try to reproduce the bug by replay of the state machine", | |
| "id": "85a95571ea8c3598" | |
| }, | |
| { | |
| "cell_type": "code", | |
| "id": "initial_id", | |
| "metadata": { | |
| "collapsed": true, | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:35.711003Z", | |
| "start_time": "2024-08-05T08:21:35.642637Z" | |
| } | |
| }, | |
| "source": [ | |
| "import random\n", | |
| "\n", | |
| "import hnswlib\n", | |
| "import numpy as np\n", | |
| "\n", | |
| "vectors = np.random.rand(30, 1536).astype(np.float32)\n", | |
| "ids = np.arange(30)\n", | |
| "index = hnswlib.Index(space='l2', dim=1536)\n", | |
| "index.init_index(allow_replace_deleted=True,persistence_location=\"hnsw_label_bug/\",is_persistent_index=True,max_elements=501)\n" | |
| ], | |
| "outputs": [], | |
| "execution_count": 1 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:36.465732Z", | |
| "start_time": "2024-08-05T08:21:36.463921Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add ID 0\n", | |
| "index.add_items(data=[vectors[0]],ids=[ids[0]],replace_deleted=True)" | |
| ], | |
| "id": "4d7c85267ecc743f", | |
| "outputs": [], | |
| "execution_count": 2 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:37.269094Z", | |
| "start_time": "2024-08-05T08:21:37.266903Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "#delete 0\n", | |
| "index.mark_deleted(ids[0])" | |
| ], | |
| "id": "40c3a1aa79f6b39b", | |
| "outputs": [], | |
| "execution_count": 3 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:37.977276Z", | |
| "start_time": "2024-08-05T08:21:37.975284Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# add 10 vectors\n", | |
| "index.add_items(data=vectors[1:11],ids=ids[1:11],replace_deleted=True)" | |
| ], | |
| "id": "7823b1f0b9a2d96b", | |
| "outputs": [], | |
| "execution_count": 4 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:38.550548Z", | |
| "start_time": "2024-08-05T08:21:38.548687Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# delete ID 1\n", | |
| "index.mark_deleted(ids[1])" | |
| ], | |
| "id": "958be64f115b85cc", | |
| "outputs": [], | |
| "execution_count": 5 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:39.080032Z", | |
| "start_time": "2024-08-05T08:21:39.078103Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# upsert 11 and 12\n", | |
| "index.add_items(data=vectors[11:13],ids=ids[11:13],replace_deleted=True)" | |
| ], | |
| "id": "1df6fba54889425e", | |
| "outputs": [], | |
| "execution_count": 6 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:39.611261Z", | |
| "start_time": "2024-08-05T08:21:39.609267Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add 11 thru 16\n", | |
| "index.add_items(data=vectors[11:17],ids=ids[11:17],replace_deleted=True)" | |
| ], | |
| "id": "5ed27eaa02dc651d", | |
| "outputs": [], | |
| "execution_count": 7 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:40.282052Z", | |
| "start_time": "2024-08-05T08:21:40.280009Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add Emmbeddings 17, 18, and 1\n", | |
| "index.add_items(data=vectors[17:20],ids=ids[17:20],replace_deleted=True)" | |
| ], | |
| "id": "d5966e5d96af17c2", | |
| "outputs": [], | |
| "execution_count": 8 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:41.210879Z", | |
| "start_time": "2024-08-05T08:21:41.208631Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add 20\n", | |
| "index.add_items(data=[vectors[20]],ids=[ids[20]],replace_deleted=True)" | |
| ], | |
| "id": "ea183ec3dfaeaeb4", | |
| "outputs": [], | |
| "execution_count": 9 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:41.764222Z", | |
| "start_time": "2024-08-05T08:21:41.762388Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add 21 and 22\n", | |
| "index.add_items(data=vectors[21:23],ids=ids[21:23],replace_deleted=True)" | |
| ], | |
| "id": "575d4f7c51b13370", | |
| "outputs": [], | |
| "execution_count": 10 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:42.347645Z", | |
| "start_time": "2024-08-05T08:21:42.345325Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add 16\n", | |
| "index.add_items(data=[vectors[16]],ids=[ids[16]],replace_deleted=True)" | |
| ], | |
| "id": "fa60cb75d8c5dd67", | |
| "outputs": [], | |
| "execution_count": 11 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:43.072866Z", | |
| "start_time": "2024-08-05T08:21:43.070776Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add 23\n", | |
| "index.add_items(data=[vectors[23]],ids=[ids[23]],replace_deleted=True)" | |
| ], | |
| "id": "84ffe2f36ba11edf", | |
| "outputs": [], | |
| "execution_count": 12 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:43.809598Z", | |
| "start_time": "2024-08-05T08:21:43.807569Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Add 24 thru 27\n", | |
| "index.add_items(data=vectors[24:28],ids=ids[24:28],replace_deleted=True)" | |
| ], | |
| "id": "ec1951a2a863a670", | |
| "outputs": [], | |
| "execution_count": 13 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:45.093522Z", | |
| "start_time": "2024-08-05T08:21:45.091329Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Delete 21 thru 26\n", | |
| "# index.mark_deleted(ids[21:27])\n", | |
| "for i in range(21,27):\n", | |
| " index.mark_deleted(ids[i])" | |
| ], | |
| "id": "ae44d4e22dcc195f", | |
| "outputs": [], | |
| "execution_count": 14 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:21:46.944606Z", | |
| "start_time": "2024-08-05T08:21:46.940604Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Upsert 20, 26 and 27\n", | |
| "index.add_items(data=[vectors[20],vectors[26],vectors[27]],ids=[ids[20],ids[26],ids[27]],replace_deleted=True)" | |
| ], | |
| "id": "d2906a7759ffaa75", | |
| "outputs": [], | |
| "execution_count": 15 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:32:56.414844Z", | |
| "start_time": "2024-08-05T08:32:56.412628Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "# Delete 19, 20, 27\n", | |
| "# index.mark_deleted([ids[19],ids[20],ids[27]])\n", | |
| "\n", | |
| "for i in [19,20,27]:\n", | |
| " print(i)\n", | |
| " try:\n", | |
| " index.mark_deleted(ids[i])\n", | |
| " except:\n", | |
| " print(\"Error\")" | |
| ], | |
| "id": "6403a57f9e98d9cb", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "19\n", | |
| "Error\n", | |
| "20\n", | |
| "Error\n", | |
| "27\n", | |
| "Error\n" | |
| ] | |
| } | |
| ], | |
| "execution_count": 25 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:47:22.725534Z", | |
| "start_time": "2024-08-05T08:47:22.723324Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "\n", | |
| "print(index.element_count)\n", | |
| "res=index.knn_query(vectors[19], k=10)\n", | |
| "print(res[0][0])" | |
| ], | |
| "id": "33820ccb85967435", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "26\n", | |
| "[12 18 3 6 20 4 26 13 14 27]\n" | |
| ] | |
| } | |
| ], | |
| "execution_count": 30 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:31:34.668700Z", | |
| "start_time": "2024-08-05T08:31:34.666993Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": "print(ids[20])", | |
| "id": "b356e8f82160798c", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "20\n" | |
| ] | |
| } | |
| ], | |
| "execution_count": 21 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T08:47:30.479579Z", | |
| "start_time": "2024-08-05T08:47:30.477757Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": "print(index.get_ids_list())", | |
| "id": "e76e1732806ff74e", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "[26, 27, 23, 22, 21, 20, 19, 18, 17, 16, 15, 14, 13, 12, 11, 10, 9, 8, 7, 6, 5, 4, 3, 2]\n" | |
| ] | |
| } | |
| ], | |
| "execution_count": 31 | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "markdown", | |
| "source": "# Reproduce the bug", | |
| "id": "53a3523613d92529" | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T11:40:31.416876Z", | |
| "start_time": "2024-08-05T11:40:31.092098Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": [ | |
| "import hnswlib\n", | |
| "import numpy as np\n", | |
| "import random\n", | |
| "delete_ids = []\n", | |
| "cycles = 2000\n", | |
| "vectors = np.random.uniform(-1,1,(cycles*6, 1536)).astype(np.float32)\n", | |
| "index = hnswlib.Index(space='l2', dim=1536)\n", | |
| "index.init_index(allow_replace_deleted=True, max_elements=501)\n", | |
| "index.set_num_threads(1)\n", | |
| "for cid in range(cycles):\n", | |
| " if random.random()>0.7 and len(index.get_ids_list()) >0: # we randomly delete ~30% of the time\n", | |
| " ids_to_delete = random.sample(index.get_ids_list(), random.randint(1, len(index.get_ids_list())))\n", | |
| " for id in ids_to_delete:\n", | |
| " if id in delete_ids: # weirdly we also have to check that get_ids_list doesn't return deleted ids\n", | |
| " continue\n", | |
| " index.mark_deleted(id)\n", | |
| " delete_ids.append(id)\n", | |
| " else:\n", | |
| " start_id = index.element_count #this is not idea but does the job.\n", | |
| " num_vectors_to_add = random.randint(1, 6) # \n", | |
| " va = vectors[start_id:start_id+num_vectors_to_add]\n", | |
| " index.add_items(data=va,ids=range(start_id,start_id+len(va)),num_threads=1,replace_deleted=True)\n", | |
| " if index.element_count>0:\n", | |
| " try:\n", | |
| " res=index.knn_query(vectors[random.sample(index.get_ids_list(),1)[0]],num_threads=1, k=min(len(index.get_ids_list()),10))\n", | |
| " if len(res[0][0])>0:\n", | |
| " for i in res[0][0]:\n", | |
| " if i in delete_ids:\n", | |
| " raise Exception(f\"Error in cycle {cid}\",f\"ID: {i}\", f\"Deleted IDs: {delete_ids}\", f\"Query Response Labels: {res[0][0].tolist()}\",f\"All index IDs: {index.get_ids_list()}\")\n", | |
| " except Exception as e:\n", | |
| " if \"Cannot return\" not in str(e):\n", | |
| " print(str(e))\n", | |
| " raise e\n", | |
| " " | |
| ], | |
| "id": "65115ca986be7d34", | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')\n" | |
| ] | |
| }, | |
| { | |
| "ename": "Exception", | |
| "evalue": "('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001B[0;31m---------------------------------------------------------------------------\u001B[0m", | |
| "\u001B[0;31mException\u001B[0m Traceback (most recent call last)", | |
| "Cell \u001B[0;32mIn[1], line 33\u001B[0m\n\u001B[1;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCannot return\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(e):\n\u001B[1;32m 32\u001B[0m \u001B[38;5;28mprint\u001B[39m(\u001B[38;5;28mstr\u001B[39m(e))\n\u001B[0;32m---> 33\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m e\n", | |
| "Cell \u001B[0;32mIn[1], line 29\u001B[0m\n\u001B[1;32m 27\u001B[0m \u001B[38;5;28;01mfor\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m res[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;241m0\u001B[39m]:\n\u001B[1;32m 28\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m i \u001B[38;5;129;01min\u001B[39;00m delete_ids:\n\u001B[0;32m---> 29\u001B[0m \u001B[38;5;28;01mraise\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m(\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mError in cycle \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mcid\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m,\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mID: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mi\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mDeleted IDs: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mdelete_ids\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m, \u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mQuery Response Labels: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mres[\u001B[38;5;241m0\u001B[39m][\u001B[38;5;241m0\u001B[39m]\u001B[38;5;241m.\u001B[39mtolist()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m,\u001B[38;5;124mf\u001B[39m\u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mAll index IDs: \u001B[39m\u001B[38;5;132;01m{\u001B[39;00mindex\u001B[38;5;241m.\u001B[39mget_ids_list()\u001B[38;5;132;01m}\u001B[39;00m\u001B[38;5;124m\"\u001B[39m)\n\u001B[1;32m 30\u001B[0m \u001B[38;5;28;01mexcept\u001B[39;00m \u001B[38;5;167;01mException\u001B[39;00m \u001B[38;5;28;01mas\u001B[39;00m e:\n\u001B[1;32m 31\u001B[0m \u001B[38;5;28;01mif\u001B[39;00m \u001B[38;5;124m\"\u001B[39m\u001B[38;5;124mCannot return\u001B[39m\u001B[38;5;124m\"\u001B[39m \u001B[38;5;129;01mnot\u001B[39;00m \u001B[38;5;129;01min\u001B[39;00m \u001B[38;5;28mstr\u001B[39m(e):\n", | |
| "\u001B[0;31mException\u001B[0m: ('Error in cycle 5', 'ID: 5', 'Deleted IDs: [1, 2, 0, 6, 5, 7]', 'Query Response Labels: [3, 9, 4, 5, 8, 10]', 'All index IDs: [10, 7, 9, 4, 8, 3]')" | |
| ] | |
| } | |
| ], | |
| "execution_count": 1 | |
| }, | |
| { | |
| "metadata": { | |
| "ExecuteTime": { | |
| "end_time": "2024-08-05T11:16:16.752248Z", | |
| "start_time": "2024-08-05T11:16:16.750843Z" | |
| } | |
| }, | |
| "cell_type": "code", | |
| "source": "", | |
| "id": "182388f341cbd54a", | |
| "outputs": [], | |
| "execution_count": 1 | |
| }, | |
| { | |
| "metadata": {}, | |
| "cell_type": "code", | |
| "outputs": [], | |
| "execution_count": null, | |
| "source": "", | |
| "id": "bcf7cbd3996dd5b9" | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 2 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython2", | |
| "version": "2.7.6" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment