@rayvoelker
Created March 26, 2024 17:52

SemanticSearchSwiftly-Extract.ipynb
{
"cells": [
{
"cell_type": "markdown",
"id": "b9b05581-d016-488d-95bf-4043fcf65e55",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"# Extract, Transform, Load (ETL) (cont.)\n",
"\n",
"Our goals:\n",
"\n",
"1. **Extract relevant** bibliographic **MARC record data** from the Sierra REST API using **`sierra-ils-utils`**\n",
"2. **Transform the relevant record data** using Hugging Face Transformers library and a pre-trained model\n",
"3. **Load embeddings into vector database** -- qdrant seems to be a popular choice, and has some nice features https://qdrant.tech/\n"
]
},
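{
"cell_type": "markdown",
"id": "aaaaaaaa-0000-4000-8000-000000000001",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"# A rough sketch of steps 2 and 3\n",
"\n",
"Before diving into step 1, here is a minimal, untested sketch of what Transform and Load could look like. It uses the `sentence-transformers` wrapper around Hugging Face models and the `qdrant-client` library; the model name, collection name, and vector size below are placeholder assumptions, not final choices.\n"
]
},
{
"cell_type": "code",
"execution_count": null,
"id": "aaaaaaaa-0000-4000-8000-000000000002",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# SKETCH ONLY -- model, collection name, and vector size are placeholders\n",
"# !pip install -U sentence-transformers qdrant-client\n",
"\n",
"from sentence_transformers import SentenceTransformer\n",
"from qdrant_client import QdrantClient\n",
"from qdrant_client.models import Distance, VectorParams, PointStruct\n",
"\n",
"model = SentenceTransformer('all-MiniLM-L6-v2') # placeholder pre-trained model\n",
"\n",
"client = QdrantClient(':memory:') # in-memory Qdrant, for illustration only\n",
"client.recreate_collection(\n",
"    collection_name='bibs', # placeholder collection name\n",
"    vectors_config=VectorParams(size=384, distance=Distance.COSINE) # 384 dims for MiniLM\n",
")\n",
"\n",
"texts = ['Title: Example | Subjects: Testing'] # stand-in for extracted_content rows\n",
"vectors = model.encode(texts) # 2. Transform: text -> embeddings\n",
"\n",
"client.upsert( # 3. Load: embeddings -> vector database\n",
"    collection_name='bibs',\n",
"    points=[\n",
"        PointStruct(id=i, vector=v.tolist(), payload={'text': t})\n",
"        for i, (v, t) in enumerate(zip(vectors, texts))\n",
"    ]\n",
")"
]
},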
{
"cell_type": "code",
"execution_count": 1,
"id": "367521ee-dd3e-47cf-ac7b-e7978d293120",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"%%capture\n",
"\n",
"# 1. **Extract relevant** bibliographic **MARC record data**\n",
"\n",
"# Extract using `sierra-ils-utils`\n",
"# ... start by setting up the Python library\n",
"\n",
"!pip install -U sierra-ils-utils\n",
"\n",
"import sierra_ils_utils\n",
"import json\n",
"\n",
"config_filename = '/home/jovyan/.config/sierra/api-test.json'\n",
"\n",
"with open(config_filename) as f:\n",
" config = json.load(f)\n",
" \n",
"sierra_api = sierra_ils_utils.SierraAPI(\n",
" sierra_api_base_url=config.get('sierra_api_base_url'),\n",
" sierra_api_key=config.get('sierra_api_key'),\n",
" sierra_api_secret=config.get('sierra_api_secret')\n",
")"
]
},
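{
"cell_type": "markdown",
"id": "aaaaaaaa-0000-4000-8000-000000000003",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"source": [
"For reference, the config file read above is expected to hold the three keys the code pulls out. A sketch of its shape -- the values here are placeholders, not real credentials or a confirmed URL format:\n",
"\n",
"```json\n",
"{\n",
"  \"sierra_api_base_url\": \"https://library.example.org/iii/sierra-api/v6/\",\n",
"  \"sierra_api_key\": \"<client key>\",\n",
"  \"sierra_api_secret\": \"<client secret>\"\n",
"}\n",
"```\n"
]
},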
{
"cell_type": "code",
"execution_count": 2,
"id": "71c97fee-3843-4bd0-8aff-6083e66e0143",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"\n",
"# Next, set up a place to stage or \"persist\" the extracted data\n",
"# ... for this, we'll use sqlite\n",
"\n",
"import sqlite3\n",
"\n",
"local_database_filename = 'semantic-search.db' # database file\n",
" \n",
"# Set the schema for our table\n",
"init_sql = \"\"\"\\\n",
"PRAGMA journal_mode=WAL; -- makes the database more efficient for writes\n",
"\n",
"CREATE TABLE IF NOT EXISTS bib_data (\n",
" id INTEGER PRIMARY KEY, -- the rowid\n",
" bib_id INTEGER, -- the bib record id (record num)\n",
" hash_json_content TEXT UNIQUE, -- the hash of the record\n",
" json_content TEXT, -- the json content\n",
" extracted_content TEXT, -- the exerpt of the json for our search\n",
" timestamp INTEGER DEFAULT (strftime('%s', 'now')) -- the update date, as Unix Epoch int\n",
");\n",
"\"\"\"\n",
"\n",
"with sqlite3.connect(local_database_filename) as con:\n",
" con.executescript(init_sql)\n",
" con.commit()"
]
},
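{
"cell_type": "code",
"execution_count": null,
"id": "aaaaaaaa-0000-4000-8000-000000000004",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# Quick illustration (throwaway in-memory db, made-up values): the UNIQUE\n",
"# constraint on hash_json_content plus INSERT OR IGNORE (set up in the next\n",
"# cell) makes re-running the extract idempotent -- duplicates are skipped\n",
"\n",
"with sqlite3.connect(':memory:') as demo_con:\n",
"    demo_con.executescript(init_sql)\n",
"    for _ in range(2): # attempt to insert the same row twice\n",
"        demo_con.execute(\n",
"            \"INSERT OR IGNORE INTO bib_data (bib_id, hash_json_content) VALUES (?, ?)\",\n",
"            (1, 'same-hash')\n",
"        )\n",
"    print(demo_con.execute('SELECT COUNT(*) FROM bib_data').fetchone()) # -> (1,)"
]
},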
{
"cell_type": "code",
"execution_count": 3,
"id": "ea6a08b4-3ed2-4783-9cdf-e69095fc94f0",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"# some \"housekeeping\" / helper features\n",
"import hashlib\n",
"\n",
"# set up the sql statement for db inserts... \n",
"insert_sql = \"\"\"\\\n",
"INSERT OR IGNORE INTO bib_data (\n",
" bib_id,\n",
" hash_json_content, \n",
" json_content,\n",
" extracted_content\n",
")\n",
"VALUES (?,PY_HASH(?),?,?);\n",
"\"\"\"\n",
"\n",
"def python_hash(json_content):\n",
" \"\"\"\n",
" Compute the SHA-256 hash of a JSON string.\n",
" Intended to be used as a registered SQLite function.\n",
" :param json_content: A JSON string.\n",
" :return: The SHA-256 hash of the JSON content.\n",
" \"\"\"\n",
" # Assuming json_content is a JSON string; otherwise, you may need json.dumps() here.\n",
" return hashlib.sha256(json_content.encode()).hexdigest()"
]
},
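{
"cell_type": "code",
"execution_count": null,
"id": "aaaaaaaa-0000-4000-8000-000000000005",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# Sanity check (illustrative): the same input always yields the same digest,\n",
"# which is what lets the UNIQUE hash column deduplicate identical records\n",
"\n",
"assert python_hash('{\"a\": 1}') == python_hash('{\"a\": 1}')\n",
"print(python_hash('{\"a\": 1}')[:16]) # first 16 hex chars of the SHA-256 digest"
]
},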
{
"cell_type": "code",
"execution_count": 4,
"id": "70901749-cf0a-4be3-8a43-572472e94e52",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"from pymarc import Record\n",
"\n",
"def extract_data(record: Record) -> str:\n",
" \"\"\"\n",
" Returns a JSON string of the extracted portions from a pymarc Record object.\n",
" \n",
" :param record: a pymarc Record object (MARC record data).\n",
" :return: str -- the serialized json as a string.\n",
" \"\"\"\n",
"\n",
" extracted_data = {\n",
" 'title': record.title,\n",
" 'authors': [record.author],\n",
" 'subjects': [field.value() for field in record.get_fields('650', '651')],\n",
" 'genres': [field.value() for field in record.get_fields('655')],\n",
" 'publication_info': {\n",
" 'place_of_publication': record['260'].get('a') if record.get('260') else None,\n",
" 'publisher': record['260'].get('b') if record.get('260') else None,\n",
" 'date_of_publication': record['260'].get('c') if record.get('260') else None,\n",
" },\n",
" 'physical_description': record['300'].get('a') if record.get('300') else None,\n",
" 'notes': [field.value() for field in record.get_fields('500', '504', '505', '520')],\n",
" 'identifiers': {\n",
" 'isbn': record.isbn,\n",
" 'issn': record['022'].get('a') if record.get('022') else None,\n",
" },\n",
" 'classification_numbers': {\n",
" 'dewey_decimal': record['082'].get('a') if record.get('082') else None,\n",
" 'local_dewey_decimal': record['092'].get('a') if record.get('092') else None,\n",
" 'local_free_text': record['099'].get('a') if record.get('099') else None,\n",
" },\n",
" 'additional_details': {\n",
" 'geographical_classification': record['052'].get('a') if record.get('052') else None,\n",
" 'fixed_length_data_elements': record['008'].data if record.get('008') else None,\n",
" 'additional_material_characteristics': record['006'].data if record.get('006') else None,\n",
" 'physical_description_fixed_field': record['007'].data if record.get('007') else None,\n",
" }\n",
" }\n",
" \n",
" return json.dumps(extracted_data)"
]
},
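{
"cell_type": "code",
"execution_count": null,
"id": "aaaaaaaa-0000-4000-8000-000000000006",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# Quick demo of extract_data on a tiny hand-built record\n",
"# (assumes pymarc >= 5, which uses the Subfield namedtuple API)\n",
"\n",
"from pymarc import Field, Subfield\n",
"\n",
"demo_record = Record()\n",
"demo_record.add_field(\n",
"    Field(tag='245', indicators=['0', '0'],\n",
"          subfields=[Subfield(code='a', value='A Sample Title')])\n",
")\n",
"demo_record.add_field(\n",
"    Field(tag='650', indicators=[' ', '0'],\n",
"          subfields=[Subfield(code='a', value='Semantic searching.')])\n",
")\n",
"\n",
"print(extract_data(demo_record))"
]
},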
{
"cell_type": "code",
"execution_count": 5,
"id": "d22da365-dc5f-4b8c-b901-54b39eede649",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"# isn't it crazy that we spent so much time and effort in coming up with \n",
"# complex data formats like JSON, only to turn around and simply use spoken\n",
"# language with computers?\n",
"\n",
"def extract_data_narrative(record: Record) -> str:\n",
" \"\"\"\n",
" Returns a narrative description of the extracted portions from a pymarc Record object.\n",
" \n",
" :param record: a pymarc Record object (MARC record data).\n",
" :return: str -- the narratively described data.\n",
" \"\"\"\n",
" \n",
" # Assuming extract_data is a function that serializes the record to a JSON string\n",
" extracted_data = json.loads(extract_data(record)) # Replace with your method of extracting data\n",
"\n",
" # Convert the extracted data into a narrative\n",
" narrative_parts = []\n",
"\n",
" # Title\n",
" if extracted_data.get('title'):\n",
" narrative_parts.append(f\"Title: '{extracted_data['title']}'\")\n",
"\n",
" # Authors\n",
" if authors := extracted_data.get('authors'):\n",
" authors_text = \", \".join([author for author in authors if author])\n",
" if authors_text:\n",
" narrative_parts.append(f\"Authors: {authors_text}\")\n",
"\n",
" # Subjects\n",
" if subjects := extracted_data.get('subjects'):\n",
" subjects_text = \", \".join([subject for subject in subjects if subject])\n",
" if subjects_text:\n",
" narrative_parts.append(f\"Subjects: {subjects_text}\")\n",
"\n",
" # Genres\n",
" if genres := extracted_data.get('genres'):\n",
" genres_text = \", \".join([genre for genre in genres if genre])\n",
" if genres_text:\n",
" narrative_parts.append(f\"Genres: {genres_text}\")\n",
"\n",
" # Publication Info\n",
" pub_info = extracted_data.get('publication_info', {})\n",
" pub_details = [f\"{key.capitalize().replace('_', ' ')}: {value}\" for key, value in pub_info.items() if value]\n",
" if pub_details:\n",
" narrative_parts.append(\"Publication Info: \" + \", \".join(pub_details))\n",
"\n",
" # Physical Description\n",
" if description := extracted_data.get('physical_description'):\n",
" narrative_parts.append(f\"Physical Description: {description}\")\n",
"\n",
" # Notes\n",
" if notes := extracted_data.get('notes'):\n",
" notes_text = \" \".join([note for note in notes if note])\n",
" if notes_text:\n",
" narrative_parts.append(f\"Notes: {notes_text}\")\n",
"\n",
" # Identifiers\n",
" identifiers = extracted_data.get('identifiers', {})\n",
" if isbn := identifiers.get('isbn'):\n",
" narrative_parts.append(f\"ISBN: {isbn}\")\n",
" if issn := identifiers.get('issn'):\n",
" narrative_parts.append(f\"ISSN: {issn}\")\n",
"\n",
" # Classification Numbers\n",
" classification_numbers = extracted_data.get('classification_numbers', {})\n",
" classification_texts = [f\"{key.replace('_', ' ').capitalize()}: {value}\" for key, value in classification_numbers.items() if value]\n",
" if classification_texts:\n",
" narrative_parts.append(\"Classification Numbers: \" + \", \".join(classification_texts))\n",
"\n",
" # Additional Details\n",
" additional_details = extracted_data.get('additional_details', {})\n",
" additional_texts = [f\"{key.replace('_', ' ').capitalize()}: {value}\" for key, value in additional_details.items() if value]\n",
" if additional_texts:\n",
" narrative_parts.append(\"Additional Details: \" + \", \".join(additional_texts))\n",
"\n",
" # Join all parts into a final narrative text\n",
" narrative = \" | \".join(narrative_parts)\n",
" \n",
" return narrative"
]
},
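{
"cell_type": "code",
"execution_count": null,
"id": "aaaaaaaa-0000-4000-8000-000000000007",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# The same hand-built demo_record (from the extract_data demo cell above),\n",
"# rendered as a narrative string instead of JSON\n",
"\n",
"print(extract_data_narrative(demo_record))"
]
},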
{
"cell_type": "code",
"execution_count": 6,
"id": "d8109e67-2591-41e3-a98c-da0e1eeba9f1",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"\n",
"# set up some initial values ...\n",
"limit_size = 50 # let's stick with the default page size -- 50 records\n",
"next_record_id = 2_500_000 # the next record id -- should start at 0, but here in our demo start at 2,500,000\n",
"# next_record_id = 0 # the next record id -- should start at 0\n",
"page_count = 0 # count the number of pages we've processed\n",
"\n",
"con = sqlite3.connect(local_database_filename) # connect to db\n",
"con.create_function(\"PY_HASH\", 1, python_hash) # register the hash function"
]
},
{
"cell_type": "code",
"execution_count": 7,
"id": "434ef36e-4362-4160-84b6-084ed97ff597",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"CPU times: user 4.25 s, sys: 33.1 ms, total: 4.28 s\n",
"Wall time: 50.7 s\n"
]
}
],
"source": [
"%%time\n",
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"\n",
"# do the Extract ...\n",
"while True:\n",
" try:\n",
" result = sierra_api.get(\n",
" 'bibs/', \n",
" params={\n",
" 'id': f\"[{next_record_id},]\",\n",
" 'fields': 'default,marc',\n",
" 'deleted': False,\n",
" 'limit': limit_size\n",
" }\n",
" )\n",
" page_count+=1 # increment the page count\n",
" next_record_id = int(result.data.entries[-1].id) + 1 # last id + 1 to start getting the next page\n",
" data_to_insert = [\n",
" (\n",
" entry.id, # the bib record id (record num) \n",
" json.dumps(entry.dict()), # the marc-in-json data to load\n",
" json.dumps(entry.dict()), # this will be the hash of the data loaded\n",
" # extract_data(entry.marc) # the extracted portions of the pymarc Record object\n",
" extract_data_narrative(entry.marc) # the extracted portions of the pymarc Record object -- narrative format\n",
" ) \n",
" for entry # each Bib object\n",
" in result.data.entries\n",
" ]\n",
" con.executemany(insert_sql, data_to_insert) # insert all the pages\n",
" con.commit()\n",
" \n",
" # FOR DEMO / TESTING PURPOSES ONLY\n",
" if page_count * limit_size >= 10_000:\n",
" # if result.data.total < limit_size: #otherwise, use this to break when we run out\n",
" con.close()\n",
" break\n",
" except Exception as e:\n",
" print(f\"Error: {e}\")\n",
" con.close()\n",
" break"
]
},
{
"cell_type": "code",
"execution_count": 8,
"id": "220a6bd8-317a-4d75-85c0-b31f90845120",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"name": "stdout",
"output_type": "stream",
"text": [
"[(10000, 10000)]\n"
]
}
],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"\n",
"# sanity check ...\n",
"import sqlite3\n",
"\n",
"# Enable URI mode and set the database to read-only by appending ?mode=ro to the file name\n",
"database_uri = f\"file:{local_database_filename}?mode=ro\" # use a read-only connection\n",
"\n",
"# Connect using the URI\n",
"with sqlite3.connect(database_uri, uri=True) as con:\n",
" cursor = con.cursor()\n",
" cursor.execute(\"\"\"SELECT COUNT(DISTINCT bib_id), COUNT(*) FROM bib_data;\"\"\")\n",
" rows = cursor.fetchall()\n",
"\n",
"print(rows)"
]
},
{
"cell_type": "code",
"execution_count": 15,
"id": "5a3fceb9-c8d7-47a2-889a-4927887a9bc3",
"metadata": {
"slideshow": {
"slide_type": "slide"
},
"tags": []
},
"outputs": [
{
"data": {
"application/json": {
"bib_id": 2500001,
"extracted_content": "Title: 'ÆonFlux' | Subjects: Assassins Drama., Utopian socialism Drama., Disappeared persons Drama. | Genres: Fantasy films. lcgft, Action and adventure films. lcgft, Feature films. lcgft, Video recordings for the hearing impaired. lcgft, Science fiction films. lcgft | Publication Info: Place of publication: Hollywood, Calif. :, Publisher: Paramount Pictures,, Date of publication: c2006. | Physical Description: 1 videodisc (92 min.) : | Notes: Based on the characters created by Peter Chung. Originally released as a motion picture in 2005. 400 years in the future, after a virus kills off 99% of the world population, only one city on Earth remains. Ruled by the Goodchild dynasty, it is a perfect society of peace and prosperity--except that its citizens keep mysteriously disappearing. A secret agent/assassin/warrior has been given the mission to bring down the regime. But as she goes deeper into her mission, Aeon uncovers some shocking secrets that put the mission, not to mention her life, in danger. Special features: four eye-popping featurettes; writer and producer commentaries. | ISBN: 1415720584 | Classification Numbers: Local dewey decimal: DVD SciFi A251 2006 | Additional Details: Fixed length data elements: 060124p20062005cau092 g vleng dcgmIa , Physical description fixed field: vd cvaizq"
},
"text/plain": [
"<IPython.core.display.JSON object>"
]
},
"metadata": {
"application/json": {
"expanded": true,
"root": "root"
}
},
"output_type": "display_data"
}
],
"source": [
"# 1. **Extract relevant** bibliographic **MARC record data** (cont.)\n",
"\n",
"sql = \"\"\"\\\n",
"SELECT\n",
" bib_id,\n",
" extracted_content\n",
"FROM\n",
" bib_data\n",
"ORDER BY \n",
" bib_id\n",
"LIMIT 1;\n",
"\"\"\"\n",
"with sqlite3.connect(database_uri, uri=True) as con:\n",
" cursor = con.cursor()\n",
" cursor.execute(sql)\n",
" \n",
" columns = [description[0] for description in cursor.description]\n",
" row = cursor.fetchone()\n",
"\n",
"from IPython.display import display, JSON\n",
"display(\n",
" JSON(\n",
" dict(zip(columns, row)),\n",
" expanded=True\n",
" )\n",
")"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "Python 3 (ipykernel)",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.11.7"
}
},
"nbformat": 4,
"nbformat_minor": 5
}