Last active
October 25, 2025 12:38
-
-
Save xflr6/2c6ddf6cda55313bc8d4fbdc65a88ca4 to your computer and use it in GitHub Desktop.
Read pandas.DataFrame from SPARQL query
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "1d935d67-aab3-49d1-b0e3-d82acd9e3329", | |
| "metadata": {}, | |
| "source": [ | |
| "# pandas `read_sparql_query()`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "5ebd1ea3-c44f-4b56-9ab9-58f58620f643", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import datetime\n", | |
| "import decimal\n", | |
| "import distutils.util\n", | |
| "import functools\n", | |
| "import io\n", | |
| "import logging\n", | |
| "import types\n", | |
| "from typing import Literal, Protocol, Self\n", | |
| "import urllib.parse\n", | |
| "import urllib.request\n", | |
| "import warnings\n", | |
| "import xml.etree.ElementTree as etree\n", | |
| "\n", | |
| "# $ pip install pandas rdflib SPARQLWrapper\n", | |
| "import pandas as pd\n", | |
| "import rdflib\n", | |
| "import SPARQLWrapper as sw\n", | |
| "\n", | |
| "LOGLEVEL = logging.INFO\n", | |
| "\n", | |
| "logging.basicConfig(format='[%(levelname)s@%(name)s] %(message)s', level=LOGLEVEL, force=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "81663c25-a4b1-4c31-a4ec-5387f9d7b203", | |
| "metadata": {}, | |
| "source": [ | |
| "## `pandas` wrappers" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "44f9aadc-8d2c-4a59-a5b7-f58cc14db1d6", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def pd_read_csv(data, /, **kwargs) -> pd.DataFrame:\n", | |
| " if kwargs.get('encoding') is None:\n", | |
| " kwargs['encoding'] = data.info().get_content_charset()\n", | |
| " logging.debug('encoding: %r', kwargs['encoding'])\n", | |
| "\n", | |
| " kwargs.setdefault('na_values', '')\n", | |
| " kwargs.setdefault('keep_default_na', False)\n", | |
| " logging.info('pandas.read_csv(%r, **%r)', data, kwargs)\n", | |
| " return pd.read_csv(data, **kwargs)\n", | |
| "\n", | |
| "\n", | |
| "def pd_dataframe_from_records(data, /, **kwargs) -> pd.DataFrame:\n", | |
| " kwargs.setdefault('coerce_float', True)\n", | |
| " logging.info('pandas.DataFrame.from_records(%r, **%r)', data, kwargs)\n", | |
| " return pd.DataFrame.from_records(data, **kwargs)\n", | |
| "\n", | |
| "\n", | |
| "def pd_json_normalize(data, /, **kwargs) -> pd.DataFrame:\n", | |
| " kwargs.setdefault('record_path', ['results', 'bindings'])\n", | |
| " logging.info('pandas.json_normalize(data, **%r)', kwargs)\n", | |
| " return pd.json_normalize(data, **kwargs)\n", | |
| "\n", | |
| "\n", | |
| "def pd_pipe_info(df: pd.DataFrame, /, **kwargs) -> pd.DataFrame:\n", | |
| " kwargs.setdefault('memory_usage', 'deep')\n", | |
| " df.info(**kwargs)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "19694a6d-d447-4539-9680-ec86f947dded", | |
| "metadata": {}, | |
| "source": [ | |
| "## SPARQL" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "01c3b56a-7318-4999-8a5c-a2464921bbbb", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "ENDPOINT = 'https://query.wikidata.org/sparql' # https://en.wikibooks.org/wiki/SPARQL/Prefixes\n", | |
| "\n", | |
| "TEST_QUERY = '''\n", | |
| "SELECT ?index ?string ?boolean ?integer ?float ?double ?decimal ?datetime ?date ?time\n", | |
| "WHERE {\n", | |
| " VALUES ?index { 0 }\n", | |
| " VALUES ?string { \"spam\" \"eggs\"^^xsd:string }\n", | |
| " VALUES ?boolean { true \"false\"^^xsd:boolean }\n", | |
| " VALUES ?integer { 7 \"42\"^^xsd:integer }\n", | |
| " VALUES ?float { \"6.275\"^^xsd:float }\n", | |
| " VALUES ?double { \"1.0e6\"^^xsd:double }\n", | |
| " VALUES ?decimal { \"1.30\"^^xsd:decimal }\n", | |
| " VALUES ?datetime { \"2005-04-04T04:04:04\"^^xsd:dateTime }\n", | |
| " VALUES ?date { \"2001-01-01\"^^xsd:date }\n", | |
| " VALUES ?time { \"18:30\"^^xsd:time }\n", | |
| "}\n", | |
| "'''.strip()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "0c76f1c8-d914-470b-a429-d0c77c555885", | |
| "metadata": {}, | |
| "source": [ | |
| "## Interface" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "86aede62-2837-4013-9801-3778a6c1a21b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "class ReadSparql(Protocol):\n", | |
| "\n", | |
| " def __init__(self, endpoint: str, **kwargs) -> None:\n", | |
| " super().__init__(**kwargs)\n", | |
| " self.endpoint=endpoint\n", | |
| "\n", | |
| " def __repr__(self) -> str:\n", | |
| " return f'{self.__class__.__name__}({self.endpoint!r})'\n", | |
| "\n", | |
| " def query(self, sparql: str, /):\n", | |
| " raise NotImplementedError \n", | |
| "\n", | |
| " def load(self, result, /, **kwargs) -> pd.DataFrame:\n", | |
| " raise NotImplementedError\n", | |
| "\n", | |
| " def read_sparql(self, sparql: str, /, **kwargs) -> pd.DataFrame:\n", | |
| " result = self.query(sparql)\n", | |
| " df = self.load(result, **kwargs)\n", | |
| " with io.StringIO() as f:\n", | |
| " df.info(buf=f, memory_usage='deep')\n", | |
| " info = f.getvalue()\n", | |
| " logging.debug(info)\n", | |
| " return df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "be2d9827-0273-462c-b712-7d9d276694ee", | |
| "metadata": {}, | |
| "source": [ | |
| "## `urllib`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "03b875f6-397f-41a6-9479-d9889013b66e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "class UrllibReadSparql(ReadSparql):\n", | |
| "\n", | |
| " method = 'GET'\n", | |
| " \n", | |
| " headers = None\n", | |
| " \n", | |
| " def query(self, sparql: str, /, *, method: str | None = None, headers=None):\n", | |
| " logging.info('endpoint: %r', self.endpoint)\n", | |
| " request_url = (urllib.parse.urlparse(self.endpoint)\n", | |
| " ._replace(query=urllib.parse.urlencode({'query': sparql})))\n", | |
| "\n", | |
| " request = urllib.request.Request(request_url.geturl(),\n", | |
| " method=method if method is not None else self.method,\n", | |
| " headers=headers if headers is not None else (self.headers or {}))\n", | |
| " logging.info('request: %r %r', request.method, request)\n", | |
| " logging.debug('url: %r', request.full_url)\n", | |
| "\n", | |
| " response = urllib.request.urlopen(request)\n", | |
| " logging.info('response: %r %r', response.code, response)\n", | |
| " logging.info('content_type: %r', response.headers.get_content_type())\n", | |
| " logging.info('content_charset: %r', response.headers.get_content_charset())\n", | |
| " logging.debug('headers: %r', dict(response.headers))\n", | |
| "\n", | |
| " return types.SimpleNamespace(response=response, headers=response.headers, info=response.info)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "4c85659a-a85d-4cd1-8d41-74346c44106e", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "source": [ | |
| "### `CsvUrllibStrategy`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "e10dfe89-4693-4409-8993-7e73f95c2718", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def load_sparql_csv(result, /, **kwargs) -> pd.DataFrame:\n", | |
| " return pd_read_csv(result.response, **kwargs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "aebf75f8-9a76-400a-8b03-af6f232816a5", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] request: 'GET' <urllib.request.Request object at 0x000002B769BDE900>\n", | |
| "[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000002B767A42500>\n", | |
| "[INFO@root] content_type: 'text/csv'\n", | |
| "[INFO@root] content_charset: 'utf-8'\n", | |
| "[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000002B767A42500>, **{'index_col': 'index', 'encoding': 'utf-8', 'na_values': '', 'keep_default_na': False})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null object \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null object \n", | |
| " 8 time 8 non-null object \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
| "memory usage: 1.7 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date time \n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' " | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class CsvUrllibStrategy(UrllibReadSparql):\n", | |
| "\n", | |
| " headers = {'Accept': 'text/csv'}\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_csv)\n", | |
| "\n", | |
| "\n", | |
| "(CsvUrllibStrategy(ENDPOINT)\n", | |
| " .read_sparql(TEST_QUERY, index_col='index')\n", | |
| " .astype({'datetime': 'datetime64[ns]'})\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "911ce4dd-3ea8-4cb7-b552-0c3a21b2e2eb", | |
| "metadata": {}, | |
| "source": [ | |
| "### `XmlUrllibStrategy`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "ff74fd77-d64e-4434-9eaf-488b3f75069d", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def load_sparql_xml(result, /, *,\n", | |
| " encoding: str | None = None, **kwargs) -> pd.DataFrame:\n", | |
| " if encoding is None:\n", | |
| " (content_type, sep, content_charset) = fields = result.info()['Content-Type'].partition(';charset=')\n", | |
| " assert all(fields)\n", | |
| " encoding = content_charset\n", | |
| " logging.debug('encoding: %r', encoding)\n", | |
| "\n", | |
| " pairs = etree.iterparse(result.response, events=('start', 'end'),\n", | |
| " parser=etree.XMLParser(encoding=encoding))\n", | |
| " (_, root) = next(pairs)\n", | |
| " logging.info('xml: %r', root)\n", | |
| "\n", | |
| " ns = root.tag.partition('{')[2].partition('}')[0]\n", | |
| " assert root.tag.startswith(f'{{{ns}}}')\n", | |
| " if ns != SPARQL_RESULTS:\n", | |
| " raise ValueError(f'error: unknown xml namespace {ns!r} (expected: {SPARQL_RESULTS!r})')\n", | |
| "\n", | |
| " sparql_root = f'{{{ns}}}sparql'\n", | |
| " if root.tag != sparql_root:\n", | |
| " raise ValueError(f'error: invalid xml root tag {root.tag!r} (expected: {sparql_root!r})')\n", | |
| "\n", | |
| " sparql_head = f'{{{ns}}}head'\n", | |
| " head = next(elem for event, elem in pairs if event == 'end' and elem.tag == sparql_head)\n", | |
| " variables = [variable.attrib['name'] for variable in head.findall(f'{{{ns}}}variable')]\n", | |
| " kwargs.setdefault('columns', variables)\n", | |
| "\n", | |
| " records = iterrecords(root, pairs, variables=variables)\n", | |
| " return pd_dataframe_from_records(records, **kwargs)\n", | |
| "\n", | |
| "\n", | |
| "SPARQL_RESULTS = 'http://www.w3.org/2005/sparql-results#'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "32b4e966-a4dd-4d83-8ba3-7f2a93cb844e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def iterrecords(root, pairs, /, *, variables):\n", | |
| " sparql_result = f'{{{SPARQL_RESULTS}}}result'\n", | |
| " sparql_values = [f'{{{SPARQL_RESULTS}}}binding[@name=\"{name}\"]/' for name in variables]\n", | |
| " for event, elem in pairs:\n", | |
| " if event != 'end':\n", | |
| " pass\n", | |
| " elif elem.tag == sparql_result: \n", | |
| " value_elements = map(elem.find, sparql_values)\n", | |
| " yield tuple(itervalues(value_elements))\n", | |
| " root.clear()\n", | |
| " elif elem.tag == root.tag:\n", | |
| " assert next(pairs, None) is None\n", | |
| " return\n", | |
| "\n", | |
| "\n", | |
| "def itervalues(value_elements, /, *,\n", | |
| " sparql_literal: str = f'{{{SPARQL_RESULTS}}}literal',\n", | |
| " sparql_uri: str = f'{{{SPARQL_RESULTS}}}uri',\n", | |
| " sparql_bnode: str = f'{{{SPARQL_RESULTS}}}bnode'):\n", | |
| " for value_elem in value_elements:\n", | |
| " if value_elem is None:\n", | |
| " yield None\n", | |
| " continue\n", | |
| "\n", | |
| " if value_elem.tag not in (sparql_literal, sparql_uri, sparql_bnode):\n", | |
| " raise ValueError(f'invalid binding value tag: {value_elem.tag!r}')\n", | |
| " value = value_elem.text\n", | |
| " if value_elem.tag == sparql_literal and 'datatype' in value_elem.attrib:\n", | |
| " datatype = value_elem.attrib['datatype']\n", | |
| " if datatype is not None:\n", | |
| " try:\n", | |
| " parse_value = PARSE_FUNC[datatype]\n", | |
| " except KeyError:\n", | |
| " warnings.warn(f'cannot convert unknown datatype: {datatype!r}')\n", | |
| " else:\n", | |
| " value = parse_value(value)\n", | |
| " yield value\n", | |
| "\n", | |
| "\n", | |
| "XSD = 'http://www.w3.org/2001/XMLSchema#'\n", | |
| "\n", | |
| "PARSE_FUNC = {f'{XSD}string': lambda x: x,\n", | |
| " f'{XSD}boolean': lambda x: {'true': True, '1': True, 'false': False, '0': False}[x],\n", | |
| " f'{XSD}integer': int,\n", | |
| " f'{XSD}float': float,\n", | |
| " f'{XSD}double': float,\n", | |
| " f'{XSD}decimal': decimal.Decimal,\n", | |
| " f'{XSD}dateTime': datetime.datetime.fromisoformat,\n", | |
| " f'{XSD}date': datetime.date.fromisoformat,\n", | |
| " f'{XSD}time': datetime.time.fromisoformat}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "29bf61ef-0328-478a-a877-30fbd45eeeca", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] request: 'GET' <urllib.request.Request object at 0x000002B7374691D0>\n", | |
| "[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000002B73748F850>\n", | |
| "[INFO@root] content_type: 'application/sparql-results+xml'\n", | |
| "[INFO@root] content_charset: 'utf-8'\n", | |
| "[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000002B73750D170>\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000002B73751C040>, **{'index': 'index', 'columns': ['index', 'string', 'boolean', 'integer', 'float', 'double', 'decimal', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null object \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null object \n", | |
| " 8 time 8 non-null object \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
| "memory usage: 1.5 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date \\\n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "\n", | |
| " time \n", | |
| "index \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) " | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class XmlUrllibStrategy(UrllibReadSparql):\n", | |
| "\n", | |
| " headers = {'Accept': 'application/sparql-results+xml'}\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_xml)\n", | |
| "\n", | |
| "\n", | |
| "(XmlUrllibStrategy(ENDPOINT)\n", | |
| " .read_sparql(TEST_QUERY, index='index')\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "e2a7cf6d-7d0d-471d-b09c-43be532a2eaf", | |
| "metadata": {}, | |
| "source": [ | |
| "## `rdflib.Graph('SPARQLStore')`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "7919a1c7-e917-4f55-b7e7-fd950226616b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def open_sparql_graph(endpoint: str, /, *,\n", | |
| " identifier: str | None,\n", | |
| " bind_namespaces: Literal['rdflib', 'core', 'none'] = 'none') -> rdflib.Graph:\n", | |
| " logging.info('endpoint: %r', endpoint)\n", | |
| " graph = rdflib.Graph('SPARQLStore', identifier=identifier, bind_namespaces=bind_namespaces)\n", | |
| " graph.open(endpoint)\n", | |
| " logging.info('graph: %s', graph)\n", | |
| " logging.debug('namespaces: %r', list(graph.namespaces()))\n", | |
| " return graph\n", | |
| "\n", | |
| "\n", | |
| "def load_sparql_xml_result(result: 'rdflib.plugins.sparql.results.xmlresults.XMLResult', /,\n", | |
| " **kwargs) -> pd.DataFrame:\n", | |
| " kwargs.setdefault('columns', list(map(str, result.vars)))\n", | |
| " rows = ([v.toPython() if v is not None else None for v in row] for row in result)\n", | |
| " return pd_dataframe_from_records(rows, **kwargs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "771a038a-e21d-4415-b11a-720ce570b804", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] graph: <http://www.wikidata.org> a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore'].\n", | |
| "[INFO@root] result: <rdflib.plugins.sparql.results.xmlresults.XMLResult object at 0x000002B76A89DBE0>\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object load_sparql_xml_result.<locals>.<genexpr> at 0x000002B76A8E47C0>, **{'index': 'index', 'columns': ['index', 'string', 'boolean', 'integer', 'float', 'double', 'decimal', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null object \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null object \n", | |
| " 8 time 8 non-null object \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
| "memory usage: 1.5 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date \\\n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "\n", | |
| " time \n", | |
| "index \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) " | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class XmlRdflibStrategy(ReadSparql):\n", | |
| "\n", | |
| " def __init__(self, endpoint: str, *, identifier: str | None = None, **kwargs) -> None:\n", | |
| " super().__init__(endpoint, **kwargs)\n", | |
| " self.graph_identifier = identifier\n", | |
| "\n", | |
| " def __repr__(self) -> str:\n", | |
| " return f'<{self.__class__.__name__} {self.endpoint!r} graph={self.graph!r}>'\n", | |
| "\n", | |
| " @functools.cached_property\n", | |
| " def graph(self) -> rdflib.Graph:\n", | |
| " return open_sparql_graph(self.endpoint, identifier=self.graph_identifier)\n", | |
| "\n", | |
| " def query(self, query: str, /) -> 'rdflib.plugins.sparql.results.xmlresults.XMLResult':\n", | |
| " result = self.graph.query(query)\n", | |
| " logging.info('result: %r', result)\n", | |
| " return result\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_xml_result)\n", | |
| "\n", | |
| "\n", | |
| "(XmlRdflibStrategy(ENDPOINT, identifier='http://www.wikidata.org')\n", | |
| " .read_sparql(TEST_QUERY, index='index')\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "6d30940d-4cde-44ee-a767-a98a538dbb08", | |
| "metadata": {}, | |
| "source": [ | |
| "## `SPARQLWrapper`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "db4a18f8-d06a-473c-bded-65bfe509eefa", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "class SPARQLWrapperReadSparql(sw.SPARQLWrapper, ReadSparql):\n", | |
| "\n", | |
| " def __repr__(self) -> str:\n", | |
| " return f'{self.__class__.__name__}({self.endpoint!r}, returnFormat={self.returnFormat!r})'\n", | |
| "\n", | |
| " def query(self, sparql: str, /) -> sw.QueryResult:\n", | |
| " logging.info('endpoint: %r', self.endpoint)\n", | |
| " logging.info('returnFormat: %r', self.returnFormat)\n", | |
| " if self.returnFormat in (sw.CSV, sw.TSV):\n", | |
| " logging.info('%r.setOnlyConneg(True)', self)\n", | |
| " self.setOnlyConneg(True)\n", | |
| " elif not self.supportsReturnFormat(self.returnFormat):\n", | |
| " raise ValueError(f'unsupported return format: {self.returnFormat!r}')\n", | |
| " self.setQuery(sparql)\n", | |
| " result = super().query()\n", | |
| " logging.info('result: %r', result)\n", | |
| " headers = result.info()\n", | |
| " logging.debug('headers: %r', headers)\n", | |
| " logging.info('Content-Type: %r', headers['Content-Type'])\n", | |
| " return result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ca4321ee-3a45-4c36-8a4c-617a13a5140b", | |
| "metadata": {}, | |
| "source": [ | |
| "### `SPARQLWrapper(returnFormat='csv')`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "fb82bc0a-8fea-4478-9eb7-8d7af008dbf0", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] returnFormat: 'csv'\n", | |
| "[INFO@root] CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv').setOnlyConnreg(True)\n", | |
| "[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000002B7374D9400>\n", | |
| "[INFO@root] Content-Type: 'text/csv;charset=utf-8'\n", | |
| "[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000002B76A9524A0>, **{'index_col': 'index', 'encoding': 'utf-8', 'na_values': '', 'keep_default_na': False})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null object \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null object \n", | |
| " 8 time 8 non-null object \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
| "memory usage: 1.7 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date time \n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' " | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class CsvSwStrategy(SPARQLWrapperReadSparql):\n", | |
| "\n", | |
| " def __init__(self, endpoint: str, *, returnFormat=sw.CSV, **kwargs) -> None:\n", | |
| " super().__init__(endpoint, returnFormat=returnFormat, **kwargs)\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_csv)\n", | |
| "\n", | |
| "\n", | |
| "(CsvSwStrategy(ENDPOINT)\n", | |
| " .read_sparql(TEST_QUERY, index_col='index')\n", | |
| " .astype({'datetime': 'datetime64[ns]'})\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "947aa733-b6ff-47cf-a862-d0ab23ac147c", | |
| "metadata": {}, | |
| "source": [ | |
| "### `SPARQLWrapper(returnFormat='xml')`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "64ffe095-4c87-4b80-bb93-077c18a39c5c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] returnFormat: 'xml'\n", | |
| "[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000002B76A914E10>\n", | |
| "[INFO@root] Content-Type: 'application/sparql-results+xml;charset=utf-8'\n", | |
| "[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000002B76A94EFC0>\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000002B76A90CC20>, **{'index': 'index', 'columns': ['index', 'string', 'boolean', 'integer', 'float', 'double', 'decimal', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null object \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null object \n", | |
| " 8 time 8 non-null object \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
| "memory usage: 1.5 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date \\\n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "\n", | |
| " time \n", | |
| "index \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) " | |
| ] | |
| }, | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class XmlSwStrategy(SPARQLWrapperReadSparql):\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_xml)\n", | |
| "\n", | |
| "\n", | |
| "(XmlSwStrategy(ENDPOINT)\n", | |
| " .read_sparql(TEST_QUERY, index='index')\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "d020a3fc-248c-4537-a0ad-17a387272863", | |
| "metadata": {}, | |
| "source": [ | |
| "### `SPARQLWrapper(returnFormat='json')`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "25aaec36-4cbc-4c7c-8636-413079ecb93c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def load_sparql_json_normalize(result: sw.QueryResult, /, *,\n", | |
| " raw: bool = False, **kwargs) -> pd.DataFrame:\n", | |
| " jsondata = result.convert()\n", | |
| " logging.info(\"result['head']: %r\", jsondata['head'])\n", | |
| " df = pd_json_normalize(jsondata, **kwargs)\n", | |
| " if df.empty:\n", | |
| " return df\n", | |
| " if not raw:\n", | |
| " columns = jsondata['head']['vars']\n", | |
| " rename = {f'{c}.value': c for c in columns}\n", | |
| " dtype = {name: PARSE_DTYPE[binding['datatype']]\n", | |
| " if binding['type'] == 'literal' and 'datatype' in binding\n", | |
| " else 'string'\n", | |
| " for name, binding in jsondata['results']['bindings'][0].items()}\n", | |
| " booleans = [name for name, d in dtype.items() if d in ('bool', 'boolean')]\n", | |
| " df = df[list(rename)].rename(rename, axis='columns')\n", | |
| " if booleans:\n", | |
| " df[booleans] = df[booleans].apply(lambda x: x.map(distutils.util.strtobool))\n", | |
| " return df.astype(dtype)\n", | |
| " return df\n", | |
| "\n", | |
| "\n", | |
| "PARSE_DTYPE = {f'{XSD}string': 'string',\n", | |
| " f'{XSD}boolean': 'bool',\n", | |
| " f'{XSD}integer': 'int',\n", | |
| " f'{XSD}float': 'float',\n", | |
| " f'{XSD}double': 'float',\n", | |
| " f'{XSD}decimal': 'float',\n", | |
| " f'{XSD}dateTime': 'datetime64[ns]',\n", | |
| " f'{XSD}date': 'string',\n", | |
| " f'{XSD}time': 'string'}" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "b919c3be-569c-4b15-8256-c31598fded50", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] returnFormat: 'json'\n", | |
| "[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000002B76A914E10>\n", | |
| "[INFO@root] Content-Type: 'application/sparql-results+json;charset=utf-8'\n", | |
| "[INFO@root] result['head']: {'vars': ['index', 'string', 'boolean', 'integer', 'float', 'double', 'decimal', 'datetime', 'date', 'time']}\n", | |
| "[INFO@root] pandas.json_normalize(data, **{'record_path': ['results', 'bindings']})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null string \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null string \n", | |
| " 8 time 8 non-null string \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), string(3)\n", | |
| "memory usage: 1.7 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>'2001-01-01'</td>\n", | |
| " <td>'18:30'</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date time \n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' \n", | |
| "0 Timestamp('2005-04-04 04:04:04') '2001-01-01' '18:30' " | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class JsonSwStrategy(SPARQLWrapperReadSparql):\n", | |
| "\n", | |
| " def __init__(self, endpoint, *, returnFormat=sw.JSON, **kwargs) -> None:\n", | |
| " super().__init__(endpoint, returnFormat=returnFormat, **kwargs)\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_json_normalize)\n", | |
| "\n", | |
| "\n", | |
| "(JsonSwStrategy(ENDPOINT)\n", | |
| " .read_sparql(TEST_QUERY)\n", | |
| " .set_index('index')\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "b167836d-de6c-4f99-a840-ff05cb41d623", | |
| "metadata": {}, | |
| "source": [ | |
| "### `SPARQLWrapper2()`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "a3b5dd29-ca50-4088-a568-90bac6d604fc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "class SPARQLWrapper2ReadSparql(sw.SPARQLWrapper2, ReadSparql):\n", | |
| "\n", | |
| " def __repr__(self) -> str:\n", | |
| " return f'{self.__class__.__name__}({self.endpoint!r}, returnFormat={self.returnFormat!r})'\n", | |
| "\n", | |
| " def query(self, sparql: str, /) -> sw.SmartWrapper.Bindings:\n", | |
| " logging.info('endpoint: %r', self.endpoint)\n", | |
| " self.setQuery(sparql)\n", | |
| " result = super().query()\n", | |
| " logging.info('result: %r', result)\n", | |
| " logging.info('result.variables: %r', result.variables)\n", | |
| " return result" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "id": "69ea97b4-8814-45c6-ad0f-aeb27896a59a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def load_sparql_json(result: sw.SmartWrapper.Bindings, /, **kwargs) -> pd.DataFrame:\n", | |
| " kwargs.setdefault('columns', result.variables)\n", | |
| " records = (tuple(itervalues_json(binding, variables=result.variables))\n", | |
| " for binding in result.bindings)\n", | |
| " return pd_dataframe_from_records(records, **kwargs)\n", | |
| "\n", | |
| "\n", | |
| "def itervalues_json(binding, /, *, variables):\n", | |
| " for v in variables:\n", | |
| " if v not in binding:\n", | |
| " yield None\n", | |
| " continue\n", | |
| "\n", | |
| " value_dict = binding[v]\n", | |
| " if value_dict.type not in ('literal', 'uri', 'bnode'):\n", | |
| " raise ValueError(f'invalid binding value type {value_dict.type!r}')\n", | |
| " value = value_dict.value\n", | |
| " if value_dict.type == 'literal' and value_dict.datatype is not None:\n", | |
| " try:\n", | |
| " parse_value = PARSE_FUNC[value_dict.datatype]\n", | |
| " except KeyError:\n", | |
| " warnings.warn(f'cannot convert unmatched datatype: {value_dict!r}')\n", | |
| " else:\n", | |
| " value = parse_value(value)\n", | |
| " yield value" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "id": "fd10ccbd-65b2-4dcc-8fc5-2ac154ce3833", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] result: <SPARQLWrapper.SmartWrapper.Bindings object at 0x000002B76A89C2F0>\n", | |
| "[INFO@root] result.variables: ['index', 'string', 'boolean', 'integer', 'float', 'double', 'decimal', 'datetime', 'date', 'time']\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object load_sparql_json.<locals>.<genexpr> at 0x000002B76A92C940>, **{'index': 'index', 'columns': ['index', 'string', 'boolean', 'integer', 'float', 'double', 'decimal', 'datetime', 'date', 'time'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'pandas.core.frame.DataFrame'>\n", | |
| "Index: 8 entries, 0 to 0\n", | |
| "Data columns (total 9 columns):\n", | |
| " # Column Non-Null Count Dtype \n", | |
| "--- ------ -------------- ----- \n", | |
| " 0 string 8 non-null object \n", | |
| " 1 boolean 8 non-null bool \n", | |
| " 2 integer 8 non-null int64 \n", | |
| " 3 float 8 non-null float64 \n", | |
| " 4 double 8 non-null float64 \n", | |
| " 5 decimal 8 non-null float64 \n", | |
| " 6 datetime 8 non-null datetime64[ns]\n", | |
| " 7 date 8 non-null object \n", | |
| " 8 time 8 non-null object \n", | |
| "dtypes: bool(1), datetime64[ns](1), float64(3), int64(1), object(3)\n", | |
| "memory usage: 1.5 KB\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>string</th>\n", | |
| " <th>boolean</th>\n", | |
| " <th>integer</th>\n", | |
| " <th>float</th>\n", | |
| " <th>double</th>\n", | |
| " <th>decimal</th>\n", | |
| " <th>datetime</th>\n", | |
| " <th>date</th>\n", | |
| " <th>time</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>index</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'spam'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>True</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>7</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>'eggs'</td>\n", | |
| " <td>False</td>\n", | |
| " <td>42</td>\n", | |
| " <td>6.275</td>\n", | |
| " <td>1000000.0</td>\n", | |
| " <td>1.3</td>\n", | |
| " <td>Timestamp('2005-04-04 04:04:04')</td>\n", | |
| " <td>datetime.date(2001, 1, 1)</td>\n", | |
| " <td>datetime.time(18, 30)</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " string boolean integer float double decimal \\\n", | |
| "index \n", | |
| "0 'spam' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'spam' False 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' True 42 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 7 6.275 1000000.0 1.3 \n", | |
| "0 'eggs' False 42 6.275 1000000.0 1.3 \n", | |
| "\n", | |
| " datetime date \\\n", | |
| "index \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "0 Timestamp('2005-04-04 04:04:04') datetime.date(2001, 1, 1) \n", | |
| "\n", | |
| " time \n", | |
| "index \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) \n", | |
| "0 datetime.time(18, 30) " | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "class JsonSw2Strategy(SPARQLWrapper2ReadSparql):\n", | |
| "\n", | |
| " load = staticmethod(load_sparql_json)\n", | |
| "\n", | |
| "\n", | |
| "(JsonSw2Strategy(ENDPOINT)\n", | |
| " .read_sparql(TEST_QUERY, index='index')\n", | |
| " .pipe(pd_pipe_info)\n", | |
| " .map(repr))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "23939e50-6d14-4cec-abdd-689dee23bf43", | |
| "metadata": {}, | |
| "source": [ | |
| "## `read_sparql_query(sparql, endpoint=, strategy=)`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "id": "9bbca0b3-15e8-4e56-b147-f6316c7ad169", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def read_sparql_query(sparql: str, /, *, endpoint: str, strategy: str,\n", | |
| " convert_dtypes: bool = False, **kwargs) -> pd.DataFrame:\n", | |
| " reader = ReadSparqlQuery.from_cache(endpoint, strategy=strategy)\n", | |
| " df = reader(sparql, **kwargs)\n", | |
| " if convert_dtypes:\n", | |
| " return df.convert_dtypes()\n", | |
| " return df\n", | |
| "\n", | |
| "\n", | |
| "class ReadSparqlQuery:\n", | |
| "\n", | |
| " @classmethod\n", | |
| " @functools.lru_cache(maxsize=10)\n", | |
| " def from_cache(cls, endpoint: str, *, strategy: str) -> Self:\n", | |
| " inst = cls(endpoint, strategy=strategy)\n", | |
| " logging.info('%r', inst)\n", | |
| " return inst\n", | |
| "\n", | |
| " _strategies = {'csv': CsvUrllibStrategy,\n", | |
| " 'xml': XmlUrllibStrategy,\n", | |
| " 'xml_rdflib': XmlRdflibStrategy,\n", | |
| " 'csv_sw': CsvSwStrategy,\n", | |
| " 'xml_sw': XmlSwStrategy,\n", | |
| " 'json_sw': JsonSwStrategy,\n", | |
| " 'json_sw2': JsonSw2Strategy}\n", | |
| "\n", | |
| " def __init__(self, endpoint: str, *, strategy: str) -> None:\n", | |
| " self.endpoint = endpoint\n", | |
| " assert strategy in self._strategies\n", | |
| " self._strategy = strategy\n", | |
| "\n", | |
| " def __repr__(self) -> str:\n", | |
| " return f'{self.__class__.__name__}({self.endpoint!r}, strategy={self._strategy!r})'\n", | |
| "\n", | |
| " @functools.cached_property\n", | |
| " def strategy(self) -> ReadSparql:\n", | |
| " strategy_cls = self._strategies[self._strategy]\n", | |
| " strategy = strategy_cls(self.endpoint)\n", | |
| " logging.info('strategy: %r', strategy)\n", | |
| " return strategy\n", | |
| "\n", | |
| " def __call__(self, sparql: str, /, **kwargs) -> pd.DataFrame:\n", | |
| " return self.strategy.read_sparql(sparql, **kwargs)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "96a8e01b-de38-48d8-be74-cfdc9adb1ff8", | |
| "metadata": {}, | |
| "source": [ | |
| "## `read_wikidata(strategy=)`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "id": "c40aa7a7-da7e-4a34-b85c-e742dc50baff", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "QUERY = '''\n", | |
| "SELECT\n", | |
| " ?glottocode\n", | |
| " (strafter(str(?languoid), str(wd:)) AS ?qid)\n", | |
| " (?languoidLabel AS ?name)\n", | |
| " (strafter(str(?siteLink), \"https://en.wikipedia.org/wiki/\") AS ?title)\n", | |
| "WHERE {\n", | |
| " ?languoid wdt:P1394 ?glottocode.\n", | |
| " FILTER (REGEX(?glottocode, \"^[a-z0-9]{4}[0-9]{4}$\")).\n", | |
| " OPTIONAL {\n", | |
| " ?siteLink schema:about ?languoid;\n", | |
| " schema:inLanguage \"en\";\n", | |
| " schema:isPartOf <https://en.wikipedia.org/>.\n", | |
| " }\n", | |
| " SERVICE wikibase:label {\n", | |
| " bd:serviceParam wikibase:language \"en\".\n", | |
| " ?languoid rdfs:label ?languoidLabel.\n", | |
| " }\n", | |
| "}\n", | |
| "ORDER BY\n", | |
| " ?glottocode\n", | |
| " xsd:integer(strafter(str(?languoid), str(wd:Q)))\n", | |
| "LIMIT 15000\n", | |
| "'''.strip()\n", | |
| "\n", | |
| "read_wikidata = functools.partial(read_sparql_query, QUERY, endpoint=ENDPOINT, convert_dtypes=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "5791761f-99c7-4fb0-aa61-848553864874", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='csv'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "id": "aa002249-49fe-49f7-acd9-8c2021c04139", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='csv')\n", | |
| "[INFO@root] strategy: CsvUrllibStrategy('https://query.wikidata.org/sparql')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] request: 'GET' <urllib.request.Request object at 0x000002B76A9BC270>\n", | |
| "[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000002B76A9981C0>\n", | |
| "[INFO@root] content_type: 'text/csv'\n", | |
| "[INFO@root] content_charset: 'utf-8'\n", | |
| "[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000002B76A9981C0>, **{'index_col': 'glottocode', 'encoding': 'utf-8', 'na_values': '', 'keep_default_na': False})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 15.6 ms\n", | |
| "Wall time: 9.33 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='csv', index_col='glottocode')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "6052a9c6-37a5-4331-9577-f0d1b9eaa7eb", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='xml'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "id": "eb73feff-a3cc-47e3-90c8-d1ff6a7ea564", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml')\n", | |
| "[INFO@root] strategy: XmlUrllibStrategy('https://query.wikidata.org/sparql')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] request: 'GET' <urllib.request.Request object at 0x000002B76A92C450>\n", | |
| "[INFO@root] response: 200 <http.client.HTTPResponse object at 0x000002B76A9500D0>\n", | |
| "[INFO@root] content_type: 'application/sparql-results+xml'\n", | |
| "[INFO@root] content_charset: 'utf-8'\n", | |
| "[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000002B76BA31440>\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000002B76A90CFB0>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 422 ms\n", | |
| "Wall time: 7.18 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='xml', index='glottocode')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "426f9ea7-031c-47fb-89e0-04c17d6ed924", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='xml_rdflib'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "id": "c7edcb9e-9a58-4166-acad-e7bf0f7884cb", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml_rdflib')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] graph: [a rdfg:Graph;rdflib:storage [a rdflib:Store;rdfs:label 'SPARQLStore']].\n", | |
| "[INFO@root] strategy: <XmlRdflibStrategy 'https://query.wikidata.org/sparql' graph=<Graph identifier=N259d1ee6d73f470ea090dc9940406a5e (<class 'rdflib.graph.Graph'>)>>\n", | |
| "[INFO@root] result: <rdflib.plugins.sparql.results.xmlresults.XMLResult object at 0x000002B76A916350>\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object load_sparql_xml_result.<locals>.<genexpr> at 0x000002B7374EE5C0>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 422 ms\n", | |
| "Wall time: 5.88 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='xml_rdflib', index='glottocode')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "235dc1b7-0e6c-4703-b219-f720bf1e9979", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='csv_sw'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "071cc12d-1dec-490e-b192-c7ce7c8ca274", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='csv_sw')\n", | |
| "[INFO@root] strategy: CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] returnFormat: 'csv'\n", | |
| "[INFO@root] CsvSwStrategy('https://query.wikidata.org/sparql', returnFormat='csv').setOnlyConnreg(True)\n", | |
| "[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000002B76A90CD60>\n", | |
| "[INFO@root] Content-Type: 'text/csv;charset=utf-8'\n", | |
| "[INFO@root] pandas.read_csv(<http.client.HTTPResponse object at 0x000002B76A435C30>, **{'index_col': 'glottocode', 'encoding': 'utf-8', 'na_values': '', 'keep_default_na': False})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 93.8 ms\n", | |
| "Wall time: 4.08 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='csv_sw', index_col='glottocode')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "e5eb79e1-5a40-4d66-9951-056fbf3e440d", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='xml_sw'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "6d36993c-766f-431c-af72-b341afbd4689", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='xml_sw')\n", | |
| "[INFO@root] strategy: XmlSwStrategy('https://query.wikidata.org/sparql', returnFormat='xml')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] returnFormat: 'xml'\n", | |
| "[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000002B76A91CDD0>\n", | |
| "[INFO@root] Content-Type: 'application/sparql-results+xml;charset=utf-8'\n", | |
| "[INFO@root] xml: <Element '{http://www.w3.org/2005/sparql-results#}sparql' at 0x000002B76E058810>\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object iterrecords at 0x000002B76A90D5A0>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 406 ms\n", | |
| "Wall time: 7.92 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='xml_sw', index='glottocode')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "3aa86137-540b-49c0-bdbd-c5d98d269b24", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='json_sw'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "id": "5642ca34-b0a3-49a9-a715-b3d2262c05cc", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='json_sw')\n", | |
| "[INFO@root] strategy: JsonSwStrategy('https://query.wikidata.org/sparql', returnFormat='json')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] returnFormat: 'json'\n", | |
| "[INFO@root] result: <SPARQLWrapper.Wrapper.QueryResult object at 0x000002B76BA34160>\n", | |
| "[INFO@root] Content-Type: 'application/sparql-results+json;charset=utf-8'\n", | |
| "[INFO@root] result['head']: {'vars': ['glottocode', 'qid', 'name', 'title']}\n", | |
| "[INFO@root] pandas.json_normalize(data, **{'record_path': ['results', 'bindings']})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 297 ms\n", | |
| "Wall time: 39.6 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='json_sw').set_index('glottocode')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "56b5fcd4-15d4-49f1-830d-3aabf15c80ca", | |
| "metadata": {}, | |
| "source": [ | |
| "### `strategy='json_sw2'`" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 29, | |
| "id": "4bcf9fc1-2bd3-44ac-bc7d-771484626866", | |
| "metadata": { | |
| "tags": [] | |
| }, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "[INFO@root] ReadSparqlQuery('https://query.wikidata.org/sparql', strategy='json_sw2')\n", | |
| "[INFO@root] strategy: JsonSw2Strategy('https://query.wikidata.org/sparql', returnFormat='json')\n", | |
| "[INFO@root] endpoint: 'https://query.wikidata.org/sparql'\n", | |
| "[INFO@root] result: <SPARQLWrapper.SmartWrapper.Bindings object at 0x000002B76A9165D0>\n", | |
| "[INFO@root] result.variables: ['glottocode', 'qid', 'name', 'title']\n", | |
| "[INFO@root] pandas.DataFrame.from_records(<generator object load_sparql_json.<locals>.<genexpr> at 0x000002B76A92D340>, **{'index': 'glottocode', 'columns': ['glottocode', 'qid', 'name', 'title'], 'coerce_float': True})\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: total: 109 ms\n", | |
| "Wall time: 5.05 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>qid</th>\n", | |
| " <th>name</th>\n", | |
| " <th>title</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>glottocode</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>aant1238</th>\n", | |
| " <td>Q31312216</td>\n", | |
| " <td>Aantantara</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1238</th>\n", | |
| " <td>Q85516014</td>\n", | |
| " <td>Aari-Gayil</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1239</th>\n", | |
| " <td>Q7495</td>\n", | |
| " <td>Aari</td>\n", | |
| " <td>Aari_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aari1240</th>\n", | |
| " <td>Q4661732</td>\n", | |
| " <td>Aariya</td>\n", | |
| " <td>Aariya_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>aasa1238</th>\n", | |
| " <td>Q56620</td>\n", | |
| " <td>Asa</td>\n", | |
| " <td>Asa_language</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuri1239</th>\n", | |
| " <td>Q248682</td>\n", | |
| " <td>Zurich German</td>\n", | |
| " <td>Zurich_German</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zurr1238</th>\n", | |
| " <td>Q127427511</td>\n", | |
| " <td>Żurrieq dialect</td>\n", | |
| " <td>Zurrieq_dialect</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zuti1239</th>\n", | |
| " <td>Q135990401</td>\n", | |
| " <td>Guajajára of Zutiua</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zwal1238</th>\n", | |
| " <td>Q135990263</td>\n", | |
| " <td>Zwall</td>\n", | |
| " <td><NA></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>zyph1238</th>\n", | |
| " <td>Q57004</td>\n", | |
| " <td>Zyphe</td>\n", | |
| " <td>Zyphe_language</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>11797 rows × 3 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " qid name title\n", | |
| "glottocode \n", | |
| "aant1238 Q31312216 Aantantara <NA>\n", | |
| "aari1238 Q85516014 Aari-Gayil <NA>\n", | |
| "aari1239 Q7495 Aari Aari_language\n", | |
| "aari1240 Q4661732 Aariya Aariya_language\n", | |
| "aasa1238 Q56620 Asa Asa_language\n", | |
| "... ... ... ...\n", | |
| "zuri1239 Q248682 Zurich German Zurich_German\n", | |
| "zurr1238 Q127427511 Żurrieq dialect Zurrieq_dialect\n", | |
| "zuti1239 Q135990401 Guajajára of Zutiua <NA>\n", | |
| "zwal1238 Q135990263 Zwall <NA>\n", | |
| "zyph1238 Q57004 Zyphe Zyphe_language\n", | |
| "\n", | |
| "[11797 rows x 3 columns]" | |
| ] | |
| }, | |
| "execution_count": 29, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%time read_wikidata(strategy='json_sw2', index='glottocode')" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.14.0" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment