Created
May 30, 2023 05:10
-
-
Save vibhatha/4ca752368892d3fff78212b5cd8a19e5 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "id": "644153c1", | |
| "metadata": {}, | |
| "source": [ | |
| "## Evaluating Issue 26\n", | |
| "\n", | |
| "Must use the following environment to test this\n", | |
| "\n", | |
| "https://github.com/vibhatha/docker-spark-iceberg\n", | |
| "\n", | |
| "Ref: https://github.com/vibhatha/pyiceberg_substrait/issues/26" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "644afb25", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Collecting icetrait\n", | |
| " Cloning https://github.com/vibhatha/pyiceberg_substrait (to revision feat-schema-evolution-s1) to /tmp/pip-install-4bkt0riu/icetrait_70a3778644114482978d76d2d3179ea2\n", | |
| " Running command git clone --filter=blob:none --quiet https://github.com/vibhatha/pyiceberg_substrait /tmp/pip-install-4bkt0riu/icetrait_70a3778644114482978d76d2d3179ea2\n", | |
| " Running command git checkout -b feat-schema-evolution-s1 --track origin/feat-schema-evolution-s1\n", | |
| " Switched to a new branch 'feat-schema-evolution-s1'\n", | |
| " Branch 'feat-schema-evolution-s1' set up to track remote branch 'feat-schema-evolution-s1' from 'origin'.\n", | |
| " Resolved https://github.com/vibhatha/pyiceberg_substrait to commit 8e61be62f1b70aa74b748487fa743ac652cfaaa9\n", | |
| " Running command git submodule update --init --recursive -q\n", | |
| " Installing build dependencies ... \u001b[?25ldone\n", | |
| "\u001b[?25h Getting requirements to build wheel ... \u001b[?25ldone\n", | |
| "\u001b[?25h Installing backend dependencies ... \u001b[?25ldone\n", | |
| "\u001b[?25h Preparing metadata (pyproject.toml) ... \u001b[?25ldone\n", | |
| "\u001b[?25hRequirement already satisfied: protobuf>=3.20 in /usr/local/lib/python3.9/site-packages (from icetrait) (3.20.1)\n", | |
| "Requirement already satisfied: pyarrow>=10.0.1 in /usr/local/lib/python3.9/site-packages (from icetrait) (10.0.1)\n", | |
| "Requirement already satisfied: pyiceberg>=0.3.0 in /usr/local/lib/python3.9/site-packages (from icetrait) (0.3.0)\n", | |
| "Requirement already satisfied: sqlparse>=0.4.4 in /usr/local/lib/python3.9/site-packages (from icetrait) (0.4.4)\n", | |
| "Requirement already satisfied: duckdb>=0.7.1 in /usr/local/lib/python3.9/site-packages (from icetrait) (0.7.1)\n", | |
| "Requirement already satisfied: numpy>=1.16.6 in /usr/local/lib/python3.9/site-packages (from pyarrow>=10.0.1->icetrait) (1.24.3)\n", | |
| "Requirement already satisfied: click==8.1.3 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (8.1.3)\n", | |
| "Requirement already satisfied: requests==2.28.2 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (2.28.2)\n", | |
| "Requirement already satisfied: pyyaml==6.0.0 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (6.0)\n", | |
| "Requirement already satisfied: mmhash3==3.0.1 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (3.0.1)\n", | |
| "Requirement already satisfied: zstandard==0.19.0 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (0.19.0)\n", | |
| "Requirement already satisfied: fsspec==2023.1.0 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (2023.1.0)\n", | |
| "Requirement already satisfied: pydantic==1.10.4 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (1.10.4)\n", | |
| "Requirement already satisfied: pyparsing==3.0.9 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (3.0.9)\n", | |
| "Requirement already satisfied: rich==13.2.0 in /usr/local/lib/python3.9/site-packages (from pyiceberg>=0.3.0->icetrait) (13.2.0)\n", | |
| "Requirement already satisfied: typing-extensions>=4.2.0 in /usr/local/lib/python3.9/site-packages (from pydantic==1.10.4->pyiceberg>=0.3.0->icetrait) (4.5.0)\n", | |
| "Requirement already satisfied: idna<4,>=2.5 in /usr/local/lib/python3.9/site-packages (from requests==2.28.2->pyiceberg>=0.3.0->icetrait) (3.4)\n", | |
| "Requirement already satisfied: urllib3<1.27,>=1.21.1 in /usr/local/lib/python3.9/site-packages (from requests==2.28.2->pyiceberg>=0.3.0->icetrait) (1.26.15)\n", | |
| "Requirement already satisfied: charset-normalizer<4,>=2 in /usr/local/lib/python3.9/site-packages (from requests==2.28.2->pyiceberg>=0.3.0->icetrait) (3.1.0)\n", | |
| "Requirement already satisfied: certifi>=2017.4.17 in /usr/local/lib/python3.9/site-packages (from requests==2.28.2->pyiceberg>=0.3.0->icetrait) (2023.5.7)\n", | |
| "Requirement already satisfied: pygments<3.0.0,>=2.6.0 in /usr/local/lib/python3.9/site-packages (from rich==13.2.0->pyiceberg>=0.3.0->icetrait) (2.15.1)\n", | |
| "Requirement already satisfied: markdown-it-py<3.0.0,>=2.1.0 in /usr/local/lib/python3.9/site-packages (from rich==13.2.0->pyiceberg>=0.3.0->icetrait) (2.2.0)\n", | |
| "Requirement already satisfied: mdurl~=0.1 in /usr/local/lib/python3.9/site-packages (from markdown-it-py<3.0.0,>=2.1.0->rich==13.2.0->pyiceberg>=0.3.0->icetrait) (0.1.2)\n", | |
| "Building wheels for collected packages: icetrait\n", | |
| " Building wheel for icetrait (pyproject.toml) ... \u001b[?25ldone\n", | |
| "\u001b[?25h Created wheel for icetrait: filename=icetrait-0.1.dev100+g8e61be6.d20230530-py3-none-any.whl size=17971 sha256=1bf90a8eddbcde1faeee07d6e265438907d8f738010863f221ee7c57e02c1c4d\n", | |
| " Stored in directory: /tmp/pip-ephem-wheel-cache-se_84_dq/wheels/8e/cd/16/abf0e472534fbf163df862e29f6626f2fb1a1b3a1d2449122f\n", | |
| "Successfully built icetrait\n", | |
| "Installing collected packages: icetrait\n", | |
| " Attempting uninstall: icetrait\n", | |
| " Found existing installation: icetrait 0.1.dev19+g2ab59e6.d20230512\n", | |
| " Uninstalling icetrait-0.1.dev19+g2ab59e6.d20230512:\n", | |
| " Successfully uninstalled icetrait-0.1.dev19+g2ab59e6.d20230512\n", | |
| "Successfully installed icetrait-0.1.dev100+g8e61be6.d20230530\n", | |
| "\u001b[33mWARNING: Running pip as the 'root' user can result in broken permissions and conflicting behaviour with the system package manager. It is recommended to use a virtual environment instead: https://pip.pypa.io/warnings/venv\u001b[0m\u001b[33m\n", | |
| "\u001b[0m\u001b[33mWARNING: You are using pip version 22.0.4; however, version 23.1.2 is available.\n", | |
| "You should consider upgrading via the '/usr/local/bin/python -m pip install --upgrade pip' command.\u001b[0m\u001b[33m\n", | |
| "\u001b[0m" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!pip install git+https://github.com/vibhatha/pyiceberg_substrait@feat-schema-evolution-s1#egg=icetrait" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "4607c841", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import os\n", | |
| "os.environ['ICETRAIT_LOG_DIR'] = '/home/iceberg/notebooks/icetrait_logs'\n", | |
| "os.getenv(\"ICETRAIT_LOG_DIR\")\n", | |
| "os.environ['ICETRAIT_LOGGING'] = 'DISABLE'" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "541b68ff", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Log directory `/home/iceberg/notebooks/icetrait_logs` already exists\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "from icetrait.substrait.visitor import SubstraitPlanEditor, visit_and_update, RelVisitor, RelUpdateVisitor\n", | |
| "from icetrait.duckdb.wrapper import DuckdbSubstrait\n", | |
| "import duckdb" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "bdf506f9", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/05/30 03:53:23 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "\n", | |
| " <div>\n", | |
| " <p><b>SparkSession - in-memory</b></p>\n", | |
| " \n", | |
| " <div>\n", | |
| " <p><b>SparkContext</b></p>\n", | |
| "\n", | |
| " <p><a href=\"http://a7cb6348cea3:4042\">Spark UI</a></p>\n", | |
| "\n", | |
| " <dl>\n", | |
| " <dt>Version</dt>\n", | |
| " <dd><code>v3.3.2</code></dd>\n", | |
| " <dt>Master</dt>\n", | |
| " <dd><code>local[*]</code></dd>\n", | |
| " <dt>AppName</dt>\n", | |
| " <dd><code>PySparkShell</code></dd>\n", | |
| " </dl>\n", | |
| " </div>\n", | |
| " \n", | |
| " </div>\n", | |
| " " | |
| ], | |
| "text/plain": [ | |
| "<pyspark.sql.session.SparkSession at 0x7f4d68acf9a0>" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import findspark\n", | |
| "findspark.init()\n", | |
| "\n", | |
| "from pyspark.sql import SparkSession\n", | |
| "\n", | |
| "spark = (\n", | |
| " SparkSession.builder.master(\"local\")\n", | |
| " .appName(\"IcebergPySpark\")\n", | |
| " .config(\"spark.sql.extensions\", \"org.apache.iceberg.spark.extensions.IcebergSparkSessionExtensions\")\n", | |
| " .config(\"spark.sql.catalog.demo\", \"org.apache.iceberg.spark.SparkCatalog\")\n", | |
| " .config(\"spark.sql.catalog.demo.catalog-impl\", \"org.apache.iceberg.rest.RESTCatalog\")\n", | |
| " .config(\"spark.sql.catalog.demo.uri\", \"http://rest:8181\")\n", | |
| " .config(\"spark.sql.catalog.demo.s3.endpoint\", \"http://minio:9000\")\n", | |
| " .config(\"spark.sql.defaultCatalog\", \"demo\")\n", | |
| " .config(\"spark.eventLog.enabled\", \"true\")\n", | |
| " .config(\"spark.eventLog.dir\", \"/home/iceberg/spark-events\")\n", | |
| " .config(\"spark.history.fs.logDirectory\", \"/home/iceberg/spark-events\")\n", | |
| " .config(\"spark.sql.catalogImplementation\", \"/home/iceberg/spark-events\")\n", | |
| " .getOrCreate()\n", | |
| ")\n", | |
| "spark" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "41bf769e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/05/30 03:53:32 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "++\n", | |
| "||\n", | |
| "++\n", | |
| "++" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%sql\n", | |
| "\n", | |
| "DROP TABLE IF EXISTS nyc_demo.taxis_sample;" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "05b978ea", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/05/30 03:53:39 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "++\n", | |
| "||\n", | |
| "++\n", | |
| "++" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%sql\n", | |
| "\n", | |
| "CREATE DATABASE IF NOT EXISTS nyc_demo;" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "50d10a24", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/05/30 03:53:51 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " <th>namespace</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td>nyc_demo</td>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "+-----------+\n", | |
| "| namespace |\n", | |
| "+-----------+\n", | |
| "| nyc_demo |\n", | |
| "+-----------+" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%sql\n", | |
| "\n", | |
| "show databases;" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "6dd0ec69", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "SLF4J: Failed to load class \"org.slf4j.impl.StaticLoggerBinder\".\n", | |
| "SLF4J: Defaulting to no-operation (NOP) logger implementation\n", | |
| "SLF4J: See http://www.slf4j.org/codes.html#StaticLoggerBinder for further details.\n", | |
| " \r" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df = spark.read.parquet(\"/home/iceberg/data/yellow_tripdata_2021-04.parquet\")\n", | |
| "df_small = df.limit(500)\n", | |
| "df_small = df.select(['VendorID',\n", | |
| " 'passenger_count',\n", | |
| " 'trip_distance',\n", | |
| " 'RatecodeID',\n", | |
| " 'store_and_fwd_flag',\n", | |
| " 'PULocationID',\n", | |
| " 'DOLocationID',\n", | |
| " 'payment_type',\n", | |
| " 'fare_amount',\n", | |
| " 'extra',\n", | |
| " 'mta_tax',\n", | |
| " 'tip_amount',\n", | |
| " 'tolls_amount',\n", | |
| " 'improvement_surcharge',\n", | |
| " 'total_amount',\n", | |
| " 'congestion_surcharge',\n", | |
| " 'airport_fee'])\n", | |
| "df_small.write.saveAsTable(\"nyc_demo.taxis_sample\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "3a8704c8", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/05/30 03:54:30 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n", | |
| "23/05/30 03:54:30 WARN BaseTransaction: Failed to load metadata for a committed snapshot, skipping clean-up\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "++\n", | |
| "||\n", | |
| "++\n", | |
| "++" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%sql\n", | |
| "\n", | |
| "ALTER TABLE nyc_demo.taxis_sample\n", | |
| "ADD COLUMN fare_per_distance_unit float AFTER trip_distance" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 30, | |
| "id": "7d7be038", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| " \r" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "spark_s_query = \"SELECT * FROM nyc_demo.taxis_sample;\"\n", | |
| "df_sql = spark.sql(spark_s_query)\n", | |
| "spark_s_df = df_sql.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 31, | |
| "id": "4e4d822f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 1.0 8.40 NaN \n", | |
| "1 1 1.0 0.90 NaN \n", | |
| "2 1 1.0 3.40 NaN \n", | |
| "3 1 1.0 0.00 NaN \n", | |
| "4 2 1.0 1.96 NaN \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 1.0 N 79 116 1 \n", | |
| "1 1.0 N 75 236 2 \n", | |
| "2 1.0 N 236 168 2 \n", | |
| "3 1.0 N 47 61 1 \n", | |
| "4 1.0 N 238 152 1 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 25.5 3.0 0.5 5.85 0.0 \n", | |
| "1 5.0 3.0 0.5 0.00 0.0 \n", | |
| "2 11.5 3.0 0.5 0.00 0.0 \n", | |
| "3 44.2 0.0 0.5 0.00 0.0 \n", | |
| "4 9.0 0.5 0.5 3.09 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.3 35.15 2.5 0.0 \n", | |
| "1 0.3 8.80 2.5 0.0 \n", | |
| "2 0.3 15.30 2.5 0.0 \n", | |
| "3 0.3 45.00 0.0 0.0 \n", | |
| "4 0.3 13.39 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 31, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "spark_s_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 32, | |
| "id": "61c7da73", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from pyiceberg.catalog import load_catalog\n", | |
| "iceberg_catalog = load_catalog('default')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "id": "55cc73c9", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "iceberg_table = iceberg_catalog.load_table(\"nyc_demo.taxis_sample\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "id": "b6d04355", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 1.0 8.40 NaN \n", | |
| "1 1 1.0 0.90 NaN \n", | |
| "2 1 1.0 3.40 NaN \n", | |
| "3 1 1.0 0.00 NaN \n", | |
| "4 2 1.0 1.96 NaN \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 1.0 N 79 116 1 \n", | |
| "1 1.0 N 75 236 2 \n", | |
| "2 1.0 N 236 168 2 \n", | |
| "3 1.0 N 47 61 1 \n", | |
| "4 1.0 N 238 152 1 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 25.5 3.0 0.5 5.85 0.0 \n", | |
| "1 5.0 3.0 0.5 0.00 0.0 \n", | |
| "2 11.5 3.0 0.5 0.00 0.0 \n", | |
| "3 44.2 0.0 0.5 0.00 0.0 \n", | |
| "4 9.0 0.5 0.5 3.09 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.3 35.15 2.5 0.0 \n", | |
| "1 0.3 8.80 2.5 0.0 \n", | |
| "2 0.3 15.30 2.5 0.0 \n", | |
| "3 0.3 45.00 0.0 0.0 \n", | |
| "4 0.3 13.39 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "iceberg_df = iceberg_table.scan().to_pandas()\n", | |
| "iceberg_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "id": "7ec460e3", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "assert iceberg_df.equals(spark_s_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "id": "04ed4d3e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "mkdir: cannot create directory ‘/home/iceberg/notebooks/s3’: File exists\r\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "!mkdir /home/iceberg/notebooks/s3" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "id": "f248ef58", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\"\"\"\n", | |
| "We expect the user to pass the evolved schema.\n", | |
| "\"\"\"\n", | |
| "## TODO: I think we can probably keep a single connection and use the update query to Spark applied on\n", | |
| "## the duckdb connector and update the table.\n", | |
| "def setup_duckdb():\n", | |
| " con = duckdb.connect()\n", | |
| " create_schema = \"CREATE SCHEMA nyc_demo;\"\n", | |
| " creation_query = \"\"\"\n", | |
| " CREATE TABLE nyc_demo.taxis_sample (\n", | |
| " VendorID bigint,\n", | |
| " passenger_count double,\n", | |
| " trip_distance double,\n", | |
| " fare_per_distance_unit float,\n", | |
| " RatecodeID double,\n", | |
| " store_and_fwd_flag string,\n", | |
| " PULocationID bigint,\n", | |
| " DOLocationID bigint,\n", | |
| " payment_type bigint,\n", | |
| " fare_amount double,\n", | |
| " extra double,\n", | |
| " mta_tax double,\n", | |
| " tip_amount double,\n", | |
| " tolls_amount double,\n", | |
| " improvement_surcharge double,\n", | |
| " total_amount double,\n", | |
| " congestion_surcharge double,\n", | |
| " airport_fee double\n", | |
| " );\n", | |
| " \"\"\"\n", | |
| " con.execute(create_schema)\n", | |
| " con.execute(creation_query)\n", | |
| " return con" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "id": "b564d228", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "wrapper = DuckdbSubstrait(\"default\", \"/home/iceberg/notebooks/s3\", \"nyc_demo\", spark_s_query, setup_duckdb)\n", | |
| "wrapper.update_named_table_with_schema()\n", | |
| "wrapper.update_with_local_file_paths()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "id": "603ad696", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "duckdb_res = wrapper.execute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "id": "c233e737", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 1.0 8.40 1.0 \n", | |
| "1 1 1.0 0.90 1.0 \n", | |
| "2 1 1.0 3.40 1.0 \n", | |
| "3 1 1.0 0.00 1.0 \n", | |
| "4 2 1.0 1.96 1.0 \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 1.0 N 79 116 1 \n", | |
| "1 1.0 N 75 236 2 \n", | |
| "2 1.0 N 236 168 2 \n", | |
| "3 1.0 N 47 61 1 \n", | |
| "4 1.0 N 238 152 1 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 25.5 3.0 0.5 5.85 0.0 \n", | |
| "1 5.0 3.0 0.5 0.00 0.0 \n", | |
| "2 11.5 3.0 0.5 0.00 0.0 \n", | |
| "3 44.2 0.0 0.5 0.00 0.0 \n", | |
| "4 9.0 0.5 0.5 3.09 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.3 35.15 2.5 0.0 \n", | |
| "1 0.3 8.80 2.5 0.0 \n", | |
| "2 0.3 15.30 2.5 0.0 \n", | |
| "3 0.3 45.00 0.0 0.0 \n", | |
| "4 0.3 13.39 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 43, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_duckdb_substrait = duckdb_res.to_df()\n", | |
| "df_duckdb_substrait.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "id": "02f6d4d9", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 1.0 8.40 NaN \n", | |
| "1 1 1.0 0.90 NaN \n", | |
| "2 1 1.0 3.40 NaN \n", | |
| "3 1 1.0 0.00 NaN \n", | |
| "4 2 1.0 1.96 NaN \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 1.0 N 79 116 1 \n", | |
| "1 1.0 N 75 236 2 \n", | |
| "2 1.0 N 236 168 2 \n", | |
| "3 1.0 N 47 61 1 \n", | |
| "4 1.0 N 238 152 1 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 25.5 3.0 0.5 5.85 0.0 \n", | |
| "1 5.0 3.0 0.5 0.00 0.0 \n", | |
| "2 11.5 3.0 0.5 0.00 0.0 \n", | |
| "3 44.2 0.0 0.5 0.00 0.0 \n", | |
| "4 9.0 0.5 0.5 3.09 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.3 35.15 2.5 0.0 \n", | |
| "1 0.3 8.80 2.5 0.0 \n", | |
| "2 0.3 15.30 2.5 0.0 \n", | |
| "3 0.3 45.00 0.0 0.0 \n", | |
| "4 0.3 13.39 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "spark_s_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "id": "ed2243f9", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "spark_df_selected = spark_s_df\n", | |
| "duckdb_df_selected = df_duckdb_substrait" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "id": "10c01c95", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "spark_df_selected = spark_df_selected.drop(columns=['fare_per_distance_unit'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "id": "e0fa7fa7", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "duckdb_df_selected = duckdb_df_selected.drop(columns=['fare_per_distance_unit'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 51, | |
| "id": "a6b964bb", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "assert spark_df_selected.equals(duckdb_df_selected)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "197b3965", | |
| "metadata": {}, | |
| "source": [ | |
| "Seems like the new column added as `fare_per_distance_unit` contains a default value 1.0 DuckDbWrapper but in the Spark response we have `NaN` which should be the expected response. \n", | |
| "\n", | |
| "Let's see if this happens in the DuckDb end or are we doing something incorrect?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "780a1fe5", | |
| "metadata": {}, | |
| "source": [ | |
| "Analyze the files related to this query via PyIceberg API" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 94, | |
| "id": "61e463fc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from pyiceberg.io.pyarrow import PyArrowFileIO\n", | |
| "import pyarrow.dataset as ds\n", | |
| "ONE_MEGABYTE = 1024 * 1024" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 95, | |
| "id": "5dc737e9", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "iceberg_scanner = iceberg_table.scan()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 96, | |
| "id": "a761e66c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tasks = iceberg_scanner.plan_files()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 101, | |
| "id": "ef5798dd", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "tasks = iceberg_scanner.plan_files()\n", | |
| "scheme, _ = PyArrowFileIO.parse_location(iceberg_table.location())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 102, | |
| "id": "7c8ff345", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance RatecodeID store_and_fwd_flag \\\n", | |
| "0 1 1.0 8.40 1.0 N \n", | |
| "1 1 1.0 0.90 1.0 N \n", | |
| "2 1 1.0 3.40 1.0 N \n", | |
| "3 1 1.0 0.00 1.0 N \n", | |
| "4 2 1.0 1.96 1.0 N \n", | |
| "\n", | |
| " PULocationID DOLocationID payment_type fare_amount extra mta_tax \\\n", | |
| "0 79 116 1 25.5 3.0 0.5 \n", | |
| "1 75 236 2 5.0 3.0 0.5 \n", | |
| "2 236 168 2 11.5 3.0 0.5 \n", | |
| "3 47 61 1 44.2 0.0 0.5 \n", | |
| "4 238 152 1 9.0 0.5 0.5 \n", | |
| "\n", | |
| " tip_amount tolls_amount improvement_surcharge total_amount \\\n", | |
| "0 5.85 0.0 0.3 35.15 \n", | |
| "1 0.00 0.0 0.3 8.80 \n", | |
| "2 0.00 0.0 0.3 15.30 \n", | |
| "3 0.00 0.0 0.3 45.00 \n", | |
| "4 3.09 0.0 0.3 13.39 \n", | |
| "\n", | |
| " congestion_surcharge airport_fee \n", | |
| "0 2.5 0.0 \n", | |
| "1 2.5 0.0 \n", | |
| "2 2.5 0.0 \n", | |
| "3 0.0 0.0 \n", | |
| "4 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 102, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "## here we are extracting data by means of file on disk. \n", | |
| "## It seems that the file \n", | |
| "\n", | |
| "if isinstance(iceberg_table.io, PyArrowFileIO):\n", | |
| " fs = iceberg_table.io.get_fs(scheme)\n", | |
| "download_paths = []\n", | |
| "extensions = []\n", | |
| "for task in tasks: \n", | |
| " _, parquet_file_path = PyArrowFileIO.parse_location(task.file.file_path)\n", | |
| " arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))\n", | |
| " with fs.open_input_file(parquet_file_path) as fin:\n", | |
| " fragment = arrow_format.make_fragment(fin)\n", | |
| " physical_schema = fragment.physical_schema\n", | |
| " pyarrow_filter = None\n", | |
| " fragment_scanner = ds.Scanner.from_fragment(\n", | |
| " fragment=fragment,\n", | |
| " schema=physical_schema,\n", | |
| " filter=pyarrow_filter,\n", | |
| " )\n", | |
| " arrow_table = fragment_scanner.to_table()\n", | |
| "debug_df = arrow_table.to_pandas()\n", | |
| "debug_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 106, | |
| "id": "58a54299", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "assert 'fare_per_distance_unit' not in physical_schema.names" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "8f3fae35", | |
| "metadata": {}, | |
| "source": [ | |
| "This record is not written to the file since the values are not updated. But it is returning default values. \n", | |
| "Also by visiting http://localhost:9001 we can checkout the files written to the disk. It also has just a single file. We cannot see two files in this situation.\n", | |
| "\n", | |
| "Now let's do a manual update on values and see what is happening. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 109, | |
| "id": "0670e304", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "23/05/30 04:38:08 WARN SparkSession: Using an existing Spark session; only runtime SQL configurations will take effect.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "++\n", | |
| "||\n", | |
| "++\n", | |
| "++" | |
| ] | |
| }, | |
| "execution_count": 109, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%sql\n", | |
| "\n", | |
| "\n", | |
| "INSERT INTO nyc_demo.taxis_sample (\n", | |
| " VendorID,\n", | |
| " passenger_count,trip_distance,\n", | |
| " fare_per_distance_unit,\n", | |
| " RatecodeID,\n", | |
| " store_and_fwd_flag,\n", | |
| " PULocationID,\n", | |
| " DOLocationID,\n", | |
| " payment_type,\n", | |
| " fare_amount,\n", | |
| " extra,\n", | |
| " mta_tax,\n", | |
| " tip_amount,\n", | |
| " tolls_amount,\n", | |
| " improvement_surcharge,\n", | |
| " total_amount,\n", | |
| " congestion_surcharge,\n", | |
| " airport_fee) VALUES \n", | |
| "(1, null, null, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0);" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 118, | |
| "id": "8186ced1", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "spark_insert_query = \"SELECT * FROM nyc_demo.taxis_sample;\"" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 119, | |
| "id": "ab51cee3", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| " \r" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "df_after_insert_sql = spark.sql(spark_s_query)\n", | |
| "spark_after_insert_df = df_after_insert_sql.toPandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 121, | |
| "id": "94eae55d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 1.0 8.40 NaN \n", | |
| "1 1 1.0 0.90 NaN \n", | |
| "2 1 1.0 3.40 NaN \n", | |
| "3 1 1.0 0.00 NaN \n", | |
| "4 2 1.0 1.96 NaN \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 1.0 N 79 116 1 \n", | |
| "1 1.0 N 75 236 2 \n", | |
| "2 1.0 N 236 168 2 \n", | |
| "3 1.0 N 47 61 1 \n", | |
| "4 1.0 N 238 152 1 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 25.5 3.0 0.5 5.85 0.0 \n", | |
| "1 5.0 3.0 0.5 0.00 0.0 \n", | |
| "2 11.5 3.0 0.5 0.00 0.0 \n", | |
| "3 44.2 0.0 0.5 0.00 0.0 \n", | |
| "4 9.0 0.5 0.5 3.09 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.3 35.15 2.5 0.0 \n", | |
| "1 0.3 8.80 2.5 0.0 \n", | |
| "2 0.3 15.30 2.5 0.0 \n", | |
| "3 0.3 45.00 0.0 0.0 \n", | |
| "4 0.3 13.39 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 121, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "spark_after_insert_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 124, | |
| "id": "ca663fab", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "iceberg_catalog = load_catalog('default')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 125, | |
| "id": "c32dfac5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "iceberg_insert_table = iceberg_catalog.load_table(\"nyc_demo.taxis_sample\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 126, | |
| "id": "6881dd7b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.4</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.9</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.4</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 NaN NaN 0.0 \n", | |
| "1 1 1.0 8.4 NaN \n", | |
| "2 1 1.0 0.9 NaN \n", | |
| "3 1 1.0 3.4 NaN \n", | |
| "4 1 1.0 0.0 NaN \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 0.0 0 0 0 0 \n", | |
| "1 1.0 N 79 116 1 \n", | |
| "2 1.0 N 75 236 2 \n", | |
| "3 1.0 N 236 168 2 \n", | |
| "4 1.0 N 47 61 1 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 0.0 0.0 0.0 0.00 0.0 \n", | |
| "1 25.5 3.0 0.5 5.85 0.0 \n", | |
| "2 5.0 3.0 0.5 0.00 0.0 \n", | |
| "3 11.5 3.0 0.5 0.00 0.0 \n", | |
| "4 44.2 0.0 0.5 0.00 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.0 0.00 0.0 0.0 \n", | |
| "1 0.3 35.15 2.5 0.0 \n", | |
| "2 0.3 8.80 2.5 0.0 \n", | |
| "3 0.3 15.30 2.5 0.0 \n", | |
| "4 0.3 45.00 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 126, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "iceberg_insert_df = iceberg_insert_table.scan().to_pandas()\n", | |
| "iceberg_insert_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 128, | |
| "id": "2bc21475", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(2171188, 2171188)" | |
| ] | |
| }, | |
| "execution_count": 128, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(iceberg_insert_df), len(spark_after_insert_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 131, | |
| "id": "680fe65b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "wrapper = DuckdbSubstrait(\"default\", \"/home/iceberg/notebooks/s3\", \"nyc_demo\", spark_insert_query, setup_duckdb)\n", | |
| "wrapper.update_named_table_with_schema()\n", | |
| "wrapper.update_with_local_file_paths()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 132, | |
| "id": "0eec1367", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "NotImplementedException", | |
| "evalue": "Not implemented Error: Can't handle more than one file in the read operator of substrait", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mNotImplementedException\u001b[0m Traceback (most recent call last)", | |
| "Cell \u001b[0;32mIn[132], line 1\u001b[0m\n\u001b[0;32m----> 1\u001b[0m duckdb_res \u001b[38;5;241m=\u001b[39m \u001b[43mwrapper\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mexecute\u001b[49m\u001b[43m(\u001b[49m\u001b[43m)\u001b[49m\n", | |
| "File \u001b[0;32m/usr/local/lib/python3.9/site-packages/icetrait/duckdb/wrapper.py:250\u001b[0m, in \u001b[0;36mDuckdbSubstrait.execute\u001b[0;34m(self)\u001b[0m\n\u001b[1;32m 247\u001b[0m \u001b[38;5;28;01mdef\u001b[39;00m \u001b[38;5;21mexecute\u001b[39m(\u001b[38;5;28mself\u001b[39m):\n\u001b[1;32m 248\u001b[0m \u001b[38;5;66;03m# run the updated Substrait plan with DuckDb\u001b[39;00m\n\u001b[1;32m 249\u001b[0m proto_bytes \u001b[38;5;241m=\u001b[39m \u001b[38;5;28mself\u001b[39m\u001b[38;5;241m.\u001b[39m_updated_plan\u001b[38;5;241m.\u001b[39mSerializeToString()\n\u001b[0;32m--> 250\u001b[0m query_result \u001b[38;5;241m=\u001b[39m \u001b[38;5;28;43mself\u001b[39;49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43m_con\u001b[49m\u001b[38;5;241;43m.\u001b[39;49m\u001b[43mfrom_substrait\u001b[49m\u001b[43m(\u001b[49m\u001b[43mproto\u001b[49m\u001b[38;5;241;43m=\u001b[39;49m\u001b[43mproto_bytes\u001b[49m\u001b[43m)\u001b[49m\n\u001b[1;32m 251\u001b[0m \u001b[38;5;28;01mreturn\u001b[39;00m query_result\n", | |
| "\u001b[0;31mNotImplementedException\u001b[0m: Not implemented Error: Can't handle more than one file in the read operator of substrait" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "duckdb_res = wrapper.execute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "332ddec0", | |
| "metadata": {}, | |
| "source": [ | |
| "**LIMITATION IN DUCKDB SUBSTRAIT** \n", | |
| "\n", | |
| "https://github.com/duckdblabs/substrait/blob/main/src/from_substrait.cpp#L457-L461\n", | |
| "\n", | |
| "This is not an error occurred in `icetrait` rather it is a limitation in the Duckdb-Substrait library" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 143, | |
| "id": "8e2d4b3b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "relations {\n", | |
| " root {\n", | |
| " input {\n", | |
| " project {\n", | |
| " input {\n", | |
| " read {\n", | |
| " base_schema {\n", | |
| " names: \"VendorID\"\n", | |
| " names: \"passenger_count\"\n", | |
| " names: \"trip_distance\"\n", | |
| " names: \"fare_per_distance_unit\"\n", | |
| " names: \"RatecodeID\"\n", | |
| " names: \"store_and_fwd_flag\"\n", | |
| " names: \"PULocationID\"\n", | |
| " names: \"DOLocationID\"\n", | |
| " names: \"payment_type\"\n", | |
| " names: \"fare_amount\"\n", | |
| " names: \"extra\"\n", | |
| " names: \"mta_tax\"\n", | |
| " names: \"tip_amount\"\n", | |
| " names: \"tolls_amount\"\n", | |
| " names: \"improvement_surcharge\"\n", | |
| " names: \"total_amount\"\n", | |
| " names: \"congestion_surcharge\"\n", | |
| " names: \"airport_fee\"\n", | |
| " struct {\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp32 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " varchar {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " nullability: NULLABILITY_REQUIRED\n", | |
| " }\n", | |
| " }\n", | |
| " projection {\n", | |
| " select {\n", | |
| " struct_items {\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 1\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 2\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 3\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 4\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 5\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 6\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 7\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 8\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 9\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 10\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 11\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 12\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 13\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 14\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 15\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 16\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 17\n", | |
| " }\n", | |
| " }\n", | |
| " maintain_singular_struct: true\n", | |
| " }\n", | |
| " named_table {\n", | |
| " names: \"taxis_sample\"\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 1\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 2\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 3\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 4\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 5\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 6\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 7\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 8\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 9\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 10\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 11\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 12\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 13\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 14\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 15\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 16\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 17\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " names: \"VendorID\"\n", | |
| " names: \"passenger_count\"\n", | |
| " names: \"trip_distance\"\n", | |
| " names: \"fare_per_distance_unit\"\n", | |
| " names: \"RatecodeID\"\n", | |
| " names: \"store_and_fwd_flag\"\n", | |
| " names: \"PULocationID\"\n", | |
| " names: \"DOLocationID\"\n", | |
| " names: \"payment_type\"\n", | |
| " names: \"fare_amount\"\n", | |
| " names: \"extra\"\n", | |
| " names: \"mta_tax\"\n", | |
| " names: \"tip_amount\"\n", | |
| " names: \"tolls_amount\"\n", | |
| " names: \"improvement_surcharge\"\n", | |
| " names: \"total_amount\"\n", | |
| " names: \"congestion_surcharge\"\n", | |
| " names: \"airport_fee\"\n", | |
| " }\n", | |
| "}" | |
| ] | |
| }, | |
| "execution_count": 143, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "wrapper.plan" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 144, | |
| "id": "18356e75", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "relations {\n", | |
| " root {\n", | |
| " input {\n", | |
| " project {\n", | |
| " input {\n", | |
| " read {\n", | |
| " base_schema {\n", | |
| " names: \"VendorID\"\n", | |
| " names: \"passenger_count\"\n", | |
| " names: \"trip_distance\"\n", | |
| " names: \"fare_per_distance_unit\"\n", | |
| " names: \"RatecodeID\"\n", | |
| " names: \"store_and_fwd_flag\"\n", | |
| " names: \"PULocationID\"\n", | |
| " names: \"DOLocationID\"\n", | |
| " names: \"payment_type\"\n", | |
| " names: \"fare_amount\"\n", | |
| " names: \"extra\"\n", | |
| " names: \"mta_tax\"\n", | |
| " names: \"tip_amount\"\n", | |
| " names: \"tolls_amount\"\n", | |
| " names: \"improvement_surcharge\"\n", | |
| " names: \"total_amount\"\n", | |
| " names: \"congestion_surcharge\"\n", | |
| " names: \"airport_fee\"\n", | |
| " struct {\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp32 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " varchar {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " i64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " types {\n", | |
| " fp64 {\n", | |
| " nullability: NULLABILITY_NULLABLE\n", | |
| " }\n", | |
| " }\n", | |
| " nullability: NULLABILITY_REQUIRED\n", | |
| " }\n", | |
| " }\n", | |
| " projection {\n", | |
| " select {\n", | |
| " struct_items {\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 1\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 2\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 3\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 4\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 5\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 6\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 7\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 8\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 9\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 10\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 11\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 12\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 13\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 14\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 15\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 16\n", | |
| " }\n", | |
| " struct_items {\n", | |
| " field: 17\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " local_files {\n", | |
| " items {\n", | |
| " uri_file: \"/home/iceberg/notebooks/s3/00000-12-5f4d82c5-670d-4661-ba28-37288d33234d-00001.parquet\"\n", | |
| " parquet {\n", | |
| " }\n", | |
| " }\n", | |
| " items {\n", | |
| " uri_file: \"/home/iceberg/notebooks/s3/00004-5-6548d021-dd6b-4891-850a-7c29c6a35f2c-00001.parquet\"\n", | |
| " parquet {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 1\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 2\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 3\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 4\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 5\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 6\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 7\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 8\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 9\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 10\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 11\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 12\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 13\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 14\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 15\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 16\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " expressions {\n", | |
| " selection {\n", | |
| " direct_reference {\n", | |
| " struct_field {\n", | |
| " field: 17\n", | |
| " }\n", | |
| " }\n", | |
| " root_reference {\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " }\n", | |
| " names: \"VendorID\"\n", | |
| " names: \"passenger_count\"\n", | |
| " names: \"trip_distance\"\n", | |
| " names: \"fare_per_distance_unit\"\n", | |
| " names: \"RatecodeID\"\n", | |
| " names: \"store_and_fwd_flag\"\n", | |
| " names: \"PULocationID\"\n", | |
| " names: \"DOLocationID\"\n", | |
| " names: \"payment_type\"\n", | |
| " names: \"fare_amount\"\n", | |
| " names: \"extra\"\n", | |
| " names: \"mta_tax\"\n", | |
| " names: \"tip_amount\"\n", | |
| " names: \"tolls_amount\"\n", | |
| " names: \"improvement_surcharge\"\n", | |
| " names: \"total_amount\"\n", | |
| " names: \"congestion_surcharge\"\n", | |
| " names: \"airport_fee\"\n", | |
| " }\n", | |
| "}" | |
| ] | |
| }, | |
| "execution_count": 144, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "wrapper.updated_plan" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "0a4b2583", | |
| "metadata": {}, | |
| "source": [ | |
| "Looking at the updated plan, we can see that it is accurately updated with the two files. \n", | |
| "Also let's take a look at the following trace in seeing the content in the files associated\n", | |
| "with the most recent version of the evolved schema. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 148, | |
| "id": "39ba430e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "iceberg_insert_scanner = iceberg_insert_table.scan()\n", | |
| "tasks = iceberg_insert_scanner.plan_files()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 149, | |
| "id": "fa499eb2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "warehouse/nyc_demo/taxis_sample/data/00000-12-5f4d82c5-670d-4661-ba28-37288d33234d-00001.parquet\n", | |
| "warehouse/nyc_demo/taxis_sample/data/00004-5-6548d021-dd6b-4891-850a-7c29c6a35f2c-00001.parquet\n", | |
| "1\n", | |
| "--------------------------------------------------------------------------------\n", | |
| "2171187\n", | |
| "--------------------------------------------------------------------------------\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "if isinstance(iceberg_table.io, PyArrowFileIO):\n", | |
| " fs = iceberg_table.io.get_fs(scheme)\n", | |
| "download_paths = []\n", | |
| "extensions = []\n", | |
| "out_tables = []\n", | |
| "for task in tasks: \n", | |
| " _, parquet_file_path = PyArrowFileIO.parse_location(task.file.file_path)\n", | |
| " arrow_format = ds.ParquetFileFormat(pre_buffer=True, buffer_size=(ONE_MEGABYTE * 8))\n", | |
| " print(parquet_file_path)\n", | |
| " with fs.open_input_file(parquet_file_path) as fin:\n", | |
| " fragment = arrow_format.make_fragment(fin)\n", | |
| " physical_schema = fragment.physical_schema\n", | |
| " pyarrow_filter = None\n", | |
| " fragment_scanner = ds.Scanner.from_fragment(\n", | |
| " fragment=fragment,\n", | |
| " schema=physical_schema,\n", | |
| " filter=pyarrow_filter,\n", | |
| " )\n", | |
| " arrow_table = fragment_scanner.to_table()\n", | |
| " out_tables.append(arrow_table)\n", | |
| "\n", | |
| "for table in out_tables:\n", | |
| " out_df = table.to_pandas()\n", | |
| " print(len(out_df))\n", | |
| " print(\"-\" * 80)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "8646e4aa", | |
| "metadata": {}, | |
| "source": [ | |
| "See we have to files as same as in the updated Substrait plan. The file names are also same. This shows we are consuming the PyIceberg API accurately. " | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 141, | |
| "id": "9ec8157f", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>fare_per_distance_unit</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance fare_per_distance_unit \\\n", | |
| "0 1 NaN NaN 0.0 \n", | |
| "\n", | |
| " RatecodeID store_and_fwd_flag PULocationID DOLocationID payment_type \\\n", | |
| "0 0.0 0 0 0 0 \n", | |
| "\n", | |
| " fare_amount extra mta_tax tip_amount tolls_amount \\\n", | |
| "0 0.0 0.0 0.0 0.0 0.0 \n", | |
| "\n", | |
| " improvement_surcharge total_amount congestion_surcharge airport_fee \n", | |
| "0 0.0 0.0 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 141, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "out_tables[0].to_pandas().head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 142, | |
| "id": "a6bc4544", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>VendorID</th>\n", | |
| " <th>passenger_count</th>\n", | |
| " <th>trip_distance</th>\n", | |
| " <th>RatecodeID</th>\n", | |
| " <th>store_and_fwd_flag</th>\n", | |
| " <th>PULocationID</th>\n", | |
| " <th>DOLocationID</th>\n", | |
| " <th>payment_type</th>\n", | |
| " <th>fare_amount</th>\n", | |
| " <th>extra</th>\n", | |
| " <th>mta_tax</th>\n", | |
| " <th>tip_amount</th>\n", | |
| " <th>tolls_amount</th>\n", | |
| " <th>improvement_surcharge</th>\n", | |
| " <th>total_amount</th>\n", | |
| " <th>congestion_surcharge</th>\n", | |
| " <th>airport_fee</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>8.40</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>79</td>\n", | |
| " <td>116</td>\n", | |
| " <td>1</td>\n", | |
| " <td>25.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>5.85</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>35.15</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.90</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>75</td>\n", | |
| " <td>236</td>\n", | |
| " <td>2</td>\n", | |
| " <td>5.0</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>8.80</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>3.40</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>236</td>\n", | |
| " <td>168</td>\n", | |
| " <td>2</td>\n", | |
| " <td>11.5</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>15.30</td>\n", | |
| " <td>2.5</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>1</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>47</td>\n", | |
| " <td>61</td>\n", | |
| " <td>1</td>\n", | |
| " <td>44.2</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>45.00</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>1.96</td>\n", | |
| " <td>1.0</td>\n", | |
| " <td>N</td>\n", | |
| " <td>238</td>\n", | |
| " <td>152</td>\n", | |
| " <td>1</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>0.5</td>\n", | |
| " <td>3.09</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.3</td>\n", | |
| " <td>13.39</td>\n", | |
| " <td>0.0</td>\n", | |
| " <td>0.0</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " VendorID passenger_count trip_distance RatecodeID store_and_fwd_flag \\\n", | |
| "0 1 1.0 8.40 1.0 N \n", | |
| "1 1 1.0 0.90 1.0 N \n", | |
| "2 1 1.0 3.40 1.0 N \n", | |
| "3 1 1.0 0.00 1.0 N \n", | |
| "4 2 1.0 1.96 1.0 N \n", | |
| "\n", | |
| " PULocationID DOLocationID payment_type fare_amount extra mta_tax \\\n", | |
| "0 79 116 1 25.5 3.0 0.5 \n", | |
| "1 75 236 2 5.0 3.0 0.5 \n", | |
| "2 236 168 2 11.5 3.0 0.5 \n", | |
| "3 47 61 1 44.2 0.0 0.5 \n", | |
| "4 238 152 1 9.0 0.5 0.5 \n", | |
| "\n", | |
| " tip_amount tolls_amount improvement_surcharge total_amount \\\n", | |
| "0 5.85 0.0 0.3 35.15 \n", | |
| "1 0.00 0.0 0.3 8.80 \n", | |
| "2 0.00 0.0 0.3 15.30 \n", | |
| "3 0.00 0.0 0.3 45.00 \n", | |
| "4 3.09 0.0 0.3 13.39 \n", | |
| "\n", | |
| " congestion_surcharge airport_fee \n", | |
| "0 2.5 0.0 \n", | |
| "1 2.5 0.0 \n", | |
| "2 2.5 0.0 \n", | |
| "3 0.0 0.0 \n", | |
| "4 0.0 0.0 " | |
| ] | |
| }, | |
| "execution_count": 142, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "out_tables[1].to_pandas().head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "6c9c0396", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.16" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment