Created
February 8, 2024 14:53
-
-
Save mrocklin/c97bdc584c8568dc07c48b2c38c00fd2 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "6caaa3f8-b713-4188-b237-557d071fa14d", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>name</th>\n", | |
| " <th>id</th>\n", | |
| " <th>x</th>\n", | |
| " <th>y</th>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>timestamp</th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " <th></th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>2000-01-01 00:00:00</th>\n", | |
| " <td>Frank</td>\n", | |
| " <td>998</td>\n", | |
| " <td>-0.879896</td>\n", | |
| " <td>0.438060</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2000-01-01 00:00:01</th>\n", | |
| " <td>Victor</td>\n", | |
| " <td>1015</td>\n", | |
| " <td>0.246495</td>\n", | |
| " <td>-0.190725</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2000-01-01 00:00:02</th>\n", | |
| " <td>Charlie</td>\n", | |
| " <td>1008</td>\n", | |
| " <td>0.173829</td>\n", | |
| " <td>0.278618</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2000-01-01 00:00:03</th>\n", | |
| " <td>Victor</td>\n", | |
| " <td>997</td>\n", | |
| " <td>0.668464</td>\n", | |
| " <td>0.126580</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2000-01-01 00:00:04</th>\n", | |
| " <td>Charlie</td>\n", | |
| " <td>1008</td>\n", | |
| " <td>0.776633</td>\n", | |
| " <td>-0.318526</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " name id x y\n", | |
| "timestamp \n", | |
| "2000-01-01 00:00:00 Frank 998 -0.879896 0.438060\n", | |
| "2000-01-01 00:00:01 Victor 1015 0.246495 -0.190725\n", | |
| "2000-01-01 00:00:02 Charlie 1008 0.173829 0.278618\n", | |
| "2000-01-01 00:00:03 Victor 997 0.668464 0.126580\n", | |
| "2000-01-01 00:00:04 Charlie 1008 0.776633 -0.318526" | |
| ] | |
| }, | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import deltalake, dask\n", | |
| "df = dask.datasets.timeseries().compute()\n", | |
| "df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "5558be08-ec55-460e-8bf1-8fc2b778c969", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "!rm -rf tmp" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "8edab4d4-d321-4b80-a3a1-41a2fa73f36c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "deltalake.write_deltalake(\"tmp\", df, mode=\"append\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "4c0df6a5-f728-4a0d-b56e-393a2a80db80", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "deltalake.write_deltalake(\"tmp\", df, mode=\"append\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "id": "bbc6f3ca-8f63-49cb-9494-70d79ab42016", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "DeltaTable()" | |
| ] | |
| }, | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t = deltalake.DeltaTable(\"tmp\")\n", | |
| "t" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "id": "aa090e17-ab92-46c3-b5d5-5ee761a0667e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['1-a5fab5c0-47d6-4603-adcd-9b310a0d36c7-0.parquet',\n", | |
| " '0-dcfe9ff9-8046-4dd0-b285-32ec95539df7-0.parquet']" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t.files()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "849e4bfc-7094-4bdb-84d1-4ff7cdfb5dc2", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "for _ in range(10):\n", | |
| " deltalake.write_deltalake(\"tmp\", df.head(), mode=\"append\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "84b7b53b-6217-4e6e-b1d1-20681c56fd6b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['1-a5fab5c0-47d6-4603-adcd-9b310a0d36c7-0.parquet',\n", | |
| " '0-dcfe9ff9-8046-4dd0-b285-32ec95539df7-0.parquet']" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t.files()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "e8c5f4e0-4d69-4965-beee-0069e24e9c8c", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['11-7acbd002-7713-4309-89a4-7c6e4fd762d8-0.parquet',\n", | |
| " '10-39e28cdb-004c-4de0-8cbe-da52993e3688-0.parquet',\n", | |
| " '9-77fe16e2-4d5d-4c62-89be-a6f89856722b-0.parquet',\n", | |
| " '8-5da355e8-86cc-417a-9f65-e3ab11d3dde0-0.parquet',\n", | |
| " '7-59ff1cb9-8ea6-424a-ba4b-846e329a27ce-0.parquet',\n", | |
| " '6-8982c38b-b51b-43f2-b839-bc06e11df490-0.parquet',\n", | |
| " '5-aab946db-1ba5-4b67-9c10-41a984109f0e-0.parquet',\n", | |
| " '4-725e0842-41f2-4172-b96a-aa7c690cf3a0-0.parquet',\n", | |
| " '3-d2cf83b5-33db-4c65-af31-f65daa2d1851-0.parquet',\n", | |
| " '2-33bcaf9c-b3f6-4a88-8eba-ca0cd2489ea4-0.parquet',\n", | |
| " '1-a5fab5c0-47d6-4603-adcd-9b310a0d36c7-0.parquet',\n", | |
| " '0-dcfe9ff9-8046-4dd0-b285-32ec95539df7-0.parquet']" | |
| ] | |
| }, | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t = deltalake.DeltaTable(\"tmp\")\n", | |
| "t.files()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "id": "2ffb8e1d-8825-44b0-977c-188570431811", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{'numFilesAdded': 1,\n", | |
| " 'numFilesRemoved': 11,\n", | |
| " 'filesAdded': {'min': 53376152,\n", | |
| " 'max': 53376152,\n", | |
| " 'avg': 53376152.0,\n", | |
| " 'totalFiles': 1,\n", | |
| " 'totalSize': 53376152},\n", | |
| " 'filesRemoved': {'min': 2118,\n", | |
| " 'max': 77924229,\n", | |
| " 'avg': 7085946.2727272725,\n", | |
| " 'totalFiles': 11,\n", | |
| " 'totalSize': 77945409},\n", | |
| " 'partitionsOptimized': 1,\n", | |
| " 'numBatches': 2558,\n", | |
| " 'totalConsideredFiles': 12,\n", | |
| " 'totalFilesSkipped': 1,\n", | |
| " 'preserveInsertionOrder': True}" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t.optimize.compact()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "9da09aba-1c71-4bd6-870f-05c3b93cd1c9", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "['part-00001-a246e865-bb70-4367-89e8-533773f32891-c000.zstd.parquet',\n", | |
| " '0-dcfe9ff9-8046-4dd0-b285-32ec95539df7-0.parquet']" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "t.files()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python [conda env:etl]", | |
| "language": "python", | |
| "name": "conda-env-etl-py" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.12.1" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment