Created
October 3, 2025 19:18
-
-
Save shoyer/6712c536c9cb4f327c59329070fb2986 to your computer and use it in GitHub Desktop.
xarray-beam-slow-serialization.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyMkEm82f8GHr6JcZiRbZYat", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/shoyer/6712c536c9cb4f327c59329070fb2986/xarray-beam-slow-serialization.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "xpE8L7IXf6C7", | |
| "outputId": "24f91aca-11e9-4d2f-9ad8-270aac1f6637" | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/89.7 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.7/89.7 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m152.0/152.0 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.5/43.5 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.6/72.6 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m276.4/276.4 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m82.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m56.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.5/53.5 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m99.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.9/96.9 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.3/46.3 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m67.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m272.8/272.8 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m331.1/331.1 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n", | |
| "\u001b[?25h Building wheel for crcmod (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Building wheel for dill (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Building wheel for hdfs (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| " Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n", | |
| "\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n", | |
| "grpcio-status 1.71.2 requires grpcio>=1.71.2, but you have grpcio 1.65.5 which is incompatible.\n", | |
| "multiprocess 0.70.16 requires dill>=0.3.8, but you have dill 0.3.1.1 which is incompatible.\u001b[0m\u001b[31m\n", | |
| "\u001b[0m" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "! pip install -U -q xarray-beam xarray zarr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "# Copyright 2025 Google LLC.\n", | |
| "# SPDX-License-Identifier: Apache-2.0\n", | |
| "import apache_beam as beam\n", | |
| "import xarray_beam as xbeam\n", | |
| "import xarray\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "\n", | |
| "xarray_ds = xarray.Dataset(\n", | |
| " {'temperature': (('time', 'longitude', 'latitude'), np.random.randn(365, 180, 90))},\n", | |
| " coords={'time': pd.date_range('2025-01-01', freq='1D', periods=365)},\n", | |
| ")\n", | |
| "chunks = {'time': '10MB', 'longitude': -1, 'latitude': -1}\n", | |
| "xbeam_ds = xbeam.Dataset.from_xarray(xarray_ds, chunks)\n", | |
| "print(xbeam_ds)" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "5CwD4PjMf_Uq", | |
| "outputId": "550a7086-0d20-48f9-eace-df4a7a2fee2f" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "<xarray_beam.Dataset>\n", | |
| "PTransform: <DatasetToChunks(PTransform) label=[from_xarray_2]>\n", | |
| "Chunks: 10.0MB (time: 77, longitude: 180, latitude: 90, split_vars=False)\n", | |
| "Template: 47MB (5 chunks)\n", | |
| " Dimensions: (time: 365, longitude: 180, latitude: 90)\n", | |
| " Coordinates:\n", | |
| " * time (time) datetime64[ns] 3kB 2025-01-01 2025-01-02 ... 2025-12-31\n", | |
| " Dimensions without coordinates: longitude, latitude\n", | |
| " Data variables:\n", | |
| " temperature (time, longitude, latitude) float64 47MB dask.array<chunksize=(365, 180, 90), meta=np.ndarray>\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "%%time\n", | |
| "with beam.Pipeline() as p:\n", | |
| " p | xbeam_ds.to_zarr('example_data.zarr')" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 124 | |
| }, | |
| "id": "A0p0wfFxgH0s", | |
| "outputId": "7847d80e-c52b-4c0b-ff21-ef0fa701fc2d" | |
| }, | |
| "execution_count": null, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "/usr/local/lib/python3.12/dist-packages/zarr/api/asynchronous.py:244: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.\n", | |
| " warnings.warn(\n", | |
| "WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "application/javascript": [ | |
| "\n", | |
| " if (typeof window.interactive_beam_jquery == 'undefined') {\n", | |
| " var jqueryScript = document.createElement('script');\n", | |
| " jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n", | |
| " jqueryScript.type = 'text/javascript';\n", | |
| " jqueryScript.onload = function() {\n", | |
| " var datatableScript = document.createElement('script');\n", | |
| " datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n", | |
| " datatableScript.type = 'text/javascript';\n", | |
| " datatableScript.onload = function() {\n", | |
| " window.interactive_beam_jquery = jQuery.noConflict(true);\n", | |
| " window.interactive_beam_jquery(document).ready(function($){\n", | |
| " \n", | |
| " });\n", | |
| " }\n", | |
| " document.head.appendChild(datatableScript);\n", | |
| " };\n", | |
| " document.head.appendChild(jqueryScript);\n", | |
| " } else {\n", | |
| " window.interactive_beam_jquery(document).ready(function($){\n", | |
| " \n", | |
| " });\n", | |
| " }" | |
| ] | |
| }, | |
| "metadata": {} | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "CPU times: user 57.4 s, sys: 4.2 s, total: 1min 1s\n", | |
| "Wall time: 1min 2s\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [], | |
| "metadata": { | |
| "id": "DFvWyuCfgUxx" | |
| }, | |
| "execution_count": null, | |
| "outputs": [] | |
| } | |
| ] | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment