Skip to content

Instantly share code, notes, and snippets.

@shoyer
Created October 3, 2025 19:18
Show Gist options
  • Select an option

  • Save shoyer/6712c536c9cb4f327c59329070fb2986 to your computer and use it in GitHub Desktop.

Select an option

Save shoyer/6712c536c9cb4f327c59329070fb2986 to your computer and use it in GitHub Desktop.
xarray-beam-slow-serialization.ipynb
Display the source blob
Display the rendered blob
Raw
{
"nbformat": 4,
"nbformat_minor": 0,
"metadata": {
"colab": {
"provenance": [],
"authorship_tag": "ABX9TyMkEm82f8GHr6JcZiRbZYat",
"include_colab_link": true
},
"kernelspec": {
"name": "python3",
"display_name": "Python 3"
},
"language_info": {
"name": "python"
}
},
"cells": [
{
"cell_type": "markdown",
"metadata": {
"id": "view-in-github",
"colab_type": "text"
},
"source": [
"<a href=\"https://colab.research.google.com/gist/shoyer/6712c536c9cb4f327c59329070fb2986/xarray-beam-slow-serialization.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>"
]
},
{
"cell_type": "code",
"execution_count": null,
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "xpE8L7IXf6C7",
"outputId": "24f91aca-11e9-4d2f-9ad8-270aac1f6637"
},
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"\u001b[?25l \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m0.0/89.7 kB\u001b[0m \u001b[31m?\u001b[0m eta \u001b[36m-:--:--\u001b[0m\r\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m89.7/89.7 kB\u001b[0m \u001b[31m3.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m152.0/152.0 kB\u001b[0m \u001b[31m5.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m43.5/43.5 kB\u001b[0m \u001b[31m1.9 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Preparing metadata (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m72.6/72.6 kB\u001b[0m \u001b[31m2.4 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m276.4/276.4 kB\u001b[0m \u001b[31m16.0 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m17.1/17.1 MB\u001b[0m \u001b[31m82.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m8.8/8.8 MB\u001b[0m \u001b[31m56.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m53.5/53.5 kB\u001b[0m \u001b[31m2.1 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m3.5/3.5 MB\u001b[0m \u001b[31m22.6 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m5.6/5.6 MB\u001b[0m \u001b[31m99.5 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m96.9/96.9 kB\u001b[0m \u001b[31m3.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m46.3/46.3 kB\u001b[0m \u001b[31m1.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m1.7/1.7 MB\u001b[0m \u001b[31m67.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m272.8/272.8 kB\u001b[0m \u001b[31m14.7 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[2K \u001b[90m━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━━\u001b[0m \u001b[32m331.1/331.1 kB\u001b[0m \u001b[31m20.3 MB/s\u001b[0m eta \u001b[36m0:00:00\u001b[0m\n",
"\u001b[?25h Building wheel for crcmod (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for dill (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for hdfs (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
" Building wheel for docopt (setup.py) ... \u001b[?25l\u001b[?25hdone\n",
"\u001b[31mERROR: pip's dependency resolver does not currently take into account all the packages that are installed. This behaviour is the source of the following dependency conflicts.\n",
"grpcio-status 1.71.2 requires grpcio>=1.71.2, but you have grpcio 1.65.5 which is incompatible.\n",
"multiprocess 0.70.16 requires dill>=0.3.8, but you have dill 0.3.1.1 which is incompatible.\u001b[0m\u001b[31m\n",
"\u001b[0m"
]
}
],
"source": [
"! pip install -U -q xarray-beam xarray zarr"
]
},
{
"cell_type": "code",
"source": [
"# Copyright 2025 Google LLC.\n",
"# SPDX-License-Identifier: Apache-2.0\n",
"import apache_beam as beam\n",
"import xarray_beam as xbeam\n",
"import xarray\n",
"import numpy as np\n",
"import pandas as pd\n",
"\n",
"xarray_ds = xarray.Dataset(\n",
" {'temperature': (('time', 'longitude', 'latitude'), np.random.randn(365, 180, 90))},\n",
" coords={'time': pd.date_range('2025-01-01', freq='1D', periods=365)},\n",
")\n",
"chunks = {'time': '10MB', 'longitude': -1, 'latitude': -1}\n",
"xbeam_ds = xbeam.Dataset.from_xarray(xarray_ds, chunks)\n",
"print(xbeam_ds)"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/"
},
"id": "5CwD4PjMf_Uq",
"outputId": "550a7086-0d20-48f9-eace-df4a7a2fee2f"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stdout",
"text": [
"<xarray_beam.Dataset>\n",
"PTransform: <DatasetToChunks(PTransform) label=[from_xarray_2]>\n",
"Chunks: 10.0MB (time: 77, longitude: 180, latitude: 90, split_vars=False)\n",
"Template: 47MB (5 chunks)\n",
" Dimensions: (time: 365, longitude: 180, latitude: 90)\n",
" Coordinates:\n",
" * time (time) datetime64[ns] 3kB 2025-01-01 2025-01-02 ... 2025-12-31\n",
" Dimensions without coordinates: longitude, latitude\n",
" Data variables:\n",
" temperature (time, longitude, latitude) float64 47MB dask.array<chunksize=(365, 180, 90), meta=np.ndarray>\n"
]
}
]
},
{
"cell_type": "code",
"source": [
"%%time\n",
"with beam.Pipeline() as p:\n",
" p | xbeam_ds.to_zarr('example_data.zarr')"
],
"metadata": {
"colab": {
"base_uri": "https://localhost:8080/",
"height": 124
},
"id": "A0p0wfFxgH0s",
"outputId": "7847d80e-c52b-4c0b-ff21-ef0fa701fc2d"
},
"execution_count": null,
"outputs": [
{
"output_type": "stream",
"name": "stderr",
"text": [
"/usr/local/lib/python3.12/dist-packages/zarr/api/asynchronous.py:244: ZarrUserWarning: Consolidated metadata is currently not part in the Zarr format 3 specification. It may not be supported by other zarr implementations and may change in the future.\n",
" warnings.warn(\n",
"WARNING:apache_beam.runners.interactive.interactive_environment:Dependencies required for Interactive Beam PCollection visualization are not available, please use: `pip install apache-beam[interactive]` to install necessary dependencies to enable all data visualization features.\n"
]
},
{
"output_type": "display_data",
"data": {
"application/javascript": [
"\n",
" if (typeof window.interactive_beam_jquery == 'undefined') {\n",
" var jqueryScript = document.createElement('script');\n",
" jqueryScript.src = 'https://code.jquery.com/jquery-3.4.1.slim.min.js';\n",
" jqueryScript.type = 'text/javascript';\n",
" jqueryScript.onload = function() {\n",
" var datatableScript = document.createElement('script');\n",
" datatableScript.src = 'https://cdn.datatables.net/1.10.20/js/jquery.dataTables.min.js';\n",
" datatableScript.type = 'text/javascript';\n",
" datatableScript.onload = function() {\n",
" window.interactive_beam_jquery = jQuery.noConflict(true);\n",
" window.interactive_beam_jquery(document).ready(function($){\n",
" \n",
" });\n",
" }\n",
" document.head.appendChild(datatableScript);\n",
" };\n",
" document.head.appendChild(jqueryScript);\n",
" } else {\n",
" window.interactive_beam_jquery(document).ready(function($){\n",
" \n",
" });\n",
" }"
]
},
"metadata": {}
},
{
"output_type": "stream",
"name": "stdout",
"text": [
"CPU times: user 57.4 s, sys: 4.2 s, total: 1min 1s\n",
"Wall time: 1min 2s\n"
]
}
]
},
{
"cell_type": "code",
"source": [],
"metadata": {
"id": "DFvWyuCfgUxx"
},
"execution_count": null,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment