Last active
July 22, 2024 23:06
-
-
Save ivirshup/dc39029ad439cef4755e45582fc35541 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Setup\n", | |
| "\n", | |
| "Imports libraries, defines variables, and starts a Dask client." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import multiprocessing\n", | |
| "\n", | |
| "# Use the \"spawn\" start method for child processes.\n", | |
| "# `force=True` keeps this cell re-runnable: without it, running the cell a second\n", | |
| "# time in the same kernel raises RuntimeError because the start method is already set.\n", | |
| "multiprocessing.set_start_method(\"spawn\", force=True)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import tiledb\n", | |
| "import tiledbsoma\n", | |
| "import cellxgene_census\n", | |
| "from tiledbsoma import SOMATileDBContext\n", | |
| "\n", | |
| "import dask.array as da\n", | |
| "# from dask import delayed\n", | |
| "import dask.distributed as dd\n", | |
| "\n", | |
| "from scipy import sparse\n", | |
| "import numpy as np\n", | |
| "import pandas as pd\n", | |
| "\n", | |
| "# import anndata as ad, scanpy as sc\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Census release to read; interpolated into SOMA_URI below.\n", | |
| "CENSUS_VERSION = \"2024-07-01\"\n", | |
| "SOMA_URI= f\"s3://cellxgene-census-public-us-west-2/cell-census/{CENSUS_VERSION}/soma\"\n", | |
| "# Organism sub-collection; used to build the path of the X/raw array opened later.\n", | |
| "SPECIES = \"mus_musculus\"\n", | |
| "\n", | |
| "# NOTE(review): neither chunk size is used in the part of the notebook visible here;\n", | |
| "# presumably the number of rows per chunk for sparse / dense reads - confirm.\n", | |
| "SPARSE_CHUNK_SIZE = 10_000\n", | |
| "DENSE_CHUNK_SIZE = 1_000\n", | |
| "\n", | |
| "# TileDB config passed to `tiledb.Ctx`: unsigned S3 requests against the us-west-2 region.\n", | |
| "CTX = {\n", | |
| " \"vfs.s3.no_sign_request\": \"true\",\n", | |
| " \"vfs.s3.region\": \"us-west-2\"\n", | |
| "}\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <h3 style=\"margin-bottom: 0px;\">Client</h3>\n", | |
| " <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Client-9ccac347-487d-11ef-a2f8-023ca6c22285</p>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| "\n", | |
| " <tr>\n", | |
| " \n", | |
| " <td style=\"text-align: left;\"><strong>Connection method:</strong> Cluster object</td>\n", | |
| " <td style=\"text-align: left;\"><strong>Cluster type:</strong> distributed.LocalCluster</td>\n", | |
| " \n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| " <details>\n", | |
| " <summary style=\"margin-bottom: 20px;\"><h3 style=\"display: inline;\">Cluster Info</h3></summary>\n", | |
| " <div class=\"jp-RenderedHTMLCommon jp-RenderedHTML jp-mod-trusted jp-OutputArea-output\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #e1e1e1; border: 3px solid #9D9D9D; border-radius: 5px; position: absolute;\">\n", | |
| " </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <h3 style=\"margin-bottom: 0px; margin-top: 0px;\">LocalCluster</h3>\n", | |
| " <p style=\"color: #9D9D9D; margin-bottom: 0px;\">ef0ad759</p>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Workers:</strong> 8\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads:</strong> 32\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total memory:</strong> 123.85 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " \n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\"><strong>Status:</strong> running</td>\n", | |
| " <td style=\"text-align: left;\"><strong>Using processes:</strong> True</td>\n", | |
| "</tr>\n", | |
| "\n", | |
| " \n", | |
| " </table>\n", | |
| "\n", | |
| " <details>\n", | |
| " <summary style=\"margin-bottom: 20px;\">\n", | |
| " <h3 style=\"display: inline;\">Scheduler Info</h3>\n", | |
| " </summary>\n", | |
| "\n", | |
| " <div style=\"\">\n", | |
| " <div>\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #FFF7E5; border: 3px solid #FF6132; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <h3 style=\"margin-bottom: 0px;\">Scheduler</h3>\n", | |
| " <p style=\"color: #9D9D9D; margin-bottom: 0px;\">Scheduler-6b09d772-2a5a-4906-894d-40300a8824ec</p>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm:</strong> tcp://127.0.0.1:41895\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Workers:</strong> 8\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard:</strong> <a href=\"http://127.0.0.1:8787/status\" target=\"_blank\">http://127.0.0.1:8787/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads:</strong> 32\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Started:</strong> Just now\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total memory:</strong> 123.85 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " </table>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| "\n", | |
| " <details style=\"margin-left: 48px;\">\n", | |
| " <summary style=\"margin-bottom: 20px;\">\n", | |
| " <h3 style=\"display: inline;\">Workers</h3>\n", | |
| " </summary>\n", | |
| "\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 0</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:42777\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:34579/status\" target=\"_blank\">http://127.0.0.1:34579/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:39589\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-ksch0uc7\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 1</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:33189\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:41809/status\" target=\"_blank\">http://127.0.0.1:41809/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:39211\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-4i_whljq\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 2</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:46349\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:35117/status\" target=\"_blank\">http://127.0.0.1:35117/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:37289\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-vrph97p7\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 3</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:33619\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:46295/status\" target=\"_blank\">http://127.0.0.1:46295/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:38493\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-pnc8s3td\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 4</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:38525\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:33985/status\" target=\"_blank\">http://127.0.0.1:33985/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:35857\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-9zdfgqqc\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 5</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:39083\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:44377/status\" target=\"_blank\">http://127.0.0.1:44377/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:46067\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-nwbo4mgv\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 6</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:34953\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37703/status\" target=\"_blank\">http://127.0.0.1:37703/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:35035\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-d69pd78e\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| " <div style=\"margin-bottom: 20px;\">\n", | |
| " <div style=\"width: 24px; height: 24px; background-color: #DBF5FF; border: 3px solid #4CC9FF; border-radius: 5px; position: absolute;\"> </div>\n", | |
| " <div style=\"margin-left: 48px;\">\n", | |
| " <details>\n", | |
| " <summary>\n", | |
| " <h4 style=\"margin-bottom: 0px; display: inline;\">Worker: 7</h4>\n", | |
| " </summary>\n", | |
| " <table style=\"width: 100%; text-align: left;\">\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Comm: </strong> tcp://127.0.0.1:34655\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Total threads: </strong> 4\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Dashboard: </strong> <a href=\"http://127.0.0.1:37417/status\" target=\"_blank\">http://127.0.0.1:37417/status</a>\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Memory: </strong> 15.48 GiB\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td style=\"text-align: left;\">\n", | |
| " <strong>Nanny: </strong> tcp://127.0.0.1:42439\n", | |
| " </td>\n", | |
| " <td style=\"text-align: left;\"></td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <td colspan=\"2\" style=\"text-align: left;\">\n", | |
| " <strong>Local directory: </strong> /tmp/dask-scratch-space/worker-fyu56ap9\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " \n", | |
| "\n", | |
| " </table>\n", | |
| " </details>\n", | |
| " </div>\n", | |
| " </div>\n", | |
| " \n", | |
| "\n", | |
| " </details>\n", | |
| "</div>\n", | |
| "\n", | |
| " </details>\n", | |
| " </div>\n", | |
| "</div>\n", | |
| " </details>\n", | |
| " \n", | |
| "\n", | |
| " </div>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| "<Client: 'tcp://127.0.0.1:41895' processes=8 threads=32, memory=123.85 GiB>" | |
| ] | |
| }, | |
| "execution_count": 4, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Start a LocalCluster with default settings and attach a client to it.\n", | |
| "cluster = dd.LocalCluster()\n", | |
| "client = dd.Client(cluster)\n", | |
| "# Bare last expression so the notebook renders the client summary.\n", | |
| "client" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Functions" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 5, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def to_listed_chunks(chunk_size: int, dim_size: int) -> list[int]:\n", | |
| "    \"\"\"Expand a uniform chunk size into the explicit list of chunk lengths.\n", | |
| "\n", | |
| "    The result holds ``dim_size // chunk_size`` entries equal to ``chunk_size``,\n", | |
| "    followed by the remainder ``dim_size % chunk_size`` when it is non-zero.\n", | |
| "    For a positive ``mod`` this is equivalent to::\n", | |
| "\n", | |
| "        d, r = divmod(n, mod)\n", | |
| "        to_listed_chunks(mod, n) == [mod] * d + ([r] if r else [])\n", | |
| "        to_listed_chunks(mod, n) == list(map(len, itertools.batched(range(n), mod)))\n", | |
| "\n", | |
| "    Parameters\n", | |
| "    ----------\n", | |
| "    chunk_size\n", | |
| "        Length of every full chunk.\n", | |
| "    dim_size\n", | |
| "        Total length of the dimension being chunked.\n", | |
| "\n", | |
| "    Returns\n", | |
| "    -------\n", | |
| "    Chunk lengths in order; they sum to ``dim_size``.\n", | |
| "\n", | |
| "    Examples\n", | |
| "    --------\n", | |
| "    >>> to_listed_chunks(10, 25)\n", | |
| "    [10, 10, 5]\n", | |
| "    >>> to_listed_chunks(3, 9)\n", | |
| "    [3, 3, 3]\n", | |
| "    \"\"\"\n", | |
| "    full_chunks, remainder = divmod(dim_size, chunk_size)\n", | |
| "    sizes = [chunk_size for _ in range(full_chunks)]\n", | |
| "    if remainder != 0:\n", | |
| "        # Trailing partial chunk.\n", | |
| "        sizes.append(remainder)\n", | |
| "    return sizes" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000, 1000]" | |
| ] | |
| }, | |
| "execution_count": 6, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "to_listed_chunks(1000, 10_000)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from numba import njit\n", | |
| "\n", | |
| "@njit\n", | |
| "def cmap(x, y, out=None):\n", | |
| "    \"\"\"Look up, for each value in ``x``, its position in ``y``.\n", | |
| "\n", | |
| "    A value -> index table is built from ``y`` (a repeated value keeps the index\n", | |
| "    of its last occurrence), then ``out[i]`` is set to the index of ``x[i]``.\n", | |
| "\n", | |
| "    If ``out`` is not given, the result buffer is allocated with\n", | |
| "    ``np.empty_like(x)``, so it has the shape and dtype of ``x``. Every value of\n", | |
| "    ``x`` is looked up directly in the table, so it has to occur in ``y``.\n", | |
| "    \"\"\"\n", | |
| "    if out is None:\n", | |
| "        out = np.empty_like(x)\n", | |
| "    position_of = dict()\n", | |
| "    for pos in range(len(y)):\n", | |
| "        position_of[y[pos]] = pos\n", | |
| "    for k in range(len(x)):\n", | |
| "        out[k] = position_of[x[k]]\n", | |
| "    return out" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def list_split(arr_list: list, sublist_len: int) -> list[list]:\n", | |
| "    \"\"\"Split ``arr_list`` into consecutive sublists of at most ``sublist_len`` items.\n", | |
| "\n", | |
| "    Every sublist has exactly ``sublist_len`` items except possibly the last one,\n", | |
| "    which holds whatever is left over. An empty input gives an empty result.\n", | |
| "\n", | |
| "    TODO: Replace with `itertools.batched` when Python 3.12 becomes the minimum supported version.\n", | |
| "\n", | |
| "    Raises\n", | |
| "    ------\n", | |
| "    ValueError\n", | |
| "        If ``sublist_len`` is smaller than 1. A non-positive length can never\n", | |
| "        consume the input, so it is rejected up front instead of looping forever.\n", | |
| "    \"\"\"\n", | |
| "    if sublist_len < 1:\n", | |
| "        raise ValueError(f\"sublist_len must be at least 1, got {sublist_len}\")\n", | |
| "    # Slicing clamps at the end of the list, so the short final batch needs no special case.\n", | |
| "    return [\n", | |
| "        arr_list[start : start + sublist_len]\n", | |
| "        for start in range(0, len(arr_list), sublist_len)\n", | |
| "    ]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Initial experiment" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 249 ms, sys: 40.7 ms, total: 290 ms\n", | |
| "Wall time: 324 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "# The bucket is read anonymously: CTX sets `vfs.s3.no_sign_request`, so requests are sent unsigned.\n", | |
| "# NOTE(review): the array handle is not closed in the cells visible here - confirm it is released later.\n", | |
| "tiledb_array = tiledb.open(\n", | |
| " f\"{SOMA_URI}/census_data/{SPECIES}/ms/RNA/X/raw/\",\n", | |
| " ctx=tiledb.Ctx(CTX),\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def report_block_id_broadcast(obs_idx, var_idx, block_info):\n", | |
| "    \"\"\"Debug helper for ``da.map_blocks``: echo the block types and return ``block_info``.\n", | |
| "\n", | |
| "    Prints the type of each of the two input blocks, then returns an object array\n", | |
| "    with the output chunk's shape (``block_info[None][\"chunk-shape\"]``) in which\n", | |
| "    every element is the ``block_info`` dict received for this block.\n", | |
| "    \"\"\"\n", | |
| "    for block in (obs_idx, var_idx):\n", | |
| "        print(type(block))\n", | |
| "    chunk_shape = block_info[None][\"chunk-shape\"]\n", | |
| "    filled = np.empty(chunk_shape, dtype=object)\n", | |
| "    filled[:] = block_info\n", | |
| "    return filled" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "<class 'numpy.ndarray'>\n", | |
| "<class 'numpy.ndarray'>\n", | |
| "<class 'numpy.ndarray'>\n", | |
| "<class 'numpy.ndarray'>\n", | |
| "<class 'numpy.ndarray'>\n", | |
| "<class 'numpy.ndarray'>\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([{0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(0, 3)], 'chunk-location': (0,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(0, 5)], 'chunk-location': (0,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}},\n", | |
| " {0: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,)}, 1: {'shape': (5,), 'num-chunks': (2,), 'array-location': [(3, 5)], 'chunk-location': (1,)}, None: {'shape': (10,), 'num-chunks': (2,), 'array-location': [(5, 10)], 'chunk-location': (1,), 'chunk-shape': (5,), 'dtype': <class 'object'>}}],\n", | |
| " dtype=object)" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Pair a 10-element array (chunks of 5) with a 5-element array (chunks of 3) and\n", | |
| "# materialise the `block_info` that dask passes to the function for each block.\n", | |
| "da.map_blocks(\n", | |
| " report_block_id_broadcast, \n", | |
| " da.arange(10, chunks=5),\n", | |
| " da.arange(5, chunks=3),\n", | |
| " # chunks=((5, 5), (3, 2)),\n", | |
| " dtype=object,\n", | |
| ").compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Demo of how `da.map_blocks` works." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " <td>\n", | |
| " <table style=\"border-collapse: collapse;\">\n", | |
| " <thead>\n", | |
| " <tr>\n", | |
| " <td> </td>\n", | |
| " <th> Array </th>\n", | |
| " <th> Chunk </th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " \n", | |
| " <tr>\n", | |
| " <th> Bytes </th>\n", | |
| " <td> 400 B </td>\n", | |
| " <td> 80 B </td>\n", | |
| " </tr>\n", | |
| " \n", | |
| " <tr>\n", | |
| " <th> Shape </th>\n", | |
| " <td> (5, 10) </td>\n", | |
| " <td> (2, 5) </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th> Dask graph </th>\n", | |
| " <td colspan=\"2\"> 6 chunks in 5 graph layers </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th> Data type </th>\n", | |
| " <td colspan=\"2\"> object numpy.ndarray </td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| " </table>\n", | |
| " </td>\n", | |
| " <td>\n", | |
| " <svg width=\"170\" height=\"110\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
| "\n", | |
| " <!-- Horizontal lines -->\n", | |
| " <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
| " <line x1=\"0\" y1=\"24\" x2=\"120\" y2=\"24\" />\n", | |
| " <line x1=\"0\" y1=\"48\" x2=\"120\" y2=\"48\" />\n", | |
| " <line x1=\"0\" y1=\"60\" x2=\"120\" y2=\"60\" style=\"stroke-width:2\" />\n", | |
| "\n", | |
| " <!-- Vertical lines -->\n", | |
| " <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"60\" style=\"stroke-width:2\" />\n", | |
| " <line x1=\"60\" y1=\"0\" x2=\"60\" y2=\"60\" />\n", | |
| " <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"60\" style=\"stroke-width:2\" />\n", | |
| "\n", | |
| " <!-- Colored Rectangle -->\n", | |
| " <polygon points=\"0.0,0.0 120.0,0.0 120.0,60.0 0.0,60.0\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n", | |
| "\n", | |
| " <!-- Text -->\n", | |
| " <text x=\"60.000000\" y=\"80.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >10</text>\n", | |
| " <text x=\"140.000000\" y=\"30.000000\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(0,140.000000,30.000000)\">5</text>\n", | |
| "</svg>\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "dask.array<report_block_id_broadcast, shape=(5, 10), dtype=object, chunksize=(2, 5), chunktype=numpy.ndarray>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{0: {'shape': (10,),\n", | |
| " 'num-chunks': (2,),\n", | |
| " 'array-location': [(0, 5)],\n", | |
| " 'chunk-location': (0,)},\n", | |
| " 1: {'shape': (5, 1),\n", | |
| " 'num-chunks': (3, 1),\n", | |
| " 'array-location': [(0, 2), (0, 1)],\n", | |
| " 'chunk-location': (0, 0)},\n", | |
| " None: {'shape': (5, 10),\n", | |
| " 'num-chunks': (3, 2),\n", | |
| " 'array-location': [(0, 2), (0, 5)],\n", | |
| " 'chunk-location': (0, 0),\n", | |
| " 'chunk-shape': (2, 5),\n", | |
| " 'dtype': object}}" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "{0: {'shape': (10,),\n", | |
| " 'num-chunks': (2,),\n", | |
| " 'array-location': [(5, 10)],\n", | |
| " 'chunk-location': (1,)},\n", | |
| " 1: {'shape': (5, 1),\n", | |
| " 'num-chunks': (3, 1),\n", | |
| " 'array-location': [(2, 4), (0, 1)],\n", | |
| " 'chunk-location': (1, 0)},\n", | |
| " None: {'shape': (5, 10),\n", | |
| " 'num-chunks': (3, 2),\n", | |
| " 'array-location': [(2, 4), (5, 10)],\n", | |
| " 'chunk-location': (1, 1),\n", | |
| " 'chunk-shape': (2, 5),\n", | |
| " 'dtype': object}}" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([[ 0, 1, 2, 3, 4, 5, 6, 7, 8, 9],\n", | |
| " [ 1, 2, 3, 4, 5, 6, 7, 8, 9, 10],\n", | |
| " [ 2, 3, 4, 5, 6, 7, 8, 9, 10, 11],\n", | |
| " [ 3, 4, 5, 6, 7, 8, 9, 10, 11, 12],\n", | |
| " [ 4, 5, 6, 7, 8, 9, 10, 11, 12, 13]])" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "def report_block_id_broadcast(obs_idx, var_idx, block_info):\n", | |
| " a = np.empty(block_info[None][\"chunk-shape\"], dtype=object)\n", | |
| " a[:] = block_info\n", | |
| " return a\n", | |
| "\n", | |
| "delayed_res = da.map_blocks(\n", | |
| " report_block_id_broadcast,\n", | |
| " da.arange(10, chunks=5),\n", | |
| " da.arange(5, chunks=2).reshape((5, 1)),\n", | |
| " # chunks=((5, 5), (3, 2)),\n", | |
| " # new_axis=[1],\n", | |
| " dtype=object,\n", | |
| ")\n", | |
| "res = delayed_res.compute()\n", | |
| "display(delayed_res)\n", | |
| "\n", | |
| "display(res[1, 4])\n", | |
| "display(res[2, 5])\n", | |
| "\n", | |
| "from operator import add\n", | |
| "\n", | |
| "da.map_blocks(\n", | |
| " add,\n", | |
| " da.arange(10, chunks=5),\n", | |
| " da.arange(5, chunks=2).reshape((5, 1)),\n", | |
| " # chunks=((5, 5), (3, 2)),\n", | |
| " # new_axis=[1],\n", | |
| ").compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Getting tiledb arrays to work with scanpy (proof of concept)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "Proof of concept using scanpy on top of a dask array w/ sparse chunks. Gets through highly variable genes using tiledb's python library directly. This form doesn't support queries on the data prior to creating the dask array." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def sparse_chunk(block_id, block_info):\n", | |
| " shape = block_info[None][\"chunk-shape\"]\n", | |
| " array_location = block_info[None][\"array-location\"]\n", | |
| " offsets = array_location[0][0], array_location[1][0]\n", | |
| "\n", | |
| " tiledb_array = tiledb.open(\n", | |
| " f\"{SOMA_URI}/census_data/{SPECIES}/ms/RNA/X/raw/\",\n", | |
| " ctx=tiledb.Ctx(CTX),\n", | |
| " )\n", | |
| "\n", | |
| " res = tiledb_array[slice(*array_location[0]), slice(*array_location[1])]\n", | |
| "\n", | |
| " res[\"soma_dim_0\"] -= offsets[0]\n", | |
| " res[\"soma_dim_1\"] -= offsets[1]\n", | |
| "\n", | |
| " a = sparse.csr_matrix((res[\"soma_data\"], (res[\"soma_dim_0\"], res[\"soma_dim_1\"])), shape=shape)\n", | |
| " return a" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n", | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n", | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 533 ms, sys: 216 ms, total: 749 ms\n", | |
| "Wall time: 3.16 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<3000x52417 sparse matrix of type '<class 'numpy.float32'>'\n", | |
| "\twith 6164051 stored elements in Compressed Sparse Row format>" | |
| ] | |
| }, | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "da.map_blocks(sparse_chunk, chunks=((1000, 1000, 1000), (52417,)), meta=sparse.csr_matrix((0, 0), dtype=tiledb_array.dtype)).compute()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import scanpy as sc, anndata as ad" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 575 ms, sys: 111 ms, total: 686 ms\n", | |
| "Wall time: 679 ms\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "AnnData object with n_obs × n_vars = 1000000 × 52437" | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "adata = ad.AnnData(\n", | |
| " X=da.map_blocks(\n", | |
| " sparse_chunk,\n", | |
| " chunks=(\n", | |
| " tuple(to_listed_chunks(10_000, 1_000_000)),\n", | |
| " (tiledb_array.shape[1],)\n", | |
| " ),\n", | |
| " meta=sparse.csr_matrix((0, 0), dtype=tiledb_array.dtype)\n", | |
| " )\n", | |
| ")\n", | |
| "adata" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 24.2 ms, sys: 5.75 ms, total: 29.9 ms\n", | |
| "Wall time: 28 ms\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "sc.pp.normalize_total(adata)\n", | |
| "sc.pp.log1p(adata)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n", | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n", | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n", | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n", | |
| "/home/ubuntu/miniforge3/envs/cellxgene-census-dev/lib/python3.11/site-packages/tiledb/cloud/config.py:96: UserWarning: You must first login before you can run commands. Please run tiledb.cloud.login.\n", | |
| " warnings.warn(\n" | |
| ] | |
| }, | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 19.8 s, sys: 2.98 s, total: 22.8 s\n", | |
| "Wall time: 41.3 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "AnnData object with n_obs × n_vars = 1000000 × 52437\n", | |
| " var: 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n", | |
| " uns: 'log1p', 'hvg'" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "sc.pp.highly_variable_genes(adata)\n", | |
| "adata" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## tiledbsoma" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "This section demonstrates creating and using dask with a tiledbsoma query. This means replicating axis filters that have been applied, though I may choose to specialize on `obs` filters here for performance reasons." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "The \"stable\" release is currently 2024-07-01. Specify 'census_version=\"2024-07-01\"' in future calls to open_soma() to ensure data consistency.\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<somacore.query.query.ExperimentAxisQuery at 0x7e09481ecdd0>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "SparseArray(uri='s3://cellxgene-census-public-us-west-2/cell-census/2024-07-01/soma/census_data/mus_musculus/ms/RNA/X/raw', mode=r, ndim=2)" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "(41233630, 52437)" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "import cellxgene_census\n", | |
| "import tiledbsoma\n", | |
| "import anndata as ad, scanpy as sc\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "census = cellxgene_census.open_soma(census_version=\"stable\")\n", | |
| "mouse = census[\"census_data\"][\"mus_musculus\"]\n", | |
| "\n", | |
| "query = mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(value_filter=\"is_primary_data == True\"))\n", | |
| "display(query)\n", | |
| "\n", | |
| "\n", | |
| "# Retrieving the array from a query\n", | |
| "tiledb_array = tiledb.open(\n", | |
| " query.X(\"raw\").array.uri,\n", | |
| " ctx=tiledb.Ctx(CTX),\n", | |
| ")\n", | |
| "\n", | |
| "display(tiledb_array)\n", | |
| "display(tiledb_array.shape)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "1020" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "query = mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(value_filter=\"is_primary_data == True\", coords=(slice(0, 83_000),)))\n", | |
| "query.n_obs\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 21, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def query_to_dask(\n", | |
| " query: tiledbsoma.ExperimentAxisQuery,\n", | |
| " layer: str = \"raw\",\n", | |
| " obs_chunks: np.ndarray | int = 10_000,\n", | |
| ") -> da.Array:\n", | |
| " if not isinstance(obs_chunks, (int, np.integer)):\n", | |
| " raise NotImplementedError(\"Doesn't support arrays just yet\")\n", | |
| "\n", | |
| " obs_chunks_listed = to_listed_chunks(obs_chunks, query.n_obs)\n", | |
| "\n", | |
| " # Do I have any guarantees about the order of these?\n", | |
| " # Reshapeing so data is distributed propery for the map blocks operation\n", | |
| " obs_joinids = da.from_array(query.obs_joinids().to_numpy(), chunks=obs_chunks_listed).reshape((-1, 1))\n", | |
| " var_joinids = da.from_array(query.var_joinids().to_numpy(), chunks=-1)\n", | |
| "\n", | |
| " uri = query.X(layer).array.uri\n", | |
| "\n", | |
| " def sparse_chunk(obs_joinids, var_joinids, block_id, block_info):\n", | |
| " shape = block_info[None][\"chunk-shape\"]\n", | |
| " array_location = block_info[None][\"array-location\"]\n", | |
| " offsets = array_location[0][0], array_location[1][0]\n", | |
| " obs_joinids = obs_joinids.flatten()\n", | |
| "\n", | |
| " tiledb_array = tiledb.open(\n", | |
| " uri,\n", | |
| " ctx=tiledb.Ctx(CTX),\n", | |
| " )\n", | |
| " res = tiledb_array.multi_index[obs_joinids, var_joinids]\n", | |
| "\n", | |
| " # Inplace operations sometimes throwing errors, something about read only buffers.\n", | |
| " row = cmap(res[\"soma_dim_0\"], obs_joinids)\n", | |
| " col = cmap(res[\"soma_dim_1\"], var_joinids)\n", | |
| "\n", | |
| " a = sparse.csr_matrix((res[\"soma_data\"], (row, col)), shape=shape)\n", | |
| "\n", | |
| " return a\n", | |
| " expr = da.map_blocks(\n", | |
| " sparse_chunk,\n", | |
| " obs_joinids,\n", | |
| " var_joinids,\n", | |
| " meta=sparse.csr_matrix((0, 0), dtype=np.float32),\n", | |
| " chunks=(obs_chunks_listed, (len(var_joinids),)),\n", | |
| " )\n", | |
| " return expr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 1.46 s, sys: 2.54 s, total: 4 s\n", | |
| "Wall time: 1.97 s\n", | |
| "CPU times: user 565 ms, sys: 289 ms, total: 854 ms\n", | |
| "Wall time: 425 ms\n", | |
| "CPU times: user 2.03 s, sys: 2.83 s, total: 4.85 s\n", | |
| "Wall time: 2.4 s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<table>\n", | |
| " <tr>\n", | |
| " <td>\n", | |
| " <table style=\"border-collapse: collapse;\">\n", | |
| " <thead>\n", | |
| " <tr>\n", | |
| " <td> </td>\n", | |
| " <th> Array </th>\n", | |
| " <th> Chunk </th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " \n", | |
| " <tr>\n", | |
| " <th> Shape </th>\n", | |
| " <td> (32722, 52437) </td>\n", | |
| " <td> (10000, 52437) </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th> Dask graph </th>\n", | |
| " <td colspan=\"2\"> 4 chunks in 6 graph layers </td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th> Data type </th>\n", | |
| " <td colspan=\"2\"> float32 scipy.sparse._csr.csr_matrix </td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| " </table>\n", | |
| " </td>\n", | |
| " <td>\n", | |
| " <svg width=\"170\" height=\"124\" style=\"stroke:rgb(0,0,0);stroke-width:1\" >\n", | |
| "\n", | |
| " <!-- Horizontal lines -->\n", | |
| " <line x1=\"0\" y1=\"0\" x2=\"120\" y2=\"0\" style=\"stroke-width:2\" />\n", | |
| " <line x1=\"0\" y1=\"22\" x2=\"120\" y2=\"22\" />\n", | |
| " <line x1=\"0\" y1=\"45\" x2=\"120\" y2=\"45\" />\n", | |
| " <line x1=\"0\" y1=\"68\" x2=\"120\" y2=\"68\" />\n", | |
| " <line x1=\"0\" y1=\"74\" x2=\"120\" y2=\"74\" style=\"stroke-width:2\" />\n", | |
| "\n", | |
| " <!-- Vertical lines -->\n", | |
| " <line x1=\"0\" y1=\"0\" x2=\"0\" y2=\"74\" style=\"stroke-width:2\" />\n", | |
| " <line x1=\"120\" y1=\"0\" x2=\"120\" y2=\"74\" style=\"stroke-width:2\" />\n", | |
| "\n", | |
| " <!-- Colored Rectangle -->\n", | |
| " <polygon points=\"0.0,0.0 120.0,0.0 120.0,74.88300246009497 0.0,74.88300246009497\" style=\"fill:#ECB172A0;stroke-width:0\"/>\n", | |
| "\n", | |
| " <!-- Text -->\n", | |
| " <text x=\"60.000000\" y=\"94.883002\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" >52437</text>\n", | |
| " <text x=\"140.000000\" y=\"37.441501\" font-size=\"1.0rem\" font-weight=\"100\" text-anchor=\"middle\" transform=\"rotate(-90,140.000000,37.441501)\">32722</text>\n", | |
| "</svg>\n", | |
| " </td>\n", | |
| " </tr>\n", | |
| "</table>" | |
| ], | |
| "text/plain": [ | |
| "dask.array<sparse_chunk, shape=(32722, 52437), dtype=float32, chunksize=(10000, 52437), chunktype=scipy.csr_matrix>" | |
| ] | |
| }, | |
| "execution_count": 22, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "query = mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(value_filter=\"is_primary_data == True and tissue == 'limb muscle'\"))\n", | |
| "%time obs = query.obs().concat().to_pandas()\n", | |
| "%time X = query_to_dask(query)\n", | |
| "X" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "def query_to_anndata(query, obs_chunks: int = 10_000):\n", | |
| " return ad.AnnData(\n", | |
| " X=query_to_dask(query, obs_chunks=obs_chunks),\n", | |
| " obs=query.obs().concat().to_pandas(),\n", | |
| " var=query.var().concat().to_pandas(),\n", | |
| " )" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stderr", | |
| "output_type": "stream", | |
| "text": [ | |
| "/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n", | |
| " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n", | |
| "/home/ubuntu/github/anndata/src/anndata/_core/aligned_df.py:67: ImplicitModificationWarning: Transforming to str index.\n", | |
| " warnings.warn(\"Transforming to str index.\", ImplicitModificationWarning)\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "AnnData object with n_obs × n_vars = 1000000 × 52437\n", | |
| " obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n", | |
| " var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs'" | |
| ] | |
| }, | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "adata_first_million = query_to_anndata(\n", | |
| " mouse.axis_query(\"RNA\", obs_query=tiledbsoma.AxisQuery(coords=(slice(0, 1_000_000 - 1),)))\n", | |
| ")\n", | |
| "adata_first_million" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 25, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "sc.pp.normalize_total(adata_first_million)\n", | |
| "sc.pp.log1p(adata_first_million)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "There is overhead to going through the query. I strongly suspect this is largely because we are now accessing the array with integer coordinates, which are more expensive to transfer and work with than slices would be." | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 30.5 s, sys: 6.47 s, total: 37 s\n", | |
| "Wall time: 1min 4s\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "AnnData object with n_obs × n_vars = 1000000 × 52437\n", | |
| " obs: 'soma_joinid', 'dataset_id', 'assay', 'assay_ontology_term_id', 'cell_type', 'cell_type_ontology_term_id', 'development_stage', 'development_stage_ontology_term_id', 'disease', 'disease_ontology_term_id', 'donor_id', 'is_primary_data', 'observation_joinid', 'self_reported_ethnicity', 'self_reported_ethnicity_ontology_term_id', 'sex', 'sex_ontology_term_id', 'suspension_type', 'tissue', 'tissue_ontology_term_id', 'tissue_type', 'tissue_general', 'tissue_general_ontology_term_id', 'raw_sum', 'nnz', 'raw_mean_nnz', 'raw_variance_nnz', 'n_measured_vars'\n", | |
| " var: 'soma_joinid', 'feature_id', 'feature_name', 'feature_length', 'nnz', 'n_measured_obs', 'highly_variable', 'means', 'dispersions', 'dispersions_norm'\n", | |
| " uns: 'log1p', 'hvg'" | |
| ] | |
| }, | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "sc.pp.highly_variable_genes(adata_first_million)\n", | |
| "adata_first_million\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## scratch" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "query_big = mouse.axis_query(\n", | |
| " \"RNA\",\n", | |
| " obs_query=tiledbsoma.AxisQuery(\n", | |
| " value_filter=\"\"\"is_primary_data == True and assay in [\"10x 3' v3\", \"10x 3' v2\"]\"\"\"\n", | |
| " )\n", | |
| ")\n", | |
| "obs_big = query_big.obs().concat().to_pandas()\n", | |
| "var = query_big.var().concat().to_pandas()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<tiledbsoma._sparse_nd_array.SparseNDArrayRead at 0x75a30d30f050>" | |
| ] | |
| }, | |
| "execution_count": 38, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "query.X(\"raw\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 76, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 0 ns, sys: 514 µs, total: 514 µs\n", | |
| "Wall time: 352 µs\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([0, 1, 2, 3, 4])" | |
| ] | |
| }, | |
| "execution_count": 76, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "idxr.by_obs(obs_idx[:5])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 77, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "CPU times: user 0 ns, sys: 498 µs, total: 498 µs\n", | |
| "Wall time: 352 µs\n" | |
| ] | |
| }, | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "array([0, 1, 2, 3, 4])" | |
| ] | |
| }, | |
| "execution_count": 77, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "%%time\n", | |
| "idxr.by_obs(obs_idx[:5])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 103, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "OrderedDict([('soma_dim_0', array([10, 10, 10, ..., 18, 18, 18])),\n", | |
| " ('soma_dim_1',\n", | |
| " array([ 7, 21, 24, ..., 18005, 18008, 18022])),\n", | |
| " ('soma_data',\n", | |
| " array([ 1., 1., 1., ..., 46., 1., 1.], dtype=float32))])" | |
| ] | |
| }, | |
| "execution_count": 103, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "tiledb_array.multi_index[np.arange(10_000), np.arange(tiledb_array.shape[1])]" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "cellxgene-census-dev", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.9" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment