Last active
September 23, 2024 16:35
-
-
Save ayaksvals/c43aa98953a0174b7cd7a0315555df65 to your computer and use it in GitHub Desktop.
Converter from pairs.gz to parquet. Contains Polars, DuckDB and Dask Versions
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "d30d1c6f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import bioframe\n", | |
| "import pypairix\n", | |
| "import dask.dataframe as dd\n", | |
| "import dask.array as da\n", | |
| "import polars as pl\n", | |
| "import pandas as pd" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "489c6689", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "#Add in the future ENUM type for chr1 and chr2\n", | |
| "\n", | |
| "# comment-prefix\n", | |
| "\n", | |
| "polarsDf=pl.scan_csv(\"NIPBL_R1.nodups.pairs.gz\", separator=\"\\t\", schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String}, ignore_errors=True)\n", | |
| "#polarsDf=polarsDf.collect()\n", | |
| "#polarsDf" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 27, | |
| "id": "1d911efd", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "176" | |
| ] | |
| }, | |
| "execution_count": 27, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# To find out all the header columns and amount of them\n", | |
| "polarsDf.filter(pl.col(\"read_id\").str.contains(\"#\")).height" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "bb9847b0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "\"\"\"\n", | |
| "Compression alternatives:\n", | |
| " “zstd”: good compression performance\n", | |
| " “lz4”: fast compression/decompression\n", | |
| " “snappy”: more backwards compatibility guarantees(older parquet readers)\n", | |
| "\n", | |
| "row_group_size optimal?\n", | |
| "\n", | |
| "No compression Level, with => more time\n", | |
| "lz4 777.02MB 1m 22s 10000\n", | |
| "zstd 684.55MB 1m 23s no rows\n", | |
| "snappy 773.80MB 1m 20 no rows\n", | |
| "gzip 650.21MB 1m 44s n rows\n", | |
| "\"\"\"\n", | |
| "polarsDf.sink_parquet(\"pairsToPolarsSnappy.parquet\", compression=\"snappy\")\n", | |
| "\n" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "id": "f15833ff", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<i>naive plan: (run <b>LazyFrame.explain(optimized=True)</b> to see the optimized plan)</i>\n", | |
| " <p></p>\n", | |
| " <div>Parquet SCAN [pairsToPolarsgzip.parquet]<p></p>PROJECT */8 COLUMNS</div>" | |
| ], | |
| "text/plain": [ | |
| "<LazyFrame at 0x2AC0091B2880>" | |
| ] | |
| }, | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n", | |
| "#df = polars.read_parquet('concatenated.parquet')\n", | |
| "\n", | |
| "#Lazy Df scan\n", | |
| "df = pl.scan_parquet('pairsToPolarsgzip.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})\n", | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "a9811098", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div><style>\n", | |
| ".dataframe > thead > tr,\n", | |
| ".dataframe > tbody > tr {\n", | |
| " text-align: right;\n", | |
| " white-space: pre-wrap;\n", | |
| "}\n", | |
| "</style>\n", | |
| "<small>shape: (65_220_833, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>"#sorted: chr1-chr2-pos1-pos2"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#shape: upper triangle"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#genome_assembly: unknown"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#chromsize: chr1 197195432"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>"#chromsize: chr2 181748087"</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td><td>…</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902279</td><td>"chrY"</td><td>2902428</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902282</td><td>"chrY"</td><td>2902487</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902286</td><td>"chrY"</td><td>2902430</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902375</td><td>"chrY"</td><td>2902555</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr><tr><td>"."</td><td>"chrY"</td><td>2902405</td><td>"chrY"</td><td>2902560</td><td>"+"</td><td>"-"</td><td>"LL"</td></tr></tbody></table></div>" | |
| ], | |
| "text/plain": [ | |
| "shape: (65_220_833, 8)\n", | |
| "┌───────────────────────────┬────────┬─────────┬────────┬─────────┬─────────┬─────────┬────────────┐\n", | |
| "│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n", | |
| "│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n", | |
| "│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n", | |
| "╞═══════════════════════════╪════════╪═════════╪════════╪═════════╪═════════╪═════════╪════════════╡\n", | |
| "│ #sorted: ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
| "│ chr1-chr2-pos1-pos2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", | |
| "│ #shape: upper triangle ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
| "│ #genome_assembly: unknown ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
| "│ #chromsize: chr1 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
| "│ 197195432 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", | |
| "│ #chromsize: chr2 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n", | |
| "│ 181748087 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n", | |
| "│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n", | |
| "│ . ┆ chrY ┆ 2902279 ┆ chrY ┆ 2902428 ┆ + ┆ - ┆ LL │\n", | |
| "│ . ┆ chrY ┆ 2902282 ┆ chrY ┆ 2902487 ┆ + ┆ - ┆ LL │\n", | |
| "│ . ┆ chrY ┆ 2902286 ┆ chrY ┆ 2902430 ┆ + ┆ - ┆ LL │\n", | |
| "│ . ┆ chrY ┆ 2902375 ┆ chrY ┆ 2902555 ┆ + ┆ - ┆ LL │\n", | |
| "│ . ┆ chrY ┆ 2902405 ┆ chrY ┆ 2902560 ┆ + ┆ - ┆ LL │\n", | |
| "└───────────────────────────┴────────┴─────────┴────────┴─────────┴─────────┴─────────┴────────────┘" | |
| ] | |
| }, | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df.collect()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "main", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.9.19" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment