Skip to content

Instantly share code, notes, and snippets.

@ayaksvals
Last active September 23, 2024 16:35
Show Gist options
  • Select an option

  • Save ayaksvals/c43aa98953a0174b7cd7a0315555df65 to your computer and use it in GitHub Desktop.

Select an option

Save ayaksvals/c43aa98953a0174b7cd7a0315555df65 to your computer and use it in GitHub Desktop.
Converter from pairs.gz to parquet. Contains Polars, DuckDB and Dask Versions
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
Loading
Sorry, something went wrong. Reload?
Sorry, we cannot display this file.
Sorry, this file is invalid so it cannot be displayed.
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "code",
"execution_count": 1,
"id": "d30d1c6f",
"metadata": {},
"outputs": [],
"source": [
"import bioframe\n",
"import pypairix\n",
"import dask.dataframe as dd\n",
"import dask.array as da\n",
"import polars as pl\n",
"import pandas as pd"
]
},
{
"cell_type": "code",
"execution_count": 2,
"id": "489c6689",
"metadata": {},
"outputs": [],
"source": [
"#Add in the future ENUM type for chr1 and chr2\n",
"\n",
"# comment-prefix\n",
"\n",
"polarsDf=pl.scan_csv(\"NIPBL_R1.nodups.pairs.gz\", separator=\"\\t\", schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String}, ignore_errors=True)\n",
"#polarsDf=polarsDf.collect()\n",
"#polarsDf"
]
},
{
"cell_type": "code",
"execution_count": 27,
"id": "1d911efd",
"metadata": {},
"outputs": [
{
"data": {
"text/plain": [
"176"
]
},
"execution_count": 27,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# To find out all the header columns and amount of them\n",
"polarsDf.filter(pl.col(\"read_id\").str.contains(\"#\")).height"
]
},
{
"cell_type": "code",
"execution_count": 3,
"id": "bb9847b0",
"metadata": {},
"outputs": [],
"source": [
"\"\"\"\n",
"Compression alternatives:\n",
" “zstd”: good compression performance\n",
" “lz4”: fast compression/decompression\n",
" “snappy”: more backwards compatibility guarantees(older parquet readers)\n",
"\n",
"row_group_size optimal?\n",
"\n",
"No compression Level, with => more time\n",
"lz4 777.02MB 1m 22s 10000\n",
"zstd 684.55MB 1m 23s no rows\n",
"snappy 773.80MB 1m 20 no rows\n",
"gzip 650.21MB 1m 44s n rows\n",
"\"\"\"\n",
"polarsDf.sink_parquet(\"pairsToPolarsSnappy.parquet\", compression=\"snappy\")\n",
"\n"
]
},
{
"cell_type": "code",
"execution_count": 18,
"id": "f15833ff",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<i>naive plan: (run <b>LazyFrame.explain(optimized=True)</b> to see the optimized plan)</i>\n",
" <p></p>\n",
" <div>Parquet SCAN [pairsToPolarsgzip.parquet]<p></p>PROJECT */8 COLUMNS</div>"
],
"text/plain": [
"<LazyFrame at 0x2AC0091B2880>"
]
},
"execution_count": 18,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"# add later properties, like: columns, parallel, use_statistics, low_memory, memory_map+use_pyarrow=True.\n",
"#df = polars.read_parquet('concatenated.parquet')\n",
"\n",
"#Lazy Df scan\n",
"df = pl.scan_parquet('pairsToPolarsgzip.parquet', hive_schema={\"read_id\": pl.String, \"chrom1\": pl.String, \"pos1\":pl.UInt32, \"chrom2\": pl.String, \"pos2\":pl.UInt32, \"strand1\":pl.String, \"strand2\":pl.String, \"pairs_type\":pl.String})\n",
"df"
]
},
{
"cell_type": "code",
"execution_count": 17,
"id": "a9811098",
"metadata": {},
"outputs": [
{
"data": {
"text/html": [
"<div><style>\n",
".dataframe > thead > tr,\n",
".dataframe > tbody > tr {\n",
" text-align: right;\n",
" white-space: pre-wrap;\n",
"}\n",
"</style>\n",
"<small>shape: (65_220_833, 8)</small><table border=\"1\" class=\"dataframe\"><thead><tr><th>read_id</th><th>chrom1</th><th>pos1</th><th>chrom2</th><th>pos2</th><th>strand1</th><th>strand2</th><th>pairs_type</th></tr><tr><td>str</td><td>str</td><td>u32</td><td>str</td><td>u32</td><td>str</td><td>str</td><td>str</td></tr></thead><tbody><tr><td>&quot;#sorted: chr1-chr2-pos1-pos2&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#shape: upper triangle&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#genome_assembly: unknown&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#chromsize: chr1 197195432&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&quot;#chromsize: chr2 181748087&quot;</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td><td>null</td></tr><tr><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td><td>&hellip;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902279</td><td>&quot;chrY&quot;</td><td>2902428</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902282</td><td>&quot;chrY&quot;</td><td>2902487</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902286</td><td>&quot;chrY&quot;</td><td>2902430</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902375</td><td>&quot;chrY&quot;</td><td>2902555</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr><tr><td>&quot;.&quot;</td><td>&quot;chrY&quot;</td><td>2902405</td><td>&quot;chrY&quot;</td><td>2902560</td><td>&quot;+&quot;</td><td>&quot;-&quot;</td><td>&quot;LL&quot;</td></tr></tbody></table></div>"
],
"text/plain": [
"shape: (65_220_833, 8)\n",
"┌───────────────────────────┬────────┬─────────┬────────┬─────────┬─────────┬─────────┬────────────┐\n",
"│ read_id ┆ chrom1 ┆ pos1 ┆ chrom2 ┆ pos2 ┆ strand1 ┆ strand2 ┆ pairs_type │\n",
"│ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- ┆ --- │\n",
"│ str ┆ str ┆ u32 ┆ str ┆ u32 ┆ str ┆ str ┆ str │\n",
"╞═══════════════════════════╪════════╪═════════╪════════╪═════════╪═════════╪═════════╪════════════╡\n",
"│ #sorted: ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ chr1-chr2-pos1-pos2 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ #shape: upper triangle ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ #genome_assembly: unknown ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ #chromsize: chr1 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 197195432 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ #chromsize: chr2 ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null ┆ null │\n",
"│ 181748087 ┆ ┆ ┆ ┆ ┆ ┆ ┆ │\n",
"│ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … ┆ … │\n",
"│ . ┆ chrY ┆ 2902279 ┆ chrY ┆ 2902428 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902282 ┆ chrY ┆ 2902487 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902286 ┆ chrY ┆ 2902430 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902375 ┆ chrY ┆ 2902555 ┆ + ┆ - ┆ LL │\n",
"│ . ┆ chrY ┆ 2902405 ┆ chrY ┆ 2902560 ┆ + ┆ - ┆ LL │\n",
"└───────────────────────────┴────────┴─────────┴────────┴─────────┴─────────┴─────────┴────────────┘"
]
},
"execution_count": 17,
"metadata": {},
"output_type": "execute_result"
}
],
"source": [
"df.collect()"
]
}
],
"metadata": {
"kernelspec": {
"display_name": "main",
"language": "python",
"name": "python3"
},
"language_info": {
"codemirror_mode": {
"name": "ipython",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.9.19"
}
},
"nbformat": 4,
"nbformat_minor": 5
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment