Skip to content

Instantly share code, notes, and snippets.

@NikolayXHD
Last active August 22, 2025 16:59
Show Gist options
  • Select an option

  • Save NikolayXHD/fe90f34902d0a563d5f9a7ad62558d78 to your computer and use it in GitHub Desktop.

Select an option

Save NikolayXHD/fe90f34902d0a563d5f9a7ad62558d78 to your computer and use it in GitHub Desktop.
More effective top-level summary for polars dataframe
"""
(2_291_675, 10)
┌─────────────┬────────────────────────────────┬──────┬───────┬────────────────────────────┐
│ col ┆ type ┆ null ┆ uniq ┆ median │
╞═════════════╪════════════════════════════════╪══════╪═══════╪════════════════════════════╡
│ offer_id ┆ Null ┆ all ┆ 0 ┆ null │
│ task_id ┆ Int64 ┆ 0 ┆ all ┆ 1.14557e+08 │
│ external_id ┆ Int64 ┆ 0 ┆ all ┆ 1.96216e+08 │
│ call_type ┆ Null ┆ all ┆ 0 ┆ null │
│ call_source ┆ String ┆ 0 ┆ 1 ┆ call_tracker │
│ call_time ┆ Datetime(time_unit='us', ┆ 0 ┆ 0.93 ┆ 2024-10-09 12:22:21 │
│ ┆ time_zone=None) ┆ ┆ ┆ │
│ duration ┆ Decimal(precision=38, scale=6) ┆ 0 ┆ 0.013 ┆ 70.208 │
│ url ┆ String ┆ 0 ┆ all ┆ http://call-tracking-api… │
│ filename ┆ String ┆ 0 ┆ 3 ┆ storage_file │
│ created_at ┆ Datetime(time_unit='us', ┆ 0 ┆ 1.0 ┆ 2024-10-09 12:26:57.655794 │
│ ┆ time_zone=None) ┆ ┆ ┆ │
└─────────────┴────────────────────────────────┴──────┴───────┴────────────────────────────┘
"""
# Gives a better chance of spotting damaged or unexpected data;
# df.glimpse() and df.describe() just do not cut it, IMO.
import polars as pl
def tldr(df: pl.DataFrame) -> None:
    """Print the shape of *df* followed by its per-column summary table.

    The first printed line is ``(rows, columns)`` with ``_`` as the digit
    group separator. The table itself comes from ``frame_tldr`` and is
    rendered under display settings that apply only inside this call.
    """
    n_rows = len(df)
    n_cols = len(df.columns)
    print(f'({n_rows:_}, {n_cols:_})')

    with pl.Config() as cfg:
        # Table chrome: show every summary row, drop the dtype header row
        # and the shape line.
        cfg.set_tbl_rows(-1)
        cfg.set_tbl_hide_column_data_types()
        cfg.set_tbl_hide_dataframe_shape()
        # Number rendering.
        cfg.set_tbl_cell_numeric_alignment('RIGHT')
        cfg.set_thousands_separator('_')
        cfg.set_float_precision(3)

        summary = frame_tldr(df)
        print(summary)
def frame_tldr(df: pl.DataFrame) -> pl.DataFrame:
    """Build a summary frame with one row per column of *df*.

    Output columns:

    * ``col``    - column name
    * ``type``   - ``str()`` of the column dtype
    * ``null``   - null count, formatted by ``_val_or_rate``
    * ``uniq``   - count of distinct non-null values, formatted by
      ``_val_or_rate``
    * ``median`` - typical value as produced by ``_median_or_mode``

    ``null``, ``uniq`` and ``median`` are stored with the ``Object`` dtype
    because their cells mix strings and floats.
    """
    total = len(df)
    names = df.columns

    # Null-free version of each column, computed once and reused below.
    present = {}
    for name in names:
        present[name] = df.get_column(name).drop_nulls()

    def object_column(label, cells):
        # Object dtype keeps heterogeneous Python values as they are.
        return pl.Series(label, cells, dtype=pl.Object())

    schema = df.schema
    type_cells = [str(schema[name]) for name in names]
    null_cells = [
        _val_or_rate(total - present[name].len(), total) for name in names
    ]
    uniq_cells = [
        _val_or_rate(present[name].n_unique(), total) for name in names
    ]
    median_cells = [
        _median_or_mode(present[name], schema[name]) for name in names
    ]

    return pl.DataFrame(
        {
            'col': names,
            'type': type_cells,
            'null': object_column('null', null_cells),
            'uniq': object_column('uniq', uniq_cells),
            'median': object_column('median', median_cells),
        }
    )
def _median_or_mode(ds_non_null: pl.Series, dtype: pl.DataType) -> str:
    """Return a short "typical value" string for a null-free column.

    Args:
        ds_non_null: The column values with nulls already dropped.
        dtype: The dtype of the original column; selects the statistic.

    Returns:
        * ``'null'`` when there are no values or the dtype is not handled;
        * the median formatted with ``g`` for numeric dtypes;
        * the share of ``True`` values for Boolean;
        * ``str()`` of the median for temporal dtypes;
        * the first reported mode, truncated to 24 characters plus an
          ellipsis, for Enum, Categorical and String.
    """
    # Explicit length test: this is the guard that keeps the statistics
    # below from running on an empty column.
    if len(ds_non_null) == 0:
        return 'null'
    if dtype.is_numeric():
        return f'{ds_non_null.median():4g}'.strip()
    # Dtypes are compared with ``==`` rather than ``DataType.is_``. polars
    # documents ``is_`` as a stricter check than ``==``, so a parameterized
    # dtype instance (e.g. a Categorical carrying an ordering) may not match
    # the bare class and would wrongly fall through to 'null'.
    if dtype == pl.Boolean:
        return f'{ds_non_null.sum() / ds_non_null.len()}'
    if dtype.is_temporal():
        return f'{ds_non_null.median()}'
    if (
        isinstance(dtype, pl.Enum)
        or dtype == pl.Categorical
        or dtype == pl.String
    ):
        # NOTE(review): mode() can return several values on ties; only the
        # first one reported is shown.
        return _ellipsis(str(ds_non_null.mode()[0]), 24)
    return 'null'
def _val_or_rate(n: int, total: int) -> int | float | str:
return (
'all' if n == total else f'{n:_}' if n < 100 else round(n / total, 3)
)
def _ellipsis(v: str, num: int) -> str:
return v if len(v) <= num else v[:num] + '…'
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment