Last active
August 22, 2025 16:59
-
-
Save NikolayXHD/fe90f34902d0a563d5f9a7ad62558d78 to your computer and use it in GitHub Desktop.
More effective top-level summary for polars dataframe
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| """ | |
| (2_291_675, 10) | |
| ┌─────────────┬────────────────────────────────┬──────┬───────┬────────────────────────────┐ | |
| │ col ┆ type ┆ null ┆ uniq ┆ median │ | |
| ╞═════════════╪════════════════════════════════╪══════╪═══════╪════════════════════════════╡ | |
| │ offer_id ┆ Null ┆ all ┆ 0 ┆ null │ | |
| │ task_id ┆ Int64 ┆ 0 ┆ all ┆ 1.14557e+08 │ | |
| │ external_id ┆ Int64 ┆ 0 ┆ all ┆ 1.96216e+08 │ | |
| │ call_type ┆ Null ┆ all ┆ 0 ┆ null │ | |
| │ call_source ┆ String ┆ 0 ┆ 1 ┆ call_tracker │ | |
| │ call_time ┆ Datetime(time_unit='us', ┆ 0 ┆ 0.93 ┆ 2024-10-09 12:22:21 │ | |
| │ ┆ time_zone=None) ┆ ┆ ┆ │ | |
| │ duration ┆ Decimal(precision=38, scale=6) ┆ 0 ┆ 0.013 ┆ 70.208 │ | |
| │ url ┆ String ┆ 0 ┆ all ┆ http://call-tracking-api… │ | |
| │ filename ┆ String ┆ 0 ┆ 3 ┆ storage_file │ | |
| │ created_at ┆ Datetime(time_unit='us', ┆ 0 ┆ 1.0 ┆ 2024-10-09 12:26:57.655794 │ | |
| │ ┆ time_zone=None) ┆ ┆ ┆ │ | |
| └─────────────┴────────────────────────────────┴──────┴───────┴────────────────────────────┘ | |
| """ | |
# Gives a better chance of spotting damaged or unexpected data;
# df.glimpse() and df.describe() alone just do not cut it, IMO.
| import polars as pl | |
def tldr(df: pl.DataFrame) -> None:
    """Print a compact per-column summary of ``df`` to stdout.

    The first line printed is the frame's shape as ``(rows, columns)``,
    using ``_`` as the digit-group separator. It is followed by the table
    produced by ``frame_tldr``.
    """
    n_rows = len(df)
    n_cols = len(df.columns)
    print(f'({n_rows:_}, {n_cols:_})')
    # Display options are passed to the Config context manager, so they are
    # in effect while the summary table is rendered.
    with pl.Config(
        tbl_rows=-1,
        tbl_cell_numeric_alignment='RIGHT',
        thousands_separator='_',
        tbl_hide_column_data_types=True,
        tbl_hide_dataframe_shape=True,
        float_precision=3,
    ):
        print(frame_tldr(df))
def frame_tldr(df: pl.DataFrame) -> pl.DataFrame:
    """Build a summary frame with one row per column of ``df``.

    Columns of the result:
        col:    the column name.
        type:   the string form of the column's dtype.
        null:   the number of nulls, formatted by ``_val_or_rate``.
        uniq:   the number of distinct non-null values, formatted by
                ``_val_or_rate``.
        median: a typical value, as produced by ``_median_or_mode``.
    """
    total = len(df)
    null_cells = []
    uniq_cells = []
    typical_cells = []
    for name, dtype in df.schema.items():
        # Every statistic below is computed on the non-null values only.
        present = df.get_column(name).drop_nulls()
        null_cells.append(_val_or_rate(total - present.len(), total))
        uniq_cells.append(_val_or_rate(present.n_unique(), total))
        typical_cells.append(_median_or_mode(present, dtype))

    # The cells mix strings and floats, hence the Object dtype.
    return pl.DataFrame(
        {
            'col': df.columns,
            'type': [str(dtype) for dtype in df.schema.values()],
            'null': pl.Series('null', null_cells, dtype=pl.Object()),
            'uniq': pl.Series('uniq', uniq_cells, dtype=pl.Object()),
            'median': pl.Series('median', typical_cells, dtype=pl.Object()),
        }
    )
def _median_or_mode(ds_non_null: pl.Series, dtype: pl.DataType) -> str:
    """Return a short text describing the typical value of a column.

    Args:
        ds_non_null: the column's values with nulls already dropped.
        dtype: the column's dtype; it selects which statistic is reported.

    Returns:
        ``'null'`` when the series is empty or the dtype is not handled
        below. Otherwise: the median for numeric and temporal dtypes, the
        share of ``True`` values for booleans, and the most frequent value
        (truncated to 24 characters) for Enum, Categorical and String.
    """
    if len(ds_non_null) == 0:
        return 'null'
    if dtype.is_numeric():
        return f'{ds_non_null.median():4g}'.strip()
    if dtype.is_(pl.Boolean):
        return f'{ds_non_null.sum() / ds_non_null.len()}'
    if dtype.is_temporal():
        return f'{ds_non_null.median()}'
    if (
        isinstance(dtype, pl.Enum)
        or dtype.is_(pl.Categorical)
        or dtype.is_(pl.String)
    ):
        # mode() can return several tied values and does not promise their
        # order; sorting first makes the reported value reproducible for
        # identical input data.
        most_common = ds_non_null.mode().sort()[0]
        return _ellipsis(str(most_common), 24)
    return 'null'
| def _val_or_rate(n: int, total: int) -> int | float | str: | |
| return ( | |
| 'all' if n == total else f'{n:_}' if n < 100 else round(n / total, 3) | |
| ) | |
| def _ellipsis(v: str, num: int) -> str: | |
| return v if len(v) <= num else v[:num] + '…' |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment