NikolayXHD · August 22, 2025 16:59
diff --git a/polars_tldr.py b/polars_tldr.py
 """
 (2_291_675, 10)
 ┌─────────────┬────────────────────────────────┬──────┬───────┬────────────────────────────┐
 │ col         ┆ type                           ┆ null ┆ uniq  ┆ median                     │
 ╞═════════════╪════════════════════════════════╪══════╪═══════╪════════════════════════════╡
 │ offer_id    ┆ Null                           ┆ all  ┆ 0     ┆ null                       │
 │ task_id     ┆ Int64                          ┆ 0    ┆ all   ┆ 1.14557e+08                │
 │ external_id ┆ Int64                          ┆ 0    ┆ all   ┆ 1.96216e+08                │
 │ call_type   ┆ Null                           ┆ all  ┆ 0     ┆ null                       │
 │ call_source ┆ String                         ┆ 0    ┆ 1     ┆ call_tracker               │
 │ call_time   ┆ Datetime(time_unit='us',       ┆ 0    ┆ 0.93  ┆ 2024-10-09 12:22:21        │
 │             ┆ time_zone=None)                ┆      ┆       ┆                            │
 │ duration    ┆ Decimal(precision=38, scale=6) ┆ 0    ┆ 0.013 ┆ 70.208                     │
 │ url         ┆ String                         ┆ 0    ┆ all   ┆ http://call-tracking-api…  │
 │ filename    ┆ String                         ┆ 0    ┆ 3     ┆ storage_file               │
 │ created_at  ┆ Datetime(time_unit='us',       ┆ 0    ┆ 1.0   ┆ 2024-10-09 12:26:57.655794 │
 │             ┆ time_zone=None)                ┆      ┆       ┆                            │
 └─────────────┴────────────────────────────────┴──────┴───────┴────────────────────────────┘
 """

 # For a better chance of spotting damaged or unexpected data:
 # df.glimpse() and df.describe() just do not cut it, IMO.
 import polars as pl


 def tldr(df: pl.DataFrame) -> None:
    """Print the shape of ``df`` followed by its per-column summary table.

    The summary itself is produced by ``frame_tldr``; this function only
    takes care of printing it with temporary display settings.
    """
    n_rows, n_cols = df.shape
    print(f'({n_rows:_}, {n_cols:_})')
    # Options handed to the Config constructor are the ``set_*`` methods
    # without the prefix; they stay in effect only inside the ``with`` block.
    with pl.Config(
        tbl_rows=-1,
        tbl_cell_numeric_alignment='RIGHT',
        thousands_separator='_',
        tbl_hide_column_data_types=True,
        tbl_hide_dataframe_shape=True,
        float_precision=3,
    ):
        print(frame_tldr(df))


 def frame_tldr(df: pl.DataFrame) -> pl.DataFrame:
    """Build a summary frame with one row per column of ``df``.

    Output columns:
        col: the column name.
        type: the column dtype rendered with ``str``.
        null: number of nulls, formatted by ``_val_or_rate`` against the
            row count.
        uniq: number of distinct non-null values, formatted the same way.
        median: the value produced by ``_median_or_mode`` for the
            non-null values.
    """
    total = len(df)
    type_cells = []
    null_cells = []
    uniq_cells = []
    median_cells = []
    for name, dtype in df.schema.items():
        # Every statistic below is computed on the non-null values only.
        present = df.get_column(name).drop_nulls()
        type_cells.append(str(dtype))
        null_cells.append(_val_or_rate(total - present.len(), total))
        uniq_cells.append(_val_or_rate(present.n_unique(), total))
        median_cells.append(_median_or_mode(present, dtype))

    # ``_val_or_rate`` may hand back a str for one column and a float for
    # another, so the statistic columns are stored with the Object dtype.
    return pl.DataFrame(
        {
            'col': df.columns,
            'type': type_cells,
            'null': pl.Series('null', null_cells, dtype=pl.Object()),
            'uniq': pl.Series('uniq', uniq_cells, dtype=pl.Object()),
            'median': pl.Series('median', median_cells, dtype=pl.Object()),
        }
    )


 def _median_or_mode(ds_non_null: pl.Series, dtype: pl.DataType) -> str:
    """Return a short textual "typical value" for one column.

    Args:
        ds_non_null: the column's values with nulls already dropped.
        dtype: the column's data type; it selects which statistic is used.

    Returns:
        ``'null'`` for an empty series or a dtype not handled below;
        the median in ``g`` format for numeric dtypes;
        the fraction of ``True`` values for booleans;
        the median for temporal dtypes;
        the first reported mode, cut to 24 characters plus an ellipsis,
        for Enum, Categorical and String dtypes.
    """
    if len(ds_non_null) == 0:
        return 'null'

    if dtype.is_numeric():
        # The width in ``4g`` only pads short values; the padding is stripped.
        return f'{ds_non_null.median():4g}'.strip()

    if dtype == pl.Boolean:
        return f'{ds_non_null.sum() / ds_non_null.len()}'

    if dtype.is_temporal():
        return f'{ds_non_null.median()}'

    # ``==`` rather than ``DataType.is_``: ``is_`` also compares hashes, and
    # a parametrised instance (e.g. a Categorical taken from ``df.schema``)
    # need not hash like the bare class, which would send such columns to
    # the ``'null'`` fallback below.
    if (
        isinstance(dtype, pl.Enum)
        or dtype == pl.Categorical
        or dtype == pl.String
    ):
        # NOTE(review): ``mode()`` can return several values; which one comes
        # first on a tie is presumably not guaranteed — confirm if it matters.
        return _ellipsis(str(ds_non_null.mode()[0]), 24)

    return 'null'


 def _val_or_rate(n: int, total: int) -> int | float | str:
    return (
        'all' if n == total else f'{n:_}' if n < 100 else round(n / total, 3)
    )


 def _ellipsis(v: str, num: int) -> str:
    return v if len(v) <= num else v[:num] + '…'
	"""
	(2_291_675, 10)
	┌─────────────┬────────────────────────────────┬──────┬───────┬────────────────────────────┐
	│ col         ┆ type                           ┆ null ┆ uniq  ┆ median                     │
	╞═════════════╪════════════════════════════════╪══════╪═══════╪════════════════════════════╡
	│ offer_id    ┆ Null                           ┆ all  ┆ 0     ┆ null                       │
	│ task_id     ┆ Int64                          ┆ 0    ┆ all   ┆ 1.14557e+08                │
	│ external_id ┆ Int64                          ┆ 0    ┆ all   ┆ 1.96216e+08                │
	│ call_type   ┆ Null                           ┆ all  ┆ 0     ┆ null                       │
	│ call_source ┆ String                         ┆ 0    ┆ 1     ┆ call_tracker               │
	│ call_time   ┆ Datetime(time_unit='us',       ┆ 0    ┆ 0.93  ┆ 2024-10-09 12:22:21        │
	│             ┆ time_zone=None)                ┆      ┆       ┆                            │
	│ duration    ┆ Decimal(precision=38, scale=6) ┆ 0    ┆ 0.013 ┆ 70.208                     │
	│ url         ┆ String                         ┆ 0    ┆ all   ┆ http://call-tracking-api…  │
	│ filename    ┆ String                         ┆ 0    ┆ 3     ┆ storage_file               │
	│ created_at  ┆ Datetime(time_unit='us',       ┆ 0    ┆ 1.0   ┆ 2024-10-09 12:26:57.655794 │
	│             ┆ time_zone=None)                ┆      ┆       ┆                            │
	└─────────────┴────────────────────────────────┴──────┴───────┴────────────────────────────┘
	"""

	# for better chance of spotting damaged or unexpected data
	# df.glimpse() and df.describe() just do not cut it IMO
	import polars as pl


	def tldr(df: pl.DataFrame) -> None:
	    """Print the shape of ``df`` followed by its per-column summary table.

	    The summary itself is produced by ``frame_tldr``; this function only
	    takes care of printing it with temporary display settings.
	    """
	    print(f'({len(df):_}, {len(df.columns):_})')
	    # Settings changed through ``cfg`` apply only inside the ``with`` block.
	    with pl.Config() as cfg:
	        cfg.set_tbl_rows(-1)
	        cfg.set_tbl_cell_numeric_alignment('RIGHT')
	        cfg.set_thousands_separator('_')
	        cfg.set_tbl_hide_column_data_types()
	        cfg.set_tbl_hide_dataframe_shape()
	        cfg.set_float_precision(3)
	        print(frame_tldr(df))


	def frame_tldr(df: pl.DataFrame) -> pl.DataFrame:
	    """Build a summary frame with one row per column of ``df``.

	    Output columns:
	        col: the column name.
	        type: the column dtype rendered with ``str``.
	        null: number of nulls, formatted by ``_val_or_rate`` against the
	            row count.
	        uniq: number of distinct non-null values, formatted the same way.
	        median: the value produced by ``_median_or_mode`` for the
	            non-null values.
	    """
	    # Every statistic below is computed on the non-null values only.
	    col_to_non_null = {
	        col: df.get_column(col).drop_nulls() for col in df.columns
	    }
	    n_rows = len(df)
	    return pl.DataFrame(
	        {
	            'col': df.columns,
	            'type': [str(dtype) for _name, dtype in df.schema.items()],
	            'null': pl.Series(
	                'null',
	                [
	                    _val_or_rate(n_rows - col_to_non_null[col].len(), n_rows)
	                    for col in df.columns
	                ],
	                dtype=pl.Object(),
	            ),
	            'uniq': pl.Series(
	                'uniq',
	                [
	                    _val_or_rate(col_to_non_null[col].n_unique(), n_rows)
	                    for col in df.columns
	                ],
	                dtype=pl.Object(),
	            ),
	            'median': pl.Series(
	                'median',
	                [
	                    _median_or_mode(col_to_non_null[col], df.schema[col])
	                    for col in df.columns
	                ],
	                dtype=pl.Object(),
	            ),
	        }
	    )


	def _median_or_mode(ds_non_null: pl.Series, dtype: pl.DataType) -> str:
	    """Return a short textual "typical value" for one column.

	    Args:
	        ds_non_null: the column's values with nulls already dropped.
	        dtype: the column's data type; it selects which statistic is used.

	    Returns:
	        ``'null'`` for an empty series or a dtype not handled below;
	        the median in ``g`` format for numeric dtypes;
	        the fraction of ``True`` values for booleans;
	        the median for temporal dtypes;
	        the first reported mode, cut to 24 characters plus an ellipsis,
	        for Enum, Categorical and String dtypes.
	    """
	    if len(ds_non_null) == 0:
	        return 'null'

	    if dtype.is_numeric():
	        # The width in ``4g`` only pads short values; the padding is stripped.
	        return f'{ds_non_null.median():4g}'.strip()

	    if dtype == pl.Boolean:
	        return f'{ds_non_null.sum() / ds_non_null.len()}'

	    if dtype.is_temporal():
	        return f'{ds_non_null.median()}'

	    # ``==`` rather than ``DataType.is_``: ``is_`` also compares hashes, and
	    # a parametrised instance (e.g. a Categorical taken from ``df.schema``)
	    # need not hash like the bare class, which would send such columns to
	    # the ``'null'`` fallback below.
	    if (
	        isinstance(dtype, pl.Enum)
	        or dtype == pl.Categorical
	        or dtype == pl.String
	    ):
	        return _ellipsis(str(ds_non_null.mode()[0]), 24)

	    return 'null'


	def _val_or_rate(n: int, total: int) -> int \| float \| str:
	return (
	'all' if n == total else f'{n:_}' if n < 100 else round(n / total, 3)
	)


	def _ellipsis(v: str, num: int) -> str:
	return v if len(v) <= num else v[:num] + '…'
No results found