Created
August 9, 2023 17:32
-
-
Save MaxHalford/5d3da23bbf8bf76e9f23c62a4f2539a9 to your computer and use it in GitHub Desktop.
Metric decomposition
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "# Metric decomposition" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- https://docs.google.com/spreadsheets/d/1-hYJesNCMlCyANPOPeLijg1sCfv-6nREPtkHtTVEwd4/edit#gid=0\n", | |
| "- https://observablehq.com/@carbonfact/diff" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "https://chat.openai.com/share/6a5e1c60-b9a2-42bb-9b23-39e61358b577" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[0.09999999999999999, 0.068]" | |
| ] | |
| }, | |
| "execution_count": 49, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# denominators\n", | |
| "D = [\n", | |
| " [1000, 2000],\n", | |
| " [800, 1200]\n", | |
| "]\n", | |
| "# shares\n", | |
| "S = [\n", | |
| " [dj / sum(d) for j, dj in enumerate(d)]\n", | |
| " for i, d in enumerate(D)\n", | |
| "]\n", | |
| "# numerators\n", | |
| "N = [\n", | |
| " [150, 150],\n", | |
| " [88, 48]\n", | |
| "]\n", | |
| "# ratios\n", | |
| "R = [\n", | |
| " [ni / di for ni, di in zip(n, d)]\n", | |
| " for n, d in zip(N, D)\n", | |
| "]\n", | |
| "# shares x ratios\n", | |
| "SR = [\n", | |
| " [ri * si for ri, si in zip(r, s)]\n", | |
| " for r, s in zip(R, S)\n", | |
| "]\n", | |
| "# global means\n", | |
| "M = list(map(sum, SR))\n", | |
| "M" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 60, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "-0.03199999999999999" | |
| ] | |
| }, | |
| "execution_count": 60, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "inner = [\n", | |
| " S[1][0] * (R[1][0] - R[0][0]),\n", | |
| " S[1][1] * (R[1][1] - R[0][1]),\n", | |
| "]\n", | |
| "mix = [\n", | |
| " (S[1][0] - S[0][0]) * (R[0][0] - M[0]),\n", | |
| " (S[1][1] - S[0][1]) * (R[0][1] - M[0])\n", | |
| "]\n", | |
| "sum(inner) + sum(mix)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 67, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "[0.09999999999999999, 0.068]" | |
| ] | |
| }, | |
| "execution_count": 67, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "KPI = [\n", | |
| " sum(ri * si for ri, si in zip(r, s))\n", | |
| " for r, s in zip(R, S)\n", | |
| "]\n", | |
| "KPI" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 70, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.068" | |
| ] | |
| }, | |
| "execution_count": 70, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " KPI[0] +\n", | |
| " sum(S[1][i] * (R[1][i] - R[0][i]) for i in range(2)) +\n", | |
| " sum((S[1][i] - S[0][i]) * (R[0][i] - M[0]) for i in range(2))\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "True" | |
| ] | |
| }, | |
| "execution_count": 10, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# denominators\n", | |
| "D = [\n", | |
| " [1000, 2000],\n", | |
| " [800, 1200]\n", | |
| "]\n", | |
| "# shares\n", | |
| "S = [\n", | |
| " [dj / sum(d) for j, dj in enumerate(d)]\n", | |
| " for i, d in enumerate(D)\n", | |
| "]\n", | |
| "# numerators\n", | |
| "N = [\n", | |
| " [150, 150],\n", | |
| " [88, 48]\n", | |
| "]\n", | |
| "# ratios\n", | |
| "R = [\n", | |
| " [ni / di for ni, di in zip(n, d)]\n", | |
| " for n, d in zip(N, D)\n", | |
| "]\n", | |
| "# shares x ratios\n", | |
| "SR = [\n", | |
| " [ri * si for ri, si in zip(r, s)]\n", | |
| " for r, s in zip(R, S)\n", | |
| "]\n", | |
| "# global means\n", | |
| "M = list(map(sum, SR))\n", | |
| "# KPIs\n", | |
| "KPI = [\n", | |
| " sum(ri * si for ri, si in zip(r, s))\n", | |
| " for r, s in zip(R, S)\n", | |
| "]\n", | |
| "# decomposed KPI\n", | |
| "KPI[1] == (\n", | |
| " KPI[0] +\n", | |
| " sum(S[1][i] * (R[1][i] - R[0][i]) for i in range(2)) +\n", | |
| " sum((S[1][i] - S[0][i]) * (R[0][i] - KPI[0]) for i in range(2))\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.068" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "M[1]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.068" | |
| ] | |
| }, | |
| "execution_count": 3, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "KPI[1]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "---" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>person</th>\n", | |
| " <th>claim_type</th>\n", | |
| " <th>date</th>\n", | |
| " <th>year</th>\n", | |
| " <th>amount</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>John</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>2021-04-08</td>\n", | |
| " <td>2021</td>\n", | |
| " <td>129.66</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>Jane</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>2021-09-03</td>\n", | |
| " <td>2021</td>\n", | |
| " <td>127.07</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>Jane</td>\n", | |
| " <td>Physiotherapy</td>\n", | |
| " <td>2021-02-07</td>\n", | |
| " <td>2021</td>\n", | |
| " <td>125.27</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>Michael</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>2021-12-21</td>\n", | |
| " <td>2021</td>\n", | |
| " <td>122.45</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>Michael</td>\n", | |
| " <td>Physiotherapy</td>\n", | |
| " <td>2021-10-09</td>\n", | |
| " <td>2021</td>\n", | |
| " <td>132.82</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " person claim_type date year amount\n", | |
| "0 John Dentist 2021-04-08 2021 129.66\n", | |
| "1 Jane Dentist 2021-09-03 2021 127.07\n", | |
| "2 Jane Physiotherapy 2021-02-07 2021 125.27\n", | |
| "3 Michael Dentist 2021-12-21 2021 122.45\n", | |
| "4 Michael Physiotherapy 2021-10-09 2021 132.82" | |
| ] | |
| }, | |
| "execution_count": 11, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import random\n", | |
| "\n", | |
| "random.seed(42)\n", | |
| "\n", | |
| "# Function to generate a random cost based on the claim type and year\n", | |
| "def generate_claim_cost(claim_type, year):\n", | |
| " if claim_type == 'Dentist':\n", | |
| " base_cost = 100\n", | |
| " elif claim_type == 'Psychiatrist':\n", | |
| " base_cost = 150\n", | |
| " elif claim_type == 'General Physician':\n", | |
| " base_cost = 80\n", | |
| " elif claim_type == 'Physiotherapy':\n", | |
| " base_cost = 120\n", | |
| " else:\n", | |
| " base_cost = 50\n", | |
| "\n", | |
| " # Adjust cost based on year\n", | |
| " if year == 2021:\n", | |
| " base_cost *= 1.2\n", | |
| " elif year == 2023:\n", | |
| " base_cost *= 1.5\n", | |
| "\n", | |
| " # Add some random variation\n", | |
| " cost = random.uniform(base_cost - 20, base_cost + 20)\n", | |
| " return round(cost, 2)\n", | |
| "\n", | |
| "# Generating sample data\n", | |
| "claim_types = ['Dentist', 'Psychiatrist', 'General Physician', 'Physiotherapy']\n", | |
| "years = [2021, 2022, 2023]\n", | |
| "people = ['John', 'Jane', 'Michael', 'Emily', 'William', 'Emma', 'Daniel', 'Olivia', 'Lucas', 'Ava']\n", | |
| "\n", | |
| "data = []\n", | |
| "for year in years:\n", | |
| " for person in people:\n", | |
| " num_claims = random.randint(1, 5) # Random number of claims per person per year\n", | |
| " for _ in range(num_claims):\n", | |
| " claim_type = random.choice(claim_types)\n", | |
| " cost = generate_claim_cost(claim_type, year)\n", | |
| " date = pd.to_datetime(f\"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}\", format='%m/%d/%Y')\n", | |
| " data.append([person, claim_type, date, year, cost])\n", | |
| "\n", | |
| "# Create the DataFrame\n", | |
| "columns = ['person', 'claim_type', 'date', 'year', 'amount']\n", | |
| "claims_df = pd.DataFrame(data, columns=columns)\n", | |
| "\n", | |
| "# Display the DataFrame\n", | |
| "claims_df.head()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "80" | |
| ] | |
| }, | |
| "execution_count": 12, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "len(claims_df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| person | claim_type | date | year | amount |\n", | |
| "|:---------|:--------------|:--------------------|-------:|---------:|\n", | |
| "| John | Dentist | 2021-04-08 00:00:00 | 2021 | 129.66 |\n", | |
| "| Jane | Dentist | 2021-09-03 00:00:00 | 2021 | 127.07 |\n", | |
| "| Jane | Physiotherapy | 2021-02-07 00:00:00 | 2021 | 125.27 |\n", | |
| "| Michael | Dentist | 2021-12-21 00:00:00 | 2021 | 122.45 |\n", | |
| "| Michael | Physiotherapy | 2021-10-09 00:00:00 | 2021 | 132.82 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(claims_df.head().to_markdown(index=False))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Sums" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "- Total cost\n", | |
| "- Total footprint" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | sum | diff |\n", | |
| "|-------:|--------:|--------:|\n", | |
| "| 2021 | 3814.54 | nan |\n", | |
| "| 2022 | 2890.29 | -924.25 |\n", | |
| "| 2023 | 4178.03 | 1287.74 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "sums = claims_df.groupby('year')['amount'].sum()\n", | |
| "sums = pd.DataFrame({'sum': sums, 'diff': sums - sums.shift()})\n", | |
| "print(sums.to_markdown())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | claim_type | sum | diff |\n", | |
| "|-------:|:------------------|--------:|--------:|\n", | |
| "| 2021 | Dentist | 1104.42 | nan |\n", | |
| "| 2021 | General Physician | 594.44 | nan |\n", | |
| "| 2021 | Physiotherapy | 801.78 | nan |\n", | |
| "| 2021 | Psychiatrist | 1313.9 | nan |\n", | |
| "| 2022 | Dentist | 622.48 | -481.94 |\n", | |
| "| 2022 | General Physician | 749.08 | 154.64 |\n", | |
| "| 2022 | Physiotherapy | 339.45 | -462.33 |\n", | |
| "| 2022 | Psychiatrist | 1179.28 | -134.62 |\n", | |
| "| 2023 | Dentist | 1440.99 | 818.51 |\n", | |
| "| 2023 | General Physician | 826.18 | 77.1 |\n", | |
| "| 2023 | Physiotherapy | 1049.15 | 709.7 |\n", | |
| "| 2023 | Psychiatrist | 861.71 | -317.57 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(pd.DataFrame({\n", | |
| " 'sum': (s := claims_df.groupby(['year', 'claim_type'])['amount'].sum()),\n", | |
| " 'diff': (\n", | |
| " s - s.groupby('claim_type').shift()\n", | |
| " )\n", | |
| "}).reset_index().to_markdown(index=False))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>year</th>\n", | |
| " <th>claim_type</th>\n", | |
| " <th>mean</th>\n", | |
| " <th>count</th>\n", | |
| " <th>sum</th>\n", | |
| " <th>mean_lag</th>\n", | |
| " <th>count_lag</th>\n", | |
| " <th>inner</th>\n", | |
| " <th>mix</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2021</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>122.713333</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1104.42</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2021</td>\n", | |
| " <td>General Physician</td>\n", | |
| " <td>99.073333</td>\n", | |
| " <td>6</td>\n", | |
| " <td>594.44</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2021</td>\n", | |
| " <td>Physiotherapy</td>\n", | |
| " <td>133.630000</td>\n", | |
| " <td>6</td>\n", | |
| " <td>801.78</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2021</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>187.700000</td>\n", | |
| " <td>7</td>\n", | |
| " <td>1313.90</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2022</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>103.746667</td>\n", | |
| " <td>6</td>\n", | |
| " <td>622.48</td>\n", | |
| " <td>122.713333</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>-170.700000</td>\n", | |
| " <td>-311.240000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2022</td>\n", | |
| " <td>General Physician</td>\n", | |
| " <td>83.231111</td>\n", | |
| " <td>9</td>\n", | |
| " <td>749.08</td>\n", | |
| " <td>99.073333</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>-95.053333</td>\n", | |
| " <td>249.693333</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2022</td>\n", | |
| " <td>Physiotherapy</td>\n", | |
| " <td>113.150000</td>\n", | |
| " <td>3</td>\n", | |
| " <td>339.45</td>\n", | |
| " <td>133.630000</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>-122.880000</td>\n", | |
| " <td>-339.450000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>2022</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>147.410000</td>\n", | |
| " <td>8</td>\n", | |
| " <td>1179.28</td>\n", | |
| " <td>187.700000</td>\n", | |
| " <td>7.0</td>\n", | |
| " <td>-282.030000</td>\n", | |
| " <td>147.410000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>8</th>\n", | |
| " <td>2023</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>160.110000</td>\n", | |
| " <td>9</td>\n", | |
| " <td>1440.99</td>\n", | |
| " <td>103.746667</td>\n", | |
| " <td>6.0</td>\n", | |
| " <td>338.180000</td>\n", | |
| " <td>480.330000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>9</th>\n", | |
| " <td>2023</td>\n", | |
| " <td>General Physician</td>\n", | |
| " <td>118.025714</td>\n", | |
| " <td>7</td>\n", | |
| " <td>826.18</td>\n", | |
| " <td>83.231111</td>\n", | |
| " <td>9.0</td>\n", | |
| " <td>313.151429</td>\n", | |
| " <td>-236.051429</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>10</th>\n", | |
| " <td>2023</td>\n", | |
| " <td>Physiotherapy</td>\n", | |
| " <td>174.858333</td>\n", | |
| " <td>6</td>\n", | |
| " <td>1049.15</td>\n", | |
| " <td>113.150000</td>\n", | |
| " <td>3.0</td>\n", | |
| " <td>185.125000</td>\n", | |
| " <td>524.575000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>11</th>\n", | |
| " <td>2023</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>215.427500</td>\n", | |
| " <td>4</td>\n", | |
| " <td>861.71</td>\n", | |
| " <td>147.410000</td>\n", | |
| " <td>8.0</td>\n", | |
| " <td>544.140000</td>\n", | |
| " <td>-861.710000</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " year claim_type mean count sum mean_lag \\\n", | |
| "0 2021 Dentist 122.713333 9 1104.42 NaN \n", | |
| "1 2021 General Physician 99.073333 6 594.44 NaN \n", | |
| "2 2021 Physiotherapy 133.630000 6 801.78 NaN \n", | |
| "3 2021 Psychiatrist 187.700000 7 1313.90 NaN \n", | |
| "4 2022 Dentist 103.746667 6 622.48 122.713333 \n", | |
| "5 2022 General Physician 83.231111 9 749.08 99.073333 \n", | |
| "6 2022 Physiotherapy 113.150000 3 339.45 133.630000 \n", | |
| "7 2022 Psychiatrist 147.410000 8 1179.28 187.700000 \n", | |
| "8 2023 Dentist 160.110000 9 1440.99 103.746667 \n", | |
| "9 2023 General Physician 118.025714 7 826.18 83.231111 \n", | |
| "10 2023 Physiotherapy 174.858333 6 1049.15 113.150000 \n", | |
| "11 2023 Psychiatrist 215.427500 4 861.71 147.410000 \n", | |
| "\n", | |
| " count_lag inner mix \n", | |
| "0 NaN NaN NaN \n", | |
| "1 NaN NaN NaN \n", | |
| "2 NaN NaN NaN \n", | |
| "3 NaN NaN NaN \n", | |
| "4 9.0 -170.700000 -311.240000 \n", | |
| "5 6.0 -95.053333 249.693333 \n", | |
| "6 6.0 -122.880000 -339.450000 \n", | |
| "7 7.0 -282.030000 147.410000 \n", | |
| "8 6.0 338.180000 480.330000 \n", | |
| "9 9.0 313.151429 -236.051429 \n", | |
| "10 3.0 185.125000 524.575000 \n", | |
| "11 8.0 544.140000 -861.710000 " | |
| ] | |
| }, | |
| "execution_count": 16, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "metric = 'amount'\n", | |
| "period = 'year'\n", | |
| "dimension = 'claim_type'\n", | |
| "\n", | |
| "totals = (\n", | |
| " claims_df\n", | |
| " .groupby([period, dimension])\n", | |
| " [metric]\n", | |
| " .agg(['mean', 'count', 'sum'])\n", | |
| " .reset_index()\n", | |
| " .sort_values(period)\n", | |
| ")\n", | |
| "\n", | |
| "totals['mean_lag'] = totals.groupby(dimension)['mean'].shift(1)\n", | |
| "totals['count_lag'] = totals.groupby(dimension)['count'].shift(1)\n", | |
| "totals['inner'] = totals.eval('(mean - mean_lag) * count_lag')\n", | |
| "totals['mix'] = totals.eval('(count - count_lag) * mean')\n", | |
| "totals" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | 0 |\n", | |
| "|-------:|--------:|\n", | |
| "| 2021 | 0 |\n", | |
| "| 2022 | -924.25 |\n", | |
| "| 2023 | 1287.74 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(totals.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 18, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| person | claim_type | date | year | amount | status |\n", | |
| "|:---------|:-------------|:--------------------|-------:|---------:|:---------|\n", | |
| "| Jane | Dentist | 2023-03-26 00:00:00 | 2023 | 136.11 | NEW |\n", | |
| "| Megan | Dentist | 2023-06-01 00:00:00 | 2023 | 138.99 | EXISTING |\n", | |
| "| John | Psychiatrist | 2021-10-09 00:00:00 | 2021 | 168.82 | NEW |\n", | |
| "| Emily | Psychiatrist | 2024-10-16 00:00:00 | 2024 | 132.29 | EXISTING |\n", | |
| "| Michael | Dentist | 2023-10-15 00:00:00 | 2023 | 145.39 | NEW |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import collections\n", | |
| "import random\n", | |
| "import names # This library generates human names\n", | |
| "import pandas as pd\n", | |
| "\n", | |
| "random.seed(42)\n", | |
| "\n", | |
| "# Function to generate a random cost based on the claim type and year\n", | |
| "def generate_claim_cost(claim_type, year):\n", | |
| " if claim_type == 'Dentist':\n", | |
| " base_cost = 100\n", | |
| " elif claim_type == 'Psychiatrist':\n", | |
| " base_cost = 150\n", | |
| "\n", | |
| " # Adjust cost based on year\n", | |
| " if year == 2021:\n", | |
| " base_cost *= 1.2\n", | |
| " elif year == 2023:\n", | |
| " base_cost *= 1.5\n", | |
| "\n", | |
| " # Add some random variation\n", | |
| " cost = random.uniform(base_cost - 20, base_cost + 20)\n", | |
| " return round(cost, 2)\n", | |
| "\n", | |
| "# Generating sample data\n", | |
| "claim_types = ['Dentist', 'Psychiatrist']\n", | |
| "years = [2021, 2022, 2023, 2024]\n", | |
| "people = ['John', 'Jane', 'Michael', 'Emily', 'William']\n", | |
| "\n", | |
| "data = []\n", | |
| "for year in years:\n", | |
| " new_people = (\n", | |
| " [names.get_first_name() for _ in range(random.randint(1, 3))]\n", | |
| " if year > 2021\n", | |
| " else []\n", | |
| " )\n", | |
| " existing_people = [person for person in people if random.random() > 0.3]\n", | |
| " people_this_year = existing_people + new_people\n", | |
| " people.extend(new_people)\n", | |
| "\n", | |
| " for person in people_this_year:\n", | |
| " num_claims = random.randint(1, 5) # Random number of claims per existing customer per year\n", | |
| " for _ in range(num_claims):\n", | |
| " claim_type = random.choice(claim_types)\n", | |
| " cost = generate_claim_cost(claim_type, year)\n", | |
| " date = pd.to_datetime(f\"{random.randint(1, 12)}/{random.randint(1, 28)}/{year}\", format='%m/%d/%Y')\n", | |
| " data.append([person, claim_type, date, year, cost])\n", | |
| "\n", | |
| "# Create the DataFrame\n", | |
| "columns = ['person', 'claim_type', 'date', 'year', 'amount']\n", | |
| "claims_df = pd.DataFrame(data, columns=columns)\n", | |
| "\n", | |
| "# Indicate whether people are existing, new, or returning\n", | |
| "years_seen = collections.defaultdict(set)\n", | |
| "statuses = []\n", | |
| "for claim in claims_df.to_dict(orient='records'):\n", | |
| " years_seen[claim['person']].add(claim['year'])\n", | |
| " if claim['year'] - 1 in years_seen[claim['person']]:\n", | |
| " statuses.append('EXISTING')\n", | |
| " elif any(year < claim['year'] for year in years_seen[claim['person']]):\n", | |
| " statuses.append('RETURNING')\n", | |
| " elif not {year for year in years_seen[claim['person']] if year != claim['year']}:\n", | |
| " statuses.append('NEW')\n", | |
| "\n", | |
| "claims_df['status'] = statuses\n", | |
| "\n", | |
| "print(claims_df.sample(5).to_markdown(index=False))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 19, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | 0 |\n", | |
| "|-------:|--------:|\n", | |
| "| 2021 | 0 |\n", | |
| "| 2022 | -924.25 |\n", | |
| "| 2023 | 1287.74 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(totals.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | sum | diff |\n", | |
| "|-------:|--------:|---------:|\n", | |
| "| 2021 | 1312.28 | nan |\n", | |
| "| 2022 | 676.06 | -636.22 |\n", | |
| "| 2023 | 5191.31 | 4515.25 |\n", | |
| "| 2024 | 1966.73 | -3224.58 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "sums = claims_df.groupby('year')['amount'].sum()\n", | |
| "sums = pd.DataFrame({'sum': sums, 'diff': sums - sums.shift()})\n", | |
| "print(sums.to_markdown())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "## Ratios\n", | |
| "\n", | |
| "- Cost by claim\n", | |
| "- Cost by user\n", | |
| "- Footprint by product\n", | |
| "- Footprint by gram" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 24, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | average | diff |\n", | |
| "|-------:|----------:|---------:|\n", | |
| "| 2021 | 145.809 | nan |\n", | |
| "| 2022 | 112.677 | -33.1322 |\n", | |
| "| 2023 | 173.044 | 60.367 |\n", | |
| "| 2024 | 122.921 | -50.123 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "averages = claims_df.groupby('year')['amount'].mean()\n", | |
| "averages = pd.DataFrame({'average': averages, 'diff': averages - averages.shift()})\n", | |
| "print(averages.to_markdown())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 23, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | claim_type | sum | count |\n", | |
| "|-------:|:-------------|--------:|--------:|\n", | |
| "| 2021 | Dentist | 614.36 | 5 |\n", | |
| "| 2021 | Psychiatrist | 697.92 | 4 |\n", | |
| "| 2022 | Dentist | 393.5 | 4 |\n", | |
| "| 2022 | Psychiatrist | 282.56 | 2 |\n", | |
| "| 2023 | Dentist | 2967.3 | 20 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "metric = 'amount'\n", | |
| "period = 'year'\n", | |
| "dimension = 'claim_type'\n", | |
| "\n", | |
| "decomp = (\n", | |
| " claims_df\n", | |
| " .groupby([period, dimension], dropna=True)\n", | |
| " [metric].agg(['sum', 'count'])\n", | |
| " .reset_index()\n", | |
| " .sort_values(period)\n", | |
| ")\n", | |
| "print(decomp.head().to_markdown(index=False))" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>year</th>\n", | |
| " <th>claim_type</th>\n", | |
| " <th>sum</th>\n", | |
| " <th>count</th>\n", | |
| " <th>mean</th>\n", | |
| " <th>share</th>\n", | |
| " <th>global_mean</th>\n", | |
| " <th>mean_lag</th>\n", | |
| " <th>share_lag</th>\n", | |
| " <th>global_mean_lag</th>\n", | |
| " <th>inner</th>\n", | |
| " <th>mix</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>2021</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>614.36</td>\n", | |
| " <td>5</td>\n", | |
| " <td>122.87200</td>\n", | |
| " <td>0.555556</td>\n", | |
| " <td>145.808889</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>2021</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>697.92</td>\n", | |
| " <td>4</td>\n", | |
| " <td>174.48000</td>\n", | |
| " <td>0.444444</td>\n", | |
| " <td>145.808889</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " <td>NaN</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>2022</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>393.50</td>\n", | |
| " <td>4</td>\n", | |
| " <td>98.37500</td>\n", | |
| " <td>0.666667</td>\n", | |
| " <td>112.676667</td>\n", | |
| " <td>122.872</td>\n", | |
| " <td>0.555556</td>\n", | |
| " <td>145.808889</td>\n", | |
| " <td>-16.331333</td>\n", | |
| " <td>-2.548543</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>2022</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>282.56</td>\n", | |
| " <td>2</td>\n", | |
| " <td>141.28000</td>\n", | |
| " <td>0.333333</td>\n", | |
| " <td>112.676667</td>\n", | |
| " <td>174.480</td>\n", | |
| " <td>0.444444</td>\n", | |
| " <td>145.808889</td>\n", | |
| " <td>-11.066667</td>\n", | |
| " <td>-3.185679</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>2023</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>2967.30</td>\n", | |
| " <td>20</td>\n", | |
| " <td>148.36500</td>\n", | |
| " <td>0.666667</td>\n", | |
| " <td>173.043667</td>\n", | |
| " <td>98.375</td>\n", | |
| " <td>0.666667</td>\n", | |
| " <td>112.676667</td>\n", | |
| " <td>33.326667</td>\n", | |
| " <td>-0.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>5</th>\n", | |
| " <td>2023</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>2224.01</td>\n", | |
| " <td>10</td>\n", | |
| " <td>222.40100</td>\n", | |
| " <td>0.333333</td>\n", | |
| " <td>173.043667</td>\n", | |
| " <td>141.280</td>\n", | |
| " <td>0.333333</td>\n", | |
| " <td>112.676667</td>\n", | |
| " <td>27.040333</td>\n", | |
| " <td>0.000000</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>6</th>\n", | |
| " <td>2024</td>\n", | |
| " <td>Dentist</td>\n", | |
| " <td>781.46</td>\n", | |
| " <td>8</td>\n", | |
| " <td>97.68250</td>\n", | |
| " <td>0.500000</td>\n", | |
| " <td>122.920625</td>\n", | |
| " <td>148.365</td>\n", | |
| " <td>0.666667</td>\n", | |
| " <td>173.043667</td>\n", | |
| " <td>-25.341250</td>\n", | |
| " <td>4.113111</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7</th>\n", | |
| " <td>2024</td>\n", | |
| " <td>Psychiatrist</td>\n", | |
| " <td>1185.27</td>\n", | |
| " <td>8</td>\n", | |
| " <td>148.15875</td>\n", | |
| " <td>0.500000</td>\n", | |
| " <td>122.920625</td>\n", | |
| " <td>222.401</td>\n", | |
| " <td>0.333333</td>\n", | |
| " <td>173.043667</td>\n", | |
| " <td>-37.121125</td>\n", | |
| " <td>8.226222</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " year claim_type sum count mean share global_mean \\\n", | |
| "0 2021 Dentist 614.36 5 122.87200 0.555556 145.808889 \n", | |
| "1 2021 Psychiatrist 697.92 4 174.48000 0.444444 145.808889 \n", | |
| "2 2022 Dentist 393.50 4 98.37500 0.666667 112.676667 \n", | |
| "3 2022 Psychiatrist 282.56 2 141.28000 0.333333 112.676667 \n", | |
| "4 2023 Dentist 2967.30 20 148.36500 0.666667 173.043667 \n", | |
| "5 2023 Psychiatrist 2224.01 10 222.40100 0.333333 173.043667 \n", | |
| "6 2024 Dentist 781.46 8 97.68250 0.500000 122.920625 \n", | |
| "7 2024 Psychiatrist 1185.27 8 148.15875 0.500000 122.920625 \n", | |
| "\n", | |
| " mean_lag share_lag global_mean_lag inner mix \n", | |
| "0 NaN NaN NaN NaN NaN \n", | |
| "1 NaN NaN NaN NaN NaN \n", | |
| "2 122.872 0.555556 145.808889 -16.331333 -2.548543 \n", | |
| "3 174.480 0.444444 145.808889 -11.066667 -3.185679 \n", | |
| "4 98.375 0.666667 112.676667 33.326667 -0.000000 \n", | |
| "5 141.280 0.333333 112.676667 27.040333 0.000000 \n", | |
| "6 148.365 0.666667 173.043667 -25.341250 4.113111 \n", | |
| "7 222.401 0.333333 173.043667 -37.121125 8.226222 " | |
| ] | |
| }, | |
| "execution_count": 28, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "decomp['mean'] = decomp.eval('sum / count')\n", | |
| "decomp['share'] = decomp['count'] / decomp.groupby('year')['count'].transform('sum')\n", | |
| "decomp['global_mean'] = (\n", | |
| " decomp.groupby('year')['sum'].transform('sum') /\n", | |
| " decomp.groupby('year')['count'].transform('sum')\n", | |
| ")\n", | |
| "decomp['mean_lag'] = decomp.groupby(dimension)['mean'].shift(1)\n", | |
| "decomp['share_lag'] = decomp.groupby(dimension)['share'].shift(1)\n", | |
| "decomp['global_mean_lag'] = decomp.groupby(dimension)['global_mean'].shift(1)\n", | |
| "decomp['inner'] = decomp.eval('share * (mean - mean_lag)')\n", | |
| "decomp['mix'] = decomp.eval('(share - share_lag) * (mean_lag - global_mean_lag)')\n", | |
| "decomp" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 26, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "| year | 0 |\n", | |
| "|-------:|---------:|\n", | |
| "| 2021 | 0 |\n", | |
| "| 2022 | -33.1322 |\n", | |
| "| 2023 | 60.367 |\n", | |
| "| 2024 | -50.123 |\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "print(decomp.groupby('year').apply(lambda x: (x.inner + x.mix).sum()).to_markdown())" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": {}, | |
| "source": [ | |
| "### DuckDB" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 94, | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "CatalogException", | |
| "evalue": "Catalog Error: Table with name \"claims\" already exists!", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[0;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[0;31mCatalogException\u001b[0m Traceback (most recent call last)", | |
| "\u001b[1;32m/Users/max/projects/maxhalford.github.io/decomp.ipynb Cell 15\u001b[0m in \u001b[0;36m3\n\u001b[1;32m <a href='vscode-notebook-cell:/Users/max/projects/maxhalford.github.io/decomp.ipynb#X40sZmlsZQ%3D%3D?line=0'>1</a>\u001b[0m \u001b[39mimport\u001b[39;00m \u001b[39mduckdb\u001b[39;00m\n\u001b[0;32m----> <a href='vscode-notebook-cell:/Users/max/projects/maxhalford.github.io/decomp.ipynb#X40sZmlsZQ%3D%3D?line=2'>3</a>\u001b[0m duckdb\u001b[39m.\u001b[39;49msql(\u001b[39m\"\u001b[39;49m\u001b[39mCREATE TABLE claims AS SELECT * FROM claims_df\u001b[39;49m\u001b[39m\"\u001b[39;49m)\n", | |
| "\u001b[0;31mCatalogException\u001b[0m: Catalog Error: Table with name \"claims\" already exists!" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "import duckdb\n", | |
| "\n", | |
| "duckdb.sql(\"CREATE TABLE claims AS SELECT * FROM claims_df\")" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.11.0" | |
| }, | |
| "orig_nbformat": 4 | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 2 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment