Created
November 30, 2025 13:46
-
-
Save UmarZein/ea7aa093820991fb881f1d5ceefa8321 to your computer and use it in GitHub Desktop.
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 40, | |
| "id": "1593637f-ddcc-4c19-9aa9-fe17f0ff1ebd", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "import seaborn as sns\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import hashlib\n", | |
| "import uuid\n", | |
| "import math\n", | |
| "import random\n", | |
| "from diffprivlib.models import LogisticRegression, GaussianNB, DecisionTreeClassifier\n", | |
| "from sklearn.preprocessing import LabelEncoder" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 41, | |
| "id": "ad3bc31c-19be-40f7-9015-c3ea6c94daa4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
def auto_preprocess_df(df, null_threshold=0.5):
    """
    Return a cleaned, fully numeric copy of ``df``; the input frame is untouched.

    Each column is handled by cardinality:

    * Fewer than 65 distinct values: the column is cast to ``str`` and
      label-encoded. Missing values are stringified as well, so they end up
      with a code of their own.
    * Otherwise: the column is coerced with ``pd.to_numeric`` (entries that
      cannot be parsed become NaN). When the resulting NaN share is above
      ``null_threshold`` the column is dropped; if not, its NaNs are replaced
      by the column median.

    After that, every row that still holds a NaN is removed. Dropped columns
    and dropped rows are reported with ``print``.
    """
    encoder = LabelEncoder()
    cleaned = df.copy()
    mostly_null = []

    for name in cleaned.columns:
        if cleaned[name].nunique() >= 65:
            # High cardinality: treat as numeric; free text / IDs turn into NaN.
            cleaned[name] = pd.to_numeric(cleaned[name], errors='coerce')

            if cleaned[name].isnull().mean() > null_threshold:
                # Mostly unparseable (e.g. an ID column): schedule for removal.
                mostly_null.append(name)
                continue

            cleaned[name] = cleaned[name].fillna(cleaned[name].median())
        else:
            # Low cardinality: categorical. Going through str lets mixed types
            # and NaN be encoded like any other category.
            cleaned[name] = encoder.fit_transform(cleaned[name].astype(str))

    if mostly_null:
        print(f"Dropping columns > {null_threshold:.0%} Null: {mostly_null}")
        cleaned = cleaned.drop(columns=mostly_null)

    # Last pass: discard rows that still contain a NaN.
    rows_before = len(cleaned)
    cleaned = cleaned.dropna(axis=0)
    rows_removed = rows_before - len(cleaned)

    if rows_removed > 0:
        print(f"Dropped {rows_removed} rows containing Nulls.")

    return cleaned
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 3, | |
| "id": "26c40daf-66e2-46ea-bdcc-da60af6f301b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "\n", | |
def calculate_k_anonymity(df, quasi_identifiers):
    """
    Measure k-anonymity of ``df`` with respect to ``quasi_identifiers``.

    Rows are grouped by the quasi-identifier columns; k is the size of the
    smallest group.

    Returns
    -------
    (k, group_sizes)
        ``group_sizes`` is the per-group row count Series. ``k`` is its
        minimum, or ``0`` when there are no groups at all.
    """
    # dropna=False: rows with NaN in a quasi-identifier still form their own group.
    class_sizes = df.groupby(quasi_identifiers, dropna=False).size()
    smallest = 0 if class_sizes.empty else class_sizes.min()
    return smallest, class_sizes
| "\n", | |
def calculate_l_diversity(df, quasi_identifiers, sensitive_col):
    """
    Measure distinct l-diversity of ``sensitive_col`` within each
    quasi-identifier group.

    Returns
    -------
    (l, diversity_counts)
        ``diversity_counts`` holds, per group, the number of distinct
        ``sensitive_col`` values as reported by ``nunique()``. ``l`` is the
        minimum of those counts, or ``0`` when there are no groups.
    """
    # dropna=False: NaN quasi-identifier values are kept as a group key.
    distinct_per_class = (
        df.groupby(quasi_identifiers, dropna=False)[sensitive_col].nunique()
    )

    if not distinct_per_class.empty:
        return distinct_per_class.min(), distinct_per_class
    return 0, distinct_per_class
| "\n", | |
def calculate_entropy_l_diversity(df, quasi_identifiers, sensitive_col):
    """
    Compute the Shannon entropy (natural log) of ``sensitive_col`` inside each
    quasi-identifier group.

    Returns
    -------
    (min_entropy, results_df)
        ``results_df`` has one row per group with columns ``group_id``,
        ``size`` and ``entropy``. ``min_entropy`` is the smallest entropy over
        all groups, or ``0.0`` when there are no groups.
    """
    def _entropy(values):
        # Relative frequencies of the observed values; all are > 0, so log is safe.
        shares = values.value_counts(normalize=True)
        return -np.sum(shares * np.log(shares))

    records = [
        {'group_id': key, 'size': len(members), 'entropy': _entropy(members[sensitive_col])}
        for key, members in df.groupby(quasi_identifiers, dropna=False)
    ]

    table = pd.DataFrame(records)
    if table.empty:
        return 0.0, table

    return table['entropy'].min(), table
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 42, | |
| "id": "7a8c66ac-3047-4edc-9b82-83d566b139cc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df=pd.read_csv(\"WA_Fn-UseC_-Telco-Customer-Churn.csv\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 43, | |
| "id": "ff90986c-ec4e-4f82-9651-d40d3c42688b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>customerID</th>\n", | |
| " <th>gender</th>\n", | |
| " <th>SeniorCitizen</th>\n", | |
| " <th>Partner</th>\n", | |
| " <th>Dependents</th>\n", | |
| " <th>tenure</th>\n", | |
| " <th>PhoneService</th>\n", | |
| " <th>MultipleLines</th>\n", | |
| " <th>InternetService</th>\n", | |
| " <th>OnlineSecurity</th>\n", | |
| " <th>...</th>\n", | |
| " <th>DeviceProtection</th>\n", | |
| " <th>TechSupport</th>\n", | |
| " <th>StreamingTV</th>\n", | |
| " <th>StreamingMovies</th>\n", | |
| " <th>Contract</th>\n", | |
| " <th>PaperlessBilling</th>\n", | |
| " <th>PaymentMethod</th>\n", | |
| " <th>MonthlyCharges</th>\n", | |
| " <th>TotalCharges</th>\n", | |
| " <th>Churn</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>7590-VHVEG</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>1</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No phone service</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>No</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Electronic check</td>\n", | |
| " <td>29.85</td>\n", | |
| " <td>29.85</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>5575-GNVDE</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>34</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>One year</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Mailed check</td>\n", | |
| " <td>56.95</td>\n", | |
| " <td>1889.5</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3668-QPYBK</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Mailed check</td>\n", | |
| " <td>53.85</td>\n", | |
| " <td>108.15</td>\n", | |
| " <td>Yes</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>7795-CFOCW</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>45</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No phone service</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>One year</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Bank transfer (automatic)</td>\n", | |
| " <td>42.30</td>\n", | |
| " <td>1840.75</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>9237-HQITU</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Fiber optic</td>\n", | |
| " <td>No</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Electronic check</td>\n", | |
| " <td>70.70</td>\n", | |
| " <td>151.65</td>\n", | |
| " <td>Yes</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7038</th>\n", | |
| " <td>6840-RESVB</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>24</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>One year</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Mailed check</td>\n", | |
| " <td>84.80</td>\n", | |
| " <td>1990.5</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7039</th>\n", | |
| " <td>2234-XADUH</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>72</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Fiber optic</td>\n", | |
| " <td>No</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>One year</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Credit card (automatic)</td>\n", | |
| " <td>103.20</td>\n", | |
| " <td>7362.9</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7040</th>\n", | |
| " <td>4801-JZAZL</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>11</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No phone service</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Electronic check</td>\n", | |
| " <td>29.60</td>\n", | |
| " <td>346.45</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7041</th>\n", | |
| " <td>8361-LTMKD</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>1</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>4</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Fiber optic</td>\n", | |
| " <td>No</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Mailed check</td>\n", | |
| " <td>74.40</td>\n", | |
| " <td>306.6</td>\n", | |
| " <td>Yes</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7042</th>\n", | |
| " <td>3186-AJIEK</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>66</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Fiber optic</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Two year</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Bank transfer (automatic)</td>\n", | |
| " <td>105.65</td>\n", | |
| " <td>6844.5</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>7043 rows × 21 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " customerID gender SeniorCitizen Partner Dependents tenure \\\n", | |
| "0 7590-VHVEG Female 0 Yes No 1 \n", | |
| "1 5575-GNVDE Male 0 No No 34 \n", | |
| "2 3668-QPYBK Male 0 No No 2 \n", | |
| "3 7795-CFOCW Male 0 No No 45 \n", | |
| "4 9237-HQITU Female 0 No No 2 \n", | |
| "... ... ... ... ... ... ... \n", | |
| "7038 6840-RESVB Male 0 Yes Yes 24 \n", | |
| "7039 2234-XADUH Female 0 Yes Yes 72 \n", | |
| "7040 4801-JZAZL Female 0 Yes Yes 11 \n", | |
| "7041 8361-LTMKD Male 1 Yes No 4 \n", | |
| "7042 3186-AJIEK Male 0 No No 66 \n", | |
| "\n", | |
| " PhoneService MultipleLines InternetService OnlineSecurity ... \\\n", | |
| "0 No No phone service DSL No ... \n", | |
| "1 Yes No DSL Yes ... \n", | |
| "2 Yes No DSL Yes ... \n", | |
| "3 No No phone service DSL Yes ... \n", | |
| "4 Yes No Fiber optic No ... \n", | |
| "... ... ... ... ... ... \n", | |
| "7038 Yes Yes DSL Yes ... \n", | |
| "7039 Yes Yes Fiber optic No ... \n", | |
| "7040 No No phone service DSL Yes ... \n", | |
| "7041 Yes Yes Fiber optic No ... \n", | |
| "7042 Yes No Fiber optic Yes ... \n", | |
| "\n", | |
| " DeviceProtection TechSupport StreamingTV StreamingMovies Contract \\\n", | |
| "0 No No No No Month-to-month \n", | |
| "1 Yes No No No One year \n", | |
| "2 No No No No Month-to-month \n", | |
| "3 Yes Yes No No One year \n", | |
| "4 No No No No Month-to-month \n", | |
| "... ... ... ... ... ... \n", | |
| "7038 Yes Yes Yes Yes One year \n", | |
| "7039 Yes No Yes Yes One year \n", | |
| "7040 No No No No Month-to-month \n", | |
| "7041 No No No No Month-to-month \n", | |
| "7042 Yes Yes Yes Yes Two year \n", | |
| "\n", | |
| " PaperlessBilling PaymentMethod MonthlyCharges TotalCharges \\\n", | |
| "0 Yes Electronic check 29.85 29.85 \n", | |
| "1 No Mailed check 56.95 1889.5 \n", | |
| "2 Yes Mailed check 53.85 108.15 \n", | |
| "3 No Bank transfer (automatic) 42.30 1840.75 \n", | |
| "4 Yes Electronic check 70.70 151.65 \n", | |
| "... ... ... ... ... \n", | |
| "7038 Yes Mailed check 84.80 1990.5 \n", | |
| "7039 Yes Credit card (automatic) 103.20 7362.9 \n", | |
| "7040 Yes Electronic check 29.60 346.45 \n", | |
| "7041 Yes Mailed check 74.40 306.6 \n", | |
| "7042 Yes Bank transfer (automatic) 105.65 6844.5 \n", | |
| "\n", | |
| " Churn \n", | |
| "0 No \n", | |
| "1 No \n", | |
| "2 Yes \n", | |
| "3 No \n", | |
| "4 Yes \n", | |
| "... ... \n", | |
| "7038 No \n", | |
| "7039 No \n", | |
| "7040 No \n", | |
| "7041 Yes \n", | |
| "7042 No \n", | |
| "\n", | |
| "[7043 rows x 21 columns]" | |
| ] | |
| }, | |
| "execution_count": 43, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "id": "c60ec13e-1207-4e18-81e0-8ecb8d50e4a1", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Dropping columns > 50% Null: ['customerID']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "auto_df=auto_preprocess_df(df)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 44, | |
| "id": "aec341c4-47d1-4363-8b47-e17c82615835", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Quasi-identifiers: this kind of data is often publicly available (social
# media, voter lists). Combined, these columns can narrow down the search for
# a specific person's profile.
quasi_identifiers = {'gender', 'SeniorCitizen', 'Partner', 'Dependents'}

sensitive_features = {'MonthlyCharges', 'TotalCharges', 'Churn', 'PaymentMethod'}
direct_identifiers = {'customerID'}

# Every column that falls in none of the three groups above is a utility feature.
utility = set(df.columns).difference(
    quasi_identifiers, sensitive_features, direct_identifiers
)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "88285682-e773-4dfc-acab-e892df77a87e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Column roles used by the privacy metrics and the pseudonymisation steps.
# NOTE(review): these three sets repeat the definitions from the preceding
# cell; this cell looks redundant and could probably be removed.
quasi_identifiers = {'gender', 'SeniorCitizen', 'Partner', 'Dependents'}
sensitive_features = {'MonthlyCharges', 'TotalCharges', 'Churn', 'PaymentMethod'}
direct_identifiers = {'customerID'}
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "b5b96ad3-2489-42dc-8997-740dc952555a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "3" | |
| ] | |
| }, | |
| "execution_count": 7, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "k_actual,_=calculate_k_anonymity(df, list(quasi_identifiers))\n", | |
| "k_actual" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "a77f674c-ed7f-46b2-97a8-4524e204edee", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "sensitive column l-entropy:\n", | |
| "TotalCharges........ 2.9999999999999996\n", | |
| "Churn............... 1.4653036909521657\n", | |
| "MonthlyCharges...... 2.9999999999999996\n", | |
| "PaymentMethod....... 1.8898815748423097\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# One line per sensitive column. The printed number is exp(minimum group
# entropy), i.e. the entropy converted back to an "effective number of values".
print(f"{'sensitive column':<20}", 'l-entropy:')
for col in sensitive_features:
    l, _ = calculate_entropy_l_diversity(df, list(quasi_identifiers), col)
    print(f"{col:.<20}", math.exp(l))
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ea784704-9f97-465f-adc4-fcdacae8bab6", | |
| "metadata": {}, | |
| "source": [ | |
| "K-anonymity: lebih tinggi lebih aman\n", | |
| "\n", | |
| "L-diversity: lebih tinggi lebih aman" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "ff3022ec-7257-4f37-bc87-f3f6537282ae", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
def sha256(x, key=None):
    """
    Return the hexadecimal SHA-256 digest of ``x``.

    Parameters
    ----------
    x : str, bytes or any other value
        ``str`` is UTF-8 encoded, ``bytes`` is hashed as-is, and any other
        value (e.g. an integer ID) is hashed through ``str(x)`` instead of
        failing on a missing ``.encode``.
    key : str or bytes, optional
        Secret key. When given, the result is HMAC-SHA256 of ``x`` under that
        key. An unkeyed digest of a short, guessable identifier can be
        reversed by hashing every candidate ID, so pass a secret ``key`` when
        the output is meant to pseudonymise identifiers. With the default
        ``None`` the plain SHA-256 digest is returned.

    Returns
    -------
    str
        64-character lowercase hex digest.
    """
    import hmac  # local import: only the keyed variant needs it

    data = x if isinstance(x, bytes) else str(x).encode()
    if key is None:
        return hashlib.sha256(data).hexdigest()

    secret = key if isinstance(key, bytes) else str(key).encode()
    return hmac.new(secret, data, hashlib.sha256).hexdigest()
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "c62db466-cb39-478b-9433-da73f6d1e241", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_secure=pd.DataFrame()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "112d5304-a6b3-49e0-8f2b-0be7ddb898b2", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "for col in direct_identifiers:\n", | |
| " df_secure[col]=df[col].apply(sha256)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 38, | |
| "id": "8a7757ee-81b6-46eb-908f-76fbc52cac6e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "branch3 StreamingTV\n", | |
| "branch3 TechSupport\n", | |
| "branch3 PaperlessBilling\n", | |
| "branch3 InternetService\n", | |
| "branch3 PhoneService\n", | |
| "branch3 PaymentMethod\n", | |
| "branch3 MultipleLines\n", | |
| "branch3 OnlineSecurity\n", | |
| "branch3 OnlineBackup\n", | |
| "branch3 Contract\n", | |
| "branch3 DeviceProtection\n", | |
| "branch1 TotalCharges\n", | |
| " branch2 TotalCharges\n", | |
| "branch3 StreamingMovies\n", | |
| "branch1 tenure\n", | |
| "branch3 Churn\n", | |
| "branch1 MonthlyCharges\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# Pseudonymise every sensitive and utility column into opaque letter codes:
#   * more than 32 distinct values -> numeric; binned into quartiles whose
#     letters are shuffled so the label does not reveal the bin's rank;
#   * otherwise -> categorical; each distinct value gets a letter in order of
#     first appearance.
rng = np.random.default_rng(42)  # fixed seed: a fresh-kernel re-run yields the same labels
distinct_counts = df.nunique()   # computed once instead of once per column

# sorted(): iterating the raw set would give a run-dependent column order.
for col in sorted(sensitive_features | utility):
    if distinct_counts[col] > 32:
        values = df[col]
        if not pd.api.types.is_numeric_dtype(values):
            # Numbers stored as text: unparseable entries (blanks) become 0.
            values = pd.to_numeric(values, errors='coerce').fillna(0)

        # pd.cut rejects repeated bin edges, which happens when several
        # quantiles coincide; de-duplicate and size the labels to match.
        edges = np.unique(values.quantile([0., .25, .5, .75, 1.]).to_numpy())
        labels = [chr(ord('A') + int(i)) for i in rng.permutation(len(edges) - 1)]
        df_secure[col] = pd.cut(
            values, bins=edges, labels=labels, include_lowest=True
        ).astype(str)
    else:
        # Note: with more than 26 categories the codes run past 'Z' into
        # punctuation characters.
        mapping = {value: chr(ord('A') + i) for i, value in enumerate(df[col].unique())}
        df_secure[col] = df[col].map(mapping)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "089a627d-125a-4790-9a78-5215e55f37e0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Replace each remaining quasi-identifier's values with letter codes, assigned
# in order of first appearance.
for col in quasi_identifiers:
    # Partner and Dependents are skipped here: they get generalised into
    # IsIndependent instead.
    if col not in ('Partner', 'Dependents'):
        uniques = df[col].unique()
        nuniques = len(uniques)
        mapping = dict(zip(uniques, (chr(ord('A') + i) for i in range(nuniques))))
        df_secure[col] = df[col].map(mapping)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "9bec6f03-f280-4332-aed8-96b065fa6f4a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "df_secure['IsIndependent']=((df['Partner']=='No')&(df['Dependents']=='No')).map({True:'B',False:'A'})" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ab0cf4a5-f75f-4569-b6ee-7a88fbf4db5d", | |
| "metadata": {}, | |
| "source": [ | |
| "^ Generalisasi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "id": "a3e26955-4880-4698-ab41-f48e4a75d559", | |
| "metadata": {}, | |
| "source": [ | |
| "tmp=list(utility)\n", | |
| "df_secure[tmp]=auto_df[tmp]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "05483900-0990-482f-a275-027b10ca863e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Column roles after generalisation: IsIndependent stands in for
# Partner / Dependents among the quasi-identifiers.
new_quasi_identifiers = {'gender', 'SeniorCitizen', 'IsIndependent'}

new_sensitive_features = {'MonthlyCharges', 'TotalCharges', 'Churn', 'PaymentMethod'}
new_direct_identifiers = {'customerID'}

# Whatever is left in df_secure counts as a utility feature.
new_utility = set(df_secure.columns).difference(
    new_quasi_identifiers, new_sensitive_features, new_direct_identifiers
)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "id": "e6c1caa8-035f-460d-9ede-e1c577e9cdfa", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>customerID</th>\n", | |
| " <th>StreamingTV</th>\n", | |
| " <th>TechSupport</th>\n", | |
| " <th>PaperlessBilling</th>\n", | |
| " <th>InternetService</th>\n", | |
| " <th>PhoneService</th>\n", | |
| " <th>PaymentMethod</th>\n", | |
| " <th>MultipleLines</th>\n", | |
| " <th>OnlineSecurity</th>\n", | |
| " <th>OnlineBackup</th>\n", | |
| " <th>Contract</th>\n", | |
| " <th>DeviceProtection</th>\n", | |
| " <th>TotalCharges</th>\n", | |
| " <th>StreamingMovies</th>\n", | |
| " <th>tenure</th>\n", | |
| " <th>Churn</th>\n", | |
| " <th>MonthlyCharges</th>\n", | |
| " <th>SeniorCitizen</th>\n", | |
| " <th>gender</th>\n", | |
| " <th>IsIndependent</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>f3f9002e121cbbcc038f195a656a32a17395c6ce1815e8...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>5f362546eb7515f442a40d3d9bf632e4a481c92bcb0529...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>701bf78bdf332d16ec5fb80a1affac3a080a8b9c258c2d...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>59a0e1b332a6b6ba8f6a72f74085e6498e8c54a5d18191...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>66370db9e1bb6a851fa692a2499f4f1356efab8e5abf90...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7038</th>\n", | |
| " <td>d947cded0fac0a00ebc313bbe4c4b64e23283d0cf7b536...</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7039</th>\n", | |
| " <td>cb985e07827c720145c2de03810a8ec565fe1121c0b6bd...</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7040</th>\n", | |
| " <td>32b725664afc96d8366d2d44ffb7a801132d3f84d68c19...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7041</th>\n", | |
| " <td>b8463bcb56ddf1b47ee63bafdf65079f88d260b99ccd89...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7042</th>\n", | |
| " <td>6bd283e5eb3d43ae083283e36d747965afa39b80f70ab5...</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>7043 rows × 20 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " customerID StreamingTV \\\n", | |
| "0 f3f9002e121cbbcc038f195a656a32a17395c6ce1815e8... A \n", | |
| "1 5f362546eb7515f442a40d3d9bf632e4a481c92bcb0529... A \n", | |
| "2 701bf78bdf332d16ec5fb80a1affac3a080a8b9c258c2d... A \n", | |
| "3 59a0e1b332a6b6ba8f6a72f74085e6498e8c54a5d18191... A \n", | |
| "4 66370db9e1bb6a851fa692a2499f4f1356efab8e5abf90... A \n", | |
| "... ... ... \n", | |
| "7038 d947cded0fac0a00ebc313bbe4c4b64e23283d0cf7b536... B \n", | |
| "7039 cb985e07827c720145c2de03810a8ec565fe1121c0b6bd... B \n", | |
| "7040 32b725664afc96d8366d2d44ffb7a801132d3f84d68c19... A \n", | |
| "7041 b8463bcb56ddf1b47ee63bafdf65079f88d260b99ccd89... A \n", | |
| "7042 6bd283e5eb3d43ae083283e36d747965afa39b80f70ab5... B \n", | |
| "\n", | |
| " TechSupport PaperlessBilling InternetService PhoneService PaymentMethod \\\n", | |
| "0 A A A A A \n", | |
| "1 A B A B B \n", | |
| "2 A A A B B \n", | |
| "3 B B A A C \n", | |
| "4 A A B B A \n", | |
| "... ... ... ... ... ... \n", | |
| "7038 B A A B B \n", | |
| "7039 A A B B D \n", | |
| "7040 A A A A A \n", | |
| "7041 A A B B B \n", | |
| "7042 B A B B C \n", | |
| "\n", | |
| " MultipleLines OnlineSecurity OnlineBackup Contract DeviceProtection \\\n", | |
| "0 A A A A A \n", | |
| "1 B B B B B \n", | |
| "2 B B A A A \n", | |
| "3 A B B B B \n", | |
| "4 B A B A A \n", | |
| "... ... ... ... ... ... \n", | |
| "7038 C B B B B \n", | |
| "7039 C A A B B \n", | |
| "7040 A B B A A \n", | |
| "7041 C A B A A \n", | |
| "7042 B B B C B \n", | |
| "\n", | |
| " TotalCharges StreamingMovies tenure Churn MonthlyCharges SeniorCitizen \\\n", | |
| "0 B A B A D A \n", | |
| "1 C A A A A A \n", | |
| "2 B A B B A A \n", | |
| "3 C A A A A A \n", | |
| "4 B A B B C A \n", | |
| "... ... ... ... ... ... ... \n", | |
| "7038 C B D A C A \n", | |
| "7039 D B C A B A \n", | |
| "7040 B A D A D A \n", | |
| "7041 B A B B C B \n", | |
| "7042 D B C A B A \n", | |
| "\n", | |
| " gender IsIndependent \n", | |
| "0 A A \n", | |
| "1 B B \n", | |
| "2 B B \n", | |
| "3 B B \n", | |
| "4 A B \n", | |
| "... ... ... \n", | |
| "7038 B A \n", | |
| "7039 A A \n", | |
| "7040 A A \n", | |
| "7041 B A \n", | |
| "7042 B B \n", | |
| "\n", | |
| "[7043 rows x 20 columns]" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "df_secure" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "d8f69e21-9c86-4963-bce7-a08cf4e2f958", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "k_actual =244\n", | |
| "sensitive column l-entropy:\n", | |
| "TotalCharges........ 3.380961672597764\n", | |
| "Churn............... 1.5792452676228184\n", | |
| "MonthlyCharges...... 3.1462017832010143\n", | |
| "PaymentMethod....... 3.0532008401147204\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
| "# Report the privacy metrics of the anonymised frame: k-anonymity over the\n", | |
| "# quasi-identifiers, then one line per sensitive column.\n", | |
| "k_actual, _ = calculate_k_anonymity(df_secure, list(new_quasi_identifiers))\n", | |
| "print(f\"{k_actual =}\")\n", | |
| "header = 'sensitive column'.ljust(20,' ')\n", | |
| "print(header,'l-entropy:')\n", | |
| "for sensitive_col in new_sensitive_features:\n", | |
| "    entropy, _ = calculate_entropy_l_diversity(\n", | |
| "        df_secure, list(new_quasi_identifiers), sensitive_col\n", | |
| "    )\n", | |
| "    # exp() turns the (presumably natural-log) entropy into the equivalent l value\n", | |
| "    equivalent_l = math.exp(entropy)\n", | |
| "    print(sensitive_col.ljust(20,'.'),equivalent_l)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 33, | |
| "id": "f426ac4d-27bd-483d-b87e-b14b7cc5d5af", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from imblearn.over_sampling import SMOTEN\n", | |
| "\n", | |
| "target_y='Churn'\n", | |
| "\n", | |
| "# Balance the classes with SMOTEN on the still-categorical (single-letter) feature columns.\n", | |
| "# NOTE(review): resampling is applied to the whole data set and the models are later scored\n", | |
| "# on this same resampled data, so the reported accuracies are training-set accuracies.\n", | |
| "sampler = SMOTEN(random_state=0)\n", | |
| "X = df_secure.drop(columns=list(direct_identifiers)+[target_y])\n", | |
| "y = df_secure[target_y]\n", | |
| "_X_res, _y_res = sampler.fit_resample(X, y)\n", | |
| "\n", | |
| "# Ordinal-encode the letters through their code points; the smallest letter present maps to 0.\n", | |
| "X_res = _X_res.map(ord).astype(int)\n", | |
| "X_res = X_res-X_res.min().min()\n", | |
| "# Scale every column into [0, 1]. A column whose maximum is 0 (only one category left)\n", | |
| "# would produce 0/0 = NaN, so such columns are divided by 1 and simply stay at 0.\n", | |
| "X_res = X_res/X_res.max().replace(0, 1)\n", | |
| "\n", | |
| "# Same encoding for the target; Series.min() is already a scalar.\n", | |
| "y_res = _y_res.apply(ord).astype(int)\n", | |
| "y_res = y_res-y_res.min()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "2a20b2d8-0ca7-4ca6-93e0-cd1a59a2ab1a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# SMOTE, SMOTEN (categorical only)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "b0f12ce0-4fae-4df1-bf4a-8673e2b4e1c0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# NOTE(review): '^' presumably refers to the processing step above, i.e. the two\n", | |
| "# possible orderings of that step and SMOTEN -- confirm.\n", | |
| "# SMOTEN -> ^\n", | |
| "#\n", | |
| "# ^ -> SMOTEN" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "4bcfe5f0-7562-4208-b35e-5b3164f003a5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# data:\n", | |
| "# - real\n", | |
| "# - generated\n", | |
| "#\n", | |
| "# model:\n", | |
| "# - differential privacy <- the data <- diffprivlib\n", | |
| "#   - low epsilon -> high corruption -> accuracy drops\n", | |
| "#   - high epsilon -> low corruption -> accuracy rises" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "c889fc05-0f3d-4899-ad9e-f1e3924b2b9c", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# diffprivlib:" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 20, | |
| "id": "db3c0f6e-1def-4859-a353-74abd8e1c147", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "8.06225774829855" | |
| ] | |
| }, | |
| "execution_count": 20, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Largest Euclidean (L2) norm over the rows of X_res.\n", | |
| "row_norms = np.linalg.norm(X_res.to_numpy(), axis=1)\n", | |
| "row_norms.max()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 34, | |
| "id": "3d82abfb-5a59-44dd-8460-99b09ba617ea", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.7452647854657904" | |
| ] | |
| }, | |
| "execution_count": 34, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Differentially-private classifiers that were tried; keep exactly one line active.\n", | |
| "# (The DecisionTreeClassifier used to be constructed and then immediately overwritten.)\n", | |
| "#model=DecisionTreeClassifier(epsilon=0.9, bounds=(-3,3), classes=(0,1))\n", | |
| "#model=LogisticRegression(epsilon=0.1, data_norm=9)\n", | |
| "model=GaussianNB(epsilon=0.9, bounds=(0,1))\n", | |
| "model.fit(X_res,y_res)\n", | |
| "# NOTE: this is accuracy on the data the model was fitted on (no hold-out set).\n", | |
| "model.score(X_res,y_res)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 36, | |
| "id": "dd2ab32e-8d88-451f-95af-601843923dcd", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Separate the Churn label from the remaining feature columns of auto_df.\n", | |
| "auto_y = auto_df['Churn']\n", | |
| "auto_X = auto_df.drop(columns=['Churn'])" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "id": "8117a3c6-d6e1-40dc-bb0c-8ffe248ada88", | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Fit the private GaussianNB (epsilon=0.9) 100 times and collect the accuracy of each run,\n", | |
| "# measured on the same data it was fitted on.\n", | |
| "run_accuracies = []\n", | |
| "for _run in range(100):\n", | |
| "    model = GaussianNB(epsilon=0.9, bounds=(0,1))\n", | |
| "    model.fit(X_res, y_res)\n", | |
| "    run_accuracies.append(model.score(X_res, y_res))\n", | |
| "scores = pd.Series(run_accuracies, name='scores')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "id": "d6c288cd-4fb8-485d-af79-0a33bdd0d338", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.7350396211828373" | |
| ] | |
| }, | |
| "execution_count": 57, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "scores.mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 58, | |
| "id": "b623d31d-ce76-4f72-8e3c-9f737f7e74e6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Text(0.5, 1.0, 'epsilon 0.9')" | |
| ] | |
| }, | |
| "execution_count": 58, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "iVBORw0KGgoAAAANSUhEUgAAAjUAAAGxCAYAAACa3EfLAAAAOXRFWHRTb2Z0d2FyZQBNYXRwbG90bGliIHZlcnNpb24zLjguNCwgaHR0cHM6Ly9tYXRwbG90bGliLm9yZy8fJSN1AAAACXBIWXMAAA9hAAAPYQGoP6dpAAAsSklEQVR4nO3deXRU9f3/8deYkGFpEiAsIRpCZJFV2dxYNCmLZRPhqxWRVdqDLQoYtSZuEKWEpVJUBGQphFKCG1CVglIWUanKkoAghyVAiBKa45YFykCS+/vDw/wcEyCZzOROPjwf59xzej9z7+fz/nCd5nU+986Mw7IsSwAAANXcNXYXAAAA4AuEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaALaLi4tTXFycR5vD4dDUqVNtqaewsFCTJ09WVFSUatasqY4dO2r16tXlPv+DDz5Q9+7dVatWLYWHh2vQoEE6cOCAHysGIEnBdhcAAPPnz7e7BA9Dhw7Vzp07NWPGDLVq1UqrVq3SAw88oJKSEg0fPvyy5/7zn//UkCFDNHjwYL3zzjvKy8tTcnKyevbsqZ07d6p58+ZVNAvg6uPgt58ABCKHw6EpU6ZU+WrNv/71Lw0YMMAdZC7q27evDhw4oJMnTyooKOiS57du3VpOp1MZGRlyOBySpKysLLVq1Ur33nuv/vGPf/h9DsDVittPADwcOXJEw4cPV6NGjeR0OtWmTRu99tprHsds27ZNDodDK1euVEJCgiIjI1WrVi3deeedSk9P9zj22LFjGjZsmKKiouR0OtW4cWP16tVLGRkZ7mPKuv1Ulv3792vw4MGqV6+e+7ZQampqmbWlpaXpmWeeUVRUlMLCwtS7d28dOnToimOsXbtWv/rVr3Tfffd5tI8dO1anTp3S559/fslzv/vuOx06dEj9+vVzBxpJiomJUfv27bVu3ToVFxdfsQYA3iHUAHD76quvdPPNN2v//v166aWX9P7772vAgAGaOHGikpOTSx3/9NNP69ixY1qyZImWLFmiU6dOKS4uTseOHXMf079/f+3evVuzZs3Spk2btGDBAnXq1Ek//vhjhWo7dOiQunXrpgMHDuiVV17RmjVr1LZtW40ZM0azZs0qs7asrCwtWbJEixYt0pEjRzRo0KArhor9+/erTZs2Cg72vDt/4403ul+/lPPnz0uSnE5nqdecTqfOnj2rzMzMK84VgHd4pgaAW0JCgkJDQ/XJJ58oLCxMktSnTx+5XC7NmDFDEydOVL169dzHN2zYUGvXrnWvSvTo0UMtW7ZUSkqKFi9e7F65mDt3rkaMGOE+b+jQoRWuberUqTp//ry2bt2q6OhoST8Fph9//FHJyckaP368wsPD3ce3bdtWK1eudO8HBQXpt7/9rXbu3KnbbrvtkuN89913uv7660u1169f3/36pTRu3Fj169fXp59+6tH+448/usPQ5c4HUDms1ACQJJ07d06bN2/WkCFDVLt2bRUVFbm3/v3769y5c/rss888zhk+fHip2yzdunXT1q1bJf0UBJo3b67Zs2drzpw5Sk9PV0lJiVf1bdmyRb169XIHmovGjBmjs2fP6j//+Y9H+9133+2xf3GlJSsr64pj/XxOFXntmmuu0YQJE7R582a9+OKLys3N1dGjRzVixAidPXvWfQwA/+DdBUDSTysIRUVFevXVV1WjRg2PrX///pKkb7/91uOcyMjIUv1ERka6VyMcDoc2b96su+66S7NmzVLnzp3VsGFDTZw4UQUFBRWur0mTJqXao6Ki3K//XEREhMf+xVtC//vf/y47TkRERJmrKd9//72k/79icynPP/+8HnvsMU2bNk2NGzdWy5YtJf30TI4kXXvttZc9H4D3uP0EQJJUr149BQUFaeTIkZowYUKZx8TGxnrsnz59utQxp0+f9ggUMTExWrp0qSTp8OHDevPNN923khYu
XFju+iIiIpSTk1Oq/dSpU5KkBg0alLuvy+nQoYPS0tJUVFTk8VzNl19+KUlq3779Zc8PDg7WnDlz9MILL+j48eNq0KCBmjRporvuukuxsbG67rrrfFIngNJYqQEgSapdu7bi4+OVnp6uG2+8UV27di21/XL1Iy0tTT//VoisrCzt2LHjkp9katWqlZ599ll16NBBe/bsqVB9vXr10pYtW9wh5qIVK1aodu3al31OpiKGDBmiwsJCvfPOOx7tqampioqK0q233lqufn71q1+pQ4cOatKkifbs2aPNmzdr0qRJPqkRQNlYqQHg9vLLL6tHjx7q2bOn/vCHP6hZs2YqKCjQ0aNH9d5772nLli0ex+fm5mrIkCH6/e9/r7y8PE2ZMkU1a9ZUUlKSJGnfvn165JFHdN9996lly5YKCQnRli1btG/fPiUmJlaotilTpuj9999XfHy8nn/+edWvX1//+Mc/tH79es2aNcvjIeHK6Nevn/r06aM//OEPys/PV4sWLZSWlqaNGzdq5cqVHt9RM27cOKWmpiozM1MxMTGSfvpI+c6dO3XjjTfKsix98cUXmjlzpn7zm9/okUce8UmNAMpGqAHg1rZtW+3Zs0cvvviinn32WeXm5qpu3bpq2bKl+7man5s+fbp27typsWPHKj8/X7fccotWr17t/tbcyMhINW/eXPPnz1d2drYcDoeuv/56vfTSS3r00UcrVNsNN9ygHTt26Omnn9aECRP0v//9T23atNGyZcs0ZswYX0zfbc2aNXrmmWf0/PPP6/vvv1fr1q2VlpamYcOGeRxXXFys4uJij9WqkJAQvfPOO5o2bZpcLpdatmypF154QRMnTrzsl/YBqDy+URhAhW3btk3x8fF66623dO+999pdDgBI4pkaAABgCEINAAAwArefAACAEVipAQAARiDUAAAAIxBqAACAEYz/npqSkhKdOnVKoaGhl/0hOgAAEDgsy1JBQYGioqLK/UOwxoeaU6dOlfpVXwAAUD1kZ2eX+zfTjA81oaGhkn76RwkLC7O5GgAAUB75+fmKjo52/x0vD+NDzcVbTmFhYYQaAACqmYo8OsKDwgAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBFtDzfbt2zVo0CBFRUXJ4XBo3bp1pY45ePCg7r77boWHhys0NFS33XabTp48WfXFAgCAgGZrqDlz5oxuuukmzZs3r8zXMzMz1aNHD7Vu3Vrbtm3T3r179dxzz6lmzZpVXCkAAAh0DsuyLLuLkH76waq1a9fqnnvucbcNGzZMNWrU0N///nev+83Pz1d4eLjy8vL4QUsAAKoJb/5+B+wzNSUlJVq/fr1atWqlu+66S40aNdKtt95a5i2qn3O5XMrPz/fYAACA+YLtLuBScnNzVVhYqBkzZmjatGmaOXOmNm7cqKFDh2rr1q268847yzwvJSVFycnJVVwtAASGZonr/dLviRkD/NIv4EsBvVIjSYMHD9Zjjz2mjh07KjExUQMHDtTChQsveV5SUpLy8vLcW3Z2dlWVDAAAbBSwKzUNGjRQcHCw2rZt69Hepk0bffLJJ5c8z+l0yul0+rs8AAAQYAJ2pSYkJEQ333yzDh065NF++PBhxcTE2FQVAAAIVLau1BQWFuro0aPu/ePHjysjI0P169dX06ZN9eSTT+r+++/XHXfcofj4eG3cuFHvvfeetm3bZl/RAAAgINkaanbt2qX4+Hj3fkJCgiRp9OjRWr58uYYMGaKFCxcqJSVFEydO1A033KB33nlHPXr0sKtkAAAQoGwNNXFxcbrS1+Q89NBDeuihh6qoIgAAUF0F7DM1AAAAFUGoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIARCDUAAMAIhBoAAGAEQg0AADBCsN0FAACubs0S1/ul
3xMzBvilXwQuVmoAAIARCDUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYg1AAAACMQagAAgBEINQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEWwNNdu3b9egQYMUFRUlh8OhdevWXfLY8ePHy+FwaO7cuVVWHwAAqD5sDTVnzpzRTTfdpHnz5l32uHXr1unzzz9XVFRUFVUGAACqm2A7B+/Xr5/69et32WO++eYbPfLII/rggw80YMCAKqoMAABUN7aGmispKSnRyJEj9eSTT6pdu3blOsflcsnlcrn38/Pz/VUeAAAIIAEdambOnKng4GBNnDix3OekpKQoOTnZj1UBwNWnWeJ6u0sArihgP/20e/duvfzyy1q+fLkcDke5z0tKSlJeXp57y87O9mOVAAAgUARsqPn444+Vm5urpk2bKjg4WMHBwcrKytLjjz+uZs2aXfI8p9OpsLAwjw0AAJgvYG8/jRw5Ur179/Zou+uuuzRy5EiNHTvWpqoAAECgsjXUFBYW6ujRo+7948ePKyMjQ/Xr11fTpk0VERHhcXyNGjUUGRmpG264oapLBQAAAc7WULNr1y7Fx8e79xMSEiRJo0eP1vLly22qCgAAVEe2hpq4uDhZllXu40+cOOG/YgAAQLUWsA8KAwAAVAShBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIARCDUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYg1AAAACMQagAAgBEINQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwgq2hZvv27Ro0aJCioqLkcDi0bt0692sXLlzQU089pQ4dOqhOnTqKiorSqFGjdOrUKfsKBgAAAcvWUHPmzBnddNNNmjdvXqnXzp49qz179ui5557Tnj17tGbNGh0+fFh33323DZUCAIBAF2zn4P369VO/fv3KfC08PFybNm3yaHv11Vd1yy236OTJk2ratGlVlAgAAKoJW0NNReXl5cnhcKhu3bqXPMblcsnlcrn38/Pzq6AyAABgt2rzoPC5c+eUmJio4cOHKyws7JLHpaSkKDw83L1FR0dXYZUAAMAu1SLUXLhwQcOGDVNJSYnmz59/2WOTkpKUl5fn3rKzs6uoSgAAYKeAv/104cIF/fa3v9Xx48e1ZcuWy67SSJLT6ZTT6ayi6gAAQKAI6FBzMdAcOXJEW7duVUREhN0lAQCAAGVrqCksLNTRo0fd+8ePH1dGRobq16+vqKgo3XvvvdqzZ4/ef/99FRcX6/Tp05Kk+vXrKyQkxK6yAQBAALI11OzatUvx8fHu/YSEBEnS6NGjNXXqVL377ruSpI4dO3qct3XrVsXFxVVVmQAAoBqwNdTExcXJsqxLvn651wAAAH6uWnz6CQAA4EoINQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGCLa7AAC42jRLXG93CYCRWKkBAABGINQAAAAjEGoAAIARCDUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYg1AAAACMQagAAgBEINQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARrA11Gzfvl2DBg1SVFSUHA6H1q1b5/G6ZVmaOnWqoqKiVKtWLcXFxenAgQP2FAsAAAKaraHmzJkzuummmzRv3rwyX581a5bmzJmjefPmaefOnYqMjFSfPn1UUFBQxZUCAIBAF2zn4P369VO/fv3KfM2yLM2dO1fPPPOMhg4dKklKTU1V48aNtWrVKo0f
P74qSwUAAAEuYJ+pOX78uE6fPq2+ffu625xOp+68807t2LHjkue5XC7l5+d7bAAAwHwBG2pOnz4tSWrcuLFHe+PGjd2vlSUlJUXh4eHuLTo62q91AgCAwBCwoeYih8PhsW9ZVqm2n0tKSlJeXp57y87O9neJAAAgANj6TM3lREZGSvppxaZJkybu9tzc3FKrNz/ndDrldDr9Xh8AAAgsXq3UHD9+3Nd1lBIbG6vIyEht2rTJ3Xb+/Hl99NFH6tatm9/HBwAA1YtXoaZFixaKj4/XypUrde7cOa8HLywsVEZGhjIyMiT9FJYyMjJ08uRJORwOTZ48WdOnT9fatWu1f/9+jRkzRrVr19bw4cO9HhMAAJjJq1Czd+9ederUSY8//rgiIyM1fvx4ffHFFxXuZ9euXerUqZM6deokSUpISFCnTp30/PPPS5L+9Kc/afLkyfrjH/+orl276ptvvtGHH36o0NBQb8oGAAAGc1iWZXl7clFRkd577z0tX75cGzZsUMuWLTVu3DiNHDlSDRs29GWdXsvPz1d4eLjy8vIUFhZmdzkAoGaJ6+0u4apwYsYAu0tAJXjz97tSn34KDg7WkCFD9Oabb2rmzJnKzMzUE088oeuuu06jRo1STk5OZboHAAAot0qFml27dumPf/yjmjRpojlz5uiJJ55QZmamtmzZom+++UaDBw/2VZ0AAACX5dVHuufMmaNly5bp0KFD6t+/v1asWKH+/fvrmmt+ykixsbF6/fXX1bp1a58WCwAAcClehZoFCxbooYce0tixY93fJ/NLTZs21dKlSytVHAAAQHl5FWqOHDlyxWNCQkI0evRob7oHAACoMK+eqVm2bJneeuutUu1vvfWWUlNTK10UAABARXkVambMmKEGDRqUam/UqJGmT59e6aIAAAAqyqtQk5WVpdjY2FLtMTExOnnyZKWLAgAAqCivQk2jRo20b9++Uu179+5VREREpYsCAACoKK9CzbBhwzRx4kRt3bpVxcXFKi4u1pYtWzRp0iQNGzbM1zUCAABckVeffpo2bZqysrLUq1cvBQf/1EVJSYlGjRrFMzUAAMAWXoWakJAQvfHGG3rxxRe1d+9e1apVSx06dFBMTIyv6wMAACgXr0LNRa1atVKrVq18VQsAAIDXvAo1xcXFWr58uTZv3qzc3FyVlJR4vL5lyxafFAcAAFBeXoWaSZMmafny5RowYIDat28vh8Ph67oAAAAqxKtQs3r1ar355pvq37+/r+sBAADwilcf6Q4JCVGLFi18XQsAAIDXvAo1jz/+uF5++WVZluXregAAALzi1e2nTz75RFu3btWGDRvUrl071ahRw+P1NWvW+KQ4AACA8vIq1NStW1dDhgzxdS0AAABe8yrULFu2zNd1AAAAVIpXz9RIUlFRkf7973/r9ddfV0FBgSTp1KlTKiws9FlxAAAA5eXVSk1WVpZ+85vf6OTJk3K5XOrTp49CQ0M1a9YsnTt3TgsXLvR1nQAAAJfl1UrNpEmT1LVrV/3www+qVauWu33IkCHavHmzz4oDAAAoL68//fTpp58qJCTEoz0mJkbffPONTwoDAACoCK9WakpKSlRcXFyq/euvv1ZoaGiliwIAAKgor0JNnz59NHfuXPe+w+FQYWGhpkyZwk8nAAAAW3h1++mvf/2r4uPj1bZtW507d07Dhw/XkSNH1KBBA6Wlpfm6RgAAgCvyKtRERUUpIyNDaWlp2rNnj0pKSjRu3Dg9+OCDHg8OAwAAVBWvQo0k1apVSw899JAeeughX9YDAADgFa9CzYoVKy77+qhRo7wqBgAAwFtehZpJkyZ57F+4cEFnz55VSEiIateuTagBAABVzqtPP/3www8eW2FhoQ4dOqQePXrwoDAAALCF17/99EstW7bUjBkzSq3iVEZRUZGeffZZxcbGqlatWrr++uv1wgsvqKSkxGdjAAAAM3j9oHBZgoKCdOrUKZ/1N3PmTC1cuFCpqalq166ddu3apbFj
xyo8PNyn4QkAAFR/XoWad99912Pfsizl5ORo3rx56t69u08Kk6T//Oc/Gjx4sAYMGCBJatasmdLS0rRr1y6fjQEAAMzgVai55557PPYdDocaNmyoX//613rppZd8UZckqUePHlq4cKEOHz6sVq1aae/evfrkk088vs34l1wul1wul3s/Pz/fZ/UAAIDA5VWoqapnWp566inl5eWpdevWCgoKUnFxsf785z/rgQceuOQ5KSkpSk5OrpL6AABA4PDZg8L+8MYbb2jlypVatWqV9uzZo9TUVP3lL39RamrqJc9JSkpSXl6ee8vOzq7CigEAgF28WqlJSEgo97Fz5szxZghJ0pNPPqnExEQNGzZMktShQwdlZWUpJSVFo0ePLvMcp9Mpp9Pp9ZgAAKB68irUpKena8+ePSoqKtINN9wgSTp8+LCCgoLUuXNn93EOh6NSxZ09e1bXXOO5mBQUFMRHugEAQClehZpBgwYpNDRUqampqlevnqSfvpBv7Nix6tmzpx5//HGfFDdo0CD9+c9/VtOmTdWuXTulp6drzpw5/N4UAAAoxWFZllXRk6699lp9+OGHateunUf7/v371bdvX599V01BQYGee+45rV27Vrm5uYqKitIDDzyg559/XiEhIeXqIz8/X+Hh4crLy1NYWJhP6gKAymiWuN7uEq4KJ2YMsLsEVII3f7+9WqnJz8/Xf//731KhJjc3VwUFBd50WabQ0FDNnTv3sh/hBgAAkLz89NOQIUM0duxYvf322/r666/19ddf6+2339a4ceM0dOhQX9cIAABwRV6t1CxcuFBPPPGERowYoQsXLvzUUXCwxo0bp9mzZ/u0QAAAgPLwKtTUrl1b8+fP1+zZs5WZmSnLstSiRQvVqVPH1/UBAACUS6W+fC8nJ0c5OTlq1aqV6tSpIy+eOQYAAPAJr0LNd999p169eqlVq1bq37+/cnJyJEm/+93vfPZxbgAAgIrwKtQ89thjqlGjhk6ePKnatWu72++//35t3LjRZ8UBAACUl1fP1Hz44Yf64IMPdN1113m0t2zZUllZWT4pDAAAoCK8Wqk5c+aMxwrNRd9++y2/uwQAAGzhVai54447tGLFCve+w+FQSUmJZs+erfj4eJ8VBwAAUF5e3X6aPXu24uLitGvXLp0/f15/+tOfdODAAX3//ff69NNPfV0jAADAFXm1UtO2bVvt27dPt9xyi/r06aMzZ85o6NChSk9PV/PmzX1dIwAAwBVVeKXmwoUL6tu3r15//XUlJyf7oyYAAIAKq/BKTY0aNbR//345HA5/1AMAAOAVr24/jRo1SkuXLvV1LQAAAF7z6kHh8+fPa8mSJdq0aZO6du1a6jef5syZ45PiAAAAyqtCoebYsWNq1qyZ9u/fr86dO0uSDh8+7HEMt6UAAIAdKhRqWrZsqZycHG3dulXSTz+L8Morr6hx48Z+KQ4AAKC8KvRMzS9/hXvDhg06c+aMTwsCAADwhlcPCl/0y5ADAABglwqFGofDUeqZGZ6hAQAAgaBCz9RYlqUxY8a4f7Ty3Llzevjhh0t9+mnNmjW+qxAAAKAcKhRqRo8e7bE/YsQInxYDAADgrQqFmmXLlvmrDgAAgEqp1IPCAAAAgYJQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMEPCh5ptvvtGIESMUERGh2rVrq2PHjtq9e7fdZQEAgABTod9+qmo//PCDunfvrvj4eG3YsEGNGjVSZmam6tata3dpAAAgwAR0qJk5c6aio6M9fkizWbNm9hUEAAACVkDffnr33XfVtWtX3XfffWrUqJE6deqkxYsXX/Ycl8ul/Px8jw0AAJgvoFdqjh07pgULFighIUFPP/20vvjiC02cOFFOp1OjRo0q85yUlBQlJydXcaUAAFRes8T1fuv7xIwBfus7UAT0Sk1JSYk6d+6s6dOnq1OnTho/frx+//vfa8GCBZc8
JykpSXl5ee4tOzu7CisGAAB2CehQ06RJE7Vt29ajrU2bNjp58uQlz3E6nQoLC/PYAACA+QI61HTv3l2HDh3yaDt8+LBiYmJsqggAAASqgA41jz32mD777DNNnz5dR48e1apVq7Ro0SJNmDDB7tIAAECACehQc/PNN2vt2rVKS0tT+/bt9eKLL2ru3Ll68MEH7S4NAAAEmID+9JMkDRw4UAMHDrS7DAAAEOACeqUGAACgvAg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIARCDUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYItrsAAKiMZonr/db3iRkD/NY3AN9jpQYAABiBUAMAAIxAqAEAAEYg1AAAACMQagAAgBEINQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYoVqFmpSUFDkcDk2ePNnuUgAAQICpNqFm586dWrRokW688Ua7SwEAAAGoWoSawsJCPfjgg1q8eLHq1atndzkAACAAVYtQM2HCBA0YMEC9e/e+4rEul0v5+fkeGwAAMF+w3QVcyerVq7Vnzx7t3LmzXMenpKQoOTnZz1XBFM0S1/ut7xMzBvitb3/x578HUNV4f199AnqlJjs7W5MmTdLKlStVs2bNcp2TlJSkvLw895adne3nKgEAQCAI6JWa3bt3Kzc3V126dHG3FRcXa/v27Zo3b55cLpeCgoI8znE6nXI6nVVdKgAAsFlAh5pevXrpyy+/9GgbO3asWrduraeeeqpUoAEAAFevgA41oaGhat++vUdbnTp1FBERUaodAABc3QL6mRoAAIDyCuiVmrJs27bN7hIAAEAAYqUGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIARCDUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYg1AAAACMQagAAgBEINQAAwAiEGgAAYIRguwsAAAD+1yxxvV/6PTFjgF/69QYrNQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIARCDUAAMAIAR1qUlJSdPPNNys0NFSNGjXSPffco0OHDtldFgAACEABHWo++ugjTZgwQZ999pk2bdqkoqIi9e3bV2fOnLG7NAAAEGCC7S7gcjZu3Oixv2zZMjVq1Ei7d+/WHXfcYVNVAAAgEAV0qPmlvLw8SVL9+vUveYzL5ZLL5XLv5+fn+70uAABgP4dlWZbdRZSHZVkaPHiwfvjhB3388ceXPG7q1KlKTk4u1Z6Xl6ewsDCf19Uscb3P+5SkEzMG+KXf6spf/87VkT//2+DfGUBF+ev/k/Lz8xUeHl6hv98B/UzNzz3yyCPat2+f0tLSLntcUlKS8vLy3Ft2dnYVVQgAAOxULW4/Pfroo3r33Xe1fft2XXfddZc91ul0yul0VlFlAAAgUAR0qLEsS48++qjWrl2rbdu2KTY21u6SAABAgAroUDNhwgStWrVK//znPxUaGqrTp09LksLDw1WrVi2bqwMAAIEkoJ+pWbBggfLy8hQXF6cmTZq4tzfeeMPu0gAAQIAJ6JWaavLBLAAAEAACeqUGAACgvAg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIARCDUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYItrsAlK1Z4nq7S0CA4r8NACgbKzUAAMAIhBoAAGAEQg0AADACoQYAABiBUAMAAIxAqAEAAEYg1AAAACMQagAAgBEINQAAwAiEGgAAYARCDQAAMAKhBgAAGIFQAwAAjECoAQAARiDUAAAAIxBqAACA
EQg1AADACNUi1MyfP1+xsbGqWbOmunTpoo8//tjukgAAQIAJ+FDzxhtvaPLkyXrmmWeUnp6unj17ql+/fjp58qTdpQEAgAAS8KFmzpw5GjdunH73u9+pTZs2mjt3rqKjo7VgwQK7SwMAAAEk2O4CLuf8+fPavXu3EhMTPdr79u2rHTt2lHmOy+WSy+Vy7+fl5UmS8vPz/VJjieusX/oFAKA68Nff14v9WpZV7nMCOtR8++23Ki4uVuPGjT3aGzdurNOnT5d5TkpKipKTk0u1R0dH+6VGAACuZuFz/dt/QUGBwsPDy3VsQIeaixwOh8e+ZVml2i5KSkpSQkKCe7+kpETff/+9IiIiLnmON/Lz8xUdHa3s7GyFhYX5rN9AwhzNwBzNcDXMUbo65skcy8eyLBUUFCgqKqrc5wR0qGnQoIGCgoJKrcrk5uaWWr25yOl0yul0erTVrVvXXyUqLCzM2P8oL2KOZmCOZrga5ihdHfNkjldW3hWaiwL6QeGQkBB16dJFmzZt8mjftGmTunXrZlNVAAAgEAX0So0kJSQkaOTIkeratatuv/12LVq0SCdPntTDDz9sd2kAACCABHyouf/++/Xdd9/phRdeUE5Ojtq3b69//etfiomJsbUup9OpKVOmlLrVZRLmaAbmaIarYY7S1TFP5ug/Dqsin5UCAAAIUAH9TA0AAEB5EWoAAIARCDUAAMAIhBoAAGAEQg0AADDCVRlq5s+fr9jYWNWsWVNdunTRxx9/fMljx4wZI4fDUWpr166d+5gDBw7o//7v/9SsWTM5HA7NnTu30uP6gh3znDp1aqk+IiMj/TE9Sb6f4+LFi9WzZ0/Vq1dP9erVU+/evfXFF19UatzKsmOO1f06rlmzRl27dlXdunVVp04ddezYUX//+98rNW5l2THHqr6Oku/n+XOrV6+Ww+HQPffcU6lxK8uOOVb39+Ty5cvLPObcuXNej1sm6yqzevVqq0aNGtbixYutr776ypo0aZJVp04dKysrq8zjf/zxRysnJ8e9ZWdnW/Xr17emTJniPuaLL76wnnjiCSstLc2KjIy0/vrXv1Z63Oo6zylTpljt2rXz6Cs3N7fazHH48OHWa6+9ZqWnp1sHDx60xo4da4WHh1tff/211+NWxzlW9+u4detWa82aNdZXX31lHT161Jo7d64VFBRkbdy40etxq+Mcq/I6+mueF504ccK69tprrZ49e1qDBw+u1LjVcY7V/T25bNkyKywszOO4nJycSo1blqsu1Nxyyy3Www8/7NHWunVrKzExsVznr1271nI4HNaJEyfKfD0mJqbMP/aVHbei7JrnlClTrJtuuqmi5XrF33O0LMsqKiqyQkNDrdTUVJ+NWxF2zdG062hZltWpUyfr2Wef9dm4FWHXHKvyOlqW/+ZZVFRkde/e3VqyZIk1evToUn/wTbiWV5pjdX9PLlu2zAoPD/fruJZlWVfV7afz589r9+7d6tu3r0d73759tWPHjnL1sXTpUvXu3btC32jsi3Erwq55XnTkyBFFRUUpNjZWw4YN07Fjxyrcx5VU1RzPnj2rCxcuqH79+j4bt7zsmuNFplxHy7K0efNmHTp0SHfccYfPxi0vu+Z4UVVcR8m/83zhhRfUsGFDjRs3zi/jlpddc7your8nCwsLFRMTo+uuu04DBw5Uenq6T8eVqsHPJPjSt99+q+Li4lK/8N24ceNSvwRelpycHG3YsEGrVq2q0nEryq55StKtt96qFStWqFWrVvrvf/+radOmqVu3bjpw4IAiIiIq3N+lVNUcExMTde2116p3794+Gbci7JqjZMZ1zMvL07XXXiuXy6WgoCDNnz9fffr08cm4FWHXHKWqu46S/+b56aefaunSpcrIyPDLuBVh1xyl6v+ebN26tZYvX64OHTooPz9fL7/8srp37669e/eqZcuWPruOV1WoucjhcHjsW5ZVqq0sy5cvV926dct8SM2f43rLjnn269fP/b87dOig22+/
Xc2bN1dqaqoSEhIq3N+V+HOOs2bNUlpamrZt26aaNWv6ZFxv2DFHE65jaGioMjIyVFhYqM2bNyshIUHXX3+94uLiKj2uN+yYY1VfR8m38ywoKNCIESO0ePFiNWjQwC/jesOOOVb39+Rtt92m2267zb3fvXt3de7cWa+++qpeeeWVSo970VUVaho0aKCgoKBSqS83N7dUOvwly7L0t7/9TSNHjlRISEiVjesNu+ZZljp16qhDhw46cuRIpfv6OX/P8S9/+YumT5+uf//737rxxht9Mm5F2TXHslTH63jNNdeoRYsWkqSOHTvq4MGDSklJUVxcnDHX8XJzLIu/rqPkn3lmZmbqxIkTGjRokLutpKREkhQcHKxDhw4pOjq6Wl/L8syxefPmpfqrju/Jn7vmmmt08803u+v31XvyqnqmJiQkRF26dNGmTZs82jdt2qRu3bpd9tyPPvpIR48evez9Tn+M6w275lkWl8ulgwcPqkmTJj7p7yJ/znH27Nl68cUXtXHjRnXt2tVn41aUXXMsS3W8jr9kWZZcLlelx60ou+ZYFn9dR8k/82zdurW+/PJLZWRkuLe7775b8fHxysjIUHR0dLW/luWZY1mq+3vSsixlZGS46/fZdSz3I8WGuPiRsaVLl1pfffWVNXnyZKtOnTrup7QTExOtkSNHljpvxIgR1q233lpmny6Xy0pPT7fS09OtJk2aWE888YSVnp5uHTlypNzjmjLPxx9/3Nq2bZt17Ngx67PPPrMGDhxohYaG+mWe/pjjzJkzrZCQEOvtt9/2+NhhQUFBucc1YY7V/TpOnz7d+vDDD63MzEzr4MGD1ksvvWQFBwdbixcvLve4JsyxKq+jv+b5S2V9Mqi6X8tfKmuO1f09OXXqVGvjxo1WZmamlZ6ebo0dO9YKDg62Pv/883KPWx5XXaixLMt67bXXrJiYGCskJMTq3Lmz9dFHH7lfGz16tHXnnXd6HP/jjz9atWrVshYtWlRmf8ePH7ckldp+2c/lxvUHO+Z5//33W02aNLFq1KhhRUVFWUOHDrUOHDjgj+lZluX7OcbExJQ5x19+p0RVXks75ljdr+MzzzxjtWjRwqpZs6ZVr1496/bbb7dWr15doXF9zY45VvV1tCzfz/OXyvqDf6Vxfc2OOVb39+TkyZOtpk2bWiEhIVbDhg2tvn37Wjt27KjQuOXhsCzLKv+6DgAAQGC6qp6pAQAA5iLUAAAAIxBqAACAEQg1AADACIQaAABgBEINAAAwAqEGAAAYgVADAACMQKgBAABGINQAAAAjEGoAAIAR/h9O5gbEoGARGQAAAABJRU5ErkJggg==", | |
| "text/plain": [ | |
| "<Figure size 640x480 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Distribution of the per-run accuracies; the title is set on the Axes returned by pandas.\n", | |
| "hist_ax = scores.plot.hist(bins=20)\n", | |
| "hist_ax.set_title(\"epsilon 0.9\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "0b67a31f-50e3-4f12-8d32-13fe684bf108", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "id": "70dfa0b4-68c7-4e71-8670-322a948ca246", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Same experiment with a tighter privacy budget: fit the private GaussianNB (epsilon=0.1)\n", | |
| "# 100 times and collect the accuracy of each run on the data it was fitted on.\n", | |
| "run_accuracies = []\n", | |
| "for _run in range(100):\n", | |
| "    model = GaussianNB(epsilon=0.1, bounds=(0,1))\n", | |
| "    model.fit(X_res, y_res)\n", | |
| "    run_accuracies.append(model.score(X_res, y_res))\n", | |
| "scores = pd.Series(run_accuracies, name='scores')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "id": "7cd3a2c0-6991-4014-ac55-a7ebf2be4b82", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.6436838036335523" | |
| ] | |
| }, | |
| "execution_count": 54, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "scores.mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "id": "f5170dc3-28c5-4982-a65b-94aaa41d9aaf", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Text(0.5, 1.0, 'epsilon 0.1')" | |
| ] | |
| }, | |
| "execution_count": 53, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "", | |
| "text/plain": [ | |
| "<Figure size 640x480 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Distribution of the per-run accuracies; the title is set on the Axes returned by pandas.\n", | |
| "hist_ax = scores.plot.hist(bins=20)\n", | |
| "hist_ax.set_title(\"epsilon 0.1\")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "f8443842-db20-4d51-be96-429c1f266de3", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "1d1980d7-db8b-4bde-982c-52170b4d56f5", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "86fc76c5-1d4f-4bf0-8908-4e952f814a7f", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "98c1a69b-bb97-48da-8676-389e24d42b5e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "5c67952f-e30f-4d3a-85ab-d3e64ad3e791", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 141, | |
| "id": "78e29f3f-57a7-465b-a55c-d68cba004b29", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "<Axes: ylabel='Frequency'>" | |
| ] | |
| }, | |
| "execution_count": 141, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "", | |
| "text/plain": [ | |
| "<Figure size 640x480 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "scores.plot.hist()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "ef2ad8fa-cf30-4792-b045-c0c5ea888f42", | |
| "metadata": {}, | |
| "source": [ | |
| "Dalam konteks privasi data ($k$-anonymity dan $l$-diversity), kolom dikategorikan berdasarkan fungsinya: **Identitas**, **Quasi-Identifier (QI)**, dan **Sensitif**.\n", | |
| "\n", | |
| "Untuk dataset Telco Customer Churn di atas, berikut adalah pembagiannya:\n", | |
| "\n", | |
| "### 1\\. Kolom Sensitif (Sensitive Attributes)\n", | |
| "\n", | |
| "Ini adalah kolom yang **berisi \"rahasia\"** atau informasi pribadi yang ingin dilindungi agar tidak diketahui orang lain secara spesifik. Dalam dataset ini, kandidat terkuatnya adalah:\n", | |
| "\n", | |
| " * **`MonthlyCharges` & `TotalCharges` (Finansial)**\n", | |
| " * **Alasan:** Ini menunjukkan kemampuan ekonomi seseorang. Orang biasanya tidak ingin gaji atau pengeluaran bulanannya diketahui publik.\n", | |
| " * *Rekomendasi Utama untuk l-diversity.*\n", | |
| " * **`Churn` (Status Pelanggan)**\n", | |
| " * **Alasan:** Mengetahui seseorang berhenti berlangganan (churn) bisa dianggap informasi bisnis yang sensitif atau menunjukkan ketidakpuasan/masalah finansial pelanggan.\n", | |
| " * **`PaymentMethod`**\n", | |
| " * **Alasan:** Metode pembayaran (misal: \"Credit card (automatic)\" vs \"Mailed check\") bisa mengungkap preferensi perbankan atau status akses ke layanan keuangan.\n", | |
| "\n", | |
| "-----\n", | |
| "\n", | |
| "### 2\\. Quasi-Identifiers (QI)\n", | |
| "\n", | |
| "Ini adalah kolom yang jika **digabungkan** bisa mengarah ke identitas seseorang (digunakan untuk *grouping* dalam fungsi `calculate_k_anonymity` Anda).\n", | |
| "\n", | |
| " * **Demografis:** `gender`, `SeniorCitizen`, `Partner`, `Dependents`.\n", | |
| " * **Layanan (Opsional):** `tenure` (masa berlangganan), `Contract`.\n", | |
| " * *Contoh:* Seorang \"Senior Citizen\", \"Wanita\", yang berlangganan tepat \"72 bulan\" di wilayah tertentu mungkin hanya ada satu orang.\n", | |
| "\n", | |
| "-----\n", | |
| "\n", | |
| "### 3\\. Direct Identifiers (Harus Dihapus/Dihash)\n", | |
| "\n", | |
| "Ini adalah kolom yang secara langsung menunjuk ke satu orang. Kolom ini **tidak boleh** dimasukkan ke perhitungan $k$ atau $l$, melainkan harus dibuang atau dienkripsi sebelum analisis.\n", | |
| "\n", | |
| " * **`customerID`** (Ini harus di-drop atau dijadikan indeks).\n", | |
| "\n", | |
| "-----\n", | |
| "\n", | |
| "### Rekomendasi Implementasi Kode\n", | |
| "\n", | |
| "Jika Anda ingin menerapkan fungsi `calculate_l_diversity` yang sudah Anda buat, Anda harus memilih **satu** kolom sensitif target.\n", | |
| "\n", | |
| "**Skenario A: Melindungi Data Keuangan (Paling Umum)**\n", | |
| "Anda ingin memastikan dalam satu grup demografi, nilai tagihannya bervariasi.\n", | |
| "\n", | |
| "```python\n", | |
| "quasi_identifiers = ['gender', 'SeniorCitizen', 'Partner', 'Dependents']\n", | |
| "sensitive_col = 'TotalCharges' # Atau 'MonthlyCharges'\n", | |
| "\n", | |
| "# Panggil fungsi\n", | |
| "l_val, counts = calculate_l_diversity(df, quasi_identifiers, sensitive_col)\n", | |
| "```\n", | |
| "\n", | |
| "**Skenario B: Melindungi Status Churn**\n", | |
| "Anda ingin memastikan dalam satu grup tidak semuanya \"Yes\" (churn) atau tidak semuanya \"No\".\n", | |
| "\n", | |
| "```python\n", | |
| "quasi_identifiers = ['gender', 'SeniorCitizen', 'Contract']\n", | |
| "sensitive_col = 'Churn'\n", | |
| "\n", | |
| "# Panggil fungsi\n", | |
| "l_val, counts = calculate_l_diversity(df, quasi_identifiers, sensitive_col)\n", | |
| "```\n", | |
| "\n", | |
| "**Kesimpulan:**\n", | |
| "Untuk parameter `sensitive_col`, saya sarankan pilih **`TotalCharges`** atau **`MonthlyCharges`** karena data numerik finansial biasanya memiliki risiko privasi paling tinggi di mata pelanggan." | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "7ca275f9-a57a-4a6c-ae35-4e5a2eb8c2d4", | |
| "metadata": {}, | |
| "source": [ | |
| "Angka-angka tersebut merepresentasikan **nilai Entropy terendah (minimum)** yang ditemukan dalam seluruh grup (*equivalence classes*) di dataset Anda.\n", | |
| "\n", | |
| "Dalam konteks privasi, **semakin tinggi angkanya, semakin aman (private)** data tersebut. Angka ini menunjukkan seberapa sulit bagi penyerang untuk menebak nilai sensitif seseorang jika mereka tahu orang tersebut berada di grup terlemah.\n", | |
| "\n", | |
| "Mari kita terjemahkan angka-angka \"abstrak\" ini ke dalam konsep yang lebih mudah dipahami, yaitu **Equivalent $l$** (nilai $l$ setara). Karena Anda menggunakan `np.log` (natural logarithm), rumusnya adalah $l \\approx e^{H}$, dengan $H$ adalah nilai entropy minimum tersebut.\n", | |
| "\n", | |
| "Berikut adalah analisis per kolom:\n", | |
| "\n", | |
| "---\n", | |
| "\n", | |
| "### 1. `MonthlyCharges` & `TotalCharges`: 1.0986\n", | |
| "* **Equivalent $l$:** $e^{1.0986} \\approx \\mathbf{3.0}$\n", | |
| "* **Artinya:**\n", | |
| " Di grup yang paling tidak aman sekalipun (titik terlemah dataset), keragaman nilai tagihannya setara dengan **3 nilai yang berbeda** dengan peluang muncul yang sama.\n", | |
| "* **Analisis:**\n", | |
| " Ini masuk akal karena `Charges` biasanya berupa angka numerik (float). Jarang sekali ada banyak orang dalam satu grup kecil yang memiliki tagihan persis sama (sampai ke desimalnya).\n", | |
| " * *Status:* **Cukup Bagus** (tergantung standar keamanan Anda, biasanya $l=3$ adalah batas minimal yang lumayan).\n", | |
| "\n", | |
| "### 2. `Churn`: 0.3820\n", | |
| "* **Equivalent $l$:** $e^{0.3820} \\approx \\mathbf{1.46}$\n", | |
| "* **Artinya:**\n", | |
| " Ini **berbahaya**. Nilai $l$ di bawah 2 berarti di grup terburuk, **hampir tidak ada keragaman**.\n", | |
| "* **Ilustrasi Masalah:**\n", | |
| " Nilai Churn hanya ada dua: \"Yes\" atau \"No\".\n", | |
| " * Jika seimbang (50% Yes, 50% No), Entropy maksimalnya adalah $\\ln 2 \\approx 0.69$.\n", | |
| " * Nilai Anda **0.38** jauh di bawah 0.69.\n", | |
| " * Ini menunjukkan ada grup tertentu di mana **mayoritas orang (mungkin 85-90%) memiliki status Churn yang sama**.\n", | |
| " * *Contoh:* Ada satu grup berisi 10 orang, 9 orang \"No Churn\" dan 1 orang \"Churn\" (entropy $\\approx 0.33$, dekat dengan nilai Anda). Penyerang bisa menebak dengan keyakinan 90% bahwa target di grup itu \"No Churn\".\n", | |
| " * *Status:* **Risiko Tinggi (High Risk)**.\n", | |
| "\n", | |
| "### 3. `PaymentMethod`: 0.6365\n", | |
| "* **Equivalent $l$:** $e^{0.6365} \\approx \\mathbf{1.89}$\n", | |
| "* **Artinya:**\n", | |
| " Hampir mendekati 2, tapi belum sampai. Ini berarti di grup terlemah, keragaman metode pembayarannya **sedikit lebih buruk daripada tebak-tebakan koin (50:50)**; nilai 0.6365 setara dengan hanya dua metode yang muncul dengan perbandingan sekitar 2:1.\n", | |
| "* **Analisis:**\n", | |
| " Meskipun `PaymentMethod` mungkin punya 4 opsi (Check, Credit Card, dll), ada grup tertentu di mana opsi-opsi ini tidak tersebar merata. Mungkin di grup tersebut didominasi oleh satu metode pembayaran saja.\n", | |
| " * *Status:* **Kurang Aman** (Biasanya target $l$ minimal adalah 2 atau 3).\n", | |
| "\n", | |
| "---\n", | |
| "\n", | |
| "### Kesimpulan Visual\n", | |
| "\n", | |
| "\n", | |
| "\n", | |
| "Bayangkan Anda seorang penyerang yang melihat **grup terlemah** di data Anda:\n", | |
| "\n", | |
| "| Kolom | Tingkat Kesulitan Menebak | Keamanan |\n", | |
| "| :--- | :--- | :--- |\n", | |
| "| **Charges** | Anda bingung memilih antara 3 angka berbeda. | ✅ Lumayan |\n", | |
| "| **Churn** | Anda sangat yakin tebakan Anda benar (karena data berat sebelah). | ❌ Buruk |\n", | |
| "| **Payment** | Anda agak ragu, tapi punya tebakan kuat ke satu arah. | ⚠️ Waspada |\n", | |
| "\n", | |
| "### Apa yang harus dilakukan selanjutnya?\n", | |
| "\n", | |
| "Karena **Churn** memiliki skor privasi terburuk (0.38), ini adalah prioritas perbaikan Anda.\n", | |
| "\n", | |
| "1. **Suppression:** Hapus baris-baris (grup) yang menyebabkan entropy rendah tersebut.\n", | |
| "2. **Generalization:** Kurangi detail pada *Quasi-Identifiers* (misal: rentang umur diperlebar) agar grup menjadi lebih besar. Jika grup lebih besar, peluang keragaman data `Churn` (Yes/No) biasanya akan meningkat (mendekati distribusi alami).\n", | |
| "\n", | |
| "Apakah Anda ingin melihat **grup mana persisnya** yang menyebabkan nilai `Churn` menjadi sangat rendah (0.38) tersebut?" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "e771424f-8ed5-4786-a2b4-7a3b69a0f513", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.14" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment