Created
November 30, 2025 14:39
-
-
Save UmarZein/66f618627b71c74fbf78a3c8d823c870 to your computer and use it in GitHub Desktop.
Analisis dan augmentasi data dengan SMOTEN
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "code", | |
| "execution_count": 1, | |
| "id": "1593637f-ddcc-4c19-9aa9-fe17f0ff1ebd", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import numpy as np\n", | |
| "import seaborn as sns\n", | |
| "import matplotlib.pyplot as plt\n", | |
| "import hashlib\n", | |
| "import uuid\n", | |
| "import math\n", | |
| "import random\n", | |
| "from diffprivlib.models import LogisticRegression, GaussianNB, DecisionTreeClassifier\n", | |
| "from sklearn.preprocessing import LabelEncoder" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 2, | |
| "id": "ad3bc31c-19be-40f7-9015-c3ea6c94daa4", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Default transformation that converts the data into numeric form
def auto_preprocess_df(df, null_threshold=0.5, max_categories=65):
    """
    Return a numeric copy of ``df``; the input frame is not modified.

    Each column is handled in one of two ways:

    1. Fewer than ``max_categories`` distinct non-null values: the values are
       cast to ``str`` (so NaN becomes the label ``'nan'``) and label-encoded.
       No null check is applied to these columns.
    2. Otherwise: the column is coerced to numeric (unparseable entries become
       NaN). If its NaN fraction is above ``null_threshold`` the column is
       dropped, else its NaNs are filled with the column median.

    Finally, any row that still contains NaN is dropped.

    Parameters
    ----------
    df : pandas.DataFrame
        Frame to convert.
    null_threshold : float, default 0.5
        Maximum NaN fraction (after numeric coercion) a column may have
        before it is dropped.
    max_categories : int, default 65
        Columns with fewer distinct values than this are label-encoded
        instead of being coerced to numeric.

    Returns
    -------
    pandas.DataFrame
        The converted frame. Dropped columns / rows are reported via ``print``.
    """
    df_processed = df.copy()
    cols_to_drop = []

    for col in df_processed.columns:
        # Low cardinality: treat as categorical.
        if df_processed[col].nunique() < max_categories:
            # A fresh encoder per column: a shared instance would only ever
            # remember the classes of the last column it was fitted on.
            df_processed[col] = LabelEncoder().fit_transform(
                df_processed[col].astype(str)
            )
            continue

        # High cardinality: numeric, ID-like or messy text.
        numeric = pd.to_numeric(df_processed[col], errors='coerce')
        if numeric.isnull().mean() > null_threshold:
            # Mostly unparseable (e.g. an ID column): mark for dropping.
            cols_to_drop.append(col)
        else:
            df_processed[col] = numeric.fillna(numeric.median())

    if cols_to_drop:
        print(f"Dropping columns > {null_threshold:.0%} Null: {cols_to_drop}")
        df_processed = df_processed.drop(columns=cols_to_drop)

    # Final cleanup: a column whose median is itself NaN can still hold NaNs.
    original_len = len(df_processed)
    df_processed = df_processed.dropna(axis=0)

    if len(df_processed) < original_len:
        print(f"Dropped {original_len - len(df_processed)} rows containing Nulls.")

    return df_processed
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 7, | |
| "id": "26c40daf-66e2-46ea-bdcc-da60af6f301b", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "\n", | |
def calculate_k_anonymity(df, quasi_identifiers):
    """
    Measure k-anonymity of ``df`` with respect to ``quasi_identifiers``.

    Rows are grouped by the quasi-identifier columns (NaN counts as a value of
    its own) and the size of every group is computed.

    Returns
    -------
    tuple
        ``(k, sizes)`` where ``sizes`` is the Series of group sizes and ``k``
        is the smallest of them, or ``0`` when there are no groups.
    """
    class_sizes = df.groupby(quasi_identifiers, dropna=False).size()
    smallest = 0 if class_sizes.empty else class_sizes.min()
    return smallest, class_sizes
| "\n", | |
# Not called anywhere in the visible part of this notebook.
def calculate_l_diversity(df, quasi_identifiers, sensitive_col):
    """
    Measure distinct l-diversity of ``sensitive_col``.

    Rows are grouped by ``quasi_identifiers`` (NaN counts as a value of its
    own) and the number of distinct ``sensitive_col`` values is counted per
    group.

    Returns
    -------
    tuple
        ``(l, counts)`` where ``counts`` is the per-group Series of distinct
        value counts and ``l`` is its minimum, or ``0`` when there are no
        groups.
    """
    distinct_per_class = (
        df.groupby(quasi_identifiers, dropna=False)[sensitive_col].nunique()
    )
    if distinct_per_class.empty:
        return 0, distinct_per_class
    return distinct_per_class.min(), distinct_per_class
| "\n", | |
def calculate_entropy_l_diversity(df, quasi_identifiers, sensitive_col):
    """
    Measure entropy l-diversity of ``sensitive_col``.

    For every group of rows sharing the same ``quasi_identifiers`` values (NaN
    counts as a value of its own) the Shannon entropy, in nats, of the
    ``sensitive_col`` value distribution is computed.

    Returns
    -------
    tuple
        ``(min_entropy, results_df)``. ``results_df`` has one row per group
        with the columns ``group_id``, ``size`` and ``entropy``;
        ``min_entropy`` is the smallest entropy, or ``0.0`` when there are no
        groups.
    """
    def _shannon_entropy(values):
        # Relative frequency of each distinct value within the group.
        probs = values.value_counts(normalize=True)
        return -np.sum(probs * np.log(probs))

    rows = [
        {
            'group_id': key,
            'size': len(members),
            'entropy': _shannon_entropy(members[sensitive_col]),
        }
        for key, members in df.groupby(quasi_identifiers, dropna=False)
    ]

    results_df = pd.DataFrame(rows)
    if results_df.empty:
        return 0.0, results_df
    return results_df['entropy'].min(), results_df
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 4, | |
| "id": "7a8c66ac-3047-4edc-9b82-83d566b139cc", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Load the Telco customer-churn CSV; the path is relative to the working directory.
df=pd.read_csv("WA_Fn-UseC_-Telco-Customer-Churn.csv")
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 8, | |
| "id": "ff90986c-ec4e-4f82-9651-d40d3c42688b", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>customerID</th>\n", | |
| " <th>gender</th>\n", | |
| " <th>SeniorCitizen</th>\n", | |
| " <th>Partner</th>\n", | |
| " <th>Dependents</th>\n", | |
| " <th>tenure</th>\n", | |
| " <th>PhoneService</th>\n", | |
| " <th>MultipleLines</th>\n", | |
| " <th>InternetService</th>\n", | |
| " <th>OnlineSecurity</th>\n", | |
| " <th>...</th>\n", | |
| " <th>DeviceProtection</th>\n", | |
| " <th>TechSupport</th>\n", | |
| " <th>StreamingTV</th>\n", | |
| " <th>StreamingMovies</th>\n", | |
| " <th>Contract</th>\n", | |
| " <th>PaperlessBilling</th>\n", | |
| " <th>PaymentMethod</th>\n", | |
| " <th>MonthlyCharges</th>\n", | |
| " <th>TotalCharges</th>\n", | |
| " <th>Churn</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>7590-VHVEG</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>1</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No phone service</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>No</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Electronic check</td>\n", | |
| " <td>29.85</td>\n", | |
| " <td>29.85</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>5575-GNVDE</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>34</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>One year</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Mailed check</td>\n", | |
| " <td>56.95</td>\n", | |
| " <td>1889.5</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>3668-QPYBK</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Mailed check</td>\n", | |
| " <td>53.85</td>\n", | |
| " <td>108.15</td>\n", | |
| " <td>Yes</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>7795-CFOCW</td>\n", | |
| " <td>Male</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>45</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No phone service</td>\n", | |
| " <td>DSL</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>...</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>One year</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Bank transfer (automatic)</td>\n", | |
| " <td>42.30</td>\n", | |
| " <td>1840.75</td>\n", | |
| " <td>No</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>9237-HQITU</td>\n", | |
| " <td>Female</td>\n", | |
| " <td>0</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>2</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Fiber optic</td>\n", | |
| " <td>No</td>\n", | |
| " <td>...</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>No</td>\n", | |
| " <td>Month-to-month</td>\n", | |
| " <td>Yes</td>\n", | |
| " <td>Electronic check</td>\n", | |
| " <td>70.70</td>\n", | |
| " <td>151.65</td>\n", | |
| " <td>Yes</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>5 rows × 21 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " customerID gender SeniorCitizen Partner Dependents tenure PhoneService \\\n", | |
| "0 7590-VHVEG Female 0 Yes No 1 No \n", | |
| "1 5575-GNVDE Male 0 No No 34 Yes \n", | |
| "2 3668-QPYBK Male 0 No No 2 Yes \n", | |
| "3 7795-CFOCW Male 0 No No 45 No \n", | |
| "4 9237-HQITU Female 0 No No 2 Yes \n", | |
| "\n", | |
| " MultipleLines InternetService OnlineSecurity ... DeviceProtection \\\n", | |
| "0 No phone service DSL No ... No \n", | |
| "1 No DSL Yes ... Yes \n", | |
| "2 No DSL Yes ... No \n", | |
| "3 No phone service DSL Yes ... Yes \n", | |
| "4 No Fiber optic No ... No \n", | |
| "\n", | |
| " TechSupport StreamingTV StreamingMovies Contract PaperlessBilling \\\n", | |
| "0 No No No Month-to-month Yes \n", | |
| "1 No No No One year No \n", | |
| "2 No No No Month-to-month Yes \n", | |
| "3 Yes No No One year No \n", | |
| "4 No No No Month-to-month Yes \n", | |
| "\n", | |
| " PaymentMethod MonthlyCharges TotalCharges Churn \n", | |
| "0 Electronic check 29.85 29.85 No \n", | |
| "1 Mailed check 56.95 1889.5 No \n", | |
| "2 Mailed check 53.85 108.15 Yes \n", | |
| "3 Bank transfer (automatic) 42.30 1840.75 No \n", | |
| "4 Electronic check 70.70 151.65 Yes \n", | |
| "\n", | |
| "[5 rows x 21 columns]" | |
| ] | |
| }, | |
| "execution_count": 8, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
# Preview the first rows of the raw data.
df.head()
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 39, | |
| "id": "c60ec13e-1207-4e18-81e0-8ecb8d50e4a1", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Dropping columns > 50% Null: ['customerID']\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# Fully numeric version of the raw data; columns that are mostly NaN after
# numeric coercion are reported by the function's printed message and dropped.
auto_df=auto_preprocess_df(df)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 9, | |
| "id": "aec341c4-47d1-4363-8b47-e17c82615835", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Fitur proxy PII\n", | |
# Proxy features for PII (quasi-identifiers)
quasi_identifiers = {'gender', 'SeniorCitizen', 'Partner', 'Dependents'}
# Should preferably not be public
sensitive_features = {'MonthlyCharges', 'TotalCharges', 'Churn', 'PaymentMethod'}
# 1-to-1 mapping between an individual and customerID
direct_identifiers = {'customerID'}
# General features for classification: every column without one of the roles above
utility = set(df.columns).difference(
    quasi_identifiers, sensitive_features, direct_identifiers
)
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "328cbc1f-86a2-4de4-b819-30eecc87c3fc", | |
| "metadata": {}, | |
| "source": [ | |
| "### Evaluasi risiko privasi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 12, | |
| "id": "b5b96ad3-2489-42dc-8997-740dc952555a", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "k_anonimity = 3\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# k-anonymity of the raw data: size of the smallest group of rows that share
# the same quasi-identifier values.
k_anonimity,_=calculate_k_anonymity(df, list(quasi_identifiers))
print(f"{k_anonimity = }")
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "a77f674c-ed7f-46b2-97a8-4524e204edee", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "Kolom l-entropy:\n", | |
| "MonthlyCharges...... 2.9999999999999996\n", | |
| "TotalCharges........ 2.9999999999999996\n", | |
| "Churn............... 1.4653036909521657\n", | |
| "PaymentMethod....... 1.8898815748423097\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# Per sensitive column, print exp(minimum group entropy) over the
# quasi-identifier groups of the raw data. The entropy is computed with
# natural logs, so exp() turns it into an effective number of distinct values.
print('Kolom'.ljust(20), 'l-entropy:')
for col in sensitive_features:
    min_entropy, _ = calculate_entropy_l_diversity(df, list(quasi_identifiers), col)
    print(col.ljust(20, '.'), math.exp(min_entropy))
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 13, | |
| "id": "ff3022ec-7257-4f37-bc87-f3f6537282ae", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# pseudonymization
def sha256(x, salt=""):
    """
    Return the hexadecimal SHA-256 digest of ``x``.

    Parameters
    ----------
    x : str, bytes or any object
        Value to pseudonymize. ``str`` is UTF-8 encoded, ``bytes`` is hashed
        as-is, anything else is hashed through its ``str()`` form.
    salt : str, default ""
        Secret text prepended to the value before hashing. The default keeps
        the plain, unsalted digest; a salt should come from outside the
        notebook (e.g. an environment variable), never be hard-coded.

    Returns
    -------
    str
        64-character lowercase hex digest.
    """
    # Non-string cells (numbers, NaN) have no .encode(); go through str().
    payload = x if isinstance(x, bytes) else str(x).encode()
    return hashlib.sha256(salt.encode() + payload).hexdigest()
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 11, | |
| "id": "c62db466-cb39-478b-9433-da73f6d1e241", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Empty frame that the following cells fill, column by column, with the protected data.
# NOTE(review): saved outputs of later cells show "NameError: name 'df_secure'
# is not defined", so this cell has to run before them.
df_secure=pd.DataFrame()
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "112d5304-a6b3-49e0-8f2b-0be7ddb898b2", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "NameError", | |
| "evalue": "name 'df_secure' is not defined", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", | |
| "Cell \u001b[1;32mIn[14], line 3\u001b[0m\n\u001b[0;32m 1\u001b[0m \u001b[38;5;66;03m# Pseudonimisasi diaplikasikan ke direct identifier\u001b[39;00m\n\u001b[0;32m 2\u001b[0m \u001b[38;5;28;01mfor\u001b[39;00m col \u001b[38;5;129;01min\u001b[39;00m direct_identifiers:\n\u001b[1;32m----> 3\u001b[0m \u001b[43mdf_secure\u001b[49m[col]\u001b[38;5;241m=\u001b[39mdf[col]\u001b[38;5;241m.\u001b[39mapply(sha256)\n", | |
| "\u001b[1;31mNameError\u001b[0m: name 'df_secure' is not defined" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# Pseudonymization applied to the direct identifiers. This is the weak version:
# ideally a salt should be used, but a salt is usually kept outside the
# notebook, e.g. in an environment variable.
for col in direct_identifiers:
    df_secure[col]=df[col].apply(sha256)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "8a7757ee-81b6-46eb-908f-76fbc52cac6e", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "ename": "NameError", | |
| "evalue": "name 'df_secure' is not defined", | |
| "output_type": "error", | |
| "traceback": [ | |
| "\u001b[1;31m---------------------------------------------------------------------------\u001b[0m", | |
| "\u001b[1;31mNameError\u001b[0m Traceback (most recent call last)", | |
| "Cell \u001b[1;32mIn[15], line 19\u001b[0m\n\u001b[0;32m 17\u001b[0m nuniques\u001b[38;5;241m=\u001b[39m\u001b[38;5;28mlen\u001b[39m(uniques)\n\u001b[0;32m 18\u001b[0m mapping \u001b[38;5;241m=\u001b[39m {k:\u001b[38;5;28mchr\u001b[39m(\u001b[38;5;28mord\u001b[39m(\u001b[38;5;124m'\u001b[39m\u001b[38;5;124mA\u001b[39m\u001b[38;5;124m'\u001b[39m)\u001b[38;5;241m+\u001b[39mi) \u001b[38;5;28;01mfor\u001b[39;00m k,i \u001b[38;5;129;01min\u001b[39;00m \u001b[38;5;28mzip\u001b[39m(uniques,\u001b[38;5;28mrange\u001b[39m(nuniques))}\u001b[38;5;66;03m#\u001b[39;00m\n\u001b[1;32m---> 19\u001b[0m \u001b[43mdf_secure\u001b[49m[col]\u001b[38;5;241m=\u001b[39mdf[col]\u001b[38;5;241m.\u001b[39mmap(mapping)\n", | |
| "\u001b[1;31mNameError\u001b[0m: name 'df_secure' is not defined" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# The same preprocessing is used for sensitive and utility features because
# differential privacy is applied at the model level, i.e. to every column,
# sensitive ones included.
cardinality = df.nunique()  # computed once instead of once per column
numeric_cols = set(df.select_dtypes('number').columns)
for col in sensitive_features | utility:
    if cardinality[col] > 32:
        values = df[col]
        if col not in numeric_cols:
            # Text column: unparseable entries become NaN, then 0.
            values = pd.to_numeric(df[col], errors='coerce').fillna(0)
        # Binning (quartiles) is used for generalization. Repeated quantile
        # values are collapsed because pd.cut rejects duplicate bin edges.
        bins = np.unique(values.quantile([.0, .25, .5, .75, 1.]))
        # Pseudonymization: bins get letter labels in shuffled order.
        # NOTE(review): no seed is set in the visible cells, so the letter
        # assigned to each bin changes from run to run.
        idx = np.random.permutation(len(bins) - 1)
        labels = [chr(ord('A') + i) for i in idx]
        df_secure[col] = pd.cut(values, bins=bins, labels=labels, include_lowest=True).astype(str)
    else:
        # Pseudonymization: each distinct value becomes a letter, in order of
        # first appearance.
        mapping = {value: chr(ord('A') + i) for i, value in enumerate(df[col].unique())}
        df_secure[col] = df[col].map(mapping)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 14, | |
| "id": "089a627d-125a-4790-9a78-5215e55f37e0", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
for col in quasi_identifiers:
    # Partner and Dependents are generalized into IsIndependent instead.
    if col not in ('Partner', 'Dependents'):
        # Pseudonymization: each distinct value becomes a letter, in order of
        # first appearance.
        mapping = {value: chr(ord('A') + i) for i, value in enumerate(df[col].unique())}
        df_secure[col] = df[col].map(mapping)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 15, | |
| "id": "9bec6f03-f280-4332-aed8-96b065fa6f4a", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Generalization, to raise k-anonymity: Partner and Dependents are merged into
# a single flag ('B' = neither partner nor dependents, 'A' = otherwise).
no_partner = df['Partner'].eq('No')
no_dependents = df['Dependents'].eq('No')
df_secure['IsIndependent'] = (no_partner & no_dependents).map({True: 'B', False: 'A'})
| ] | |
| }, | |
| { | |
| "cell_type": "raw", | |
| "id": "a3e26955-4880-4698-ab41-f48e4a75d559", | |
| "metadata": {}, | |
| "source": [ | |
| "tmp=list(utility)\n", | |
| "df_secure[tmp]=auto_df[tmp]" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 16, | |
| "id": "05483900-0990-482f-a275-027b10ca863e", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
# Column roles for the protected frame: IsIndependent replaces Partner/Dependents.
new_quasi_identifiers = {'gender', 'SeniorCitizen', 'IsIndependent'}

new_sensitive_features = {'MonthlyCharges', 'TotalCharges', 'Churn', 'PaymentMethod'}
new_direct_identifiers = {'customerID'}

# Everything in df_secure without one of the roles above.
new_utility = set(df_secure.columns).difference(
    new_quasi_identifiers, new_sensitive_features, new_direct_identifiers
)
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 45, | |
| "id": "e6c1caa8-035f-460d-9ede-e1c577e9cdfa", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/html": [ | |
| "<div>\n", | |
| "<style scoped>\n", | |
| " .dataframe tbody tr th:only-of-type {\n", | |
| " vertical-align: middle;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe tbody tr th {\n", | |
| " vertical-align: top;\n", | |
| " }\n", | |
| "\n", | |
| " .dataframe thead th {\n", | |
| " text-align: right;\n", | |
| " }\n", | |
| "</style>\n", | |
| "<table border=\"1\" class=\"dataframe\">\n", | |
| " <thead>\n", | |
| " <tr style=\"text-align: right;\">\n", | |
| " <th></th>\n", | |
| " <th>customerID</th>\n", | |
| " <th>StreamingTV</th>\n", | |
| " <th>TechSupport</th>\n", | |
| " <th>PaperlessBilling</th>\n", | |
| " <th>InternetService</th>\n", | |
| " <th>PhoneService</th>\n", | |
| " <th>PaymentMethod</th>\n", | |
| " <th>MultipleLines</th>\n", | |
| " <th>OnlineSecurity</th>\n", | |
| " <th>OnlineBackup</th>\n", | |
| " <th>Contract</th>\n", | |
| " <th>DeviceProtection</th>\n", | |
| " <th>TotalCharges</th>\n", | |
| " <th>StreamingMovies</th>\n", | |
| " <th>tenure</th>\n", | |
| " <th>Churn</th>\n", | |
| " <th>MonthlyCharges</th>\n", | |
| " <th>SeniorCitizen</th>\n", | |
| " <th>gender</th>\n", | |
| " <th>IsIndependent</th>\n", | |
| " </tr>\n", | |
| " </thead>\n", | |
| " <tbody>\n", | |
| " <tr>\n", | |
| " <th>0</th>\n", | |
| " <td>f3f9002e121cbbcc038f195a656a32a17395c6ce1815e8...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>1</th>\n", | |
| " <td>5f362546eb7515f442a40d3d9bf632e4a481c92bcb0529...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>2</th>\n", | |
| " <td>701bf78bdf332d16ec5fb80a1affac3a080a8b9c258c2d...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>3</th>\n", | |
| " <td>59a0e1b332a6b6ba8f6a72f74085e6498e8c54a5d18191...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>4</th>\n", | |
| " <td>66370db9e1bb6a851fa692a2499f4f1356efab8e5abf90...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>...</th>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " <td>...</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7038</th>\n", | |
| " <td>d947cded0fac0a00ebc313bbe4c4b64e23283d0cf7b536...</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7039</th>\n", | |
| " <td>cb985e07827c720145c2de03810a8ec565fe1121c0b6bd...</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7040</th>\n", | |
| " <td>32b725664afc96d8366d2d44ffb7a801132d3f84d68c19...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>D</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7041</th>\n", | |
| " <td>b8463bcb56ddf1b47ee63bafdf65079f88d260b99ccd89...</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " </tr>\n", | |
| " <tr>\n", | |
| " <th>7042</th>\n", | |
| " <td>6bd283e5eb3d43ae083283e36d747965afa39b80f70ab5...</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>B</td>\n", | |
| " <td>D</td>\n", | |
| " <td>B</td>\n", | |
| " <td>C</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>A</td>\n", | |
| " <td>B</td>\n", | |
| " <td>B</td>\n", | |
| " </tr>\n", | |
| " </tbody>\n", | |
| "</table>\n", | |
| "<p>7043 rows × 20 columns</p>\n", | |
| "</div>" | |
| ], | |
| "text/plain": [ | |
| " customerID StreamingTV \\\n", | |
| "0 f3f9002e121cbbcc038f195a656a32a17395c6ce1815e8... A \n", | |
| "1 5f362546eb7515f442a40d3d9bf632e4a481c92bcb0529... A \n", | |
| "2 701bf78bdf332d16ec5fb80a1affac3a080a8b9c258c2d... A \n", | |
| "3 59a0e1b332a6b6ba8f6a72f74085e6498e8c54a5d18191... A \n", | |
| "4 66370db9e1bb6a851fa692a2499f4f1356efab8e5abf90... A \n", | |
| "... ... ... \n", | |
| "7038 d947cded0fac0a00ebc313bbe4c4b64e23283d0cf7b536... B \n", | |
| "7039 cb985e07827c720145c2de03810a8ec565fe1121c0b6bd... B \n", | |
| "7040 32b725664afc96d8366d2d44ffb7a801132d3f84d68c19... A \n", | |
| "7041 b8463bcb56ddf1b47ee63bafdf65079f88d260b99ccd89... A \n", | |
| "7042 6bd283e5eb3d43ae083283e36d747965afa39b80f70ab5... B \n", | |
| "\n", | |
| " TechSupport PaperlessBilling InternetService PhoneService PaymentMethod \\\n", | |
| "0 A A A A A \n", | |
| "1 A B A B B \n", | |
| "2 A A A B B \n", | |
| "3 B B A A C \n", | |
| "4 A A B B A \n", | |
| "... ... ... ... ... ... \n", | |
| "7038 B A A B B \n", | |
| "7039 A A B B D \n", | |
| "7040 A A A A A \n", | |
| "7041 A A B B B \n", | |
| "7042 B A B B C \n", | |
| "\n", | |
| " MultipleLines OnlineSecurity OnlineBackup Contract DeviceProtection \\\n", | |
| "0 A A A A A \n", | |
| "1 B B B B B \n", | |
| "2 B B A A A \n", | |
| "3 A B B B B \n", | |
| "4 B A B A A \n", | |
| "... ... ... ... ... ... \n", | |
| "7038 C B B B B \n", | |
| "7039 C A A B B \n", | |
| "7040 A B B A A \n", | |
| "7041 C A B A A \n", | |
| "7042 B B B C B \n", | |
| "\n", | |
| " TotalCharges StreamingMovies tenure Churn MonthlyCharges SeniorCitizen \\\n", | |
| "0 B A B A D A \n", | |
| "1 C A A A A A \n", | |
| "2 B A B B A A \n", | |
| "3 C A A A A A \n", | |
| "4 B A B B C A \n", | |
| "... ... ... ... ... ... ... \n", | |
| "7038 C B D A C A \n", | |
| "7039 D B C A B A \n", | |
| "7040 B A D A D A \n", | |
| "7041 B A B B C B \n", | |
| "7042 D B C A B A \n", | |
| "\n", | |
| " gender IsIndependent \n", | |
| "0 A A \n", | |
| "1 B B \n", | |
| "2 B B \n", | |
| "3 B B \n", | |
| "4 A B \n", | |
| "... ... ... \n", | |
| "7038 B A \n", | |
| "7039 A A \n", | |
| "7040 A A \n", | |
| "7041 B A \n", | |
| "7042 B B \n", | |
| "\n", | |
| "[7043 rows x 20 columns]" | |
| ] | |
| }, | |
| "execution_count": 45, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
# Show only the first rows: the full frame adds nothing for the reader and
# puts thousands of hashed identifiers into the saved output.
df_secure.head()
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 17, | |
| "id": "d8f69e21-9c86-4963-bce7-a08cf4e2f958", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "name": "stdout", | |
| "output_type": "stream", | |
| "text": [ | |
| "k_actual =244\n", | |
| "sensitive column l-entropy:\n", | |
| "TotalCharges........ 3.380961672597764\n", | |
| "Churn............... 1.5792452676228184\n", | |
| "MonthlyCharges...... 3.1462017832010143\n", | |
| "PaymentMethod....... 3.0532008401147204\n" | |
| ] | |
| } | |
| ], | |
| "source": [ | |
# Same privacy report as for the raw data, now on the protected frame:
# k-anonymity, then exp(minimum group entropy) per sensitive column.
k_actual, _ = calculate_k_anonymity(df_secure, list(new_quasi_identifiers))
print(f"{k_actual =}")
print('sensitive column'.ljust(20), 'l-entropy:')
for col in new_sensitive_features:
    min_entropy, _ = calculate_entropy_l_diversity(df_secure, list(new_quasi_identifiers), col)
    print(col.ljust(20, '.'), math.exp(min_entropy))
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "144b4a9e-e5b0-4f4b-870a-9c518b860020", | |
| "metadata": {}, | |
| "source": [ | |
| "### Augmentasi data dengan SMOTEN (SMOTE Nominal)" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": null, | |
| "id": "f426ac4d-27bd-483d-b87e-b14b7cc5d5af", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "from imblearn.over_sampling import SMOTEN\n", | |
| "\n", | |
| "# Oversample df_secure with SMOTEN, then convert the single-character codes\n", | |
| "# (ord() only accepts one-character strings) into numbers scaled to [0, 1],\n", | |
| "# matching the bounds=(0, 1) passed to the GaussianNB models further down.\n", | |
| "target_y = 'Churn'\n", | |
| "\n", | |
| "sampler = SMOTEN(random_state=0)  # fixed seed keeps the resampling repeatable\n", | |
| "X = df_secure.drop(columns=list(direct_identifiers) + [target_y])\n", | |
| "y = df_secure[target_y]\n", | |
| "_X_res, _y_res = sampler.fit_resample(X, y)\n", | |
| "\n", | |
| "# Column-wise Series.map works on every pandas version; DataFrame.map only\n", | |
| "# exists from pandas 2.1 onwards (it was named applymap before that).\n", | |
| "X_res = _X_res.apply(lambda column: column.map(ord)).astype(int)\n", | |
| "X_res = X_res - X_res.min().min()  # shift by the smallest code in the whole frame\n", | |
| "# A column that is constant at that smallest code has max 0 after the shift;\n", | |
| "# divide it by 1 so it stays 0 instead of turning into NaN (0/0).\n", | |
| "X_res = X_res / X_res.max().replace(0, 1)\n", | |
| "\n", | |
| "y_res = _y_res.map(ord).astype(int)\n", | |
| "y_res = y_res - y_res.min()  # Series.min() already returns a scalar" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "53bd9bce-087c-4d69-874f-e1aee6aa93ae", | |
| "metadata": {}, | |
| "source": [ | |
| "# Komparasi\n", | |
| "\n", | |
| "---\n", | |
| "\n", | |
| "### Epsilon tinggi" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 56, | |
| "id": "8117a3c6-d6e1-40dc-bb0c-8ffe248ada88", | |
| "metadata": { | |
| "scrolled": true | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "# Fit a fresh diffprivlib GaussianNB 100 times at epsilon = 0.9 and keep the\n", | |
| "# score of every run so the spread can be inspected afterwards.\n", | |
| "# NOTE(review): each model is scored on the same X_res / y_res it was fitted\n", | |
| "# on, so these are training scores rather than held-out ones.\n", | |
| "run_scores = []\n", | |
| "for _ in range(100):\n", | |
| "    model = GaussianNB(epsilon=0.9, bounds=(0, 1))\n", | |
| "    model.fit(X_res, y_res)\n", | |
| "    run_scores.append(model.score(X_res, y_res))\n", | |
| "scores = pd.Series(run_scores, name='scores')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 57, | |
| "id": "d6c288cd-4fb8-485d-af79-0a33bdd0d338", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.7350396211828373" | |
| ] | |
| }, | |
| "execution_count": 57, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Mean of the per-run scores collected by the epsilon = 0.9 loop in the previous cell.\n", | |
| "scores.mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 58, | |
| "id": "b623d31d-ce76-4f72-8e3c-9f737f7e74e6", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Text(0.5, 1.0, 'epsilon 0.9')" | |
| ] | |
| }, | |
| "execution_count": 58, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "", | |
| "text/plain": [ | |
| "<Figure size 640x480 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Histogram of the per-run scores collected at epsilon = 0.9.\n", | |
| "ax = scores.plot.hist(bins=20)\n", | |
| "ax.set(title=\"epsilon 0.9\", xlabel=\"accuracy\");  # trailing ';' suppresses the return-value repr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "096cad36-82ea-4e4e-9d4e-e75335fddf10", | |
| "metadata": {}, | |
| "source": [ | |
| "### Epsilon rendah" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 50, | |
| "id": "70dfa0b4-68c7-4e71-8670-322a948ca246", | |
| "metadata": {}, | |
| "outputs": [], | |
| "source": [ | |
| "# Fit a fresh diffprivlib GaussianNB 100 times at epsilon = 0.1 and keep the\n", | |
| "# score of every run so the spread can be inspected afterwards.\n", | |
| "# NOTE(review): each model is scored on the same X_res / y_res it was fitted\n", | |
| "# on, so these are training scores rather than held-out ones.\n", | |
| "run_scores = []\n", | |
| "for _ in range(100):\n", | |
| "    model = GaussianNB(epsilon=0.1, bounds=(0, 1))\n", | |
| "    model.fit(X_res, y_res)\n", | |
| "    run_scores.append(model.score(X_res, y_res))\n", | |
| "scores = pd.Series(run_scores, name='scores')" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 54, | |
| "id": "7cd3a2c0-6991-4014-ac55-a7ebf2be4b82", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "0.6436838036335523" | |
| ] | |
| }, | |
| "execution_count": 54, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| } | |
| ], | |
| "source": [ | |
| "# Mean of the per-run scores collected by the epsilon = 0.1 loop in the previous cell.\n", | |
| "scores.mean()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 53, | |
| "id": "f5170dc3-28c5-4982-a65b-94aaa41d9aaf", | |
| "metadata": {}, | |
| "outputs": [ | |
| { | |
| "data": { | |
| "text/plain": [ | |
| "Text(0.5, 1.0, 'epsilon 0.1')" | |
| ] | |
| }, | |
| "execution_count": 53, | |
| "metadata": {}, | |
| "output_type": "execute_result" | |
| }, | |
| { | |
| "data": { | |
| "image/png": "", | |
| "text/plain": [ | |
| "<Figure size 640x480 with 1 Axes>" | |
| ] | |
| }, | |
| "metadata": {}, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "# Histogram of the per-run scores collected at epsilon = 0.1.\n", | |
| "ax = scores.plot.hist(bins=20)\n", | |
| "ax.set(title=\"epsilon 0.1\", xlabel=\"accuracy\");  # trailing ';' suppresses the return-value repr" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "id": "d6fafd8d-eecf-4ba4-9396-c5093ed58823", | |
| "metadata": {}, | |
| "source": [ | |
| "# Kesimpulan\n", | |
| "\n", | |
| "Ada tradeoff di antara privasi dan akurasi: semakin rendah epsilon, semakin tinggi kekuatan privasi, namun akurasi menurun." | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "kernelspec": { | |
| "display_name": "Python 3 (ipykernel)", | |
| "language": "python", | |
| "name": "python3" | |
| }, | |
| "language_info": { | |
| "codemirror_mode": { | |
| "name": "ipython", | |
| "version": 3 | |
| }, | |
| "file_extension": ".py", | |
| "mimetype": "text/x-python", | |
| "name": "python", | |
| "nbconvert_exporter": "python", | |
| "pygments_lexer": "ipython3", | |
| "version": "3.10.14" | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 5 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment