Last active
February 5, 2025 19:14
-
-
Save wesslen/755cf35e1f717435424b7e1c5c5cc209 to your computer and use it in GitHub Desktop.
cfpb-redaction-example.ipynb
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "nbformat": 4, | |
| "nbformat_minor": 0, | |
| "metadata": { | |
| "colab": { | |
| "provenance": [], | |
| "authorship_tag": "ABX9TyOkR8ZHeX9glcqjpUGAQQ7u", | |
| "include_colab_link": true | |
| }, | |
| "kernelspec": { | |
| "name": "python3", | |
| "display_name": "Python 3" | |
| }, | |
| "language_info": { | |
| "name": "python" | |
| } | |
| }, | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "id": "view-in-github", | |
| "colab_type": "text" | |
| }, | |
| "source": [ | |
| "<a href=\"https://colab.research.google.com/gist/wesslen/755cf35e1f717435424b7e1c5c5cc209/cfpb-redaction-example.ipynb\" target=\"_parent\"><img src=\"https://colab.research.google.com/assets/colab-badge.svg\" alt=\"Open In Colab\"/></a>" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "import os\n", | |
| "from google.colab import userdata\n", | |
| "os.environ['OPENAI_API_KEY'] = userdata.get('OPENAI_API_KEY')\n", | |
| "os.environ['OPENAI_BASE_URL'] = \"https://api.openai.com/v1\"\n", | |
| "os.environ['OPENAI_MODEL_NAME'] = \"gpt-4o\"" | |
| ], | |
| "metadata": { | |
| "id": "uwz1oKUtPIEx" | |
| }, | |
| "execution_count": 28, | |
| "outputs": [] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 49, | |
| "metadata": { | |
| "id": "2_4JZdmjM5B2" | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "import pandas as pd\n", | |
| "import os\n", | |
| "from openai import OpenAI\n", | |
| "from typing import Optional\n", | |
| "import time\n", | |
| "from tqdm import tqdm\n", | |
| "import warnings\n", | |
| "\n", | |
| "def create_synthetic_narratives(\n", | |
| " input_path: str,\n", | |
| " api_key: Optional[str] = None,\n", | |
| " base_url: Optional[str] = None,\n", | |
| " model_name: Optional[str] = None,\n", | |
| " batch_size: int = 50,\n", | |
| " sleep_seconds: float = 1.0\n", | |
| ") -> pd.DataFrame:\n", | |
| " \"\"\"\n", | |
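| " Generate synthetic unredacted narratives for a complaints CSV using an LLM.\n", | |
| "\n", | |
| " Reads input_path and sends every 'Consumer complaint narrative' value that\n", | |
| " contains 'XX' redaction markers to the chat-completions endpoint, storing the\n", | |
| " reply in the 'synthetic_unredacted_narrative' column. Rows whose narrative is\n", | |
| " missing or has no 'XX' are skipped; a failed request is printed and skipped.\n", | |
| " Copies named '_checkpoint.csv' and '_with_synthetic.csv' are written by\n", | |
| " replacing '.csv' in input_path, and the DataFrame is returned.\n", | |
| "\n", | |
| " api_key, base_url and model_name fall back to the OPENAI_API_KEY,\n", | |
| " OPENAI_BASE_URL and OPENAI_MODEL_NAME environment variables. Raises ValueError\n", | |
| " if any of the three is still unset or the narrative column is absent.\n", | |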
| " \"\"\"\n", | |
| " api_key = api_key or os.getenv('OPENAI_API_KEY')\n", | |
| " base_url = base_url or os.getenv('OPENAI_BASE_URL')\n", | |
| " model_name = model_name or os.getenv('OPENAI_MODEL_NAME')\n", | |
| "\n", | |
| " if not all([api_key, base_url, model_name]):\n", | |
| " raise ValueError(\"Missing required API configuration\")\n", | |
| "\n", | |
| " client = OpenAI(api_key=api_key, base_url=base_url)\n", | |
| "\n", | |
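| " # Read the CSV tolerantly: malformed rows are skipped (on_bad_lines='skip'), so the\n", | |
| " # frame may hold fewer rows than the file. Read errors are printed, then re-raised.\n", | |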
| " try:\n", | |
| " df = pd.read_csv(input_path,\n", | |
| " on_bad_lines='skip', # Skip problematic rows\n", | |
| " # quoting=3, # Disable quote handling\n", | |
| " encoding='utf-8', # Specify encoding\n", | |
| " engine='python') # Use Python parser\n", | |
| " except Exception as e:\n", | |
| " print(f\"Error reading CSV: {str(e)}\")\n", | |
| " raise\n", | |
| "\n", | |
| " narrative_col = 'Consumer complaint narrative'\n", | |
| " if narrative_col not in df.columns:\n", | |
| " raise ValueError(f\"Required column '{narrative_col}' not found\")\n", | |
| "\n", | |
| " output_col = 'synthetic_unredacted_narrative'\n", | |
| " if output_col not in df.columns:\n", | |
| " df[output_col] = None\n", | |
| "\n", | |
| " remaining_complaints = df[df[output_col].isna()]\n", | |
| "\n", | |
| " prompt_template = \"\"\"Generate hypothetical synthetic data filling in XX of this banking complaint. Only output the updated complaint. Do not include any prefix comment like 'This is the complaint: ' or an explanation -- only the complaint with hypothetical values.\n", | |
| "\n", | |
| " This is the complaint: {complaint}\"\"\"\n", | |
| "\n", | |
| " for i in tqdm(range(0, len(remaining_complaints), batch_size)):\n", | |
| " batch = remaining_complaints.iloc[i:i + batch_size]\n", | |
| "\n", | |
| " for idx, row in batch.iterrows():\n", | |
| " complaint = row[narrative_col]\n", | |
| "\n", | |
| " if pd.isna(complaint) or 'XX' not in str(complaint):\n", | |
| " continue\n", | |
| "\n", | |
| " try:\n", | |
| " response = client.chat.completions.create(\n", | |
| " model=model_name,\n", | |
| " messages=[{\n", | |
| " \"role\": \"user\",\n", | |
| " \"content\": prompt_template.format(complaint=complaint)\n", | |
| " }],\n", | |
| " temperature=0.7\n", | |
| " )\n", | |
| " synthetic = response.choices[0].message.content\n", | |
| "\n", | |
| " original_len = len(str(complaint))\n", | |
| " synthetic_len = len(synthetic)\n", | |
| "\n", | |
| " if original_len != synthetic_len:\n", | |
| " warnings.warn(\n", | |
| " f\"Length mismatch for complaint {idx}: \"\n", | |
| " f\"Original={original_len}, Synthetic={synthetic_len}\"\n", | |
| " )\n", | |
| "\n", | |
| " df.at[idx, output_col] = synthetic\n", | |
| " time.sleep(sleep_seconds)\n", | |
| "\n", | |
| " except Exception as e:\n", | |
| " print(f\"Error processing complaint {idx}: {str(e)}\")\n", | |
| " continue\n", | |
| "\n", | |
| " checkpoint_path = input_path.replace('.csv', '_checkpoint.csv')\n", | |
| " df.to_csv(checkpoint_path, index=False)\n", | |
| "\n", | |
| " output_path = input_path.replace('.csv', '_with_synthetic.csv')\n", | |
| " df.to_csv(output_path, index=False)\n", | |
| "\n", | |
| " return df\n", | |
| "" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "input_path = \"complaints-sample.csv\"\n", | |
| "df_with_synthetic = create_synthetic_narratives(input_path)\n", | |
| "print(f\"Processed {len(df_with_synthetic)} complaints\")" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/" | |
| }, | |
| "id": "BpBgeEhRPl2Z", | |
| "outputId": "71f3b827-0b05-42ba-c0f9-be9b5917bb12" | |
| }, | |
| "execution_count": 50, | |
| "outputs": [ | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "\r 0%| | 0/1 [00:00<?, ?it/s]<ipython-input-49-e1dc3f2aaa2e>:78: UserWarning: Length mismatch for complaint 0: Original=1496, Synthetic=1470\n", | |
| " warnings.warn(\n", | |
| "100%|██████████| 1/1 [00:10<00:00, 10.34s/it]" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stdout", | |
| "text": [ | |
| "Processed 1 complaints\n" | |
| ] | |
| }, | |
| { | |
| "output_type": "stream", | |
| "name": "stderr", | |
| "text": [ | |
| "\n" | |
| ] | |
| } | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "source": [ | |
| "df_with_synthetic['synthetic_unredacted_narrative'][0]" | |
| ], | |
| "metadata": { | |
| "colab": { | |
| "base_uri": "https://localhost:8080/", | |
| "height": 191 | |
| }, | |
| "id": "igUPVKcERLkI", | |
| "outputId": "887dfc39-48bc-420c-a8c3-d21093e825ad" | |
| }, | |
| "execution_count": 52, | |
| "outputs": [ | |
| { | |
| "output_type": "execute_result", | |
| "data": { | |
| "text/plain": [ | |
| "'To the Reviewer or John Doe, I have received a total of 5 hard inquiries from ABC Bank: ABC Bank NA inquired on 01/15/2023, ABC Bank inquired on 02/20/2023, ABC Bank inquired on 03/25/2023, ABC Bank inquired on 04/30/2023, ABC Bank inquired on 05/05/2023. I have received a total of 2 hard inquiries from XYZ Credit: XYZ Credit Corp inquired on 06/10/2023, XYZ Credit Corp inquired on 07/15/2023. I have received a couple of \"Account Review Inquiries\" on 08/20/2023 from DEF Finance: I have submitted multiple hard inquiry disputes to all three credit bureaus, Experian, Equifax, and Transition Bureau regarding this and have settled with putting my accounts on credit freeze. I did not apply or request for any account, business, or credit card with the following entities on these dates therefore, do not recognize these to be hard inquiries under my account. These hard inquiries have made it so that my credit score has declined, which has caused my mental wellbeing as I aspire to be a first-time homebuyer soon. The hard inquiries are stated to last until the year of 2025. Additionally, I have been receiving letters in the mail stating that my account was not approved due to \"multiple inquiries associated with your credit\" - this is untrue as I did not apply to these entities. This raises another concern of the potential of applying for new credit cards I have been considering. I need these hard inquiries to be removed from my record as they were not mine.'" | |
| ], | |
| "application/vnd.google.colaboratory.intrinsic+json": { | |
| "type": "string" | |
| } | |
| }, | |
| "metadata": {}, | |
| "execution_count": 52 | |
| } | |
| ] | |
| } | |
| ] | |
| } |
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| Date received | Product | Sub-product | Issue | Sub-issue | Consumer complaint narrative | Company public response | Company | State | ZIP code | Tags | Consumer consent provided? | Submitted via | Date sent to company | Company response to consumer | Timely response? | Consumer disputed? | Complaint ID | |
|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 12/30/24 | Credit reporting or other personal consumer reports | Credit reporting | Improper use of your report | Credit inquiries on your report that you don't recognize | To the Reviewer or XXXX XXXX, I have received a total of 5 hard inquiries from XXXX : XXXX XXXX NA inquired on XX/XX/XXXXXXXX XXXX XXXX inquired on XX/XX/XXXX XXXXXXXX XXXX XXXX inquired on XX/XX/XXXX XXXXXXXX XXXX inquired on XX/XX/XXXX XXXXXXXX XXXX inquired on XX/XX/XXXX I have received a total of 2 hard inquiries from XXXX : XXXX XXXX XXXX XXXX inquired on the XX/XX/XXXXXXXX XXXX XXXX XXXX XXXX XXXX XXXX XXXX XX/XX/XXXX XXXX have received a couple of " Account Review Inquiries '' on XX/XX/XXXX from XXXX : I have submitted multiple hard inquiry disputes to all three credit bureaus, XXXX, XXXX, and Transition XXXX regarding this and have settled with putting my accounts on credit freeze. I did not apply or request for any account, business, or credit card with the following entities on these dates therefore, do not recognize these to be hard inquiries under my account. These hard inquiries have made it so that my credit score has declined, which has caused my mental wellbeing as I aspire to be a first-time homebuyer soon. The hard inquiries are stated to last until the year of XXXX. Additionally, I have been receiving letters in the mail stating that my account was not approved due to " multiple inquiries associated with your credit '' - this is untrue as I did not apply to these entities. This raises another concern of the potential of applying for new credit cards I have been considering. I need these hard inquiries to be removed from my record as they were not mine. | Company has responded to the consumer and the CFPB and chooses not to provide a public response | WELLS FARGO & COMPANY | MA | 02122 | None | Consent provided | Web | 12/30/24 | Closed with explanation | Yes | N/A | 11324552 |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment