Skip to content

Instantly share code, notes, and snippets.

@glen1995
Created August 9, 2023 22:39
Show Gist options
  • Select an option

  • Save glen1995/bb9b1d0cef603986bdf341087dae376f to your computer and use it in GitHub Desktop.

Select an option

Save glen1995/bb9b1d0cef603986bdf341087dae376f to your computer and use it in GitHub Desktop.
Error Cleaning
Display the source blob
Display the rendered blob
Raw
{
"metadata": {
"kernelspec": {
"name": "python",
"display_name": "Python (Pyodide)",
"language": "python"
},
"language_info": {
"codemirror_mode": {
"name": "python",
"version": 3
},
"file_extension": ".py",
"mimetype": "text/x-python",
"name": "python",
"nbconvert_exporter": "python",
"pygments_lexer": "ipython3",
"version": "3.8"
}
},
"nbformat_minor": 4,
"nbformat": 4,
"cells": [
{
"cell_type": "markdown",
"source": "# Error Cleaning",
"metadata": {}
},
{
"cell_type": "code",
"source": "import re\nimport pandas as pd",
"metadata": {
"trusted": true
},
"execution_count": 132,
"outputs": []
},
{
"cell_type": "code",
"source": "columns = ['id', 'url', 'job_id', 'customer_id', 'error_message']\ndf = pd.read_csv(\"error_collections.csv\", usecols=columns)\npd.options.display.max_colwidth = 150\ndf[\"error_message2\"] = df[\"error_message\"]\n#pd.set_option('max_colwidth', None)",
"metadata": {
"trusted": true
},
"execution_count": 136,
"outputs": []
},
{
"cell_type": "code",
"source": "df.info()",
"metadata": {
"trusted": true
},
"execution_count": 137,
"outputs": [
{
"name": "stdout",
"text": "<class 'pandas.core.frame.DataFrame'>\nRangeIndex: 8098 entries, 0 to 8097\nData columns (total 6 columns):\n # Column Non-Null Count Dtype \n--- ------ -------------- ----- \n 0 id 8098 non-null int64 \n 1 url 8098 non-null object\n 2 job_id 8098 non-null int64 \n 3 customer_id 8098 non-null int64 \n 4 error_message 8098 non-null object\n 5 error_message2 8098 non-null object\ndtypes: int64(3), object(3)\nmemory usage: 284.8+ KB\n",
"output_type": "stream"
}
]
},
{
"cell_type": "code",
"source": "replacements = {\n r'ENOENT: no such file or directory, open .*': \"ENOENT: no such file or directory\",\n r'net::ERR_TUNNEL_CONNECTION_FAILED .*': \"net::ERR_TUNNEL_CONNECTION_FAILED\",\n r'net::ERR_TOO_MANY_REDIRECTS .*': \"net::ERR_TOO_MANY_REDIRECTS\",\n r'net::ERR_UNEXPECTED_PROXY_AUTH .*': \"net::ERR_UNEXPECTED_PROXY_AUTH\",\n r'net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH .*': \"net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH\",\n r'Evaluation failed: TypeError: Failed to fetch.*': \"TypeError\",\n r'Error: net::ERR_CONNECTION_RESET .*': \"Error: net::ERR_CONNECTION_RESET\",\n r'Maximum call stack .*': \"Maximum call stack size exceeded\",\n r'.*extract_area: parameter width not set.*': \"extract_area: parameter width not set\",\n r'escapeHatch went BOOM! Forced to exit with error: ProtocolError:.*': \"Protocol error (Runtime.evaluate)\",\n r'extract_area: parameter width.*\\n': \"extract_area: parameter width not set\",\n r'extract_area: parameter height not set\\n': \"extract_area: parameter height not set\",\n r'extract_area: parameter width.*\\n.*\\n': \"extract_area: parameter width not set\",\n r'Retries exceeded the limit of 0 with error: [Error: extract_area: parameter height not set\\n]': \"extract_area: parameter height not set\",\n r'Retries exceeded the limit of 0 with error: [Error: extract_area: parameter height not set]': \"extract_area: parameter height not set\"\n}\n\ndf[\"error_message\"] = df[\"error_message\"].replace(replacements, regex=True)\ndef extract_substring(row):\n pattern = r'net::(.*?)\\s+at'\n matches = re.findall(pattern, row['error_message'])\n return ', '.join(matches) if matches else row['error_message']\n\ndf['extracted'] = df.apply(extract_substring, axis=1)",
"metadata": {
"trusted": true
},
"execution_count": 138,
"outputs": []
},
{
"cell_type": "code",
"source": "filtered_df = df[df['extracted'].notnull()]\ns = filtered_df[[\"id\",\"error_message\" ,\"extracted\"]].groupby([\"extracted\"], dropna=False).count().sort_values(by='id', ascending=False)",
"metadata": {
"trusted": true
},
"execution_count": 139,
"outputs": []
},
{
"cell_type": "code",
"source": "s[s.id > 20]",
"metadata": {
"trusted": true
},
"execution_count": 140,
"outputs": [
{
"execution_count": 140,
"output_type": "execute_result",
"data": {
"text/plain": " id \\\nextracted \nERR_TOO_MANY_RETRIES 2186 \nextract_area: parameter height not set 610 \nMaximum call stack size exceeded 559 \nextract_area: parameter width not setextract_area: parameter height not set 528 \nProtocol error (Runtime.evaluate)\\n at /home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:329:24\\n at... 468 \nextract_area: parameter width not setextract_area: parameter height not set] 420 \nRetries exceeded the limit of 0 with error: [Error: extract_area: parameter height not set] 399 \nERR_HTTP_RESPONSE_CODE_FAILURE 398 \nERR_NAME_NOT_RESOLVED 252 \nerror getting HTML Tree 247 \nextract_area: parameter width not set] 239 \nnet::ERR_TUNNEL_CONNECTION_FAILED 221 \nInvalid array length 220 \nERR_CONNECTION_RESET 115 \nError with POST request to jobServiceV2 API /logupload 108 \nProtocol error (DOM.describeNode): Cannot find context with specified id 104 \nProtocol error (Runtime.callFunctionOn): Execution context was destroyed. 98 \nExecution context was destroyed, most likely because of a navigation. 96 \nextract_area: parameter width not set 92 \nnet::ERR_SSL_VERSION_OR_CIPHER_MISMATCH 87 \nnet::ERR_TOO_MANY_REDIRECTS 67 \nProtocol error (DOM.resolveNode): Node with given id does not belong to the document 52 \nFailed to generate compare input 42 \nCannot read properties of undefined (reading 'key') 40 \nerror getting text content 40 \nProtocol error (Runtime.evaluate)\\n at /home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:329:24\\n at... 35 \nPoppler CLI tool error 29 \n\n error_message \nextracted \nERR_TOO_MANY_RETRIES 2186 \nextract_area: parameter height not set 610 \nMaximum call stack size exceeded 559 \nextract_area: parameter width not setextract_area: parameter height not set 528 \nProtocol error (Runtime.evaluate)\\n at /home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:329:24\\n at... 468 \nextract_area: parameter width not setextract_area: parameter height not set] 420 \nRetries exceeded the limit of 0 with error: [Error: extract_area: parameter height not set] 399 \nERR_HTTP_RESPONSE_CODE_FAILURE 398 \nERR_NAME_NOT_RESOLVED 252 \nerror getting HTML Tree 247 \nextract_area: parameter width not set] 239 \nnet::ERR_TUNNEL_CONNECTION_FAILED 221 \nInvalid array length 220 \nERR_CONNECTION_RESET 115 \nError with POST request to jobServiceV2 API /logupload 108 \nProtocol error (DOM.describeNode): Cannot find context with specified id 104 \nProtocol error (Runtime.callFunctionOn): Execution context was destroyed. 98 \nExecution context was destroyed, most likely because of a navigation. 96 \nextract_area: parameter width not set 92 \nnet::ERR_SSL_VERSION_OR_CIPHER_MISMATCH 87 \nnet::ERR_TOO_MANY_REDIRECTS 67 \nProtocol error (DOM.resolveNode): Node with given id does not belong to the document 52 \nFailed to generate compare input 42 \nCannot read properties of undefined (reading 'key') 40 \nerror getting text content 40 \nProtocol error (Runtime.evaluate)\\n at /home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:329:24\\n at... 35 \nPoppler CLI tool error 29 ",
"text/html": "<div>\n<style scoped>\n .dataframe tbody tr th:only-of-type {\n vertical-align: middle;\n }\n\n .dataframe tbody tr th {\n vertical-align: top;\n }\n\n .dataframe thead th {\n text-align: right;\n }\n</style>\n<table border=\"1\" class=\"dataframe\">\n <thead>\n <tr style=\"text-align: right;\">\n <th></th>\n <th>id</th>\n <th>error_message</th>\n </tr>\n <tr>\n <th>extracted</th>\n <th></th>\n <th></th>\n </tr>\n </thead>\n <tbody>\n <tr>\n <th>ERR_TOO_MANY_RETRIES</th>\n <td>2186</td>\n <td>2186</td>\n </tr>\n <tr>\n <th>extract_area: parameter height not set</th>\n <td>610</td>\n <td>610</td>\n </tr>\n <tr>\n <th>Maximum call stack size exceeded</th>\n <td>559</td>\n <td>559</td>\n </tr>\n <tr>\n <th>extract_area: parameter width not setextract_area: parameter height not set</th>\n <td>528</td>\n <td>528</td>\n </tr>\n <tr>\n <th>Protocol error (Runtime.evaluate)\\n at /home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:329:24\\n at new Promise (&lt;anonymous&gt;)\\n at CDPSessionImpl.send (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:325:16)\\n at ExecutionContext._ExecutionContext_evaluate (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:190:14)\\n at ExecutionContext.evaluate (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:125:113)\\n at IsolatedWorld.evaluate (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/IsolatedWorld.js:150:24)\\n at runMicrotasks (&lt;anonymous&gt;)\\n at processTicksAndRejections (node:internal/process/task_queues:96:5)</th>\n <td>468</td>\n <td>468</td>\n </tr>\n <tr>\n <th>extract_area: parameter width not setextract_area: parameter height not set]</th>\n <td>420</td>\n <td>420</td>\n </tr>\n <tr>\n <th>Retries exceeded the limit of 0 with error: [Error: extract_area: parameter height not set]</th>\n <td>399</td>\n <td>399</td>\n </tr>\n <tr>\n <th>ERR_HTTP_RESPONSE_CODE_FAILURE</th>\n <td>398</td>\n <td>398</td>\n </tr>\n <tr>\n <th>ERR_NAME_NOT_RESOLVED</th>\n <td>252</td>\n <td>252</td>\n </tr>\n <tr>\n <th>error getting HTML Tree</th>\n <td>247</td>\n <td>247</td>\n </tr>\n <tr>\n <th>extract_area: parameter width not set]</th>\n <td>239</td>\n <td>239</td>\n </tr>\n <tr>\n <th>net::ERR_TUNNEL_CONNECTION_FAILED</th>\n <td>221</td>\n <td>221</td>\n </tr>\n <tr>\n <th>Invalid array length</th>\n <td>220</td>\n <td>220</td>\n </tr>\n <tr>\n <th>ERR_CONNECTION_RESET</th>\n <td>115</td>\n <td>115</td>\n </tr>\n <tr>\n <th>Error with POST request to jobServiceV2 API /logupload</th>\n <td>108</td>\n <td>108</td>\n </tr>\n <tr>\n <th>Protocol error (DOM.describeNode): Cannot find context with specified id</th>\n <td>104</td>\n <td>104</td>\n </tr>\n <tr>\n <th>Protocol error (Runtime.callFunctionOn): Execution context was destroyed.</th>\n <td>98</td>\n <td>98</td>\n </tr>\n <tr>\n <th>Execution context was destroyed, most likely because of a navigation.</th>\n <td>96</td>\n <td>96</td>\n </tr>\n <tr>\n <th>extract_area: parameter width not set</th>\n <td>92</td>\n <td>92</td>\n </tr>\n <tr>\n <th>net::ERR_SSL_VERSION_OR_CIPHER_MISMATCH</th>\n <td>87</td>\n <td>87</td>\n </tr>\n <tr>\n <th>net::ERR_TOO_MANY_REDIRECTS</th>\n <td>67</td>\n <td>67</td>\n </tr>\n <tr>\n <th>Protocol error (DOM.resolveNode): Node with given id does not belong to the document</th>\n <td>52</td>\n <td>52</td>\n </tr>\n <tr>\n <th>Failed to generate compare input</th>\n <td>42</td>\n <td>42</td>\n </tr>\n <tr>\n <th>Cannot read properties of undefined (reading 'key')</th>\n <td>40</td>\n <td>40</td>\n </tr>\n <tr>\n <th>error getting text content</th>\n <td>40</td>\n <td>40</td>\n </tr>\n <tr>\n <th>Protocol error (Runtime.evaluate)\\n at /home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:329:24\\n at new Promise (&lt;anonymous&gt;)\\n at CDPSessionImpl.send (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/Connection.js:325:16)\\n at ExecutionContext._ExecutionContext_evaluate (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:190:14)\\n at ExecutionContext.evaluate (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/ExecutionContext.js:125:113)\\n at IsolatedWorld.evaluate (/home/screenshot-worker/node_modules/puppeteer-core/lib/cjs/puppeteer/common/IsolatedWorld.js:150:24)\\n at processTicksAndRejections (node:internal/process/task_queues:96:5)</th>\n <td>35</td>\n <td>35</td>\n </tr>\n <tr>\n <th>Poppler CLI tool error</th>\n <td>29</td>\n <td>29</td>\n </tr>\n </tbody>\n</table>\n</div>"
},
"metadata": {}
}
]
},
{
"cell_type": "code",
"source": "df.to_csv('cleaned_error.csv', index=False)\ngrouped_df = df[[\"id\",\"error_message\" ,\"extracted\"]].groupby([ \"extracted\"], dropna=False).count()\ngrouped_df.reset_index().to_csv('extracted_errors.csv', index=False)",
"metadata": {
"trusted": true
},
"execution_count": 24,
"outputs": []
}
]
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment