Skip to content

Instantly share code, notes, and snippets.

@calilisantos
Created December 21, 2024 17:16
Show Gist options
  • Select an option

  • Save calilisantos/a4f4f04bea19df43fd9b87bd1e573863 to your computer and use it in GitHub Desktop.

Select an option

Save calilisantos/a4f4f04bea19df43fd9b87bd1e573863 to your computer and use it in GitHub Desktop.
Quick guide to nulls in spark
Display the source blob
Display the rendered blob
Raw
{
"cells": [
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "98c03c28-cdd3-4123-9e95-ae27aa802ee0",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## Dependencies"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "1cb935ad-0227-409f-9b59-da8b3cf79345",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [],
"source": [
"from numpy import NaN\n",
"from pyspark.sql import functions as F, types as T"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "c9298665-2fac-4e11-a212-d8bbc4d4ad44",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## Sample"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "287d4338-e639-4a5b-9a64-9db626afe507",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>null</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>null</td></tr><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr><tr><td>null</td><td>null</td><td>NaN</td><td>null</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"John",
"Lennon",
1.0,
true
],
[
"John",
"Lennon",
2.0,
null
],
[
"Paul",
"Maca",
"NaN",
true
],
[
"Paul",
"Maca",
2.0,
false
],
[
"Eric",
"Clapton",
1.0,
null
],
[
"Eric",
"",
null,
true
],
[
null,
null,
"NaN",
null
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "last_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "question",
"type": "\"float\""
},
{
"metadata": "{}",
"name": "answer",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"data = [\n",
" [\"John\", \"Lennon\", 1.0, True],\n",
" [\"John\", \"Lennon\", 2.0, None],\n",
" [\"Paul\", \"Maca\", NaN, True],\n",
" [\"Paul\", \"Maca\", 2.0, False],\n",
" [\"Eric\", \"Clapton\", 1.0, None],\n",
" [\"Eric\", \"\", None, True],\n",
" [None, None, NaN, None]\n",
"]\n",
"\n",
"schema = T.StructType([\n",
" T.StructField(name=\"name\", dataType=T.StringType(), nullable=True),\n",
" T.StructField(name=\"last_name\", dataType=T.StringType(), nullable=True),\n",
" T.StructField(name=\"question\", dataType=T.FloatType(), nullable=True),\n",
" T.StructField(name=\"answer\", dataType=T.BooleanType(), nullable=True),\n",
"])\n",
"\n",
"dataframe = spark.createDataFrame(data=data, schema=schema)\n",
"dataframe.display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "1daa4b10-8bd0-43c3-a0c0-d809c56d14f6",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## Verificação\n",
"* #### isNull()\n",
"* #### isNotNull()\n",
"* #### isnan()\n",
"* #### eqNullSafe(\"valor_a_comparar\")\n",
" * **Porque usar ele?**\n",
" * NaN é diferente de nulo\n",
" * com os outro métodos comparando qualquer valor com nulo, o resultado é nulo"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "ae5ba318-91c2-4700-b3db-49d87a97165d",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>question</th><th>isNull</th><th>isNotNull</th><th>isNaN</th><th>=1.0</th><th>really=1.0</th><th>reallyNull</th></tr></thead><tbody><tr><td>1.0</td><td>false</td><td>true</td><td>false</td><td>true</td><td>true</td><td>false</td></tr><tr><td>2.0</td><td>false</td><td>true</td><td>false</td><td>false</td><td>false</td><td>false</td></tr><tr><td>NaN</td><td>false</td><td>true</td><td>true</td><td>false</td><td>false</td><td>false</td></tr><tr><td>2.0</td><td>false</td><td>true</td><td>false</td><td>false</td><td>false</td><td>false</td></tr><tr><td>1.0</td><td>false</td><td>true</td><td>false</td><td>true</td><td>true</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>false</td><td>null</td><td>false</td><td>true</td></tr><tr><td>NaN</td><td>false</td><td>true</td><td>true</td><td>false</td><td>false</td><td>false</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
1.0,
false,
true,
false,
true,
true,
false
],
[
2.0,
false,
true,
false,
false,
false,
false
],
[
"NaN",
false,
true,
true,
false,
false,
false
],
[
2.0,
false,
true,
false,
false,
false,
false
],
[
1.0,
false,
true,
false,
true,
true,
false
],
[
null,
true,
false,
false,
null,
false,
true
],
[
"NaN",
false,
true,
true,
false,
false,
false
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "question",
"type": "\"float\""
},
{
"metadata": "{}",
"name": "isNull",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "isNotNull",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "isNaN",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "=1.0",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "really=1.0",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "reallyNull",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"(\n",
" dataframe\n",
" .select(\n",
" \"question\",\n",
" F.col(\"question\").isNull().alias(\"isNull\"), \n",
" F.col(\"question\").isNotNull().alias(\"isNotNull\"),\n",
" F.col(\"question\").isNaN().alias(\"isNaN\"),\n",
" (F.col(\"question\") == 1.0).alias(\"=1.0\"),\n",
" F.col(\"question\").eqNullSafe(1.0).alias(\"really=1.0\"),\n",
" F.col(\"question\").eqNullSafe(None).alias(\"reallyNull\"),\n",
" )\n",
" .display()\n",
")"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "491160c6-4551-4396-9b4d-bea6003e5772",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>answer</th><th>isNull</th><th>isNotNull</th><th>isTrue</th><th>reallyTrue</th><th>reallyNaN</th></tr></thead><tbody><tr><td>true</td><td>false</td><td>true</td><td>true</td><td>true</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>null</td><td>false</td><td>false</td></tr><tr><td>true</td><td>false</td><td>true</td><td>true</td><td>true</td><td>false</td></tr><tr><td>false</td><td>false</td><td>true</td><td>false</td><td>false</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>null</td><td>false</td><td>false</td></tr><tr><td>true</td><td>false</td><td>true</td><td>true</td><td>true</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>null</td><td>false</td><td>false</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
true,
false,
true,
true,
true,
false
],
[
null,
true,
false,
null,
false,
false
],
[
true,
false,
true,
true,
true,
false
],
[
false,
false,
true,
false,
false,
false
],
[
null,
true,
false,
null,
false,
false
],
[
true,
false,
true,
true,
true,
false
],
[
null,
true,
false,
null,
false,
false
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "answer",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "isNull",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "isNotNull",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "isTrue",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "reallyTrue",
"type": "\"boolean\""
},
{
"metadata": "{}",
"name": "reallyNaN",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"(\n",
" dataframe\n",
" .select(\n",
" \"answer\",\n",
" F.col(\"answer\").isNull().alias(\"isNull\"), \n",
" F.col(\"answer\").isNotNull().alias(\"isNotNull\"),\n",
" # F.col(\"answer\").isNaN().alias(\"isNaN\"),\n",
" (F.col(\"answer\") == True).alias(\"isTrue\"),\n",
" F.col(\"answer\").eqNullSafe(True).alias(\"reallyTrue\"),\n",
" F.col(\"answer\").eqNullSafe(NaN).alias(\"reallyNaN\"),\n",
" )\n",
").display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "4b041a7d-a657-4849-90c9-e34db911e489",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## Ordenação\n",
"* #### asc_nulls_first(): \n",
" * deixa os valores em ordem ascendente com os nulos primeiro\n",
"* #### asc_nulls_last(): \n",
" * deixa os valores em ordem ascendente com os nulos por último\n",
"* #### desc_nulls_first(): \n",
" * deixa os valores em ordem descendente com os nulos primeiro\n",
"* #### desc_nulls_last(): \n",
" * deixa os valores em ordem descendente com os nulos por último"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "8419776c-1969-40c3-a01e-2152eafe3824",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>null</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>null</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>null</td><td>null</td><td>NaN</td><td>null</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"Eric",
"",
null,
true
],
[
"John",
"Lennon",
1.0,
true
],
[
"Eric",
"Clapton",
1.0,
null
],
[
"Paul",
"Maca",
2.0,
false
],
[
"John",
"Lennon",
2.0,
null
],
[
"Paul",
"Maca",
"NaN",
true
],
[
null,
null,
"NaN",
null
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "last_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "question",
"type": "\"float\""
},
{
"metadata": "{}",
"name": "answer",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"(\n",
" dataframe\n",
" .sort(\n",
" F.asc_nulls_first(\"question\")\n",
" )\n",
").display()"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "cc2e5f9e-57ee-4018-81b3-e135efa8bd35",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>null</td><td>null</td><td>NaN</td><td>null</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>null</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>null</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"John",
"Lennon",
1.0,
true
],
[
"Paul",
"Maca",
"NaN",
true
],
[
"Eric",
"",
null,
true
],
[
"Paul",
"Maca",
2.0,
false
],
[
null,
null,
"NaN",
null
],
[
"John",
"Lennon",
2.0,
null
],
[
"Eric",
"Clapton",
1.0,
null
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "last_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "question",
"type": "\"float\""
},
{
"metadata": "{}",
"name": "answer",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"(\n",
" dataframe\n",
" .sort(\n",
" F.col(\"answer\").desc_nulls_last()\n",
" )\n",
").display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "2f4533ce-19a9-44e7-9309-4ef836dc49b7",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"## Transformação"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "1ab5aa59-5512-436a-8557-67b94da6a8b9",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"* #### na.drop() ou dropna(): \n",
" * **how**: \n",
" * aceita o argumento `any`, tirando a linha se qualquer valor dela for nulo\n",
" * aceita o argumento `all`, tirando a linha se todos seus valores forem nulo\n",
" * **thresh**: sobrescreve `how`, tirando a linha se a quantidade de valores não nulo for menor que o valor passado\n",
" * **subset**: Especifica a(s) coluna(s) a ser(em) verificado o valor nulo"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "3f11674b-399e-4467-af85-34c6532a599b",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"John",
"Lennon",
1.0,
true
],
[
"Paul",
"Maca",
"NaN",
true
],
[
"Paul",
"Maca",
2.0,
false
],
[
"Eric",
"",
null,
true
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "last_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "question",
"type": "\"float\""
},
{
"metadata": "{}",
"name": "answer",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"(\n",
" dataframe\n",
" .dropna(\n",
" # how=\"all\", # any or all\n",
" # thresh=3,\n",
" subset=[\"answer\"]\n",
" )\n",
").display()"
]
},
{
"cell_type": "markdown",
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "17ae3cb5-e489-4d84-af58-9605e79318c2",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"source": [
"* ### na.fill() ou fillna():\n",
" * **subset**: Especifica a(s) coluna(s) a ser(em) verificado o valor nulo. \n",
" * **IMPORTANTE: A coluna deve ter o mesmo tipo do valor a ser preenchido**.\n",
" * **values**: valor a preencher no lugar dos valores nulos"
]
},
{
"cell_type": "code",
"execution_count": 0,
"metadata": {
"application/vnd.databricks.v1+cell": {
"cellMetadata": {
"byteLimit": 2048000,
"rowLimit": 10000
},
"inputWidgets": {},
"nuid": "37e44069-d1fc-4523-875e-5828ba05f8e7",
"showTitle": false,
"tableResultSettingsMap": {},
"title": ""
}
},
"outputs": [
{
"output_type": "display_data",
"data": {
"text/html": [
"<style scoped>\n",
" .table-result-container {\n",
" max-height: 300px;\n",
" overflow: auto;\n",
" }\n",
" table, th, td {\n",
" border: 1px solid black;\n",
" border-collapse: collapse;\n",
" }\n",
" th, td {\n",
" padding: 5px;\n",
" }\n",
" th {\n",
" text-align: left;\n",
" }\n",
"</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>1.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>true</td></tr><tr><td>Eric</td><td></td><td>1.0</td><td>true</td></tr><tr><td>null</td><td>null</td><td>1.0</td><td>true</td></tr></tbody></table></div>"
]
},
"metadata": {
"application/vnd.databricks.v1+output": {
"addedWidgets": {},
"aggData": [],
"aggError": "",
"aggOverflow": false,
"aggSchema": [],
"aggSeriesLimitReached": false,
"aggType": "",
"arguments": {},
"columnCustomDisplayInfos": {},
"data": [
[
"John",
"Lennon",
1.0,
true
],
[
"John",
"Lennon",
2.0,
true
],
[
"Paul",
"Maca",
1.0,
true
],
[
"Paul",
"Maca",
2.0,
false
],
[
"Eric",
"Clapton",
1.0,
true
],
[
"Eric",
"",
1.0,
true
],
[
null,
null,
1.0,
true
]
],
"datasetInfos": [],
"dbfsResultPath": null,
"isJsonSchema": true,
"metadata": {},
"overflow": false,
"plotOptions": {
"customPlotOptions": {},
"displayType": "table",
"pivotAggregation": null,
"pivotColumns": null,
"xColumns": null,
"yColumns": null
},
"removedWidgets": [],
"schema": [
{
"metadata": "{}",
"name": "name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "last_name",
"type": "\"string\""
},
{
"metadata": "{}",
"name": "question",
"type": "\"float\""
},
{
"metadata": "{}",
"name": "answer",
"type": "\"boolean\""
}
],
"type": "table"
}
},
"output_type": "display_data"
}
],
"source": [
"(\n",
" dataframe\n",
" .fillna(\n",
" value=1.0,#{\"question\": 1.0, \"answer\": True},\n",
" # subset=\"question\"\n",
" )\n",
").display()"
]
}
],
"metadata": {
"application/vnd.databricks.v1+notebook": {
"computePreferences": null,
"dashboards": [],
"environmentMetadata": null,
"language": "python",
"notebookMetadata": {
"pythonIndentUnit": 4
},
"notebookName": "null_in_spark",
"widgets": {}
}
},
"nbformat": 4,
"nbformat_minor": 0
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment