Created
December 21, 2024 17:16
-
-
Save calilisantos/a4f4f04bea19df43fd9b87bd1e573863 to your computer and use it in GitHub Desktop.
Quick guide to nulls in spark
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
| { | |
| "cells": [ | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "98c03c28-cdd3-4123-9e95-ae27aa802ee0", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## Dependencies" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "1cb935ad-0227-409f-9b59-da8b3cf79345", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [], | |
| "source": [ | |
| "from numpy import NaN\n", | |
| "from pyspark.sql import functions as F, types as T" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "c9298665-2fac-4e11-a212-d8bbc4d4ad44", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## Sample" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "287d4338-e639-4a5b-9a64-9db626afe507", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>null</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>null</td></tr><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr><tr><td>null</td><td>null</td><td>NaN</td><td>null</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "John", | |
| "Lennon", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "John", | |
| "Lennon", | |
| 2.0, | |
| null | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| "NaN", | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| 2.0, | |
| false | |
| ], | |
| [ | |
| "Eric", | |
| "Clapton", | |
| 1.0, | |
| null | |
| ], | |
| [ | |
| "Eric", | |
| "", | |
| null, | |
| true | |
| ], | |
| [ | |
| null, | |
| null, | |
| "NaN", | |
| null | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "last_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "question", | |
| "type": "\"float\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "answer", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "data = [\n", | |
| " [\"John\", \"Lennon\", 1.0, True],\n", | |
| " [\"John\", \"Lennon\", 2.0, None],\n", | |
| " [\"Paul\", \"Maca\", NaN, True],\n", | |
| " [\"Paul\", \"Maca\", 2.0, False],\n", | |
| " [\"Eric\", \"Clapton\", 1.0, None],\n", | |
| " [\"Eric\", \"\", None, True],\n", | |
| " [None, None, NaN, None]\n", | |
| "]\n", | |
| "\n", | |
| "schema = T.StructType([\n", | |
| " T.StructField(name=\"name\", dataType=T.StringType(), nullable=True),\n", | |
| " T.StructField(name=\"last_name\", dataType=T.StringType(), nullable=True),\n", | |
| " T.StructField(name=\"question\", dataType=T.FloatType(), nullable=True),\n", | |
| " T.StructField(name=\"answer\", dataType=T.BooleanType(), nullable=True),\n", | |
| "])\n", | |
| "\n", | |
| "dataframe = spark.createDataFrame(data=data, schema=schema)\n", | |
| "dataframe.display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "1daa4b10-8bd0-43c3-a0c0-d809c56d14f6", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## Verificação\n", | |
| "* #### isNull()\n", | |
| "* #### isNotNull()\n", | |
| "* #### isnan()\n", | |
| "* #### eqNullSafe(\"valor_a_comparar\")\n", | |
| " * **Porque usar ele?**\n", | |
| " * NaN é diferente de nulo\n", | |
| " * com os outro métodos comparando qualquer valor com nulo, o resultado é nulo" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "ae5ba318-91c2-4700-b3db-49d87a97165d", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>question</th><th>isNull</th><th>isNotNull</th><th>isNaN</th><th>=1.0</th><th>really=1.0</th><th>reallyNull</th></tr></thead><tbody><tr><td>1.0</td><td>false</td><td>true</td><td>false</td><td>true</td><td>true</td><td>false</td></tr><tr><td>2.0</td><td>false</td><td>true</td><td>false</td><td>false</td><td>false</td><td>false</td></tr><tr><td>NaN</td><td>false</td><td>true</td><td>true</td><td>false</td><td>false</td><td>false</td></tr><tr><td>2.0</td><td>false</td><td>true</td><td>false</td><td>false</td><td>false</td><td>false</td></tr><tr><td>1.0</td><td>false</td><td>true</td><td>false</td><td>true</td><td>true</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>false</td><td>null</td><td>false</td><td>true</td></tr><tr><td>NaN</td><td>false</td><td>true</td><td>true</td><td>false</td><td>false</td><td>false</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| 1.0, | |
| false, | |
| true, | |
| false, | |
| true, | |
| true, | |
| false | |
| ], | |
| [ | |
| 2.0, | |
| false, | |
| true, | |
| false, | |
| false, | |
| false, | |
| false | |
| ], | |
| [ | |
| "NaN", | |
| false, | |
| true, | |
| true, | |
| false, | |
| false, | |
| false | |
| ], | |
| [ | |
| 2.0, | |
| false, | |
| true, | |
| false, | |
| false, | |
| false, | |
| false | |
| ], | |
| [ | |
| 1.0, | |
| false, | |
| true, | |
| false, | |
| true, | |
| true, | |
| false | |
| ], | |
| [ | |
| null, | |
| true, | |
| false, | |
| false, | |
| null, | |
| false, | |
| true | |
| ], | |
| [ | |
| "NaN", | |
| false, | |
| true, | |
| true, | |
| false, | |
| false, | |
| false | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "question", | |
| "type": "\"float\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "isNull", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "isNotNull", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "isNaN", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "=1.0", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "really=1.0", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "reallyNull", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " dataframe\n", | |
| " .select(\n", | |
| " \"question\",\n", | |
| " F.col(\"question\").isNull().alias(\"isNull\"), \n", | |
| " F.col(\"question\").isNotNull().alias(\"isNotNull\"),\n", | |
| " F.col(\"question\").isNaN().alias(\"isNaN\"),\n", | |
| " (F.col(\"question\") == 1.0).alias(\"=1.0\"),\n", | |
| " F.col(\"question\").eqNullSafe(1.0).alias(\"really=1.0\"),\n", | |
| " F.col(\"question\").eqNullSafe(None).alias(\"reallyNull\"),\n", | |
| " )\n", | |
| " .display()\n", | |
| ")" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "491160c6-4551-4396-9b4d-bea6003e5772", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>answer</th><th>isNull</th><th>isNotNull</th><th>isTrue</th><th>reallyTrue</th><th>reallyNaN</th></tr></thead><tbody><tr><td>true</td><td>false</td><td>true</td><td>true</td><td>true</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>null</td><td>false</td><td>false</td></tr><tr><td>true</td><td>false</td><td>true</td><td>true</td><td>true</td><td>false</td></tr><tr><td>false</td><td>false</td><td>true</td><td>false</td><td>false</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>null</td><td>false</td><td>false</td></tr><tr><td>true</td><td>false</td><td>true</td><td>true</td><td>true</td><td>false</td></tr><tr><td>null</td><td>true</td><td>false</td><td>null</td><td>false</td><td>false</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| true, | |
| false, | |
| true, | |
| true, | |
| true, | |
| false | |
| ], | |
| [ | |
| null, | |
| true, | |
| false, | |
| null, | |
| false, | |
| false | |
| ], | |
| [ | |
| true, | |
| false, | |
| true, | |
| true, | |
| true, | |
| false | |
| ], | |
| [ | |
| false, | |
| false, | |
| true, | |
| false, | |
| false, | |
| false | |
| ], | |
| [ | |
| null, | |
| true, | |
| false, | |
| null, | |
| false, | |
| false | |
| ], | |
| [ | |
| true, | |
| false, | |
| true, | |
| true, | |
| true, | |
| false | |
| ], | |
| [ | |
| null, | |
| true, | |
| false, | |
| null, | |
| false, | |
| false | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "answer", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "isNull", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "isNotNull", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "isTrue", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "reallyTrue", | |
| "type": "\"boolean\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "reallyNaN", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " dataframe\n", | |
| " .select(\n", | |
| " \"answer\",\n", | |
| " F.col(\"answer\").isNull().alias(\"isNull\"), \n", | |
| " F.col(\"answer\").isNotNull().alias(\"isNotNull\"),\n", | |
| " # F.col(\"answer\").isNaN().alias(\"isNaN\"),\n", | |
| " (F.col(\"answer\") == True).alias(\"isTrue\"),\n", | |
| " F.col(\"answer\").eqNullSafe(True).alias(\"reallyTrue\"),\n", | |
| " F.col(\"answer\").eqNullSafe(NaN).alias(\"reallyNaN\"),\n", | |
| " )\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "4b041a7d-a657-4849-90c9-e34db911e489", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## Ordenação\n", | |
| "* #### asc_nulls_first(): \n", | |
| " * deixa os valores em ordem ascendente com os nulos primeiro\n", | |
| "* #### asc_nulls_last(): \n", | |
| " * deixa os valores em ordem ascendente com os nulos por último\n", | |
| "* #### desc_nulls_first(): \n", | |
| " * deixa os valores em ordem descendente com os nulos primeiro\n", | |
| "* #### desc_nulls_last(): \n", | |
| " * deixa os valores em ordem descendente com os nulos por último" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "8419776c-1969-40c3-a01e-2152eafe3824", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>null</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>null</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>null</td><td>null</td><td>NaN</td><td>null</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "Eric", | |
| "", | |
| null, | |
| true | |
| ], | |
| [ | |
| "John", | |
| "Lennon", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "Eric", | |
| "Clapton", | |
| 1.0, | |
| null | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| 2.0, | |
| false | |
| ], | |
| [ | |
| "John", | |
| "Lennon", | |
| 2.0, | |
| null | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| "NaN", | |
| true | |
| ], | |
| [ | |
| null, | |
| null, | |
| "NaN", | |
| null | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "last_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "question", | |
| "type": "\"float\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "answer", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " dataframe\n", | |
| " .sort(\n", | |
| " F.asc_nulls_first(\"question\")\n", | |
| " )\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "cc2e5f9e-57ee-4018-81b3-e135efa8bd35", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>null</td><td>null</td><td>NaN</td><td>null</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>null</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>null</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "John", | |
| "Lennon", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| "NaN", | |
| true | |
| ], | |
| [ | |
| "Eric", | |
| "", | |
| null, | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| 2.0, | |
| false | |
| ], | |
| [ | |
| null, | |
| null, | |
| "NaN", | |
| null | |
| ], | |
| [ | |
| "John", | |
| "Lennon", | |
| 2.0, | |
| null | |
| ], | |
| [ | |
| "Eric", | |
| "Clapton", | |
| 1.0, | |
| null | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "last_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "question", | |
| "type": "\"float\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "answer", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " dataframe\n", | |
| " .sort(\n", | |
| " F.col(\"answer\").desc_nulls_last()\n", | |
| " )\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "2f4533ce-19a9-44e7-9309-4ef836dc49b7", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "## Transformação" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "1ab5aa59-5512-436a-8557-67b94da6a8b9", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "* #### na.drop() ou dropna(): \n", | |
| " * **how**: \n", | |
| " * aceita o argumento `any`, tirando a linha se qualquer valor dela for nulo\n", | |
| " * aceita o argumento `all`, tirando a linha se todos seus valores forem nulo\n", | |
| " * **thresh**: sobrescreve `how`, tirando a linha se a quantidade de valores não nulo for menor que o valor passado\n", | |
| " * **subset**: Especifica a(s) coluna(s) a ser(em) verificado o valor nulo" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "3f11674b-399e-4467-af85-34c6532a599b", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>NaN</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>Eric</td><td></td><td>null</td><td>true</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "John", | |
| "Lennon", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| "NaN", | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| 2.0, | |
| false | |
| ], | |
| [ | |
| "Eric", | |
| "", | |
| null, | |
| true | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "last_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "question", | |
| "type": "\"float\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "answer", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " dataframe\n", | |
| " .dropna(\n", | |
| " # how=\"all\", # any or all\n", | |
| " # thresh=3,\n", | |
| " subset=[\"answer\"]\n", | |
| " )\n", | |
| ").display()" | |
| ] | |
| }, | |
| { | |
| "cell_type": "markdown", | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "17ae3cb5-e489-4d84-af58-9605e79318c2", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "source": [ | |
| "* ### na.fill() ou fillna():\n", | |
| " * **subset**: Especifica a(s) coluna(s) a ser(em) verificado o valor nulo. \n", | |
| " * **IMPORTANTE: A coluna deve ter o mesmo tipo do valor a ser preenchido**.\n", | |
| " * **values**: valor a preencher no lugar dos valores nulos" | |
| ] | |
| }, | |
| { | |
| "cell_type": "code", | |
| "execution_count": 0, | |
| "metadata": { | |
| "application/vnd.databricks.v1+cell": { | |
| "cellMetadata": { | |
| "byteLimit": 2048000, | |
| "rowLimit": 10000 | |
| }, | |
| "inputWidgets": {}, | |
| "nuid": "37e44069-d1fc-4523-875e-5828ba05f8e7", | |
| "showTitle": false, | |
| "tableResultSettingsMap": {}, | |
| "title": "" | |
| } | |
| }, | |
| "outputs": [ | |
| { | |
| "output_type": "display_data", | |
| "data": { | |
| "text/html": [ | |
| "<style scoped>\n", | |
| " .table-result-container {\n", | |
| " max-height: 300px;\n", | |
| " overflow: auto;\n", | |
| " }\n", | |
| " table, th, td {\n", | |
| " border: 1px solid black;\n", | |
| " border-collapse: collapse;\n", | |
| " }\n", | |
| " th, td {\n", | |
| " padding: 5px;\n", | |
| " }\n", | |
| " th {\n", | |
| " text-align: left;\n", | |
| " }\n", | |
| "</style><div class='table-result-container'><table class='table-result'><thead style='background-color: white'><tr><th>name</th><th>last_name</th><th>question</th><th>answer</th></tr></thead><tbody><tr><td>John</td><td>Lennon</td><td>1.0</td><td>true</td></tr><tr><td>John</td><td>Lennon</td><td>2.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>1.0</td><td>true</td></tr><tr><td>Paul</td><td>Maca</td><td>2.0</td><td>false</td></tr><tr><td>Eric</td><td>Clapton</td><td>1.0</td><td>true</td></tr><tr><td>Eric</td><td></td><td>1.0</td><td>true</td></tr><tr><td>null</td><td>null</td><td>1.0</td><td>true</td></tr></tbody></table></div>" | |
| ] | |
| }, | |
| "metadata": { | |
| "application/vnd.databricks.v1+output": { | |
| "addedWidgets": {}, | |
| "aggData": [], | |
| "aggError": "", | |
| "aggOverflow": false, | |
| "aggSchema": [], | |
| "aggSeriesLimitReached": false, | |
| "aggType": "", | |
| "arguments": {}, | |
| "columnCustomDisplayInfos": {}, | |
| "data": [ | |
| [ | |
| "John", | |
| "Lennon", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "John", | |
| "Lennon", | |
| 2.0, | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "Paul", | |
| "Maca", | |
| 2.0, | |
| false | |
| ], | |
| [ | |
| "Eric", | |
| "Clapton", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| "Eric", | |
| "", | |
| 1.0, | |
| true | |
| ], | |
| [ | |
| null, | |
| null, | |
| 1.0, | |
| true | |
| ] | |
| ], | |
| "datasetInfos": [], | |
| "dbfsResultPath": null, | |
| "isJsonSchema": true, | |
| "metadata": {}, | |
| "overflow": false, | |
| "plotOptions": { | |
| "customPlotOptions": {}, | |
| "displayType": "table", | |
| "pivotAggregation": null, | |
| "pivotColumns": null, | |
| "xColumns": null, | |
| "yColumns": null | |
| }, | |
| "removedWidgets": [], | |
| "schema": [ | |
| { | |
| "metadata": "{}", | |
| "name": "name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "last_name", | |
| "type": "\"string\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "question", | |
| "type": "\"float\"" | |
| }, | |
| { | |
| "metadata": "{}", | |
| "name": "answer", | |
| "type": "\"boolean\"" | |
| } | |
| ], | |
| "type": "table" | |
| } | |
| }, | |
| "output_type": "display_data" | |
| } | |
| ], | |
| "source": [ | |
| "(\n", | |
| " dataframe\n", | |
| " .fillna(\n", | |
| " value=1.0,#{\"question\": 1.0, \"answer\": True},\n", | |
| " # subset=\"question\"\n", | |
| " )\n", | |
| ").display()" | |
| ] | |
| } | |
| ], | |
| "metadata": { | |
| "application/vnd.databricks.v1+notebook": { | |
| "computePreferences": null, | |
| "dashboards": [], | |
| "environmentMetadata": null, | |
| "language": "python", | |
| "notebookMetadata": { | |
| "pythonIndentUnit": 4 | |
| }, | |
| "notebookName": "null_in_spark", | |
| "widgets": {} | |
| } | |
| }, | |
| "nbformat": 4, | |
| "nbformat_minor": 0 | |
| } |
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment