Skip to content

Instantly share code, notes, and snippets.

@calilisantos
Created November 28, 2024 23:38
Show Gist options
  • Select an option

  • Save calilisantos/b33b8efbdaacd55d41c562f32fc1f916 to your computer and use it in GitHub Desktop.

Select an option

Save calilisantos/b33b8efbdaacd55d41c562f32fc1f916 to your computer and use it in GitHub Desktop.
Function to check whether the values of a Spark DataFrame column can be cast to a given data type
from pyspark.sql import functions as F, SparkSession, types as T
# Reuse the active Spark session, or start one if none exists yet.
spark = SparkSession.builder.getOrCreate()

# Five rows holding an ISO-formatted date string, then one row ("A") that is
# not a date, so the type check further down has both outcomes to report.
data = [("2024-01-01",)] * 5 + [("A",)]

# Single nullable string column named "data".
schema = T.StructType([T.StructField("data", T.StringType(), nullable=True)])

df = spark.createDataFrame(data=data, schema=schema)
df.show()
def is_type(df, col, type):
    """Add a boolean column flagging rows whose `col` value can be cast to `type`.

    The new column is named ``<col>_is_<type name>`` (for example
    ``data_is_date``), where the type name comes from ``type.typeName()``.

    Args:
        df: Input DataFrame.
        col: Name of the column to inspect.
        type: A ``pyspark.sql.types.DataType`` instance, e.g. ``T.DateType()``.
            The parameter name shadows the ``type`` builtin; it is kept so
            existing keyword callers keep working.

    Returns:
        ``df`` with the additional boolean flag column appended.
    """
    # typeName() gives a stable lowercase name ("date", "integer", ...) rather
    # than depending on how str()/repr() of the DataType happens to be formatted.
    flag_name = f"{col}_is_{type.typeName()}"

    # A value counts as "of the type" when the cast produces a non-null result.
    # Null inputs stay null after the cast and are therefore flagged False.
    # NOTE(review): this relies on cast() returning null for values it cannot
    # convert; with ANSI SQL mode enabled the cast presumably raises instead -
    # confirm against the target cluster configuration.
    return df.withColumn(flag_name, F.col(col).cast(type).isNotNull())
# Flag which rows of the "data" column hold a value that casts to a date,
# then show the rows and inspect the resulting column types.
df = is_type(df=df, col="data", type=T.DateType())
df.show()
df.dtypes
// Databricks notebook source
// MAGIC %md
// MAGIC ## **Import**
// COMMAND ----------
import org.apache.spark.sql.{Column, DataFrame, Row}
import org.apache.spark.sql.functions.{col, when}
import org.apache.spark.sql.types.{DataType, DateType, IntegerType, StringType, StructField, StructType}
// COMMAND ----------
// MAGIC %md
// MAGIC ## **Sample**
// COMMAND ----------
// Sample rows as (date, value) string pairs. One row carries a date that is not
// a date ("A") and one carries a value that is not a number ("B"), so each check
// below has both outcomes to report.
val sample: Seq[Row] = Seq(
  ("2024-01-01", "500"),
  ("2024-01-01", "B"),
  ("2024-01-01", "800"),
  ("2024-01-01", "900"),
  ("2024-01-01", "600"),
  ("A", "1000")
).map { case (date, value) => Row(date, value) }

// Both columns are read as nullable strings; the checks cast them afterwards.
val sampleSchema: List[StructField] =
  List("date", "value").map(name => StructField(name, StringType, nullable = true))

val sampleDF: DataFrame = spark.createDataFrame(
  spark.sparkContext.parallelize(sample),
  StructType(sampleSchema)
)

display(sampleDF)
// COMMAND ----------
// MAGIC %md
// MAGIC ## **Function**
// COMMAND ----------
/**
 * Appends a boolean column telling, row by row, whether the value in `column`
 * yields a non-null result when cast to `dataType`.
 *
 * The new column is named `<column>_is_<dataType.typeName>`, e.g. `date_is_date`.
 *
 * NOTE(review): the check relies on `cast` producing null for values it cannot
 * convert; with ANSI SQL mode enabled the cast presumably fails instead - confirm
 * against the cluster configuration.
 *
 * @param dataframe input DataFrame
 * @param column    name of the column to inspect
 * @param dataType  Spark SQL type the values are tested against
 * @return `dataframe` with the extra boolean flag column
 */
def isType(dataframe: DataFrame, column: String, dataType: DataType): DataFrame = {
  val flagName = s"${column}_is_${dataType.typeName}"
  // Non-null after the cast means the value was convertible to the target type.
  val castSucceeded: Column = col(column).cast(dataType).isNotNull
  dataframe.withColumn(flagName, when(castSucceeded, true).otherwise(false))
}
// COMMAND ----------
// MAGIC %md
// MAGIC ## **Check date**
// COMMAND ----------
// Flag the rows whose "date" string can be cast to a date.
val sampleDFDateCheck: DataFrame = isType(
  dataframe = sampleDF,
  column = "date",
  dataType = DateType
)
display(sampleDFDateCheck)
// COMMAND ----------
// MAGIC %md
// MAGIC ## **Check value**
// COMMAND ----------
// Starting from the date-checked frame, also flag the rows whose "value" string
// can be cast to an integer.
val sampleDFValueCheck: DataFrame = isType(
  dataframe = sampleDFDateCheck,
  column = "value",
  dataType = IntegerType
)
display(sampleDFValueCheck)
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment