How to profile query read + compute performance using flame graphs on Databricks
// get the profiler instance
val env = org.apache.spark.SparkEnv.get
val profiler = env.profiler
// start a profiling session
// optionally, you can provide an instance of
// com.databricks.spark.profiler.RunParameters
// to parametrize async-profiler; the default event is cpu
val session = profiler.run()
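// note: async-profiler itself supports other events too (e.g. wall,
// alloc, lock); the exact shape of RunParameters is not shown in this
// gist, so check its definition before passing one in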
// prepare your computation to profile,
// in this case a regular Spark computation
val df = ... // define your DataFrame
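// a minimal example of what df could be, assuming a hypothetical
// Parquet dataset path and schema; any DataFrame works here:
// val df = spark.read.parquet("dbfs:/mnt/data/events")
//   .groupBy("eventType")
//   .count()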
// force the computation to happen, exercising only the read + compute side;
// preferable to df.cache() plus an action, since the noop sink does nothing
// beyond materializing the rows
df.write.format("noop").mode("overwrite").save("fake-path-not-used")
// stop the profiling
session.stop()
// and wait for the results; the await timeout can be passed
// as a parameter here (its unit is not documented yet)
val results = session.awaitResults()
// the results are a Seq[String] of paths, one per executor,
// each pointing to a flame graph .html.gz hosted under dbfs://FileStore/...
// they can be displayed as a list of links with displayHTML,
// using the link translation described here:
// https://community.databricks.com/t5/data-engineering/download-a-dbfs-filestore-file-to-my-local-machine/m-p/28919/highlight/true#M20684
val results = session.awaitResults()
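// a minimal sketch of that rendering, assuming each result path looks
// like dbfs:/FileStore/<rest> and using a placeholder workspace URL
// that you must replace with your own:
val workspaceUrl = "https://<your-workspace>.cloud.databricks.com"
val links = results.map { path =>
  // files under dbfs:/FileStore/<rest> are served at <workspace>/files/<rest>
  val rel = path.replaceFirst("^(dbfs:)?/+FileStore/", "")
  s"""<li><a href="$workspaceUrl/files/$rel">$path</a></li>"""
}
displayHTML(s"<ul>${links.mkString("\n")}</ul>")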