Created
August 31, 2020 07:40
-
-
Save nomoa/dbd1d1575c912746645638c2d07bfcdf to your computer and use it in GitHub Desktop.
extract turtle from parquet
This file contains hidden or bidirectional Unicode text that may be interpreted or compiled differently than what appears below. To review, open the file in an editor that reveals hidden Unicode characters.
Learn more about bidirectional Unicode characters
/**
 * Dumps the rows of a parquet dataset as gzip-compressed Turtle, writing one file per
 * Spark partition.
 *
 * Each partition opens `<prefix>-<partitionId>.ttl.gz` through `java.nio.file.Files`, decodes
 * every row into an RDF statement with a `StatementEncoder`, and streams it through a Rio
 * Turtle writer.
 *
 * @param spark session used to read the parquet input
 */
def extract(implicit spark: SparkSession): Unit = {
  val df = spark.read.parquet("...")
  // NOTE(review): prefix already ends with '-', so the interpolation below yields
  // "file--<partition>.ttl.gz" (double dash) -- confirm this is intended.
  val prefix = "/path/file-"
  // NOTE(review): captured by the partition closure below, so presumably it has to be
  // serializable -- confirm against StatementEncoder.
  val encoder = new StatementEncoder()
  df.foreachPartition(rows => {
    val partition = TaskContext.getPartitionId()
    val writer = new GZIPOutputStream(new BufferedOutputStream(Files.newOutputStream(Paths.get(s"$prefix-$partition.ttl.gz"))))
    try {
      val rdfWriter = Rio.createWriter(RDFFormat.TURTLE, writer)
      rdfWriter.startRDF()
      rows.foreach(row => rdfWriter.handleStatement(encoder.decode(row)))
      rdfWriter.endRDF()
    } finally {
      // Close on every path: completes the gzip stream on success and releases the
      // file handle if decoding or writing a statement throws.
      writer.close()
    }
  })
}
Sign up for free
to join this conversation on GitHub.
Already have an account?
Sign in to comment