Skip to content

Instantly share code, notes, and snippets.

@dilip
Created July 11, 2012 05:57
Show Gist options
  • Select an option

  • Save dilip/3088252 to your computer and use it in GitHub Desktop.

Select an option

Save dilip/3088252 to your computer and use it in GitHub Desktop.
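Spark patch to support direct access to S3 data: when the JVM system properties awsAccessKeyId and awsSecretAccessKey are set, they are copied into the fs.s3.* and fs.s3n.* credential keys of the configuration passed to HadoopRDD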
diff --git a/core/src/main/scala/spark/SparkContext.scala b/core/src/main/scala/spark/SparkContext.scala
index 6e019d6..8bd059e 100644
--- a/core/src/main/scala/spark/SparkContext.scala
+++ b/core/src/main/scala/spark/SparkContext.scala
@@ -124,6 +124,14 @@ class SparkContext(
FileInputFormat.setInputPaths(conf, path)
val bufferSize = System.getProperty("spark.buffer.size", "65536")
conf.set("io.file.buffer.size", bufferSize)
+ if (System.getProperty("awsAccessKeyId") != null) {
+ conf.set("fs.s3.awsAccessKeyId", System.getProperty("awsAccessKeyId"))
+ conf.set("fs.s3n.awsAccessKeyId", System.getProperty("awsAccessKeyId"))
+ }
+ if (System.getProperty("awsSecretAccessKey") != null) {
+ conf.set("fs.s3.awsSecretAccessKey", System.getProperty("awsSecretAccessKey"))
+ conf.set("fs.s3n.awsSecretAccessKey", System.getProperty("awsSecretAccessKey"))
+ }
new HadoopRDD(this, conf, inputFormatClass, keyClass, valueClass, minSplits)
}
Sign up for free to join this conversation on GitHub. Already have an account? Sign in to comment