$ brew install apache-spark

pyspark starts a Python shell with a preconfigured SparkContext (available as sc). It is a convenient way to try Spark commands interactively.
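A quick smoke test of that shell (the session below is illustrative, not from the original):

$ pyspark
>>> sc.parallelize([1, 2, 3, 4]).map(lambda x: x * x).sum()
30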
#!/bin/bash

# Defaults for the HDP 2.2 / Spark 1.2 build; each can be overridden from the
# environment, since : ${VAR:=default} only assigns when VAR is unset or empty.
: ${HDP_VERSION:=2.2.0.0-2041}
: ${SPARK_VERSION:=1.2.0}
: ${SPARK_DIST_PREFIX_VERSION:=2.2.0.0-82}
: ${SPARK_HADOOP_VERSION:=2.6.0}

# Compose the full version strings for the Spark distribution and assembly jar
SPARK_DIST_VERSION=$SPARK_VERSION.$SPARK_DIST_PREFIX_VERSION-bin-$SPARK_HADOOP_VERSION.$HDP_VERSION
SPARK_ASSEMBLY_VERSION=$SPARK_VERSION.$SPARK_DIST_PREFIX_VERSION-hadoop$SPARK_HADOOP_VERSION.$HDP_VERSION
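These strings appear to be built to match the Hortonworks artifact naming; a quick sanity check (the .tgz and .jar suffixes are assumptions, not confirmed by the original):

# Hypothetical: print the artifact names the two variables are meant to match
echo "distribution: spark-$SPARK_DIST_VERSION.tgz"
echo "assembly jar: spark-assembly-$SPARK_ASSEMBLY_VERSION.jar"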
# List running jobs as CSV: job id, elapsed time since start (HH:MM:SS; the
# UTC flag on strftime keeps this correct for jobs under 24 hours), then
# fields 4 and 6 of the hadoop job -list output (user and scheduling info),
# both quoted for safe CSV.
hadoop job -list | grep job_ | awk 'BEGIN{FS="\t";OFS=","};{print $1,strftime("%H:%M:%S", (systime()-int($3/1000)),1),"\""$4"\"","\""$6"\""}'
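On Hadoop 2 the hadoop job entry point prints a deprecation warning; the same pipeline works unchanged with the mapred command, assuming the same output columns:

mapred job -list | grep job_ | awk 'BEGIN{FS="\t";OFS=","};{print $1,strftime("%H:%M:%S", (systime()-int($3/1000)),1),"\""$4"\"","\""$6"\""}'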
-- Register helper UDFs shipped in a jar stored on S3
ADD JAR s3://<s3-bucket>/jars/hive_contrib-0.5.jar;
CREATE TEMPORARY FUNCTION now as 'com.mt.utils.udf.Now';
CREATE TEMPORARY FUNCTION user_agent_f as 'com.mt.utils.UserAgent';

-- Merge small output files after map-only and map-reduce jobs,
-- targeting roughly 500 MB per merged file
set hive.merge.mapredfiles=true;
set hive.merge.mapfiles=true;
set hive.merge.size.per.task=500000000;

CREATE EXTERNAL TABLE data ...
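For illustration, the UDFs registered above could be exercised like this; temporary functions are session-scoped, so the registration and the query must share one hive invocation (the user_agent column name is hypothetical):

hive -e "ADD JAR s3://<s3-bucket>/jars/hive_contrib-0.5.jar;
CREATE TEMPORARY FUNCTION user_agent_f as 'com.mt.utils.UserAgent';
SELECT user_agent_f(user_agent) FROM data LIMIT 10;"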
#!/bin/sh
exec scala "$0" "$@"
!#
import scala.collection.JavaConversions._
import java.lang.management.{ManagementFactory, MemoryMXBean}
import java.net.URI
import javax.management.{JMX, ObjectName}
import javax.management.remote.{JMXConnectorFactory, JMXServiceURL}

// Likely continuation (an assumption, not from the original): attach to a
// remote JVM's JMX endpoint, passed as host:port, and report heap usage.
val url = new JMXServiceURL(s"service:jmx:rmi:///jndi/rmi://${args(0)}/jmxrmi")
val conn = JMXConnectorFactory.connect(url).getMBeanServerConnection
val mem = JMX.newMXBeanProxy(conn, new ObjectName(ManagementFactory.MEMORY_MXBEAN_NAME), classOf[MemoryMXBean])
println(s"Heap used: ${mem.getHeapMemoryUsage.getUsed / (1024 * 1024)} MB")
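Assuming the script is saved as jmx-heap.scala (name hypothetical) and made executable, it would be pointed at a JVM started with remote JMX enabled:

$ chmod +x jmx-heap.scala
$ ./jmx-heap.scala localhost:9999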
# Convert Hive's default ^A (\001) field delimiter to commas to produce a CSV
sed 's/'`echo -e "\01"`'/,/g' input_file.txt > output_file.csv
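In bash (and other shells with ANSI-C quoting) the backtick subshell can be avoided:

sed $'s/\x01/,/g' input_file.txt > output_file.csv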