Hadoop Workshop
# Java
sudo apt-get update
java -version
sudo apt-get install default-jre
sudo apt-get install default-jdk
- Java install location: /usr/lib/jvm/java-8-openjdk-amd64/
# Environment
sudo apt-get install vim
# ssh keys
sudo apt-get remove openssh-client
sudo apt-get install openssh-client
sudo apt-get install openssh-server
ssh localhost
ssh-keygen -t rsa
cd .ssh/
touch authorized_keys
cat id_rsa.pub >> authorized_keys
chmod 644 authorized_keys
ssh localhost
# Hadoop
cd
mkdir app
cd app
cp ../hadoop-workshop/app-deliverable/hadoop-1.2.1.tar.gz .
tar -xzvf hadoop-1.2.1.tar.gz
cd hadoop-1.2.1
cd conf
1) core-site.xml
fs.default.name - hdfs://localhost:10011
hadoop.tmp.dir - /home/ubuntu/hadoop/tmp
2) hdfs-site.xml
dfs.replication - 1
dfs.name.dir - /home/ubuntu/hadoop/name
dfs.data.dir - /home/ubuntu/hadoop/data
3) mapred-site.xml
mapred.tasktracker.map.tasks.maximum - 4
mapred.tasktracker.reduce.tasks.maximum - 2
4) hadoop-env.sh
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
export HADOOP_HOME=/home/ubuntu/app/hadoop-1.2.1
export HADOOP_PID_DIR=/home/ubuntu/hadoop/pids
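# For reference, each key/value pair above becomes a <property> element in the
# corresponding XML file. A minimal core-site.xml with the values listed above
# would look like the sketch below; hdfs-site.xml and mapred-site.xml follow
# the same <name>/<value> pattern.
<?xml version="1.0"?>
<configuration>
  <!-- NameNode RPC address (fs.default.name) -->
  <property>
    <name>fs.default.name</name>
    <value>hdfs://localhost:10011</value>
  </property>
  <!-- Base directory for Hadoop's temporary/working files -->
  <property>
    <name>hadoop.tmp.dir</name>
    <value>/home/ubuntu/hadoop/tmp</value>
  </property>
</configuration>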
cd ..
./bin/hadoop namenode -format
./bin/hadoop-daemon.sh start namenode
tailf logs/hadoop-ubuntu-namenode-*.log
./bin/hadoop-daemon.sh start datanode
tailf logs/hadoop-ubuntu-datanode-*.log
./bin/hadoop-daemon.sh start jobtracker
tailf logs/hadoop-ubuntu-jobtracker-*.log
./bin/hadoop-daemon.sh start tasktracker
tailf logs/hadoop-ubuntu-tasktracker-*.log
./bin/hadoop-daemon.sh start secondarynamenode
tailf logs/hadoop-ubuntu-secondarynamenode-*.log
$ jps
13353 JobTracker
13579 TaskTracker
12940 DataNode
13179 SecondaryNameNode
12708 NameNode
13650 Jps
# Access in the browser
localhost:15070   (NameNode web UI)
localhost:15030   (JobTracker web UI)
# Add environment variables for Hadoop
vi /home/ubuntu/.bashrc
export JAVA_HOME=/usr/lib/jvm/java-8-openjdk-amd64/
export HADOOP_HOME=/home/ubuntu/app/hadoop-1.2.1
source ~/.bashrc
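# Quick check that the variables took effect in the new shell:
echo $JAVA_HOME
echo $HADOOP_HOME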
# Additional commands
$HADOOP_HOME/bin/start-dfs.sh
$HADOOP_HOME/bin/start-mapred.sh
$HADOOP_HOME/bin/start-all.sh
$HADOOP_HOME/bin/hadoop-daemon.sh start namenode
jps
$HADOOP_HOME/bin/hadoop-daemon.sh start datanode
jps
$HADOOP_HOME/bin/hadoop-daemon.sh start jobtracker
jps
$HADOOP_HOME/bin/hadoop-daemon.sh start tasktracker
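# Each start script/daemon has a matching stop counterpart in bin/:
$HADOOP_HOME/bin/stop-dfs.sh
$HADOOP_HOME/bin/stop-mapred.sh
$HADOOP_HOME/bin/stop-all.sh
$HADOOP_HOME/bin/hadoop-daemon.sh stop namenode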
########################## OUTPUT ##################################
# ./bin/hadoop namenode -format
ubuntu@selvapc:~/app/hadoop-1.2.1$ ./bin/hadoop namenode -format
Warning: $HADOOP_HOME is deprecated.
17/01/05 17:02:43 INFO namenode.NameNode: STARTUP_MSG:
/************************************************************
STARTUP_MSG: Starting NameNode
STARTUP_MSG: host = selvapc/127.0.1.1
STARTUP_MSG: args = [-format]
STARTUP_MSG: version = 1.2.1
STARTUP_MSG: build = https://svn.apache.org/repos/asf/hadoop/common/branches/branch-1.2 -r 1503152; compiled by 'mattf' on Mon Jul 22 15:23:09 PDT 2013
STARTUP_MSG: java = 1.8.0_111
************************************************************/
17/01/05 17:02:43 INFO util.GSet: Computing capacity for map BlocksMap
17/01/05 17:02:43 INFO util.GSet: VM type = 64-bit
17/01/05 17:02:43 INFO util.GSet: 2.0% max memory = 1013645312
17/01/05 17:02:43 INFO util.GSet: capacity = 2^21 = 2097152 entries
17/01/05 17:02:43 INFO util.GSet: recommended=2097152, actual=2097152
17/01/05 17:02:44 INFO namenode.FSNamesystem: fsOwner=ubuntu
17/01/05 17:02:44 INFO namenode.FSNamesystem: supergroup=supergroup
17/01/05 17:02:44 INFO namenode.FSNamesystem: isPermissionEnabled=true
17/01/05 17:02:44 INFO namenode.FSNamesystem: dfs.block.invalidate.limit=100
17/01/05 17:02:44 INFO namenode.FSNamesystem: isAccessTokenEnabled=false accessKeyUpdateInterval=0 min(s), accessTokenLifetime=0 min(s)
17/01/05 17:02:44 INFO namenode.FSEditLog: dfs.namenode.edits.toleration.length = 0
17/01/05 17:02:44 INFO namenode.NameNode: Caching file names occuring more than 10 times
17/01/05 17:02:44 INFO common.Storage: Image file /home/ubuntu/hadoop/name/current/fsimage of size 112 bytes saved in 0 seconds.
17/01/05 17:02:44 INFO namenode.FSEditLog: closing edit log: position=4, editlog=/home/ubuntu/hadoop/name/current/edits
17/01/05 17:02:44 INFO namenode.FSEditLog: close success: truncate to 4, editlog=/home/ubuntu/hadoop/name/current/edits
17/01/05 17:02:44 INFO common.Storage: Storage directory /home/ubuntu/hadoop/name has been successfully formatted.
17/01/05 17:02:44 INFO namenode.NameNode: SHUTDOWN_MSG:
/************************************************************
SHUTDOWN_MSG: Shutting down NameNode at selvapc/127.0.1.1
************************************************************/
# If you format the namenode, start the daemons, then format again, the datanode fails on startup with:
2014-09-17 04:37:46,975 ERROR org.apache.hadoop.hdfs.server.datanode.DataNode: java.io.IOException: Incompatible namespaceIDs in /home/selva/hadoop/tmp/dfs/data: namenode namespaceID = 1949108614; datanode namespaceID = 870281182
cd
mkdir handson
cd handson
cp ../hadoop-workshop/handson/hadoop/WordCount.java .
mkdir wordcount_classes
javac -classpath ${HADOOP_HOME}/hadoop-core-1.2.1.jar -d wordcount_classes WordCount.java
find wordcount_classes/
jar -cvf wordcount.jar -C wordcount_classes/ .
cp ../hadoop-workshop/handson/hadoop/input.txt .
$HADOOP_HOME/bin/hadoop fs -copyFromLocal input.txt /home/ubuntu/wordcount/inputdata/input.txt
$HADOOP_HOME/bin/hadoop fs -ls /home/ubuntu/wordcount/inputdata
$HADOOP_HOME/bin/hadoop fs -cat /home/ubuntu/wordcount/inputdata/input.txt
$HADOOP_HOME/bin/hadoop jar wordcount.jar org.apache.hadoop.examples.WordCount /home/ubuntu/wordcount/inputdata /home/ubuntu/wordcount/outputdata
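# With a single reducer the result lands in part-r-00000; list and view it:
$HADOOP_HOME/bin/hadoop fs -ls /home/ubuntu/wordcount/outputdata
$HADOOP_HOME/bin/hadoop fs -cat /home/ubuntu/wordcount/outputdata/part-r-00000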
############ Users Count for a Web Application #############
cd
cd handson
cp ../hadoop-workshop/handson/hadoop/IPCount.java .
mkdir ipcount_classes
javac -classpath ${HADOOP_HOME}/hadoop-core-1.2.1.jar -d ipcount_classes IPCount.java
find ipcount_classes/
jar -cvf ipcount.jar -C ipcount_classes/ .
cp ../hadoop-workshop/handson/hadoop/access_log .
$HADOOP_HOME/bin/hadoop fs -copyFromLocal access_log /home/ubuntu/ipcount/inputdata/access_log
$HADOOP_HOME/bin/hadoop fs -ls /home/ubuntu/ipcount/inputdata
$HADOOP_HOME/bin/hadoop fs -cat /home/ubuntu/ipcount/inputdata/access_log
$HADOOP_HOME/bin/hadoop jar ipcount.jar org.apache.hadoop.examples.IPCount /home/ubuntu/ipcount/inputdata /home/ubuntu/ipcount/outputdata
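# As with wordcount, view the result (again part-r-00000, assuming IPCount also uses the new mapreduce API):
$HADOOP_HOME/bin/hadoop fs -cat /home/ubuntu/ipcount/outputdata/part-r-00000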
# WordCount.java
package org.apache.hadoop.examples;
import java.io.IOException;
import java.util.StringTokenizer;
import org.apache.hadoop.conf.Configuration;
import org.apache.hadoop.fs.Path;
import org.apache.hadoop.io.IntWritable;
import org.apache.hadoop.io.Text;
import org.apache.hadoop.mapreduce.Job;
import org.apache.hadoop.mapreduce.Mapper;
import org.apache.hadoop.mapreduce.Reducer;
import org.apache.hadoop.mapreduce.lib.input.FileInputFormat;
import org.apache.hadoop.mapreduce.lib.output.FileOutputFormat;
import org.apache.hadoop.util.GenericOptionsParser;
public class WordCount {

  // Mapper: splits each input line into tokens and emits (word, 1)
  public static class TokenizerMapper
       extends Mapper<Object, Text, Text, IntWritable> {

    private final static IntWritable one = new IntWritable(1);
    private Text word = new Text();

    public void map(Object key, Text value, Context context
                    ) throws IOException, InterruptedException {
      StringTokenizer itr = new StringTokenizer(value.toString());
      while (itr.hasMoreTokens()) {
        word.set(itr.nextToken());
        context.write(word, one);
      }
    }
  }

  // Reducer (also used as combiner): sums the counts emitted for each word
  public static class IntSumReducer
       extends Reducer<Text,IntWritable,Text,IntWritable> {

    private IntWritable result = new IntWritable();

    public void reduce(Text key, Iterable<IntWritable> values,
                       Context context
                       ) throws IOException, InterruptedException {
      int sum = 0;
      for (IntWritable val : values) {
        sum += val.get();
      }
      result.set(sum);
      context.write(key, result);
    }
  }

  public static void main(String[] args) throws Exception {
    Configuration conf = new Configuration();
    Job job = new Job(conf, "word count");
    job.setJarByClass(WordCount.class);
    job.setMapperClass(TokenizerMapper.class);
    job.setCombinerClass(IntSumReducer.class);
    job.setReducerClass(IntSumReducer.class);
    job.setOutputKeyClass(Text.class);
    job.setOutputValueClass(IntWritable.class);
    FileInputFormat.addInputPath(job, new Path(args[0]));
    FileOutputFormat.setOutputPath(job, new Path(args[1]));
    System.exit(job.waitForCompletion(true) ? 0 : 1);
  }
}
# input.txt (sample data)
one two three two three four five five
five one two three four one one two
one one two five five five four
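# For this sample input, WordCount produces the following counts (keys sorted lexicographically, tab-separated):
five	6
four	3
one	6
three	3
two	5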
############ Multi-node Cluster Setup #############
* IP address of each node
ubuntu@ubuntu-HP-Pro-3330-MT:~$ ifconfig
wlan0 Link encap:Ethernet HWaddr 00:1e:2a:37:55:03
inet addr:10.1.15.103 Bcast:10.1.15.255 Mask:255.255.255.0
inet6 addr: fe80::21e:2aff:fe37:5503/64 Scope:Link
* ssh connection
ubuntu@ubuntu-HP-Pro-3330-MT:~$ ssh [email protected]
- Master
ubuntu@ubuntu-HP-Pro-3330-MT:~/.ssh$ scp id_rsa.pub 10.1.15.122:/home/ubuntu/
- Slave
ubuntu@ubuntu-HP-Pro-3330-MT:~/.ssh$ scp id_rsa.pub 10.1.15.103:/home/ubuntu/
- Master
ubuntu@ubuntu-HP-Pro-3330-MT:~$ cat id_rsa.pub >> .ssh/authorized_keys
- Slave
ubuntu@ubuntu-HP-Pro-3330-MT:~$ cat id_rsa.pub >> .ssh/authorized_keys
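# With the keys exchanged, verify passwordless login in both directions before starting the cluster:
ssh 10.1.15.122   # from the master
ssh 10.1.15.103   # from the slave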
* masters/slaves configuration (masters holds the secondary namenode host; slaves lists the worker nodes)
- Master & slave
ubuntu@ubuntu-HP-Pro-3330-MT:~$ cd $HADOOP_HOME
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ cd conf
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ vi masters
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ cat masters
10.1.15.103
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ cat conf/slaves
10.1.15.103
10.1.15.122
* Change localhost references to IP addresses
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ grep -iR "localhost" .
./masters:localhost
./core-site.xml: <value>hdfs://localhost:10011</value>
./mapred-site.xml: <value>localhost:10012</value>
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ cat masters
localhost
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ vi masters
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ cat masters
10.1.15.103
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ cat slaves
10.1.15.103
10.1.15.122
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ vi core-site.xml
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ vi mapred-site.xml
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$ grep -iR "localhost" .
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1/conf$
* Start the daemons
- Master
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ ./bin/hadoop-daemon.sh start namenode
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ ./bin/hadoop-daemon.sh start jobtracker
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ ./bin/hadoop-daemon.sh start datanode
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ ./bin/hadoop-daemon.sh start tasktracker
- Slave
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ ./bin/hadoop-daemon.sh start datanode
ubuntu@ubuntu-HP-Pro-3330-MT:~/app/hadoop-1.2.1$ ./bin/hadoop-daemon.sh start tasktracker
* Troubleshooting: the same error appears whenever a datanode keeps data from a previously formatted namenode:
2014-09-17 04:37:46,975 ERROR org.apache.hadoop.hdfs.server.datanode.DataNode: java.io.IOException: Incompatible namespaceIDs in /home/selva/hadoop/tmp/dfs/data: namenode namespaceID = 1949108614; datanode namespaceID = 870281182
The datanode's stored ID lives in:
/home/student/hadoop/tmp/dfs/data/current/VERSION
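# A sketch of that VERSION file (only namespaceID matters here; the other field
# values are illustrative). Set namespaceID to the namenode's value from the
# error message, then restart the datanode:
namespaceID=1949108614
storageID=DS-...
cTime=0
storageType=DATA_NODE
layoutVersion=-41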
* Example id_rsa.pub contents (a single line):
ssh-rsa AAAAB3NzaC1yc2EAAAADAQABAAABAQC/tBn4Sx/+0TgPNxrwEFzmwTYIcsNchA1aqxpI0eTWwmPBxquzrujo7bU84oF6oCBfUozgV2aZyDkVKGxQAyDHhAMlPoXe7oDHLvv9nfdU94VDWPzrG/E0/y+uQk6MryeFQjBA/bwHlVNgE/aSfBsmdznKU32pKmV1CXnHyohCFhWvJPtHrt29CUt2jvK0DT+f7oaaE8ipZHQlHNozQCHCwvUtHaUCSNBtXj4KUd1ObtVa4SSPJ8RcVLSzY1FYt+GbJL2z8xxkIFFWNdVkbrrfTnU/YX2FzMtp0ydwy5cPrJMLoZdsJlaVbU9mq2A1SUiuteOhqEgCqbsRehL+dc7x student@student
* Reference links
https://drive.google.com/file/d/0Bxu-zrDMylMqYTdycUJSSElkWEE/view?usp=sharing
https://drive.google.com/file/d/0Bxu-zrDMylMqQmdWMElkTGJkT3M/view?usp=sharing