Tip: This tutorial was prepared on an Ubuntu Linux system.
Make sure Hadoop is running: open a terminal and type
jps
It must show the 5 running Hadoop daemons with their PIDs, for example:
19530 SecondaryNameNode
19167 DataNode
19634 JobTracker
20037 TaskTracker
18813 NameNode
R must be on the PATH; to check, run
Rscript --version
# R scripting front-end version 3.0.0 (2013-04-03)
Export the Hadoop home directory:
export HADOOP_HOME=/home/trendwise/apache/hadoop-1.0.4/
#! /usr/bin/env Rscript
# mapper.R - Wordcount program in R
# script for Mapper (R-Hadoop integration)
# Remove leading and trailing whitespace from each element of `line`.
#
# Any POSIX whitespace character (space, tab, carriage return, ...) is
# stripped, not just the space character: a tab-indented input line would
# otherwise keep its leading tab and yield an empty first token when split.
#
# line: character vector.
# Returns a character vector of the same length as `line`.
trimWhiteSpace <- function(line) {
  gsub("^[[:space:]]+|[[:space:]]+$", "", line)
}
# Split `line` into its whitespace-separated words.
#
# strsplit() produces an empty first element when the input starts with a
# separator; such zero-length tokens are dropped so that the caller never
# sees an empty word.
#
# line: character vector (normally a single input line).
# Returns a character vector of non-empty words (length 0 for a blank line).
splitIntoWords <- function(line) {
  words <- unlist(strsplit(line, "[[:space:]]+"))
  words[nzchar(words)]
}
# Stream stdin one line at a time and emit one "word<TAB>1" record per word.
# (Reading everything with a single readLines() call, or in blocks, would
# also work.)
con <- file("stdin", open = "r")
repeat {
  current <- readLines(con, n = 1, warn = FALSE)
  if (length(current) == 0) {
    break
  }
  tokens <- splitIntoWords(trimWhiteSpace(current))
  # Guard the vectorised write: paste0() recycles a zero-length vector to "",
  # which would emit a stray "\t1\n" record for a blank line.
  if (length(tokens) > 0) {
    cat(paste0(tokens, "\t1\n"), sep = "")
  }
}
close(con)
#! /usr/bin/env Rscript
# reducer.R - Wordcount program in R
# script for Reducer (R-Hadoop integration)
# Strip runs of space characters from the start and the end of `line`.
# Only the space character is matched; tabs are left untouched.
trimWhiteSpace <- function(line) {
  gsub(pattern = "(^ +)|( +$)", replacement = "", x = line)
}
# Parse one tab-separated record into its word and its count.
#
# line: character string whose fields are separated by tab characters.
# Returns a list with `word` (the first field) and `count` (the second
# field converted with as.integer()).
splitLine <- function(line) {
  fields <- unlist(strsplit(line, split = "\t"))
  key <- fields[1]
  tally <- as.integer(fields[2])
  list(word = key, count = tally)
}
# Sum the counts of every word read from stdin and print the totals.
#
# Each input line is parsed with splitLine() after trimWhiteSpace(). The
# running total of each word is stored in a hashed environment under the
# word itself, so the input does not have to arrive grouped by key.
env <- new.env(hash = TRUE)
con <- file("stdin", open = "r")
while (length(line <- readLines(con, n = 1, warn = FALSE)) > 0) {
  record <- splitLine(trimWhiteSpace(line))
  word <- record$word
  count <- record$count
  # Skip malformed records: a missing or empty word cannot be used as a
  # variable name in assign(), and an NA count would turn the word's whole
  # total into NA.
  if (is.na(word) || !nzchar(word) || is.na(count)) {
    next
  }
  if (exists(word, envir = env, inherits = FALSE)) {
    oldcount <- get(word, envir = env, inherits = FALSE)
    assign(word, oldcount + count, envir = env)
  } else {
    assign(word, count, envir = env)
  }
}
close(con)
# all.names = TRUE also lists words that start with a dot, which ls() hides
# by default.
for (w in ls(env, all.names = TRUE)) {
  cat(w, "\t", get(w, envir = env, inherits = FALSE), "\n", sep = "")
}
Local testing (mapper only):
echo "foo foo quux labs foo bar quux" | Rscript mapper.R
on test file
cat '/home/trendwise/Desktop/Learn/RHadoop/inputFile' | Rscript mapper.R
Local testing (mapper and reducer):
echo "foo foo quux labs foo bar quux" | Rscript mapper.R | sort -k1,1 | Rscript reducer.R
on test file
cat inputFile | Rscript mapper.R | sort | Rscript reducer.R
Copying the input file to HDFS:
cd $HADOOP_HOME
bin/hadoop dfs -copyFromLocal '/home/trendwise/apache/hadoop-1.0.4/README.txt' /readme
check copied file
bin/hadoop dfs -ls /
run using Hadoop-streaming
bin/hadoop jar /home/trendwise/apache/hadoop-1.0.4/contrib/streaming/hadoop-streaming-1.0.4.jar \
-file /home/trendwise/Desktop/Learn/RHadoop/mapper.R -mapper /home/trendwise/Desktop/Learn/RHadoop/mapper.R \
-file /home/trendwise/Desktop/Learn/RHadoop/reducer.R -reducer /home/trendwise/Desktop/Learn/RHadoop/reducer.R \
-input /readme -output /RCount
using cat on HDFS
bin/hadoop fs -ls /RCount/ # lists wc output files
bin/hadoop fs -cat /RCount/part-00000
using web browser
Open http://localhost:50070 and click on "Browse the filesystem".
using get
bin/hadoop dfs -get /RCount/part-00000 /home/trendwise/Desktop/Learn/RHadoop/wcOutput.txt