Coursera Capstone Project Milestone Report

GitHub repository with the RMarkdown source code: https://github.com/englianhu/MilestoneReport

Set up the knitr options.

suppressMessages(library(knitr))
suppressMessages(library(markdown))

## knitr configuration
opts_knit$set(progress=FALSE)
opts_chunk$set(echo=TRUE, message=FALSE, tidy=TRUE, comment=NA, fig.path="figure/", fig.keep="high", fig.width=10, fig.height=6, fig.align="center")

Preparing the environment

Creating a parallel computing cluster and loading the required libraries.

options(warn = -1)
## Loading the required libraries
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(stringi))
suppressMessages(library(igraph))
suppressMessages(library(NLP))
suppressMessages(library(tm))
suppressMessages(library(xtable))
suppressMessages(library(knitr))
suppressMessages(library(SnowballC))
suppressMessages(library(RWeka))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(wordcloud))
suppressMessages(library(RColorBrewer))
suppressMessages(library(doParallel))
suppressMessages(library(slam))
suppressMessages(library(rvest))
suppressMessages(library(pipeR))

## Prepare a parallel cluster using all available cores
jobcluster <- makeCluster(detectCores())
invisible(clusterEvalQ(jobcluster, library(tm)))
invisible(clusterEvalQ(jobcluster, library(RWeka)))
## Limit tm's implicit parallelism to two cores
options(mc.cores = 2)
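
The cluster and the mc.cores option support the tm operations below. As a minimal sketch (illustrative only, not part of the original analysis; demoText is a hypothetical vector), the cluster can also run work directly via parSapply:

## Sketch: drive the cluster directly with a toy per-string word count
demoText <- c("ein kleiner Test", "noch ein Satz")
parSapply(jobcluster, demoText, function(x) length(strsplit(x, " ")[[1]]))
rm(demoText)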

Downloading the data

The dataset is available as a zipped file from the Coursera course site; the URL appears in the code below.

if (!file.exists("./data")) {
    dir.create("./data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- "Coursera-SwiftKey.zip"
if (!file.exists(paste0("./data/", destfile))) {
    download.file(fileUrl, destfile = paste("data", destfile, sep = "/"))
}

## Extract the dataset under data/, then list the archive contents
## (with list = TRUE, unzip only lists; a separate call extracts)
unzip(paste("data", destfile, sep = "/"), exdir = "data")
unzip(paste("data", destfile, sep = "/"), list = TRUE)
                            Name    Length                Date
1                         final/         0 2014-07-22 10:10:00
2                   final/de_DE/         0 2014-07-22 10:10:00
3  final/de_DE/de_DE.twitter.txt  75578341 2014-07-22 10:11:00
4    final/de_DE/de_DE.blogs.txt  85459666 2014-07-22 10:11:00
5     final/de_DE/de_DE.news.txt  95591959 2014-07-22 10:11:00
6                   final/ru_RU/         0 2014-07-22 10:10:00
7    final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
8     final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
9  final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
10                  final/en_US/         0 2014-07-22 10:10:00
11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
12    final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
13   final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
14                  final/fi_FI/         0 2014-07-22 10:10:00
15    final/fi_FI/fi_FI.news.txt  94234350 2014-07-22 10:11:00
16   final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
17 final/fi_FI/fi_FI.twitter.txt  25331142 2014-07-22 10:10:00
list.files("data/final/de_DE")
[1] "de_DE.blogs.txt"   "de_DE.news.txt"    "de_DE.twitter.txt"
rm(fileUrl, destfile)

Loading the data

## readLines in the normal way
system.time(readLines("data/final/de_DE/de_DE.blogs.txt", encoding = "UTF-8", 
    20000))
   user  system elapsed 
   0.23    0.00    0.23 
system.time(readLines("data/final/de_DE/de_DE.news.txt", encoding = "UTF-8", 
    20000))
   user  system elapsed 
   0.39    0.00    0.39 
system.time(readLines("data/final/de_DE/de_DE.twitter.txt", encoding = "UTF-8", 
    20000))
   user  system elapsed 
   0.09    0.00    0.10 
## readLines in binary mode; note the connection stays open, so the
## second readLines call resumes where the timed 20,000-line read stopped
con <- file("data/final/de_DE/de_DE.news.txt", open = "rb")
system.time(readLines(con, encoding = "UTF-8", 20000))
   user  system elapsed 
   0.38    0.02    0.39 
news <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)

con <- file("data/final/de_DE/de_DE.blogs.txt", open = "rb")
system.time(readLines(con, encoding = "UTF-8", 20000))
   user  system elapsed 
   0.22    0.00    0.22 
blogs <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)

con <- file("data/final/de_DE/de_DE.twitter.txt", open = "rb")
system.time(readLines(con, encoding = "UTF-8", 20000))
   user  system elapsed 
   0.10    0.00    0.09 
twitter <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
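
The same read pattern repeats three times above. As a sketch (hypothetical helper name readTextFile, not used elsewhere in this report), it could be wrapped in a small reusable function:

## Sketch: wrap the binary-mode read pattern in a helper
readTextFile <- function(path) {
    con <- file(path, open = "rb")
    on.exit(close(con))
    readLines(con, encoding = "UTF-8", skipNul = TRUE)
}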

Data summary

files <- c("data/final/de_DE/de_DE.blogs.txt", "data/final/de_DE/de_DE.news.txt", 
    "data/final/de_DE/de_DE.twitter.txt")
filesInfo <- lapply(as.list(files), file.info)
rbind_all(filesInfo)
Source: local data frame [3 x 7]

      size isdir mode               mtime               ctime
1 85459666 FALSE  438 2015-03-15 01:04:14 2015-03-20 10:14:50
2 95591959 FALSE  438 2015-03-15 01:04:15 2015-03-20 10:14:52
3 75578341 FALSE  438 2015-03-15 01:04:13 2015-03-20 10:14:56
Variables not shown: atime (time), exe (chr)
rm(files, filesInfo)

Analysing the data

data.frame(File = c("blogs", "news", "twitter"), t(sapply(list(blogs, news, 
    twitter), stri_stats_general)))
     File  Lines LinesNEmpty    Chars CharsNWhite
1   blogs 351440      351440 78750126    67125636
2    news 224743      224743 85760901    73846646
3 twitter 927774      927774 71234874    60609327
cWords <- lapply(list(news, blogs, twitter), function(x) stri_count_words(x))
lapply(cWords, summary)
[[1]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   1.00   33.00   51.00   54.65   72.00  603.00 

[[2]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    8.00   22.00   34.16   47.00 1638.00 

[[3]]
   Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
   0.00    7.00   12.00   12.29   17.00   42.00 
# lapply(cWords, qplot)
qplot(cWords[[1]])

qplot(cWords[[2]])

qplot(cWords[[3]])
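
As a sketch (illustrative only, not part of the original analysis), the three word-count distributions can also be compared in a single faceted plot:

## Sketch: compare the word-count distributions side by side
wc <- data.frame(words = unlist(cWords), 
    source = rep(c("news", "blogs", "twitter"), sapply(cWords, length)))
ggplot(wc, aes(x = words)) + geom_histogram(binwidth = 5) + 
    facet_wrap(~source, scales = "free_y")
rm(wc)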

Subsetting the dataset

## Randomly sample 10,000 lines from the combined corpora
## (set.seed() could be added here for reproducibility)
dataSubset <- sample(c(blogs, news, twitter), size = 10000)
rm(blogs, news, twitter)

corpus <- Corpus(VectorSource(dataSubset))
rm(dataSubset)

Cleaning the data for Exploratory Analysis

Tasks such as removing punctuation, white space and numbers, as well as converting the text to lowercase, are performed. Profanity is also removed, using a word list sourced from http://www.youswear.com/?language=German

## Profanity word list for the German language
lnk <- "http://www.youswear.com/?language=German"
bw <- lnk %>>% html_session() %>>% html_nodes("div a") %>>% html_text() %>>% 
    (.[nchar(.) > 0])

## Clean the corpus: lowercase first so the stopword and profanity
## lists match, then strip punctuation and numbers, stem, and tidy up
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("german"))
corpus <- tm_map(corpus, removeWords, bw)
corpus <- tm_map(corpus, stemDocument)
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
rm(lnk, bw)
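
Before building the term-document matrix, it is worth spot-checking a few cleaned documents (a quick sanity check, not part of the original analysis):

## Sketch: inspect the first three cleaned documents
inspect(corpus[1:3])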

Manipulating the TermDocumentMatrix

tdm <- TermDocumentMatrix(corpus)
termFreq <- row_sums(tdm, na.rm = TRUE)
## Keep only the high-frequency terms (at least 1,000 occurrences)
hot_tdm <- tdm[termFreq >= 1000, ]
hot_tdmFreq <- data.frame(Term = rownames(hot_tdm), Freq = row_sums(hot_tdm))
ggplot(hot_tdmFreq, aes(x = Term, y = Freq)) + 
    geom_bar(stat = "identity")
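
tm also offers a direct lookup of frequent terms; as a one-line sketch using the same 1,000-occurrence threshold as above:

## Sketch: list the terms occurring at least 1,000 times
findFreqTerms(tdm, lowfreq = 1000)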

rm(cWords, termFreq, hot_tdm)

Plotting the data

## Plot wordcloud graph
wordcloud(words = corpus, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE, 
    colors = brewer.pal(8, "Dark2"))
text(x = 0.5, y = 1.1, "Word Cloud of the Sampled German Corpus")
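
Although RWeka is loaded above, no n-gram tokenization is applied in this report. As a sketch (illustrative only), a trigram TermDocumentMatrix could be built from the cleaned corpus with RWeka's NGramTokenizer:

## Sketch: trigram term-document matrix via RWeka
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3 <- TermDocumentMatrix(corpus, control = list(tokenize = TrigramTokenizer))
## Inspect the ten most frequent trigrams
head(sort(row_sums(tdm3), decreasing = TRUE), 10)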

Conclusion

From the text mining of the German dataset, we sampled the corpus and examined the words with the highest frequency of occurrence. The tm and wordcloud packages are crucial for this kind of text mining. The RWeka package is also loaded but is not applied in this report, although a sketch of its n-gram tokenizer is given above.