GitHub repo with the RMarkdown source code: https://github.com/englianhu/MilestoneReport
Set up knitr options.
suppressMessages(library(knitr))
suppressMessages(library(markdown))
## knitr configuration
opts_knit$set(progress=FALSE)
opts_chunk$set(echo=TRUE, message=FALSE, tidy=TRUE, comment=NA, fig.path="figure/", fig.keep="high", fig.width=10, fig.height=6, fig.align="center")
Creating a parallel computing cluster and loading the required libraries.
options(warn = -1)  # suppress warnings for a cleaner knitted report
## Loading the required libraries
suppressMessages(library(plyr))
suppressMessages(library(dplyr))
suppressMessages(library(stringi))
suppressMessages(library(igraph))
suppressMessages(library(NLP))
suppressMessages(library(tm))
suppressMessages(library(xtable))
suppressMessages(library(knitr))
suppressMessages(library(SnowballC))
suppressMessages(library(RWeka))
suppressMessages(library(ggplot2))
suppressMessages(library(grid))
suppressMessages(library(wordcloud))
suppressMessages(library(RColorBrewer))
suppressMessages(library(doParallel))
suppressMessages(library(slam))
suppressMessages(library(rvest))
suppressMessages(library(pipeR))
## Preparing the parallel cluster using all detected cores
jobcluster <- makeCluster(detectCores())
invisible(clusterEvalQ(jobcluster, library(tm)))
invisible(clusterEvalQ(jobcluster, library(RWeka)))
options(mc.cores = 2)  # cap the cores used by tm/RWeka for parallel tokenization
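The workers hold memory and sockets until they are released; a one-line cleanup (my addition, to be run once the whole analysis is complete) is:
## Release the parallel workers at the very end of the analysis
stopCluster(jobcluster)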
The dataset is downloadable as a zipped file from the URL below.
if (!file.exists("./data")) {
dir.create("./data")
}
fileUrl <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- "Coursera-SwiftKey.zip"
if (!file.exists(file.path("data", destfile))) {
    download.file(fileUrl, destfile = file.path("data", destfile))
}
## List the contents of the dataset zip file (list = TRUE only lists, it does not extract)
unzip(file.path("data", destfile), list = TRUE)
Name Length Date
1 final/ 0 2014-07-22 10:10:00
2 final/de_DE/ 0 2014-07-22 10:10:00
3 final/de_DE/de_DE.twitter.txt 75578341 2014-07-22 10:11:00
4 final/de_DE/de_DE.blogs.txt 85459666 2014-07-22 10:11:00
5 final/de_DE/de_DE.news.txt 95591959 2014-07-22 10:11:00
6 final/ru_RU/ 0 2014-07-22 10:10:00
7 final/ru_RU/ru_RU.blogs.txt 116855835 2014-07-22 10:12:00
8 final/ru_RU/ru_RU.news.txt 118996424 2014-07-22 10:12:00
9 final/ru_RU/ru_RU.twitter.txt 105182346 2014-07-22 10:12:00
10 final/en_US/ 0 2014-07-22 10:10:00
11 final/en_US/en_US.twitter.txt 167105338 2014-07-22 10:12:00
12 final/en_US/en_US.news.txt 205811889 2014-07-22 10:13:00
13 final/en_US/en_US.blogs.txt 210160014 2014-07-22 10:13:00
14 final/fi_FI/ 0 2014-07-22 10:10:00
15 final/fi_FI/fi_FI.news.txt 94234350 2014-07-22 10:11:00
16 final/fi_FI/fi_FI.blogs.txt 108503595 2014-07-22 10:12:00
17 final/fi_FI/fi_FI.twitter.txt 25331142 2014-07-22 10:10:00
## Extract the dataset (the zip already contains the final/ folder), then confirm the German files are in place
unzip(file.path("data", destfile), exdir = "data")
list.files("data/final/de_DE")
[1] "de_DE.blogs.txt" "de_DE.news.txt" "de_DE.twitter.txt"
rm(fileUrl, destfile)
## readLines in the normal (text-mode) way, timing the first 20,000 lines
system.time(readLines("data/final/de_DE/de_DE.blogs.txt", n = 20000, encoding = "UTF-8"))
user system elapsed
0.23 0.00 0.23
system.time(readLines("data/final/de_DE/de_DE.news.txt", encoding = "UTF-8",
20000))
user system elapsed
0.39 0.00 0.39
system.time(readLines("data/final/de_DE/de_DE.twitter.txt", encoding = "UTF-8",
20000))
user system elapsed
0.09 0.00 0.10
## readLines through a binary-mode connection
con <- file("data/final/de_DE/de_DE.news.txt", open = "rb")
system.time(readLines(con, n = 20000, encoding = "UTF-8"))
user system elapsed
0.38 0.02 0.39
## Reopen the connection so the whole file is read (the timing call above consumed 20,000 lines)
close(con)
con <- file("data/final/de_DE/de_DE.news.txt", open = "rb")
news <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
con <- file("data/final/de_DE/de_DE.blogs.txt", open = "rb")
system.time(readLines(con, n = 20000, encoding = "UTF-8"))
user system elapsed
0.22 0.00 0.22
## Reopen so the whole file is read, not just the remainder
close(con)
con <- file("data/final/de_DE/de_DE.blogs.txt", open = "rb")
blogs <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
con <- file("data/final/de_DE/de_DE.twitter.txt", open = "rb")
system.time(readLines(con, n = 20000, encoding = "UTF-8"))
user system elapsed
0.10 0.00 0.09
## Reopen so the whole file is read, not just the remainder
close(con)
con <- file("data/final/de_DE/de_DE.twitter.txt", open = "rb")
twitter <- readLines(con, encoding = "UTF-8")
close(con)
rm(con)
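The three read blocks above repeat the same open-time-read-close pattern; a small helper could fold them together (readFile is a hypothetical name, not in the original source):
## Hypothetical helper: open in binary mode, read every line, always close
readFile <- function(path) {
    con <- file(path, open = "rb")
    on.exit(close(con))
    readLines(con, encoding = "UTF-8")
}
## e.g. news <- readFile("data/final/de_DE/de_DE.news.txt")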
files <- c("data/final/de_DE/de_DE.blogs.txt", "data/final/de_DE/de_DE.news.txt",
"data/final/de_DE/de_DE.twitter.txt")
filesInfo <- lapply(files, file.info)
rbind_all(filesInfo)  # in newer dplyr, bind_rows() replaces rbind_all()
Source: local data frame [3 x 7]
size isdir mode mtime ctime
1 85459666 FALSE 438 2015-03-15 01:04:14 2015-03-20 10:14:50
2 95591959 FALSE 438 2015-03-15 01:04:15 2015-03-20 10:14:52
3 75578341 FALSE 438 2015-03-15 01:04:13 2015-03-20 10:14:56
Variables not shown: atime (time), exe (chr)
rm(files, filesInfo)
data.frame(File = c("blogs", "news", "twitter"), t(sapply(list(blogs, news,
twitter), stri_stats_general)))
File Lines LinesNEmpty Chars CharsNWhite
1 blogs 351440 351440 78750126 67125636
2 news 224743 224743 85760901 73846646
3 twitter 927774 927774 71234874 60609327
## Word counts per line, in the order: news, blogs, twitter
cWords <- lapply(list(news, blogs, twitter), stri_count_words)
lapply(cWords, summary)
[[1]]
Min. 1st Qu. Median Mean 3rd Qu. Max.
1.00 33.00 51.00 54.65 72.00 603.00
[[2]]
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 8.00 22.00 34.16 47.00 1638.00
[[3]]
Min. 1st Qu. Median Mean 3rd Qu. Max.
0.00 7.00 12.00 12.29 17.00 42.00
# lapply(cWords, qplot)
qplot(cWords[[1]])
qplot(cWords[[2]])
qplot(cWords[[3]])
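The qplot() calls above draw unlabeled histograms; a labelled alternative for the news counts (the axis labels and binwidth are my additions) might be:
## Histogram of words per line for the news corpus, with axis labels
ggplot(data.frame(words = cWords[[1]]), aes(x = words)) +
    geom_histogram(binwidth = 5) +
    labs(x = "Words per line (news)", y = "Number of lines")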
## Randomly sampling 10,000 lines from the combined corpora
dataSubset <- sample(c(blogs, news, twitter), size = 10000, replace = TRUE)
rm(blogs, news, twitter)
corpus <- Corpus(VectorSource(dataSubset))
rm(dataSubset)
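As a quick sanity check (my addition, not part of the original pipeline), a couple of the sampled documents can be printed before cleaning:
## Peek at the first two sampled documents
inspect(corpus[1:2])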
Tasks such as removing punctuation, numbers and extra whitespace, as well as converting the text to lowercase, are performed. Profanity is also removed, using a word list sourced from http://www.youswear.com/?language=German
## Bad words in the German language
lnk <- "http://www.youswear.com/?language=German"
bw <- lnk %>>% html_session() %>>% html_nodes("div a") %>>% html_text() %>>%
    (.[nchar(.) > 0])
## Cleaning the data for exploratory analysis
## (lowercasing and stopword removal must precede stemming, so the order below matters)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removeWords, stopwords("german"))
corpus <- tm_map(corpus, removeWords, bw)
corpus <- tm_map(corpus, stemDocument, language = "german")
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
rm(lnk, bw)
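To confirm the transformations took effect, one cleaned document can be printed (again a check of my own, not in the original pipeline):
## Show the first document after cleaning
as.character(corpus[[1]])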
Building the TermDocumentMatrix.
tdm <- TermDocumentMatrix(corpus)
tdm$tot <- row_sums(tdm, na.rm = TRUE)  # total frequency of each term across documents
hot_tdm <- tdm[which(tdm$tot >= 1000), ]  # keep terms occurring at least 1000 times
hot_tdmFreq <- data.frame(Term = hot_tdm$dimnames[1]$Terms, Freq = row_sums(hot_tdm))
ggplot(hot_tdmFreq, aes(x = Term, y = Freq)) + geom_bar(stat = "identity")
rm(cWords, hot_tdm)
## Plot wordcloud graph
wordcloud(words = corpus, random.order = FALSE, rot.per = 0.35, use.r.layout = FALSE,
colors = brewer.pal(8, "Dark2"))
text(x = 0.5, y = 1.1, "Word Cloud of High-Frequency Terms")
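Since the term totals already live in the term-document matrix, the cloud could equally be drawn from explicit frequencies; a sketch reusing the tdm built above:
## Word cloud from the precomputed term frequencies
freq <- sort(row_sums(tdm, na.rm = TRUE), decreasing = TRUE)
wordcloud(words = names(freq), freq = freq, max.words = 100, random.order = FALSE,
    colors = brewer.pal(8, "Dark2"))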
From this text mining of the German dataset, we sampled the data to capture the words with the highest frequency of occurrence. The tm and wordcloud packages are crucial for the text mining. The RWeka package provides n-gram tokenization but is not applied in this report.
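As a pointer for a follow-up n-gram analysis, here is a minimal sketch of how RWeka could tokenize the cleaned corpus into trigrams (trigramTokenizer is a hypothetical helper name, not part of this report's pipeline):
## Hypothetical trigram tokenizer built on RWeka's NGramTokenizer
trigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3 <- TermDocumentMatrix(corpus, control = list(tokenize = trigramTokenizer))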