In the endeavour of creating a prediction algorithm from the given set of data, my first step is to download the dataset (Datasource).
The files are downloaded, unzipped and saved in my working directory.
Before getting into the details, it is worth taking an overall view of the downloaded files.
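If the raw archive is not yet on disk, a download step along these lines can be used (a minimal sketch; the URL is the commonly used Coursera-SwiftKey link and the destination layout is assumed, so adjust it if your source differs):
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"  # assumed source URL
zipfile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipfile)) {
  download.file(url, destfile = zipfile, mode = "wb")  # binary mode for the zip archive
}
if (!dir.exists("final")) {
  unzip(zipfile)  # extracts final/en_US (among others) into the working directory
}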
library(rJava)
library(knitr)
library(dplyr)
library(tm)
library(RWekajars)
library(RWeka)
library(ggplot2)
library(stringi)
library(NLP)
library(RColorBrewer)
library(wordcloud)
library(ngram)
library(slam)
library(htmlTable)
library(xtable)
Path1<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.blogs.txt"
Path2<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.news.txt"
Path3<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.twitter.txt"
blogs <- readLines(Path1, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(Path2, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(Path3, encoding = "UTF-8", skipNul = TRUE)
Stats_Rawfile<-data.frame(FileName=c("en_US.blogs","en_US.news","en_US.twitter"),
FileSizeinMB=c(file.info(Path1)$size/1024^2,
file.info(Path2)$size/1024^2,
file.info(Path3)$size/1024^2),
NumberofLines=sapply(list(blogs,news,twitter),length),
Numberofwords=sapply(list(blogs,news,twitter),stri_stats_latex)[4,])
kable(Stats_Rawfile,caption="Stats of Raw File")
| FileName | FileSizeinMB | NumberofLines | Numberofwords |
|---|---|---|---|
| en_US.blogs | 200.4242 | 899288 | 37570839 |
| en_US.news | 196.2775 | 77259 | 2651432 |
| en_US.twitter | 159.3641 | 2360148 | 30451170 |
Since the files are huge both in memory and in number of lines, it is necessary to draw a sample from each of the three files and to build a corpus by combining the three samples. Through several iterations I found that a sample of 0.2% of the original length gives a good balance between coverage and processing time.
Blogs_subset <- sample(blogs, length(blogs) * 0.002)
News_subset <- sample(news, length(news) * 0.002)
twitter_subset <- sample(twitter, length(twitter) * 0.002)
total_subset<-c(Blogs_subset,News_subset,twitter_subset)
stats_all<-data.frame(FileName=c("Sample_blog","sample_news","sample_twitter","sample_total"),
FileSizeinMB=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset), function(x){format(object.size(x),"MB")}),
NumberofLines=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset),length),
Numberofwords=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset),stri_stats_latex)[4,])
kable(stats_all,caption="Stats of all sample files")
| FileName | FileSizeinMB | NumberofLines | Numberofwords |
|---|---|---|---|
| Sample_blog | 0.5 Mb | 1798 | 77074 |
| sample_news | 0 Mb | 154 | 5018 |
| sample_twitter | 0.6 Mb | 4720 | 60378 |
| sample_total | 1.1 Mb | 6672 | 142470 |
After reducing the size, it is time to clean the data: converting to lower case and removing punctuation, numbers, non-ASCII characters and extra white space. Profanity filtering can be added at the same stage; a sketch follows the corpus construction below.
Blogs_subset <- iconv(Blogs_subset, "UTF-8", "ASCII", sub="")
News_subset <- iconv(News_subset, "UTF-8", "ASCII", sub="")
twitter_subset <- iconv(twitter_subset, "UTF-8", "ASCII", sub="")
total_subset<-c(Blogs_subset,News_subset,twitter_subset)
building.corpus <- function (x) {
  corpus <- VCorpus(VectorSource(x))
  corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case while keeping PlainTextDocument objects
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
corpus.final <- building.corpus(total_subset)
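The cleaning above covers punctuation, numbers, non-ASCII characters and white space; profanity is not yet filtered. A minimal sketch of how that could be added with tm's removeWords, assuming a hypothetical plain-text list badwords.txt (one term per line, not part of this report's data):
profanity <- readLines("badwords.txt", encoding = "UTF-8", skipNul = TRUE)  # hypothetical word list
corpus.final <- tm_map(corpus.final, removeWords, profanity)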
Now we move on to tokenization, building unigram, bigram and trigram term-document matrices with RWeka's NGramTokenizer; a generalised helper that avoids repeating the same steps three times is sketched after the trigram table.
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
corpus.uni.matrix <- TermDocumentMatrix(corpus.final, control = list(tokenize = uni_tokenizer))
corpus.uni <- findFreqTerms(corpus.uni.matrix,lowfreq = 10)
corpus.uni.f <- rowSums(as.matrix(corpus.uni.matrix[corpus.uni,]))
corpus.uni.f <- data.frame(word=names(corpus.uni.f), frequency=corpus.uni.f)
kable(head(corpus.uni.f),caption = "Only one word")
| | word | frequency |
|---|---|---|
| ability | ability | 13 |
| able | able | 43 |
| about | about | 365 |
| above | above | 23 |
| absolutely | absolutely | 16 |
| according | according | 14 |
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus.bi.matrix<- TermDocumentMatrix(corpus.final, control = list(tokenize = bi_tokenizer))
corpus.bi <- findFreqTerms(corpus.bi.matrix,lowfreq=10)
corpus.bi.f <- rowSums(as.matrix(corpus.bi.matrix[corpus.bi,]))
corpus.bi.f <- data.frame(word=names(corpus.bi.f), frequency=corpus.bi.f)
kable(head(corpus.bi.f),caption = "Two words")
| | word | frequency |
|---|---|---|
| a bad | a bad | 11 |
| a beautiful | a beautiful | 14 |
| a better | a better | 10 |
| a big | a big | 21 |
| a bit | a bit | 37 |
| a book | a book | 16 |
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus.tri.matrix <- TermDocumentMatrix(corpus.final, control = list(tokenize = tri_tokenizer))
corpus.tri <- findFreqTerms(corpus.tri.matrix,lowfreq=10)
corpus.tri.f <- rowSums(as.matrix(corpus.tri.matrix[corpus.tri,]))
corpus.tri.f <- data.frame(word=names(corpus.tri.f), frequency=corpus.tri.f)
kable(head(corpus.tri.f),caption = "Three words")
| | word | frequency |
|---|---|---|
| a bit of | a bit of | 11 |
| a couple of | a couple of | 15 |
| a lot of | a lot of | 50 |
| and i will | and i will | 10 |
| and of course | and of course | 10 |
| as long as | as long as | 10 |
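The unigram, bigram and trigram blocks above repeat the same steps and differ only in n. A small helper along these lines would remove the duplication (a sketch built from the same tm and RWeka calls; the name ngram.freq is chosen here for illustration):
ngram.freq <- function(corpus, n, lowfreq = 10) {
  tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
  terms <- findFreqTerms(tdm, lowfreq = lowfreq)        # keep n-grams seen at least lowfreq times
  freqs <- rowSums(as.matrix(tdm[terms, ]))
  data.frame(word = names(freqs), frequency = freqs)
}
# e.g. ngram.freq(corpus.final, 3) reproduces the trigram table above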
With the n-gram frequency tables in hand, we plot the 20 most frequent unigrams, bigrams and trigrams.
plot.n.grams <- function(data, title, num) {
  df2 <- data[order(-data$frequency), ][1:num, ]      # keep the num most frequent n-grams
  df2$word <- factor(df2$word, levels = df2$word)     # preserve the frequency ordering on the x-axis
  ggplot(df2, aes(x = word, y = frequency)) +
    geom_bar(stat = "identity", fill = "darkgreen", colour = "black") +
    labs(title = title, x = "Words", y = "Count") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
U<-plot.n.grams(corpus.uni.f,"Unigrams",20)
B<-plot.n.grams(corpus.bi.f,"Bigrams",20)
Tr<-plot.n.grams(corpus.tri.f,"Trigrams",20)
gridExtra::grid.arrange(U, B, Tr, ncol = 3)
The same information can alternatively be viewed as word clouds.
corpus.cloud<-list(corpus.tri.f,corpus.bi.f,corpus.uni.f)
par(mfrow=c(1, 3))
for (i in 1:3) {
wordcloud(corpus.cloud[[i]]$word, corpus.cloud[[i]]$frequency, scale = c(3,1), max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
}
Now that the groundwork is done, the next logical step is to build the prediction algorithm and deploy it as a Shiny app.
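As an illustration of the planned approach only (a sketch assuming at least one input word; predict.next and its simple highest-frequency back-off are choices made here, not the final algorithm), next-word prediction over the tables built above could look like this:
predict.next <- function(phrase) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  last.word <- function(ngram) tail(strsplit(ngram, " ")[[1]], 1)
  # Try the trigram table first: match on the last two words of the input
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- corpus.tri.f[grepl(paste0("^", prefix, " "), as.character(corpus.tri.f$word)), ]
    if (nrow(hits) > 0) return(last.word(as.character(hits$word[which.max(hits$frequency)])))
  }
  # Back off to the bigram table: match on the last word only
  hits <- corpus.bi.f[grepl(paste0("^", words[n], " "), as.character(corpus.bi.f$word)), ]
  if (nrow(hits) > 0) return(last.word(as.character(hits$word[which.max(hits$frequency)])))
  # Final fallback: the most frequent unigram overall
  as.character(corpus.uni.f$word[which.max(corpus.uni.f$frequency)])
}
# e.g. predict.next("a couple") is likely to return "of" given the trigram counts above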