In the endeavour of creating a prediction algorithm from the given set of data, my first step is to download the dataset (Datasource).
The files are downloaded, unzipped and saved in my working directory.
Before getting into the details, it is worth taking an overall view of the downloaded files.
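If the raw archive is not yet on disk, a download step along these lines can be used (a minimal sketch; the URL is the commonly used Coursera-SwiftKey link and the destination layout is assumed, so adjust it if your source differs):
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"  # assumed source URL
zipfile <- "Coursera-SwiftKey.zip"
if (!file.exists(zipfile)) {
  download.file(url, destfile = zipfile, mode = "wb")  # binary mode for the zip archive
}
if (!dir.exists("final")) {
  unzip(zipfile)  # extracts final/en_US (among others) into the working directory
}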
library(rJava)
library(knitr)
library(dplyr)
library(tm)
library(RWekajars)
library(RWeka)
library(ggplot2)
library(stringi)
library(NLP)
library(RColorBrewer)
library(wordcloud)
library(ngram)
library(slam)
library(htmlTable)
library(xtable)
Path1<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.blogs.txt"
Path2<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.news.txt"
Path3<-"C:/Users/MMFL/Desktop/Coursera/10.Capstone/final/en_US/en_US.twitter.txt"
blogs <- readLines(Path1, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(Path2, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(Path3, encoding = "UTF-8", skipNul = TRUE)
Stats_Rawfile<-data.frame(FileName=c("en_US.blogs","en_US.news","en_US.twitter"),
FileSizeinMB=c(file.info(Path1)$size/1024^2,
file.info(Path2)$size/1024^2,
file.info(Path3)$size/1024^2),
NumberofLines=sapply(list(blogs,news,twitter),length),
Numberofwords=sapply(list(blogs,news,twitter),stri_stats_latex)[4,])
kable(Stats_Rawfile,caption="Stats of Raw File")
| FileName | FileSizeinMB | NumberofLines | Numberofwords |
|---|---|---|---|
| en_US.blogs | 200.4242 | 899288 | 37570839 |
| en_US.news | 196.2775 | 77259 | 2651432 |
| en_US.twitter | 159.3641 | 2360148 | 30451170 |
Since the files are huge both in memory and in number of lines, it is necessary to draw a sample from each of the three files and to build a corpus by combining the three samples. Through several iterations I found that a sample of 0.2% of the original length gives a good balance between coverage and processing time.
Blogs_subset <- sample(blogs, length(blogs) * 0.002)
News_subset <- sample(news, length(news) * 0.002)
twitter_subset <- sample(twitter, length(twitter) * 0.002)
total_subset<-c(Blogs_subset,News_subset,twitter_subset)
stats_all<-data.frame(FileName=c("Sample_blog","sample_news","sample_twitter","sample_total"),
FileSizeinMB=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset), function(x){format(object.size(x),"MB")}),
NumberofLines=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset),length),
Numberofwords=sapply(list(Blogs_subset,News_subset,twitter_subset,total_subset),stri_stats_latex)[4,])
kable(stats_all,caption="Stats of all sample files")
| FileName | FileSizeinMB | NumberofLines | Numberofwords |
|---|---|---|---|
| Sample_blog | 0.5 Mb | 1798 | 77074 |
| sample_news | 0 Mb | 154 | 5018 |
| sample_twitter | 0.6 Mb | 4720 | 60378 |
| sample_total | 1.1 Mb | 6672 | 142470 |
After reducing the size, it is time to clean the data: converting to lower case and removing punctuation, numbers, non-ASCII characters and extra white space. Profanity filtering can be added at the same stage; a sketch follows the corpus construction below.
Blogs_subset <- iconv(Blogs_subset, "UTF-8", "ASCII", sub="")
News_subset <- iconv(News_subset, "UTF-8", "ASCII", sub="")
twitter_subset <- iconv(twitter_subset, "UTF-8", "ASCII", sub="")
total_subset<-c(Blogs_subset,News_subset,twitter_subset)
building.corpus <- function (x) {
  corpus <- VCorpus(VectorSource(x))
  corpus <- tm_map(corpus, content_transformer(tolower))  # lower-case while keeping PlainTextDocument objects
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus
}
corpus.final <- building.corpus(total_subset)
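The cleaning above covers punctuation, numbers, non-ASCII characters and white space; profanity is not yet filtered. A minimal sketch of how that could be added with tm's removeWords, assuming a hypothetical plain-text list badwords.txt (one term per line, not part of this report's data):
profanity <- readLines("badwords.txt", encoding = "UTF-8", skipNul = TRUE)  # hypothetical word list
corpus.final <- tm_map(corpus.final, removeWords, profanity)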
Now we move on to tokenization, building unigram, bigram and trigram term-document matrices with RWeka's NGramTokenizer; a generalised helper that avoids repeating the same steps three times is sketched after the trigram table.
uni_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
corpus.uni.matrix <- TermDocumentMatrix(corpus.final, control = list(tokenize = uni_tokenizer))
corpus.uni <- findFreqTerms(corpus.uni.matrix,lowfreq = 10)
corpus.uni.f <- rowSums(as.matrix(corpus.uni.matrix[corpus.uni,]))
corpus.uni.f <- data.frame(word=names(corpus.uni.f), frequency=corpus.uni.f)
kable(head(corpus.uni.f),caption = "Only one word")
| | word | frequency |
|---|---|---|
| ability | ability | 13 |
| able | able | 43 |
| about | about | 365 |
| above | above | 23 |
| absolutely | absolutely | 16 |
| according | according | 14 |
bi_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
corpus.bi.matrix<- TermDocumentMatrix(corpus.final, control = list(tokenize = bi_tokenizer))
corpus.bi <- findFreqTerms(corpus.bi.matrix,lowfreq=10)
corpus.bi.f <- rowSums(as.matrix(corpus.bi.matrix[corpus.bi,]))
corpus.bi.f <- data.frame(word=names(corpus.bi.f), frequency=corpus.bi.f)
kable(head(corpus.bi.f),caption = "Two words")
| | word | frequency |
|---|---|---|
| a bad | a bad | 11 |
| a beautiful | a beautiful | 14 |
| a better | a better | 10 |
| a big | a big | 21 |
| a bit | a bit | 37 |
| a book | a book | 16 |
tri_tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
corpus.tri.matrix <- TermDocumentMatrix(corpus.final, control = list(tokenize = tri_tokenizer))
corpus.tri <- findFreqTerms(corpus.tri.matrix,lowfreq=10)
corpus.tri.f <- rowSums(as.matrix(corpus.tri.matrix[corpus.tri,]))
corpus.tri.f <- data.frame(word=names(corpus.tri.f), frequency=corpus.tri.f)
kable(head(corpus.tri.f),caption = "Three words")
| | word | frequency |
|---|---|---|
| a bit of | a bit of | 11 |
| a couple of | a couple of | 15 |
| a lot of | a lot of | 50 |
| and i will | and i will | 10 |
| and of course | and of course | 10 |
| as long as | as long as | 10 |
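The unigram, bigram and trigram blocks above repeat the same steps and differ only in n. A small helper along these lines would remove the duplication (a sketch built from the same tm and RWeka calls; the name ngram.freq is chosen here for illustration):
ngram.freq <- function(corpus, n, lowfreq = 10) {
  tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
  tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
  terms <- findFreqTerms(tdm, lowfreq = lowfreq)        # keep n-grams seen at least lowfreq times
  freqs <- rowSums(as.matrix(tdm[terms, ]))
  data.frame(word = names(freqs), frequency = freqs)
}
# e.g. ngram.freq(corpus.final, 3) reproduces the trigram table above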
With the n-gram frequency tables in hand, we plot the 20 most frequent unigrams, bigrams and trigrams.
plot.n.grams <- function(data, title, num) {
  df2 <- data[order(-data$frequency), ][1:num, ]      # keep the num most frequent n-grams
  df2$word <- factor(df2$word, levels = df2$word)     # preserve the frequency ordering on the x-axis
  ggplot(df2, aes(x = word, y = frequency)) +
    geom_bar(stat = "identity", fill = "darkgreen", colour = "black") +
    labs(title = title, x = "Words", y = "Count") +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}
U<-plot.n.grams(corpus.uni.f,"Unigrams",20)
B<-plot.n.grams(corpus.bi.f,"Bigrams",20)
Tr<-plot.n.grams(corpus.tri.f,"Trigrams",20)
gridExtra::grid.arrange(U, B, Tr, ncol = 3)
The same information can alternatively be viewed as word clouds.
corpus.cloud<-list(corpus.tri.f,corpus.bi.f,corpus.uni.f)
par(mfrow=c(1, 3))
for (i in 1:3) {
wordcloud(corpus.cloud[[i]]$word, corpus.cloud[[i]]$frequency, scale = c(3,1), max.words=100, random.order=FALSE, rot.per=0, fixed.asp = TRUE, use.r.layout = FALSE, colors=brewer.pal(8, "Dark2"))
}
Now that the groundwork is done, the next logical step is to build the prediction algorithm and deploy it as a Shiny app.
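As an illustration of the planned approach only (a sketch assuming at least one input word; predict.next and its simple highest-frequency back-off are choices made here, not the final algorithm), next-word prediction over the tables built above could look like this:
predict.next <- function(phrase) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  last.word <- function(ngram) tail(strsplit(ngram, " ")[[1]], 1)
  # Try the trigram table first: match on the last two words of the input
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n])
    hits <- corpus.tri.f[grepl(paste0("^", prefix, " "), as.character(corpus.tri.f$word)), ]
    if (nrow(hits) > 0) return(last.word(as.character(hits$word[which.max(hits$frequency)])))
  }
  # Back off to the bigram table: match on the last word only
  hits <- corpus.bi.f[grepl(paste0("^", words[n], " "), as.character(corpus.bi.f$word)), ]
  if (nrow(hits) > 0) return(last.word(as.character(hits$word[which.max(hits$frequency)])))
  # Final fallback: the most frequent unigram overall
  as.character(corpus.uni.f$word[which.max(corpus.uni.f$frequency)])
}
# e.g. predict.next("a couple") is likely to return "of" given the trigram counts above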