This is the Week 2 Milestone Report for the Coursera Data Science Capstone course. The objective for week 2 is to perform an Exploratory Data Analysis of the text data. This work will later be extended into a word prediction algorithm that predicts the most likely next word as a user types. The report covers loading and summarizing the raw data, sampling and cleaning it into a corpus, and exploring word and n-gram frequencies.
setwd("F:/R-Programming/Capstone/Week2/Dataset")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")
#library (knitr)
#library (dplyr)
#library (doParallel)
library (stringi)
library (tm)
## Warning: package 'tm' was built under R version 3.6.1
## Loading required package: NLP
library (ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library (wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.1
## Loading required package: RColorBrewer
library (wordcloud2)
## Warning: package 'wordcloud2' was built under R version 3.6.1
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.6.1
library(rJava)
summary <- data.frame('File' = c("Blogs","News","Twitter"),
"File Size" = sapply(list(blogs, news, twitter), function(x){format(object.size(x),"MB")}),
format(t(rbind(sapply(list(blogs, news, twitter),stri_stats_general),
WordCount = sapply(list(blogs, news, twitter),stri_stats_latex)[4,])), big.mark=","))
summary
##      File File.Size     Lines LinesNEmpty       Chars CharsNWhite  WordCount
## 1   Blogs  255.4 Mb   899,288     899,288 206,824,382 170,389,539 37,570,839
## 2    News   19.8 Mb    77,259      77,259  15,639,408  13,072,698  2,651,432
## 3 Twitter    319 Mb 2,360,148   2,360,148 162,096,031 134,082,634 30,451,128
The datasets are too large to work with in full, so we proceed with a random 2% sample of each file. We then clean the sampled data and convert it into a corpus.
set.seed(1000) # Make subsampling reproducible
sample_size <- 0.02 # Create a Subset of the Dataset of only 2%
blogs_index <- sample(seq_len(length(blogs)),length(blogs)*sample_size)
news_index <- sample(seq_len(length(news)),length(news)*sample_size)
twitter_index <- sample(seq_len(length(twitter)),length(twitter)*sample_size)
blogs_sub <- blogs[blogs_index]
news_sub <- news[news_index]
twitter_sub <- twitter[twitter_index]
# Make corpus of all 3 subsets.
text<- c(blogs_sub, news_sub, twitter_sub)
doc_ids <- seq_along(text) # one doc_id per sampled line so every row becomes its own document
df <- data.frame(doc_id = doc_ids, text = text, stringsAsFactors = FALSE)
#df_corpus <- Corpus(DataframeSource(df))
#corpus <- Corpus(VectorSource(c(blogs_sub, news_sub, twitter_sub)), readerControl=list(reader=readPlain,language="en"))
corpus <- Corpus(DataframeSource(df), readerControl=list(reader=readPlain,language="en"))
# Remove non-ASCII characters
corpus <- VCorpus(VectorSource(sapply(corpus, function(row) iconv(row, "latin1", "ASCII", sub=""))))
# Remove punctuation
corpus <- tm_map(corpus, removePunctuation)
# Remove extra white spaces
corpus <- tm_map(corpus, stripWhitespace)
# Convert to lowercase
corpus <- tm_map(corpus, content_transformer(tolower))
# Remove numbers
corpus <- tm_map(corpus, removeNumbers)
# Plain text
corpus <- tm_map(corpus, PlainTextDocument)
Plotting Sampled Corpus Data with Word Clouds
We plot individual word clouds for the Blogs, News, and Twitter samples. Seeing the most frequent words in each source helps us predict different words depending on the context.
# Set random seed for reproducibility
set.seed(5000)
# Set Plotting in 1 row 3 columns
par(mfrow=c(1, 3))
wordcloud(blogs_sub, max.words=20, random.order = FALSE, scale=c(4,1),
rot.per=0.2, use.r.layout=FALSE, colors=brewer.pal(12,"Paired"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
title ("Word Cloud - US English Blogs")
wordcloud(news_sub, max.words=20, random.order = FALSE, scale=c(4,1),
rot.per=0.2, use.r.layout=FALSE, colors=brewer.pal(12,"Paired"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
title ("Word Cloud - US English News")
wordcloud(twitter_sub, max.words=20, random.order = FALSE, scale=c(4,1),
rot.per=0.2, use.r.layout=FALSE, colors=brewer.pal(12,"Paired"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in wordcloud(twitter_sub, max.words = 20, random.order = FALSE, :
## today could not be fit on page. It will not be plotted.
title ("Word Cloud - US English Twitter")
The N-gram model is fundamental to Natural Language Processing (NLP). It is used to compute P(w|h), the probability of a word w given some history h. The N-gram representation of a text lists all N-tuples of words that appear in it. The simplest case is the unigram, which is based on individual words; the bigram is based on pairs of words, and so on. The candidate next words and their counts can be stored in TDM (Term-Document Matrix) format (a sketch of such a matrix follows the n-gram plots below).
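To make P(w|h) concrete, the short sketch below estimates the maximum-likelihood probability of a word w following a one-word history h as count(h w) / count(h). This is only an illustration: next_word_prob is a hypothetical helper, not part of the final prediction algorithm, and it recomputes the full token tables on every call, which would be far too slow for real use.
# Illustrative sketch: maximum-likelihood estimate of P(w | h) for a one-word history h,
# i.e. count(h w) / count(h), built from RWeka unigram and bigram counts.
next_word_prob <- function(text, h, w) {
  uni <- table(NGramTokenizer(text, Weka_control(min = 1, max = 1)))
  bi <- table(NGramTokenizer(text, Weka_control(min = 2, max = 2)))
  if (is.na(uni[h]) || uni[h] == 0) return(0) # unseen history
  count_bi <- ifelse(is.na(bi[paste(h, w)]), 0, bi[paste(h, w)])
  as.numeric(count_bi / uni[h])
}
# Example call (the result depends on the sampled data): how likely is "you" after "thank"?
next_word_prob(twitter_sub, "thank", "you")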
.jinit(parameters = "-Xmx128g")
## [1] 0
plot.Grams <- function (x = blogs_sub, subTitle = "Blogs", N = 10)
{
# Use RWeka to get unigram token
Tokenizer1 <- RWeka::NGramTokenizer(x, Weka_control(min = 1, max = 1))
Gram.1 <- data.frame(table(Tokenizer1))
Gram.1 <- Gram.1[order(Gram.1$Freq, decreasing = TRUE),]
colnames(Gram.1) <- c("Word", "Freq")
Gram.1 <- head(Gram.1, N)
g1 <- ggplot(Gram.1, aes(x=reorder(Word, Freq),y=Freq)) +
geom_bar(stat="identity", fill="green") +
ggtitle(paste("Unigrams", "-", subTitle)) +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=90, hjust=1))
# Use RWeka to get bigram token
Tokenizer2 <- RWeka::NGramTokenizer(x,
Weka_control(min = 2,
max = 2,
delimiters = " \\r\\n\\t.,;:\"()?!"))
Gram.2 <- data.frame(table(Tokenizer2))
Gram.2 <- Gram.2[order(Gram.2$Freq, decreasing = TRUE),]
colnames(Gram.2) <- c("Word", "Freq")
Gram.2 <- head(Gram.2, N)
g2 <- ggplot(Gram.2, aes(x=reorder(Word, Freq),y=Freq)) +
geom_bar(stat="identity", fill="blue") +
ggtitle(paste("Bigrams", "-", subTitle)) +
xlab("Bigrams") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=90, hjust=1))
# Use RWeka to get trigram token
Tokenizer3 <- RWeka::NGramTokenizer(x,
Weka_control(min = 3, max = 3,
delimiters = " \\r\\n\\t.,;:\"()?!"))
Gram.3 <- data.frame(table(Tokenizer3))
Gram.3 <- Gram.3[order(Gram.3$Freq, decreasing = TRUE),]
colnames(Gram.3) <- c("Word", "Freq")
Gram.3 <- head(Gram.3, N)
g3 <- ggplot(Gram.3, aes(x=reorder(Word, Freq),y=Freq)) +
geom_bar(stat="identity", fill="darkgreen") +
ggtitle(paste("Trigrams", "-", subTitle)) +
xlab("Trigrams") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=90, hjust=1))
# Put three plots into 1 row 3 columns
gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}
plot.Grams(x = blogs_sub, subTitle = "Blogs", N = 10)
plot.Grams(x = news_sub, subTitle = "News", N = 12)
plot.Grams(x = twitter_sub, subTitle = "Twitter", N = 12)
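As mentioned earlier, n-gram counts can also be stored in TDM (Term-Document Matrix) format. The sketch below is a minimal illustration that builds a bigram TDM directly from the combined sample (the text vector created above) rather than the cleaned corpus; the lowfreq threshold of 50 is an arbitrary choice for demonstration.
# Sketch: store bigram counts in a Term-Document Matrix using an RWeka tokenizer
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
tdm_bigram <- TermDocumentMatrix(VCorpus(VectorSource(text)),
                                 control = list(tokenize = BigramTokenizer))
# Bigrams that appear at least 50 times in the sample
findFreqTerms(tdm_bigram, lowfreq = 50)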
# cleaning up the environment
rm (blogs_sub)
rm (news_sub)
rm (twitter_sub)
rm (corpus)