Introduction

This is the Week 2 Milestone Report for the Coursera Data Science Capstone course. The objective for Week 2 is to perform an exploratory data analysis, which will later be extended into a word prediction algorithm that suggests the most likely next word as a user types. The Milestone Report is divided into the following main sections.

Reading the US_English Dataset

setwd("F:/R-Programming/Capstone/Week2/Dataset")
blogs <- readLines("en_US.blogs.txt", warn = FALSE, encoding = "UTF-8")
news <- readLines("en_US.news.txt", warn = FALSE, encoding = "UTF-8")
twitter <- readLines("en_US.twitter.txt", warn = FALSE, encoding = "UTF-8")

Summary of the Data Sets

#library (knitr)
#library (dplyr)
#library (doParallel)
library (stringi)
library (tm)
## Warning: package 'tm' was built under R version 3.6.1
## Loading required package: NLP
library (ggplot2)
## Warning: package 'ggplot2' was built under R version 3.6.1
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library (wordcloud)
## Warning: package 'wordcloud' was built under R version 3.6.1
## Loading required package: RColorBrewer
library (wordcloud2) 
## Warning: package 'wordcloud2' was built under R version 3.6.1
library(RWeka)
## Warning: package 'RWeka' was built under R version 3.6.1
library(rJava)

summary <- data.frame(
  'File' = c("Blogs", "News", "Twitter"),
  "File Size" = sapply(list(blogs, news, twitter),
                       function(x) { format(object.size(x), "MB") }),
  format(t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
                 WordCount = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ])),
         big.mark = ","))

summary
##      File File.Size       Lines LinesNEmpty       Chars CharsNWhite
## 1   Blogs  255.4 Mb     899,288     899,288 206,824,382 170,389,539
## 2    News   19.8 Mb      77,259      77,259  15,639,408  13,072,698
## 3 Twitter    319 Mb   2,360,148   2,360,148 162,096,031 134,082,634
##     WordCount
## 1  37,570,839
## 2   2,651,432
## 3  30,451,128

Data Cleaning and Selection of the Corpus

The full datasets are too big to process efficiently, so we proceed with a random subsample of 2% of each source. We then clean the sampled data and convert it into a corpus.

set.seed(1000) # Make subsampling reproducible
sample_size <- 0.02 # Create a Subset of the Dataset of only 2% 

blogs_index <- sample(seq_len(length(blogs)),length(blogs)*sample_size)
news_index <- sample(seq_len(length(news)),length(news)*sample_size)
twitter_index <- sample(seq_len(length(twitter)),length(twitter)*sample_size)

blogs_sub <- blogs[blogs_index]
news_sub <- news[news_index]
twitter_sub <- twitter[twitter_index]

# Make corpus of all 3 subsets.
text<- c(blogs_sub, news_sub, twitter_sub)
doc_ids <- seq_along(text)   # one unique id per document
df <- data.frame(doc_id = doc_ids, text = text, stringsAsFactors = FALSE)
#df_corpus <- Corpus(DataframeSource(df))

#corpus <- Corpus(VectorSource(c(blogs_sub, news_sub, twitter_sub)), readerControl=list(reader=readPlain,language="en"))  

corpus <- Corpus(DataframeSource(df))  # DataframeSource supplies its own reader; documents default to English


# Remove non ASCII characters
corpus <- VCorpus(VectorSource(sapply(corpus, function(row) iconv(row, "latin1", "ASCII", sub=""))))

# Remove punctuation
corpus <- tm_map(corpus, removePunctuation) 
# Remove extra white spaces
corpus <- tm_map(corpus, stripWhitespace) 
# Convert to lowercase
corpus <- tm_map(corpus, content_transformer(tolower)) 
# Remove numbers
corpus <- tm_map(corpus, removeNumbers) 
# Plain text
corpus <- tm_map(corpus, PlainTextDocument) 
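
As a quick sanity check (a minimal sketch, not part of the report's pipeline), the cleaned corpus can also be summarised as a term-document matrix, the same TDM representation referred to in the N-Grams section below; the frequency threshold of 100 used here is arbitrary.

# Optional: summarise the cleaned corpus as a term-document matrix
tdm <- TermDocumentMatrix(corpus)
findFreqTerms(tdm, lowfreq = 100)   # terms occurring at least 100 times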

Plotting Sampled Corpus Data with Wordcloud

We plot individual word clouds for the Blogs, News, and Twitter samples. Seeing which words dominate each source will help the prediction algorithm suggest different words depending on the context.

# Set random seed for reproducibility
set.seed(5000)
# Set Plotting in 1 row 3 columns
par(mfrow=c(1, 3))

wordcloud(blogs_sub, max.words=20, random.order = FALSE, scale=c(4,1), 
          rot.per=0.2, use.r.layout=FALSE, colors=brewer.pal(12,"Paired"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
title ("Word Cloud - US English Blogs")
wordcloud(news_sub, max.words=20, random.order = FALSE, scale=c(4,1), 
          rot.per=0.2, use.r.layout=FALSE, colors=brewer.pal(12,"Paired"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
title ("Word Cloud - US English News")
wordcloud(twitter_sub, max.words=20, random.order = FALSE, scale=c(4,1), 
          rot.per=0.2, use.r.layout=FALSE, colors=brewer.pal(12,"Paired"))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents

## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in wordcloud(twitter_sub, max.words = 20, random.order = FALSE, :
## today could not be fit on page. It will not be plotted.
title ("Word Cloud - US English Twitter")

N-Grams

The N-gram algorithm is widely used in Natural Language Processing (NLP). It is used to compute P(w|h), the probability of a word w given some history h. The N-gram representation of a text lists all N-tuples of words that appear: the simplest case is the unigram, based on individual words; the bigram is based on pairs of words, and so on. The next-word candidates produced by the N-gram algorithm are stored in TDM (Term-Document Matrix) format.
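
For example, the maximum-likelihood estimate of P(w|h) for a bigram is simply count(h w) / count(h). The sketch below is illustrative only and not part of the report's pipeline; it uses the blogs subsample and the same RWeka tokenizer used later in this section.

# Illustrative sketch: maximum-likelihood estimate of P(w | h) for bigrams,
# i.e. count(h w) / count(h), computed on the blogs subsample.
uni <- table(NGramTokenizer(blogs_sub, Weka_control(min = 1, max = 1)))
bi  <- table(NGramTokenizer(blogs_sub, Weka_control(min = 2, max = 2)))

bigram_prob <- function(h, w) {
  key <- paste(h, w)
  if (is.na(bi[key]) || is.na(uni[h])) return(0)   # unseen history or bigram
  as.numeric(bi[key]) / as.numeric(uni[h])
}

bigram_prob("of", "the")   # P("the" | "of") in the blogs sample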

.jinit(parameters = "-Xmx128g")
## [1] 0
plot.Grams <- function (x = blogs_sub, subTitle = "Blogs", N = 10)
{
  # Use RWeka to get unigram token
  Tokenizer1 <- RWeka::NGramTokenizer(x, Weka_control(min = 1, max = 1))
  Gram.1 <- data.frame(table(Tokenizer1))
  Gram.1 <- Gram.1[order(Gram.1$Freq, decreasing = TRUE),]
  colnames(Gram.1) <- c("Word", "Freq")
  Gram.1 <- head(Gram.1, N) 
  g1 <- ggplot(Gram.1, aes(x=reorder(Word, Freq),y=Freq)) + 
        geom_bar(stat="identity", fill="green") + 
        ggtitle(paste("Unigrams", "-", subTitle)) + 
        xlab("Unigrams") + ylab("Frequency") + 
        theme(axis.text.x=element_text(angle=90, hjust=1))

  # Use RWeka to get bigram token
  Tokenizer2 <- RWeka::NGramTokenizer(x, 
                                      Weka_control(min = 2, 
                                                   max = 2, 
                                                   delimiters = " \\r\\n\\t.,;:\"()?!"))
  Gram.2 <- data.frame(table(Tokenizer2))
  Gram.2 <- Gram.2[order(Gram.2$Freq, decreasing = TRUE),]
  colnames(Gram.2) <- c("Word", "Freq")
  Gram.2 <- head(Gram.2, N) 
  g2 <- ggplot(Gram.2, aes(x=reorder(Word, Freq),y=Freq)) + 
          geom_bar(stat="identity", fill="blue") + 
          ggtitle(paste("Bigrams", "-", subTitle)) + 
          xlab("Bigrams") + ylab("Frequency") + 
          theme(axis.text.x=element_text(angle=90, hjust=1))

  # Use RWeka to get trigram token
  Tokenizer3 <- RWeka::NGramTokenizer(x, 
                                    Weka_control(min = 3, max = 3, 
                                                 delimiters = " \\r\\n\\t.,;:\"()?!"))
  Gram.3 <- data.frame(table(Tokenizer3))
  Gram.3 <- Gram.3[order(Gram.3$Freq, decreasing = TRUE),]
  colnames(Gram.3) <- c("Word", "Freq")
  Gram.3 <- head(Gram.3, N) 
  g3 <- ggplot(Gram.3, aes(x=reorder(Word, Freq),y=Freq)) + 
          geom_bar(stat="identity", fill="darkgreen") + 
          ggtitle(paste("Trigrams", "-", subTitle)) + 
          xlab("Trigrams") + ylab("Frequency") + 
          theme(axis.text.x=element_text(angle=90, hjust=1))

  # Put three plots into 1 row 3 columns
  gridExtra::grid.arrange(g1, g2, g3, ncol = 3)
}

plot.Grams(x = blogs_sub, subTitle = "Blogs", N = 10)

plot.Grams(x = news_sub, subTitle = "News", N = 12)

plot.Grams(x = twitter_sub, subTitle = "Twitter", N = 12)

# cleaning up the environment
rm (blogs_sub)
rm (news_sub)
rm (twitter_sub)
rm (corpus)
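
Looking ahead to the prediction algorithm itself (not implemented in this report), the n-gram counts explored above can drive a simple next-word lookup: given the last two words typed, return the third word of the most frequent matching trigram. The toy sketch below illustrates the idea on a made-up three-sentence corpus; a full model would typically back off to bigrams and unigrams when no trigram matches.

# Toy sketch of next-word prediction from trigram counts (illustrative only)
toy_text <- c("i want to go", "i want to eat", "i want to go home")
tri <- data.frame(table(NGramTokenizer(toy_text, Weka_control(min = 3, max = 3))))
colnames(tri) <- c("Trigram", "Freq")

predict_next <- function(last_two) {
  hits <- tri[grepl(paste0("^", last_two, " "), tri$Trigram), ]
  if (nrow(hits) == 0) return(NA_character_)        # no matching trigram
  best <- as.character(hits$Trigram[which.max(hits$Freq)])
  tail(strsplit(best, " ")[[1]], 1)                 # last word of the best trigram
}

predict_next("want to")   # returns "go" for this toy corpus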