Overview

This is the milestone report for week 2 of the Data Science Specialization Capstone. The overall goal of the project is to build a model that predicts the most likely next word when a user types a word or phrase.

The goal of this assignment is to understand the dataset and perform exploratory data analysis on three English text files: blogs, news, and twitter. This analysis will also help us choose a prediction strategy later in the project.

Set Working directory

setwd("F:/Knowledge/Coursera/Data Science Specalization/Capstone Project")

Download and Import Data

In this step, we download the zip file from the course URL, unzip its contents, and read the three text files into R.

# download the archive once, and extract it if the data directory is not yet present
if (!file.exists("Coursera-SwiftKey.zip")){
  download.file(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", destfile = "Coursera-SwiftKey.zip")
}
if (!dir.exists("final")){
  unzip("Coursera-SwiftKey.zip")
}
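
As a quick check (an addition to the original report), we can confirm that the archive unpacked into the final/en_US directory used below:

# should list en_US.blogs.txt, en_US.news.txt and en_US.twitter.txt
list.files("./final/en_US")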

# readLines() accepts a file path directly and handles the connection itself
blogs <- readLines("./final/en_US/en_US.blogs.txt",
                   encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
news <- readLines("./final/en_US/en_US.news.txt",
                  encoding = "UTF-8", skipNul = TRUE, warn = FALSE)
twitter <- readLines("./final/en_US/en_US.twitter.txt",
                     encoding = "UTF-8", skipNul = TRUE, warn = FALSE)

Generate Statistics

In this step, we generate file statistics such as object size, number of lines, number of characters, and word count for each file.

library(stringi)
file_stats <- data.frame(
  FileName = c("Blogs", "News", "Twitter"),
  FileSize = sapply(list(blogs, news, twitter),
                    function(x) { format(object.size(x), "MB") }),
  t(rbind(sapply(list(blogs, news, twitter), stri_stats_general),
          Words = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]))
)

file_stats
##   FileName FileSize   Lines LinesNEmpty     Chars CharsNWhite    Words
## 1    Blogs 255.4 Mb  899288      899288 206824382   170389539 37570839
## 2     News  19.8 Mb   77259       77259  15639408    13072698  2651432
## 3  Twitter   319 Mb 2360148     2360148 162096241   134082806 30451170
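
One derived figure worth noting (a small addition to the table above): the average number of words per line is roughly 42 for blogs, 34 for news, and 13 for twitter, so the twitter lines are much shorter than the blog and news entries.

# approximate average words per line, computed from the statistics above
round(file_stats$Words / file_stats$Lines, 1)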

Sampling

The statistics above show that the files are large (hundreds of megabytes and millions of lines). We therefore sample 1% of each file and check the size of the resulting VCorpus in memory.

set.seed(20210103)
sampleSize <- 0.01

blogs_sample <- sample(blogs, length(blogs) * sampleSize)
news_sample <- sample(news, length(news) * sampleSize)
twitter_sample <- sample(twitter, length(twitter) * sampleSize)

sampleData <- c(blogs_sample,news_sample,twitter_sample)
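
As a quick sanity check (added here, not part of the original analysis), the combined sample should cover roughly 1% of all lines:

# proportion of lines kept in the sample; should be close to sampleSize (0.01)
length(sampleData) / (length(blogs) + length(news) + length(twitter))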

Sample data statistics

We generate the same statistics for the sampled data to confirm that the object sizes are now manageable.

samplefile_stats <- data.frame(
  FileName = c("Blogs", "News", "Twitter", "Sample"),
  FileSize = sapply(list(blogs_sample, news_sample, twitter_sample, sampleData),
                    function(x) { format(object.size(x), "MB") }),
  t(rbind(sapply(list(blogs_sample, news_sample, twitter_sample, sampleData), stri_stats_general),
          Words = sapply(list(blogs_sample, news_sample, twitter_sample, sampleData), stri_stats_latex)[4, ]))
)
samplefile_stats
##   FileName FileSize Lines LinesNEmpty   Chars CharsNWhite  Words
## 1    Blogs   2.6 Mb  8992        8992 2070449     1705825 377231
## 2     News   0.2 Mb   772         772  147695      123507  24881
## 3  Twitter   3.2 Mb 23601       23601 1618885     1339006 304234
## 4   Sample     6 Mb 33365       33365 3837029     3168338 706346

Build corpus and clean the data

In this step, we build the corpus and then clean the data. The following cleanup is performed: convert all text to lower case, remove punctuation, remove numbers, strip extra whitespace, and convert the documents to plain text.

library(tm)
## Loading required package: NLP
library(pryr)
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## 
## Attaching package: 'pryr'
## The following object is masked from 'package:tm':
## 
##     inspect
sample_corpus <- VCorpus(VectorSource(sampleData))
object_size(sample_corpus)
## 77.8 MB
sample_corpus <- tm_map(sample_corpus,content_transformer(tolower)) # convert to lower case
sample_corpus <- tm_map(sample_corpus,removePunctuation)   # remove punctuation
sample_corpus <- tm_map(sample_corpus,removeNumbers)       # remove numbers
sample_corpus <- tm_map(sample_corpus,stripWhitespace)     # collapse extra whitespace
sample_corpus <- tm_map(sample_corpus,PlainTextDocument)   # store as plain text documents
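
One caveat, visible in the N-gram tables later in this report: removePunctuation() strips only ASCII punctuation by default, so tokens with Unicode punctuation (for example the curly quote in “the or the dash in – and) survive the cleaning. A possible refinement, assuming a version of tm recent enough to support the ucp argument, would be:

# optional, not applied in this report: also remove Unicode punctuation
# (curly quotes, en dashes, etc.) using Unicode character properties
sample_corpus <- tm_map(sample_corpus, removePunctuation, ucp = TRUE)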

Construction of N-grams

We now tokenize the cleaned data and construct the N-grams. We will build unigrams, bigrams, and trigrams.

library(RWeka)
unigram <- function(x) { NGramTokenizer(x, Weka_control(min=1, max=1))}
bigram <- function(x) { NGramTokenizer(x, Weka_control(min=2, max=2))}
trigram <- function(x) { NGramTokenizer(x, Weka_control(min=3, max=3))}

uni_mat <- TermDocumentMatrix(sample_corpus, control = list(tokenize = unigram))
bi_mat <- TermDocumentMatrix(sample_corpus, control = list(tokenize= bigram))
tri_mat <- TermDocumentMatrix(sample_corpus, control = list(tokenize= trigram))
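
As a small illustration (the example string here is made up), each tokenizer splits text into overlapping word sequences of the requested length:

# e.g. this should yield the bigrams "thanks for", "for the", "the follow"
bigram("thanks for the follow")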

Calculate Frequencies of N-grams

After the term-document matrices are formed, we calculate the frequency of each N-gram, keeping only terms that occur at least a minimum number of times.

# unigrams that occur at least 30 times
uni_ft <- findFreqTerms(uni_mat, lowfreq = 30)
uniFreq <- rowSums(as.matrix(uni_mat[uni_ft,]))
uniFreq <- data.frame(word = names(uniFreq), frequency = uniFreq)

# bigrams that occur at least 25 times
bi_ft <- findFreqTerms(bi_mat, lowfreq = 25)
biFreq <- rowSums(as.matrix(bi_mat[bi_ft,]))
biFreq <- data.frame(word = names(biFreq), frequency = biFreq)

# trigrams that occur at least 20 times
tri_ft <- findFreqTerms(tri_mat, lowfreq = 20)
triFreq <- rowSums(as.matrix(tri_mat[tri_ft,]))
triFreq <- data.frame(word = names(triFreq), frequency = triFreq)

Now we look at the words and their frequencies.

head(uniFreq)
##                  word frequency
## “the             “the        89
## ability       ability        43
## able             able       231
## about           about      2041
## above           above       112
## absolutely absolutely        70
head(biFreq)
##                    word frequency
## – and             – and        27
## – the             – the        26
## a bad             a bad        40
## a beautiful a beautiful        45
## a better       a better        48
## a big             a big        94
head(triFreq)
##                    word frequency
## a bit of       a bit of        45
## a bunch of   a bunch of        32
## a chance to a chance to        31
## a couple of a couple of        83
## a few days   a few days        22
## a few weeks a few weeks        20

For visualization purposes, we re-arrange the data in descending order of frequency.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
uniFreqDesc <- arrange(uniFreq,desc(frequency))
biFreqDesc <- arrange(biFreq,desc(frequency))
triFreqDesc <- arrange(triFreq,desc(frequency))

Visualize data

Now we generate bar charts of the top 20 unigrams, bigrams, and trigrams.

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
ggplot(data=uniFreqDesc[1:20,], aes(x=reorder(word,-frequency), y = frequency))+
          geom_bar(stat = "identity", fill="orange")+
          xlab("Words")+
          ylab("Frequency")+
          ggtitle("Top 20 Unigrams")+
          theme(plot.title = element_text(hjust = 0.5))+
          theme(axis.text.x=element_text(angle=45, hjust=1))

ggplot(data=biFreqDesc[1:20,], aes(x=reorder(word,-frequency), y = frequency))+
  geom_bar(stat = "identity", fill="green")+
  xlab("Words")+
  ylab("Frequency")+
  ggtitle("Top 20 Bigrams")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.text.x=element_text(angle=45, hjust=1))

ggplot(data=triFreqDesc[1:20,], aes(x=reorder(word,-frequency), y = frequency))+
  geom_bar(stat = "identity", fill="blue")+
  xlab("Words")+
  ylab("Frequency")+
  ggtitle("Top 20 Trigrams")+
  theme(plot.title = element_text(hjust = 0.5))+
  theme(axis.text.x=element_text(angle=45, hjust=1))

Wordcloud

We also generate word clouds as an alternative visualization; the most frequently used terms stand out clearly in a word cloud.

library(wordcloud)
## Loading required package: RColorBrewer
uniCloud <- wordcloud(uniFreq$word, uniFreq$frequency, scale = c(4, 0.5), 
                      max.words = 100, random.order = FALSE, rot.per = 0.25, 
                      use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))

biCloud <- wordcloud(biFreq$word, biFreq$frequency, scale = c(4, 0.5), 
                      max.words = 100, random.order = FALSE, rot.per = 0.25, 
                      use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))

triCloud <- wordcloud(triFreq$word, triFreq$frequency, scale = c(2, 0.5), 
                     max.words = 100, random.order = FALSE, rot.per = 0.25, 
                     use.r.layout = FALSE, colors = brewer.pal(8, "Dark2"))

Next Steps