Executive Summary

In this milestone report I demonstrate that I have downloaded the data and successfully loaded it for analysis, and I provide a basic summary of the corpus, including summary statistics and some plots visualizing the data.

Required Packages

library(tm)
## Loading required package: NLP
library(RColorBrewer)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(wordcloud)
library(stringi)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(pryr)
## Registered S3 method overwritten by 'pryr':
##   method      from
##   print.bytes Rcpp
## 
## Attaching package: 'pryr'
## The following object is masked from 'package:tm':
## 
##     inspect
library(RWeka)

Loading the Data

# Download and unzip the dataset if it is not already present
if (!file.exists("Coursera-SwiftKey.zip")){
        download.file(url = "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip", destfile = "Coursera-SwiftKey.zip")
        unzip("Coursera-SwiftKey.zip")
}

blogs <- readLines("./final/en_US/en_US.blogs.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
news <- readLines("./final/en_US/en_US.news.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines("./final/en_US/en_US.twitter.txt", warn = FALSE, encoding = "UTF-8", skipNul = TRUE)

Summary Statistics

stats <- data.frame(
        FileName = c("blogs", "news", "twitter"),
        FileSize = sapply(list(blogs, news, twitter), function(x){format(object.size(x), "MB")}), # in-memory size of each loaded vector
        t(sapply(list(blogs, news, twitter), stri_stats_general)),  # Lines, LinesNEmpty, Chars, CharsNWhite
        Words = sapply(list(blogs, news, twitter), stri_stats_latex)[4, ]
)
stats
##   FileName FileSize   Lines LinesNEmpty     Chars CharsNWhite    Words
## 1    blogs 255.4 Mb  899288      899288 206824382   170389539 37570839
## 2     news 257.3 Mb 1010242     1010242 203223154   169860866 34494539
## 3  twitter   319 Mb 2360148     2360148 162096241   134082806 30451170

Creating a Sample Set

Since the dataset is very large, we take a sample of it and use that sample for basic exploratory analysis as a representation of the full data set.

The sample set is 1% of the total data set.
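
The chunk that draws the sample is not echoed in this report; the sketch below shows one way a 1% sample could be taken from each source and combined into the sampleData object used later. The set.seed value and the use of floor are assumptions made for illustration, not the exact chunk that produced the counts shown.

# Assumed sketch: draw a reproducible 1% sample of each source and combine them
set.seed(1234)                                                   # hypothetical seed
blogsSample   <- sample(blogs,   floor(length(blogs)   * 0.01))
newsSample    <- sample(news,    floor(length(news)    * 0.01))
twitterSample <- sample(twitter, floor(length(twitter) * 0.01))
sampleData    <- c(blogsSample, newsSample, twitterSample)

The summary statistics of the resulting sample are shown below.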

##        FileName FileSize Lines LinesNEmpty   Chars CharsNWhite   Words
## 1   blogsSample   2.6 Mb  8992        8992 2069977     1705097  376724
## 2    newsSample   2.6 Mb 10102       10102 2023544     1691310  343883
## 3 twitterSample   3.2 Mb 23601       23601 1614115     1334624  303793
## 4    sampleData   8.3 Mb 42695       42695 5698762     4723855 1020788

Create and Clean Data Corpus

Here we create the text corpus we will use for analysis and prediction. A corpus is a representative sample of actual language production, drawn from a meaningful context and serving a general purpose.

datacorpus <- VCorpus(VectorSource(sampleData))
object_size(datacorpus)
## 100 MB
cleanedDataCorpus <- tm_map(datacorpus, content_transformer(tolower)) # Convert all to lower case
cleanedDataCorpus <- tm_map(cleanedDataCorpus, removePunctuation) # Remove punctuation marks
cleanedDataCorpus <- tm_map(cleanedDataCorpus, removeNumbers) # Remove numbers
cleanedDataCorpus <- tm_map(cleanedDataCorpus, stripWhitespace) # Collapse repeated whitespace
cleanedDataCorpus <- tm_map(cleanedDataCorpus, PlainTextDocument) # Convert all to plain text document

Create Uni-, Bi-, and Tri-Grams via Tokenization

We will create sets of single, double, and triple word sequences (unigrams, bigrams, and trigrams) to use for prediction later on.

We then compute the frequency of each tokenized unigram, bigram, and trigram.
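
The tokenization chunk is not echoed in this report; the sketch below shows one way the frequency tables could be built, using RWeka's NGramTokenizer together with tm's TermDocumentMatrix. The helper name getFreq and the use of rowSums are assumptions made for illustration.

# Assumed sketch: tokenize the cleaned corpus into n-grams and count term frequencies
getFreq <- function(corpus, n) {
        tokenizer <- function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
        tdm <- TermDocumentMatrix(corpus, control = list(tokenize = tokenizer))
        freq <- rowSums(as.matrix(tdm))                # total count of each n-gram across the sample
        data.frame(word = names(freq), frequency = freq, stringsAsFactors = FALSE)
}

uni_corpus_freq <- getFreq(cleanedDataCorpus, 1)
bi_corpus_freq  <- getFreq(cleanedDataCorpus, 2)
tri_corpus_freq <- getFreq(cleanedDataCorpus, 3)

The first few rows of each frequency table are printed below.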

##            word frequency
## “the       “the        79
## “we         “we        51
## ability ability        90
## able       able       271
## about     about      2943
## above     above       137
##                    word frequency
## – the             – the        55
## a bad             a bad        67
## a beautiful a beautiful        59
## a better       a better        82
## a big             a big       164
## a bit             a bit       228
##                              word frequency
## a bit of                 a bit of        64
## a couple of           a couple of       123
## a little bit         a little bit        67
## a lot of                 a lot of       283
## according to the according to the        79
## all of the             all of the        84

Visualizations

Here we visualize the three types of n-grams via bar plots of the most frequently appearing n-grams of each type. The bar plots below show the top 20 unigrams, bigrams, and trigrams in the cleaned sample corpus.

uni_corpus_freqDescend <- arrange(uni_corpus_freq, desc(frequency))
bi_corpus_freqDescend <- arrange(bi_corpus_freq, desc(frequency))
tri_corpus_freqDescend <- arrange(tri_corpus_freq, desc(frequency))

uniBar <- ggplot(data = uni_corpus_freqDescend[1:20,], aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity", fill = "yellow") +
        xlab("Words") +
        ylab("Frequency") +
        ggtitle(paste("Top 20 Unigrams")) +
        theme(plot.title = element_text(hjust = 0.5)) +
        theme(axis.text.x = element_text(angle = 60, hjust = 1))

biBar <- ggplot(data = bi_corpus_freqDescend[1:20,], aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity", fill = "red") +
        xlab("Words") +
        ylab("Frequency") +
        ggtitle(paste("Top 20 Bigrams")) +
        theme(plot.title = element_text(hjust = 0.5)) +
        theme(axis.text.x = element_text(angle = 60, hjust = 1))

triBar <- ggplot(data = tri_corpus_freqDescend[1:20,], aes(x = reorder(word, -frequency), y = frequency)) +
        geom_bar(stat = "identity", fill = "blue") +
        xlab("Words") +
        ylab("Frequency") +
        ggtitle(paste("Top 20 Trigrams")) +
        theme(plot.title = element_text(hjust = 0.5)) +
        theme(axis.text.x = element_text(angle = 60, hjust = 1))

uniBar

biBar

triBar
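
Since the wordcloud and RColorBrewer packages are loaded above, a word cloud of the most frequent unigrams could complement the bar plots. The call below is a minimal sketch using the uni_corpus_freqDescend data frame built earlier; the palette and word limit are arbitrary choices.

# Sketch: word cloud of the 100 most frequent unigrams in the cleaned sample corpus
wordcloud(words = uni_corpus_freqDescend$word,
          freq = uni_corpus_freqDescend$frequency,
          max.words = 100,
          random.order = FALSE,
          colors = brewer.pal(8, "Dark2"))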

Conclusion

Our tables of unigram, bigram, and trigram frequencies capture the most frequent words and phrases in the cleaned sample set. We will use these frequencies in subsequent parts of the project to build the predictive text model.