Synopsis

This milestone report describes the major features of the training data, based on our exploratory data analysis, and summarizes our plans for building the predictive model.

Getting the data

library(downloader)
library(plyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(knitr)
library(tm)
## Loading required package: NLP
if(!file.exists("./Data")){
  dir.create("./Data")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("./Data/Coursera-SwiftKey.zip")){
  download.file(Url,destfile="./Data/Coursera-SwiftKey.zip",mode = "wb")
}
if(!file.exists("./Data/final")){
  unzip(zipfile="./Data/Coursera-SwiftKey.zip",exdir="./Data")
}
list.files(path = "./Data/final/en_US/")
## [1] "en_US.blogs.txt"     "en_US.news.txt"      "en_US.news_edit.txt"
## [4] "en_US.twitter.txt"   "profaneWords.txt"

Load Data Sets

library(gdata)
## gdata: Unable to locate valid perl interpreter
## gdata: 
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata: 
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
## 
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
## 
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
## 
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
## 
##     combine, first, last
## The following object is masked from 'package:stats':
## 
##     nobs
## The following object is masked from 'package:utils':
## 
##     object.size
## The following object is masked from 'package:base':
## 
##     startsWith
library(stringi)
library(stringr)
library(NLP)
library(tm)
library(ggplot2)
## 
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
## 
##     annotate
library(quanteda)
## quanteda version 0.99
## Using 16 of 4 threads for parallel computing
## 
## Attaching package: 'quanteda'
## The following object is masked from 'package:gdata':
## 
##     trim
## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
## 
##     ngrams
## The following object is masked from 'package:utils':
## 
##     View
library(slam)
library(wordcloud)
## Loading required package: RColorBrewer
setwd("./Data/final/en_US/")
blogs <- readLines("en_US.blogs.txt",skipNul = TRUE)
news <- readLines("en_US.news.txt",skipNul = TRUE)
## Warning in readLines("en_US.news.txt", skipNul = TRUE): incomplete final
## line found on 'en_US.news.txt'
twitter <- readLines("en_US.twitter.txt", skipNul = TRUE)

Summarize Data

length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
cat(sprintf("Blog Size = %f Mb", file.size("en_US.blogs.txt")/1024^2))
## Blog Size = NA Mb
cat(sprintf("Blog Size = %f Mb", file.size("en_US.news.txt")/1024^2))
## Blog Size = NA Mb
cat(sprintf("Blog Size = %f Mb", file.size("en_US.twitter.txt")/1024^2))
## Blog Size = NA Mb
wordsBlogs <- stri_count_words(blogs)
wordsNews  <- stri_count_words(news)
wordsTwitter <- stri_count_words(twitter)


df <- data.frame(blogs=stri_stats_latex(blogs), news=stri_stats_latex(news), twitter=stri_stats_latex(twitter))
df
##                   blogs     news   twitter
## CharsWord     162473880 12479103 125607820
## CharsCmdEnvir         9        0      3006
## CharsWhite     42862192  3102820  36022648
## Words          37580560  2652293  30481647
## Cmds                  3        0       960
## Envirs                0        0         0

Process Data

set.seed(1234)
# Build the corpus from the text itself (not the summary table df); a random
# sample of each source (sample sizes are arbitrary) keeps the tm processing
# below manageable
data <- c(sample(blogs, 10000), sample(news, 10000), sample(twitter, 10000))
data <- stri_enc_toascii(data)
data <- stri_replace_all_regex(data, '\032', '')
final <- Corpus(VectorSource(data))

master <- tm_map(final, content_transformer(tolower))
master <- tm_map(master, removePunctuation)
master <- tm_map(master, removeNumbers)
master <- tm_map(master, stripWhitespace)
master <- tm_map(master, removeWords, stopwords("english"))
master <- tm_map(master, stemDocument, language = "english")
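
The cleaned corpus is not inspected above, so as a quick sanity check a term-frequency word cloud could be drawn from master. This is only a sketch using the already loaded slam, wordcloud and RColorBrewer packages; the output is not shown here.

tdm <- TermDocumentMatrix(master)
termFreq <- sort(slam::row_sums(tdm), decreasing = TRUE)   # total count of each stem
wordcloud(names(termFreq), termFreq, max.words = 100,
          random.order = FALSE, colors = brewer.pal(8, "Dark2"))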

Explore Data

summary(stri_count_words(blogs))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    9.00   29.00   42.41   61.00 6728.00
summary(stri_count_words(news))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00   19.00   32.00   34.82   46.00 1123.00
summary(stri_count_words(twitter))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    1.00    7.00   12.00   12.78   18.00   67.00
qplot(stri_count_words(blogs))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(stri_count_words(news))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

qplot(stri_count_words(twitter))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
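
Since the prediction model described below relies on n-gram frequencies, a first look at the most common n-grams can be taken with quanteda. This is a sketch that assumes the cleaned, sampled text vector data created in the Process Data step; the output is not shown here.

toks <- tokens(char_tolower(data), remove_punct = TRUE, remove_numbers = TRUE)
topfeatures(dfm(toks), 10)                        # most frequent single words
topfeatures(dfm(tokens_ngrams(toks, n = 2)), 10)  # most frequent bigrams
topfeatures(dfm(tokens_ngrams(toks, n = 3)), 10)  # most frequent trigrams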

Conclusion and further planning

This concludes our exploratory analysis. The next steps of this capstone project are to build and finalize the predictive algorithm and to deploy it as a Shiny app.

Our predictive algorithm will use an n-gram model with frequency lookup, similar to the exploratory analysis above. One possible strategy is to use the trigram model to predict the next word; if no matching trigram is found, the algorithm backs off to the bigram model, and then to the unigram model if needed.
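
As a rough illustration of this back-off idea, the lookup could be structured as follows. This is only a sketch: unigramFreq, bigramFreq and trigramFreq are hypothetical named numeric vectors of n-gram counts (names are space-separated n-grams) that the final model will build from the corpus.

predictNext <- function(phrase, trigramFreq, bigramFreq, unigramFreq) {
  words <- tail(unlist(strsplit(tolower(phrase), "\\s+")), 2)

  # Try the trigram model first: match n-grams starting with the last two words
  if (length(words) == 2) {
    hits <- trigramFreq[startsWith(names(trigramFreq), paste(words[1], words[2], ""))]
    if (length(hits) > 0)
      return(tail(strsplit(names(which.max(hits)), " ")[[1]], 1))
  }

  # Back off to the bigram model: match n-grams starting with the last word
  hits <- bigramFreq[startsWith(names(bigramFreq), paste(tail(words, 1), ""))]
  if (length(hits) > 0)
    return(tail(strsplit(names(which.max(hits)), " ")[[1]], 1))

  # Fall back to the single most frequent unigram
  names(which.max(unigramFreq))
}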

The user interface of the Shiny app will consist of a text input box where the user can enter a phrase. The app will then use our algorithm to suggest the most likely next word after a short delay.
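
A minimal sketch of such an app is given below; it assumes the hypothetical predictNext() function outlined above and precomputed frequency tables (unigramFreq, bigramFreq, trigramFreq) loaded with the app.

library(shiny)

ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("nextWord")
)

server <- function(input, output) {
  output$nextWord <- renderText({
    req(input$phrase)  # wait until the user has typed something
    predictNext(input$phrase, trigramFreq, bigramFreq, unigramFreq)
  })
}

shinyApp(ui = ui, server = server)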