This milestone report describes the major features of the training data, based on our exploratory data analysis, and summarizes our plans for building the predictive model.
library(downloader)
library(plyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(knitr)
library(tm)
## Loading required package: NLP
if(!file.exists("./Data")){
dir.create("./Data")
}
Url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if(!file.exists("./Data/Coursera-SwiftKey.zip")){
download.file(Url,destfile="./Data/Coursera-SwiftKey.zip",mode = "wb")
}
if(!file.exists("./Data/final")){
unzip(zipfile="./Data/Coursera-SwiftKey.zip",exdir="./Data")
}
list.files(path = "./Data/final/en_US/")
## [1] "en_US.blogs.txt" "en_US.news.txt" "en_US.news_edit.txt"
## [4] "en_US.twitter.txt" "profaneWords.txt"
library(gdata)
## gdata: Unable to locate valid perl interpreter
## gdata:
## gdata: read.xls() will be unable to read Excel XLS and XLSX files
## gdata: unless the 'perl=' argument is used to specify the location
## gdata: of a valid perl intrpreter.
## gdata:
## gdata: (To avoid display of this message in the future, please
## gdata: ensure perl is installed and available on the executable
## gdata: search path.)
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLX' (Excel 97-2004) files.
##
## gdata: Unable to load perl libaries needed by read.xls()
## gdata: to support 'XLSX' (Excel 2007+) files.
##
## gdata: Run the function 'installXLSXsupport()'
## gdata: to automatically download and install the perl
## gdata: libaries needed to support Excel XLS and XLSX formats.
##
## Attaching package: 'gdata'
## The following objects are masked from 'package:dplyr':
##
## combine, first, last
## The following object is masked from 'package:stats':
##
## nobs
## The following object is masked from 'package:utils':
##
## object.size
## The following object is masked from 'package:base':
##
## startsWith
library(stringi)
library(stringr)
library(NLP)
library(tm)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(quanteda)
## quanteda version 0.99
## Using 4 of 16 threads for parallel computing
##
## Attaching package: 'quanteda'
## The following object is masked from 'package:gdata':
##
## trim
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:NLP':
##
## ngrams
## The following object is masked from 'package:utils':
##
## View
library(slam)
library(wordcloud)
## Loading required package: RColorBrewer
setwd("./Data/final/en_US/")
blogs <- readLines("en_US.blogs.txt",skipNul = TRUE)
news <- readLines("en_US.news.txt",skipNul = TRUE)
## Warning in readLines("en_US.news.txt", skipNul = TRUE): incomplete final
## line found on 'en_US.news.txt'
twitter <- readLines("en_US.twitter.txt",skipNul = TRUE)
length(blogs)
## [1] 899288
length(news)
## [1] 77259
length(twitter)
## [1] 2360148
cat(sprintf("Blog Size = %f Mb", file.size("en_US.blogs.txt")/1024^2))
## Blog Size = NA Mb
cat(sprintf("Blog Size = %f Mb", file.size("en_US.news.txt")/1024^2))
## Blog Size = NA Mb
cat(sprintf("Blog Size = %f Mb", file.size("en_US.twitter.txt")/1024^2))
## Blog Size = NA Mb
wordsBlogs <- stri_count_words(blogs)
wordsNews <- stri_count_words(news)
wordsTwitter <- stri_count_words(twitter)
df <- data.frame(blogs=stri_stats_latex(blogs), news=stri_stats_latex(news), twitter=stri_stats_latex(twitter))
df
## blogs news twitter
## CharsWord 162473880 12479103 125607820
## CharsCmdEnvir 9 0 3006
## CharsWhite 42862192 3102820 36022648
## Words 37580560 2652293 30481647
## Cmds 3 0 960
## Envirs 0 0 0
# Build the corpus from the text itself (not the summary table df); a random
# sample of these lines may be preferable to keep the corpus manageable
data <- stri_enc_toascii (c(blogs, news, twitter))
data <- stri_replace_all_regex (data, '\032', '')
final <- Corpus (VectorSource (data))
master <- tm_map (final, content_transformer(tolower))
master <- tm_map (master, removePunctuation)
master <- tm_map (master, removeNumbers)
master <- tm_map (master, stripWhitespace)
master <- tm_map (master, removeWords, stopwords ("english"))
master <- tm_map (master, stemDocument, language = "english")
summary(stri_count_words(blogs))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 9.00 29.00 42.41 61.00 6728.00
summary(stri_count_words(news))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 19.00 32.00 34.82 46.00 1123.00
summary(stri_count_words(twitter))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.78 18.00 67.00
qplot(stri_count_words(blogs))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(stri_count_words(news))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
qplot(stri_count_words(twitter))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
This concludes our exploratory analysis. The next steps of this capstone project are to finalize our predictive algorithm and to deploy it as a Shiny app.
Our predictive algorithm will use an n-gram model with a frequency lookup, similar to the exploratory analysis above. One possible strategy is to use a trigram model to predict the next word; if no matching trigram is found, the algorithm backs off to the bigram model, and then to the unigram model if needed.
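As a rough illustration of this back-off strategy, the sketch below assumes three frequency-sorted lookup tables (trigrams, bigrams, unigrams) built from the cleaned corpus; the object names and the prefix/word/freq columns are placeholders for whatever structure the final model uses, not objects created in the analysis above.
# Hypothetical back-off lookup: each table is assumed to have a "prefix"
# column (the preceding words), a "word" column (the candidate next word)
# and a "freq" column, sorted by decreasing frequency.
predictNextWord <- function(phrase, trigrams, bigrams, unigrams) {
  tokens <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(tokens)
  # Try the trigram table first, keyed on the last two words of the phrase
  if (n >= 2) {
    hit <- trigrams[trigrams$prefix == paste(tokens[n - 1], tokens[n]), ]
    if (nrow(hit) > 0) return(hit$word[1])
  }
  # Back off to the bigram table, keyed on the last word
  if (n >= 1) {
    hit <- bigrams[bigrams$prefix == tokens[n], ]
    if (nrow(hit) > 0) return(hit$word[1])
  }
  # Fall back to the single most frequent unigram
  unigrams$word[1]
}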
The user interface of the Shiny app will consist of a text input box where the user can enter a phrase; after a short delay, the app will use our algorithm to suggest the most likely next word.
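A minimal sketch of that interface is shown below, assuming the back-off function sketched above (predictNextWord()) and its n-gram tables are already loaded in the app's environment; these names are placeholders rather than code from this report.
library(shiny)
# Minimal Shiny front end: a text box for the phrase and a text output showing
# the predicted next word. predictNextWord(), trigrams, bigrams and unigrams
# are assumed to be defined elsewhere (e.g. sourced when the app starts).
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("phrase", "Enter a phrase:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    req(input$phrase)
    predictNextWord(input$phrase, trigrams, bigrams, unigrams)
  })
}
shinyApp(ui = ui, server = server)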