In this document, I build n-grams from the text and carry out exploratory analysis to characterize the data. I then outline my plan for the final prediction task.
The dataset was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. This section reads and previews the data.
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
US.blog <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)
close(doc)
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.news.txt")
US.news <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(doc, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on 'C:/Users/yzhang/Documents/Coursera-SwiftKey/final/
## en_US/en_US.news.txt'
close(doc)
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
US.twitter <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)
close(doc)
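The "incomplete final line" warning on en_US.news.txt is a known quirk of this dataset: the file appears to contain an embedded SUB (0x1A) control character, which text-mode connections on Windows treat as end-of-file, so readLines stops early; this likely explains why the news file reports far fewer lines than the other two below. A common workaround, sketched here under that assumption, is to open the connection in binary mode (the summaries below reflect the original text-mode read):
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb")
US.news <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)  # binary mode reads past the embedded 0x1A byte
close(doc)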
summary(US.blog)
## Length Class Mode
## 899288 character character
summary(US.news)
## Length Class Mode
## 77259 character character
summary(US.twitter)
## Length Class Mode
## 2360148 character character
Next, summarize each file: number of lines, total word count, and mean words per line, using the stringi package.
library(stringi)
# Number of lines in each file
length.Blog <- length(US.blog)
length.News <- length(US.news)
length.Twitter <- length(US.twitter)
# Word count per line
NumWords.Blog <- stri_count_words(US.blog)
NumWords.News <- stri_count_words(US.news)
NumWords.Twitter <- stri_count_words(US.twitter)
filesummary <- data.frame(
  fileName = c("Blog", "News", "Twitter"),
  LengthFile = c(length.Blog, length.News, length.Twitter),
  sum.NumWords = c(sum(NumWords.Blog), sum(NumWords.News), sum(NumWords.Twitter)),
  mean.NumWords = c(mean(NumWords.Blog), mean(NumWords.News), mean(NumWords.Twitter))
)
print(filesummary)
## fileName LengthFile sum.NumWords mean.NumWords
## 1 Blog 899288 37546239 41.75107
## 2 News 77259 2674536 34.61779
## 3 Twitter 2360148 30093413 12.75065
We will use a 1% random sample of each file to keep the exploratory analysis manageable.
set.seed(123)
SampleData <- c(sample(US.blog, length.Blog*0.01),
sample(US.news, length.News * 0.01),
sample(US.twitter, length.Twitter * 0.01))
summary(SampleData)
## Length Class Mode
## 33365 character character
Run the sample through a data-cleaning pipeline: remove punctuation, convert to lower case, stem, drop English stop words, and collapse extra whitespace.
library(NLP)
library(tm)
SampleData <- removePunctuation(SampleData)  # note: "can't" becomes "cant" here
SampleData <- tolower(SampleData)
SampleData <- stemDocument(SampleData)       # Porter stemming, e.g. "happy" -> "happi"
SampleData <- removeWords(SampleData, stopwords("english"))  # "the" is already in this list
SampleData <- stripWhitespace(SampleData)
# Because punctuation is stripped before stop-word removal, contractions such as
# "can't"/"don't"/"i'm" become "cant"/"dont"/"im" and survive the stop-word filter,
# which explains their presence in the frequency tables below.
Generate a word cloud that shows word frequencies.
library(RColorBrewer)
library(wordcloud)
wordcloud(SampleData, max.words = 100, min.freq=3, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, 'Paired'))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in wordcloud(SampleData, max.words = 100, min.freq = 3,
## random.order = FALSE, : tonight could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(SampleData, max.words = 100, min.freq = 3,
## random.order = FALSE, : made could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(SampleData, max.words = 100, min.freq = 3,
## random.order = FALSE, : guy could not be fit on page. It will not be
## plotted.
The warnings above are harmless: the first two come from wordcloud's internal tm transformations, and the rest flag words that could not be fit on the page. Next, create bar plots to show the top n-gram frequencies quantitatively.
# Collapse the sample into a single token stream
SampleData <- paste0(unlist(SampleData), collapse = " ")
SampleData <- strsplit(SampleData, " ", fixed = TRUE)[[1L]]
SampleData <- SampleData[SampleData != ""]
# Build unigrams and tabulate their frequencies
Unigrams <- vapply(ngrams(SampleData, 1L), paste, "", collapse = " ")
Top20 <- sort(table(Unigrams), decreasing = TRUE)[1:20]
print(Top20)
## Unigrams
## just like get go will one can im time love day make
## 2528 2455 2442 2255 2233 2171 1965 1946 1934 1818 1780 1621
## know good now thank work want think see
## 1565 1491 1453 1448 1341 1299 1293 1283
barplot(Top20[1:10])
bigrams <- vapply(ngrams(SampleData, 2L), paste, "", collapse = " ")
Top20_bi <- sort(table(bigrams), decreasing = TRUE)[1:20]
print(Top20_bi)
## bigrams
## right now cant wait look forward look like last night
## 192 186 170 170 169
## feel like dont know thank follow im go can get
## 153 149 110 105 101
## last year im sure let know let go first time
## 97 87 86 82 81
## good morn one day make sure new york even though
## 81 81 80 80 79
barplot(Top20_bi[1:10])
trigrams <- vapply(ngrams(SampleData, 3L), paste, "", collapse = " ")
Top20_tri <- sort(table(trigrams), decreasing = TRUE)[1:20]
print(Top20_tri)
## trigrams
## cant wait see happi mother day happi new year
## 35 34 24
## let us know new york citi look forward see
## 21 18 12
## cant wait till cinco de mayo thank veri much
## 11 11 11
## cant wait get im pretti sure ive ever seen
## 10 10 10
## dont feel like dont understand whi hotel venic itali
## 9 9 9
## hunter matt hunter im look forward just got back
## 9 9 9
## make feel like matt hunter matt
## 9 9
barplot(Top20_tri[1:10])
In the prediction project, I plan to use the "Stupid Backoff" approach (Brants et al., 2007) to predict the next word. The algorithm takes the last four words typed and looks up 5-grams whose first four words match. If fewer than five candidates are found, it backs off to the last three words and searches 4-grams, and so on down the n-gram orders. Once five candidates are collected, the app scores each by its relative frequency at the highest matching order, discounted by a fixed factor for every backoff step (0.4 in Brants et al.), and the candidate with the highest score is chosen as the result.
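As a minimal sketch of this scoring scheme, the function below implements Stupid Backoff over the unigram/bigram/trigram tables built above; the final model will extend the same recursion to 4- and 5-grams. The function name stupid_backoff, the count-table names, and the backoff factor alpha = 0.4 are illustrative assumptions for this sketch, not code from this report.
# Named count tables from the tokenized sample built above.
uni.counts <- table(Unigrams)
bi.counts <- table(bigrams)
tri.counts <- table(trigrams)

# Stupid Backoff score for candidate word w given the last 1-2 context words:
# relative frequency at the highest matching n-gram order, discounted by
# alpha for each backoff step (alpha = 0.4 per Brants et al., 2007).
stupid_backoff <- function(context, w, alpha = 0.4) {
  n <- length(context)
  if (n >= 2) {
    tri <- paste(context[n - 1], context[n], w)
    if (!is.na(tri.counts[tri]))
      return(as.numeric(tri.counts[tri]) /
             as.numeric(bi.counts[paste(context[n - 1], context[n])]))
    return(alpha * stupid_backoff(context[n], w, alpha))
  }
  if (n == 1) {
    bi <- paste(context, w)
    if (!is.na(bi.counts[bi]))
      return(as.numeric(bi.counts[bi]) / as.numeric(uni.counts[context]))
    return(alpha * stupid_backoff(character(0), w, alpha))
  }
  # Base case: unigram relative frequency (0 for unseen words).
  if (is.na(uni.counts[w])) return(0)
  as.numeric(uni.counts[w]) / sum(uni.counts)
}

# Example: rank candidate continuations of "cant wait" from the sample.
candidates <- c("see", "till", "get")
sort(sapply(candidates, function(w) stupid_backoff(c("cant", "wait"), w)),
     decreasing = TRUE)
In the app, the same recursion would start from the 5-gram table and simply add two more backoff levels before reaching the trigram case shown here.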