In this document, I build n-grams from the text and carry out exploratory analysis to characterize the data. I then outline my plan for the final prediction task.
The dataset was downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. This section reads and previews the data.
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
US.blog <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)
close(doc)
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.news.txt")
US.news <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines(doc, encoding = "UTF-8", skipNul = TRUE): incomplete
## final line found on 'C:/Users/yzhang/Documents/Coursera-SwiftKey/final/
## en_US/en_US.news.txt'
close(doc)
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
US.twitter <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)
close(doc)
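The "incomplete final line" warning on en_US.news.txt is a known quirk of this dataset: the file appears to contain an embedded SUB (0x1A) control character, which text-mode connections on Windows treat as end-of-file, so readLines stops early; this likely explains why the news file reports far fewer lines than the other two below. A common workaround, sketched here under that assumption, is to open the connection in binary mode (the summaries below reflect the original text-mode read):
doc <- file("C:/Users/yzhang/Documents/Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb")
US.news <- readLines(doc, encoding = "UTF-8", skipNul = TRUE)  # binary mode reads past the embedded 0x1A byte
close(doc)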
summary(US.blog)
## Length Class Mode
## 899288 character character
summary(US.news)
## Length Class Mode
## 77259 character character
summary(US.twitter)
## Length Class Mode
## 2360148 character character
Next, summarize each file: number of lines, total word count, and mean words per line, using the stringi package.
library(stringi)
# Number of lines in each file
length.Blog <- length(US.blog)
length.News <- length(US.news)
length.Twitter <- length(US.twitter)
# Word count per line
NumWords.Blog <- stri_count_words(US.blog)
NumWords.News <- stri_count_words(US.news)
NumWords.Twitter <- stri_count_words(US.twitter)
filesummary <- data.frame(
  fileName = c("Blog", "News", "Twitter"),
  LengthFile = c(length.Blog, length.News, length.Twitter),
  sum.NumWords = c(sum(NumWords.Blog), sum(NumWords.News), sum(NumWords.Twitter)),
  mean.NumWords = c(mean(NumWords.Blog), mean(NumWords.News), mean(NumWords.Twitter))
)
print(filesummary)
## fileName LengthFile sum.NumWords mean.NumWords
## 1 Blog 899288 37546239 41.75107
## 2 News 77259 2674536 34.61779
## 3 Twitter 2360148 30093413 12.75065
We will use a 1% random sample of each file to keep the exploratory analysis manageable.
set.seed(123)
SampleData <- c(sample(US.blog, length.Blog*0.01),
sample(US.news, length.News * 0.01),
sample(US.twitter, length.Twitter * 0.01))
summary(SampleData)
## Length Class Mode
## 33365 character character
Run the sample through a data-cleaning pipeline: remove punctuation, convert to lower case, stem, drop English stop words, and collapse extra whitespace.
library(NLP)
library(tm)
SampleData <- removePunctuation(SampleData)  # note: "can't" becomes "cant" here
SampleData <- tolower(SampleData)
SampleData <- stemDocument(SampleData)       # Porter stemming, e.g. "happy" -> "happi"
SampleData <- removeWords(SampleData, stopwords("english"))  # "the" is already in this list
SampleData <- stripWhitespace(SampleData)
# Because punctuation is stripped before stop-word removal, contractions such as
# "can't"/"don't"/"i'm" become "cant"/"dont"/"im" and survive the stop-word filter,
# which explains their presence in the frequency tables below.
Generate a word cloud that shows word frequencies.
library(RColorBrewer)
library(wordcloud)
wordcloud(SampleData, max.words = 100, min.freq=3, random.order = FALSE, random.color = FALSE, colors = brewer.pal(12, 'Paired'))
## Warning in tm_map.SimpleCorpus(corpus, tm::removePunctuation):
## transformation drops documents
## Warning in tm_map.SimpleCorpus(corpus, function(x) tm::removeWords(x,
## tm::stopwords())): transformation drops documents
## Warning in wordcloud(SampleData, max.words = 100, min.freq = 3,
## random.order = FALSE, : tonight could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(SampleData, max.words = 100, min.freq = 3,
## random.order = FALSE, : made could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(SampleData, max.words = 100, min.freq = 3,
## random.order = FALSE, : guy could not be fit on page. It will not be
## plotted.
The warnings above are harmless: the first two come from wordcloud's internal tm transformations, and the rest flag words that could not be fit on the page. Next, create bar plots to show the top n-gram frequencies quantitatively.
# Collapse the sample into a single token stream
SampleData <- paste0(unlist(SampleData), collapse = " ")
SampleData <- strsplit(SampleData, " ", fixed = TRUE)[[1L]]
SampleData <- SampleData[SampleData != ""]
# Build unigrams and tabulate their frequencies
Unigrams <- vapply(ngrams(SampleData, 1L), paste, "", collapse = " ")
Top20 <- sort(table(Unigrams), decreasing = TRUE)[1:20]
print(Top20)
## Unigrams
## just like get go will one can im time love day make
## 2528 2455 2442 2255 2233 2171 1965 1946 1934 1818 1780 1621
## know good now thank work want think see
## 1565 1491 1453 1448 1341 1299 1293 1283
barplot(Top20[1:10])
bigrams <- vapply(ngrams(SampleData, 2L), paste, "", collapse = " ")
Top20_bi <- sort(table(bigrams), decreasing = TRUE)[1:20]
print(Top20_bi)
## bigrams
## right now cant wait look forward look like last night
## 192 186 170 170 169
## feel like dont know thank follow im go can get
## 153 149 110 105 101
## last year im sure let know let go first time
## 97 87 86 82 81
## good morn one day make sure new york even though
## 81 81 80 80 79
barplot(Top20_bi[1:10])
trigrams <- vapply(ngrams(SampleData, 3L), paste, "", collapse = " ")
Top20_tri <- sort(table(trigrams), decreasing = TRUE)[1:20]
print(Top20_tri)
## trigrams
## cant wait see happi mother day happi new year
## 35 34 24
## let us know new york citi look forward see
## 21 18 12
## cant wait till cinco de mayo thank veri much
## 11 11 11
## cant wait get im pretti sure ive ever seen
## 10 10 10
## dont feel like dont understand whi hotel venic itali
## 9 9 9
## hunter matt hunter im look forward just got back
## 9 9 9
## make feel like matt hunter matt
## 9 9
barplot(Top20_tri[1:10])
In the prediction project, I plan to use the "Stupid Backoff" approach (Brants et al., 2007) to predict the next word. The algorithm takes the last four words typed and looks up 5-grams whose first four words match. If fewer than five candidates are found, it backs off to the last three words and searches 4-grams, and so on down the n-gram orders. Once five candidates are collected, the app scores each by its relative frequency at the highest matching order, discounted by a fixed factor for every backoff step (0.4 in Brants et al.), and the candidate with the highest score is chosen as the result.
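As a minimal sketch of this scoring scheme, the function below implements Stupid Backoff over the unigram/bigram/trigram tables built above; the final model will extend the same recursion to 4- and 5-grams. The function name stupid_backoff, the count-table names, and the backoff factor alpha = 0.4 are illustrative assumptions for this sketch, not code from this report.
# Named count tables from the tokenized sample built above.
uni.counts <- table(Unigrams)
bi.counts <- table(bigrams)
tri.counts <- table(trigrams)

# Stupid Backoff score for candidate word w given the last 1-2 context words:
# relative frequency at the highest matching n-gram order, discounted by
# alpha for each backoff step (alpha = 0.4 per Brants et al., 2007).
stupid_backoff <- function(context, w, alpha = 0.4) {
  n <- length(context)
  if (n >= 2) {
    tri <- paste(context[n - 1], context[n], w)
    if (!is.na(tri.counts[tri]))
      return(as.numeric(tri.counts[tri]) /
             as.numeric(bi.counts[paste(context[n - 1], context[n])]))
    return(alpha * stupid_backoff(context[n], w, alpha))
  }
  if (n == 1) {
    bi <- paste(context, w)
    if (!is.na(bi.counts[bi]))
      return(as.numeric(bi.counts[bi]) / as.numeric(uni.counts[context]))
    return(alpha * stupid_backoff(character(0), w, alpha))
  }
  # Base case: unigram relative frequency (0 for unseen words).
  if (is.na(uni.counts[w])) return(0)
  as.numeric(uni.counts[w]) / sum(uni.counts)
}

# Example: rank candidate continuations of "cant wait" from the sample.
candidates <- c("see", "till", "get")
sort(sapply(candidates, function(w) stupid_backoff(c("cant", "wait"), w)),
     decreasing = TRUE)
In the app, the same recursion would start from the 5-gram table and simply add two more backoff levels before reaching the trigram case shown here.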