The overall objective of the capstone project is to build a predictive model that returns the word most likely to appear next in a sequence of words within a sentence. The objective of this week's report is to show that we have become familiar with the data and are on track to create the prediction algorithm.
The data is made available at https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
blogs <- readLines("final/en_US/en_US.blogs.txt", encoding = "UTF-8", skipNul = TRUE)
news <- readLines("final/en_US/en_US.news.txt", encoding = "UTF-8", skipNul = TRUE)
## Warning in readLines("final/en_US/en_US.news.txt", encoding =
## "UTF-8", skipNul = TRUE): incomplete final line found on 'final/en_US/
## en_US.news.txt'
twitter <- readLines("final/en_US/en_US.twitter.txt", encoding = "UTF-8", skipNul = TRUE)
We now summarize each file: its size in MB, number of lines, total word count, mean words per line, and maximum number of characters per line:
fsize_blogs <- file.info("final/en_US/en_US.blogs.txt")$size / 1024^2 # file size in MB
fsize_news <- file.info("final/en_US/en_US.news.txt")$size / 1024^2
fsize_twitter <- file.info("final/en_US/en_US.twitter.txt")$size / 1024^2
line_cnt_blogs <- length(blogs) # number of lines in each file
line_cnt_news <- length(news)
line_cnt_twitter <- length(twitter)
word_cnt_blogs <- sum(sapply(gregexpr("\\W+", blogs), length)) + 1 # approximate word count per file (splits on non-word characters)
word_cnt_news <- sum(sapply(gregexpr("\\W+", news), length)) + 1
word_cnt_twitter <- sum(sapply(gregexpr("\\W+", twitter), length)) + 1
longest_blogs <- max(nchar(blogs)) # maximum characters per line
longest_news <- max(nchar(news))
longest_twitter <- max(nchar(twitter))
knitr::kable(data.frame(
  DataSources = c("Blogs", "News", "Twitter"),
  file.size.MB = format(c(fsize_blogs, fsize_news, fsize_twitter), digits = 5),
  total.Lines = format(c(line_cnt_blogs, line_cnt_news, line_cnt_twitter), big.mark = ","),
  total.Word.count = format(c(word_cnt_blogs, word_cnt_news, word_cnt_twitter), big.mark = ","),
  avg.words.per.line = format(c(word_cnt_blogs / line_cnt_blogs,
                                word_cnt_news / line_cnt_news,
                                word_cnt_twitter / line_cnt_twitter),
                              digits = 5),
  max.char.per.line = format(c(longest_blogs, longest_news, longest_twitter), big.mark = ",")))
| DataSources | file.size.MB | total.Lines | total.Word.count | avg.words.per.line | max.char.per.line |
|---|---|---|---|---|---|
| Blogs | NA | 899,288 | 38,222,279 | 42.503 | 40,833 |
| News | NA | 77,259 | 2,748,071 | 35.570 | 5,760 |
| Twitter | NA | 2,360,148 | 30,433,285 | 12.895 | 140 |
library(textmineR)
## Warning: package 'textmineR' was built under R version 3.3.3
## Loading required package: Matrix
# Convert to ASCII, replacing non-convertible characters with byte codes.
blogs <- iconv(blogs, "UTF-8", "ASCII", "byte")
news <- iconv(news, "UTF-8", "ASCII", "byte")
twitter <- iconv(twitter, "UTF-8", "ASCII", "byte")
# Sample 1% of each source so the corpus stays manageable in memory.
set.seed(1234)
data.sample <- c(sample(blogs, length(blogs) * 0.01),
                 sample(news, length(news) * 0.01),
                 sample(twitter, length(twitter) * 0.01))
rm(blogs)
rm(news)
rm(twitter)
library(tm)
## Warning: package 'tm' was built under R version 3.3.3
## Loading required package: NLP
## Warning: package 'NLP' was built under R version 3.3.3
library(NLP)
library(RColorBrewer)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.3.3
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
corpus <- VCorpus(VectorSource(data.sample))
corpus <- tm_map(corpus, content_transformer(tolower)) # content_transformer keeps each document a valid PlainTextDocument
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.3.3
wordcloud(corpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))
tcorpus <- tm_map(corpus, removeWords, stopwords("english")) # remove English stop words for comparison
wordcloud(tcorpus, max.words=100, random.order=FALSE, colors=brewer.pal(8,"Dark2"))
Next, we list the most frequently occurring terms in the dataset, using unigrams, bigrams, and trigrams:
library(ngram)
library(RWeka)
options(mc.cores = 1) # run single-threaded to avoid parallel issues with RWeka
unigram <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# Extract term frequencies from a term-document matrix, sorted in decreasing order.
getFreq <- function(tdm) {
  freq <- sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
  return(data.frame(word = names(freq), freq = freq))
}
# Bar chart of the 50 most frequent terms.
makePlot <- function(data, label) {
  ggplot(data[1:50, ], aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("grey50"))
}
We plot the 50 most frequent terms of each n-gram order as bar charts:
freq1 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = unigram)), 0.99))
makePlot(freq1, "50 Most Common unigrams")
freq2 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = bigram)), 0.999))
makePlot(freq2, "50 Most Common bigrams")
freq3 <- getFreq(removeSparseTerms(TermDocumentMatrix(corpus, control = list(tokenize = trigram)), 0.9999))
makePlot(freq3, "50 Most Common trigrams")
The next steps of the capstone project are to fine-tune the predictive algorithm and, finally, to deploy the Shiny app. The algorithm will use n-gram models, similar to the methods above, to predict the next word in a sequence. We will then compare results with stop words included and excluded.
For example, when using the trigram model to predict the next word, if no matching trigram is found, the predictive algorithm will back off to the bigram model; if a match is still not found, it will back off to the unigram model.
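The following is a minimal sketch of this back-off idea (not the final algorithm), assuming the unigram, bigram, and trigram frequency tables freq1, freq2, and freq3 built above, each with columns word and freq sorted by decreasing frequency; the helper predict_next() is a hypothetical name used only for illustration.
# Hypothetical back-off predictor: try trigrams, then bigrams, then unigrams.
predict_next <- function(input, freq3, freq2, freq1) {
  tokens <- tolower(unlist(strsplit(input, "\\s+")))
  n <- length(tokens)
  # Trigram step: look for trigrams that start with the last two input words.
  if (n >= 2) {
    prefix <- paste(tokens[n - 1], tokens[n])
    hits <- freq3[grepl(paste0("^", prefix, " "), freq3$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  # Bigram step: back off to bigrams that start with the last input word.
  if (n >= 1) {
    hits <- freq2[grepl(paste0("^", tokens[n], " "), freq2$word), ]
    if (nrow(hits) > 0) return(sub(".* ", "", as.character(hits$word[1])))
  }
  # Unigram step: fall back to the single most frequent word overall.
  as.character(freq1$word[1])
}
predict_next("thanks for the", freq3, freq2, freq1)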
For the Shiny app, the user interface will provide a text input box where the user can enter a phrase; the app will then use the predictive algorithm to return the word most likely to occur next in the sequence.
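Below is a rough sketch of what such an app could look like, assuming a predict_next() function like the hypothetical one sketched above and the frequency tables from this report are available to the app; widget names such as inputText are placeholders rather than the final design.
library(shiny)
ui <- fluidPage(
  titlePanel("Next Word Prediction"),
  textInput("inputText", "Enter a phrase:"),
  h4("Predicted next word:"),
  textOutput("prediction")
)
server <- function(input, output) {
  output$prediction <- renderText({
    if (nchar(trimws(input$inputText)) == 0) return("") # nothing typed yet
    predict_next(input$inputText, freq3, freq2, freq1)
  })
}
shinyApp(ui = ui, server = server)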