This is the Milestone Report for the Coursera Data Science Capstone project. The goal of this project is just to display that you’ve gotten used to working with the data and that you are on track to create your prediction algorithm. This document is concise: it explains the major features of the data I have identified and briefly summarizes my plans for creating the prediction algorithm.
# Fetch the SwiftKey corpus archive if it is not already in the working
# directory, then unpack it if the extracted English data is missing.
zip_path <- "Coursera-SwiftKey.zip"
if (!file.exists(zip_path)) {
  # download.file() has no default destination, so destfile must be given;
  # mode = "wb" keeps the zip archive byte-exact on every platform.
  download.file(
    "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
    destfile = zip_path,
    mode = "wb"
  )
}
# Checked separately from the download so that an archive which was fetched
# but never extracted still gets unpacked.
if (!dir.exists("final/en_US")) {
  unzip(zip_path)
}
# Read every line of one corpus file, marking the strings as UTF-8 and
# skipping embedded NUL bytes.
#
# path: path to a text file.
# Returns a character vector with one element per line.
read_corpus <- function(path) {
  # All three files are opened in binary mode so they are read the same way;
  # readLines() accepts LF, CRLF or CR line endings in any connection mode.
  # NOTE(review): binary mode presumably also avoids platform-specific
  # text-mode handling of embedded control characters - confirm on Windows.
  con <- file(path, open = "rb")
  # Close the connection even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogs <- read_corpus("final/en_US/en_US.blogs.txt")
news <- read_corpus("final/en_US/en_US.news.txt")
twitter <- read_corpus("final/en_US/en_US.twitter.txt")
I examined the data sets and summarized my findings (file sizes, line counts, word counts, and mean words per line) below.
library(stringi)
# File sizes on disk, converted from bytes to megabytes (2^20 bytes each)
bytes.per.mb <- 2 ^ 20
blogs.size <- file.size("final/en_US/en_US.blogs.txt") / bytes.per.mb
news.size <- file.size("final/en_US/en_US.news.txt") / bytes.per.mb
twitter.size <- file.size("final/en_US/en_US.twitter.txt") / bytes.per.mb

# Word count of every individual line in each data set
blogs.words <- stri_count_words(blogs)
news.words <- stri_count_words(news)
twitter.words <- stri_count_words(twitter)

# One row per data set: size, number of lines, total words, mean words per line
line.counts <- c(length(blogs), length(news), length(twitter))
word.totals <- c(sum(blogs.words), sum(news.words), sum(twitter.words))
word.means <- c(mean(blogs.words), mean(news.words), mean(twitter.words))
data.frame(
  datasets = c("blogs", "news", "twitter"),
  size.MB = c(blogs.size, news.size, twitter.size),
  num.lines = line.counts,
  num.words = word.totals,
  mean.words.per.line = word.means
)
## datasets size.MB num.lines num.words mean.words.per.line
## 1 blogs 200.4242 899288 37546246 41.75108
## 2 news 196.2775 1010242 34762395 34.40997
## 3 twitter 159.3641 2360148 30093410 12.75065
Since the data sets are quite large, I randomly sample 1% of each data set to demonstrate the data cleaning and exploratory analysis.
# Fix the RNG seed so the same lines are drawn on every run.
set.seed(12345)

# Draw 1% of the lines of one data set (sample() draws without replacement
# by default).
take_one_percent <- function(lines) sample(lines, length(lines) * 0.01)

# The draws happen in the order blogs, news, twitter.
sampleblogs <- take_one_percent(blogs)
samplenews <- take_one_percent(news)
sampletwitter <- take_one_percent(twitter)

# Pool the three samples into a single character vector.
data.sample <- c(sampleblogs, samplenews, sampletwitter)
Before performing exploratory analysis, I clean the data first. This involves removing URLs, special characters, punctuation, numbers, excess whitespace, and stopwords, and changing the text to lower case.
library(tm)
# Positions of sampled lines that contain a non-ASCII byte: iconv() returns NA
# for any element it cannot convert to the target encoding.
toremove <- which(is.na(iconv(data.sample, "latin1", "ASCII")))
# Guard the removal: x[-integer(0)] selects nothing, so without this check an
# all-ASCII sample would be emptied instead of left untouched.
if (length(toremove) > 0) {
  data.sample <- data.sample[-toremove]
}

# VCorpus keeps every line as a text document. The default Corpus() may build
# a SimpleCorpus, for which custom tokenizers passed to DocumentTermMatrix()
# are not honoured.
corpus <- VCorpus(VectorSource(data.sample))

# Replace every match of `pattern` with a single space.
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# The patterns use POSIX classes only: with gsub()'s default engine, "\\s" or
# "\\p{L}" inside a bracket expression is not read as a character class.
# URLs: scheme plus everything up to the next whitespace (non-greedy by
# construction, so text after the URL is kept).
corpus <- tm_map(corpus, toSpace, "(f|ht)tps?://[^[:space:]]+")
# Twitter-style @mentions.
corpus <- tm_map(corpus, toSpace, "@[^[:space:]]+")
# Anything that is not a letter, whitespace or an apostrophe. Apostrophes are
# kept for now so contractions still match the stopword list below.
corpus <- tm_map(corpus, toSpace, "[^[:alpha:][:space:]']+")

# content_transformer() keeps the documents as text documents, so no
# re-wrapping step is needed afterwards.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, removeWords, stopwords("en"))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, stripWhitespace)
Next, I perform exploratory analysis on the data. It would be interesting and helpful to find the most frequently occurring words in the data. Here I list the most common unigrams, bigrams, and trigrams.
library(RWeka)
# Set the mc.cores option to 1.
# NOTE(review): presumably this keeps the Java-backed RWeka tokenizer out of
# forked worker processes - confirm.
options(mc.cores = 1)

# Build a tokenizer that splits text into n-grams of exactly `n` words.
ngram_tokenizer <- function(n) {
  force(n)
  function(x) NGramTokenizer(x, Weka_control(min = n, max = n))
}
Tokenizer <- ngram_tokenizer(1)
BigramTokenizer <- ngram_tokenizer(2)
TrigramTokenizer <- ngram_tokenizer(3)

# One document-term matrix per n-gram order, all built from the same corpus.
unigram <- DocumentTermMatrix(corpus, control = list(tokenize = Tokenizer))
bigram <- DocumentTermMatrix(corpus, control = list(tokenize = BigramTokenizer))
trigram <- DocumentTermMatrix(corpus, control = list(tokenize = TrigramTokenizer))
The document-term matrix is quite sparse (that is, mostly empty), so it is stored internally in a much more compact representation. For a very large corpus, the size of the dense matrix can exceed R’s calculation limits. This manifests itself as an integer overflow error with a message like: "Error in vector(typeof(x$v), nr * nc) : vector size cannot be NA In addition: Warning message: In nr * nc : NAs produced by integer overflow". If this occurs, consider removing sparse terms from the document-term matrix. Removing Sparse Terms: We are often not interested in infrequent terms in documents.
# Drop the sparsest unigrams (sparsity threshold 0.999) before densifying.
unigram <- removeSparseTerms(unigram, 0.999)

# Total count of each remaining term across all documents, largest first.
unigram_totals <- colSums(as.matrix(unigram))
tm_unifreq <- sort(unigram_totals, decreasing = TRUE)

# Term/frequency table; show the five most frequent unigrams.
tm_uniwordfreq <- data.frame(word = names(tm_unifreq), freq = tm_unifreq)
head(tm_uniwordfreq, n = 5)
## word freq
## will will 4976
## just just 4914
## said said 4807
## one one 4271
## like like 4107
# Drop the sparsest bigrams (sparsity threshold 0.999) before densifying.
bigram <- removeSparseTerms(bigram, 0.999)

# Total count of each remaining bigram across all documents, largest first.
bigram_totals <- colSums(as.matrix(bigram))
tm_bifreq <- sort(bigram_totals, decreasing = TRUE)

# Term/frequency table; show the five most frequent bigrams.
tm_biwordfreq <- data.frame(word = names(tm_bifreq), freq = tm_bifreq)
head(tm_biwordfreq, n = 5)
## word freq
## right now right now 453
## last year last year 283
## last night last night 279
## new york new york 274
## high school high school 223
# Drop the sparsest trigrams; the threshold here is 0.9999 rather than the
# 0.999 used for unigrams and bigrams.
trigram <- removeSparseTerms(trigram, 0.9999)

# Total count of each remaining trigram across all documents, largest first.
trigram_totals <- colSums(as.matrix(trigram))
tm_trifreq <- sort(trigram_totals, decreasing = TRUE)

# Term/frequency table; show the five most frequent trigrams.
tm_triwordfreq <- data.frame(word = names(tm_trifreq), freq = tm_trifreq)
head(tm_triwordfreq, n = 5)
## word freq
## happy mothers day happy mothers day 63
## let us know let us know 56
## new york city new york city 38
## happy new year happy new year 31
## president barack obama president barack obama 29
In the diagrams below, you can explore the n-grams by frequency:
library(ggplot2)
# Bar chart of term frequencies for the first (up to) 20 rows of `data`.
#
# data:  data frame with a `word` column and a numeric `freq` column.
# label: title shown on the x axis.
# Returns the ggplot object; bars are ordered by descending `freq`.
makePlot <- function(data, label) {
  # head() stops at the last available row, whereas data[1:20, ] pads inputs
  # shorter than 20 rows with all-NA rows that would end up in the plot.
  top_terms <- head(data, 20)
  ggplot(top_terms, aes(reorder(word, -freq), freq)) +
    labs(x = label, y = "Frequency") +
    # Slant the term labels so neighbouring n-grams do not overlap.
    theme(axis.text.x = element_text(angle = 60, size = 12, hjust = 1)) +
    geom_bar(stat = "identity", fill = I("grey50"))
}
# ggplot2 only needs to be attached once per session, not before every plot.
library(ggplot2)

# One bar chart per n-gram order. The calls stay at top level so that each
# returned plot is printed automatically.
makePlot(tm_uniwordfreq, "20 Most Common Unigram")
makePlot(tm_biwordfreq, "20 Most Common Bigram")
makePlot(tm_triwordfreq, "20 Most Common Trigram")
library(wordcloud)

# Six-colour "Dark2" palette shared by all three clouds.
cloud_palette <- brewer.pal(6, "Dark2")

# Draw one word cloud (at most 50 terms) per n-gram order: unigrams, bigrams,
# then trigrams. The seed is reset before every cloud, so each one starts from
# the same RNG state.
for (ngram_freq in list(tm_unifreq, tm_bifreq, tm_trifreq)) {
  set.seed(123)
  wordcloud(names(ngram_freq), ngram_freq, max.words = 50, scale = c(5, .1),
            colors = cloud_palette)
}
This concludes the exploratory analysis. The next steps of this capstone project are to finalize the predictive algorithm and deploy it as a Shiny app.
The predictive algorithm will use an n-gram model with frequency lookup, similar to the exploratory analysis above. One possible strategy would be to use the trigram model to predict the next word. If no matching trigram can be found, then the algorithm would back off to the bigram model, and then to the unigram model if needed.