The goal of the capstone project is to develop predictive text models and build a Shiny app similar to the SwiftKey smart keyboard. The project starts with the basics: analyzing a large corpus of text documents to discover the structure in the data and how words are put together. It covers cleaning and analyzing text data, then building and sampling from a predictive text model.
To stay on track, the goals of this milestone report are to:
1. Demonstrate that the data have been loaded successfully.
2. Create a basic report of summary statistics about the data sets.
3. Report any interesting findings.
4. Outline a plan for creating a prediction algorithm and Shiny app, and gather feedback.
The original data set was downloaded from:
https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
# Download and unzip data
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip?accessType=DOWNLOAD"
destfile <- file.path("..", "Coursera-SwiftKey.zip")
if (!file.exists(destfile)) {   # download the zipped file only if it is not already present
  download.file(url = url, destfile = destfile, mode = "wb")
}
datafolder <- file.path("..", "final")
if (!file.exists(datafolder)) { # unzip only if the data folder does not already exist
  unzip(destfile, exdir = "..")
}
# Load the three English files: en_US.blogs.txt, en_US.news.txt, en_US.twitter.txt
datafolder  <- file.path(datafolder, "en_US")
blogfile    <- file.path(datafolder, "en_US.blogs.txt")
newsfile    <- file.path(datafolder, "en_US.news.txt")
twitterfile <- file.path(datafolder, "en_US.twitter.txt")
Blogs <- readLines(blogfile, encoding = "UTF-8", skipNul = TRUE)
# Read the news file through a binary connection so that an embedded control
# character does not truncate the read
input <- file(newsfile, open = "rb")
News <- readLines(input, encoding = "UTF-8", skipNul = TRUE)
close(input)
Twitters <- readLines(twitterfile, encoding = "UTF-8", skipNul = TRUE)
In this project, I work with the English data set, which includes en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt. The table below summarizes the object size, number of lines, and number of words of each file, using the stringi package to count words.
library(stringi)
data.frame(document = c("Blogs", "News", "Twitters"),
size = c(format(object.size(Blogs), units="MB"),
format(object.size(News), units="MB"),
format(object.size(Twitters), units="MB")),
lines = c(length(Blogs), length(News), length(Twitters)),
words = c(sum(stri_count_words(Blogs)),
sum(stri_count_words(News)),
sum(stri_count_words(Twitters))))
## document size lines words
## 1 Blogs 248.5 Mb 899288 37546246
## 2 News 249.6 Mb 1010242 34762395
## 3 Twitters 301.4 Mb 2360148 30093410
One point worth noting from the summary above: the Twitter file has the most lines but the fewest words overall (roughly 13 words per line, versus about 42 for blogs), reflecting its short-message format. Because the text files are too large to process with the available computing power, only a random 10% of the lines of each file is selected to illustrate the features of the data.
set.seed(12345)
Blogs    <- sample(Blogs,    size = round(length(Blogs) * 0.1),    replace = FALSE)
News     <- sample(News,     size = round(length(News) * 0.1),     replace = FALSE)
Twitters <- sample(Twitters, size = round(length(Twitters) * 0.1), replace = FALSE)
Samples <- c(Blogs, News, Twitters)
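Optionally (this step is not part of the analysis above, and the file name is arbitrary), the combined sample can be cached to disk so the full files do not have to be re-read and re-sampled on later runs:
# Optional: cache the combined sample for later runs (file name is arbitrary)
samplefile <- file.path("..", "en_US.sample.txt")
if (!file.exists(samplefile)) {
  writeLines(Samples, samplefile)
}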
The quanteda package is used to tokenize the sampled data. Numbers, punctuation, symbols, separators, URLs, Twitter characters, and hyphens are removed from the corpus.
suppressMessages(library(quanteda))
suppressMessages(library(doParallel))
suppressMessages(require(dplyr))
# Generic function for parallelizing any task (when possible)
parallelizeTask <- function(task, ...) {
  # Calculate the number of cores, leaving one free
  ncores <- detectCores() - 1
  # Initiate the cluster and register it as the parallel backend
  cl <- makeCluster(ncores)
  registerDoParallel(cl)
  # Run the task, then shut the cluster down and return the result
  r <- task(...)
  stopCluster(cl)
  r
}
makeTokens <- function(input) {
  # tokens() is quanteda's tokenizer (the older tokenize() has been removed);
  # the argument names below follow quanteda 1.x
  tokens(input, what = "word", remove_numbers = TRUE,
         remove_punct = TRUE, remove_symbols = TRUE,
         remove_separators = TRUE, remove_twitter = TRUE,
         remove_hyphens = TRUE, remove_url = TRUE)
}
corpus_all <- corpus(Samples)
tokens_all <- parallelizeTask(makeTokens, corpus_all)
tokens_all <- tokens_tolower(tokens_all)
# construct n-gram models
ngram1 <- tokens_ngrams(tokens_all, n = 1L, skip = 0L, concatenator = "_")
ngram2 <- tokens_ngrams(tokens_all, n = 2L, skip = 0L, concatenator = "_")
ngram3 <- tokens_ngrams(tokens_all, n = 3L, skip = 0L, concatenator = "_")
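As a quick sanity check (illustrative only; the exact tokens depend on the random sample), the first few tokens and n-grams of the first sampled document can be inspected:
# Peek at the tokenizer output for the first document
head(tokens_all[[1]], 10)
head(ngram2[[1]], 5)
head(ngram3[[1]], 5)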
library(ggplot2)
dfm1 <- dfm(ngram1)
textplot_wordcloud(dfm1, random.order=TRUE, max.words=100,
colors = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))
unigram <- topfeatures(dfm1, n = 30, decreasing = TRUE)
unigram <- data.frame(word = names(unigram), freq=unigram,
row.names = NULL)
ggplot(unigram, aes(x = reorder(word, freq), y = freq)) +
geom_bar(color = "black", fill = "blue", stat = "identity") +
labs(title = "Frequency of unigrams\n", x = "Unigrams", y = "Frequency\n") +
coord_flip()
dfm2 <- dfm(ngram2)
textplot_wordcloud(dfm2, random.order=TRUE, max.words=100,
colors = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))
bigram <- topfeatures(dfm2, n = 30, decreasing = TRUE)
bigram <- data.frame(word = names(bigram), freq=bigram,
row.names = NULL)
ggplot(bigram, aes(x = reorder(word, freq), y = freq)) +
geom_bar(color = "black", fill = "green", stat = "identity") +
labs(title = "Frequency of bigrams\n", x = "Bigrams", y = "Frequency\n") +
coord_flip()
dfm3 <- dfm(ngram3)
textplot_wordcloud(dfm3, random.order=TRUE, max.words=30,
colors = c('red', 'pink', 'green', 'purple', 'orange', 'blue'))
trigram <- topfeatures(dfm3, n = 30, decreasing = TRUE)
trigram <- data.frame(word = names(trigram), freq=trigram,
row.names = NULL)
ggplot(trigram, aes(x = reorder(word, freq), y = freq)) +
geom_bar(color = "black", fill = "orange", stat = "identity") +
labs(title = "Frequency of trigrams\n", x = "Trigrams", y = "Frequency\n") +
coord_flip()
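As a sketch of the planned prediction step (this is an assumed approach, not the final algorithm, and the object and function names bigram_freq, bigram_table, and predictNext are hypothetical), the bigram counts computed above can be turned into a lookup table that returns the most frequent continuations of a given word; analogous tables built from the trigrams and unigrams would extend this to a simple back-off model that the Shiny app can query.
# Sketch: turn the bigram document-feature matrix into a next-word lookup table
bigram_freq <- colSums(dfm2)                      # named counts: "word1_word2"
parts <- strsplit(names(bigram_freq), "_", fixed = TRUE)
bigram_table <- data.frame(first  = vapply(parts, `[`, character(1), 1),
                           second = vapply(parts, `[`, character(1), 2),
                           freq   = as.numeric(bigram_freq),
                           stringsAsFactors = FALSE)
predictNext <- function(word, table = bigram_table, n = 3) {
  # return the n most frequent words observed after `word` in the sample
  cand <- table[table$first == tolower(word), ]
  head(cand[order(-cand$freq), "second"], n)
}
predictNext("thank")   # example call; the output depends on the sampled data
Feedback on this approach, in particular on how to handle n-grams that do not appear in the sample, is welcome.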