Nowadays, people spend a great deal of time on their smartphones, and typing on a small screen can be a serious pain. The goal of this project is to predict the next word a user will type, based on the previously entered words.
The dataset is from Coursera-SwiftKey.
Download the raw dataset from the Coursera website and unzip it into the current working directory.
url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- "Coursera-SwiftKey.zip"
if (!file.exists(destfile)) {
download.file(url, destfile = destfile, method = "curl")
}
unzip(destfile)
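To confirm the archive was extracted correctly, the unzipped English files can be listed; this is a quick check that is not part of the original workflow, and the relative path assumes the default unzip location.
# list the English corpus files created by unzip()
list.files("final/en_US")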
Since only English is analyzed, change the working directory to ./final/en_US
setwd("final/en_US")
This part is for quiz 1. For each file, get the size in MB, the number of lines, the index of the longest line, and the total number of words.
blogs.con <- file("en_US.blogs.txt", "r")
news.con <- file("en_US.news.txt", "r")
twitter.con <- file("en_US.twitter.txt", "r")
blogs <- readLines(blogs.con, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news.con, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter.con, encoding = "UTF-8", skipNul = TRUE)
close(blogs.con)
close(news.con)
close(twitter.con)
# file sizes in MB
blogs.size <- file.size("en_US.blogs.txt") / 1024 / 1024
news.size <- file.size("en_US.news.txt") / 1024 / 1024
twitter.size <- file.size("en_US.twitter.txt") / 1024 / 1024
blogs.lines <- length(blogs)
news.lines <- length(news)
twitter.lines <- length(twitter)
# which.max returns the index (line number) of the longest line in each corpus
blogs.longestline <- which.max(sapply(blogs, nchar))
news.longestline <- which.max(sapply(news, nchar))
twitter.longestline <- which.max(sapply(twitter, nchar))
# getwords counts the total number of whitespace-separated words in a character vector
getwords <- function(l) {
    a <- sapply(l, function(x) {
        y <- unlist(strsplit(x, "\\s+"))
        length(y)
    })
    sum(a)
}
blogs.words <- getwords(blogs)
news.words <- getwords(news)
twitter.words <- getwords(twitter)
basis.info <- data.frame(size = c(blogs.size, news.size, twitter.size),
num.of.lines = c(blogs.lines, news.lines,
twitter.lines),
longest.line = c(blogs.longestline,
news.longestline,
twitter.longestline),
num.of.words = c(blogs.words, news.words,
twitter.words),
row.names = c("blogs", "news", "twitter"))
basis.info
## size num.of.lines longest.line num.of.words
## blogs 200.4242 899288 483415 37334149
## news 196.2775 1010242 123628 34372814
## twitter 159.3641 2360148 26 30373605
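Note that which.max() returns the row index of the longest line, not its length in characters. If the actual character count of the longest line is needed (e.g., for the quiz question about the longest line), max() over nchar() gives it; a minimal sketch, assuming the full corpora are still in memory:
# length in characters of the longest line in each corpus
max(nchar(blogs))
max(nchar(news))
max(nchar(twitter))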
To increase efficiency, 10,000 lines are sampled from each corpus.
# data.sample is a function to take a 10,000-line sample of a corpus
data.sample <- function(x) {
l <- length(x)
set.seed(2017)
x[sample(1:l, 10000)]
}
blogs.sample <- data.sample(blogs)
news.sample <- data.sample(news)
twitter.sample <- data.sample(twitter)
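Because set.seed(2017) is called inside data.sample(), the sampling is reproducible. A quick sanity check of the sample sizes (not part of the original workflow):
# each sample should contain exactly 10000 lines
sapply(list(blogs = blogs.sample, news = news.sample, twitter = twitter.sample), length)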
The later work will be done on the sampled corpora. Check the object sizes of the original corpora, then delete them to free memory.
object.size(blogs)
object.size(news)
object.size(twitter)
rm(blogs)
rm(news)
rm(twitter)
Clean the data
# combine samples of blogs, news and twitter together
bnt.samples <- c(blogs.sample, news.sample, twitter.sample)
rm(blogs.sample)
rm(news.sample)
rm(twitter.sample)
bnt.samples <- iconv(bnt.samples, "UTF-8", "ASCII", sub = "")
# collapse all samples into one long string
bnt.samples.1 <- paste(bnt.samples, collapse = " ")
# load tm package to clean text data
library(tm)
# create a corpus
bnt.corpus <- VCorpus(VectorSource(bnt.samples.1))
# toSpace is a function that replaces all matches of a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
# remove URLs
bnt.corpus.clean <- tm_map(bnt.corpus, toSpace, "http[s]?://[[:alnum:]_[:punct:]]+")
# remove twitter accounts
bnt.corpus.clean <- tm_map(bnt.corpus.clean, toSpace, "@[[:alnum:]_]+")
Remove profanity words. A list of bad words banned by Google was downloaded and saved as google-bad-words.txt in the working directory.
profanity <- readLines("google-bad-words.txt", encoding = "UTF-8", skipNul = TRUE)
profanity.clean <- iconv(profanity, "UTF-8", "ASCII", sub = "")
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removeWords, profanity.clean)
Convert all words to lower case; remove numbers, punctuation, and extra whitespace; then convert to a plain text document.
# convert all words to lower case
bnt.corpus.clean <- tm_map(bnt.corpus.clean, content_transformer(tolower))
# remove numbers and punctuation
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removeNumbers)
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removePunctuation)
# remove extra whitespaces
bnt.corpus.clean <- tm_map(bnt.corpus.clean, stripWhitespace)
# convert to plain text document
bnt.corpus.clean <- tm_map(bnt.corpus.clean, PlainTextDocument)
rm(bnt.corpus)
Tokenize the cleaned corpus
library(RWeka)
# gram.df is a function to extract the content from the corpus, tokenize it into
# n-grams, and return a data frame sorted by frequency
gram.df <- function(corpus, n) {
corpus.1 <- data.frame(doc = unlist(sapply(corpus, '[', "content")),
stringsAsFactors = FALSE)
gram <- NGramTokenizer(corpus.1, Weka_control(min = n, max = n))
gram.df <- data.frame(table(gram))
gram.df <- gram.df[order(gram.df$Freq, decreasing = TRUE),]
colnames(gram.df) <- c("word", "freq")
rownames(gram.df) <- 1:dim(gram.df)[1]
gram.df
}
# make a list of unigram, bigram, trigram, and fourgram data frames and save it to disk
gram.dfs <- list()
for (i in 1:4) {
gram.dfs[[i]] <- gram.df(bnt.corpus.clean, i)
}
saveRDS(gram.dfs, file = "gram_dfs_4.rds")
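The saved list can be reloaded in a later session without repeating the tokenization, for example:
# reload the n-gram tables and peek at the most frequent bigrams
gram.dfs <- readRDS("gram_dfs_4.rds")
head(gram.dfs[[2]], 3)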
Analysis of unigram
# load the unigram data frame
unigram.df <- gram.dfs[[1]]
# freq.plot uses ggplot2 to draw a bar plot of the 20 most frequent n-grams
library(ggplot2)
freq.plot <- function(df, label) {
    ggplot(df[1:20, ], aes(x = reorder(word, freq), y = freq, fill = freq)) +
        geom_bar(stat = "identity") +
        theme_bw() +
        labs(y = "Frequency", x = "", title = label) +
        geom_text(aes(label = freq), hjust = -0.02) +
        theme(plot.title = element_text(hjust = 0.5),
              legend.position = "none") +
        coord_flip()
}
# plot unigram
freq.plot(unigram.df, "20 most frequent words in unigram")
Analysis of bigram
# get the bigram data frame
bigram.df <- gram.dfs[[2]]
# plot bigram
freq.plot(bigram.df, "20 most frequent words in bigram")
Analysis of trigram
# get the trigram data frame
trigram.df <- gram.dfs[[3]]
# plot trigram
freq.plot(trigram.df, "20 most frequent words in trigram")
Analysis of fourgram
# get the fourgram data frame
fourgram.df <- gram.dfs[[4]]
# plot fourgram
freq.plot(fourgram.df, "20 most frequent words in fourgram")
save(unigram.df, file = "unigram_4.Rda")
save(bigram.df, file = "bigram_4.Rda")
save(trigram.df, file = "trigram_4.Rda")
save(fourgram.df, file = "fourgram_4.Rda")
# cumulative frequency over words sorted by decreasing frequency
unigram.df$cumfreq <- cumsum(unigram.df$freq)
# 50% coverage
sum(unigram.df$cumfreq < (sum(unigram.df$freq) / 2)) + 1
## [1] 138
# 90% coverage
sum(unigram.df$cumfreq < (sum(unigram.df$freq) * 0.9)) + 1
## [1] 7645
138 words cover 50% of all word occurrences, and 7645 words cover 90%.
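The same computation can be wrapped in a small helper for any coverage fraction; a sketch (the function name coverage is made up here), using the sorted unigram.df built above:
# number of top-frequency words needed to cover a fraction p of all word occurrences
coverage <- function(df, p) {
    cumfreq <- cumsum(df$freq)
    sum(cumfreq < p * sum(df$freq)) + 1
}
coverage(unigram.df, 0.5)
coverage(unigram.df, 0.9)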
Next plan:
Build the predictive model using the unigram, bigram, trigram, and fourgram data. The next word will be predicted from the previous three, two, or one words.
Develop the data product using the shiny package and publish it online.
To improve the model, the bigram, trigram, and fourgram tables will be used together to find the most likely next word.
The model will work better when the test data and the training data are alike, so separate models could be built from the blogs, news, and twitter corpora and applied in different situations.
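As a preview of the planned model, a minimal backoff lookup over the n-gram tables could work roughly as sketched below. This is only a sketch, not the final model: the function name predict.next.word is hypothetical, it assumes the bigram.df, trigram.df, and fourgram.df tables saved above are loaded, and it simply returns the last word of the highest-frequency n-gram whose prefix matches the input, backing off from fourgrams to trigrams to bigrams.
# predict the next word from the last one to three words of the input
predict.next.word <- function(input, bigram.df, trigram.df, fourgram.df) {
    tokens <- unlist(strsplit(tolower(input), "\\s+"))
    tokens <- tokens[tokens != ""]
    tables <- list(bigram.df, trigram.df, fourgram.df)
    # try a 3-word prefix against fourgrams, then 2 against trigrams, then 1 against bigrams
    for (n in 3:1) {
        if (length(tokens) < n) next
        prefix <- paste(tail(tokens, n), collapse = " ")
        df <- tables[[n]]
        hits <- df[grepl(paste0("^", prefix, " "), df$word), ]
        if (nrow(hits) > 0) {
            best <- as.character(hits$word[1])            # tables are sorted by frequency
            return(tail(unlist(strsplit(best, " ")), 1))  # last word of the matching n-gram
        }
    }
    NA_character_
}
# example call (output depends on the sampled corpora):
# predict.next.word("thanks for the", bigram.df, trigram.df, fourgram.df)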