Overview

Nowadays, people spend a great deal of time on their smartphones, and typing on a small touchscreen can be a real pain. The goal of this project is to predict the next word a user will type, given the words already entered.

The dataset is the Coursera-SwiftKey corpus.

Data download

Download the raw dataset from the Coursera website and unzip it into the current working directory.

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- "Coursera-SwiftKey.zip"
if (!file.exists(destfile)) {
        download.file(url, destfile = destfile, method = "curl")
}
unzip(destfile)

Since only the English corpus is analyzed, change the working directory to ./final/en_US.

setwd("final/en_US")

Explore the data

This part answers Quiz 1. For each file, get the file size in megabytes, the number of lines, the position (line number) of the longest line, and the number of words.

blogs.con <- file("en_US.blogs.txt", "r")
news.con <- file("en_US.news.txt", "r")
twitter.con <- file("en_US.twitter.txt", "r")

blogs <- readLines(blogs.con, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news.con, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter.con, encoding = "UTF-8", skipNul = TRUE)

close(blogs.con)
close(news.con)
close(twitter.con)

blogs.size <- file.size("en_US.blogs.txt") / 1024 / 1024
news.size <- file.size("en_US.news.txt") / 1024 / 1024
twitter.size <- file.size("en_US.twitter.txt") / 1024 / 1024

blogs.lines <- length(blogs)
news.lines <- length(news)
twitter.lines <- length(twitter)

# which.max returns the position (line number) of the longest line in each file
blogs.longestline <- which.max(nchar(blogs))
news.longestline <- which.max(nchar(news))
twitter.longestline <- which.max(nchar(twitter))
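
If the length (in characters) of the longest line is wanted rather than its position, max(nchar()) gives it directly; the lines below are an illustrative variant and are not used in the summary table.

# illustrative variant: length in characters of the longest line in each file
max(nchar(blogs))
max(nchar(news))
max(nchar(twitter))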

# getwords counts the whitespace-separated tokens in a character vector
getwords <- function(l) {
        a <- sapply(l, function(x) {
                y <- unlist(strsplit(x, "\\s+"))
                length(y)
        })
        sum(a)
}
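
The same count can also be obtained with the vectorised base functions strsplit() and lengths(); this is shown only as an illustrative alternative to getwords.

# illustrative alternative: vectorised word count without the sapply loop
sum(lengths(strsplit(blogs, "\\s+")))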

blogs.words <- getwords(blogs)
news.words <- getwords(news)
twitter.words <- getwords(twitter)
basis.info <- data.frame(size.MB = c(blogs.size, news.size, twitter.size),
                         num.of.lines = c(blogs.lines, news.lines, 
                                          twitter.lines),
                         longest.line.pos = c(blogs.longestline,
                                              news.longestline,
                                              twitter.longestline),
                         num.of.words = c(blogs.words, news.words,
                                          twitter.words),
                         row.names = c("blogs", "news", "twitter"))
basis.info
##          size.MB num.of.lines longest.line.pos num.of.words
## blogs   200.4242       899288           483415     37334149
## news    196.2775      1010242           123628     34372814
## twitter 159.3641      2360148               26     30373605

Preprocessing and cleaning data

To keep the computation manageable, 10,000 lines are sampled from each corpus.

# data.sample is a function to take a random sample of 10,000 lines from a corpus
data.sample <- function(x) {
        l <- length(x)
        set.seed(2017)
        x[sample(1:l, 10000)]
}

blogs.sample <- data.sample(blogs)
news.sample <- data.sample(news)
twitter.sample <- data.sample(twitter)

All later work is done on the sampled corpora. Check the object sizes of the originals, then remove them to free memory.

object.size(blogs)
object.size(news)
object.size(twitter)
rm(blogs)
rm(news)
rm(twitter)

Clean the data

# combine samples of blogs, news and twitter together
bnt.samples <- c(blogs.sample, news.sample, twitter.sample)
rm(blogs.sample)
rm(news.sample)
rm(twitter.sample)

bnt.samples <- iconv(bnt.samples, "UTF-8", "ASCII", sub = "")

# collapse all samples into a single string
bnt.samples.1 <- paste(bnt.samples, collapse = " ")

# load tm package to clean text data
library(tm)

# create a corpus
bnt.corpus <- VCorpus(VectorSource(bnt.samples.1))

# toSpace is a content transformer that replaces a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# remove URLs
bnt.corpus.clean <- tm_map(bnt.corpus, toSpace, "http[s]?://[[:alnum:]_[:punct:]]+")

# remove twitter accounts
bnt.corpus.clean <- tm_map(bnt.corpus.clean, toSpace, "@[[:alnum:]_]+")

Remove profanity. A list of bad words banned by Google is downloaded from the link and saved as a text file (google-bad-words.txt) in the working directory.

profanity <- readLines("google-bad-words.txt", encoding = "UTF-8", skipNul = TRUE)
profanity.clean <- iconv(profanity, "UTF-8", "ASCII", sub = "")
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removeWords, profanity.clean)

Convert all words to lower case; remove numbers, punctuation, and extra whitespace; then convert the corpus to a plain text document.

# convert all words to lower case
bnt.corpus.clean <- tm_map(bnt.corpus.clean, content_transformer(tolower))

# remove numbers and punctuation
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removeNumbers)
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removePunctuation)

# remove extra whitespaces
bnt.corpus.clean <- tm_map(bnt.corpus.clean, stripWhitespace)

# convert to plain text document
bnt.corpus.clean <- tm_map(bnt.corpus.clean, PlainTextDocument)

rm(bnt.corpus)

Exploratory analysis

Tokenize the cleaned corpus

library(RWeka)
# gram.df is a function that extracts the content from a corpus, tokenizes it
# into n-grams, and returns a data frame sorted by frequency
gram.df <- function(corpus, n) {
        corpus.1 <- data.frame(doc = unlist(sapply(corpus, '[', "content")),
                               stringsAsFactors = FALSE)
        gram <- NGramTokenizer(corpus.1, Weka_control(min = n, max = n))
        gram.df <- data.frame(table(gram))
        gram.df <- gram.df[order(gram.df$Freq, decreasing = TRUE),]
        colnames(gram.df) <- c("word", "freq")
        rownames(gram.df) <- 1:dim(gram.df)[1]
        gram.df
}

# build a list of unigram, bigram, trigram, and fourgram data frames and save it to disk
gram.dfs <- list()

for (i in 1:4) {
        gram.dfs[[i]] <- gram.df(bnt.corpus.clean, i)
}

saveRDS(gram.dfs, file = "gram_dfs_4.rds")
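
In a later session the saved list can be restored with readRDS (a minimal sketch, assuming gram_dfs_4.rds is still in the working directory).

# reload the list of n-gram data frames saved above
gram.dfs <- readRDS("gram_dfs_4.rds")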

Frequency plot

Analysis of unigram

# get the unigram data frame
unigram.df <- gram.dfs[[1]]
# freq.plot is a function that uses ggplot2 to draw a bar plot of the 20 most frequent n-grams
library(ggplot2)
freq.plot <- function(df, label) {
        ggplot(df[1:20, ], aes(x = reorder(word, freq), y = freq, fill = freq)) +
                geom_bar(stat = "identity") +
                theme_bw() +
                labs(y = "Frequency", x = "", title = label) +
                geom_text(aes(label = freq), hjust = -0.02) +
                theme(plot.title = element_text(hjust = 0.5),
                      legend.position = "none") +
                coord_flip()
}

# plot unigram
freq.plot(unigram.df, "20 most frequent words in unigram")

Analysis of bigram

# get the bigram data frame
bigram.df <- gram.dfs[[2]]
# plot bigram
freq.plot(bigram.df, "20 most frequent words in bigram")

Analysis of trigram

# get the trigram data frame
trigram.df <- gram.dfs[[3]]
# plot trigram
freq.plot(trigram.df, "20 most frequent words in trigram")

Analysis of fourgram

# get the fourgram data frame
fourgram.df <- gram.dfs[[4]]
# plot fourgram
freq.plot(fourgram.df, "20 most frequent words in fourgram")

Save the unigram, bigram, trigram, and fourgram data frames

save(unigram.df, file = "unigram_4.Rda")
save(bigram.df, file = "bigram_4.Rda")
save(trigram.df, file = "trigram_4.Rda")
save(fourgram.df, file = "fourgram_4.Rda")

Word coverage

unigram.df$cumfreq <- cumsum(unigram.df$freq)

# 50% coverage
sum(unigram.df$cumfreq < (sum(unigram.df$freq) / 2)) + 1
## [1] 138
# 90% coverage
sum(unigram.df$cumfreq < (sum(unigram.df$freq) * 0.9)) + 1
## [1] 7645

In the sampled corpus, 138 unique words cover 50% of all word occurrences, and 7,645 unique words cover 90%.
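
The same calculation generalizes to any coverage level. The helper below is only an illustrative sketch (coverage is a hypothetical function, not used elsewhere in this report); it assumes the data frame is sorted by decreasing frequency, as the output of gram.df is.

# coverage: number of top-frequency words needed to cover a fraction p of all
# word occurrences (assumes df is sorted by decreasing freq)
coverage <- function(df, p) {
        sum(cumsum(df$freq) < p * sum(df$freq)) + 1
}

coverage(unigram.df, 0.5)   # 138
coverage(unigram.df, 0.9)   # 7645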

Future plan

The next step is to use the saved unigram, bigram, trigram, and fourgram frequency tables to build a model that predicts the next word from the words a user has already typed.
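
One possible shape for such a model, sketched purely as an illustration: a simple backoff lookup over the gram.dfs list saved above (predict.next is a hypothetical function, and backoff is only one of several candidate approaches).

# predict.next: rough sketch of a backoff lookup over the saved n-gram tables;
# it falls back from fourgrams to trigrams to bigrams, and finally to the most
# frequent unigrams when no longer n-gram matches
predict.next <- function(input, gram.dfs, top = 3) {
        tokens <- unlist(strsplit(tolower(input), "\\s+"))
        for (n in 4:2) {
                if (length(tokens) < n - 1) next
                prefix <- paste(tail(tokens, n - 1), collapse = " ")
                df <- gram.dfs[[n]]
                hits <- df[startsWith(as.character(df$word), paste0(prefix, " ")), ]
                if (nrow(hits) > 0) {
                        # keep only the predicted word (the last token of each n-gram)
                        return(head(sapply(strsplit(as.character(hits$word), " "),
                                           tail, 1), top))
                }
        }
        # no match at any order: fall back to the most frequent unigrams
        head(as.character(gram.dfs[[1]]$word), top)
}

predict.next("thanks for", gram.dfs)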