Overview

Nowadays, people spend a great deal of time on their smartphones, and typing on a small touchscreen can be a real pain. The goal of this project is to predict the next word a user will type, given the words already entered.

The dataset is the Coursera-SwiftKey corpus.

Data download

Download the raw dataset from the Coursera website and unzip it into the current working directory.

url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
destfile <- "Coursera-SwiftKey.zip"
if (!file.exists(destfile)) {
        download.file(url, destfile = destfile, method = "curl")
}
unzip(destfile)

Since only the English corpus is analyzed, change the working directory to ./final/en_US.

setwd("final/en_US")

Explore the data

This part answers Quiz 1. For each file, get the file size in megabytes, the number of lines, the position (line number) of the longest line, and the number of words.

blogs.con <- file("en_US.blogs.txt", "r")
news.con <- file("en_US.news.txt", "r")
twitter.con <- file("en_US.twitter.txt", "r")

blogs <- readLines(blogs.con, encoding = "UTF-8", skipNul = TRUE)
news <- readLines(news.con, encoding = "UTF-8", skipNul = TRUE)
twitter <- readLines(twitter.con, encoding = "UTF-8", skipNul = TRUE)

close(blogs.con)
close(news.con)
close(twitter.con)

blogs.size <- file.size("en_US.blogs.txt") / 1024 / 1024
news.size <- file.size("en_US.news.txt") / 1024 / 1024
twitter.size <- file.size("en_US.twitter.txt") / 1024 / 1024

blogs.lines <- length(blogs)
news.lines <- length(news)
twitter.lines <- length(twitter)

# which.max returns the position (line number) of the longest line in each file
blogs.longestline <- which.max(nchar(blogs))
news.longestline <- which.max(nchar(news))
twitter.longestline <- which.max(nchar(twitter))
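
If the length (in characters) of the longest line is wanted rather than its position, max(nchar()) gives it directly; the lines below are an illustrative variant and are not used in the summary table.

# illustrative variant: length in characters of the longest line in each file
max(nchar(blogs))
max(nchar(news))
max(nchar(twitter))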

# getwords counts the whitespace-separated tokens in a character vector
getwords <- function(l) {
        a <- sapply(l, function(x) {
                y <- unlist(strsplit(x, "\\s+"))
                length(y)
        })
        sum(a)
}
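
The same count can also be obtained with the vectorised base functions strsplit() and lengths(); this is shown only as an illustrative alternative to getwords.

# illustrative alternative: vectorised word count without the sapply loop
sum(lengths(strsplit(blogs, "\\s+")))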

blogs.words <- getwords(blogs)
news.words <- getwords(news)
twitter.words <- getwords(twitter)
basis.info <- data.frame(size.MB = c(blogs.size, news.size, twitter.size),
                         num.of.lines = c(blogs.lines, news.lines, 
                                          twitter.lines),
                         longest.line.pos = c(blogs.longestline,
                                              news.longestline,
                                              twitter.longestline),
                         num.of.words = c(blogs.words, news.words,
                                          twitter.words),
                         row.names = c("blogs", "news", "twitter"))
basis.info
##          size.MB num.of.lines longest.line.pos num.of.words
## blogs   200.4242       899288           483415     37334149
## news    196.2775      1010242           123628     34372814
## twitter 159.3641      2360148               26     30373605

Preprocessing and cleaning data

To keep the computation manageable, 10,000 lines are sampled from each corpus.

# data.sample is a function to take a random sample of 10,000 lines from a corpus
data.sample <- function(x) {
        l <- length(x)
        set.seed(2017)
        x[sample(1:l, 10000)]
}

blogs.sample <- data.sample(blogs)
news.sample <- data.sample(news)
twitter.sample <- data.sample(twitter)

All later work is done on the sampled corpora. Check the object sizes of the originals, then remove them to free memory.

object.size(blogs)
object.size(news)
object.size(twitter)
rm(blogs)
rm(news)
rm(twitter)

Clean the data

# combine samples of blogs, news and twitter together
bnt.samples <- c(blogs.sample, news.sample, twitter.sample)
rm(blogs.sample)
rm(news.sample)
rm(twitter.sample)

bnt.samples <- iconv(bnt.samples, "UTF-8", "ASCII", sub = "")

# collapse all samples into a single string
bnt.samples.1 <- paste(bnt.samples, collapse = " ")

# load tm package to clean text data
library(tm)

# create a corpus
bnt.corpus <- VCorpus(VectorSource(bnt.samples.1))

# toSpace is a content transformer that replaces a pattern with a space
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# remove URLs
bnt.corpus.clean <- tm_map(bnt.corpus, toSpace, "http[s]?://[[:alnum:]_[:punct:]]+")

# remove twitter accounts
bnt.corpus.clean <- tm_map(bnt.corpus.clean, toSpace, "@[[:alnum:]_]+")

Remove profanity. A list of bad words banned by Google is downloaded from the link and saved as a text file (google-bad-words.txt) in the working directory.

profanity <- readLines("google-bad-words.txt", encoding = "UTF-8", skipNul = TRUE)
profanity.clean <- iconv(profanity, "UTF-8", "ASCII", sub = "")
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removeWords, profanity.clean)

Convert all words to lower case; remove numbers, punctuation, and extra whitespace; then convert the corpus to a plain text document.

# convert all words to lower case
bnt.corpus.clean <- tm_map(bnt.corpus.clean, content_transformer(tolower))

# remove numbers and punctuation
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removeNumbers)
bnt.corpus.clean <- tm_map(bnt.corpus.clean, removePunctuation)

# remove extra whitespaces
bnt.corpus.clean <- tm_map(bnt.corpus.clean, stripWhitespace)

# convert to plain text document
bnt.corpus.clean <- tm_map(bnt.corpus.clean, PlainTextDocument)

rm(bnt.corpus)

Exploratory analysis

Tokenize the cleaned corpus

library(RWeka)
# gram.df is a function that extracts the content from a corpus, tokenizes it
# into n-grams, and returns a data frame sorted by frequency
gram.df <- function(corpus, n) {
        corpus.1 <- data.frame(doc = unlist(sapply(corpus, '[', "content")),
                               stringsAsFactors = FALSE)
        gram <- NGramTokenizer(corpus.1, Weka_control(min = n, max = n))
        gram.df <- data.frame(table(gram))
        gram.df <- gram.df[order(gram.df$Freq, decreasing = TRUE),]
        colnames(gram.df) <- c("word", "freq")
        rownames(gram.df) <- 1:dim(gram.df)[1]
        gram.df
}

# build a list of unigram, bigram, trigram, and fourgram data frames and save it to disk
gram.dfs <- list()

for (i in 1:4) {
        gram.dfs[[i]] <- gram.df(bnt.corpus.clean, i)
}

saveRDS(gram.dfs, file = "gram_dfs_4.rds")
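
In a later session the saved list can be restored with readRDS (a minimal sketch, assuming gram_dfs_4.rds is still in the working directory).

# reload the list of n-gram data frames saved above
gram.dfs <- readRDS("gram_dfs_4.rds")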

Frequency plot

Analysis of unigram

# get the unigram data frame
unigram.df <- gram.dfs[[1]]
# freq.plot is a function that uses ggplot2 to draw a bar plot of the 20 most frequent n-grams
library(ggplot2)
freq.plot <- function(df, label) {
        ggplot(df[1:20, ], aes(x = reorder(word, freq), y = freq, fill = freq)) +
                geom_bar(stat = "identity") +
                theme_bw() +
                labs(y = "Frequency", x = "", title = label) +
                geom_text(aes(label = freq), hjust = -0.02) +
                theme(plot.title = element_text(hjust = 0.5),
                      legend.position = "none") +
                coord_flip()
}

# plot unigram
freq.plot(unigram.df, "20 most frequent words in unigram")

Analysis of bigram

# get the bigram data frame
bigram.df <- gram.dfs[[2]]
# plot bigram
freq.plot(bigram.df, "20 most frequent words in bigram")

Analysis of trigram

# get the trigram data frame
trigram.df <- gram.dfs[[3]]
# plot trigram
freq.plot(trigram.df, "20 most frequent words in trigram")

Analysis of fourgram

# get the fourgram data frame
fourgram.df <- gram.dfs[[4]]
# plot fourgram
freq.plot(fourgram.df, "20 most frequent words in fourgram")

Save the unigram, bigram, trigram, and fourgram data frames

save(unigram.df, file = "unigram_4.Rda")
save(bigram.df, file = "bigram_4.Rda")
save(trigram.df, file = "trigram_4.Rda")
save(fourgram.df, file = "fourgram_4.Rda")

Word coverage

unigram.df$cumfreq <- cumsum(unigram.df$freq)

# 50% coverage
sum(unigram.df$cumfreq < (sum(unigram.df$freq) / 2)) + 1
## [1] 138
# 90% coverage
sum(unigram.df$cumfreq < (sum(unigram.df$freq) * 0.9)) + 1
## [1] 7645

In the sampled corpus, 138 unique words cover 50% of all word occurrences, and 7,645 unique words cover 90%.
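
The same calculation generalizes to any coverage level. The helper below is only an illustrative sketch (coverage is a hypothetical function, not used elsewhere in this report); it assumes the data frame is sorted by decreasing frequency, as the output of gram.df is.

# coverage: number of top-frequency words needed to cover a fraction p of all
# word occurrences (assumes df is sorted by decreasing freq)
coverage <- function(df, p) {
        sum(cumsum(df$freq) < p * sum(df$freq)) + 1
}

coverage(unigram.df, 0.5)   # 138
coverage(unigram.df, 0.9)   # 7645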

Future plan

The next step is to use the saved unigram, bigram, trigram, and fourgram frequency tables to build a model that predicts the next word from the words a user has already typed.
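
One possible shape for such a model, sketched purely as an illustration: a simple backoff lookup over the gram.dfs list saved above (predict.next is a hypothetical function, and backoff is only one of several candidate approaches).

# predict.next: rough sketch of a backoff lookup over the saved n-gram tables;
# it falls back from fourgrams to trigrams to bigrams, and finally to the most
# frequent unigrams when no longer n-gram matches
predict.next <- function(input, gram.dfs, top = 3) {
        tokens <- unlist(strsplit(tolower(input), "\\s+"))
        for (n in 4:2) {
                if (length(tokens) < n - 1) next
                prefix <- paste(tail(tokens, n - 1), collapse = " ")
                df <- gram.dfs[[n]]
                hits <- df[startsWith(as.character(df$word), paste0(prefix, " ")), ]
                if (nrow(hits) > 0) {
                        # keep only the predicted word (the last token of each n-gram)
                        return(head(sapply(strsplit(as.character(hits$word), " "),
                                           tail, 1), top))
                }
        }
        # no match at any order: fall back to the most frequent unigrams
        head(as.character(gram.dfs[[1]]$word), top)
}

predict.next("thanks for", gram.dfs)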