Introduction

The goal of this document is to describe the exploratory analysis of the text data and the eventual goals of creating a prediction algorithm and building a Shiny app. It explains the major features of the data I have identified and briefly summarizes my plans for the prediction algorithm and Shiny app in a way that should be understandable to a non-data-scientist manager.

Getting and cleaning the data

If the data is not already on disk, we download and unzip it. We also create the samples directory where the sampled files will be stored.

suppressWarnings(suppressPackageStartupMessages(library(R.utils)))
suppressWarnings(suppressPackageStartupMessages(library(tm)))
suppressWarnings(suppressPackageStartupMessages(library(doParallel)))
suppressWarnings(suppressPackageStartupMessages(library(SnowballC)))
suppressWarnings(suppressPackageStartupMessages(library(RWeka)))
suppressWarnings(suppressPackageStartupMessages(library(wordcloud)))
suppressWarnings(suppressPackageStartupMessages(library(slam)))
suppressWarnings(suppressPackageStartupMessages(library(Rgraphviz)))
suppressWarnings(suppressPackageStartupMessages(library(ggplot2)))
suppressWarnings(suppressPackageStartupMessages(require(RColorBrewer)))

# seed to use for random calculations
set.seed(190316)

if (!dir.exists("final")) {
  dataset.url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
  if (!file.exists("Coursera-SwiftKey.zip")) { 
    download.file(dataset.url, destfile = "Coursera-SwiftKey.zip", method = "curl")
  }
  unzip("Coursera-SwiftKey.zip")
}

if (!dir.exists("samples")) {
  dir.create("samples")
}

The text files are quite big, from 159.4 Mb for the smallest to 200.4 Mb for the largest, so it is neither recommended nor efficient to keep them all in memory. I have therefore defined a function that reads a file line by line, draws a random 0/1 sample vector with one entry per line of the file, and writes a line to a new sample text file only when its entry equals 1. The expected sample size is proportional to the probability p that a line is selected; I have set p = 0.3, so the sample files will be about 30% of the size of the originals. Although reading large chunks of text at a time is, generally speaking, faster than reading line by line in a for loop, I have found that reading, sampling and writing one line at a time is faster and makes better use of memory than reading the whole file into memory, sampling it and finally writing the sample file (a sketch of that whole-file alternative follows the function below, for comparison).

createSample <- function(path, fileName, p) {
  # Calculate the number of lines efficiently
  readFilePath <- paste(path, fileName, sep="/")
  nLines <- countLines(readFilePath)
  # Prepare write connection to file
  sampleFileName <- paste("sample", fileName, sep="_")
  sampleFilePath <- paste("samples", sampleFileName, sep="/")
  conWrite <- file(sampleFilePath, "w")
  # Prepare read connection
  conRead <- file(readFilePath, "r")
  # The expected number of 1s is nLines * p, so the sample size is not fixed
  samples <- rbinom(nLines, size=1, prob=p)
  for (i in 1:nLines) {
    t <- readLines(conRead, 1)
    if (samples[i] == 1) {
      writeLines(t, conWrite)
    }
  }
  close(conRead)
  close(conWrite)
}
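For comparison, the whole-file approach mentioned above would look roughly like this (a sketch only; createSampleInMemory is a hypothetical helper and is not used in the rest of this report):

# Whole-file alternative (sketch): simpler, but holds the entire file in memory
createSampleInMemory <- function(path, fileName, p) {
  lines <- readLines(paste(path, fileName, sep = "/"), skipNul = TRUE)
  # keep each line with probability p
  keep <- rbinom(length(lines), size = 1, prob = p) == 1
  writeLines(lines[keep], paste("samples", paste("sample", fileName, sep = "_"), sep = "/"))
}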

pathToFiles <- "final/en_US"
if (!file.exists(paste("samples", "sample_en_US.blogs.txt", sep="/"))) {
  sampleProbability <- .3
  createSample(pathToFiles, "en_US.blogs.txt", sampleProbability)
  createSample(pathToFiles, "en_US.news.txt", sampleProbability)
  createSample(pathToFiles, "en_US.twitter.txt", sampleProbability)
}
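As a quick sanity check (a minimal sketch, assuming the directory layout created above), each sample should contain roughly 30% of the lines of its source file:

# Ratio of sampled lines to original lines; should be close to p = 0.3
countLines("samples/sample_en_US.blogs.txt") / countLines("final/en_US/en_US.blogs.txt")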

I have also defined a function to tokenize a tm corpus object (lower-casing the text and removing punctuation, numbers and stopwords, then applying stemming and stripping whitespace) and another function to remove profane words using the List of Dirty, Naughty, Obscene, and Otherwise Bad Words.

tokenizeCorpus <- function(corpus) {
  corpus <- tm_map(corpus, tolower)
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("english"))
  corpus <- tm_map(corpus, stemDocument, language = "english")
  corpus <- tm_map(corpus, stripWhitespace)
  tm_map(corpus, PlainTextDocument)
}

filterProfanity <- function(corpus) {
  profanityFileName <- "profanity.txt"
  if (!file.exists(profanityFileName)) {
    profanity.url <- "https://raw.githubusercontent.com/shutterstock/List-of-Dirty-Naughty-Obscene-and-Otherwise-Bad-Words/master/en"
    download.file(profanity.url, destfile = profanityFileName, method = "curl")
  }
  
  profanity <- read.csv(profanityFileName, header = FALSE, stringsAsFactors = FALSE)$V1
  # drop the last entry of the word list
  profanity <- head(profanity, -1)
  
  tm_map(corpus, removeWords, profanity)
}
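As a quick illustration of the two helpers chained together (a toy example, not part of the pipeline; the sentence is made up and the exact output depends on the tm version):

# Toy corpus run through the same profanity filter and tokenizer
toy <- Corpus(VectorSource("This is 1 SAMPLE sentence, with Numbers and stopwords!"))
toy <- filterProfanity(toy)
toy <- tokenizeCorpus(toy)
toy[[1]]$content
# roughly: "sampl sentenc number stopword" (lower-cased, cleaned and stemmed)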

Exploratory data analysis

Let the fun begin! We load the sample files into a tm corpus and have a look at it.

if (!exists("corpus")) { # do not load if already in the environment
  corpus <- Corpus(DirSource("samples", encoding = "UTF-8"), readerControl = list(language = "en_US"))
}
# Let's start with a summary
summary(corpus)
##              Length Class             Mode
## character(0) 2      PlainTextDocument list
## character(0) 2      PlainTextDocument list
## character(0) 2      PlainTextDocument list
# Let's look at the number of entries for each document of the corpus
corpusSizes <- cbind(length(corpus[[1]]$content), length(corpus[[2]]$content), length(corpus[[3]]$content))
colnames(corpusSizes) <- c("Blogs", "News", "Twitter")
corpusSizes
##       Blogs   News Twitter
## [1,] 270534 270534  708660
# First 10 entries of the Blogs corpus
head(corpus[[1]]$content, n = 10)
##  [1] "chad awesome kids holding fort work later usual kids busy together playing skylander xbox together kyan cashed piggy bank wanted game bad used gift card birthday saving money get never taps thing either know wanted bad made count money make sure enough cute watch reaction realized also good job letting lola feel like playing letting switch characters loves almost much "                                   
##  [2] " anyways going share home decor inspiration storing folder puter amazing images stored away ready come life get hom"                                                                                                                                                                                                                                                                                                   
##  [3] " graduation season right around corner nancy whipped fun set help graduation cards gifts occasion brings change ones life stamped images memento tuxedo black cut circle nestabilities embossed kraft red cardstock tes new stars impressions plate double sided gives fantastic patterns can see use impressions plates tutorial taylor created just one pass die cut machine using embossing pad kit need super easi"
##  [4] " friends similar stories treated brusquely laurelwood staff often names keep coming halfdozen friends mine refuse step foot ever many others theyre telling keeping away one can guess"                                                                                                                                                                                                                                
##  [5] "peter schiff hard tell will look pretty bad americans prices will go way cant afford buy stuff also get bad far loss individual liberty lot people will blame capitalism freedom will claim need government used impetus regulation disaster impetus get rid regulation causing problem whether will right wrong thing america will lot pain first got serious problems deal dealing problems make problems wors"      
##  [6] " enroute cornwall months slog sun sea always job needs tourist town im bringing stuff back friday will need cancel bills retreat bad thing moved leeds promise millies went nowhere move "                                                                                                                                                                                                                             
##  [7] " one thing astounding though support marshals phenomenal rain long remaining cheery supportive amazing bunch owing nature course closed roads meant supporters knew area able skip around course people seen times also sup"                                                                                                                                                                                           
##  [8] " attend friend kendras wedding joy watch people see end years wait"                                                                                                                                                                                                                                                                                                                                                    
##  [9] " hoping howie smash face cake sister made seemed like scared anything staring us cheering started dig "                                                                                                                                                                                                                                                                                                                
## [10] " come condemn world save world"
# First 10 entries of the twitter corpus
head(corpus[[3]]$content, n = 10)
##  [1] " btw thanks rt gonna dc anytime soon love see way way long"                     
##  [2] "theyve decided fun dont"                                                        
##  [3] " tired d played lazer tag ran lot d ughh going sleep like minutes "             
##  [4] "im coo jus work hella tired r u ever cali"                                      
##  [5] " always wonder guys auctions shows learned talk fast hear djsosnekspqnslanskam" 
##  [6] "dammnnnnn catch"                                                                
##  [7] " great picture green shirt totally brings ey"                                   
##  [8] "packing quick move street mov"                                                  
##  [9] "ford focus hatchback"                                                           
## [10] "good questions rt brand will judged based website website good brand ambassador"

Now let’s apply the profanity filter and see how it looks for the Twitter document (since its entries are the shortest).

# Calculate the number of cores
ncores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(ncores)
registerDoParallel(cl)
# filter profane language
filteredCorpus <- filterProfanity(corpus)
head(filteredCorpus[[3]]$content, n = 10)
##  [1] " btw thanks rt gonna dc anytime soon love see way way long"                     
##  [2] "theyve decided fun dont"                                                        
##  [3] " tired d played lazer tag ran lot d ughh going sleep like minutes "             
##  [4] "im coo jus work hella tired r u ever cali"                                      
##  [5] " always wonder guys auctions shows learned talk fast hear djsosnekspqnslanskam" 
##  [6] "dammnnnnn catch"                                                                
##  [7] " great picture green shirt totally brings ey"                                   
##  [8] "packing quick move street mov"                                                  
##  [9] "ford focus hatchback"                                                           
## [10] "good questions rt brand will judged based website website good brand ambassador"
stopCluster(cl)

Now we tokenize it.

# Calculate the number of cores
ncores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(ncores)
registerDoParallel(cl)
# tokenize the corpus: remove whitespace, common stopwords, punctuation and numbers, convert to lower case and stem
filteredCorpus <- tokenizeCorpus(filteredCorpus)
head(filteredCorpus[[3]]$content, n = 10)
##  [1] " btw thanks rt gonna dc anytime soon love see way way long"                     
##  [2] "theyve decided fun dont"                                                        
##  [3] " tired d played lazer tag ran lot d ughh going sleep like minutes "             
##  [4] "im coo jus work hella tired r u ever cali"                                      
##  [5] " always wonder guys auctions shows learned talk fast hear djsosnekspqnslanskam" 
##  [6] "dammnnnnn catch"                                                                
##  [7] " great picture green shirt totally brings ey"                                   
##  [8] "packing quick move street mov"                                                  
##  [9] "ford focus hatchback"                                                           
## [10] "good questions rt brand will judged based website website good brand ambassador"
stopCluster(cl)

Calculate the Term Document Matrix, i.e. the frequency of each term in each document of the corpus.

# Calculate the number of cores
ncores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(ncores)
registerDoParallel(cl)
# Calculate the Term Document Matrix: the frequency of each term in each document
tdm <- TermDocumentMatrix(filteredCorpus)
stopCluster(cl)

Of special importance is the sparsity. Let’s also look at the first 10 terms.

tdm
## <<TermDocumentMatrix (terms: 435483, documents: 3)>>
## Non-/sparse entries: 601453/704996
## Sparsity           : 54%
## Maximal term length: 299
## Weighting          : term frequency (tf)
inspect(tdm[1:10,])
## <<TermDocumentMatrix (terms: 10, documents: 3)>>
## Non-/sparse entries: 10/20
## Sparsity           : 67%
## Maximal term length: 3
## Weighting          : term frequency (tf)
## 
##         Docs
## Terms    character(0) character(0) character(0)
##   ˆڡˆ               0            0            1
##   들국화            1            0            0
##   서비스            1            0            0
##   이승열            1            0            0
##   ላቤን               1            0            0
##   갈수록            1            0            0
##   돈가스            1            0            0
##   크리스            1            0            0
##   한희정            1            0            0
##   김사랑            1            0            0

There are foreign-language terms such as 서비스 and 이승열 because the source text mixes languages; one possible cleanup, sketched below, would be to strip non-ASCII characters before building the matrices. After that, let’s calculate the inverse view, the Document Term Matrix, i.e. how many times each term appears in each document of the corpus.
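A minimal sketch of that cleanup (it is not applied in the rest of this report, so the mixed-language terms remain):

# Sketch: strip non-ASCII characters from the corpus before building the matrices
removeNonASCII <- function(x) iconv(x, from = "UTF-8", to = "ASCII", sub = "")
asciiCorpus <- tm_map(filteredCorpus, content_transformer(removeNonASCII))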

# Calculate the number of cores
ncores <- detectCores() - 1
# Initiate cluster
cl <- makeCluster(ncores)
registerDoParallel(cl)
dtm <- DocumentTermMatrix(filteredCorpus)
dtm
## <<DocumentTermMatrix (documents: 3, terms: 435483)>>
## Non-/sparse entries: 601453/704996
## Sparsity           : 54%
## Maximal term length: 299
## Weighting          : term frequency (tf)
stopCluster(cl)

Let’s see what this got us.

# Let's see the 10 terms that appear the most across all documents.
head(sort(col_sums(dtm), decreasing = TRUE), n = 10)
##  will  said  just   one  like   can   get   new  time  good 
## 94049 91815 90900 83793 79500 74041 68256 58909 58299 53305
# Let's see the average number of times the top 10 terms appear in each document.
head(sort(row_means(tdm), decreasing = TRUE), n = 10)
##     will     said     just      one     like      can      get      new 
## 31349.67 30605.00 30300.00 27931.00 26500.00 24680.33 22752.00 19636.33 
##     time     good 
## 19433.00 17768.33

For further analysis, we can reduce the size of the matrix by removing sparse terms, leaving only the most common and presumably most useful terms. After that we convert the reduced Term Document Matrix from its simple triplet (sparse) representation, handled by the slam package, into an ordinary matrix whose vectors are faster to work with, and we still get the same results.

tdmCommon <- removeSparseTerms(tdm, 0.4)
mCommon <- as.matrix(tdmCommon)
vCommon <- sort(rowSums(mCommon), decreasing = TRUE)
# the 10 terms that appear the most across all documents are the same as before
vCommon[1:10]
##  will  said  just   one  like   can   get   new  time  good 
## 94049 91815 90717 83798 79500 74040 68256 58909 58304 53304
# Let's compare sizes of each object
cat("tdm representation costs", format(object.size(tdm), units = "Mb"), "bytes.\n", 
    "tdmCommon representation costs", format(object.size(tdmCommon), units = "Mb"), "bytes.\n",
    "Simple triplet matrix representation costs", format(object.size(mCommon), units = "Mb"), "bytes.")
## tdm representation costs 34.8 Mb bytes.
##  tdmCommon representation costs 10.4 Mb bytes.
##  Simple triplet matrix representation costs 8.7 Mb bytes.

One thing we could do is identify the most frequent terms and look for common associations with a given word, for instance the word “just”.

# Find terms that appear at least 50,000 times
findFreqTerms(tdmCommon, lowfreq = 50000)
##  [1] "can"  "day"  "dont" "get"  "good" "just" "like" "new"  "now"  "one" 
## [11] "said" "time" "will"
# find common associations with the term 'just'
assocJust <- findAssocs(tdmCommon, "just", 0.99)
# top ten associations
head(sort(assocJust[[1]], decreasing = FALSE), n = 10)
##    acronym   acutally       ahha      ahole   alfresco algorithms 
##       0.99       0.99       0.99       0.99       0.99       0.99 
##     allbut   allergic   allisons     alseep 
##       0.99       0.99       0.99       0.99

Let’s visualize correlations between terms of a term-document matrix.

plot(tdmCommon, corThreshold = 0.8, weighting = TRUE)

(Figure: correlation plot of frequent terms in the term-document matrix)

Now as a wordcloud

set.seed(4363)
wordcloud(names(vCommon), vCommon, scale=c(5,0.5), max.words=100, random.order=FALSE, rot.per=0.35, use.r.layout=FALSE, colors=brewer.pal(8, 'Dark2'))

(Figure: wordcloud of the most common terms)

Finally, we can also transform the matrix into a data frame and plot the terms or n-grams by frequency. Right now our matrix contains 1-grams (single words).

common.df <- as.data.frame(mCommon)
colnames(common.df) <- c("Blogs", "News", "Twitter")
common.df["Count"] <- apply(common.df, 1, sum)
common.df <- common.df[order(common.df$Count, decreasing = TRUE),]
head(common.df)
##      Blogs  News Twitter Count
## will 33777 32192   28080 94049
## said 10942 75421    5452 91815
## just 30319 16070   44511 90900
## one  36619 24707   22467 83793
## like 29328 14410   35762 79500
## can  29781 17684   26576 74041
ngram.df <- data.frame(Ngram = rownames(common.df), Count = common.df$Count)
ngram.df$Ngram <- factor(ngram.df$Ngram, levels = ngram.df$Ngram)
g <- ggplot(ngram.df[1:20,], aes(x=Ngram,  y=Count)) +
  geom_bar(stat = "identity") + 
  theme(legend.title=element_blank()) +
  xlab("Ngram") + ylab("Frequency") +
  labs(title = "Top 20 1-grams by Frequency")+theme(axis.text.x = element_text(angle = 90))
print(g)

(Figure: bar chart of the top 20 1-grams by frequency)

Modelling

Now let’s build a basic n-gram model for predicting the next word based on the previous 1, 2 or 3 words. The common approach would have been to use the NGramTokenizer from RWeka, but this resulted in a memory error even after assigning 12 GB of memory to the JVM with options(java.parameters = "-Xmx12g"); a sketch of that attempt is shown below for reference. Instead I switched to quanteda for this portion of the report. Luckily quanteda accepts a tm VCorpus as input, so I could reuse the tokenized and profanity-filtered corpus I was already using.
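For reference, the abandoned RWeka attempt looked roughly like this (a sketch only, shown but not run; TrigramTokenizer is just a local helper name):

# RWeka attempt (not run: it exhausted memory on this corpus)
options(java.parameters = "-Xmx12g")  # must be set before rJava/RWeka is initialized
library(RWeka)
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tdm3 <- TermDocumentMatrix(filteredCorpus, control = list(tokenize = TrigramTokenizer))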

library(quanteda)
if (!exists("qcorpus")) { # do not rebuild if already in the environment
  qcorpus <- corpus(filteredCorpus)
}
summary(qcorpus, 10)
## Corpus consisting of 3 documents.
## 
##     Text  Types   Tokens Sentences
##  content 435410 20809160    199994
##     meta      0        0         0
##    dmeta      0        0         0
## 
## Source:  Converted from tm VCorpus 'corpus'
## Created: Sun Mar 20 04:15:25 2016
## Notes:

Let’s now build the 1-, 2- and 3-gram tokenizations on which the models will be based.

ng1 <- tokenize(qcorpus, removePunct = TRUE, ngrams = 1)
ng2 <- tokenize(qcorpus, removePunct = TRUE, ngrams = 2)
ng3 <- tokenize(qcorpus, removePunct = TRUE, ngrams = 3)

Now we create a document-feature matrix for each, which is a sparse matrix similar to tm’s Document Term Matrix.

qdmf1 <- dfm(ng1)
## 
##    ... indexing documents: 3 documents
##    ... indexing features: 435,398 feature types
##    ... created a 3 x 435398 sparse dfm
##    ... complete. 
## Elapsed time: 28.936 seconds.
qdmf2 <- dfm(ng2)
## 
##    ... indexing documents: 3 documents
##    ... indexing features: 9,002,257 feature types
##    ... created a 3 x 9002257 sparse dfm
##    ... complete. 
## Elapsed time: 23.537 seconds.
qdmf3 <- dfm(ng3)
## 
##    ... indexing documents: 3 documents
##    ... indexing features: 15,813,474 feature types
##    ... created a 3 x 15813474 sparse dfm
##    ... complete. 
## Elapsed time: 27.574 seconds.

We can obtain the top ten features for each n-gram model.

topfeatures(qdmf1, 10)
##  will  said  just   one  like   can    im   get   new  time 
## 94051 91815 90717 83803 79502 74064 73814 68258 58910 58324
topfeatures(qdmf2, 10)
##   right_now   dont_know    new_york   cant_wait   last_year  last_night 
##        7364        5893        5758        5730        5685        4557 
##    im_going high_school   years_ago   last_week 
##        4267        4178        4142        3832
topfeatures(qdmf3, 10)
##          cant_wait_see      happy_mothers_day            let_us_know 
##                   1080                   1060                    750 
##          new_york_city         happy_new_year         im_pretty_sure 
##                    664                    566                    501 
##          two_years_ago president_barack_obama          cinco_de_mayo 
##                    474                    471                    417 
##         dont_even_know 
##                    395
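These frequency tables already suggest how the prediction will work: look up the longest available n-gram prefix and return its most frequent continuation, backing off to shorter n-grams when nothing matches. Here is a minimal sketch of that idea (it assumes the qdmf2 and qdmf3 objects built above and that colSums works on them as on ordinary sparse matrices; predictNext is a hypothetical helper, not the final model):

# Frequency tables keyed by the underscore-joined n-grams, e.g. "happy_mothers_day"
freq2 <- colSums(qdmf2)
freq3 <- colSums(qdmf3)

# Sketch of a back-off lookup: try the 3-gram table first, then fall back to 2-grams
predictNext <- function(phrase) {
  words <- unlist(strsplit(tolower(phrase), "\\s+"))
  n <- length(words)
  if (n >= 2) {
    prefix <- paste(words[n - 1], words[n], sep = "_")
    hits <- freq3[grepl(paste0("^", prefix, "_"), names(freq3))]
    if (length(hits) > 0) return(sub(".*_", "", names(which.max(hits))))
  }
  hits <- freq2[grepl(paste0("^", words[n], "_"), names(freq2))]
  if (length(hits) > 0) return(sub(".*_", "", names(which.max(hits))))
  NA_character_
}

predictNext("happy mothers")  # should return "day" given the counts above

The real model will need smoothing and a more compact lookup structure, but this is the basic back-off idea the prediction algorithm and Shiny app will build on.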

Let’s plot a wordcloud for each n-gram model.

# For 1-gram
suppressWarnings(plot(qdmf1, max.words = 100, colors = brewer.pal(6, "Dark2"), scale = c(8, .5)))

(Figure: wordcloud of the most frequent 1-grams)

# For 2-gram
suppressWarnings(plot(qdmf2, max.words = 100, colors = brewer.pal(6, "Dark2"), scale = c(8, .5)))

(Figure: wordcloud of the most frequent 2-grams)

# For 3-gram
suppressWarnings(plot(qdmf3, max.words = 100, colors = brewer.pal(6, "Dark2"), scale = c(8, .5)))

(Figure: wordcloud of the most frequent 3-grams)