Introduction

This report is for the Johns Hopkins Coursera Data Science Capstone project. It summarises textual features of three files containing blog, news, and Twitter text, in preparation for developing a word prediction product.

The data can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The capstone requires the use of files contained in the en_US folder only. The folder contains three files, en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
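For reproducibility, the archive can be fetched and extracted with base R. A minimal sketch, assuming the zip is saved into the working directory and extracted to the ./final folder used below:

# Download and extract the SwiftKey data (destination names are assumptions)
zip.url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file(zip.url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
if (!dir.exists("./final/en_US")) {
  unzip("Coursera-SwiftKey.zip")
}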


Load the data

us.blogs.con <- file("./final/en_US/en_US.blogs.txt", "r") 
us.news.con <- file("./final/en_US/en_US.news.txt", "r") 
us.twitter.con <- file("./final/en_US/en_US.twitter.txt", "r") 

txt.blog <- readLines(us.blogs.con, skipNul=TRUE)
txt.news <- readLines(us.news.con, skipNul=TRUE)
txt.twitter <- readLines(us.twitter.con, skipNul=TRUE)

close(us.blogs.con)
close(us.news.con)
close(us.twitter.con)

# The number of lines

lines.blog <- length(txt.blog)
lines.news <- length(txt.news)
lines.twitter <- length(txt.twitter)

# Number of words (str_count and boundary come from the stringr package)
library(stringr)

words.blog <- sum(str_count(txt.blog, boundary("word")))
words.news <- sum(str_count(txt.news, boundary("word")))
words.twitter <- sum(str_count(txt.twitter, boundary("word")))

# Data frame for table creation only, so allow spaces in the column names
data_summary <- data.frame('Source Type'     = c("Blogs", "News", "Twitter"),
                           'File Name'       = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
                           'Number of Lines' = c(lines.blog, lines.news, lines.twitter),
                           'Number of Words' = c(words.blog, words.news, words.twitter),
                           check.names = FALSE)

library(knitr)
kable(data_summary, format.args = list(big.mark = ','))

Source Type  File Name          Number of Lines  Number of Words
Blogs        en_US.blogs.txt            899,288       37,546,246
News         en_US.news.txt           1,010,242       34,762,395
Twitter      en_US.twitter.txt        2,360,148       30,093,410

In an effort to keep the main report brief and reduce execution time, code used to load and process data has been moved to the appendices.

Due to performance issues and R crashes (especially with the tm package), the data was processed in stages and saved to the file system along the way. The data was split into training (60%) and test (40%) sets for each text source and written back as text files (see Appendix A for the code). The training sets were then read back in, converted to data frames, and saved as RData objects (see Appendix B for the code). Each training set was then tokenized into unigrams, bigrams, and trigrams, and each set of tokens was saved as an RData object (see Appendix C for the code).

The quanteda library is used, and the training RData objects of unigrams, bigrams, and trigrams are attached back into the environment. This greatly reduces the time needed to work with the data.

At this stage stop words have been kept, as they may be needed when predicting the next word. Stemming has not been applied, as it would require additional processing to map stems back to full words. I have not converted the text to lowercase. These decisions will need to be reassessed for the final product.

  library(quanteda)
  
  # Load unigrams, bigrams, and trigrams
  attach("blog.unigram.RData")
  #attach("blog.bigram.RData")
  #attach("blog.trigram.RData")
  #attach("news.unigram.RData")
  #attach("news.bigram.RData")
  #attach("news.trigram.RData")
  #attach("twitter.unigram.RData")
  #attach("twitter.bigram.RData")
  #attach("twitter.trigram.RData")

  # Load Document Feature Matrix

  attach("blog.unigram.dfm.RData")
  #attach("blog.bigram.dfm.RData")
  #attach("blog.trigram.dfm.RData")
  #attach("news.unigram.dfm.RData")
  #attach("news.bigram.dfm.RData")
  #attach("news.trigram.dfm.RData")
  #attach("twitter.unigram.dfm.RData")
  #attach("twitter.bigram.dfm.RData")
  #attach("twitter.trigram.dfm.RData")

How many unique words are needed in a frequency-sorted dictionary to cover 50% of all word instances in the text?

  blog.unigram.features <- length(attr(blog.unigram.dfm, "Dimnames")$features)
  head(features(blog.unigram.dfm), 20)
##  [1] "Chad"    "has"     "been"    "awesome" "with"    "the"     "kids"   
##  [8] "and"     "holding" "down"    "fort"    "while"   "I"       "work"   
## [15] "later"   "than"    "usual"   "The"     "have"    "busy"
  # Profanity in the tail
  # tail(features(blog.unigram.dfm), 20)
  blog.unigram.frequencies <- topfeatures(blog.unigram.dfm, blog.unigram.features)
  frequency.50 <- 0
  loopIndx <- 1
  # Sum all the frequencies and multiply by 0.5 to get the frequency total at 50%
  blog.words.total <- sum(blog.unigram.frequencies)
  blog.words.total.50 <- blog.words.total * 0.5
 
  while(frequency.50 < blog.words.total.50){
    frequency.50 <- frequency.50 + sum(blog.unigram.frequencies[loopIndx])
    loopIndx <- loopIndx + 1 
  }

The value of loopIndx is the number of words that cover 50% of all word instances.

loopIndx: 146 words
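The same figure can be obtained without a loop by taking the cumulative sum of the frequency-sorted counts; a vectorised sketch (note that the while loop above increments loopIndx once more after the threshold is crossed, so the two counts may differ by one):

# Vectorised alternative: first position where cumulative coverage reaches 50%
blog.coverage <- cumsum(blog.unigram.frequencies) / blog.words.total
which(blog.coverage >= 0.5)[1]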

Words covering 50% of all word instances

blog.unigram.frequencies[1:loopIndx]
##       the        to       and        of         a         I        in 
##   1000150    632859    620347    520503    517974    458036    330927 
##      that        is       for        it      with       was        on 
##    266601    256674    206298    200755    166797    166466    158518 
##       you        my      have        be      this        as       are 
##    157808    146010    129205    123584    122393    120359    114014 
##       The       not        at       but      from        we        or 
##    112111     96945     96521     94794     85937     82670     82369 
##        me       all        so        by     about      they       one 
##     81663     79056     74279     69881     68376     67024     66191 
##      will       out       had        an        up        he       her 
##     65822     64997     63829     63284     62916     61776     61222 
##       his       can      like      your       has      just      more 
##     59674     57090     56657     56104     55825     55546     53760 
##      what     their      time        do      when      some       who 
##     52197     51873     51770     50121     49149     49003     48368 
##     would      them       our      been      were        if     there 
##     48247     47949     47044     46444     46144     44621     43163 
##       get     which        It       she      into      know       And 
##     41551     41379     41266     39617     37446     35881     33950 
##    people       how      This        no     other   because      also 
##     33578     33336     32642     32176     32109     31796     30435 
##      than      only      then      over      make      back        We 
##     29999     29985     29770     29198     29092     29078     29019 
##    really       But      much       see       him        am      very 
##     29009     28656     28594     28262     28178     27837     27757 
##     think       now        us       way     first      even      good 
##     27750     27639     27505     27492     27276     27201     27009 
##       day     could    little        He       new        In        So 
##     26715     26438     26232     25282     25166     25058     24451 
##        go      love     these     going       any       two    things 
##     24349     23878     23585     23444     22807     22618     22360 
##      life      want     being      well       I'm       did      many 
##     22263     22200     22169     22145     21958     21888     21839 
##      work       too     where      made      said         A      most 
##     21811     21776     21688     21664     21610     21479     21391 
##     still      here     those        If    before   through     after 
##     21214     21175     20963     20667     20583     20545     20472 
##      down       You       off something     years    around     right 
##     20218     20061     19802     19600     19494     19338     19171 
##      last       its       few      take       got       say 
##     19082     18878     18811     18738     18513     18447

Plot the frequency of the words covering 50% of all word instances

plot(blog.unigram.frequencies[1:loopIndx], log = "y", cex = .6, ylab = "Term frequency")

To keep the report to a reasonable length, I won’t repeat this code for the news and Twitter unigrams; a reusable version is sketched below.
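If the same calculation is needed for the news and Twitter sources, it could be wrapped in a small helper; a sketch (the function name coverage.count is my own, and the relevant dfm RData objects would need to be attached first):

# Hypothetical helper: how many top-frequency words cover a given
# proportion of all word instances in a dfm
coverage.count <- function(this.dfm, proportion = 0.5) {
  n.features <- length(attr(this.dfm, "Dimnames")$features)
  freqs <- topfeatures(this.dfm, n.features)
  which(cumsum(freqs) / sum(freqs) >= proportion)[1]
}

# coverage.count(news.unigram.dfm)
# coverage.count(twitter.unigram.dfm)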

As the text files contain offensive words and profanity, a dictionary will be used to remove them. A list of words is available at http://www.cs.cmu.edu/~biglou/resources/bad-words.txt; it has been manually edited to remove words that are not necessarily offensive.

  # Create an offensive dictionary 
  
  offensive.unigrams <- readLines("offensivelist.txt")
  offensive.dict <- dictionary(list(badwords=offensive.unigrams))
  bad.words <- selectFeatures(blog.unigram.dfm, offensive.dict, selection="keep")
## kept 1,026 features, from 1177 supplied (glob) feature types
  sorted.bad.words <- sort(features(bad.words))

Number of bad words: 1026

Interesting

I find it interesting that there is a substantial amount of offensive language within the blog text. I suspect this will also be the case with the Twitter text.

Removing offensive language is a considerable challenge. It is easy enough to remove a given set of terms, but the range is immense, and many words are not offensive on their own; the real difficulty lies in the context in which the language is used. Consideration must also be given to what counts as offensive, which may come down to individual, group, or organisational preference. Removal of profanity is therefore about both the context of the language and the preferences of those using the prediction tool, and may be something best handled with a user-settable flag, as sketched below.
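One way to support that preference is to expose profanity removal as a flag rather than applying it unconditionally; a rough sketch reusing the offensive.dict built above (the wrapper name and argument are my own):

# Hypothetical wrapper: remove offensive terms only when requested
filter.profanity <- function(this.dfm, remove.offensive = FALSE) {
  if (remove.offensive) {
    this.dfm <- selectFeatures(this.dfm, offensive.dict, selection = "remove")
  }
  this.dfm
}

# filter.profanity(blog.unigram.dfm, remove.offensive = TRUE)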

Prediction Algorithm and Shiny App

A prediction algorithm will have a better success rate on texts that are alike. I think the blog, news, and Twitter texts are probably quite dissimilar, so a separate model will be trained on each type. The Shiny app will have a drop-down with the choices blog, news, and twitter to select the type of text for which to predict the next word, as sketched below.
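A minimal sketch of that drop-down in the Shiny UI, with placeholder input and output identifiers:

library(shiny)

# Hypothetical UI fragment: pick which text source's model to use
ui <- fluidPage(
  selectInput("source", "Text type", choices = c("blog", "news", "twitter")),
  textInput("phrase", "Type a phrase"),
  textOutput("nextword")
)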

I also believe that a higher success rate can be achieved by labelling parts of speech (verb, noun, noun phrase, etc.) and using grammatical rules within the algorithm. As stated previously, the text has not been converted to lowercase, as case may be an important signal for these rules, and I may also have to retain punctuation.

I will also have to remove non-English words; one possible heuristic is sketched below.
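A simple first pass could drop features containing non-ASCII characters; a sketch of one heuristic (it will not catch foreign words written in the basic Latin alphabet):

# Heuristic: flag features that contain any non-ASCII character
blog.features <- names(blog.unigram.frequencies)
non.ascii <- grepl("[^\\x01-\\x7F]", blog.features, perl = TRUE)
english.only <- blog.features[!non.ascii]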

I have split the data into 60% training and 40% test sets. Although loading RData objects is reasonably fast, the final product may have to use a much smaller data set to be usable on a mobile device.

I will build the prediction algorithm on unigrams, bigrams, and trigrams; a rough sketch of the backoff idea follows.
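As a rough illustration of how those tables might be used, a simple backoff lookup could try the trigram counts first and fall back to bigrams and then unigrams. This is only a sketch of the idea, not the final algorithm; it assumes frequency-sorted count vectors whose names are underscore-joined n-grams, as produced by topfeatures() on the dfm objects above:

# Hypothetical backoff lookup over named, frequency-sorted n-gram count vectors
predict.next <- function(phrase, trigram.freq, bigram.freq, unigram.freq) {
  words <- unlist(strsplit(phrase, "\\s+"))
  n <- length(words)

  if (n >= 2) {
    prefix <- paste0(words[n - 1], "_", words[n], "_")
    hits <- trigram.freq[startsWith(names(trigram.freq), prefix)]
    if (length(hits) > 0) return(sub(prefix, "", names(hits)[1], fixed = TRUE))
  }
  if (n >= 1) {
    prefix <- paste0(words[n], "_")
    hits <- bigram.freq[startsWith(names(bigram.freq), prefix)]
    if (length(hits) > 0) return(sub(prefix, "", names(hits)[1], fixed = TRUE))
  }
  names(unigram.freq)[1]  # fall back to the most frequent word overall
}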

Appendix A - Split data into training and test sets and save as text files

# Create data frames
blog.df <- data.frame(txt.blog, stringsAsFactors = FALSE)
news.df <- data.frame(txt.news, stringsAsFactors = FALSE)
twitter.df <- data.frame(txt.twitter, stringsAsFactors = FALSE)

library(caTools)
set.seed(2016)
blog.split <- sample.split(blog.df$txt.blog, SplitRatio=0.6)
blog.train <- blog.df[blog.split==TRUE,]
blog.test <- blog.df[blog.split==FALSE,]

news.split <- sample.split(news.df$txt.news, SplitRatio=0.6)
news.train <- news.df[news.split==TRUE,]
news.test <- news.df[news.split==FALSE,]

twitter.split <- sample.split(twitter.df$txt.twitter, SplitRatio=0.6)
twitter.train <- twitter.df[twitter.split==TRUE,]
twitter.test <- twitter.df[twitter.split==FALSE,]

files.to.write <- c("blog.train", "blog.test", "news.train", "news.test", "twitter.train", "twitter.test")
files.list <- list(blog.train, blog.test, news.train, news.test, twitter.train, twitter.test)

writeFiles <- function(myfile, i){
  thefile <- paste(files.to.write[[i]], ".txt", sep="")
  con <- file(thefile, "w")
  writeLines(myfile, con)
  close(con)
}

mapply(writeFiles, files.list, seq_along(files.to.write))

Appendix B - Training dataframes saved as RData objects

blogs.train <- file("blog.train.txt", "r") 
news.train <- file("news.train.txt", "r") 
twitter.train <- file("twitter.train.txt", "r") 

txt.blog <- readLines(blogs.train, skipNul=TRUE)
txt.news <- readLines(news.train, skipNul=TRUE)
txt.twitter <- readLines(twitter.train, skipNul=TRUE)

close(blogs.train)
close(news.train)
close(twitter.train)

blog.train <- data.frame(txt.blog, stringsAsFactors = FALSE)
news.train <- data.frame(txt.news, stringsAsFactors = FALSE)
twitter.train <- data.frame(txt.twitter, stringsAsFactors = FALSE)

save(blog.train, file="blog.train.RData")
save(news.train, file="news.train.RData")
save(twitter.train, file="twitter.train.RData")

Appendix C - Unigrams, Bigrams, and Trigrams saved as RData objects

blog.trigram <- tokenize(blog.train[, 1], what = "word", ngrams = 3, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)
blog.bigram <- tokenize(blog.train[, 1], what = "word", ngrams = 2, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)
blog.unigram <- tokenize(blog.train[, 1], what = "word", ngrams = 1, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)

news.trigram <- tokenize(news.train[, 1], what = "word", ngrams = 3, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)
news.bigram <- tokenize(news.train[, 1], what = "word", ngrams = 2, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)
news.unigram <- tokenize(news.train[, 1], what = "word", ngrams = 1, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)

twitter.trigram <- tokenize(twitter.train[, 1], what = "word", ngrams = 3, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)
twitter.bigram <- tokenize(twitter.train[, 1], what = "word", ngrams = 2, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)
twitter.unigram <- tokenize(twitter.train[, 1], what = "word", ngrams = 1, removePunct = TRUE, removeSymbols = TRUE, removeSeparators = TRUE)


save(blog.trigram, file="blog.trigram.RData")
save(blog.bigram, file="blog.bigram.RData")
save(blog.unigram, file="blog.unigram.RData")

save(news.trigram, file="news.trigram.RData")
save(news.bigram, file="news.bigram.RData")
save(news.unigram, file="news.unigram.RData")

save(twitter.trigram, file="twitter.trigram.RData")
save(twitter.bigram, file="twitter.bigram.RData")
save(twitter.unigram, file="twitter.unigram.RData")

Appendix D - Quanteda Document Feature Matrix saved as RData objects.

blog.unigram.dfm <- dfm(blog.unigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
blog.bigram.dfm <- dfm(blog.bigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
blog.trigram.dfm <- dfm(blog.trigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
save(blog.trigram.dfm, file="blog.trigram.dfm.RData")
save(blog.bigram.dfm, file="blog.bigram.dfm.RData")
save(blog.unigram.dfm, file="blog.unigram.dfm.RData")

news.unigram.dfm <- dfm(news.unigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
news.bigram.dfm <- dfm(news.bigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
news.trigram.dfm <- dfm(news.trigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
save(news.unigram.dfm, file="news.unigram.dfm.RData")
save(news.bigram.dfm, file="news.bigram.dfm.RData")
save(news.trigram.dfm, file="news.trigram.dfm.RData")

twitter.unigram.dfm <- dfm(twitter.unigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
twitter.bigram.dfm <- dfm(twitter.bigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
twitter.trigram.dfm <- dfm(twitter.trigram, verbose = TRUE, toLower = FALSE, stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL, language = "english", thesaurus = NULL, dictionary = NULL)
save(twitter.unigram.dfm, file="twitter.unigram.dfm.RData")
save(twitter.bigram.dfm, file="twitter.bigram.dfm.RData")
save(twitter.trigram.dfm, file="twitter.trigram.dfm.RData")