This report is for the Johns Hopkins Coursera Data Science Capstone project. It summarises textual features of three files containing blog, news, and Twitter text, in preparation for developing a word prediction product.
The data can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip. The capstone requires the use of the files contained in the en_US folder only. The folder contains three files: en_US.blogs.txt, en_US.news.txt, and en_US.twitter.txt.
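The following is a sketch of one way to fetch and unzip the data; the destination file name is illustrative and the download is skipped if the file is already present.
# Download and unzip the capstone data set (only if not already present)
if (!file.exists("Coursera-SwiftKey.zip")) {
  download.file("https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip",
                destfile = "Coursera-SwiftKey.zip", mode = "wb")
}
if (!dir.exists("./final/en_US")) {
  unzip("Coursera-SwiftKey.zip")  # creates the ./final/en_US folder used below
}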
Load the data
us.blogs.con <- file("./final/en_US/en_US.blogs.txt", "r")
us.news.con <- file("./final/en_US/en_US.news.txt", "r")
us.twitter.con <- file("./final/en_US/en_US.twitter.txt", "r")
txt.blog <- readLines(us.blogs.con, skipNul=TRUE)
txt.news <- readLines(us.news.con, skipNul=TRUE)
txt.twitter <- readLines(us.twitter.con, skipNul=TRUE)
close(us.blogs.con)
close(us.news.con)
close(us.twitter.con)
# The number of lines
lines.blog <- length(txt.blog)
lines.news <- length(txt.news)
lines.twitter <- length(txt.twitter)
# Number of words (str_count() and boundary() are from the stringr package)
library(stringr)
words.blog <- sum(str_count(txt.blog, boundary("word")))
words.news <- sum(str_count(txt.news, boundary("word")))
words.twitter <- sum(str_count(txt.twitter, boundary("word")))
# Data frame for table creation only, so allow spaces in the column names (check.names = FALSE)
data_summary <- data.frame('Source Type'     = c("Blogs", "News", "Twitter"),
                           'File Name'       = c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"),
                           'Number of Lines' = c(lines.blog, lines.news, lines.twitter),
                           'Number of Words' = c(words.blog, words.news, words.twitter),
                           check.names = FALSE)
library(knitr)
kable(data_summary, format.args = list(big.mark = ','))
| Source Type | File Name | Number of Lines | Number of Words |
|---|---|---|---|
| Blogs | en_US.blogs.txt | 899,288 | 37,546,246 |
| News | en_US.news.txt | 1,010,242 | 34,762,395 |
| Twitter | en_US.twitter.txt | 2,360,148 | 30,093,410 |
In an effort to keep the main report brief and reduce execution time, code used to load and process data has been moved to the appendices.
Due to many performance issues and R crashes (especially with the tm package), the data was saved as text files, and intermediate results were processed and saved as RData objects. The data was split into training (60%) and test (40%) sets for each text source and written back to the file system (see Appendix A for the code). The training sets were read back in and data frames were created, which were then saved to the file system as RData objects (see Appendix B for the code). A corpus was created for each training set and then tokenized into unigrams, bigrams, and trigrams, each saved as an RData object (see Appendix C for the code).
The quanteda library is used, and the training RData objects of unigrams, bigrams, and trigrams are read back into the environment. This greatly reduces the time needed to work with the data.
At this stage stop words have been kept, as they may be needed when predicting the next word. Stemming has not been implemented either, as this would require additional processing to convert back to the given word. The text has also not been converted to lowercase. These choices will need to be reassessed for the final product; a sketch of how they could be switched on appears after the loading code below.
library(quanteda)
# Load unigrams, bigrams, and trigrams
attach("blog.unigram.RData")
#attach("blog.bigram.RData")
#attach("blog.trigram.RData")
#attach("news.unigram.RData")
#attach("news.bigram.RData")
#attach("news.trigram.RData")
#attach("twitter.unigram.RData")
#attach("twitter.bigram.RData")
#attach("twitter.trigram.RData")
# Load Document Feature Matrix
attach("blog.unigram.dfm.RData")
#attach("blog.bigram.dfm.RData")
#attach("blog.trigram.dfm.RData")
#attach("news.unigram.dfm.RData")
#attach("news.bigram.dfm.RData")
#attach("news.trigram.dfm.RData")
#attach("twitter.unigram.dfm.RData")
#attach("twitter.bigram.dfm.RData")
#attach("twitter.trigram.dfm.RData")
How many unique words are needed in a frequency-sorted dictionary to cover 50% of all word instances in the text?
# Number of unique word types (features) in the blog unigram DFM
blog.unigram.features <- length(attr(blog.unigram.dfm, "Dimnames")$features)
head(features(blog.unigram.dfm), 20)
## [1] "Chad" "has" "been" "awesome" "with" "the" "kids"
## [8] "and" "holding" "down" "fort" "while" "I" "work"
## [15] "later" "than" "usual" "The" "have" "busy"
# Profanity in the tail
# tail(features(blog.unigram.dfm), 20)
blog.unigram.frequencies <- topfeatures(blog.unigram.dfm, blog.unigram.features)
frequency.50 <- 0
loopIndx <- 0
# Sum all the frequencies and multiply by 0.5 to get the frequency total at 50%
blog.words.total <- sum(blog.unigram.frequencies)
blog.words.total.50 <- blog.words.total * 0.5
# Accumulate the (already decreasing) frequencies until half of all word instances is covered
while(frequency.50 < blog.words.total.50){
  loopIndx <- loopIndx + 1
  frequency.50 <- frequency.50 + blog.unigram.frequencies[loopIndx]
}
The final value of loopIndx is the number of unique words that cover 50% of all word instances.
loopIndx: 146 words
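The same count can also be obtained without a loop, since topfeatures() returns the frequencies already sorted in decreasing order (a vectorized sketch):
# First index at which the cumulative frequency reaches 50% of all word instances
words.for.50 <- which(cumsum(blog.unigram.frequencies) >= blog.words.total.50)[1]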
Words covering 50% of all word instances
blog.unigram.frequencies[1:loopIndx]
## the to and of a I in
## 1000150 632859 620347 520503 517974 458036 330927
## that is for it with was on
## 266601 256674 206298 200755 166797 166466 158518
## you my have be this as are
## 157808 146010 129205 123584 122393 120359 114014
## The not at but from we or
## 112111 96945 96521 94794 85937 82670 82369
## me all so by about they one
## 81663 79056 74279 69881 68376 67024 66191
## will out had an up he her
## 65822 64997 63829 63284 62916 61776 61222
## his can like your has just more
## 59674 57090 56657 56104 55825 55546 53760
## what their time do when some who
## 52197 51873 51770 50121 49149 49003 48368
## would them our been were if there
## 48247 47949 47044 46444 46144 44621 43163
## get which It she into know And
## 41551 41379 41266 39617 37446 35881 33950
## people how This no other because also
## 33578 33336 32642 32176 32109 31796 30435
## than only then over make back We
## 29999 29985 29770 29198 29092 29078 29019
## really But much see him am very
## 29009 28656 28594 28262 28178 27837 27757
## think now us way first even good
## 27750 27639 27505 27492 27276 27201 27009
## day could little He new In So
## 26715 26438 26232 25282 25166 25058 24451
## go love these going any two things
## 24349 23878 23585 23444 22807 22618 22360
## life want being well I'm did many
## 22263 22200 22169 22145 21958 21888 21839
## work too where made said A most
## 21811 21776 21688 21664 21610 21479 21391
## still here those If before through after
## 21214 21175 20963 20667 20583 20545 20472
## down You off something years around right
## 20218 20061 19802 19600 19494 19338 19171
## last its few take got say
## 19082 18878 18811 18738 18513 18447
Plot the frequency of the words covering 50% of all word instances
plot(blog.unigram.frequencies[1:loopIndx], log = "y", cex = .6, ylab = "Term frequency")
To keep the report to a reasonable length, I will not repeat this code for the news and Twitter unigrams.
As the text files contain offensive words and profanity, a dictionary will be used to remove these. A list of words is available at http://www.cs.cmu.edu/~biglou/resources/bad-words.txt. It has been manually edited to remove words that are not necessarily offensive.
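The list can be fetched as sketched below; the saved file is then edited by hand before use, and the file name offensivelist.txt matches the code that follows.
# Download the raw bad-words list; it is edited manually afterwards
download.file("http://www.cs.cmu.edu/~biglou/resources/bad-words.txt",
              destfile = "offensivelist.txt")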
# Create an offensive dictionary
offensive.unigrams <- readLines("offensivelist.txt")
offensive.dict <- dictionary(list(badwords=offensive.unigrams))
bad.words <- selectFeatures(blog.unigram.dfm, offensive.dict, selection="keep")
## kept 1,026 features, from 1177 supplied (glob) feature types
sorted.bad.words <- sort(features(bad.words))
Number of bad words: 1026
I find it interesting that there is a substantial amount of offensive language within the blog text. I suspect this will also be the case with the Twitter text.
Removing offensive language is a considerable challenge. It is easy enough to remove some given terms, but the range is immense, and many words are not offensive on their own. The real challenge lies in the context of the language used. Consideration must also be given to what counts as offensive; it may come down to individual, group, and/or organizational preference. Therefore, removal of profanity depends on the context of the language and the preferences of those using a word prediction tool, and it may be something that has to be learned through a flagging parameter.
A prediction algorithm will have a better success rate on texts that are alike. I think the blog, news, and Twitter texts are probably quite dissimilar, so a prediction algorithm will be trained on each type. The Shiny app will have a drop-down with the choices blog, news, and Twitter to select the type of text for which to predict the next word (see the sketch below).
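A minimal sketch of the planned drop-down, assuming a hypothetical predict_next_word() helper that dispatches to the model trained on the selected source:
library(shiny)

ui <- fluidPage(
  selectInput("source", "Text type", choices = c("blog", "news", "twitter")),
  textInput("phrase", "Type a phrase"),
  textOutput("prediction")
)

server <- function(input, output) {
  output$prediction <- renderText({
    # predict_next_word() is a hypothetical helper to be written for the final product
    predict_next_word(input$phrase, source = input$source)
  })
}

# shinyApp(ui, server)  # not run in this report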
I also believe that a higher success rate will be achieved by labelling the parts of speech (verb, noun, noun phrase, etc.) and using grammatical rules within the algorithm. As stated previously, the text has not been converted to lowercase because I believe case will be an important signal for these rules, and I may also have to retain punctuation.
I will also have to remove non-English words.
I have split the data into 60% training and 40% test sets. Although loading RData objects is reasonably fast, the final product may have to use a much smaller set of data to make it usable on a mobile device (see the sampling sketch below).
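A minimal sketch of sampling a subset of the training lines; the 10% fraction and the object names are illustrative only.
set.seed(2016)
# Keep roughly 10% of the blog training lines to reduce the model footprint
blog.sample.rows <- sample(nrow(blog.train), size = floor(0.1 * nrow(blog.train)))
blog.train.small <- blog.train[blog.sample.rows, , drop = FALSE]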
I will build the prediction algorithm on unigrams, bigrams, and trigrams; a rough sketch of the intended lookup follows.
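The sketch below shows how a simple backoff-style lookup might work; it is not the final algorithm. It assumes hypothetical named frequency vectors (e.g. blog.trigram.frequencies produced with topfeatures()) whose names are n-grams joined with "_", quanteda's default n-gram concatenator.
# Predict the next word from the two preceding words using trigram counts,
# backing off to bigram counts when no trigram match is found
predict.next.word <- function(w1, w2, trigram.freq, bigram.freq) {
  prefix <- paste(w1, w2, "", sep = "_")                        # "w1_w2_"
  hits <- trigram.freq[startsWith(names(trigram.freq), prefix)]
  if (length(hits) == 0) {
    prefix <- paste(w2, "", sep = "_")                          # back off to "w2_"
    hits <- bigram.freq[startsWith(names(bigram.freq), prefix)]
  }
  if (length(hits) == 0) return(NA_character_)
  best <- names(sort(hits, decreasing = TRUE))[1]
  # Return the final token of the most frequent matching n-gram
  tail(strsplit(best, "_", fixed = TRUE)[[1]], 1)
}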
Appendix A - Split data into training and test sets and save as text files
# Create data frames
blog.df <- data.frame(txt.blog, stringsAsFactors = FALSE)
news.df <- data.frame(txt.news, stringsAsFactors = FALSE)
twitter.df <- data.frame(txt.twitter, stringsAsFactors = FALSE)
library(caTools)
set.seed(2016)
blog.split <- sample.split(blog.df$txt.blog, SplitRatio=0.6)
blog.train <- blog.df[blog.split==TRUE,]
blog.test <- blog.df[blog.split==FALSE,]
news.split <- sample.split(news.df$txt.news, SplitRatio=0.6)
news.train <- news.df[news.split==TRUE,]
news.test <- news.df[news.split==FALSE,]
twitter.split <- sample.split(twitter.df$txt.twitter, SplitRatio=0.6)
twitter.train <- twitter.df[twitter.split==TRUE,]
twitter.test <- twitter.df[twitter.split==FALSE,]
files.to.write <- c("blog.train", "blog.test", "news.train", "news.test", "twitter.train", "twitter.test")
files.list <- list(blog.train, blog.test, news.train, news.test, twitter.train, twitter.test)
# Write each training/test character vector to its own text file
writeFiles <- function(myfile, i){
thefile <- paste(files.to.write[[i]], ".txt", sep="")
con <- file(thefile, "w")
writeLines(myfile, con)
close(con)
}
mapply(writeFiles, files.list, seq_along(files.to.write))
Appendix B - Training dataframes saved as RData objects
blogs.train <- file("blog.train.txt", "r")
news.train <- file("news.train.txt", "r")
twitter.train <- file("twitter.train.txt", "r")
txt.blog <- readLines(blogs.train, skipNul=TRUE)
txt.news <- readLines(news.train, skipNul=TRUE)
txt.twitter <- readLines(twitter.train, skipNul=TRUE)
close(blogs.train)
close(news.train)
close(twitter.train)
blog.train <- data.frame(txt.blog, stringsAsFactors = FALSE)
news.train <- data.frame(txt.news, stringsAsFactors = FALSE)
twitter.train <- data.frame(txt.twitter, stringsAsFactors = FALSE)
save(blog.train, file="blog.train.RData")
save(news.train, file="news.train.RData")
save(twitter.train, file="twitter.train.RData")
Appendix C - Unigrams, Bigrams, and Trigrams saved as RData objects
blog.trigram <- tokenize(blog.train[,1], what=c("word"),ngrams=3, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
blog.bigram <- tokenize(blog.train[,1], what=c("word"),ngrams=2, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
blog.unigram <- tokenize(blog.train[,1], what=c("word"),ngrams=1, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
news.trigram <- tokenize(news.train[,1], what=c("word"),ngrams=3, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
news.bigram <- tokenize(news.train[,1], what=c("word"),ngrams=2, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
news.unigram <- tokenize(news.train[,1], what=c("word"),ngrams=1, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
twitter.trigram <- tokenize(twitter.train[,1], what=c("word"),ngrams=3, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
twitter.bigram <- tokenize(twitter.train[,1], what=c("word"),ngrams=2, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
twitter.unigram <- tokenize(twitter.train[,1], what=c("word"),ngrams=1, removePunct=TRUE, removeSymbols = TRUE, removeSeparators=TRUE)
save(blog.trigram, file="blog.trigram.RData")
save(blog.bigram, file="blog.bigram.RData")
save(blog.unigram, file="blog.unigram.RData")
save(news.trigram, file="news.trigram.RData")
save(news.bigram, file="news.bigram.RData")
save(news.unigram, file="news.unigram.RData")
save(twitter.trigram, file="twitter.trigram.RData")
save(twitter.bigram, file="twitter.bigram.RData")
save(twitter.unigram, file="twitter.unigram.RData")
Appendix D - Quanteda Document Feature Matrix saved as RData objects.
blog.unigram.dfm <- dfm(blog.unigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
blog.bigram.dfm <- dfm(blog.bigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
blog.trigram.dfm <- dfm(blog.trigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
save(blog.trigram.dfm, file="blog.trigram.dfm.RData")
save(blog.bigram.dfm, file="blog.bigram.dfm.RData")
save(blog.unigram.dfm, file="blog.unigram.dfm.RData")
news.unigram.dfm <- dfm(news.unigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
news.bigram.dfm <- dfm(news.bigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
news.trigram.dfm <- dfm(news.trigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
save(news.unigram.dfm, file="news.unigram.dfm.RData")
save(news.bigram.dfm, file="news.bigram.dfm.RData")
save(news.trigram.dfm, file="news.trigram.dfm.RData")
twitter.unigram.dfm <- dfm(twitter.unigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
twitter.bigram.dfm <- dfm(twitter.bigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
twitter.trigram.dfm <- dfm(twitter.trigram, verbose = TRUE, toLower = FALSE,stem = FALSE, ignoredFeatures = NULL, keptFeatures = NULL,language = "english", thesaurus = NULL, dictionary = NULL)
save(twitter.unigram.dfm, file="twitter.unigram.dfm.RData")
save(twitter.bigram.dfm, file="twitter.bigram.dfm.RData")
save(twitter.trigram.dfm, file="twitter.trigram.dfm.RData")