The goal of this report is to set the baseline for a Shiny app that predicts text using natural language processing. The report below explores the corpus and breaks it down by language, computes some simple summary statistics on the data and, most importantly, cleans the data.
The data was collected from blogs, news sites and Twitter and provided by SwiftKey. It can be downloaded from the following URL: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip
Once unzipped, the corpus has 4 folders, one per locale (US, Russia, Finland and Germany), and each folder is made up of 3 text files: blogs, news and twitter. A cursory analysis will be done on the files from the US locale. It is assumed that the archive was unzipped into the working directory, so that the US files are found under ./final/en_US/.
# Read one corpus file as a character vector with one element per line.
#
# The connection is opened in binary mode ("rb") on purpose: when a path is
# handed straight to readLines() the file is read in text mode, and on Windows
# a text-mode read stops at the first embedded Ctrl-Z (0x1A) byte, silently
# truncating the file. readLines() accepts LF, CRLF and CR line endings
# whatever mode the connection is opened in, so binary mode loses nothing.
#
# path: location of the text file, relative to the working directory.
# Returns the lines of the file, marked as UTF-8, with embedded NULs skipped.
readCorpusFile <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}

blogFile <- readCorpusFile("./final/en_US/en_US.blogs.txt")
newsFile <- readCorpusFile("./final/en_US/en_US.news.txt")
twitterFile <- readCorpusFile("./final/en_US/en_US.twitter.txt")
Raw File Statistics:
library(stringi)

# Word counts for every line of each raw file.
blogFileWordCount <- stri_count_words(blogFile)
newsFileWordCount <- stri_count_words(newsFile)
twitterFileWordCount <- stri_count_words(twitterFile)

# Total number of words per file.
blogWord <- sum(blogFileWordCount)
newsWord <- sum(newsFileWordCount)
twitterWord <- sum(twitterFileWordCount)

# Number of lines per file.
blogLines <- length(blogFile)
newsLines <- length(newsFile)
twitterLines <- length(twitterFile)

# One row per source file: its label, total word count and line count.
summary <- data.frame(
  fileName = c("blog", "news", "twitter"),
  wordCount = c(blogWord, newsWord, twitterWord),
  lineCount = c(blogLines, newsLines, twitterLines)
)
summary
## fileName wordCount lineCount
## 1 blog 37546246 899288
## 2 news 2674536 77259
## 3 twitter 30093410 2360148
A file containing an extensive list of profane words was found at https://www.cs.cmu.edu/~biglou/resources/ and added to the workspace as ./final/bad-words.txt. The corpus is cleaned using the tm library, regular expressions and the profanity file. To keep the analysis tractable, the cleaning is applied to a 1% random sample of each file.
library(tm)

# Load the profanity list as a character vector. removeWords() needs the
# words themselves; a bare file() connection is neither read nor closed.
# Entries are trimmed and lower-cased to match the lower-cased corpus, and
# blank lines are dropped so they do not become empty regex alternatives.
profanityFile <- readLines("./final/bad-words.txt", warn = FALSE)
profanityFile <- tolower(trimws(profanityFile))
profanityFile <- unique(profanityFile[nzchar(profanityFile)])

# Draw a reproducible random sample of each source file.
set.seed(365)
sampleFraction <- 0.01
bP <- blogLines * sampleFraction
nP <- newsLines * sampleFraction
tP <- twitterLines * sampleFraction
blogSubset <- sample(blogFile, bP)
newsSubset <- sample(newsFile, nP)
twitterSubset <- sample(twitterFile, tP)
subsetAll <- c(blogSubset, newsSubset, twitterSubset)

# iconv() returns NA for any line that cannot be represented in ASCII;
# those lines are dropped from the sample.
subsetAllF <- iconv(subsetAll, "UTF-8", "ASCII")
subsetAllT <- subsetAllF[!is.na(subsetAllF)]

corp <- VCorpus(VectorSource(subsetAllT))

# Transformer that replaces every match of a regular expression with a space.
change <- content_transformer(function(x, pattern) gsub(pattern, " ", x))

# Order matters:
#  1. lower-case first, so the lower-case stop word and profanity lists match;
#  2. strip URLs and @handles while their punctuation is still present;
#  3. remove stop words before punctuation, so contractions such as "don't"
#     still match their stop word entries;
#  4. remove punctuation and numbers, then collapse the whitespace left over.
corp <- tm_map(corp, content_transformer(tolower))
# Match only up to the next whitespace so text after a URL is preserved.
corp <- tm_map(corp, change, "(f|ht)tps?://[^[:space:]]+")
corp <- tm_map(corp, change, "@[^[:space:]]+")
corp <- tm_map(corp, removeWords, stopwords("english"))
corp <- tm_map(corp, removeWords, profanityFile)
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, stripWhitespace)
After tokenizing the sampled corpus into n-grams, the most frequent 1-, 2- and 3-grams are calculated and plotted for the sample dataset.
library(RWeka)
library(ggplot2)

# Tokenizers that split a document into 2-word and 3-word sequences.
biGram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Total occurrences of each term across all documents, most frequent first.
#
# tdm: a term-document matrix (terms in rows, documents in columns).
# Returns a named numeric vector sorted in decreasing order.
termFrequencies <- function(tdm) {
  sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
}

# Turn a named frequency vector into a data frame with columns word and freq.
# as.character() keeps the word column present when no terms are left at all,
# in which case names(freq) may be NULL.
frequencyTable <- function(freq) {
  data.frame(word = as.character(names(freq)), freq = freq)
}

# Term-document matrices for 1-, 2- and 3-grams. removeSparseTerms() keeps
# only the terms whose sparsity is below 0.999.
tdUL <- TermDocumentMatrix(corp)
tdU <- removeSparseTerms(tdUL, 0.999)
controlBi <- list(tokenize = biGram)
tdBiL <- TermDocumentMatrix(corp, control = controlBi)
tdBi <- removeSparseTerms(tdBiL, 0.999)
controlTri <- list(tokenize = triGram)
tdTriL <- TermDocumentMatrix(corp, control = controlTri)
tdTri <- removeSparseTerms(tdTriL, 0.999)

# Frequency tables and their ten most common entries. head() is used instead
# of [1:10, ] so that a table with fewer than ten rows is not padded with NA
# rows, which would otherwise end up in the plots.
fUni <- termFrequencies(tdU)
fUniF <- frequencyTable(fUni)
fUni10 <- head(fUniF, 10)
fBi <- termFrequencies(tdBi)
fBiF <- frequencyTable(fBi)
fBi10 <- head(fBiF, 10)
fTri <- termFrequencies(tdTri)
fTriF <- frequencyTable(fTri)
fTri10 <- head(fTriF, 10)
# Horizontal red bar chart of n-gram frequencies, bars ordered by frequency.
#
# topTerms:  data frame with columns word and freq.
# axisLabel: label for the n-gram axis.
# plotTitle: title shown above the chart.
# Returns the ggplot object; calling this at top level displays the chart.
plotTopNgrams <- function(topTerms, axisLabel, plotTitle) {
  ggplot(topTerms, aes(x = reorder(word, freq), y = freq)) +
    geom_bar(stat = "identity", fill = "red") +
    coord_flip() +
    labs(y = "Frequency", x = axisLabel, title = plotTitle)
}

plotTopNgrams(fUni10, "1-Gram", "Most Common 1-Grams in Sample")
plotTopNgrams(fBi10, "2-Gram", "Most Common 2-Grams in Sample")
plotTopNgrams(fTri10, "3-Gram", "Most Common 3-Grams in sample")
A store of common n-grams can be leveraged to drive the text prediction algorithm within the Shiny app, so that it accurately predicts the next word a user is going to type based upon their locale. Depending on how long a given input is, this model can be scaled out (hence the n) to allow prediction of common sentences, and it can also fall back to looking at just the previous word (a 2-gram based prediction) to drive up the accuracy of the predictions.