Introduction

The goal of this report is to set the baseline for a text prediction Shiny app based on natural language processing. The report below explores the corpus, breaks it down by source, runs some simple analytics on the data and, most importantly, cleans the data.

Loading the data

The data was pulled from blogs, news sites and Twitter, and is provided by SwiftKey at the following URL: https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip

Once unzipped, the corpus has 4 folders, one per locale (US, Russia, Finland and Germany), and each folder is made up of 3 text files: blogs, news and twitter. A cursory analysis will be done on the files from the US locale. It is assumed that the archive was unzipped into the working directory, so that the US files are found under ./final/en_US/.

# Read one corpus file into a character vector, one element per line.
#
# The file is opened in binary mode ("rb") rather than handing the path
# straight to readLines(): a text-mode read can stop at an embedded control
# character (on Windows the SUB / Ctrl-Z byte is treated as end-of-file),
# which silently drops the rest of the file.
# NOTE(review): the 77259-line count reported for the news file looks like
# such a truncation -- re-knit and confirm the line counts.
#
# path: location of the text file, relative to the working directory.
# Returns the lines marked as UTF-8; embedded NUL bytes are skipped.
readCorpusFile <- function(path) {
  con <- file(path, open = "rb")
  # Always release the connection, even if readLines() fails.
  on.exit(close(con), add = TRUE)
  readLines(con = con, encoding = "UTF-8", skipNul = TRUE)
}

blogFile <- readCorpusFile("./final/en_US/en_US.blogs.txt")
newsFile <- readCorpusFile("./final/en_US/en_US.news.txt")
twitterFile <- readCorpusFile("./final/en_US/en_US.twitter.txt")

Raw File Statistics:

library(stringi)
# Per-line word counts for each source (stringi::stri_count_words).
blogFileWordCount <- stri_count_words(blogFile)
newsFileWordCount <- stri_count_words(newsFile)
twitterFileWordCount <- stri_count_words(twitterFile)

# Total number of words in each source.
blogWord <- sum(blogFileWordCount)
newsWord <- sum(newsFileWordCount)
twitterWord <- sum(twitterFileWordCount)

# Number of lines in each source; reused later to size the random samples.
blogLines <- length(blogFile)
newsLines <- length(newsFile)
twitterLines <- length(twitterFile)

# One row per source with its word and line totals.
# Note: this variable shares its name with base::summary(); function calls
# such as summary(x) still resolve to the function.
summary <- data.frame(
  fileName = c("blog", "news", "twitter"),
  wordCount = c(blogWord, newsWord, twitterWord),
  lineCount = c(blogLines, newsLines, twitterLines)
)
summary
##   fileName wordCount lineCount
## 1     blog  37546246    899288
## 2     news   2674536     77259
## 3  twitter  30093410   2360148

Cleaning and Sampling Data

A file containing an extensive list of profane words was found at this URL https://www.cs.cmu.edu/~biglou/resources/ and added to the workspace as ./final/bad-words.txt. A random 1% sample of the lines of each file is drawn for analysis, and the sampled corpus is cleaned using the tm library, regular expressions, and the profanity list.

library(tm)
# Profanity list, one term per line. It is read into a character vector
# because removeWords() needs the words themselves, not a file connection.
# Blank entries are dropped so they cannot end up as empty alternatives in
# the pattern removeWords() builds.
# NOTE(review): assumes the entries are plain words without regex
# metacharacters -- confirm against the downloaded list.
profanityFile <- trimws(readLines("./final/bad-words.txt", warn = FALSE))
profanityFile <- tolower(profanityFile[nzchar(profanityFile)])

# Draw a reproducible 1% random sample of the lines of each source.
set.seed(365)
bP <- blogLines * 0.01
nP <- newsLines * 0.01
tP <- twitterLines * 0.01
blogSubset <- sample(blogFile, bP)
newsSubset <- sample(newsFile, nP)
twitterSubset <- sample(twitterFile, tP)
subsetAll <- c(blogSubset, newsSubset, twitterSubset)

# iconv() yields NA for any line it cannot convert to ASCII; those lines
# are dropped entirely rather than partially converted.
subsetAllF <- iconv(subsetAll, "UTF-8", "ASCII")
subsetAllT <- subsetAllF[!is.na(subsetAllF)]

corp <- VCorpus(VectorSource(subsetAllT))

# Transformer that replaces every match of the pattern `to` with a space.
change <- content_transformer(function(x, to) gsub(to, " ", x))

# Order matters below:
# 1. URLs and @handles are removed first, while the "://" and "@" markers
#    are still present. Each pattern stops at the next whitespace so the
#    text following a URL or handle is kept.
corp <- tm_map(corp, change, "(f|ht)tps?://[^[:space:]]+")
corp <- tm_map(corp, change, "@[^[:space:]]+")
# 2. Lower-case before stop-word removal (the stop-word list is lower case),
#    and remove stop words before punctuation so that entries containing an
#    apostrophe (e.g. "don't") still match.
corp <- tm_map(corp, content_transformer(tolower))
corp <- tm_map(corp, removeWords, stopwords("english"))
corp <- tm_map(corp, removePunctuation)
corp <- tm_map(corp, removeNumbers)
corp <- tm_map(corp, removeWords, profanityFile)
# 3. Collapse whitespace last: every removal above can leave gaps behind.
corp <- tm_map(corp, stripWhitespace)

Analyzing Sample Data

Using n-gram models and tokenizing the corpus, the top 1-, 2- and 3-grams are calculated and plotted for the sample dataset.

library(RWeka)
library(ggplot2)
# Tokenizers that split a document into 2-word and 3-word sequences.
biGram <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGram <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))

# Term-document matrices for 1-, 2- and 3-grams. removeSparseTerms(., 0.999)
# keeps only the terms whose sparsity does not exceed 99.9%.
tdUL <- TermDocumentMatrix(corp)
tdU <- removeSparseTerms(tdUL, 0.999)
controlBi <- list(tokenize = biGram)
tdBiL <- TermDocumentMatrix(corp, control = controlBi)
tdBi <- removeSparseTerms(tdBiL, 0.999)
controlTri <- list(tokenize = triGram)
tdTriL <- TermDocumentMatrix(corp, control = controlTri)
tdTri <- removeSparseTerms(tdTriL, 0.999)

# Total count of each term across all documents, most frequent first.
# tdm: a term-document matrix. Returns a named numeric vector.
# as.matrix() builds a dense matrix, so call this only on matrices that
# have already been reduced with removeSparseTerms().
termFrequency <- function(tdm) {
  sort(rowSums(as.matrix(tdm)), decreasing = TRUE)
}

# Frequency tables and their top-10 extracts. head() is used instead of
# [1:10, ] so that a table with fewer than 10 surviving terms is returned
# as-is rather than padded with NA rows.
fUni <- termFrequency(tdU)
fUniF <- data.frame(word = names(fUni), freq = fUni)
fUni10 <- head(fUniF, 10)
fBi <- termFrequency(tdBi)
fBiF <- data.frame(word = names(fBi), freq = fBi)
fBi10 <- head(fBiF, 10)
fTri <- termFrequency(tdTri)
fTriF <- data.frame(word = names(fTri), freq = fTri)
fTri10 <- head(fTriF, 10)
# Horizontal red bar chart of term frequencies; terms are ordered by
# frequency so the most frequent one ends up at the top after the flip.
# terms: data frame with columns `word` and `freq`.
# axisLabel: label for the term axis.
# plotTitle: chart title.
plotTopTerms <- function(terms, axisLabel, plotTitle) {
  ggplot(terms, aes(x = reorder(word, freq), y = freq)) +
    geom_bar(stat = "identity", fill = "red") +
    coord_flip() +
    labs(y = "Frequency", x = axisLabel, title = plotTitle)
}

plotTopTerms(fUni10, "1-Gram", "Most Common 1-Grams in Sample")

plotTopTerms(fBi10, "2-Gram", "Most Common 2-Grams in Sample")

plotTopTerms(fTri10, "3-Gram", "Most Common 3-Grams in sample")

Plans for Application Development

A store of common N-grams can be leveraged to drive the text prediction algorithm within the Shiny app, so that it accurately predicts the next word a user is going to type based upon their locale. Depending on how long a given input is, this model can be scaled out (hence the n) to allow prediction of common sentences, but it can also increase accuracy by looking simply at the next word (a 2-gram based prediction) to drive up the accuracy of the predictions.