Executive Summary

This report documents the steps of the algorithm used to build a predictive text model. Three files, containing blog posts, news articles, and Twitter feeds respectively, are downloaded and analysed.
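
For reference, the raw files can be obtained roughly as follows (a sketch: the URL and archive layout are assumed to be the standard Coursera-SwiftKey distribution and are not taken from the analysis itself).

# Sketch: download and unpack the English corpus files (URL and paths assumed).
if (!file.exists("en_US.blogs.txt")) {
    url <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
    download.file(url, destfile = "Coursera-SwiftKey.zip", mode = "wb")
    unzip("Coursera-SwiftKey.zip", junkpaths = TRUE,
          files = c("final/en_US/en_US.blogs.txt",
                    "final/en_US/en_US.news.txt",
                    "final/en_US/en_US.twitter.txt"))
}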

Basic Statistics

Basic data tables and row counts

USB <- read.table('en_US.blogs.txt', sep='\t', quote='', stringsAsFactors=FALSE)
USN <- read.table('en_US.news.txt', sep='\t', quote='', stringsAsFactors=FALSE)
UST <- read.table('en_US.twitter.txt', sep='\t', quote='', stringsAsFactors=FALSE)
## Warning in scan(file = file, what = what, sep = sep, quote = quote, dec =
## dec, : embedded nul(s) found in input
class(USB); class(USN); class(UST)
## [1] "data.frame"
## [1] "data.frame"
## [1] "data.frame"
dim(USB); dim(USN); dim(UST)
## [1] 898384      1
## [1] 77258     1
## [1] 2302307       1

Word counts

The totals below are rough approximations, obtained by counting word separators on each line and adding one word per line.

sum(sapply(gregexpr("[A-z]\\W+", USB[1:898384,]), length) + 1L)
## [1] 38022025
sum(sapply(gregexpr("[A-z]\\W+", USN[1:77258,]), length) + 1L)
## [1] 2724759
sum(sapply(gregexpr("[A-z]\\W+", UST[1:2302307,]), length) + 1L)
## [1] 30193276
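
These totals are sensitive to how lines and separators are parsed (read.table drops blank lines, for example). As a rough cross-check, words can also be counted by splitting each raw line on whitespace (a sketch; the counts will differ slightly from the regex-based figures above).

# Sketch: approximate word count by splitting raw lines on whitespace.
count_words <- function(path) {
    lines <- readLines(path, encoding = "UTF-8", skipNul = TRUE)
    sum(vapply(strsplit(lines, "\\s+"), length, integer(1)))
}
# count_words("en_US.blogs.txt"); count_words("en_US.news.txt"); count_words("en_US.twitter.txt")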

Sampling

To balance accuracy against scalability, a 0.1% sample is taken from each of the three data sets. The three samples are then combined into a single corpus, from which the most frequent unigrams (single words), bigrams (word pairs) and trigrams (word triples) are extracted.

library(dplyr); set.seed(3277)
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
SUSB <- sample_n(USB, 898)
SUSN <- sample_n(USN, 77)
SUST <- sample_n(UST, 2302)
SUSC <- rbind(SUSB, SUSN, SUST)
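
The hard-coded sample sizes above are simply 0.1% of the row counts reported earlier; they could be derived directly (a sketch using the same data frames):

# Sketch: derive the 0.1% sample sizes from the row counts instead of hard-coding them.
frac <- 0.001
round(nrow(USB) * frac)  # 898
round(nrow(USN) * frac)  # 77
round(nrow(UST) * frac)  # 2302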

Removing profanity from the data

BW <- read.csv("profanity.csv", header = FALSE, stringsAsFactors = FALSE) # one profane term per row, in column V1
library(tm); library(SnowballC)
## Loading required package: NLP
SUSC.corpus <- Corpus(VectorSource(SUSC$V1)) # create a VCorpus with one document per sampled line
SUSC.corpus <- tm_map(SUSC.corpus, removeWords, BW$V1) # drop any term on the profanity list
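
Note that removeWords matches terms case-sensitively and the corpus is only lower-cased in the next section, so capitalised variants of a listed word would survive this step. One option is to also remove title-case forms (a sketch; tools::toTitleCase is just one way to build those variants):

# Sketch: also strip title-case variants of each profane term.
SUSC.corpus <- tm_map(SUSC.corpus, removeWords, tools::toTitleCase(BW$V1))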

Cleaning and stemming the data

SUSC.corpus <- tm_map(SUSC.corpus, content_transformer(tolower)) # lower-case first so stop words match
SUSC.corpus <- tm_map(SUSC.corpus, removeWords, stopwords("english")) # drop common English stop words
SUSC.corpus <- tm_map(SUSC.corpus, removePunctuation)
SUSC.corpus <- tm_map(SUSC.corpus, removeNumbers)
SUSC.corpus <- tm_map(SUSC.corpus, stripWhitespace) # collapse whitespace left behind by the removals
SUSC.corpus <- tm_map(SUSC.corpus, PlainTextDocument) # ensure documents are PlainTextDocument before stemming
SUSC.corpus <- tm_map(SUSC.corpus, stemDocument) # reduce words to their stems
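
As a quick sanity check, the same transformations can be applied to a single made-up sentence (a minimal sketch, assuming tm's character methods for these functions):

# Sketch: the cleaning steps applied to one made-up sentence.
x <- "The 3 Quick Brown Foxes were jumping!"
x <- tolower(x)
x <- removeWords(x, stopwords("english"))   # drops "the", "were", ...
x <- removeNumbers(removePunctuation(x))
stemDocument(stripWhitespace(x))            # roughly "quick brown fox jump"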

Tokenization

if(Sys.getenv("JAVA_HOME") != ""){
      Sys.setenv(JAVA_HOME = "") # clear JAVA_HOME so rJava does not pick up an incompatible JDK
}
library(rJava); library(RWeka)
options(mc.cores = 1) # run tm on a single core; rJava-based tokenizers fail in forked parallel workers
uni_token <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1)) # single words
bi_token  <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2)) # word pairs
tri_token <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3)) # word triples
unigram <- TermDocumentMatrix(SUSC.corpus, control = list(tokenize = uni_token))
bigram  <- TermDocumentMatrix(SUSC.corpus, control = list(tokenize = bi_token))
trigram <- TermDocumentMatrix(SUSC.corpus, control = list(tokenize = tri_token))
removeSparseTerms(unigram, 0.1)
## <<TermDocumentMatrix (terms: 0, documents: 3277)>>
## Non-/sparse entries: 0/0
## Sparsity           : 100%
## Maximal term length: 0
## Weighting          : term frequency (tf)
removeSparseTerms(bigram, 0.1)
## <<TermDocumentMatrix (terms: 0, documents: 3277)>>
## Non-/sparse entries: 0/0
## Sparsity           : 100%
## Maximal term length: 0
## Weighting          : term frequency (tf)
removeSparseTerms(trigram, 0.1)
## <<TermDocumentMatrix (terms: 0, documents: 3277)>>
## Non-/sparse entries: 0/0
## Sparsity           : 100%
## Maximal term length: 0
## Weighting          : term frequency (tf)
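
With a sparsity threshold of 0.1, only terms that appear in at least ~90% of the 3,277 sampled documents are kept, and no n-gram is that common, so every term is dropped (as the output above shows); the frequency tables below are therefore built from the unpruned matrices. Pruning only the rarest terms requires a much gentler threshold (a sketch; 0.999 is an arbitrary illustrative value):

# Sketch: keep only terms appearing in more than ~0.1% of the documents.
unigram.pruned <- removeSparseTerms(unigram, 0.999)
unigram.pruned  # prints the reduced TermDocumentMatrix summary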

Creating a tokenized database

## Finding the totals for each term
freq.uni <- rowSums(as.matrix(unigram))
freq.bi  <- rowSums(as.matrix(bigram))
freq.tri <- rowSums(as.matrix(trigram))
## Sorting the totals
freq.uni <- sort(freq.uni, decreasing = TRUE)
freq.bi  <- sort(freq.bi, decreasing = TRUE)
freq.tri <- sort(freq.tri, decreasing = TRUE)
## Create the top n data frames from the matrices
df40.freq.uni <- data.frame("Term"=names(head(freq.uni,40)), "Frequency"=head(freq.uni,40))
df20.freq.bi  <- data.frame("Term"=names(head(freq.bi,20)), "Frequency"=head(freq.bi,20))
df10.freq.tri <- data.frame("Term"=names(head(freq.tri,10)), "Frequency"=head(freq.tri,10))
## Reorder levels for better plotting
df40.freq.uni$uniTerm <- reorder(df40.freq.uni$Term, df40.freq.uni$Frequency)
df20.freq.bi$biTerm  <- reorder(df20.freq.bi$Term, df20.freq.bi$Frequency)
df10.freq.tri$triTerm <- reorder(df10.freq.tri$Term, df10.freq.tri$Frequency)
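
Before plotting, the tables can be inspected directly as a quick check (a small usage sketch):

# Sketch: peek at the most frequent terms of each n-gram table.
head(df40.freq.uni, 5); head(df20.freq.bi, 5); head(df10.freq.tri, 5)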

Plotting

library(ggplot2)
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:NLP':
## 
##     annotate
g1 <- ggplot(df40.freq.uni, aes(x = uniTerm, y = Frequency)) +
    geom_bar(stat = "identity", fill="green") +
    xlab("Terms") + ylab("Count") + ggtitle("Top 40 UniGram Tokenized Word Frequency") +
    coord_flip()
print(g1)

g2 <- ggplot(df20.freq.bi, aes(x = biTerm, y = Frequency)) +
    geom_bar(stat = "identity", fill="green") +
    xlab("Terms") + ylab("Count") + ggtitle("Top 20 BiGram Tokenized Word Frequency") +
    coord_flip()
print(g2)

g3 <- ggplot(df10.freq.tri, aes(x = triTerm, y = Frequency)) +
    geom_bar(stat = "identity", fill="green") +
    xlab("Terms") + ylab("Count") + ggtitle("Top 10 TriGram Tokenized Word Frequency") +
    coord_flip()
print(g3)

Next Steps