The goal of this project is to demonstrate the skills acquired across the courses in the Data Science specialization. It also provides an opportunity for peer feedback to confirm that the project is on track toward building a prediction algorithm.
First, the necessary libraries are loaded.
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.1
library(stringr)
library(stringi)
library(XML)
## Warning: package 'XML' was built under R version 3.5.1
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(openNLP)
suppressMessages(library(RWeka))
## Warning: package 'RWeka' was built under R version 3.5.1
suppressMessages(library(qdap))
## Warning: package 'qdap' was built under R version 3.5.1
## Warning: package 'qdapRegex' was built under R version 3.5.1
## Warning: package 'qdapTools' was built under R version 3.5.1
library(pander)
## Warning: package 'pander' was built under R version 3.5.1
library(SnowballC)
library(tmap)
setwd("C:/")
The source data for this project is then loaded. The data used here can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
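If the files have not already been downloaded, something along the following lines can fetch and unpack them first (a minimal sketch; the destination file name and the check on the "final" directory are assumptions, as the archive normally unpacks into a final/en_US folder).
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"   # assumed local file name
if (!file.exists(zip_file)) {
  download.file(zip_url, destfile = zip_file, mode = "wb")  # binary mode for the zip
}
if (!dir.exists("final")) {
  unzip(zip_file)  # expected to unpack into final/en_US, final/de_DE, etc.
}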
blog <- readLines("/en_US/en_US.blogs.txt", encoding= "UTF-8", skipNul = T)
twitter <- readLines("/en_US/en_US.twitter.txt", encoding= "UTF-8", skipNul = T)
news <- readLines("/en_US/en_US.news.txt", encoding= "UTF-8", skipNul = T)
## Warning in readLines("/en_US/en_US.news.txt", encoding = "UTF-8", skipNul =
## T): incomplete final line found on '/en_US/en_US.news.txt'
The datasets are very large, so only a sample of 750 lines from each source is used for this initial review of the data.
set.seed(123)
sample_size = 750
blog_samp <- blog[sample(1:length(blog), sample_size)]
twitter_samp <- twitter[sample(1:length(twitter), sample_size)]
news_samp <- news[sample(1:length(news), sample_size)]
invisible(write.table(blog_samp, file="blog_samp.txt", quote=F))
invisible(write.table(twitter_samp, file="twitter_samp.txt", quote=F))
invisible(write.table(news_samp, file="news_samp.txt", quote=F))
cname <- file.path("C:", "Texts")
cname
## [1] "C:/Texts"
dir(cname)
## [1] "blog_samp.txt" "DocumentTermMatrix.csv"
## [3] "news_samp.txt" "twitter_samp.txt"
## [5] "wordcloud_packages.png"
The size of the data sets being evaluated is important, so word and line counts are calculated.
BlogWords <- stri_count_words(blog_samp)
summary(BlogWords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 28.00 40.08 57.00 274.00
TwitterWords <- stri_count_words(twitter_samp)
summary(TwitterWords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 7.00 12.00 12.43 17.00 31.00
NewsWords <- stri_count_words(news_samp)
summary(NewsWords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.86 19.00 31.00
stri_stats_general(blog_samp)
## Lines LinesNEmpty Chars CharsNWhite
## 750 750 164694 135528
stri_stats_general(twitter_samp)
## Lines LinesNEmpty Chars CharsNWhite
## 750 750 50466 41801
stri_stats_general(news_samp)
## Lines LinesNEmpty Chars CharsNWhite
## 750 750 51948 42978
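The summaries above describe only the 750-line samples. For context, a quick sketch like the one below (using the full vectors read in earlier) reports the line count and in-memory size of each complete corpus; the exact figures depend on the download, so none are quoted here.
# Sketch: line counts and approximate in-memory size (MB) of the full corpora.
full_stats <- data.frame(
  source     = c("blogs", "twitter", "news"),
  line_count = c(length(blog), length(twitter), length(news)),
  size_mb    = round(c(as.numeric(object.size(blog)),
                       as.numeric(object.size(twitter)),
                       as.numeric(object.size(news))) / 1024^2, 1))
full_stats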
A corpus is a collection of written material used for analysis. In text mining, a corpus is created to facilitate statistical analysis, hypothesis testing, and the counting of word occurrences.
library(tm)
docs <- VCorpus(DirSource(cname))
summary(docs)
## Length Class Mode
## blog_samp.txt 2 PlainTextDocument list
## DocumentTermMatrix.csv 2 PlainTextDocument list
## news_samp.txt 2 PlainTextDocument list
## twitter_samp.txt 2 PlainTextDocument list
## wordcloud_packages.png 2 PlainTextDocument list
inspect(docs[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 167629
inspect(docs[2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 688081
inspect(docs[3])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 55105
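Note that DirSource() picks up every file in C:/Texts, which is why the CSV and PNG files appear as documents in the summary above. One possible refinement, sketched below but not used for the results in this report, is to build the corpus directly from the in-memory samples with VectorSource() so that only the three text samples are included.
# Sketch of an alternative corpus built from the samples already in memory,
# one document per source (not used for the results that follow).
docs_alt <- VCorpus(VectorSource(c(blogs   = paste(blog_samp,    collapse = " "),
                                   news    = paste(news_samp,    collapse = " "),
                                   twitter = paste(twitter_samp, collapse = " "))))
summary(docs_alt)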
The data is cleaned to remove unnecessary characters, words and whitespace. Cleaning the data in this manner allows the most commonly occurring words and word associations to be identified.
docs <- tm_map(docs, removePunctuation)                  # remove punctuation
docs <- tm_map(docs, removeNumbers)                      # remove digits
docs <- tm_map(docs, tolower)                            # convert to lower case
docs <- tm_map(docs, PlainTextDocument)                  # restore the PlainTextDocument class
DocsCopy <- docs                                         # keep a copy before stopword removal
docs <- tm_map(docs, removeWords, stopwords("english"))  # drop common English stopwords
docs <- tm_map(docs, PlainTextDocument)
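Two further cleaning steps that are commonly applied to this kind of text, and could be layered on here, are removing URLs and Twitter handles and collapsing extra whitespace. The sketch below writes to a separate docs_extra object so the results reported in this document are unaffected; the regular expressions are only approximate.
# Optional extra cleaning (a sketch; the regex patterns are approximate).
remove_pattern <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs_extra <- tm_map(docs, remove_pattern, "http[[:alnum:][:punct:]]*")  # drop URLs
docs_extra <- tm_map(docs_extra, remove_pattern, "@\\w+")                # drop Twitter handles
docs_extra <- tm_map(docs_extra, stripWhitespace)                        # collapse repeated spaces
Next, document-term and term-document matrices are built from the cleaned corpus.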
dtm <- DocumentTermMatrix(docs)
dtm
## <<DocumentTermMatrix (documents: 5, terms: 8650)>>
## Non-/sparse entries: 11431/31819
## Sparsity : 74%
## Maximal term length: 592558
## Weighting : term frequency (tf)
tdm <- TermDocumentMatrix(docs)
tdm
## <<TermDocumentMatrix (terms: 8650, documents: 5)>>
## Non-/sparse entries: 11431/31819
## Sparsity : 74%
## Maximal term length: 592558
## Weighting : term frequency (tf)
The cleaned data is reviewed, with a focus on the most frequently occurring words.
freq <- colSums(as.matrix(dtm))
length(freq)
## [1] 8650
dtms <- removeSparseTerms(dtm, 0.2)
dtms
## <<DocumentTermMatrix (documents: 5, terms: 0)>>
## Non-/sparse entries: 0/0
## Sparsity : 100%
## Maximal term length: 0
## Weighting : term frequency (tf)
freq <- colSums(as.matrix(dtm))
head(table(freq))
## freq
## 1 2 3 4 5 6
## 5455 1315 512 331 205 160
tail(table(freq))
## freq
## 125 141 146 155 167 199
## 1 1 1 1 1 1
freq <- colSums(as.matrix(dtms))
freq
## numeric(0)
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq,20)
## just will one like can get time know good now
## 199 167 155 146 141 125 122 114 106 106
## see love people day dont new think back want well
## 105 101 94 86 86 83 82 75 74 74
findFreqTerms(dtm, lowfreq=50)
## [1] "also" "back" "can" "day" "dont" "even" "first"
## [8] "get" "going" "good" "got" "great" "just" "know"
## [15] "life" "like" "love" "make" "much" "never" "new"
## [22] "now" "one" "people" "really" "right" "see" "thanks"
## [29] "think" "time" "today" "want" "way" "well" "will"
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
## word freq
## just just 199
## will will 167
## one one 155
## like like 146
## can can 141
## get get 125
Create a bar chart of the words that occur at least 50 times.
library(ggplot2)
p <- ggplot(subset(wf, freq>50), aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") +
theme(axis.text.x=element_text(angle=45, hjust=1))
p
Remove sparse terms and then review the frequencies again. Dropping terms that appear in too few documents helps the model generalize and can help prevent overfitting.
dtms <- removeSparseTerms(dtm, 0.1)
head(table(freq), 20)
## freq
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 5455 1315 512 331 205 160 114 72 55 49 38 36 30 34 19
## 16 17 18 19 20
## 21 20 16 7 6
tail(table(freq), 20)
## freq
## 68 69 72 74 75 82 83 86 94 101 105 106 114 122 125 141 146 155
## 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1
## 167 199
## 1 1
freq <- colSums(as.matrix(dtms))
freq
## numeric(0)
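With only five documents in the corpus, a sparsity cut-off of 0.1 removes every term, which is why the result above is numeric(0). A more permissive cut-off, sketched below, keeps terms that appear in at least two of the five documents; the exact value is a judgment call.
# Sketch with a looser sparsity cut-off (0.6): a retained term may be absent
# from at most 60% of the documents, i.e. it must appear in at least two of five.
dtms_loose <- removeSparseTerms(dtm, 0.6)
dtms_loose
head(sort(colSums(as.matrix(dtms_loose)), decreasing = TRUE), 10)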
Word clouds are a useful way to illustrate the frequency or importance of each word, indicated by the size at which the word is drawn.
dtms <- removeSparseTerms(dtm, 0.15) # Prepare the data (max 15% empty space)
freq <- colSums(as.matrix(dtm)) # Find word frequencies
BuPu <- brewer.pal(6, "BuPu")
wordcloud(names(freq), freq, max.words=75, rot.per=0.2, colors=BuPu)
The final step for this first pass is to create n-grams: sequences of words that occur together within the data set. Bigrams are pairs of adjacent words, trigrams are sequences of three, and quadgrams are sequences of four. To keep this first pass succinct, only bigrams and quadgrams are included in this version.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
grams_2 <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer))
grams_4 <- TermDocumentMatrix(docs, control = list(tokenize = QuadgramTokenizer))
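A trigram tokenizer would follow the same pattern; it is sketched here for reference but left out of the results to keep this pass short.
# Trigram tokenizer (sketch only; not run for this report).
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# grams_3 <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer))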
library(ggplot2)
The rows are summed and sorted by frequency.
freq_frame <- function(tdm){
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
freq_frame <- data.frame(word=names(freq), freq=freq)
return(freq_frame)
}
Denser matrices are needed, so the sparsest terms are removed before plotting.
grams_2_Dense <- removeSparseTerms(grams_2, 0.999)
grams_2_DenseSorted <- freq_frame(grams_2_Dense)
grams_4_Dense <- removeSparseTerms(grams_4, 0.9999)
grams_4_DenseSorted <- freq_frame(grams_4_Dense)
GG <- ggplot(data = grams_2_DenseSorted[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
GG <- GG + labs(x = "N-gram", y = "Frequency", title = "25 Most Frequent BiGrams")
GG <- GG + theme(axis.text.x=element_text(angle=90))
GG
GG <- ggplot(data = grams_4_DenseSorted[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
GG <- GG + labs(x = "N-gram", y = "Frequency", title = "25 Most Frequent QuadGrams")
GG <- GG + theme(axis.text.x=element_text(angle=90))
GG
Planned next steps / Shiny App
- Create a larger corpus from the original blogs, news and twitter sources.
- Create a prediction algorithm by comparing the input to the multi-word matrices (a rough sketch follows below).
- Research whether the code is running optimally for speed; perhaps condense the ggplot code.
- Determine whether cached data is possible and/or needed.
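As a rough illustration of the planned prediction step, the sorted bigram table built above can already suggest likely next words for a single-word input. This is only a sketch under simplifying assumptions (single-word input, a plain most-frequent-continuation rule, and the hypothetical function name predict_next_word); it is not the final algorithm.
# Sketch of a next-word lookup based on grams_2_DenseSorted (assumptions:
# single-word input and a simple most-frequent-continuation rule).
predict_next_word <- function(word, bigram_freq = grams_2_DenseSorted, n = 3) {
  word    <- tolower(word)
  parts   <- strsplit(as.character(bigram_freq$word), " ")
  firsts  <- vapply(parts, `[`, character(1), 1)   # first word of each bigram
  seconds <- vapply(parts, `[`, character(1), 2)   # second word (the candidate)
  hits    <- which(firsts == word)
  if (length(hits) == 0) return(NA_character_)
  head(seconds[hits], n)   # the table is already sorted by frequency
}
predict_next_word("right")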