The goal of this project is to demonstrate the skills acquired across the courses in the Data Science specialization. It also provides an opportunity for peer feedback to confirm that the project is on track toward building a prediction algorithm.
First, the necessary libraries are loaded.
library(tm)
## Warning: package 'tm' was built under R version 3.5.1
## Loading required package: NLP
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.1
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(RColorBrewer)
library(wordcloud)
## Warning: package 'wordcloud' was built under R version 3.5.1
library(stringr)
library(stringi)
library(XML)
## Warning: package 'XML' was built under R version 3.5.1
library(caret)
## Warning: package 'caret' was built under R version 3.5.1
## Loading required package: lattice
library(openNLP)
suppressMessages(library(RWeka))
## Warning: package 'RWeka' was built under R version 3.5.1
suppressMessages(library(qdap))
## Warning: package 'qdap' was built under R version 3.5.1
## Warning: package 'qdapRegex' was built under R version 3.5.1
## Warning: package 'qdapTools' was built under R version 3.5.1
library(pander)
## Warning: package 'pander' was built under R version 3.5.1
library(SnowballC)
library(tmap)
setwd("C:/")
The source data for this project is then loaded. The data used here can be downloaded from https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip.
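If the files have not already been downloaded, something along the following lines can fetch and unpack them first (a minimal sketch; the destination file name and the check on the "final" directory are assumptions, as the archive normally unpacks into a final/en_US folder).
zip_url  <- "https://d396qusza40orc.cloudfront.net/dsscapstone/dataset/Coursera-SwiftKey.zip"
zip_file <- "Coursera-SwiftKey.zip"   # assumed local file name
if (!file.exists(zip_file)) {
  download.file(zip_url, destfile = zip_file, mode = "wb")  # binary mode for the zip
}
if (!dir.exists("final")) {
  unzip(zip_file)  # expected to unpack into final/en_US, final/de_DE, etc.
}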
blog <- readLines("/en_US/en_US.blogs.txt", encoding= "UTF-8", skipNul = T)
twitter <- readLines("/en_US/en_US.twitter.txt", encoding= "UTF-8", skipNul = T)
news <- readLines("/en_US/en_US.news.txt", encoding= "UTF-8", skipNul = T)
## Warning in readLines("/en_US/en_US.news.txt", encoding = "UTF-8", skipNul =
## T): incomplete final line found on '/en_US/en_US.news.txt'
The datasets are very large, so only a sample of 750 lines from each source is used for this initial review of the data.
set.seed(123)
sample_size = 750
blog_samp <- blog[sample(1:length(blog), sample_size)]
twitter_samp <- twitter[sample(1:length(twitter), sample_size)]
news_samp <- news[sample(1:length(news), sample_size)]
invisible(write.table(blog_samp, file="blog_samp.txt", quote=F))
invisible(write.table(twitter_samp, file="twitter_samp.txt", quote=F))
invisible(write.table(news_samp, file="news_samp.txt", quote=F))
cname <- file.path("C:", "Texts")
cname
## [1] "C:/Texts"
dir(cname)
## [1] "blog_samp.txt" "DocumentTermMatrix.csv"
## [3] "news_samp.txt" "twitter_samp.txt"
## [5] "wordcloud_packages.png"
The size of the data sets being evaluated is important, so word and line counts are calculated.
BlogWords <- stri_count_words(blog_samp)
summary(BlogWords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 9.00 28.00 40.08 57.00 274.00
TwitterWords <- stri_count_words(twitter_samp)
summary(TwitterWords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 2.00 7.00 12.00 12.43 17.00 31.00
NewsWords <- stri_count_words(news_samp)
summary(NewsWords)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.00 7.00 12.00 12.86 19.00 31.00
stri_stats_general(blog_samp)
## Lines LinesNEmpty Chars CharsNWhite
## 750 750 164694 135528
stri_stats_general(twitter_samp)
## Lines LinesNEmpty Chars CharsNWhite
## 750 750 50466 41801
stri_stats_general(news_samp)
## Lines LinesNEmpty Chars CharsNWhite
## 750 750 51948 42978
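The summaries above describe only the 750-line samples. For context, a quick sketch like the one below (using the full vectors read in earlier) reports the line count and in-memory size of each complete corpus; the exact figures depend on the download, so none are quoted here.
# Sketch: line counts and approximate in-memory size (MB) of the full corpora.
full_stats <- data.frame(
  source     = c("blogs", "twitter", "news"),
  line_count = c(length(blog), length(twitter), length(news)),
  size_mb    = round(c(as.numeric(object.size(blog)),
                       as.numeric(object.size(twitter)),
                       as.numeric(object.size(news))) / 1024^2, 1))
full_stats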
A corpus is a collection of written material used for analysis. In text mining, a corpus is created to facilitate statistical analysis, hypothesis testing, and the counting of word occurrences.
library(tm)
docs <- VCorpus(DirSource(cname))
summary(docs)
## Length Class Mode
## blog_samp.txt 2 PlainTextDocument list
## DocumentTermMatrix.csv 2 PlainTextDocument list
## news_samp.txt 2 PlainTextDocument list
## twitter_samp.txt 2 PlainTextDocument list
## wordcloud_packages.png 2 PlainTextDocument list
inspect(docs[1])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 167629
inspect(docs[2])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 688081
inspect(docs[3])
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 55105
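Note that DirSource() picks up every file in C:/Texts, which is why the CSV and PNG files appear as documents in the summary above. One possible refinement, sketched below but not used for the results in this report, is to build the corpus directly from the in-memory samples with VectorSource() so that only the three text samples are included.
# Sketch of an alternative corpus built from the samples already in memory,
# one document per source (not used for the results that follow).
docs_alt <- VCorpus(VectorSource(c(blogs   = paste(blog_samp,    collapse = " "),
                                   news    = paste(news_samp,    collapse = " "),
                                   twitter = paste(twitter_samp, collapse = " "))))
summary(docs_alt)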
The data is cleaned to remove unnecessary characters, words and whitespace. Cleaning the data in this manner allows the most commonly occurring words and word associations to be identified.
docs <- tm_map(docs, removePunctuation)                  # remove punctuation
docs <- tm_map(docs, removeNumbers)                      # remove digits
docs <- tm_map(docs, tolower)                            # convert to lower case
docs <- tm_map(docs, PlainTextDocument)                  # restore the PlainTextDocument class
DocsCopy <- docs                                         # keep a copy before stopword removal
docs <- tm_map(docs, removeWords, stopwords("english"))  # drop common English stopwords
docs <- tm_map(docs, PlainTextDocument)
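Two further cleaning steps that are commonly applied to this kind of text, and could be layered on here, are removing URLs and Twitter handles and collapsing extra whitespace. The sketch below writes to a separate docs_extra object so the results reported in this document are unaffected; the regular expressions are only approximate.
# Optional extra cleaning (a sketch; the regex patterns are approximate).
remove_pattern <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs_extra <- tm_map(docs, remove_pattern, "http[[:alnum:][:punct:]]*")  # drop URLs
docs_extra <- tm_map(docs_extra, remove_pattern, "@\\w+")                # drop Twitter handles
docs_extra <- tm_map(docs_extra, stripWhitespace)                        # collapse repeated spaces
Next, document-term and term-document matrices are built from the cleaned corpus.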
dtm <- DocumentTermMatrix(docs)
dtm
## <<DocumentTermMatrix (documents: 5, terms: 8650)>>
## Non-/sparse entries: 11431/31819
## Sparsity : 74%
## Maximal term length: 592558
## Weighting : term frequency (tf)
tdm <- TermDocumentMatrix(docs)
tdm
## <<TermDocumentMatrix (terms: 8650, documents: 5)>>
## Non-/sparse entries: 11431/31819
## Sparsity : 74%
## Maximal term length: 592558
## Weighting : term frequency (tf)
The cleaned data is reviewed, with a focus on the most frequently occurring words.
freq <- colSums(as.matrix(dtm))
length(freq)
## [1] 8650
dtms <- removeSparseTerms(dtm, 0.2)
dtms
## <<DocumentTermMatrix (documents: 5, terms: 0)>>
## Non-/sparse entries: 0/0
## Sparsity : 100%
## Maximal term length: 0
## Weighting : term frequency (tf)
freq <- colSums(as.matrix(dtm))
head(table(freq))
## freq
## 1 2 3 4 5 6
## 5455 1315 512 331 205 160
tail(table(freq))
## freq
## 125 141 146 155 167 199
## 1 1 1 1 1 1
freq <- colSums(as.matrix(dtms))
freq
## numeric(0)
freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
head(freq,20)
## just will one like can get time know good now
## 199 167 155 146 141 125 122 114 106 106
## see love people day dont new think back want well
## 105 101 94 86 86 83 82 75 74 74
findFreqTerms(dtm, lowfreq=50)
## [1] "also" "back" "can" "day" "dont" "even" "first"
## [8] "get" "going" "good" "got" "great" "just" "know"
## [15] "life" "like" "love" "make" "much" "never" "new"
## [22] "now" "one" "people" "really" "right" "see" "thanks"
## [29] "think" "time" "today" "want" "way" "well" "will"
wf <- data.frame(word=names(freq), freq=freq)
head(wf)
## word freq
## just just 199
## will will 167
## one one 155
## like like 146
## can can 141
## get get 125
Create a bar chart of the words that occur at least 50 times.
library(ggplot2)
p <- ggplot(subset(wf, freq>50), aes(x = reorder(word, -freq), y = freq)) +
geom_bar(stat = "identity") +
theme(axis.text.x=element_text(angle=45, hjust=1))
p
Remove sparse terms and then review the frequencies again. Dropping terms that appear in too few documents helps the model generalize and can help prevent overfitting.
dtms <- removeSparseTerms(dtm, 0.1)
head(table(freq), 20)
## freq
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15
## 5455 1315 512 331 205 160 114 72 55 49 38 36 30 34 19
## 16 17 18 19 20
## 21 20 16 7 6
tail(table(freq), 20)
## freq
## 68 69 72 74 75 82 83 86 94 101 105 106 114 122 125 141 146 155
## 1 1 1 2 1 1 1 2 1 1 1 2 1 1 1 1 1 1
## 167 199
## 1 1
freq <- colSums(as.matrix(dtms))
freq
## numeric(0)
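With only five documents in the corpus, a sparsity cut-off of 0.1 removes every term, which is why the result above is numeric(0). A more permissive cut-off, sketched below, keeps terms that appear in at least two of the five documents; the exact value is a judgment call.
# Sketch with a looser sparsity cut-off (0.6): a retained term may be absent
# from at most 60% of the documents, i.e. it must appear in at least two of five.
dtms_loose <- removeSparseTerms(dtm, 0.6)
dtms_loose
head(sort(colSums(as.matrix(dtms_loose)), decreasing = TRUE), 10)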
Word clouds are a useful way to illustrate the frequency or importance of each word, indicated by the size at which the word is drawn.
dtms <- removeSparseTerms(dtm, 0.15) # Prepare the data (max 15% empty space)
freq <- colSums(as.matrix(dtm)) # Find word frequencies
BuPu <- brewer.pal(6, "BuPu")
wordcloud(names(freq), freq, max.words=75, rot.per=0.2, colors=BuPu)
The final step for this first pass is to create n-grams: sequences of words that occur together within the data set. Bigrams are pairs of adjacent words, trigrams are sequences of three, and quadgrams are sequences of four. To keep this first pass succinct, only bigrams and quadgrams are included in this version.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
QuadgramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 4, max = 4))
grams_2 <- TermDocumentMatrix(docs, control = list(tokenize = BigramTokenizer))
grams_4 <- TermDocumentMatrix(docs, control = list(tokenize = QuadgramTokenizer))
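A trigram tokenizer would follow the same pattern; it is sketched here for reference but left out of the results to keep this pass short.
# Trigram tokenizer (sketch only; not run for this report).
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
# grams_3 <- TermDocumentMatrix(docs, control = list(tokenize = TrigramTokenizer))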
library(ggplot2)
The rows are summed and sorted by frequency.
freq_frame <- function(tdm){
freq <- sort(rowSums(as.matrix(tdm)), decreasing=TRUE)
freq_frame <- data.frame(word=names(freq), freq=freq)
return(freq_frame)
}
Denser matrices are needed, so the sparsest terms are removed before plotting.
grams_2_Dense <- removeSparseTerms(grams_2, 0.999)
grams_2_DenseSorted <- freq_frame(grams_2_Dense)
grams_4_Dense <- removeSparseTerms(grams_4, 0.9999)
grams_4_DenseSorted <- freq_frame(grams_4_Dense)
GG <- ggplot(data = grams_2_DenseSorted[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
GG <- GG + labs(x = "N-gram", y = "Frequency", title = "25 Most Frequent BiGrams")
GG <- GG + theme(axis.text.x=element_text(angle=90))
GG
GG <- ggplot(data = grams_4_DenseSorted[1:25,], aes(x = reorder(word, -freq), y = freq)) + geom_bar(stat="identity")
GG <- GG + labs(x = "N-gram", y = "Frequency", title = "25 Most Frequent QuadGrams")
GG <- GG + theme(axis.text.x=element_text(angle=90))
GG
Planned next steps / Shiny App
- Create a larger corpus from the original blogs, news and twitter sources.
- Create a prediction algorithm by comparing the input to the multi-word matrices (a rough sketch follows below).
- Research whether the code is running optimally for speed; perhaps condense the ggplot code.
- Determine whether cached data is possible and/or needed.
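As a rough illustration of the planned prediction step, the sorted bigram table built above can already suggest likely next words for a single-word input. This is only a sketch under simplifying assumptions (single-word input, a plain most-frequent-continuation rule, and the hypothetical function name predict_next_word); it is not the final algorithm.
# Sketch of a next-word lookup based on grams_2_DenseSorted (assumptions:
# single-word input and a simple most-frequent-continuation rule).
predict_next_word <- function(word, bigram_freq = grams_2_DenseSorted, n = 3) {
  word    <- tolower(word)
  parts   <- strsplit(as.character(bigram_freq$word), " ")
  firsts  <- vapply(parts, `[`, character(1), 1)   # first word of each bigram
  seconds <- vapply(parts, `[`, character(1), 2)   # second word (the candidate)
  hits    <- which(firsts == word)
  if (length(hits) == 0) return(NA_character_)
  head(seconds[hits], n)   # the table is already sorted by frequency
}
predict_next_word("right")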