This report presents an exploratory data analysis of the SwiftKey text data and outlines the goals for building a prediction algorithm and a Shiny app. The major features of the data are identified and briefly summarized in a way that is understandable to a non-data-scientist manager, and tables and plots are used to illustrate important summaries of the data set. One term that may be new is "corpus," also known as a "text document collection" [Journal of Statistical Software, Mar 2008, Vol. 25, Issue 5, p. 5].
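For readers new to the term, here is a minimal sketch of a corpus built with the tm package; the two sentences are made up purely for illustration.
# Illustrative only: a tiny corpus ("text document collection") built from two
# made-up sentences using the tm package
library(tm)
example_docs <- c("The quick brown fox jumps over the lazy dog.",
                  "A corpus is simply a collection of text documents.")
example_corpus <- VCorpus(VectorSource(example_docs))
inspect(example_corpus[[1]])   # print the first document and its metadata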
# set working directory
setwd("C:/Users/tljon/datasciencecoursera/Coursera-SwiftKey/final/en_US")
# set libraries
library(tm)
## Loading required package: NLP
library(NLP)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(stringi)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(RWeka)
library(wordcloud)
## Loading required package: RColorBrewer
#Acquire Data
##Files have been downloaded from the course link and placed in the directory stated above
con = file("en_US.blogs.txt")
blogs <- readLines(con, warn=FALSE, encoding="UTF-8", skipNul=TRUE)
close(con)
con = file("en_US.news.txt")
news <- readLines(con, warn=FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)
con = file("en_US.twitter.txt")
twitter <- readLines(con, warn=FALSE, encoding = "UTF-8", skipNul = TRUE)
close(con)
#Summarize data
summary <-
  data.frame('File' = c("Blogs", "News", "Twitter"),
             'File Size' = sapply(list(blogs, news, twitter),
                                  function(x) format(object.size(x), "MB")),
             'Rows' = sapply(list(blogs, news, twitter), length),
             'Characters' = sapply(list(blogs, news, twitter),
                                   function(x) sum(nchar(x))),
             'MaxCharacters' = sapply(list(blogs, news, twitter),
                                      function(x) max(nchar(x))))
summary
## File File.Size Rows Characters MaxCharacters
## 1 Blogs 255.4 Mb 899288 206824505 40833
## 2 News 19.8 Mb 77259 15639408 5760
## 3 Twitter 319 Mb 2360148 162096241 140
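Word counts are another useful file-level summary; a small sketch using the stringi package loaded above (this column is not part of the table shown) could be added as follows.
# Illustrative addition (not in the table above): approximate word count per file
sapply(list(blogs, news, twitter), function(x) sum(stri_count_words(x)))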
#Take a 0.5% Sample and Clean the Data Files
set.seed(12345)
sample_set <- c(sample(blogs, length(blogs) * 0.005),
                sample(news, length(news) * 0.005),
                sample(twitter, length(twitter) * 0.005))
#Sample Data Set
summary_ss <-
  data.frame('File' = "Sample Set",
             'File Size' = sapply(list(sample_set),
                                  function(x) format(object.size(x), "MB")),
             'Rows' = sapply(list(sample_set), length),
             'Characters' = sapply(list(sample_set),
                                   function(x) sum(nchar(x))),
             'MaxCharacters' = sapply(list(sample_set),
                                      function(x) max(nchar(x))))
summary_ss
## File File.Size Rows Characters MaxCharacters
## 1 Sample Set 3 Mb 16682 1935900 2359
#Remove punctuation, numbers, and extra whitespace; convert all characters to lower case; and coerce to plain text documents
testdata <- iconv(sample_set, "UTF-8", "ASCII", sub="")
corpus <- VCorpus(VectorSource(testdata))
corpus <- tm_map(corpus, removePunctuation)
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, PlainTextDocument)
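As a quick, purely illustrative sanity check (its output is not part of the original report), the first cleaned document can be printed:
# Illustrative check (output not shown in the original report)
as.character(corpus[[1]])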
###Apply N-grams
####Split strings into N-grams with a minimum and maximum number of grams; the result is a character vector of the tokenized strings.
unigram <- function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
bigram <- function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
trigram <- function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
####TermDocumentMatrix constructs (or coerces to) a term-document matrix, with terms as rows and documents as columns.
uniTDM <- TermDocumentMatrix(corpus, control=list(tokenize=unigram))
biTDM <- TermDocumentMatrix(corpus, control=list(tokenize=bigram))
triTDM <- TermDocumentMatrix(corpus, control=list(tokenize=trigram))
####Find frequent terms in a document-term or term-document matrix. This works for all numeric weightings and returns a character vector of the terms in "x" that occur at least "lowfreq" times and at most "highfreq" times.
uniTFF <- findFreqTerms(uniTDM, lowfreq = 50)
biTFF <- findFreqTerms(biTDM, lowfreq = 50)
triTFF <- findFreqTerms(triTDM, lowfreq = 10)
uni_freq <- rowSums(as.matrix(uniTDM[uniTFF, ]))
uni_freq <- data.frame(words=names(uni_freq), frequency=uni_freq)
bi_freq <- rowSums(as.matrix(biTDM[biTFF, ]))
bi_freq <- data.frame(words=names(bi_freq), frequency=bi_freq)
tri_freq <- rowSums(as.matrix(triTDM[triTFF, ]))
tri_freq <- data.frame(words=names(tri_freq), frequency=tri_freq)
head(uni_freq)
## words frequency
## able able 103
## about about 1038
## across across 72
## act act 53
## actually actually 156
## add add 86
head(bi_freq)
## words frequency
## a big a big 50
## a bit a bit 86
## a couple a couple 60
## a few a few 174
## a good a good 153
## a great a great 171
head(tri_freq)
## words frequency
## a bit of a bit of 16
## a bunch of a bunch of 13
## a chance to a chance to 12
## a couple of a couple of 41
## a fan of a fan of 12
## a few days a few days 16
##Unigram Frequency (100 words)
wordcloud(words=uni_freq$words, freq=uni_freq$frequency,
max.words=100, colors = brewer.pal(6, "Dark2"), scale=c(5, .5))
##Bigram Frequency (50 words)
wordcloud(words=bi_freq$words, freq=bi_freq$frequency,
max.words=50, colors = brewer.pal(6, "Dark2"), scale=c(5, .5))
##Trigram Frequency (20 words)
wordcloud(words=tri_freq$words, freq=tri_freq$frequency,
max.words=20, colors = brewer.pal(6, "Dark2"), scale=c(4, .5))
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, :
## going to be could not be fit on page. It will not be plotted.
## Warning in wordcloud(words = tri_freq$words, freq = tri_freq$frequency, : a
## lot of could not be fit on page. It will not be plotted.
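These warnings only mean that a couple of long trigrams did not fit at the chosen scale; an optional tweak (not part of the original report) is to lower the upper end of the scale argument so longer phrases fit.
# Optional tweak (assumption, not in the original report): a smaller maximum
# word size usually lets long trigrams fit and avoids the warnings above
wordcloud(words=tri_freq$words, freq=tri_freq$frequency,
          max.words=20, colors = brewer.pal(6, "Dark2"), scale=c(3, .5))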
#One Word Frequency (Top 10)
FQ1 <- ggplot(data = uni_freq[order(-uni_freq$frequency), ][1:10, ],
              aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "green") +
  ggtitle("One Word - Top 10") +
  xlab("words") + ylab("frequency")
FQ1
#Two Word Frequency (Top 10)
FQ2 <- ggplot(data = bi_freq[order(-bi_freq$frequency), ][1:10, ],
              aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "blue") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Two Word - Top 10") +
  xlab("words") + ylab("frequency")
FQ2
#Three Word Frequency (Top 10)
FQ3 <- ggplot(data = tri_freq[order(-tri_freq$frequency), ][1:10, ],
              aes(x = reorder(words, -frequency), y = frequency)) +
  geom_bar(stat = "identity", fill = "brown") +
  theme(axis.text.x = element_text(angle = 45)) +
  ggtitle("Three Word - Top 10") +
  xlab("words") + ylab("frequency")
FQ3
# Number of terms needed (in the order the table is stored) to reach the given
# share of the total frequency
Coverage <- function(df, coverage) {
  c <- coverage * sum(df$frequency)
  s <- 0
  for (i in 1:length(df[, 1])) {
    s <- s + df[i, ]$frequency
    if (s >= c) {
      break
    }
  }
  return(i)
}
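The same count can be computed without an explicit loop. A minimal sketch, under the assumption that the frequency table is first sorted in decreasing order of frequency (the tables above are stored in term order), is shown below; Coverage2 is an illustrative name, not part of the original code.
# Sketch (assumption): sort frequencies in decreasing order, then count how many
# terms are needed before the cumulative frequency reaches the requested share
Coverage2 <- function(df, coverage) {
  freq_sorted <- sort(df$frequency, decreasing = TRUE)
  which(cumsum(freq_sorted) >= coverage * sum(freq_sorted))[1]
}
# Example usage: Coverage2(uni_freq, .5)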
#Coverage at 50% for Unigram, Bigram, and Trigram respectively
Coverage(uni_freq, .5)
## [1] 385
Coverage(bi_freq, .5)
## [1] 161
Coverage(tri_freq, .5)
## [1] 253
#Coverage at 75% for Unigram, Bigram, and Trigram respectively
Coverage(uni_freq, .75)
## [1] 549
Coverage(bi_freq, .75)
## [1] 256
Coverage(tri_freq, .75)
## [1] 379
#Coverage at 90% for Unigram, Bigram, and Trigram respectively
Coverage(uni_freq, .9)
## [1] 626
Coverage(bi_freq, .9)
## [1] 308
Coverage(tri_freq, .9)
## [1] 458
This concludes our initial exploratory analysis of the data and provides the foundation for our next activity: building a prediction algorithm based on N-grams. The algorithm will be used to develop a Shiny app that suggests the next word after a word or phrase has been entered, similar to the text prediction you may have experienced in desktop or cloud software and in apps on various devices.
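As a rough illustration of that direction only, the following sketch shows how the bigram and trigram frequency tables built above could back a simple lookup; the function name and the "most frequent match wins" rule are illustrative assumptions, not the final algorithm.
# Sketch (assumption, not the final algorithm): look for the most frequent
# trigram starting with the last two words typed, then back off to the most
# frequent bigram starting with the last word
predict_next <- function(phrase, bi_freq, tri_freq) {
  words <- tolower(unlist(strsplit(phrase, "\\s+")))
  n <- length(words)
  if (n >= 2) {
    hits <- tri_freq[grepl(paste0("^", words[n - 1], " ", words[n], " "),
                           as.character(tri_freq$words)), ]
    if (nrow(hits) > 0) {
      best <- as.character(hits[which.max(hits$frequency), "words"])
      return(tail(strsplit(best, " ")[[1]], 1))
    }
  }
  hits <- bi_freq[grepl(paste0("^", words[n], " "), as.character(bi_freq$words)), ]
  if (nrow(hits) > 0) {
    best <- as.character(hits[which.max(hits$frequency), "words"])
    return(tail(strsplit(best, " ")[[1]], 1))
  }
  NA_character_
}
# Example usage: predict_next("thanks for", bi_freq, tri_freq)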