This is the milestone report for week 2 of the data science capstone project.
The purpose of this report is to develop an understanding of the properties of the data sets that will be used to build a prediction model for the final Shiny application. The report describes the major features of the training data and then summarizes plans for building the model.
The text comes from three sources: blogs, news articles, and Twitter. These sources will be used to train the model.
First, install and load the required packages.
install.packages("knitr", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/9cdzl10j1gz_c97v4rk3dmh00000gn/T//RtmpUq4HJO/downloaded_packages
install.packages("tm", repos = "http://cran.us.r-project.org")
##
## There is a binary version available but the source version is later:
## binary source needs_compilation
## tm 0.7-8 0.7-10 TRUE
## installing the source package 'tm'
install.packages("RWeka", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/9cdzl10j1gz_c97v4rk3dmh00000gn/T//RtmpUq4HJO/downloaded_packages
install.packages("ggplot2", repos = "http://cran.us.r-project.org")
##
## The downloaded binary packages are in
## /var/folders/b_/9cdzl10j1gz_c97v4rk3dmh00000gn/T//RtmpUq4HJO/downloaded_packages
library(knitr)
library(tm)
## Loading required package: NLP
library(RWeka)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
#The full data sets are too large to load comfortably, so read only the first 20000 lines of each. Only the English (en_US) files are used.
blogs <- readLines("~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.blogs.txt", 20000)
news <- readLines("~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.news.txt", 20000)
twitter <- readLines("~/Desktop/courserajhu/capstonemilestonereport/data/en_US/en_US.twitter.txt", 20000)
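Before sampling, it helps to record some basic size statistics for each source. A minimal sketch (the word counts simply split on whitespace, so they are approximate):
#approximate line and word counts for the 20000-line sample of each source
sourceStats <- data.frame(
  source = c("blogs", "news", "twitter"),
  lines = c(length(blogs), length(news), length(twitter)),
  words = c(sum(lengths(strsplit(blogs, "\\s+"))),
            sum(lengths(strsplit(news, "\\s+"))),
            sum(lengths(strsplit(twitter, "\\s+")))))
print(sourceStats)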
#fix the RNG seed so the sample (and hence the report) is reproducible
set.seed(1234)
#combine the sources with c(); paste() would splice unrelated lines from each source into one string
sampledDocs <- sample(c(blogs, news, twitter), size = 10000, replace = TRUE)
rm(blogs, news, twitter)
#use VCorpus rather than the default SimpleCorpus: SimpleCorpus ignores custom
#tokenizers in TermDocumentMatrix, which would silently break the n-gram counts below
documents <- VCorpus(VectorSource(sampledDocs))
rm(sampledDocs)
#convert capital letters to lowercase
documents <- tm_map(documents, content_transformer(tolower))
#collapse repeated whitespace into single spaces
documents <- tm_map(documents, stripWhitespace)
#remove all punctuation
documents <- tm_map(documents, removePunctuation)
#remove numbers
documents <- tm_map(documents, removeNumbers)
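As a quick sanity check that the transformations behaved as expected, the first cleaned document can be inspected:
#spot-check one cleaned document: it should be lowercase with no punctuation or digits
content(documents[[1]])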
One further factor to consider is the profanity present in these data sets; filtering it out may or may not improve the predictive performance of the model.
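A minimal sketch of such a filter, assuming a plain-text word list is available (badwords.txt is a placeholder name, not part of the course data), kept commented out so the report runs without the file:
#hypothetical profanity filter using tm's removeWords
#profanity <- readLines("badwords.txt")
#documents <- tm_map(documents, removeWords, profanity)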
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
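A quick check on a toy sentence confirms what each tokenizer produces; the bigram case, for example, should return "the quick", "quick brown", and "brown fox":
#sanity-check the bigram tokenizer on a toy sentence
biGramTokenizer("the quick brown fox")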
uniGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = uniGramTokenizer))
biGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = biGramTokenizer))
triGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = triGramTokenizer))
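Custom tokenizers can fail silently (tm's SimpleCorpus, for instance, ignores them), so it is worth confirming that the higher-order matrices really contain multi-word terms:
#bigram terms should contain one space, trigram terms two
head(Terms(biGramMatrix))
head(Terms(triGramMatrix))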
#unigrams appearing at least 2000 times in the sample
freqTerms <- findFreqTerms(uniGramMatrix, lowfreq = 2000)
termFrequency <- rowSums(as.matrix(uniGramMatrix[freqTerms,]))
termFrequency <- data.frame(unigram=names(termFrequency), frequency=termFrequency)
g <- ggplot(termFrequency, aes(x=reorder(unigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Unigram") + ylab("Frequency") +
labs(title = "Top Unigrams by Frequency")
print(g)
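The same information can be read numerically, which helps when the plot is crowded:
#top unigrams in descending order of frequency
head(termFrequency[order(-termFrequency$frequency), ], 10)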
#bigrams are much sparser than unigrams, so use a lower frequency threshold (tune to the sample size)
freqTerms <- findFreqTerms(biGramMatrix, lowfreq = 500)
termFrequency <- rowSums(as.matrix(biGramMatrix[freqTerms,]))
termFrequency <- data.frame(bigram=names(termFrequency), frequency=termFrequency)
g <- ggplot(termFrequency, aes(x=reorder(bigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Bigram") + ylab("Frequency") +
labs(title = "Top Bigrams by Frequency")
print(g)
#trigrams are sparser still, so lower the threshold again (tune to the sample size)
freqTerms <- findFreqTerms(triGramMatrix, lowfreq = 100)
termFrequency <- rowSums(as.matrix(triGramMatrix[freqTerms,]))
termFrequency <- data.frame(trigram=names(termFrequency), frequency=termFrequency)
g <- ggplot(termFrequency, aes(x=reorder(trigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Trigram") + ylab("Frequency") +
labs(title = "Top Trigrams by Frequency")
print(g)