This is the Milestone Report for week 2 of the Coursera Data Science Capstone project.
The objective of this report is to develop an understanding of the various statistical properties of the data set that can later be used when building the prediction model for the final data product - the Shiny application. Using exploratory data analysis, this report describes the major features of the training data and then summarizes my plans for creating the predictive model.
The model will be trained on a unified document corpus compiled from three sources of text data: blogs, news articles, and Twitter posts.
The data are provided in four different languages; this project focuses only on the English corpora. The data can be found at the following link on Coursera: Capstone Dataset.
Before attempting to load any files, we examine them briefly from the shell (or, equivalently, from within R as sketched below). For this project the English-language files are used; there are three text files: blogs, news, and twitter.
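As a minimal sketch of that quick check from within R (assuming the same en_US directory used later in this report):

# Quick look at the English files before loading them in full.
# Assumption: the en_US directory matches the path used below.
enPath <- "/Users/you-can-do-it/Desktop/Coursera_r_projrct/capstone/final/en_US"
files  <- file.path(enPath, c("en_US.blogs.txt", "en_US.news.txt", "en_US.twitter.txt"))

# File sizes in megabytes
round(file.size(files) / 1024^2, 1)

# Line counts (reads each file once, so this may take a moment)
sapply(files, function(f) length(readLines(f, warn = FALSE)))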
require(ggplot2)
## Loading required package: ggplot2
require(tm)
## Loading required package: tm
## Loading required package: NLP
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
require(RWeka)
## Loading required package: RWeka
Because the files are large, only the first 20,000 lines of each file are read, and a sample of 10,000 lines is drawn from the combined text.
blogs <- readLines("/Users/you-can-do-it/Desktop/Coursera_r_projrct/capstone/final/en_US/en_US.blogs.txt", 20000)
news <- readLines("/Users/you-can-do-it/Desktop/Coursera_r_projrct/capstone/final/en_US/en_US.news.txt", 20000)
twitter <- readLines("/Users/you-can-do-it/Desktop/Coursera_r_projrct/capstone/final/en_US/en_US.twitter.txt", 20000)
# Combine the three sources and draw a random sample of lines for the corpus
sampledDocs <- sample(c(blogs, news, twitter), size = 10000)
rm(blogs, news, twitter)
documents <- Corpus(VectorSource(sampledDocs))
rm(sampledDocs)
Convert the text to lowercase and remove punctuation and numbers. Profanity removal is still under consideration for the predicted text: does including profanity in the training data contribute to more accurate predictions, or should it be filtered out? (A possible filter is sketched after the transformations below.)
documents <- tm_map(documents, content_transformer(tolower))
## Warning in tm_map.SimpleCorpus(documents, content_transformer(tolower)):
## transformation drops documents
documents <- tm_map(documents, stripWhitespace)
## Warning in tm_map.SimpleCorpus(documents, stripWhitespace): transformation drops
## documents
documents <- tm_map(documents, removePunctuation)
## Warning in tm_map.SimpleCorpus(documents, removePunctuation): transformation
## drops documents
documents <- tm_map(documents, removeNumbers)
## Warning in tm_map.SimpleCorpus(documents, removeNumbers): transformation drops
## documents
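If profanity filtering is adopted, one straightforward option is tm's removeWords with an external word list. A minimal sketch, assuming a plain-text list with one term per line (the file name profanity.txt is a placeholder, not part of the project data):

# Hypothetical profanity filter: strip listed terms from the corpus.
# "profanity.txt" is a placeholder; any one-term-per-line word list would work.
profanity <- readLines("profanity.txt", warn = FALSE)
documents <- tm_map(documents, removeWords, profanity)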
uniGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
biGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
triGramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
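As a quick sanity check, the tokenizers can be applied directly to a short made-up string; the bigram tokenizer should return overlapping two-word sequences such as "the quick" and "quick brown".

# Toy strings only, not part of the corpus
biGramTokenizer("the quick brown fox jumps")
triGramTokenizer("the quick brown fox jumps")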
uniGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = uniGramTokenizer))
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## uniGramTokenizer)): custom functions are ignored
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## uniGramTokenizer)): custom tokenizer is ignored
biGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = biGramTokenizer))
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## biGramTokenizer)): custom functions are ignored
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## biGramTokenizer)): custom tokenizer is ignored
triGramMatrix <- TermDocumentMatrix(documents, control = list(tokenize = triGramTokenizer))
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## triGramTokenizer)): custom functions are ignored
## Warning in TermDocumentMatrix.SimpleCorpus(documents, control = list(tokenize =
## triGramTokenizer)): custom tokenizer is ignored
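The warnings above deserve attention: Corpus(VectorSource(...)) builds a SimpleCorpus, and TermDocumentMatrix ignores custom tokenizers for a SimpleCorpus, so the bigram and trigram matrices may silently fall back to the default unigram tokenization. A minimal sketch of one workaround, assuming the raw sampledDocs vector is kept in memory rather than removed above, is to build a VCorpus, which does honour the RWeka tokenizers:

# VCorpus (unlike SimpleCorpus) passes the custom tokenizers through.
# Assumption: sampledDocs is still available (i.e. not removed with rm()).
vDocs <- VCorpus(VectorSource(sampledDocs))
vDocs <- tm_map(vDocs, content_transformer(tolower))
vDocs <- tm_map(vDocs, stripWhitespace)
vDocs <- tm_map(vDocs, removePunctuation)
vDocs <- tm_map(vDocs, removeNumbers)
biGramMatrixV  <- TermDocumentMatrix(vDocs, control = list(tokenize = biGramTokenizer))
triGramMatrixV <- TermDocumentMatrix(vDocs, control = list(tokenize = triGramTokenizer))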
freqTerms <- findFreqTerms(uniGramMatrix, lowfreq = 2000)
termFrequency <- rowSums(as.matrix(uniGramMatrix[freqTerms,]))
termFrequency <- data.frame(unigram=names(termFrequency), frequency=termFrequency)
g <- ggplot(termFrequency, aes(x=reorder(unigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Unigram") + ylab("Frequency") +
labs(title = "Top Unigrams by Frequency")
print(g)
freqTerms <- findFreqTerms(biGramMatrix, lowfreq = 500)
termFrequency <- rowSums(as.matrix(biGramMatrix[freqTerms,]))
termFrequency <- data.frame(bigram=names(termFrequency), frequency=termFrequency)
g <- ggplot(termFrequency, aes(x=reorder(bigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Bigram") + ylab("Frequency") +
labs(title = "Top Bigrams by Frequency")
print(g)
freqTerms <- findFreqTerms(triGramMatrix, lowfreq = 75)
termFrequency <- rowSums(as.matrix(triGramMatrix[freqTerms,]))
termFrequency <- data.frame(trigram=names(termFrequency), frequency=termFrequency)
g <- ggplot(termFrequency, aes(x=reorder(trigram, frequency), y=frequency)) +
geom_bar(stat = "identity") + coord_flip() +
theme(legend.title=element_blank()) +
xlab("Trigram") + ylab("Frequency") +
labs(title = "Top Trigrams by Frequency")
print(g)