This report is prepared for the Data Science Capstone project in the Data Science Specialization on Coursera, taught by Professors Jeff Leek, Roger Peng and Brian Caffo of Johns Hopkins University.
The report describes preliminary work on the course data sets and the insights gained from analyzing them. I used the milestone reports of previous students and a few independent articles as references for this report.
These include:
1. http://rpubs.com/yoke2/dscapstone-milestone-report
2. http://rpubs.com/zeelich7/capstoneMilestoneRpt
We start by loading the libraries for text mining (tm), word clouds (wordcloud) and file utilities (R.utils).
##https://deltadna.com/blog/text-mining-in-r-for-term-frequency/
library(tm)
## Loading required package: NLP
library(wordcloud)
## Loading required package: RColorBrewer
library(R.utils)
## Loading required package: R.oo
## Loading required package: R.methodsS3
## R.methodsS3 v1.7.1 (2016-02-15) successfully loaded. See ?R.methodsS3 for help.
## R.oo v1.20.0 (2016-02-17) successfully loaded. See ?R.oo for help.
##
## Attaching package: 'R.oo'
## The following objects are masked from 'package:methods':
##
## getClasses, getMethods
## The following objects are masked from 'package:base':
##
## attach, detach, gc, load, save
## R.utils v2.3.0 (2016-04-13) successfully loaded. See ?R.utils for help.
##
## Attaching package: 'R.utils'
## The following object is masked from 'package:utils':
##
## timestamp
## The following objects are masked from 'package:base':
##
## cat, commandArgs, getOption, inherits, isOpen, parse, warnings
We first measure the size of the en_US.blogs.txt file and the number of lines in it. File size (in bytes):
file.info("en_US.blogs.txt")$size
## [1] 210160014
Number of lines in the file
countLines("en_US.blogs.txt")
## [1] 899288
## attr(,"lastLineHasNewline")
## [1] TRUE
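The same checks could be run on the other two English files in the data set; a quick sketch, assuming en_US.news.txt and en_US.twitter.txt sit in the same working directory:
for (f in c("en_US.news.txt", "en_US.twitter.txt")) {
  print(file.info(f)$size)   # file size in bytes
  print(countLines(f))       # line count via R.utils
}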
Owing to the limited computing resources available to me, I worked on a scaled-down sample of one of the three text files provided for this project. I sampled 10,000 lines from the blogs file, as any larger set would take too long to process on my system.
data <- readLines("en_US.blogs.txt")
sampleBlogs <- data[sample(1:length(data),10000)]
data <- sampleBlogs
df <- data.frame(data, stringsAsFactors=FALSE)   # keep the text as character, not factor
textdata <- df$data
review_text <- paste(textdata, collapse=" ")
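For reproducibility, a random seed could be set immediately before the call to sample() above; a minimal sketch with an arbitrary seed value:
set.seed(1234)   # arbitrary seed; placed before sample(), it makes the draw reproducible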
We collapse the sampled lines into a single character object and build the corpus for the project from it.
review_source <- VectorSource(review_text)
corpus <- Corpus(review_source)
We next pre-process the corpus: convert all words to lower case, strip extra white space, and remove English stop words, numbers and punctuation. We also remove profanities, using a separate word list, and convert the documents to the plain-text format expected by tm.
corpus <- tm_map(corpus, content_transformer(tolower))
corpus <- tm_map(corpus, stripWhitespace)
corpus <- tm_map(corpus, removeWords, stopwords("english"))
corpus <- tm_map(corpus, removeNumbers)
corpus <- tm_map(corpus, removePunctuation)
profanityFile <- file("swearWords.txt", open="rb")
profanity <- readLines(profanityFile, encoding="UTF-8", warn=TRUE, skipNul=TRUE)
## Warning in readLines(profanityFile, encoding = "UTF-8", warn = TRUE,
## skipNul = TRUE): incomplete final line found on 'swearWords.txt'
corpus <- tm_map(corpus, removeWords, profanity)
corpus <- tm_map(corpus, PlainTextDocument)
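To check that the cleaning steps behaved as intended, the start of the processed text can be inspected; a quick sanity check (the corpus here holds a single document):
substr(content(corpus[[1]]), 1, 200)   # first 200 characters of the cleaned text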
We next create the Document-Term Matrix and measure the frequency of each word. The most frequent words are plotted below.
dtm <- DocumentTermMatrix(corpus)
##inspect(dtm[1:5, 1:20])
dtm2 <- as.matrix(dtm)
frequency <- colSums(dtm2)
frequency <- sort(frequency, decreasing=TRUE)
ord <- order(frequency)
head(frequency)
## one will can just like time
## 1394 1246 1130 1091 1061 971
frequency[head(ord)]
## ã<U+0098>yvind ã<U+0093>ttarr â<U+0080><U+0098>â<U+0080> â<U+0080><U+0098>alienâ<U+0080><U+0099> â<U+0080><U+0098>anyâ<U+0080><U+0099> â<U+0080><U+0098>anything
## 1 1 1 1 1 1
frequency[tail(ord)]
## time like just can will one
## 971 1061 1091 1130 1246 1394
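The least frequent "terms" above are mostly mis-encoded character sequences from the raw text. One way to avoid them, as a pre-step before building the corpus, is to drop characters that cannot be represented in ASCII; a hedged sketch:
##remove non-ASCII byte sequences from the sampled lines (sub="" drops them)
sampleBlogs <- iconv(sampleBlogs, from="UTF-8", to="ASCII", sub="")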
##check out the frequency of frequencies
head(table(frequency),20)
## frequency
## 1 2 3 4 5 6 7 8 9 10 11 12
## 17583 4643 2234 1304 949 689 549 415 355 286 235 236
## 13 14 15 16 17 18 19 20
## 194 159 146 114 112 97 101 84
tail(table(frequency),20)
## frequency
## 525 541 548 563 579 589 591 602 604 637 641 674 680 754 971
## 1 1 1 1 1 1 1 1 1 2 1 1 1 1 1
## 1061 1091 1130 1246 1394
## 1 1 1 1 1
wf <- data.frame(word=names(frequency),freq=frequency)
wf2 <- subset(wf, freq>500)
##plot word frequencies
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
p <- ggplot(wf2,aes(word, freq))
p <- p + geom_bar(stat="identity")
p <- p + theme(axis.text.x=element_text(angle=45, hjust=1))
p
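The bars above appear in alphabetical order of the words; reordering them by frequency makes the plot easier to read, for example:
p2 <- ggplot(wf2, aes(reorder(word, -freq), freq)) +
  geom_bar(stat="identity") +
  xlab("Word") + ylab("Frequency") +
  theme(axis.text.x=element_text(angle=45, hjust=1))
p2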
A correlation analysis can be done for important terms, as shown below.
##do correlation analysis
findAssocs(dtm, c("news","emergency"),corlimit=0.50)
## $news
## numeric(0)
##
## $emergency
## numeric(0)
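Both queries return numeric(0). This is expected here: the corpus was collapsed into a single document, so there is no across-document variation for findAssocs to measure (and either term may simply be rare in the sample). A sketch of how associations could be computed by keeping each sampled line as a separate document (object names are illustrative):
line_corpus <- Corpus(VectorSource(sampleBlogs))   # one document per sampled line
line_dtm <- DocumentTermMatrix(line_corpus)        # the same cleaning steps would normally be applied first
findAssocs(line_dtm, "news", corlimit=0.2)         # lower correlation limit on a common term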
Here a word cloud is plotted for the 100 words with the highest frequencies.
##word cloud
words <- names(frequency)
wordcloud(words[1:100], frequency[1:100])
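A common variation is to colour the cloud with an RColorBrewer palette (loaded along with wordcloud), for example:
wordcloud(words[1:100], frequency[1:100], colors=brewer.pal(8, "Dark2"))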
This word/phrase count function is used to calculate term frequencies for the n-gram matrices built with the RWeka library. Since the matrices below are document-term matrices (documents in rows, terms in columns), the counts are summed over columns.
##word/phrase count function
freq_df <- function(dtm){
  # sum each term's counts across documents and sort by frequency
  freq <- sort(colSums(as.matrix(dtm)), decreasing=TRUE)
  freq_df <- data.frame(word=names(freq), freq=freq)
  return(freq_df)
}
The RWeka library is used to build 1-gram, 2-gram and 3-gram (unigram, bigram and trigram) matrices and to plot the n-grams with the highest frequencies.
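Note: on some platforms the combination of tm's parallelised matrix building and RWeka tokenizers is reported to fall back silently to unigrams; a commonly suggested precaution (not required on every system) is to disable parallelism before building the tokenized matrices:
options(mc.cores=1)   # work around reported RWeka/tm parallelism issues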
library(RWeka)
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unidtm <- DocumentTermMatrix(corpus,
                             control = list(tokenize = UnigramTokenizer))
2-grams are calculated here.
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bidtm <- DocumentTermMatrix(corpus,
control = list(tokenize = BigramTokenizer))
bidtm.freq <- freq_df(bidtm)
3-grams are calculated here.
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtm <- DocumentTermMatrix(corpus,
control = list(tokenize = TrigramTokenizer))
tridtm.freq <- freq_df(tridtm)
The most frequent n-grams are plotted here.
##top 5 unigrams
tm_unifreq <- sort(colSums(as.matrix(unidtm)), decreasing=TRUE)
tm_uniwordfreq <- data.frame(word=names(tm_unifreq), freq=tm_unifreq)
paste("Unigrams - Top 5 highest frequencies")
## [1] "Unigrams - Top 5 highest frequencies"
##bigrams
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
tm_biwordfreq <- data.frame(word=names(tm_bifreq), freq=tm_bifreq)
paste("Bigrams - Top 5 highest frequencies")
## [1] "Bigrams - Top 5 highest frequencies"
##trigrams
tm_trifreq <- sort(colSums(as.matrix(tridtm)), decreasing=TRUE)
tm_triwordfreq <- data.frame(word=names(tm_trifreq), freq=tm_trifreq)
paste("Trigrams - Top 5 highest frequencies")
## [1] "Trigrams - Top 5 highest frequencies"
##explore ngrams by frequency
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
tm_uniwordfreq %>%
filter(freq > 500) %>%
ggplot(aes(word,freq)) +
geom_bar(stat="identity") +
ggtitle("Unigrams with frequencies > 500") +
xlab("Unigrams") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=45, hjust=1))
tm_biwordfreq %>%
filter(freq > 50) %>%
ggplot(aes(word,freq)) +
geom_bar(stat="identity") +
ggtitle("Bigrams with frequencies > 50") +
xlab("Bigrams") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=45, hjust=1))
tm_triwordfreq %>%
filter(freq > 5) %>%
ggplot(aes(word,freq)) +
geom_bar(stat="identity") +
ggtitle("Trigrams with frequencies > 5") +
xlab("Trigrams") + ylab("Frequency") +
theme(axis.text.x=element_text(angle=45, hjust=1))