This report is produced in partial fulfillment of the requirements for the Capstone Project offered by the Johns Hopkins Bloomberg School of Public Health and Coursera.
It describes an exploratory data analysis of the Capstone Dataset.
paste("Check file sizes in MB")
## [1] "Check file sizes in MB"
file.info("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")$size / (1024*1024)
## [1] 159.4
file.info("Coursera-SwiftKey/final/en_US/en_US.news.txt")$size / (1024*1024)
## [1] 196.3
file.info("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")$size / (1024*1024)
## [1] 200.4
paste("View Line Counts")
## [1] "View Line Counts"
library(R.utils)
countLines("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
## [1] 2360148
countLines("Coursera-SwiftKey/final/en_US/en_US.news.txt")
## [1] 1010242
countLines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
## [1] 899288
paste("View Word Counts")
## [1] "View Word Counts"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.twitter.txt", stdout=TRUE)
## [1] "173 Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.news.txt", stdout=TRUE)
## [1] "11384 Coursera-SwiftKey/final/en_US/en_US.news.txt"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.blogs.txt", stdout=TRUE)
## [1] "40833 Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
Given the large size of the text files and limited computational resources, a sample is drawn: 10,000 lines per file are randomly selected and saved to disk.
twitter <- readLines('Coursera-SwiftKey/final/en_US/en_US.twitter.txt', encoding = 'UTF-8')
news <- readLines('Coursera-SwiftKey/final/en_US/en_US.news.txt', encoding = 'UTF-8')
blogs <- readLines('Coursera-SwiftKey/final/en_US/en_US.blogs.txt', encoding = 'UTF-8')
set.seed(39)
sampleTwitter <- twitter[sample(1:length(twitter),10000)]
sampleNews <- news[sample(1:length(news),10000)]
sampleBlogs <- blogs[sample(1:length(blogs),10000)]
sampleData <- c(sampleTwitter,sampleNews,sampleBlogs)
# create the sample directory if it does not already exist
dir.create("./sample", showWarnings = FALSE)
writeLines(sampleData, "./sample/sampleData.txt")
# remove temporary variables
rm(twitter,news,blogs,sampleTwitter,sampleNews,sampleBlogs,sampleData)
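As a quick sanity check (not part of the original run), the saved sample file can be verified to contain the expected 30,000 lines and to be of manageable size; countLines is available from the R.utils package loaded above.
# verify the sample written to disk: expected 30000 lines
countLines("./sample/sampleData.txt")
file.info("./sample/sampleData.txt")$size / (1024*1024)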
Using the tm package, a corpus is created from the sampled data. Subsequently, the following transformations are performed:
library(tm)
cname <- file.path(".", "sample")
docs <- Corpus(DirSource(cname))
# convert to lowercase
docs <- tm_map(docs, content_transformer(tolower))
# replace /, @ and | characters with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")
# remove punctuation
docs <- tm_map(docs, removePunctuation)
# remove numbers
docs <- tm_map(docs, removeNumbers)
# strip whitespace
docs <- tm_map(docs, stripWhitespace)
# remove english stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
# initiate stemming
library(SnowballC)
docs <- tm_map(docs, stemDocument)
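To confirm that the transformations behaved as intended, a few cleaned lines can be inspected directly. The sketch below assumes the corpus holds the single sampleData.txt document created above; it was not part of the original run.
# peek at the first few cleaned, stemmed lines of the corpus
head(content(docs[[1]]), 3)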
N-gram models are created to explore word frequencies. Using the RWeka package, unigram, bigram and trigram document-term matrices are built.
library(RWeka)
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unidtm <- DocumentTermMatrix(docs,
                             control = list(tokenize = UnigramTokenizer))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bidtm <- DocumentTermMatrix(docs,
                            control = list(tokenize = BigramTokenizer))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtm <- DocumentTermMatrix(docs,
                             control = list(tokenize = TrigramTokenizer))
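Before looking at individual frequencies, the size of each document-term matrix can be checked; dim() returns the number of documents and the number of distinct n-grams. This check is illustrative and was not run in the original analysis.
# documents x terms for each n-gram matrix
dim(unidtm)
dim(bidtm)
dim(tridtm)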
Below, you can see the top five unigrams, bigrams and trigrams with the highest frequencies.
tm_unifreq <- sort(colSums(as.matrix(unidtm)), decreasing=TRUE)
tm_uniwordfreq <- data.frame(word=names(tm_unifreq), freq=tm_unifreq)
paste("Unigrams - Top 5 highest frequencies")
## [1] "Unigrams - Top 5 highest frequencies"
head(tm_uniwordfreq,5)
## word freq
## said said 2912
## will will 2801
## one one 2613
## like like 2397
## get get 2289
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
tm_biwordfreq <- data.frame(word=names(tm_bifreq), freq=tm_bifreq)
paste("Bigrams - Top 5 highest frequencies")
## [1] "Bigrams - Top 5 highest frequencies"
head(tm_biwordfreq,5)
## word freq
## last year last year 211
## new york new york 176
## high school high school 167
## right now right now 158
## look like look like 154
tm_trifreq <- sort(colSums(as.matrix(tridtm)), decreasing=TRUE)
tm_triwordfreq <- data.frame(word=names(tm_trifreq), freq=tm_trifreq)
paste("Trigrams - Top 5 highest frequencies")
## [1] "Trigrams - Top 5 highest frequencies"
head(tm_triwordfreq,5)
## word freq
## new york citi new york citi 29
## none repeat scroll none repeat scroll 25
## repeat scroll yellow repeat scroll yellow 25
## stylebackground none repeat stylebackground none repeat 25
## cant wait see cant wait see 17
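A related question, not covered by the tables above, is coverage: how many distinct (stemmed) unigrams are needed to account for a given share of all word instances in the sample. Because tm_unifreq is already sorted in decreasing order, a cumulative sum gives a rough answer; the sketch below is illustrative only and was not part of the original report.
# cumulative coverage of the sorted unigram frequencies
coverage <- cumsum(tm_unifreq) / sum(tm_unifreq)
# number of distinct unigrams needed to reach 50% and 90% coverage
sum(coverage < 0.5) + 1
sum(coverage < 0.9) + 1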
The bar charts below show the n-grams that exceed a minimum frequency threshold:
library(ggplot2)
library(dplyr)
tm_uniwordfreq %>%
  filter(freq > 1000) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Unigrams with frequencies > 1000") +
  xlab("Unigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tm_biwordfreq %>%
  filter(freq > 100) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Bigrams with frequencies > 100") +
  xlab("Bigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tm_triwordfreq %>%
  filter(freq > 10) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Trigrams with frequencies > 10") +
  xlab("Trigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
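By default the bars are ordered alphabetically by term. If ordering by frequency is preferred, reorder() can be applied to the x aesthetic; the sketch below shows this for the unigram plot only and is a cosmetic variant, not part of the original analysis.
# same unigram plot, with bars ordered from most to least frequent
tm_uniwordfreq %>%
  filter(freq > 1000) %>%
  ggplot(aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Unigrams with frequencies > 1000 (ordered by frequency)") +
  xlab("Unigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))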
Below, we can see wordclouds of the top 50 unigrams, bigrams and trigrams.
library(wordcloud)
set.seed(39)
wordcloud(names(tm_unifreq), tm_unifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
wordcloud(names(tm_bifreq), tm_bifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
wordcloud(names(tm_trifreq), tm_trifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))