This report is produced in partial fulfillment of the requirements for the Capstone Project offered by the Johns Hopkins Bloomberg School of Public Health and Coursera.
It describes an exploratory data analysis of the Capstone Dataset.
paste("Check file sizes in MB")
## [1] "Check file sizes in MB"
file.info("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")$size / (1024*1024)
## [1] 159.4
file.info("Coursera-SwiftKey/final/en_US/en_US.news.txt")$size / (1024*1024)
## [1] 196.3
file.info("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")$size / (1024*1024)
## [1] 200.4
paste("View Line Counts")
## [1] "View Line Counts"
library(R.utils)
countLines("Coursera-SwiftKey/final/en_US/en_US.twitter.txt")
## [1] 2360148
countLines("Coursera-SwiftKey/final/en_US/en_US.news.txt")
## [1] 1010242
countLines("Coursera-SwiftKey/final/en_US/en_US.blogs.txt")
## [1] 899288
paste("View Word Counts")
## [1] "View Word Counts"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.twitter.txt", stdout=TRUE)
## [1] "173 Coursera-SwiftKey/final/en_US/en_US.twitter.txt"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.news.txt", stdout=TRUE)
## [1] "11384 Coursera-SwiftKey/final/en_US/en_US.news.txt"
system2("wc", args = "-L Coursera-SwiftKey/final/en_US/en_US.blogs.txt", stdout=TRUE)
## [1] "40833 Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
Given the large size of the text files and limited computational resources, a sample is drawn: 10,000 lines per file are randomly selected and saved to disk.
twitter <- readLines('Coursera-SwiftKey/final/en_US/en_US.twitter.txt', encoding = 'UTF-8')
news <- readLines('Coursera-SwiftKey/final/en_US/en_US.news.txt', encoding = 'UTF-8')
blogs <- readLines('Coursera-SwiftKey/final/en_US/en_US.blogs.txt', encoding = 'UTF-8')
set.seed(39)
sampleTwitter <- twitter[sample(1:length(twitter),10000)]
sampleNews <- news[sample(1:length(news),10000)]
sampleBlogs <- blogs[sample(1:length(blogs),10000)]
sampleData <- c(sampleTwitter,sampleNews,sampleBlogs)
# create the sample directory if it does not already exist
dir.create("./sample", showWarnings = FALSE)
writeLines(sampleData, "./sample/sampleData.txt")
# remove temporary variables
rm(twitter,news,blogs,sampleTwitter,sampleNews,sampleBlogs,sampleData)
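As a quick sanity check (not part of the original run), the saved sample file can be verified to contain the expected 30,000 lines and to be of manageable size; countLines is available from the R.utils package loaded above.
# verify the sample written to disk: expected 30000 lines
countLines("./sample/sampleData.txt")
file.info("./sample/sampleData.txt")$size / (1024*1024)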
Using the tm package, a corpus is created from the sampled data. Subsequently, the following transformations are performed:
library(tm)
cname <- file.path(".", "sample")
docs <- Corpus(DirSource(cname))
# convert to lowercase
docs <- tm_map(docs, content_transformer(tolower))
# replace /, @ and | characters with spaces
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
docs <- tm_map(docs, toSpace, "/|@|\\|")
# remove punctuation
docs <- tm_map(docs, removePunctuation)
# remove numbers
docs <- tm_map(docs, removeNumbers)
# strip whitespace
docs <- tm_map(docs, stripWhitespace)
# remove english stop words
docs <- tm_map(docs, removeWords, stopwords("english"))
# initiate stemming
library(SnowballC)
docs <- tm_map(docs, stemDocument)
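To confirm that the transformations behaved as intended, a few cleaned lines can be inspected directly. The sketch below assumes the corpus holds the single sampleData.txt document created above; it was not part of the original run.
# peek at the first few cleaned, stemmed lines of the corpus
head(content(docs[[1]]), 3)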
N-gram models are created to explore word frequencies. Using the RWeka package, unigram, bigram and trigram document-term matrices are built.
library(RWeka)
UnigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 1, max = 1))
unidtm <- DocumentTermMatrix(docs,
                             control = list(tokenize = UnigramTokenizer))
BigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 2, max = 2))
bidtm <- DocumentTermMatrix(docs,
                            control = list(tokenize = BigramTokenizer))
TrigramTokenizer <- function(x) NGramTokenizer(x, Weka_control(min = 3, max = 3))
tridtm <- DocumentTermMatrix(docs,
                             control = list(tokenize = TrigramTokenizer))
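Before looking at individual frequencies, the size of each document-term matrix can be checked; dim() returns the number of documents and the number of distinct n-grams. This check is illustrative and was not run in the original analysis.
# documents x terms for each n-gram matrix
dim(unidtm)
dim(bidtm)
dim(tridtm)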
Below, you can see the top five unigrams, bigrams and trigrams with the highest frequencies.
tm_unifreq <- sort(colSums(as.matrix(unidtm)), decreasing=TRUE)
tm_uniwordfreq <- data.frame(word=names(tm_unifreq), freq=tm_unifreq)
paste("Unigrams - Top 5 highest frequencies")
## [1] "Unigrams - Top 5 highest frequencies"
head(tm_uniwordfreq,5)
## word freq
## said said 2912
## will will 2801
## one one 2613
## like like 2397
## get get 2289
tm_bifreq <- sort(colSums(as.matrix(bidtm)), decreasing=TRUE)
tm_biwordfreq <- data.frame(word=names(tm_bifreq), freq=tm_bifreq)
paste("Bigrams - Top 5 highest frequencies")
## [1] "Bigrams - Top 5 highest frequencies"
head(tm_biwordfreq,5)
## word freq
## last year last year 211
## new york new york 176
## high school high school 167
## right now right now 158
## look like look like 154
tm_trifreq <- sort(colSums(as.matrix(tridtm)), decreasing=TRUE)
tm_triwordfreq <- data.frame(word=names(tm_trifreq), freq=tm_trifreq)
paste("Trigrams - Top 5 highest frequencies")
## [1] "Trigrams - Top 5 highest frequencies"
head(tm_triwordfreq,5)
## word freq
## new york citi new york citi 29
## none repeat scroll none repeat scroll 25
## repeat scroll yellow repeat scroll yellow 25
## stylebackground none repeat stylebackground none repeat 25
## cant wait see cant wait see 17
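A related question, not covered by the tables above, is coverage: how many distinct (stemmed) unigrams are needed to account for a given share of all word instances in the sample. Because tm_unifreq is already sorted in decreasing order, a cumulative sum gives a rough answer; the sketch below is illustrative only and was not part of the original report.
# cumulative coverage of the sorted unigram frequencies
coverage <- cumsum(tm_unifreq) / sum(tm_unifreq)
# number of distinct unigrams needed to reach 50% and 90% coverage
sum(coverage < 0.5) + 1
sum(coverage < 0.9) + 1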
The bar charts below show the n-grams that exceed a minimum frequency threshold:
library(ggplot2)
library(dplyr)
tm_uniwordfreq %>%
  filter(freq > 1000) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Unigrams with frequencies > 1000") +
  xlab("Unigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tm_biwordfreq %>%
  filter(freq > 100) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Bigrams with frequencies > 100") +
  xlab("Bigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
tm_triwordfreq %>%
  filter(freq > 10) %>%
  ggplot(aes(word, freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Trigrams with frequencies > 10") +
  xlab("Trigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
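By default the bars are ordered alphabetically by term. If ordering by frequency is preferred, reorder() can be applied to the x aesthetic; the sketch below shows this for the unigram plot only and is a cosmetic variant, not part of the original analysis.
# same unigram plot, with bars ordered from most to least frequent
tm_uniwordfreq %>%
  filter(freq > 1000) %>%
  ggplot(aes(reorder(word, -freq), freq)) +
  geom_bar(stat = "identity") +
  ggtitle("Unigrams with frequencies > 1000 (ordered by frequency)") +
  xlab("Unigrams") + ylab("Frequency") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))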
Below, we can see wordclouds of the top 50 unigrams, bigrams and trigrams.
library(wordcloud)
set.seed(39)
wordcloud(names(tm_unifreq), tm_unifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
wordcloud(names(tm_bifreq), tm_bifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))
wordcloud(names(tm_trifreq), tm_trifreq, max.words=50, scale=c(5, .1), colors=brewer.pal(6, "Dark2"))