0. Background

This is the Milestone Report for the Capstone Project of the Johns Hopkins University Data Science Specialization. The report presents an exploratory data analysis of the English corpus provided by JHU and SwiftKey, and outlines the plan for the final product of the capstone project.

1. Read in Data

# Read the three English corpora; skipNul = TRUE avoids embedded nul characters in the Twitter file
blogs <- suppressWarnings(readLines("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt", encoding = "UTF-8"))
news <- suppressWarnings(readLines("./Coursera-SwiftKey/final/en_US/en_US.news.txt"))
twitter <- suppressWarnings(readLines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt", skipNul = TRUE))
# Re-encode any stray WINDOWS-1252 characters as UTF-8
blogs <- iconv(blogs, "WINDOWS-1252", "UTF-8")
news <- iconv(news, "WINDOWS-1252", "UTF-8")
twitter <- iconv(twitter, "WINDOWS-1252", "UTF-8")
object.size(blogs)
## 271291112 bytes
length(blogs)
## [1] 899288
object.size(news)
## 20828640 bytes
str(news)
##  chr [1:77259] "He wasn't home alone, apparently." ...
object.size(twitter)
## 334993504 bytes
str(twitter)
##  chr [1:2360148] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long." ...

The three datasets blogs, news and twitter are read into R. Each line of a file is stored as a single character string, so the blogs dataset with 899,288 lines becomes a character vector of length 899,288. The objects are large, so the data need to be sampled and preprocessed before analysis.
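As a quick sanity check (not part of the original report), a single element can be inspected to confirm that it holds one complete line of text:

blogs[1]          # the full first blog entry, stored as one string
nchar(blogs[1])   # number of characters in that entry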

ind_b <- sample(1:length(blogs),5000, replace = FALSE)
blogs <- blogs[ind_b]
length(blogs)
## [1] 5000
ind_n <- sample(1:length(news),5000, replace= FALSE)
news <- news[ind_n]
length(news)
## [1] 5000
ind_t <- sample(1:length(twitter),5000, replace= FALSE)
twitter <- twitter[ind_t]
length(twitter)
## [1] 5000
suppressMessages(library(tm))
suppressMessages(library(qdap))
blogs.corp <- VectorSource(blogs)
blogs.corp <- VCorpus(blogs.corp)
news.corp <- VectorSource(news)
news.corp <- VCorpus(news.corp)
twitter.corp <- VectorSource(twitter)
twitter.corp <- VCorpus(twitter.corp)

Since the original datasets contain far too many lines to process comfortably, 5,000 lines are sampled from each dataset without replacement, replacing the originals, and each sample is then converted to a tm VCorpus.
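Note that sample() is random and no seed is set, so the sampled lines (and all counts below) will differ between runs. A reproducible variant is sketched here; the seed value and the name blogs_sample are illustrative only:

set.seed(1234)                           # arbitrary seed, chosen for reproducibility
ind_b <- sample(seq_along(blogs), 5000)  # sample() is without replacement by default
blogs_sample <- blogs[ind_b]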

2. Preprocess

The preprocessing function clean_corp() is defined below. Calling this function with a corpus as its argument will

transform the text to lowercase,

remove punctuation,

collapse extra whitespace,

remove numbers,

replace symbols with words (via qdap's replace_symbol) and

remove English stopwords along with a few very frequent filler words ("one", "will", "can", etc.).

clean_corp <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower))         # lowercase
  corpus <- tm_map(corpus, removePunctuation)                    # drop punctuation
  corpus <- tm_map(corpus, stripWhitespace)                      # collapse repeated whitespace
  corpus <- tm_map(corpus, removeNumbers)                        # drop digits
  corpus <- tm_map(corpus, content_transformer(replace_symbol))  # qdap: replace symbols (%, @, ...) with words
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "one", "will", "can", "just", "like", "also", "really", "even", "much", "first"))  # English stopwords plus filler words
  return(corpus)
}

clean_blogs <- clean_corp(blogs.corp)
clean_blogs[[1]][1]
## $content
## [1] "andy    christmascroogey   minute     got  christmas tree  stopped complaining    damn christmas music  freakin christmas decorations everywhere  decided  go  toys r us  get two presents  toys  tots    girl     boy andy declared   present   better  boy toys  cooler     um lol   girls  dress  things  dolls  pretend makeup  motherfuckin barbie  runs  entire country  shit  guys shes  police officer  teacher  doctor  streetwalker bitch  busy"
clean_news <- clean_corp(news.corp)
clean_news[[1]][1]
## $content
## [1] " bulls introduced  ncaa tournament   big easta€<U+2122>s nastiest defense putting     chill   california team   never seen anything   south florida allowed  points    half  brushed  way  victory"
clean_twitter <- clean_corp(twitter.corp)
clean_twitter[[1]][1]
## $content
## [1] "dont forget  check   foursquare  youre  campus"

Convert each cleaned corpus to a term-document matrix.

clean_blogs_tdm <- TermDocumentMatrix(clean_blogs)
clean_blogs.m <- as.matrix(clean_blogs_tdm)
clean_news_tdm <- TermDocumentMatrix(clean_news)
clean_news.m <- as.matrix(clean_news_tdm)
clean_twitter_tdm <- TermDocumentMatrix(clean_twitter)
clean_twitter.m <- as.matrix(clean_twitter_tdm)

A word frequency matrix, giving the total count of each term across all documents, is created for each corpus; the first ten rows are shown below for blogs, news and twitter respectively:
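The code that produced these tables is not shown in the original report. A minimal sketch that reproduces this kind of display (the column name "all" and the use of head() are assumptions made to match the printed output) is:

blogs_wfm <- as.matrix(rowSums(clean_blogs.m))    # total count of each term over all blog documents
colnames(blogs_wfm) <- "all"
head(blogs_wfm, 10)
news_wfm <- as.matrix(rowSums(clean_news.m))
colnames(news_wfm) <- "all"
head(news_wfm, 10)
twitter_wfm <- as.matrix(rowSums(clean_twitter.m))
colnames(twitter_wfm) <- "all"
head(twitter_wfm, 10)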

##                     all
## a                  5064
## aaaaare               1
## aaaah                 1
## aaae                  1
## aah                   2
## aamir璽<e2><e2>s    1
## aaron                 2
## abalone               1
## abandon               2
## abandoned             6
##              all
## a           4444
## aa             1
## aaa            3
## aapl           1
## aaron          9
## abandon        3
## abandoned      5
## abandoning     2
## abandonment    1
## abbasiyah      3
##             all
## a          1345
## aa            1
## aaarrrggg     1
## aamc          1
## aapl          1
## aaron         2
## abba          1
## abbreviate    1
## abc           2
## abd           1

3. Exploratory Data Analysis

1-gram

Barplot

blogs_freq <- rowSums(clean_blogs.m)
blogs_freq <- sort(blogs_freq, decreasing = TRUE)
barplot(blogs_freq[1:10], col= "blue", las= 2, main= "Blogs")

news_freq <- rowSums(clean_news.m)
news_freq <- sort(news_freq, decreasing = TRUE)
barplot(news_freq[1:10], col= "green", las= 2, main= "News")

twitter_freq <- rowSums(clean_twitter.m)
twitter_freq <- sort(twitter_freq, decreasing = TRUE)
barplot(twitter_freq[1:10], col= "red", las= 2, main= "Twitter")

Word Cloud

blogs_freq <- rowSums(clean_blogs.m)
blogs_freq <- data.frame(term= names(blogs_freq), num= blogs_freq)
library(wordcloud)
wordcloud(blogs_freq$term, blogs_freq$num, max.words=50, colors="blue")

news_freq <- rowSums(clean_news.m)
news_freq <- data.frame(term= names(news_freq), num= news_freq)
library(wordcloud)
wordcloud(news_freq$term, news_freq$num, max.words=50, colors="green")

twitter_freq <- rowSums(clean_twitter.m)
twitter_freq <- data.frame(term= names(twitter_freq), num= twitter_freq)
library(wordcloud)
wordcloud(twitter_freq$term, twitter_freq$num, max.words=50, colors="red")
## Warning in wordcloud(twitter_freq$term, twitter_freq$num, max.words = 50, :
## love could not be fit on page. It will not be plotted.
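The warning means the largest word ("love") did not fit on the page at the default text scaling. One common workaround, not applied in the original run, is to shrink the scale argument so the most frequent words are drawn smaller, for example:

wordcloud(twitter_freq$term, twitter_freq$num, max.words = 50,
          scale = c(3, 0.5), colors = "red")   # smaller maximum font size than the default scale = c(4, 0.5)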

Clustering

clean_blogs_tdm
## <<TermDocumentMatrix (terms: 21760, documents: 5000)>>
## Non-/sparse entries: 94641/108705359
## Sparsity           : 100%
## Maximal term length: 48
## Weighting          : term frequency (tf)
clean_news_tdm
## <<TermDocumentMatrix (terms: 20989, documents: 5000)>>
## Non-/sparse entries: 88763/104856237
## Sparsity           : 100%
## Maximal term length: 39
## Weighting          : term frequency (tf)
clean_twitter_tdm
## <<TermDocumentMatrix (terms: 9899, documents: 5000)>>
## Non-/sparse entries: 32770/49462230
## Sparsity           : 100%
## Maximal term length: 44
## Weighting          : term frequency (tf)

The sparsity of these matrices must be reduced before a dendrogram can be plotted, so sparse terms are removed first.

clean_blogs_tdm2 <- removeSparseTerms(clean_blogs_tdm, sparse= 0.95)
clean_blogs_tdm2.m <- as.matrix(clean_blogs_tdm2)
blogs_dist <- dist(clean_blogs_tdm2.m)
hc <- hclust(blogs_dist)
plot(hc, main= "Blogs")

clean_news_tdm2 <- removeSparseTerms(clean_news_tdm, sparse= 0.95)
clean_news_tdm2.m <- as.matrix(clean_news_tdm2)
news_dist <- dist(clean_news_tdm2.m)
hc <- hclust(news_dist)
plot(hc, main= "News")

2-gram model

ind_b <- sample(1:length(blogs),1000, replace = FALSE)
blogs2 <- blogs[ind_b]
length(blogs2)
## [1] 1000
ind_n <- sample(1:length(news),1000, replace= FALSE)
news2 <- news[ind_n]
length(news2)
## [1] 1000
ind_t <- sample(1:length(twitter),1000, replace= FALSE)
twitter2 <- twitter[ind_t]
length(twitter2)
## [1] 1000
blogs.corp2 <- VectorSource(blogs2)
blogs.corp2 <- VCorpus(blogs.corp2)
news.corp2 <- VectorSource(news2)
news.corp2 <- VCorpus(news.corp2)
twitter.corp2 <- VectorSource(twitter2)
twitter.corp2 <- VCorpus(twitter.corp2)

clean_blogs2 <- clean_corp(blogs.corp2)
clean_news2 <- clean_corp(news.corp2)
clean_twitter2 <- clean_corp(twitter.corp2)
library(RWeka)
# Bigram tokenizer for use with TermDocumentMatrix
tokenizer <- function(corpus){
  NGramTokenizer(corpus, Weka_control(min = 2, max = 2))}
biblogs_tdm <- TermDocumentMatrix(clean_blogs2, control = list(tokenize = tokenizer))

biblogs_tdm.m <- as.matrix(biblogs_tdm)
biblogs_freq <- rowSums(biblogs_tdm.m)
biblogs_freq <- sort(biblogs_freq, decreasing = TRUE)
barplot(biblogs_freq[1:20], col="blue", main= "Blogs(2-gram)")

# Reuse the bigram tokenizer defined above
binews_tdm <- TermDocumentMatrix(clean_news2, control = list(tokenize = tokenizer))

binews_tdm.m <- as.matrix(binews_tdm)
binews_freq <- rowSums(binews_tdm.m)
binews_freq <- sort(binews_freq, decreasing = TRUE)
barplot(binews_freq[1:20], col="blue", main= "News(2-gram)")

# Reuse the bigram tokenizer defined above
bitwitter_tdm <- TermDocumentMatrix(clean_twitter2, control = list(tokenize = tokenizer))

bitwitter_tdm.m <- as.matrix(bitwitter_tdm)
bitwitter_freq <- rowSums(bitwitter_tdm.m)
bitwitter_freq <- sort(bitwitter_freq, decreasing = TRUE)
barplot(bitwitter_freq[1:20], col="blue", main= "Twitter(2-gram)")
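The same approach extends directly to higher-order n-grams by changing the Weka_control bounds. As a sketch (not run in this report; tri_tokenizer and triblogs_tdm are illustrative names), a 3-gram term-document matrix for the blogs sample could be built with:

tri_tokenizer <- function(corpus){
  NGramTokenizer(corpus, Weka_control(min = 3, max = 3))}
triblogs_tdm <- TermDocumentMatrix(clean_blogs2, control = list(tokenize = tri_tokenizer))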

4. Conclusion

The barplots and word clouds make it easy to identify the most frequently occurring words in each corpus, and the dendrograms show how the most common words cluster together within the three documents. These word and n-gram frequency tables will serve as the building blocks of the next-word prediction model planned as the final product of this capstone project.