This is the Milestone Report for the Capstone Project of the Johns Hopkins University Data Science Specialization. It presents an exploratory data analysis of the English corpus data provided by JHU and SwiftKey, and outlines the plan for the final product of the capstone project.
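# Read in the three raw English text files; skipNul = TRUE skips embedded nul characters in the Twitter file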
blogs <- suppressWarnings(readLines("./Coursera-SwiftKey/final/en_US/en_US.blogs.txt",encoding="UTF-8"))
news <- suppressWarnings(readLines("./Coursera-SwiftKey/final/en_US/en_US.news.txt"))
twitter <- suppressWarnings(readLines("./Coursera-SwiftKey/final/en_US/en_US.twitter.txt", skipNul = TRUE))
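# Re-encode each vector from WINDOWS-1252 to UTF-8 so non-ASCII characters are handled consistently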
blogs<-iconv(blogs,"WINDOWS-1252","UTF-8")
news<- iconv(news, "WINDOWS-1252","UTF-8")
twitter<- iconv(twitter, "WINDOWS-1252","UTF-8")
object.size(blogs)
## 271291112 bytes
length(blogs)
## [1] 899288
object.size(news)
## 20828640 bytes
str(news)
## chr [1:77259] "He wasn't home alone, apparently." ...
object.size(twitter)
## 334993504 bytes
str(twitter)
## chr [1:2360148] "How are you? Btw thanks for the RT. You gonna be in DC anytime soon? Love to see you. Been way, way too long." ...
The three datasets blogs, news and twitter are read into R. Each line of text is stored as a single character string, i.e. the blogs dataset contains 899,288 lines and therefore appears in R as a character vector of length 899288. The data must be preprocessed before analysis.
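Beyond object sizes and line counts, a rough word count per source gives a fuller picture of the corpus size. The following base-R sketch (not part of the original analysis) splits each line on whitespace and totals the pieces:
# Approximate total word count per corpus
sum(lengths(strsplit(blogs, "\\s+")))
sum(lengths(strsplit(news, "\\s+")))
sum(lengths(strsplit(twitter, "\\s+")))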
ind_b <- sample(1:length(blogs),5000, replace = FALSE)
blogs <- blogs[ind_b]
length(blogs)
## [1] 5000
ind_n <- sample(1:length(news),5000, replace= FALSE)
news <- news[ind_n]
length(news)
## [1] 5000
ind_t <- sample(1:length(twitter),5000, replace= FALSE)
twitter <- twitter[ind_t]
length(twitter)
## [1] 5000
suppressMessages(library(tm))
suppressMessages(library(qdap))
blogs.corp <- VectorSource(blogs)
blogs.corp <- VCorpus(blogs.corp)
news.corp <- VectorSource(news)
news.corp <- VCorpus(news.corp)
twitter.corp <- VectorSource(twitter)
twitter.corp <- VCorpus(twitter.corp)
Since the original datasets contain far too many lines to process comfortably, 5,000 lines are sampled from each without replacement to replace the originals, and each sample is converted to a tm VCorpus.
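Note that no random seed is set before sampling, so the exact lines drawn will differ between runs. A reproducible version of the sampling step would begin, for example, with:
set.seed(1234)  # any fixed seed makes the subsequent sample() calls reproducible
ind_b <- sample(seq_along(blogs), 5000, replace = FALSE)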
The preprocessing function clean_corp() is defined as follows. Called with a corpus as its argument, it will
transform the text to lowercase,
remove punctuation,
strip extra whitespace,
remove numbers,
replace symbols with their word equivalents, and
remove English stopwords together with a set of very common custom words.
clean_corp <- function(corpus){
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, stripWhitespace)
  corpus <- tm_map(corpus, removeNumbers)
  # note: removePunctuation has already stripped most symbols by this point
  corpus <- tm_map(corpus, content_transformer(replace_symbol))
  corpus <- tm_map(corpus, removeWords, c(stopwords("en"), "one", "will", "can", "just",
                                          "like", "also", "really", "even", "much", "first"))
  return(corpus)
}
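As an illustration of the symbol-replacement step (the example string is arbitrary), qdap::replace_symbol() turns common symbols such as % and @ into their word equivalents:
replace_symbol("Meet me @ the cafe for a 20% discount")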
clean_blogs <- clean_corp(blogs.corp)
clean_blogs[[1]][1]
## $content
## [1] "andy christmascroogey minute got christmas tree stopped complaining damn christmas music freakin christmas decorations everywhere decided go toys r us get two presents toys tots girl boy andy declared present better boy toys cooler um lol girls dress things dolls pretend makeup motherfuckin barbie runs entire country shit guys shes police officer teacher doctor streetwalker bitch busy"
clean_news <- clean_corp(news.corp)
clean_news[[1]][1]
## $content
## [1] " bulls introduced ncaa tournament big easta€<U+2122>s nastiest defense putting chill california team never seen anything south florida allowed points half brushed way victory"
clean_twitter <- clean_corp(twitter.corp)
clean_twitter[[1]][1]
## $content
## [1] "dont forget check foursquare youre campus"
Convert each cleaned corpus to a Term-Document Matrix.
clean_blogs_tdm <- TermDocumentMatrix(clean_blogs)
clean_blogs.m <- as.matrix(clean_blogs_tdm)
clean_news_tdm <- TermDocumentMatrix(clean_news)
clean_news.m <- as.matrix(clean_news_tdm)
clean_twitter_tdm <- TermDocumentMatrix(clean_twitter)
clean_twitter.m <- as.matrix(clean_twitter_tdm)
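Note that as.matrix() expands each sparse term-document matrix into a dense matrix of over 100 million cells. For larger samples, the row sums could instead be computed directly on the sparse matrix with the slam package (on which tm depends), for example:
# Same word frequencies, without building a dense matrix
blogs_freq_sparse <- slam::row_sums(clean_blogs_tdm)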
Create a word-frequency table for each corpus; the first ten terms (in alphabetical order) of the blogs, news and twitter samples are shown below:
## all
## a 5064
## aaaaare 1
## aaaah 1
## aaae 1
## aah 2
## aamir璽<e2><e2>s 1
## aaron 2
## abalone 1
## abandon 2
## abandoned 6
## all
## a 4444
## aa 1
## aaa 3
## aapl 1
## aaron 9
## abandon 3
## abandoned 5
## abandoning 2
## abandonment 1
## abbasiyah 3
## all
## a 1345
## aa 1
## aaarrrggg 1
## aamc 1
## aapl 1
## aaron 2
## abba 1
## abbreviate 1
## abc 2
## abd 1
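The chunk that produced the three tables above is not echoed in the report; a plausible reconstruction, using the matrices built earlier (one column named all, terms in alphabetical order), is:
head(data.frame(all = rowSums(clean_blogs.m)), 10)
head(data.frame(all = rowSums(clean_news.m)), 10)
head(data.frame(all = rowSums(clean_twitter.m)), 10)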
blogs_freq <- rowSums(clean_blogs.m)
blogs_freq <- sort(blogs_freq, decreasing = TRUE)
barplot(blogs_freq[1:10], col= "blue", las= 2, main= "Blogs")
news_freq <- rowSums(clean_news.m)
news_freq <- sort(news_freq, decreasing = TRUE)
barplot(news_freq[1:10], col= "green", las= 2, main= "News")
twitter_freq <- rowSums(clean_twitter.m)
twitter_freq <- sort(twitter_freq, decreasing = TRUE)
barplot(twitter_freq[1:10], col= "red", las= 2, main= "Twitter")
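The counts behind each bar plot can also be inspected directly, for example:
head(blogs_freq, 10)  # the ten most frequent terms in the blogs sample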
blogs_freq <- rowSums(clean_blogs.m)
blogs_freq <- data.frame(term= names(blogs_freq), num= blogs_freq)
library(wordcloud)
wordcloud(blogs_freq$term, blogs_freq$num, max.words=50, colors="blue")
news_freq <- rowSums(clean_news.m)
news_freq <- data.frame(term= names(news_freq), num= news_freq)
library(wordcloud)
wordcloud(news_freq$term, news_freq$num, max.words=50, colors="green")
twitter_freq <- rowSums(clean_twitter.m)
twitter_freq <- data.frame(term= names(twitter_freq), num= twitter_freq)
library(wordcloud)
wordcloud(twitter_freq$term, twitter_freq$num, max.words=50, colors="red")
## Warning in wordcloud(twitter_freq$term, twitter_freq$num, max.words = 50, :
## love could not be fit on page. It will not be plotted.
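The warning means the most frequent word did not fit at the default text size. One way to avoid it (a tweak not applied here) is to shrink the scale argument of wordcloud():
# Smaller scale lets the largest words fit on the plotting device
wordcloud(twitter_freq$term, twitter_freq$num, max.words = 50, colors = "red", scale = c(3, 0.5))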
clean_blogs_tdm
## <<TermDocumentMatrix (terms: 21760, documents: 5000)>>
## Non-/sparse entries: 94641/108705359
## Sparsity : 100%
## Maximal term length: 48
## Weighting : term frequency (tf)
clean_news_tdm
## <<TermDocumentMatrix (terms: 20989, documents: 5000)>>
## Non-/sparse entries: 88763/104856237
## Sparsity : 100%
## Maximal term length: 39
## Weighting : term frequency (tf)
clean_twitter_tdm
## <<TermDocumentMatrix (terms: 9899, documents: 5000)>>
## Non-/sparse entries: 32770/49462230
## Sparsity : 100%
## Maximal term length: 44
## Weighting : term frequency (tf)
The sparsity of the term-document matrices must be reduced before plotting dendrograms.
clean_blogs_tdm2 <- removeSparseTerms(clean_blogs_tdm, sparse= 0.95)
clean_blogs_tdm2.m <- as.matrix(clean_blogs_tdm2)
blogs_dist <- dist(clean_blogs_tdm2.m)
hc <- hclust(blogs_dist)
plot(hc, main= "Blogs")
clean_news_tdm2 <- removeSparseTerms(clean_news_tdm, sparse= 0.95)
clean_news_tdm2.m <- as.matrix(clean_news_tdm2)
news_dist <- dist(clean_news_tdm2.m)
hc <- hclust(news_dist)
plot(hc, main= "News")
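# Draw smaller samples of 1,000 lines from each (already sampled) dataset for the bigram analysis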
ind_b <- sample(1:length(blogs),1000, replace = FALSE)
blogs2 <- blogs[ind_b]
length(blogs2)
## [1] 1000
ind_n <- sample(1:length(news),1000, replace= FALSE)
news2 <- news[ind_n]
length(news2)
## [1] 1000
ind_t <- sample(1:length(twitter),1000, replace= FALSE)
twitter2 <- twitter[ind_t]
length(twitter2)
## [1] 1000
blogs.corp2 <- VectorSource(blogs2)
blogs.corp2 <- VCorpus(blogs.corp2)
news.corp2 <- VectorSource(news2)
news.corp2 <- VCorpus(news.corp2)
twitter.corp2 <- VectorSource(twitter2)
twitter.corp2 <- VCorpus(twitter.corp2)
clean_blogs2 <- clean_corp(blogs.corp2)
clean_news2 <- clean_corp(news.corp2)
clean_twitter2 <- clean_corp(twitter.corp2)
library(RWeka)
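# Bigram tokenizer: RWeka's NGramTokenizer with min = max = 2 extracts two-word sequences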
tokenizer <- function(corpus){
  NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
}
biblogs_tdm <- TermDocumentMatrix(clean_blogs2, control=list(tokenize= tokenizer))
biblogs_tdm.m <- as.matrix(biblogs_tdm)
biblogs_freq <- rowSums(biblogs_tdm.m)
biblogs_freq <- sort(biblogs_freq, decreasing = TRUE)
barplot(biblogs_freq[1:20], col="blue", main= "Blogs(2-gram)")
tokenizer <- function(corpus){
  NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
}
binews_tdm <- TermDocumentMatrix(clean_news2, control=list(tokenize= tokenizer))
binews_tdm.m <- as.matrix(binews_tdm)
binews_freq <- rowSums(binews_tdm.m)
binews_freq <- sort(binews_freq, decreasing = TRUE)
barplot(binews_freq[1:20], col="blue", main= "News(2-gram)")
tokenizer <- function(corpus){
  NGramTokenizer(corpus, Weka_control(min = 2, max = 2))
}
bitwitter_tdm <- TermDocumentMatrix(clean_twitter2, control=list(tokenize= tokenizer))
bitwitter_tdm.m <- as.matrix(bitwitter_tdm)
bitwitter_freq <- rowSums(bitwitter_tdm.m)
bitwitter_freq <- sort(bitwitter_freq, decreasing = TRUE)
barplot(bitwitter_freq[1:20], col="blue", main= "Twitter(2-gram)")
The bar plots and word clouds make it easy to identify the most frequently occurring words in each corpus, and the dendrograms reveal clustering relationships among the most common terms across the three sources.
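As a sketch of the planned final product (the helper predict_next() and its logic are hypothetical, not part of this analysis), the bigram frequencies computed above can already be queried for the most likely next word given a single preceding word:
# Hypothetical helper: return the n most frequent completions of a given first word,
# using a named vector of bigram counts such as biblogs_freq
predict_next <- function(word, bigram_freq, n = 3){
  pattern <- paste0("^", word, " ")
  hits <- bigram_freq[grepl(pattern, names(bigram_freq))]
  head(sub(pattern, "", names(sort(hits, decreasing = TRUE))), n)
}
predict_next("new", biblogs_freq)
A real predictor will also have to handle unseen word pairs, for example by backing off to unigram frequencies, but this illustrates how the n-gram counts will feed the final application.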