This article is part of the Data Science Capstone course offered by JHU in partnership with SwiftKey.
Its goal is to demonstrate the exploratory data analysis and modeling steps
of a text-mining project on the provided datasets.
##Step 0 : Load the relevant packages.
library(tm)        # text-mining infrastructure
library(stringr)   # string helpers for word counts
library(wordcloud) # word cloud plots (loads RColorBrewer for brewer.pal)
library(ngram)     # n-gram construction for the modeling step
##Step 1 : Import the data into RStudio.
usblog <- "en_US/en_US.blogs.txt"
usnews <- "en_US/en_US.news.txt"
ustwitter <- "en_US/en_US.twitter.txt"
usblog_txt <- file(usblog, "r")
usnews_txt <- file(usnews, "r")
ustwitter_txt <- file(ustwitter, "r")
usblogdat <- readLines(usblog_txt)
usnewsdat <- readLines(usnews_txt)
ustwitterdat <- readLines(ustwitter_txt)
close(usblog_txt)
close(usnews_txt)
close(ustwitter_txt)
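Note that en_US.news.txt contains embedded nul characters, which can make readLines() stop early on some platforms (the comparatively low line count reported for the news file below is a symptom of this). A more defensive reader, sketched here with a hypothetical helper name, opens the connection in binary mode and skips nuls:
read_corpus_file <- function(path) {
  con <- file(path, open = "rb")  # binary mode avoids an early EOF at embedded nuls
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
## e.g. usnewsdat <- read_corpus_file(usnews)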
##Step 2 : Basic summary of the datasets.
## Object size (in bytes) of each dataset in memory
data.frame(File = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           Size = c(object.size(usblogdat),
                    object.size(usnewsdat),
                    object.size(ustwitterdat)))
## File Size
## 1 en_US.blogs 267758632
## 2 en_US.news 20729472
## 3 en_US.twitter 334484736
## Line count of each dataset
data.frame(File = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           Lines = c(length(usblogdat), length(usnewsdat), length(ustwitterdat)))
## File Lines
## 1 en_US.blogs 899288
## 2 en_US.news 77259
## 3 en_US.twitter 2360148
## Word count of each dataset
data.frame(File = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           Words = c(sum(str_count(usblogdat, "\\w+")),
                     sum(str_count(usnewsdat, "\\w+")),
                     sum(str_count(ustwitterdat, "\\w+"))))
## File Words
## 1 en_US.blogs 38613679
## 2 en_US.news 2755266
## 3 en_US.twitter 31119663
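The three data.frame calls above repeat the same pattern, so a small helper (a sketch; corpus_summary is a hypothetical name, and the conversion to megabytes is my own choice) can build the whole summary in one pass:
corpus_summary <- function(dat) {
  c(SizeMB = round(as.numeric(object.size(dat)) / 1024^2, 1),
    Lines  = length(dat),
    Words  = sum(str_count(dat, "\\w+")))
}
t(sapply(list(blogs = usblogdat, news = usnewsdat, twitter = ustwitterdat),
         corpus_summary))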
##Step 3 : Create a sample of each dataset.
## Sample 2,000 lines from each dataset and build a corpus
set.seed(1234) # any fixed seed makes the samples reproducible
sample_blog <- sample(usblogdat, 2000)
sample_news <- sample(usnewsdat, 2000)
sample_twitter <- sample(ustwitterdat, 2000)
sample_blog <- Corpus(VectorSource(sample_blog))
sample_news <- Corpus(VectorSource(sample_news))
sample_twitter <- Corpus(VectorSource(sample_twitter))
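Since the eventual model should learn from all three text registers, it can also help to pool the samples into a single corpus. A minimal sketch, assuming the same 2,000-line samples as above:
sample_all <- c(sample(usblogdat, 2000),
                sample(usnewsdat, 2000),
                sample(ustwitterdat, 2000))
sample_all <- Corpus(VectorSource(sample_all))  # one corpus spanning blogs, news and tweets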
##Step 4 : Preprocess the sampled datasets (clean the text).
## Convert to lower case; remove punctuation, numbers and stopwords; strip extra whitespace
## content_transformer() is needed so tolower returns a proper tm document (tm >= 0.6)
sample_blog <- tm_map(sample_blog, content_transformer(tolower))
sample_blog <- tm_map(sample_blog, removePunctuation)
sample_blog <- tm_map(sample_blog, removeNumbers)
sample_blog <- tm_map(sample_blog, removeWords, stopwords("en"))
sample_blog <- tm_map(sample_blog, stripWhitespace)
sample_news <- tm_map(sample_news, content_transformer(tolower))
sample_news <- tm_map(sample_news, removePunctuation)
sample_news <- tm_map(sample_news, removeNumbers)
sample_news <- tm_map(sample_news, removeWords, stopwords("en"))
sample_news <- tm_map(sample_news, stripWhitespace)
sample_twitter <- tm_map(sample_twitter, content_transformer(tolower))
sample_twitter <- tm_map(sample_twitter, removePunctuation)
sample_twitter <- tm_map(sample_twitter, removeNumbers)
sample_twitter <- tm_map(sample_twitter, removeWords, stopwords("en"))
sample_twitter <- tm_map(sample_twitter, stripWhitespace)
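The same five transformations are applied to each corpus, so the repetition can be factored into a helper (a sketch; clean_corpus is a hypothetical name wrapping exactly the tm_map calls shown above):
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  tm_map(corpus, stripWhitespace)
}
sample_blog <- clean_corpus(sample_blog)
sample_news <- clean_corpus(sample_news)
sample_twitter <- clean_corpus(sample_twitter)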
##Step 5 : Build a document-term matrix to rank word counts in each dataset.
blogmat <- as.matrix(DocumentTermMatrix(sample_blog))
blogwordrank <- sort(colSums(blogmat), decreasing = TRUE)
head(blogwordrank, n = 10)
## one just will like time can get now know people
## 304 255 245 228 196 194 173 139 130 127
newsmat <- as.matrix(DocumentTermMatrix(sample_news))
newswordrank <- sort(colSums(newsmat), decreasing = TRUE)
head(newswordrank, n = 10)
## said will one new year also just can two state
## 464 205 156 145 120 117 112 109 105 102
twittermat <- as.matrix(DocumentTermMatrix(sample_twitter))
twitterwordrank <- sort(colSums(twittermat), decreasing = TRUE)
head(twitterwordrank, n = 10)
## get like just good love dont day thanks can will
## 104 101 100 99 92 84 82 78 77 74
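Converting a document-term matrix to a dense matrix with as.matrix() is fine for a 2,000-line sample but can exhaust memory on larger samples. The slam package (which tm uses internally for its sparse matrices) computes the same column sums without densifying; a sketch for the blog sample:
blogdtm <- DocumentTermMatrix(sample_blog)
blogwordrank <- sort(slam::col_sums(blogdtm), decreasing = TRUE)  # sparse column sums
head(blogwordrank, n = 10)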
##Step 6 : Plot the results as word clouds.
wordcloud(head(names(blogwordrank), 200), head(blogwordrank, 200),
          scale = c(3.5, 0.25), colors = brewer.pal(8, "Dark2"), random.order = FALSE)
wordcloud(head(names(newswordrank), 200), head(newswordrank, 200),
          scale = c(3.5, 0.25), colors = brewer.pal(8, "Dark2"), random.order = FALSE)
wordcloud(head(names(twitterwordrank), 200), head(twitterwordrank, 200),
          scale = c(3.5, 0.25), colors = brewer.pal(8, "Dark2"), random.order = FALSE)
At this point I have some insight into the most frequent words in each of the three datasets,
so I can move on to building n-grams from the samples.
This is an important step toward the predictive model to be built down the road.
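As a preview of that step, here is a minimal bigram sketch using the ngram package loaded in Step 0. It assumes the cleaned blog corpus from Step 4; sapply(..., as.character) pulls the plain text back out of the tm corpus, and get.phrasetable() ranks the resulting bigrams by frequency:
blog_text <- sapply(sample_blog, as.character)       # plain character vector
blog_bigrams <- ngram(concatenate(blog_text), n = 2) # build all 2-grams
head(get.phrasetable(blog_bigrams), n = 10)          # top bigrams with counts and proportions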