This article is part of the Data Science Capstone course offered by JHU in partnership with SwiftKey.
Its goal is to demonstrate the exploratory data analysis and modeling steps
of a text-mining project on the provided datasets.
##Step 0 : Load the relevant packages.
library(tm)        # text-mining infrastructure
library(stringr)   # string helpers for word counts
library(wordcloud) # word cloud plots (loads RColorBrewer for brewer.pal)
library(ngram)     # n-gram construction for the modeling step
##Step 1 : Import the data into RStudio.
usblog <- "en_US/en_US.blogs.txt"
usnews <- "en_US/en_US.news.txt"
ustwitter <- "en_US/en_US.twitter.txt"
usblog_txt <- file(usblog, "r")
usnews_txt <- file(usnews, "r")
ustwitter_txt <- file(ustwitter, "r")
usblogdat <- readLines(usblog_txt)
usnewsdat <- readLines(usnews_txt)
ustwitterdat <- readLines(ustwitter_txt)
close(usblog_txt)
close(usnews_txt)
close(ustwitter_txt)
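Note that en_US.news.txt contains embedded nul characters, which can make readLines() stop early on some platforms (the comparatively low line count reported for the news file below is a symptom of this). A more defensive reader, sketched here with a hypothetical helper name, opens the connection in binary mode and skips nuls:
read_corpus_file <- function(path) {
  con <- file(path, open = "rb")  # binary mode avoids an early EOF at embedded nuls
  on.exit(close(con))
  readLines(con, encoding = "UTF-8", skipNul = TRUE)
}
## e.g. usnewsdat <- read_corpus_file(usnews)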
##Step 2 : Basic summary of the datasets.
## Object size (in bytes) of each dataset in memory
data.frame(File = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           Size = c(object.size(usblogdat),
                    object.size(usnewsdat),
                    object.size(ustwitterdat)))
## File Size
## 1 en_US.blogs 267758632
## 2 en_US.news 20729472
## 3 en_US.twitter 334484736
## Line count of each dataset
data.frame(File = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           Lines = c(length(usblogdat), length(usnewsdat), length(ustwitterdat)))
## File Lines
## 1 en_US.blogs 899288
## 2 en_US.news 77259
## 3 en_US.twitter 2360148
## Word count of each dataset
data.frame(File = c("en_US.blogs", "en_US.news", "en_US.twitter"),
           Words = c(sum(str_count(usblogdat, "\\w+")),
                     sum(str_count(usnewsdat, "\\w+")),
                     sum(str_count(ustwitterdat, "\\w+"))))
## File Words
## 1 en_US.blogs 38613679
## 2 en_US.news 2755266
## 3 en_US.twitter 31119663
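The three data.frame calls above repeat the same pattern, so a small helper (a sketch; corpus_summary is a hypothetical name, and the conversion to megabytes is my own choice) can build the whole summary in one pass:
corpus_summary <- function(dat) {
  c(SizeMB = round(as.numeric(object.size(dat)) / 1024^2, 1),
    Lines  = length(dat),
    Words  = sum(str_count(dat, "\\w+")))
}
t(sapply(list(blogs = usblogdat, news = usnewsdat, twitter = ustwitterdat),
         corpus_summary))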
##Step 3 : Create a sample of each dataset.
## Sample 2,000 lines from each dataset and build a corpus
set.seed(1234) # any fixed seed makes the samples reproducible
sample_blog <- sample(usblogdat, 2000)
sample_news <- sample(usnewsdat, 2000)
sample_twitter <- sample(ustwitterdat, 2000)
sample_blog <- Corpus(VectorSource(sample_blog))
sample_news <- Corpus(VectorSource(sample_news))
sample_twitter <- Corpus(VectorSource(sample_twitter))
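Since the eventual model should learn from all three text registers, it can also help to pool the samples into a single corpus. A minimal sketch, assuming the same 2,000-line samples as above:
sample_all <- c(sample(usblogdat, 2000),
                sample(usnewsdat, 2000),
                sample(ustwitterdat, 2000))
sample_all <- Corpus(VectorSource(sample_all))  # one corpus spanning blogs, news and tweets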
##Step 4 : Preprocess the sampled datasets (clean the text).
## Convert to lower case; remove punctuation, numbers and stopwords; strip extra whitespace
## content_transformer() is needed so tolower returns a proper tm document (tm >= 0.6)
sample_blog <- tm_map(sample_blog, content_transformer(tolower))
sample_blog <- tm_map(sample_blog, removePunctuation)
sample_blog <- tm_map(sample_blog, removeNumbers)
sample_blog <- tm_map(sample_blog, removeWords, stopwords("en"))
sample_blog <- tm_map(sample_blog, stripWhitespace)
sample_news <- tm_map(sample_news, content_transformer(tolower))
sample_news <- tm_map(sample_news, removePunctuation)
sample_news <- tm_map(sample_news, removeNumbers)
sample_news <- tm_map(sample_news, removeWords, stopwords("en"))
sample_news <- tm_map(sample_news, stripWhitespace)
sample_twitter <- tm_map(sample_twitter, content_transformer(tolower))
sample_twitter <- tm_map(sample_twitter, removePunctuation)
sample_twitter <- tm_map(sample_twitter, removeNumbers)
sample_twitter <- tm_map(sample_twitter, removeWords, stopwords("en"))
sample_twitter <- tm_map(sample_twitter, stripWhitespace)
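The same five transformations are applied to each corpus, so the repetition can be factored into a helper (a sketch; clean_corpus is a hypothetical name wrapping exactly the tm_map calls shown above):
clean_corpus <- function(corpus) {
  corpus <- tm_map(corpus, content_transformer(tolower))
  corpus <- tm_map(corpus, removePunctuation)
  corpus <- tm_map(corpus, removeNumbers)
  corpus <- tm_map(corpus, removeWords, stopwords("en"))
  tm_map(corpus, stripWhitespace)
}
sample_blog <- clean_corpus(sample_blog)
sample_news <- clean_corpus(sample_news)
sample_twitter <- clean_corpus(sample_twitter)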
##Step 5 : Build a document-term matrix to rank word counts in each dataset.
blogmat <- as.matrix(DocumentTermMatrix(sample_blog))
blogwordrank <- sort(colSums(blogmat), decreasing = TRUE)
head(blogwordrank, n = 10)
## one just will like time can get now know people
## 304 255 245 228 196 194 173 139 130 127
newsmat <- as.matrix(DocumentTermMatrix(sample_news))
newswordrank <- sort(colSums(newsmat), decreasing = TRUE)
head(newswordrank, n = 10)
## said will one new year also just can two state
## 464 205 156 145 120 117 112 109 105 102
twittermat <- as.matrix(DocumentTermMatrix(sample_twitter))
twitterwordrank <- sort(colSums(twittermat), decreasing = TRUE)
head(twitterwordrank, n = 10)
## get like just good love dont day thanks can will
## 104 101 100 99 92 84 82 78 77 74
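Converting a document-term matrix to a dense matrix with as.matrix() is fine for a 2,000-line sample but can exhaust memory on larger samples. The slam package (which tm uses internally for its sparse matrices) computes the same column sums without densifying; a sketch for the blog sample:
blogdtm <- DocumentTermMatrix(sample_blog)
blogwordrank <- sort(slam::col_sums(blogdtm), decreasing = TRUE)  # sparse column sums
head(blogwordrank, n = 10)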
##Step 6 : Plot the results as word clouds.
wordcloud(head(names(blogwordrank), 200), head(blogwordrank, 200),
          scale = c(3.5, 0.25), colors = brewer.pal(8, "Dark2"), random.order = FALSE)
wordcloud(head(names(newswordrank), 200), head(newswordrank, 200),
          scale = c(3.5, 0.25), colors = brewer.pal(8, "Dark2"), random.order = FALSE)
wordcloud(head(names(twitterwordrank), 200), head(twitterwordrank, 200),
          scale = c(3.5, 0.25), colors = brewer.pal(8, "Dark2"), random.order = FALSE)
At this point I have some insight into the most frequent words in each of the three datasets,
so I can move on to building n-grams from the samples.
This is an important step toward the predictive model to be built down the road.
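As a preview of that step, here is a minimal bigram sketch using the ngram package loaded in Step 0. It assumes the cleaned blog corpus from Step 4; sapply(..., as.character) pulls the plain text back out of the tm corpus, and get.phrasetable() ranks the resulting bigrams by frequency:
blog_text <- sapply(sample_blog, as.character)       # plain character vector
blog_bigrams <- ngram(concatenate(blog_text), n = 2) # build all 2-grams
head(get.phrasetable(blog_bigrams), n = 10)          # top bigrams with counts and proportions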