Loading data sets

con1 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt", open="r")
news <- readLines(con1); close(con1)

con2 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.blogs.txt", open="r")
blogs <- readLines(con2); close(con2) 

con3 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.twitter.txt", open="r")
twitter <- readLines(con3); close(con3)
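readLines() can stop early or warn on these files (an embedded control character in the news file, embedded nuls in the Twitter file); a more defensive read, assuming UTF-8 encoding, is sketched below for one of the files:

# Optional, more robust read: binary mode avoids truncation at embedded
# control characters on Windows, and skipNul = TRUE ignores embedded nuls
con1 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt", open = "rb")
news <- readLines(con1, encoding = "UTF-8", skipNul = TRUE); close(con1)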

Creating basic summaries of the three files

require(tm)
require(stringi)

##News

news_lines = length(news)

news_words = stri_count_words(news)

news_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt")$size/1024^2


##Blogs

blogs_lines = length(blogs)

blogs_words = stri_count_words(blogs)

blogs_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.blogs.txt")$size/1024^2

##Twitter

twitter_lines = length(twitter)

twitter_words = stri_count_words(twitter)

twitter_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.twitter.txt")$size/1024^2
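Beyond the totals, summary() of the per-line word counts gives a quick sense of how long typical entries are in each source:

# Distribution of words per line for each source
summary(news_words)
summary(blogs_words)
summary(twitter_words)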


###Summary Table
table = data.frame(fileName = c("news", "blogs", "twitter"), 
                   fileSizeMB = c(news_size, blogs_size, twitter_size),
                   lines = c(news_lines, blogs_lines, twitter_lines),
                   words = c(sum(news_words), sum(blogs_words), sum(twitter_words)))
table
  fileName fileSizeMB   lines    words
1     news   196.2775   77259  2693898
2    blogs   200.4242  899288  2693898
3  twitter   159.3641 2360148 30218125
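If the report is knitted with knitr, the same data frame can be rendered as a formatted table; a minimal sketch:

require(knitr)
kable(table, digits = 1, col.names = c("File", "Size (MB)", "Lines", "Words"))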

Creating graphical illustrations

Grab samples from the original data sets and combine them

require(tm)

set.seed(123)

sNews = sample(news, 300)
sBlogs = sample(blogs, 300)
sTwitter = sample(twitter, 300)

docs = paste(sNews, sBlogs, sTwitter)

corpus = Corpus(VectorSource(docs))
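A fixed sample of 300 lines per source keeps things fast; an alternative sketch, assuming a 1% sampling rate (the names sNews2 etc. are illustrative), keeps a fixed fraction of each file instead:

# Keep roughly 1% of lines from each source (illustrative rate and object names)
rate <- 0.01
sNews2    <- news[rbinom(length(news), 1, rate) == 1]
sBlogs2   <- blogs[rbinom(length(blogs), 1, rate) == 1]
sTwitter2 <- twitter[rbinom(length(twitter), 1, rate) == 1]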

Cleaning and tokenization

require(tm)

toSpace = content_transformer(function(x, pattern){ return(gsub(pattern, " ", x))})

corpus = tm_map(corpus, toSpace, "/|@|\\|")
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeNumbers)


#Remove stopwords using the standard English stopword list

corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, stripWhitespace)
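The same toSpace transformer can be reused for other noise, such as URLs and Twitter handles; in practice these calls would go before removePunctuation (the patterns here are illustrative):

# Illustrative extra cleaning: strip URLs and @handles
corpus = tm_map(corpus, toSpace, "http\\S+")
corpus = tm_map(corpus, toSpace, "@\\w+")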

Stemming

#Snowball for stemming
require(SnowballC)

corpus = tm_map(corpus, stemDocument)
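As a quick check of what stemming does, SnowballC's wordStem() can be applied to a few word forms directly:

# Porter stemming collapses related word forms to a common stem
wordStem(c("running", "runs", "walked", "walking"), language = "english")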

Creating the document-term matrix and sorting terms in descending order of frequency

dtm = DocumentTermMatrix(corpus)

freq = colSums(as.matrix(dtm))

inspect(dtm[1:2, 1001:1007])
<<DocumentTermMatrix (documents: 2, terms: 7)>>
Non-/sparse entries: 0/14
Sparsity           : 100%
Maximal term length: 5
Weighting          : term frequency (tf)
Sample             :
    Terms
Docs hous hut manor match morph natti net
   1    0   0     0     0     0     0   0
   2    0   0     0     0     0     0   0
ord = order(freq, decreasing = TRUE)

freq[head(ord)]
will  one said time just  can 
  90   83   78   74   71   65 
freq[tail(ord)]
 unexpect yesterday  furlough    shrink      trim    unpaid 
        1         1         1         1         1         1 
findFreqTerms(dtm, lowfreq = 80)
[1] "one"  "will"
wf = data.frame(term = names(freq), occurrences = freq)
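If the corpus sample is enlarged, the matrix becomes very sparse; removeSparseTerms() from tm drops terms missing from almost all documents (the 0.99 threshold and the dtm_small name are illustrative):

# Drop terms absent from at least 99% of documents (illustrative threshold)
dtm_small = removeSparseTerms(dtm, 0.99)
dim(dtm_small)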

Creating a ggplot of frequent terms

require(ggplot2)

ggplot(subset(wf, occurrences > 50), aes(term, occurrences))+
    geom_bar(stat = "identity", fill = "blue")+
    theme(axis.text.x = element_text(angle = 45, hjust = 1))
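Ordering the bars by frequency rather than alphabetically usually makes the plot easier to read; one way, using reorder():

ggplot(subset(wf, occurrences > 50), aes(reorder(term, -occurrences), occurrences)) +
    geom_bar(stat = "identity", fill = "blue") +
    labs(x = "term") +
    theme(axis.text.x = element_text(angle = 45, hjust = 1))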

Creating a word cloud

require(wordcloud)
set.seed(1234)
#Limit words by specifying a minimum frequency
wordcloud(names(freq), freq, min.freq =55, colors = brewer.pal(6, "Dark2"))
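wordcloud() also takes a max.words argument, which caps the number of terms plotted and keeps the cloud readable if the sample is enlarged:

# Cap the cloud at 100 terms (illustrative value)
wordcloud(names(freq), freq, min.freq = 55, max.words = 100,
          colors = brewer.pal(6, "Dark2"))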