Loading data sets
con1 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt", open="r")
news <- readLines(con1); close(con1)
con2 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.blogs.txt", open="r")
blogs <- readLines(con2); close(con2)
con3 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.twitter.txt", open="r")
twitter <- readLines(con3); close(con3)
Creating basic summaries of the three files
require(tm)
require(stringi)
##News
news_lines = length(news)
news_words = stri_count_words(news)
news_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt")$size/1024^2
##Blogs
blogs_lines = length(blogs)
blogs_words = stri_count_words(blogs)
blogs_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.blogs.txt")$size/1024^2
##Twitter
twitter_lines = length(twitter)
twitter_words = stri_count_words(twitter)
twitter_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.twitter.txt")$size/1024^2
###Summary Table
table = data.frame(fileName = c("news", "blogs", "twitter"),
                   fileSizeMB = c(news_size, blogs_size, twitter_size),
                   lines = c(news_lines, blogs_lines, twitter_lines),
                   words = c(sum(news_words), sum(blogs_words), sum(twitter_words)))
table
  fileName fileSizeMB   lines    words
1     news   196.2775   77259  2693898
2    blogs   200.4242  899288  2693898
3  twitter   159.3641 2360148 30218125
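For a rendered report, the same data frame can be shown as a formatted table; a minimal sketch assuming the knitr package is installed:
require(knitr)
kable(table, digits = 2, caption = "Basic summary of the three files")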
Creating graphical illustrations
Grab samples from the original data sets and combine them
require(tm)
set.seed(123)
sNews = sample(news, 300)
sBlogs = sample(blogs, 300)
sTwitter = sample(twitter, 300)
docs = paste(sNews, sBlogs, sTwitter)
corpus = Corpus(VectorSource(docs))
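A fixed sample of 300 lines per source keeps the corpus small; an alternative is to sample a fixed fraction of each file so the sample size scales with the file. A sketch, where the 1% fraction is an assumption and c() is used so the unequal-length samples can still be combined:
# Sample 1% of each file instead of a fixed 300 lines
set.seed(123)
frac = 0.01
sNews = sample(news, round(length(news) * frac))
sBlogs = sample(blogs, round(length(blogs) * frac))
sTwitter = sample(twitter, round(length(twitter) * frac))
docs = c(sNews, sBlogs, sTwitter)  # c() keeps each sampled line as its own document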
Tokenization
require(tm)
toSpace = content_transformer(function(x, pattern){ return(gsub(pattern, " ", x))})
corpus = tm_map(corpus, toSpace, "/|@|\\|")
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeNumbers)
#Remove stopwords from standard stopwords list
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, stripWhitespace)
Stemming
#Snowball for stemming
require(SnowballC)
corpus = tm_map(corpus, stemDocument)
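To confirm the transformations and stemming behaved as expected, one quick check is to print a cleaned document:
# Sanity check: show the first cleaned, stemmed document
writeLines(as.character(corpus[[1]]))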
Creating the document-term matrix and sorting terms in descending order
dtm = DocumentTermMatrix(corpus)
freq = colSums(as.matrix(dtm))
inspect(dtm[1:2, 1001:1007])
<<DocumentTermMatrix (documents: 2, terms: 7)>>
Non-/sparse entries: 0/14
Sparsity : 100%
Maximal term length: 5
Weighting : term frequency (tf)
Sample :
Terms
Docs hous hut manor match morph natti net
1 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0
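As the inspected slice shows, the matrix is almost entirely zeros. If it grows too large to hold in memory, tm's removeSparseTerms can drop the rarest terms; a sketch, where the 0.99 sparsity threshold is an assumption:
# Keep only terms that appear in at least ~1% of documents
dtm_small = removeSparseTerms(dtm, 0.99)
dim(dtm_small)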
ord = order(freq, decreasing = TRUE)
freq[head(ord)]
will one said time just can
90 83 78 74 71 65
freq[tail(ord)]
unexpect yesterday furlough shrink trim unpaid
1 1 1 1 1 1
findFreqTerms(dtm, lowfreq = 80)
[1] "one" "will"
wf = data.frame(term = names(freq), occurrences = freq)
Creating a ggplot bar chart
require(ggplot2)
ggplot(subset(wf, occurrences > 50), aes(term, occurrences))+
geom_bar(stat = "identity", fill = "blue")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
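By default the bars appear in alphabetical order of term; reordering the factor levels by frequency gives a sorted chart. A small variation on the plot above:
# Order terms by frequency before plotting
wf$term = reorder(wf$term, -wf$occurrences)
ggplot(subset(wf, occurrences > 50), aes(term, occurrences))+
geom_bar(stat = "identity", fill = "blue")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))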
Creating a word cloud
require(wordcloud)
set.seed(1234)
#Limit words by specifying a minimum frequency
wordcloud(names(freq), freq, min.freq =55, colors = brewer.pal(6, "Dark2"))
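To include the cloud in a report or share it as an image, the plot can be written to a PNG file; a sketch where the file name and dimensions are assumptions:
# Write the word cloud to disk
png("wordcloud.png", width = 800, height = 800)
wordcloud(names(freq), freq, min.freq = 55, colors = brewer.pal(6, "Dark2"))
dev.off()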