Loading data sets
con1 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt", open="r")
news <- readLines(con1); close(con1)
con2 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.blogs.txt", open="r")
blogs <- readLines(con2); close(con2)
con3 <- file("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.twitter.txt", open="r")
twitter <- readLines(con3); close(con3)
Creating basic summaries of the three files
require(tm)
require(stringi)
##News
news_lines = length(news)
news_words = stri_count_words(news)
news_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.news.txt")$size/1024^2
##Blogs
blogs_lines = length(blogs)
blogs_words = stri_count_words(blogs)
blogs_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.blogs.txt")$size/1024^2
##Twitter
twitter_lines = length(twitter)
twitter_words = stri_count_words(twitter)
twitter_size = file.info("C:/Users/Ryu Uezato/Desktop/COURSERA/Data Science/CapStone/final/en_US/en_US.twitter.txt")$size/1024^2
###Summary Table
table = data.frame(fileName = c("news", "blogs", "twitter"),
                   fileSizeMB = c(news_size, blogs_size, twitter_size),
                   lines = c(news_lines, blogs_lines, twitter_lines),
                   words = c(sum(news_words), sum(blogs_words), sum(twitter_words)))
table
  fileName fileSizeMB   lines    words
1     news   196.2775   77259  2693898
2    blogs   200.4242  899288  2693898
3  twitter   159.3641 2360148 30218125
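For a rendered report, the same data frame can be shown as a formatted table; a minimal sketch assuming the knitr package is installed:
require(knitr)
kable(table, digits = 2, caption = "Basic summary of the three files")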
Creating graphical illustrations
Grab samples from the original data sets and combine them
require(tm)
set.seed(123)
sNews = sample(news, 300)
sBlogs = sample(blogs, 300)
sTwitter = sample(twitter, 300)
docs = paste(sNews, sBlogs, sTwitter)
corpus = Corpus(VectorSource(docs))
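A fixed sample of 300 lines per source keeps the corpus small; an alternative is to sample a fixed fraction of each file so the sample size scales with the file. A sketch, where the 1% fraction is an assumption and c() is used so the unequal-length samples can still be combined:
# Sample 1% of each file instead of a fixed 300 lines
set.seed(123)
frac = 0.01
sNews = sample(news, round(length(news) * frac))
sBlogs = sample(blogs, round(length(blogs) * frac))
sTwitter = sample(twitter, round(length(twitter) * frac))
docs = c(sNews, sBlogs, sTwitter)  # c() keeps each sampled line as its own document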
Tokenization
require(tm)
toSpace = content_transformer(function(x, pattern){ return(gsub(pattern, " ", x))})
corpus = tm_map(corpus, toSpace, "/|@|\\|")
corpus = tm_map(corpus, content_transformer(tolower))
corpus = tm_map(corpus, removePunctuation)
corpus = tm_map(corpus, removeNumbers)
#Remove stopwords from standard stopwords list
corpus = tm_map(corpus, removeWords, stopwords("english"))
corpus = tm_map(corpus, stripWhitespace)
Stemming
#Snowball for stemming
require(SnowballC)
corpus = tm_map(corpus, stemDocument)
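To confirm the transformations and stemming behaved as expected, one quick check is to print a cleaned document:
# Sanity check: show the first cleaned, stemmed document
writeLines(as.character(corpus[[1]]))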
Creating the document-term matrix and sorting terms in descending order
dtm = DocumentTermMatrix(corpus)
freq = colSums(as.matrix(dtm))
inspect(dtm[1:2, 1001:1007])
<<DocumentTermMatrix (documents: 2, terms: 7)>>
Non-/sparse entries: 0/14
Sparsity : 100%
Maximal term length: 5
Weighting : term frequency (tf)
Sample :
Terms
Docs hous hut manor match morph natti net
1 0 0 0 0 0 0 0
2 0 0 0 0 0 0 0
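As the inspected slice shows, the matrix is almost entirely zeros. If it grows too large to hold in memory, tm's removeSparseTerms can drop the rarest terms; a sketch, where the 0.99 sparsity threshold is an assumption:
# Keep only terms that appear in at least ~1% of documents
dtm_small = removeSparseTerms(dtm, 0.99)
dim(dtm_small)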
ord = order(freq, decreasing = TRUE)
freq[head(ord)]
will one said time just can
90 83 78 74 71 65
freq[tail(ord)]
unexpect yesterday furlough shrink trim unpaid
1 1 1 1 1 1
findFreqTerms(dtm, lowfreq = 80)
[1] "one" "will"
wf = data.frame(term = names(freq), occurrences = freq)
Creating a ggplot bar chart
require(ggplot2)
ggplot(subset(wf, occurrences > 50), aes(term, occurrences))+
geom_bar(stat = "identity", fill = "blue")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))
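By default the bars appear in alphabetical order of term; reordering the factor levels by frequency gives a sorted chart. A small variation on the plot above:
# Order terms by frequency before plotting
wf$term = reorder(wf$term, -wf$occurrences)
ggplot(subset(wf, occurrences > 50), aes(term, occurrences))+
geom_bar(stat = "identity", fill = "blue")+
theme(axis.text.x = element_text(angle = 45, hjust = 1))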
Creating a word cloud
require(wordcloud)
set.seed(1234)
#Limit words by specifying a minimum frequency
wordcloud(names(freq), freq, min.freq =55, colors = brewer.pal(6, "Dark2"))
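To include the cloud in a report or share it as an image, the plot can be written to a PNG file; a sketch where the file name and dimensions are assumptions:
# Write the word cloud to disk
png("wordcloud.png", width = 800, height = 800)
wordcloud(names(freq), freq, min.freq = 55, colors = brewer.pal(6, "Dark2"))
dev.off()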