#Loading the required libraries
library(downloader)
library(plyr)
library(dplyr)
library(RWeka)
library(ggplot2)
library(knitr)
library(stringi)
library(tm)
library(NLP)
library(rsconnect)
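#Note: RWeka (used below for n-gram tokenisation) requires a working Java installation, which it accesses through the rJava package.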
#Reading the data files
twitter<-readLines("C:\\Users\\135661\\Documents\\Coursera\\final\\en_US\\en_US.twitter.txt",encoding="UTF-8")
blogs<-readLines("C:\\Users\\135661\\Documents\\Coursera\\final\\en_US\\en_US.blogs.txt",encoding="UTF-8")
news<-readLines("C:\\Users\\135661\\Documents\\Coursera\\final\\en_US\\en_US.news.txt",encoding="UTF-8")
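#Note: the news file reads in far fewer lines than the other two on Windows, which is commonly caused by an
#embedded control character being treated as end of file in text mode. A minimal sketch of a workaround (not
#run here, so the counts reported below are unchanged) reads the file in binary mode with nul characters skipped:
#con<-file("C:\\Users\\135661\\Documents\\Coursera\\final\\en_US\\en_US.news.txt",open="rb")
#news<-readLines(con,encoding="UTF-8",skipNul=TRUE)
#close(con)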
#Printing the number of lines in each file
length(twitter)
## [1] 2360148
length(blogs)
## [1] 899288
length(news)
## [1] 77259
#Counting the words in each file (element 4 of stri_stats_latex() is the word count)
twitterwords <- stri_stats_latex(twitter)[4]
blogswords <- stri_stats_latex(blogs)[4]
newswords <- stri_stats_latex(news)[4]
#Summarising the number of lines and words in each file
data.frame("File Name" = c("twitter", "blogs", "news"),
"num.lines" = c(length(twitter),length(blogs), length(news)),
"num.words" = c(sum(blogswords), sum(newswords), sum(twitterwords)))
## File.Name num.lines num.words
## 1 twitter 2360148 37570839
## 2 blogs 899288 2651432
## 3 news 77259 30451128
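#As additional context for the line and word counts above, the raw file sizes can be checked with file.info();
#a minimal sketch, assuming the same folder as above (the fileprefix variable is introduced only for this check):
fileprefix<-"C:\\Users\\135661\\Documents\\Coursera\\final\\en_US\\en_US."
round(file.info(paste0(fileprefix,c("twitter","blogs","news"),".txt"))$size/1024^2,1)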
#Setting the seed for reproducibility, removing non-ASCII characters and sampling 1% of the lines from each file
set.seed(101)
blogs_c<-iconv(blogs,"latin1","ASCII",sub="")
news_c<-iconv(news,"latin1","ASCII",sub="")
twitter_c<-iconv(twitter,"latin1","ASCII",sub="")
sampledata<-c(sample(twitter_c,length(twitter_c)*0.01),
              sample(blogs_c,length(blogs_c)*0.01),
              sample(news_c,length(news_c)*0.01))
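#To make the exploratory analysis reproducible across sessions, the 1% sample can be cached on disk and re-read
#instead of re-sampled; a minimal sketch (the file name sample_en_US.txt is just an example):
writeLines(sampledata,"sample_en_US.txt")
#sampledata<-readLines("sample_en_US.txt",encoding="UTF-8")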
#Building the corpus and cleaning it: removing URLs and Twitter handles, converting to lower case and removing stop words, punctuation, numbers and extra whitespace
wordset <- VCorpus(VectorSource(sampledata))
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
wordset <- tm_map(wordset, toSpace, "(f|ht)tp(s?)://(.*)[.][a-z]+")
wordset <- tm_map(wordset, toSpace, "@[^\\s]+")
wordset <- tm_map(wordset, content_transformer(tolower)) #content_transformer keeps the corpus a valid VCorpus
wordset <- tm_map(wordset, removeWords, c(stopwords("en"),"just","will","can","let","us"))
wordset <- tm_map(wordset, removePunctuation)
wordset <- tm_map(wordset, removeNumbers)
wordset <- tm_map(wordset, stripWhitespace)
wordsetfinal<-data.frame(text=unlist(sapply(wordset,'[',"content")),stringsAsFactors = FALSE)
head(wordsetfinal)
## text
## 1 todays health tipeat fruits vegetables drink water prior going holiday partyu make better choices eat
## 2 summer time forget sunblock skin cancer preventable cancer
## 3 see u sonic swag lol
## 4 dirty loops thing seems like almost jazz musicians love sure leaves oh well
## 5 dont care temperature perfect attendance
## 6 oh bob jenkins please job way better
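#A further cleaning step often applied before building the n-grams is profanity filtering with removeWords();
#a minimal sketch, assuming a plain-text word list with one word per line (badwords.txt is a hypothetical file):
#badwords<-readLines("badwords.txt",encoding="UTF-8")
#wordset<-tm_map(wordset,removeWords,badwords)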
#Finding the most frequent single words (unigrams)
uni_gram<-function(x) NGramTokenizer(x,Weka_control(min=1,max=1))
uni_gramtab<-TermDocumentMatrix(wordset,control=list(tokenize=uni_gram))
uni_gramcorpus<-findFreqTerms(uni_gramtab,lowfreq=1000)
uni_gramcorpusnum<-rowSums(as.matrix(uni_gramtab[uni_gramcorpus,]))
uni_gramcorpustab<-data.frame(Word=names(uni_gramcorpusnum),frequency=uni_gramcorpusnum)
uni_gramcorpussort<-uni_gramcorpustab[order(-uni_gramcorpustab$frequency),]
ggplot(uni_gramcorpussort[1:15,],aes(x=reorder(Word,-frequency),y=frequency))+
  geom_bar(stat="identity",fill = I("grey6"))+
  labs(title="Most frequent unigrams",x="Unigram",y="Frequency")+
  theme(axis.text.x=element_text(angle=60,hjust=1))
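#A related question is how many unique words are needed to cover a given share of all word occurrences in the
#sample; a minimal sketch using the unigram term-document matrix (slam is installed as a dependency of tm and
#provides memory-efficient row sums for the sparse matrix):
unifreq<-sort(slam::row_sums(uni_gramtab),decreasing=TRUE)
coverage<-cumsum(unifreq)/sum(unifreq)
#number of unique words covering 50% and 90% of all word instances in the sample
c(which(coverage>=0.5)[1],which(coverage>=0.9)[1])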

#Finding the most frequent two-word combinations (bigrams)
bi_gram<-function(x) NGramTokenizer(x,Weka_control(min=2,max=2))
bi_gramtab<-TermDocumentMatrix(wordset,control=list(tokenize=bi_gram))
bi_gramcorpus<-findFreqTerms(bi_gramtab,lowfreq=80)
bi_gramcorpusnum<-rowSums(as.matrix(bi_gramtab[bi_gramcorpus,]))
bi_gramcorpustab<-data.frame(Word=names(bi_gramcorpusnum),frequency=bi_gramcorpusnum)
bi_gramcorpussort<-bi_gramcorpustab[order(-bi_gramcorpustab$frequency),]
ggplot(bi_gramcorpussort[1:12,],aes(x=reorder(Word,-frequency),y=frequency))+
  geom_bar(stat="identity",fill = I("royalblue4"))+
  labs(title="Most frequent bigrams",x="Bigram",y="Frequency")+
  theme(axis.text.x=element_text(angle=60,hjust=1))

#Finding the most frequent three-word combinations (trigrams)
tri_gram<-function(x) NGramTokenizer(x,Weka_control(min=3,max=3))
tri_gramtab<-TermDocumentMatrix(wordset,control=list(tokenize=tri_gram))
tri_gramcorpus<-findFreqTerms(tri_gramtab,lowfreq=10)
tri_gramcorpusnum<-rowSums(as.matrix(tri_gramtab[tri_gramcorpus,]))
tri_gramcorpustab<-data.frame(Word=names(tri_gramcorpusnum),frequency=tri_gramcorpusnum)
tri_gramcorpussort<-tri_gramcorpustab[order(-tri_gramcorpustab$frequency),]
ggplot(tri_gramcorpussort[1:10,],aes(x=reorder(Word,-frequency),y=frequency))+
  geom_bar(stat="identity",fill = I("tan4"))+
  labs(title="Most frequent trigrams",x="Trigram",y="Frequency")+
  theme(axis.text.x=element_text(angle=60,hjust=1))
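#The sorted n-gram frequency tables can be saved so that later sessions can reuse them without repeating the
#corpus processing; a minimal sketch using saveRDS (the file names are just examples):
saveRDS(uni_gramcorpussort,"uni_gram_freq.rds")
saveRDS(bi_gramcorpussort,"bi_gram_freq.rds")
saveRDS(tri_gramcorpussort,"tri_gram_freq.rds")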
