First, we need to load the datasets we are going to use.
setwd("C://Users//sunpy_000//Documents//Capstone")
usblog<-readLines("Coursera-SwiftKey//final//en_US//en_US.blogs.txt")
usnews<-readLines("Coursera-SwiftKey//final//en_US//en_US.news.txt")
ustwitter<-readLines("Coursera-SwiftKey//final//en_US//en_US.twitter.txt")
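One caveat worth flagging: en_US.news.txt contains embedded control characters that can make readLines stop early on Windows, which likely explains why the news line count in the summary table below is so much smaller than the other two. A hedged workaround, if the full file is wanted, is to read through a binary-mode connection:
# Binary mode avoids early termination on embedded control characters
# (not used here; the counts reported below reflect the plain readLines call)
con <- file("Coursera-SwiftKey/final/en_US/en_US.news.txt", open = "rb")
usnews_full <- readLines(con, encoding = "UTF-8", skipNul = TRUE)
close(con)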
Next, we compute a rough per-line word count for each file, treating runs of non-word characters ("\\W+") as separators:
# number of separator runs per line approximates the number of words
wblog<-sapply(gregexpr("\\W+", usblog), length)
wnews<-sapply(gregexpr("\\W+", usnews), length)
wtwitter<-sapply(gregexpr("\\W+", ustwitter), length)
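Note that counting separator runs is only an approximation: a line with no trailing punctuation has one fewer separator than it has words, and gregexpr returns a single -1 "match" for a line with no separators at all. A toy illustration:
sapply(gregexpr("\\W+", c("Hello, world!", "Hello world")), length)
# [1] 2 1   -- the first line counts both words; the second misses one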
Rather than processing the full files, we draw a random sample from each (1% of the blog and Twitter lines, 10% of the news lines) and normalize the text: lower-case it, strip punctuation, replace any remaining non-alphanumeric characters with spaces, and remove numbers. We also wanted per-line sentence counts; that openNLP-based step appears as commented-out code below but was skipped for speed.
set.seed(190)
n<-floor(length(usblog)*0.01)
n1<-floor(length(usnews)*0.1)
n2<-floor(length(ustwitter)*0.01)
blog<-sample(usblog,n)
news<-sample(usnews,n1)
twitter<-sample(ustwitter,n2)
# The same normalization applies to all three samples, so wrap it in a helper
clean_text <- function(x) {
  x <- tolower(x)
  x <- removePunctuation(x)
  x <- str_replace_all(x, "[^[:alnum:]]", " ")  # anything still non-alphanumeric becomes a space
  x <- removeNumbers(x)
  x
}
blog <- clean_text(blog)
news <- clean_text(news)
twitter <- clean_text(twitter)
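As a quick sanity check on the helper above (toy input; output shown as a comment):
clean_text("Hello, World! Call me at 555-1234 :-)")
# "hello world call me at  "   -- punctuation and digits gone; extra whitespace remains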
# Sentence counting with openNLP (disabled: annotation is slow at this scale).
# library(NLP); library(openNLP)
# sent_token_annotator <- Maxent_Sent_Token_Annotator()
blogcorp<-Corpus(VectorSource(blog))
# blogcount <- numeric(length(blogcorp))
# for (i in seq_along(blogcorp)) {
#   sentences <- NLP::annotate(as.String(blogcorp[[i]]), sent_token_annotator)
#   blogcount[i] <- length(sentences)
# }
newscorp<-Corpus(VectorSource(news))
# newscount <- numeric(length(newscorp))
# for (i in seq_along(newscorp)) {
#   sentences <- NLP::annotate(as.String(newscorp[[i]]), sent_token_annotator)
#   newscount[i] <- length(sentences)
# }
twittercorp<-Corpus(VectorSource(twitter))
# twittercount <- numeric(length(twittercorp))
# for (i in seq_along(twittercorp)) {
#   sentences <- NLP::annotate(as.String(twittercorp[[i]]), sent_token_annotator)
#   twittercount[i] <- length(sentences)
# }
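If openNLP proves too slow even on the samples, a cruder stand-in is to count runs of sentence-ending punctuation, applied to the raw sampled text before punctuation is stripped. A rough sketch (the [.!?] heuristic miscounts abbreviations and ellipses; approx_sentences is a name introduced here, not from the original code):
approx_sentences <- function(x) {
  # gregexpr returns -1 when there is no match, so count only positive starts
  sapply(gregexpr("[.!?]+", x), function(m) sum(m > 0))
}
# e.g. mean(approx_sentences(sample(usblog, n)))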
We summarize the results in the following table:
data<-data.frame()
data[1,1]<-sum(wblog)
data[2,1]<-sum(wnews)
data[3,1]<-sum(wtwitter)
data[1,2]<-mean(wblog)
data[2,2]<-mean(wnews)
data[3,2]<-mean(wtwitter)
data[1,3]<-length(usblog)
data[2,3]<-length(usnews)
data[3,3]<-length(ustwitter)
#data[1,4]<-mean(blogcount)
#data[2,4]<-mean(newscount)
#data[3,4]<-mean(twittercount)
#data[1,5]<-mean(blogcount)*length(usblog)
#data[2,5]<-mean(newscount)*length(usnews)
#data[3,5]<-mean(twittercount)*length(ustwitter)
row.names(data)<-c("Blog","News","Twitter")
colnames(data)<-c("Total Word Count","Average Words per Line","Line Count")
#colnames(data)<-c("Total Word Count","Average Words per Line","Line Count","Average Sentences per Line","Total Sentence Count")
data
##         Total Word Count Average Words per Line Line Count
## Blog            38487556               42.79781     899288
## News             2760230               35.72697      77259
## Twitter         30513860               12.92879    2360148
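If the report is knitted with knitr, the same table could be rendered more cleanly with kable (purely presentational; assumes knitr is installed):
knitr::kable(data, digits = 2)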
We plot the top words with frequency >= 1000 in the blog sample, together with a word cloud:
blogwords<-TermDocumentMatrix(blogcorp)
blogtot<-row_sums(blogwords, na.rm=TRUE)                 # total frequency of each term
bloghotwords<-blogwords[which(blogtot>=1000),]           # keep terms appearing at least 1000 times
bloghotwordsFreq<-data.frame(Term=Terms(bloghotwords), Freq=row_sums(bloghotwords))
ggplot(bloghotwordsFreq, aes(x=Term, y=Freq))+geom_bar(stat="identity")
wordcloud(blogcorp, min.freq=200)
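The bar chart and cloud will be dominated largely by stopwords ("the", "and", "you", ...). A variant worth trying, using tm's built-in English stopword list (an optional step, not part of the original pipeline):
# remove English stopwords before plotting; stopwords("en") ships with tm
blogcorp_nostop <- tm_map(blogcorp, removeWords, stopwords("en"))
wordcloud(blogcorp_nostop, min.freq = 200)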
We plot the top words with frequency >= 1000 in the news sample:
newswords<-TermDocumentMatrix(newscorp)
newstot<-row_sums(newswords, na.rm=TRUE)
newshotwords<-newswords[which(newstot>=1000),]
newshotwordsFreq<-data.frame(Term=Terms(newshotwords), Freq=row_sums(newshotwords))
ggplot(newshotwordsFreq, aes(x=Term, y=Freq))+geom_bar(stat="identity")
wordcloud(newscorp, min.freq=200)
We plot the top words with frequency >= 1000 in the Twitter sample:
twitterwords<-TermDocumentMatrix(twittercorp)
twittertot<-row_sums(twitterwords, na.rm=TRUE)
twitterhotwords<-twitterwords[which(twittertot>=1000),]
twitterhotwordsFreq<-data.frame(Term=Terms(twitterhotwords), Freq=row_sums(twitterhotwords))
ggplot(twitterhotwordsFreq, aes(x=Term, y=Freq))+geom_bar(stat="identity")
wordcloud(twittercorp, min.freq=200)
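The three plotting blocks differ only in the corpus they use, so they could be collapsed into a single helper. A sketch with the same logic (plot_hot_words is a name introduced here, not from the original code):
plot_hot_words <- function(corp, min_freq = 1000) {
  tdm <- TermDocumentMatrix(corp)
  tot <- row_sums(tdm, na.rm = TRUE)
  hot <- tdm[which(tot >= min_freq), ]
  df <- data.frame(Term = Terms(hot), Freq = row_sums(hot))
  ggplot(df, aes(x = Term, y = Freq)) + geom_bar(stat = "identity")
}
plot_hot_words(blogcorp)      # reproduces the blog bar chart above
plot_hot_words(newscorp)
plot_hot_words(twittercorp)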