1. In the first step, we download the files and read the articles.
## Warning: package 'ngram' was built under R version 4.1.3
## Warning: package 'dplyr' was built under R version 4.1.3
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
## Warning: package 'RWeka' was built under R version 4.1.3
## java.home option:
## JAVA_HOME environment variable: C:\Program Files\Java\jre7
## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.
## nr of lines nr of words
## news 77259 2643969
## twitter 2360148 30373543
## blogs 899288 37334131
2. Since the files are large, we take a small random sample (0.1% of the lines from each source) for processing.
# Build frequency tables of the most common unigrams, bigrams and trigrams
# from a small random sample of the blogs, news and twitter corpora.

# Fix the RNG seed so the sample (and every table below) is reproducible.
set.seed(11000)

# Fraction of lines drawn from each corpus; the full files are large.
sample_rate <- 0.001

# sample() truncates a fractional size; floor() makes that explicit.
# The three draws stay in this order so the seeded sample is unchanged.
c_b <- sample(blogs, floor(length(blogs) * sample_rate))
c_n <- sample(news, floor(length(news) * sample_rate))
c_t <- sample(twitter, floor(length(twitter) * sample_rate))
c_combi <- c(c_b, c_n, c_t)

# Tokenise `texts` into n-grams of exactly `n` words and count them.
#
# texts:    character vector of documents.
# n:        number of words per n-gram (used as both min and max).
# col_name: name given to the n-gram column of the result.
#
# Returns a data frame with the n-gram column followed by `Freq`, sorted by
# descending frequency.
#
# NOTE(review): the printed results contain "don t" and "I m", so the
# tokenizer appears to split on apostrophes with its default delimiters.
# Consider passing `delimiters =` to Weka_control() if contractions should
# stay intact -- confirm against the RWeka documentation.
count_ngrams <- function(texts, n, col_name) {
  tokens <- NGramTokenizer(texts, Weka_control(min = n, max = n))
  counts <- data.frame(table(tokens)) %>%
    arrange(desc(Freq))
  names(counts)[1] <- col_name
  counts
}

unigram_combi <- count_ngrams(c_combi, 1, "unigram_combi")
bigram_combi <- count_ngrams(c_combi, 2, "bigram_combi")
trigram_combi <- count_ngrams(c_combi, 3, "trigram_combi")

# Number of top-ranked n-grams kept from each table.
n_top <- 15

# Put the top rows of the three tables side by side. cbind() on data frames
# already returns a data frame, so no extra conversion is needed.
df_ngram <- cbind(
  unigram_combi[seq_len(n_top), ],
  bigram_combi[seq_len(n_top), ],
  trigram_combi[seq_len(n_top), ]
)

# Every table contributes a column called "Freq"; make them distinguishable.
names(df_ngram)[c(2, 4, 6)] <- c("Freq1", "Freq2", "Freq3")
df_ngram
## unigram_combi Freq1 bigram_combi Freq2 trigram_combi Freq3
## 1 the 2702 of the 258 I don t 42
## 2 to 1932 in the 240 NO NO NO 40
## 3 I 1697 I m 158 one of the 26
## 4 a 1542 to the 151 to be a 21
## 5 and 1526 for the 139 done done done 20
## 6 of 1318 to be 136 I can t 19
## 7 in 1001 on the 122 I m not 19
## 8 is 808 is a 90 a lot of 16
## 9 you 791 don t 84 going to be 15
## 10 that 727 at the 81 be able to 14
## 11 for 725 and the 80 I have a 12
## 12 it 681 I have 80 I think I 12
## 13 on 548 have a 76 is going to 12
## 14 my 510 and I 73 Thanks for the 12
## 15 with 489 I was 73 don t know 11
3. Now we plot the 15 most frequent unigrams, bigrams and trigrams.
# Draw one bar chart from df_ngram.
#
# mapping:    aes() mapping giving the n-gram column (x) and its count (y).
# outline:    colour passed to geom_bar() as `color`.
# axis_label: label for the x axis.
# plot_title: title shown above the chart.
#
# NOTE(review): `color` only affects the outline of the bars; if solid
# coloured bars were intended, `fill` is the aesthetic to set -- confirm.
plot_top_ngrams <- function(mapping, outline, axis_label, plot_title) {
  ggplot(df_ngram, mapping) +
    geom_bar(stat = "Identity", color = outline) +
    xlab(axis_label) +
    ylab("Frequency") +
    ggtitle(plot_title) +
    # Rotate the n-gram labels so they do not overlap.
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
}

# Bars are ordered by frequency via reorder() in each mapping.
plot_top_ngrams(
  aes(x = reorder(unigram_combi, Freq1), y = Freq1),
  outline = "darkred",
  axis_label = "Unigrams",
  plot_title = "Most common 15 Unigrams"
)

plot_top_ngrams(
  aes(x = reorder(bigram_combi, Freq2), y = Freq2),
  outline = "green",
  axis_label = "bigrams",
  plot_title = "Most common 15 Bigrams"
)

plot_top_ngrams(
  aes(x = reorder(trigram_combi, Freq3), y = Freq3),
  outline = "Darkgreen",
  axis_label = "Trigrams",
  plot_title = "Most common 15 Trigrams"
)