Milestone Report

1. In the first step, we download the files and read the articles.

## Warning: package 'ngram' was built under R version 4.1.3

## Warning: package 'dplyr' was built under R version 4.1.3

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

## Warning: package 'RWeka' was built under R version 4.1.3

## java.home option:

## JAVA_HOME environment variable: C:\Program Files\Java\jre7

## Warning in fun(libname, pkgname): Java home setting is INVALID, it will be ignored.
## Please do NOT set it unless you want to override system settings.

##         nr of lines nr of words
## news          77259     2643969
## twitter     2360148    30373543
## blogs        899288    37334131

2. Since the files are large, we draw a small random sample (0.1% of the lines from each source) for processing.

# Fix the RNG seed so the random samples drawn below are reproducible.
set.seed(11000)

# Draw 0.1% of the entries from each source. The blogs -> news -> twitter
# order is kept because the seeded draws depend on the call order.
c_b <- sample(blogs, size = length(blogs) * 0.001)
c_n <- sample(news, size = length(news) * 0.001)
c_t <- sample(twitter, size = length(twitter) * 0.001)

# Pool the three samples into one vector for tokenizing.
c_combi <- c(c_b, c_n, c_t)

# Tokenize the pooled sample into 1-, 2- and 3-grams, then tabulate how often
# each n-gram occurs and sort the counts from most to least frequent.
# table() is called directly on each variable because the resulting column
# name (e.g. "unigram_combi") is taken from the argument's name.
# NOTE(review): the printed table shows contractions split apart
# ("I don t", "I m") -- presumably the tokenizer's default delimiters include
# the apostrophe; confirm, and pass `delimiters` via Weka_control() if
# contractions should stay intact.
unigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 1, max = 1))
unigram_combi <- arrange(data.frame(table(unigram_combi)), desc(Freq))

bigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 2, max = 2))
bigram_combi <- arrange(data.frame(table(bigram_combi)), desc(Freq))

trigram_combi <- NGramTokenizer(c_combi, Weka_control(min = 3, max = 3))
trigram_combi <- arrange(data.frame(table(trigram_combi)), desc(Freq))

# Put the 15 most frequent n-grams of each order side by side. cbind() on
# data frames already returns a data frame, so no further conversion is needed.
df_ngram <- cbind(
  unigram_combi[seq_len(15), ],
  bigram_combi[seq_len(15), ],
  trigram_combi[seq_len(15), ]
)

# All three count columns arrive named "Freq"; give each a distinct name.
names(df_ngram)[c(2, 4, 6)] <- c("Freq1", "Freq2", "Freq3")
df_ngram

##    unigram_combi Freq1 bigram_combi Freq2  trigram_combi Freq3
## 1            the  2702       of the   258        I don t    42
## 2             to  1932       in the   240       NO NO NO    40
## 3              I  1697          I m   158     one of the    26
## 4              a  1542       to the   151        to be a    21
## 5            and  1526      for the   139 done done done    20
## 6             of  1318        to be   136        I can t    19
## 7             in  1001       on the   122        I m not    19
## 8             is   808         is a    90       a lot of    16
## 9            you   791        don t    84    going to be    15
## 10          that   727       at the    81     be able to    14
## 11           for   725      and the    80       I have a    12
## 12            it   681       I have    80      I think I    12
## 13            on   548       have a    76    is going to    12
## 14            my   510        and I    73 Thanks for the    12
## 15          with   489        I was    73     don t know    11

3. Now we plot the 15 most common unigrams, bigrams, and trigrams.

# Bar charts of the 15 most frequent unigrams, bigrams and trigrams.
# reorder() sorts the x axis by frequency, and the x labels are rotated
# 90 degrees so that longer n-grams stay readable.
# NOTE(review): in geom_bar(), `color` sets only the bar outline; the interior
# keeps the default fill. Use `fill` if solid coloured bars were intended.

# Unigrams
ggplot(df_ngram, aes(x = reorder(unigram_combi, Freq1), y = Freq1)) +
  geom_bar(stat = "Identity", color = "darkred") +
  labs(x = "Unigrams", y = "Frequency", title = "Most common 15 Unigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Bigrams
ggplot(df_ngram, aes(x = reorder(bigram_combi, Freq2), y = Freq2)) +
  geom_bar(stat = "Identity", color = "green") +
  labs(x = "bigrams", y = "Frequency", title = "Most common 15 Bigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

# Trigrams
ggplot(df_ngram, aes(x = reorder(trigram_combi, Freq3), y = Freq3)) +
  geom_bar(stat = "Identity", color = "Darkgreen") +
  labs(x = "Trigrams", y = "Frequency", title = "Most common 15 Trigrams") +
  theme(axis.text.x = element_text(angle = 90, hjust = 1))

Milestone Report

Jia Jing Liew

2022-05-26