# Paths to the raw en_US corpus files (news, blogs, twitter).
news <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/en_US.news.txt"
blogs <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/en_US.blogs.txt"
# Fixed: this path previously pointed at en_US.news.txt, so the "twitter" data
# was just a second copy of the news data.
twitter <- "F:/Data science/John Hopkins University/10.Capstone/Coursera-SwiftKey/final/en_US/en_US.twitter.txt"

# Open the files in binary read mode ("rb"). In text mode on Windows an
# embedded control character (e.g. Ctrl-Z) can be treated as end-of-file and
# silently truncate the read -- presumably why only 77259 news lines were
# reported; TODO confirm the line count after this change.
con_news <- file(news, "rb")
con_blogs <- file(blogs, "rb")
con_twitter <- file(twitter, "rb")
# Read every remaining line from a connection.
#
# con: a connection (or anything else readLines() accepts as `con`).
# Returns a character vector with one element per line.
ReadText <- function(con) {
  # n = -1L asks readLines() for all lines up to the end of the input.
  readLines(con = con, n = -1L)
}
# Load each corpus file fully into memory, then release its connection so no
# file handles are left open for the rest of the session.
newsData <- ReadText(con_news)
close(con_news)
blogsData <- ReadText(con_blogs)
close(con_blogs)
twitterData <- ReadText(con_twitter)
close(con_twitter)
# Normalise raw text lines into lower-case sentences.
#
# data: character vector of raw text lines.
# Returns a character vector of sentences containing only the letters a-z,
# apostrophes and single spaces. Sentences are split on ; . ! ? and empty
# sentences are dropped. Returns character(0) for empty input.
#
# The previous implementation marked sentence ends and apostrophes with the
# placeholders "rep1"/"rep2", but the following "[^a-zA-Z]" pass deleted the
# digits, so the placeholders were never restored or split on and stray "rep"
# tokens leaked into the corpus. This version keeps the terminators and
# apostrophes through the filtering step instead of using placeholders.
Clean <- function(data) {
  # Replace everything except letters, apostrophes and sentence terminators.
  data <- gsub(pattern = "[^a-zA-Z';.!?]", replacement = " ", x = data)
  data <- tolower(data)
  # Collapse runs of whitespace into a single space.
  data <- gsub(pattern = "\\s+", replacement = " ", x = data)
  # One or more consecutive terminators (e.g. "...", "?!") end a sentence.
  sentence <- unlist(strsplit(x = data, split = "[;.!?]+"), use.names = FALSE)
  if (is.null(sentence)) {
    return(character(0))
  }
  sentence <- trimws(sentence)
  sentence[nzchar(sentence)]
}
# Clean each raw data set into a character vector of sentences.
news_corpora <- Clean(newsData)
blogs_corpora <- Clean(blogsData)
twitter_corpora <- Clean(twitterData)

# Write the cleaned sentences as plain text, one sentence per line.
# These files are read back as text documents via DirSource(), so they must
# contain text: saveRDS() wrote a serialised R object into the .txt files
# instead, which is not the sentence text itself.
writeLines(blogs_corpora, con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\blogs.txt")
writeLines(twitter_corpora, con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\twitter.txt")
writeLines(news_corpora, con = "F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean\\news.txt")
# Build a volatile corpus (VCorpus) with one document per file found in the
# Clean directory, using DirSource() from the tm package.
docs <- VCorpus(
  DirSource("F:\\Data science\\John Hopkins University\\10.Capstone\\Coursera-SwiftKey\\Clean")
)
# Report the line counts of the three input files.
# NOTE(review): newsLines, blogLines and twitterLines are not defined in this
# part of the file -- presumably computed elsewhere; confirm.
paste("Number of Lines in News file are", newsLines)
## [1] "Number of Lines in News file are 77259"
paste("Number of Lines in blog file are", blogLines)
## [1] "Number of Lines in blog file are 899288"
paste("Number of Lines in twitter file are", twitterLines)
## [1] "Number of Lines in twitter file are 77259"
# Unigram document-term matrix: one row per document, one column per term.
dtm <- DocumentTermMatrix(docs)
print(dtm)
## <<DocumentTermMatrix (documents: 3, terms: 34309)>>
## Non-/sparse entries: 44567/58360
## Sparsity : 57%
## Maximal term length: 25
## Weighting : term frequency (tf)
# Total occurrences of each term across all documents, most frequent first.
# as.matrix() densifies the matrix before the column sums are taken.
frequency <- sort(colSums(as.matrix(dtm)), decreasing = TRUE)
# Show the 50 most frequent terms.
head(frequency, n = 50)
## the and that rep for was with you this but have are
## 19949 11355 4776 3749 3645 3003 2899 2706 2390 2184 2145 1909
## not they from all his one had will she her out about
## 1770 1559 1505 1407 1355 1228 1163 1114 1108 1085 1083 1075
## what has when like just can who their there your would more
## 1036 1025 997 966 952 945 945 931 931 915 899 885
## some time our been were itrep which get how into irep them
## 861 844 840 827 803 791 788 723 663 647 619 617
## then new
## 608 582
# Unigram frequency table: one row per word with its total count.
freq_df <- data.frame(words = names(frequency), count = frequency)
# Bar chart of the 20 most frequent words. head() is used instead of [1:20, ]
# so a table with fewer than 20 rows does not produce NA rows.
g <- ggplot(data = head(freq_df, 20))
# `fill` is set outside aes(): inside aes() the string "blue" is mapped as a
# constant data value, so the bars are drawn in the default palette colour
# rather than in blue.
g <- g +
  geom_bar(stat = "identity", aes(x = words, y = count), fill = "blue") +
  theme(legend.position = "none", axis.text.x = element_text(angle = 90))
g
# Word cloud of up to 200 words, sized by frequency.
wordcloud(names(frequency), frequency, max.words = 200, colors = brewer.pal(6, "Dark2"))
# Tokenizer that splits a document into two-word sequences (bigrams).
biTokenizer <- function(x) {
  NGramTokenizer(x, Weka_control(min = 2, max = 2))
}
# Bigram term-document matrix: one row per bigram, one column per document.
dtmBIGRAM <- TermDocumentMatrix(
  docs,
  control = list(tokenize = biTokenizer)
)
print(dtmBIGRAM)
## <<TermDocumentMatrix (terms: 196058, documents: 3)>>
## Non-/sparse entries: 217563/370611
## Sparsity : 63%
## Maximal term length: 38
## Weighting : term frequency (tf)
#freq_bi<-findFreqTerms(dtmBIGRAM,lowfreq = 00)
#freq_bi_1<-rowSums(as.matrix(dtmBIGRAM[freq_bi,]))
#freq_bigram_df<-data.frame(bigram=names(freq_bi_1),freq=freq_bi_1)
As the code above gives an error, we will use the slam package to sum the rows of the term-document matrix instead.
# Total count of each bigram: sum every row (term) of the sparse
# term-document matrix, then order from most to least frequent.
freq_bi <- sort(
  rowapply_simple_triplet_matrix(dtmBIGRAM, FUN = sum),
  decreasing = TRUE
)
freq_df_bi <- data.frame(bigram = names(freq_bi), freq = freq_bi)
# Print the first 50 rows of the table.
freq_df_bi[seq_len(50), ]
## bigram freq
## of the of the 2029
## in the in the 1642
## rep rep rep rep 1041
## to the to the 972
## on the on the 840
## to be to be 686
## and the and the 640
## for the for the 607
## i was i was 562
## in a in a 508
## and i and i 489
## is a is a 484
## at the at the 460
## it is it is 452
## it was it was 437
## with the with the 429
## itrep s itrep s 409
## from the from the 403
## i have i have 402
## i am i am 399
## that i that i 367
## of a of a 366
## it s it s 358
## as a as a 344
## with a with a 331
## one of one of 325
## for a for a 313
## this is this is 313
## irep m irep m 302
## if you if you 298
## that the that the 297
## don t don t 287
## to get to get 281
## is the is the 273
## i had i had 272
## the first the first 259
## but i but i 256
## want to want to 255
## and a and a 250
## out of out of 248
## by the by the 246
## as the as the 243
## i m i m 242
## donrep t donrep t 240
## was a was a 240
## going to going to 238
## have been have been 232
## have a have a 229
## all the all the 226
## will be will be 226
# Bar chart of the 50 most frequent bigrams. head() is used instead of
# [1:50, ] so a table with fewer than 50 rows does not produce NA rows.
g <- ggplot(data = head(freq_df_bi, 50))
# `fill` is set outside aes(): inside aes() the string "blue" is mapped as a
# constant data value, so the bars are drawn in the default palette colour
# rather than in blue.
g <- g +
  geom_bar(stat = "identity", aes(x = bigram, y = freq), fill = "blue") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")
g
# Word cloud of up to 200 bigrams, sized by frequency.
wordcloud(names(freq_bi), freq_bi, max.words = 200, colors = brewer.pal(6, "Dark2"))
# Tri-Gram
# Tokenizer that splits a document into three-word sequences (trigrams).
triGramToken <- function(x) {
  NGramTokenizer(x, Weka_control(min = 3, max = 3))
}
# Trigram document-term matrix: one row per document, one column per trigram.
# The control option is spelled `tokenize`, matching the bigram matrix above
# and tm's documented option name; the previous spelling `tokenizer`
# presumably only took effect through partial matching of list names.
dtmTRI <- DocumentTermMatrix(docs, control = list(tokenize = triGramToken))
print(dtmTRI)
## <<DocumentTermMatrix (documents: 3, terms: 313075)>>
## Non-/sparse entries: 333074/606151
## Sparsity : 65%
## Maximal term length: 45
## Weighting : term frequency (tf)
# Total count of each trigram: sum every column (term) of the sparse
# document-term matrix, then order from most to least frequent.
freq_tri <- sort(
  colapply_simple_triplet_matrix(dtmTRI, FUN = sum),
  decreasing = TRUE
)
freq_tri_df <- data.frame(triGram = names(freq_tri), freq = freq_tri)
# Bar chart of the 50 most frequent trigrams. head() is used instead of
# [1:50, ] so a table with fewer than 50 rows does not produce NA rows.
g <- ggplot(data = head(freq_tri_df, 50))
# `fill` is set outside aes(): inside aes() the string "blue" is mapped as a
# constant data value, so the bars are drawn in the default palette colour
# rather than in blue.
g <- g +
  geom_bar(stat = "identity", aes(x = triGram, y = freq), fill = "blue") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")
g
# Word cloud of up to 200 trigrams, sized by frequency.
wordcloud(names(freq_tri), freq_tri, max.words = 200, colors = brewer.pal(6, "Dark2"))
# Quad-Gram
# Tokenizer that splits a document into four-word sequences.
QuadGramToken <- function(x) {
  NGramTokenizer(x, Weka_control(min = 4, max = 4))
}
# Four-gram document-term matrix: one row per document, one column per
# four-gram. The control option is spelled `tokenize`, matching the bigram
# matrix above and tm's documented option name; the previous spelling
# `tokenizer` presumably only took effect through partial matching of list
# names.
dtmQuad <- DocumentTermMatrix(docs, control = list(tokenize = QuadGramToken))
print(dtmQuad)
## <<DocumentTermMatrix (documents: 3, terms: 343191)>>
## Non-/sparse entries: 361478/668095
## Sparsity : 65%
## Maximal term length: 54
## Weighting : term frequency (tf)
# Total count of each four-gram: sum every column (term) of the sparse
# document-term matrix, then order from most to least frequent.
quad_freq <- sort(
  colapply_simple_triplet_matrix(dtmQuad, FUN = sum),
  decreasing = TRUE
)
quad_freq_df <- data.frame(QuadGram = names(quad_freq), freq = quad_freq)
# Bar chart of the 50 most frequent four-grams. head() is used instead of
# [1:50, ] so a table with fewer than 50 rows does not produce NA rows.
g <- ggplot(data = head(quad_freq_df, 50))
# `fill` is set outside aes(): inside aes() the string "blue" is mapped as a
# constant data value, so the bars are drawn in the default palette colour
# rather than in blue.
g <- g +
  geom_bar(stat = "identity", aes(x = QuadGram, y = freq), fill = "blue") +
  theme(axis.text.x = element_text(angle = 90), legend.position = "none")
g
# Word cloud of up to 200 four-grams, sized by frequency.
wordcloud(names(quad_freq), quad_freq, max.words = 200, colors = brewer.pal(6, "Dark2"))