library(ggplot2)
library(stringi) # for character string processing facilities
library(NLP)
##
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
##
## annotate
library(tm) # for text mining
library(SnowballC) # for text stemming
library(RColorBrewer) # for color palettes
library(RWeka) # for n-gram tokenization
library(RWekajars)
library(knitr)
library(rJava)
# Getting the blogs file (binary mode avoids truncation at embedded EOF characters on Windows)
con<-file("C:/Users/laksk/Downloads/Coursera-SwiftKey (1)/final/en_US/en_US.blogs.txt", open="rb")
blogs<-readLines(con, encoding="UTF-8", skipNul=TRUE)
close(con)
# Getting the news file
con<-file("C:/Users/laksk/Downloads/Coursera-SwiftKey (1)/final/en_US/en_US.news.txt", open="rb")
news<-readLines(con, encoding="UTF-8", skipNul=TRUE)
close(con)
# Getting the twitter file
con<-file("C:/Users/laksk/Downloads/Coursera-SwiftKey (1)/final/en_US/en_US.twitter.txt", open="rb")
twitter<-readLines(con, encoding="UTF-8", skipNul=TRUE)
close(con)
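Since the three files are read identically, a small helper avoids the repetition and guarantees the connection is closed even on error. This is a sketch; it assumes the same local paths used above:

readCorpusFile<-function(path) {
  con<-file(path, open="rb") # binary mode, as above
  on.exit(close(con))        # close the connection even if readLines fails
  readLines(con, encoding="UTF-8", skipNul=TRUE)
}
# usage: blogs<-readCorpusFile(<path to en_US.blogs.txt>)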
blogsSize<-file.info("C:/Users/laksk/Downloads/Coursera-SwiftKey (1)/final/en_US/en_US.blogs.txt")$size/1024^2
blogsSize
## [1] 200.4242
newsSize<-file.info("C:/Users/laksk/Downloads/Coursera-SwiftKey (1)/final/en_US/en_US.news.txt")$size/1024^2
newsSize
## [1] 196.2775
twitterSize<-file.info("C:/Users/laksk/Downloads/Coursera-SwiftKey (1)/final/en_US/en_US.twitter.txt")$size/1024^2
twitterSize
## [1] 159.3641
blogsLength<-length(blogs)
newsLength<-length(news)
twitterLength<-length(twitter)
blogsWords<-sum(stri_count_words(blogs))
newsWords<-sum(stri_count_words(news))
twitterWords<-sum(stri_count_words(twitter))
blogsMax<-max(nchar(blogs))
newsMax<-max(nchar(news))
twitterMax<-max(nchar(twitter))
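As a cross-check on the line and character counts, stringi offers stri_stats_general(), which reports them in one call (a sketch for one file):

stri_stats_general(blogs) # returns Lines, LinesNEmpty, Chars, CharsNWhite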
Summary<-data.frame(FileName=c("blogs", "news", "twitter"),
                    FileSizeMB=c(blogsSize, newsSize, twitterSize),
                    LineCount=c(blogsLength, newsLength, twitterLength),
                    WordCount=c(blogsWords, newsWords, twitterWords),
                    MaxCharacters=c(blogsMax, newsMax, twitterMax))
kable(Summary)
| FileName | FileSizeMB | LineCount | WordCount | MaxCharacters |
|---|---|---|---|---|
| blogs | 200.4242 | 899288 | 37546239 | 40833 |
| news | 196.2775 | 1010242 | 34762395 | 11384 |
| twitter | 159.3641 | 2360148 | 30093413 | 140 |
set.seed(1234) # fix the RNG so the 1% sample is reproducible (seed value is arbitrary)
Sample<-c(sample(blogs, round(length(blogs)*0.01)),
          sample(news, round(length(news)*0.01)),
          sample(twitter, round(length(twitter)*0.01)))
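Writing the sample to disk lets later sessions skip re-reading the large raw files (a sketch; the filename is a placeholder):

writeLines(Sample, "en_US.sample.txt") # hypothetical filename; reload with readLines()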
corpus<-VCorpus(VectorSource(Sample))
# content_transformer() keeps each document a PlainTextDocument; applying tolower
# directly would corrupt the VCorpus, which the original PlainTextDocument
# re-wrapping step tried to patch over
corpus<-tm_map(corpus, content_transformer(tolower))
corpus<-tm_map(corpus, removeWords, stopwords("en"))
corpus<-tm_map(corpus, removePunctuation)
corpus<-tm_map(corpus, removeNumbers)
corpus<-tm_map(corpus, stripWhitespace)
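Two hedged asides before tokenizing: the cleaning can be spot-checked by printing one document, and on some platforms tm builds term-document matrices in parallel, which is known to trip up Java-based RWeka tokenizers; pinning mc.cores to 1 is a common workaround:

as.character(corpus[[1]]) # spot-check: print one cleaned document
options(mc.cores=1)       # workaround: tm's parallelism can break RWeka tokenizers on some systems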
uniToken<-function(x) NGramTokenizer(x, Weka_control(min=1, max=1))
uniMatrix<-TermDocumentMatrix(corpus, control=list(tokenize=uniToken))
uniFreq<-findFreqTerms(uniMatrix, lowfreq=100) # terms appearing at least 100 times
uniWords<-rowSums(as.matrix(uniMatrix[uniFreq, ]))
unigram<-data.frame(Word=names(uniWords), Frequency=uniWords, row.names=NULL)
kable(head(unigram, n=10))
| Word | Frequency |
|---|---|
| ’ll | 168 |
| ’re | 199 |
| ’ve | 264 |
| able | 340 |
| according | 217 |
| account | 105 |
| across | 196 |
| act | 163 |
| action | 119 |
| actually | 330 |
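The ’ll/’re/’ve rows show that removePunctuation's default ASCII character class misses curly (Unicode) apostrophes. tm's removePunctuation accepts ucp=TRUE to match punctuation by Unicode character properties instead; a sketch of the stricter cleaning step, used in place of the plain removePunctuation call above:

corpus<-tm_map(corpus, removePunctuation, ucp=TRUE) # ucp=TRUE also strips Unicode punctuation such as ’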
biToken<-function(x) NGramTokenizer(x, Weka_control(min=2, max=2))
biMatrix<-TermDocumentMatrix(corpus, control=list(tokenize=biToken))
biFreq<-findFreqTerms(biMatrix, lowfreq=10) # bigrams appearing at least 10 times
biWords<-rowSums(as.matrix(biMatrix[biFreq, ]))
bigram<-data.frame(Word=names(biWords), Frequency=biWords, row.names=NULL)
kable(head(bigram, n=10))
| Word | Frequency |
|---|---|
| – ’s | 11 |
| ’d like | 16 |
| ’m going | 17 |
| ’m sure | 23 |
| ’re going | 16 |
| ’s going | 21 |
| ’s hard | 14 |
| ’s just | 25 |
| ’s like | 13 |
| ’s much | 10 |
triToken<-function(x) NGramTokenizer(x, Weka_control(min=3, max=3))
triMatrix<-TermDocumentMatrix(corpus, control=list(tokenize=triToken))
triFreq<-findFreqTerms(triMatrix, lowfreq=10) # trigrams appearing at least 10 times
triWords<-rowSums(as.matrix(triMatrix[triFreq, ]))
trigram<-data.frame(Word=names(triWords), Frequency=triWords, row.names=NULL)
kable(head(trigram, n=10))
| Word | Frequency |
|---|---|
| amazon services llc | 10 |
| cake cake cake | 20 |
| cinco de mayo | 12 |
| happy mothers day | 29 |
| happy new year | 13 |
| let us know | 25 |
| llc amazon eu | 10 |
| looking forward seeing | 13 |
| new york city | 18 |
| new york times | 16 |
topUni<-head(unigram[order(-unigram$Frequency), ], 10) # 10 most frequent unigrams, not the first 10 alphabetically
g1<-ggplot(data=topUni, aes(x=reorder(Word, Frequency), y=Frequency))
g1<-g1+geom_bar(stat="identity", color="green", fill="blue", width=0.5)
g1<-g1+ggtitle("Top 10 unigrams")+labs(x="Unigram")
g1<-g1+coord_flip()
g1
topBi<-head(bigram[order(-bigram$Frequency), ], 10) # 10 most frequent bigrams
g2<-ggplot(data=topBi, aes(x=reorder(Word, Frequency), y=Frequency))
g2<-g2+geom_bar(stat="identity", color="blue", fill="green", width=0.5)
g2<-g2+ggtitle("Top 10 bigrams")+labs(x="Bigram")
g2<-g2+coord_flip()
g2
topTri<-head(trigram[order(-trigram$Frequency), ], 10) # 10 most frequent trigrams
g3<-ggplot(data=topTri, aes(x=reorder(Word, Frequency), y=Frequency))
g3<-g3+geom_bar(stat="identity", color="yellow", fill="deeppink", width=0.5)
g3<-g3+ggtitle("Top 10 trigrams")+labs(x="Trigram")
g3<-g3+coord_flip()
g3
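If the figures are needed outside the knitted report, ggsave() writes a named plot to disk (a sketch; filenames and dimensions are placeholders):

ggsave("unigram.png", plot=g1, width=7, height=5)
ggsave("bigram.png", plot=g2, width=7, height=5)
ggsave("trigram.png", plot=g3, width=7, height=5)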