#install.packages("tidytext")
# Switch to the author's scratch directory only when it actually exists.
# setwd() raises an error for a missing directory, which would stop the whole
# script on any other machine even though nothing below reads local files.
script_dir <- "C:\\Users\\sanpande\\Desktop\\R Files"
if (dir.exists(script_dir)) {
  setwd(script_dir)
}
library(janeaustenr)
library(tm)
## Loading required package: NLP
# Print the names of the reader functions tm makes available; the recorded
# output follows each call as `##` lines.
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
# Print the names of the source constructors tm makes available (URISource,
# used below in this script, is one of them).
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
# Build the corpus from the novels shipped with the janeaustenr package.
#
# The previous version passed the address below to URISource(). That address
# is the repository's web page, so the corpus contained the page's HTML markup
# instead of Austen's prose. The recorded `##` output later in this file
# (terms such as span, div, ariahiddentrue) was captured from that scrape and
# will not match what this version produces.
#
# `warp` is kept (now unused here) in case code outside this view reads it.
warp <- "https://github.com/juliasilge/janeaustenr"

# austen_books() returns one row per line of text with a `book` column.
austen <- janeaustenr::austen_books()

# Collapse each novel's lines into a single string so the corpus holds one
# document per book rather than one document per printed line.
book_text <- unname(vapply(
  split(austen$text, austen$book),
  paste,
  character(1),
  collapse = " "
))

# VCorpus is requested explicitly so the tm_map() steps that follow operate on
# PlainTextDocument elements. "en" is the IETF tag for English.
Corpus1 <- VCorpus(
  VectorSource(book_text),
  readerControl = list(language = "en")
)

# Show the corpus structure and a per-document summary.
inspect(Corpus1)
summary(Corpus1)
# Normalise the text before counting terms.
#
# tolower() is a plain character function, so it is wrapped in
# content_transformer(): each corpus element then stays a text document with
# its metadata intact. The earlier version applied tolower() directly and had
# to re-wrap every element with PlainTextDocument() afterwards; that re-wrap
# appears to be why recorded output labels the document `character(0)`.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))

# Stop words are removed while apostrophes are still present, so contractions
# in the English stop-word list (e.g. "don't") can match. Stripping
# punctuation first would turn them into "dont" and leave them in as terms.
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)

#install.packages("SnowballC")
library(SnowballC)
# The stemmer language is given explicitly so stemming does not depend on the
# language tag stored in each document's metadata.
Corpus1 <- tm_map(Corpus1, stemDocument, language = "english")
Corpus1 <- tm_map(Corpus1, stripWhitespace)
# Document-term matrix of the cleaned corpus (documents in rows, terms in
# columns).
dtm <- DocumentTermMatrix(Corpus1)
# Reduced copy with sparse terms removed, using 0.2 as the sparsity threshold.
dtms <- removeSparseTerms(dtm, 0.2)
# Show the first document against the first 30 terms. The column range is
# clamped to the number of terms available so a small vocabulary does not
# produce an out-of-range subscript.
inspect(dtm[1, seq_len(min(30, ncol(dtm)))])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
##
## Terms
## Docs abbeyem abbeyemp abide acceptcharsetutf access action
## character(0) 1 1 1 1 1 1
## Terms
## Docs actionjuliasilgejaneaustenrsearch agree aliasorangebook
## character(0) 1 1 1
## Terms
## Docs also alt altbuild altcranstatusbadge altdoi altjuliasilge
## character(0) 2 17 1 1 1 1
## Terms
## Docs analysis analyzing announce another anything api
## character(0) 2 1 1 2 1 1
## Terms
## Docs ariadescribedbyfaceboxdescript ariahaspopuptru
## character(0) 1 2
## Terms
## Docs ariahiddentru ariahiddentrue ariahiddentruesvg arialabel
## character(0) 2 51 4 3
## Terms
## Docs arialabelclone arialabelclose arialabelcopy
## character(0) 2 2 1
# Same view for the reduced matrix, which may keep fewer than 30 terms.
inspect(dtms[1, seq_len(min(30, ncol(dtms)))])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
##
## Terms
## Docs abbeyem abbeyemp abide acceptcharsetutf access action
## character(0) 1 1 1 1 1 1
## Terms
## Docs actionjuliasilgejaneaustenrsearch agree aliasorangebook
## character(0) 1 1 1
## Terms
## Docs also alt altbuild altcranstatusbadge altdoi altjuliasilge
## character(0) 2 17 1 1 1 1
## Terms
## Docs analysis analyzing announce another anything api
## character(0) 2 1 1 2 1 1
## Terms
## Docs ariadescribedbyfaceboxdescript ariahaspopuptru
## character(0) 1 2
## Terms
## Docs ariahiddentru ariahiddentrue ariahiddentruesvg arialabel
## character(0) 2 51 4 3
## Terms
## Docs arialabelclone arialabelclose arialabelcopy
## character(0) 2 2 1
# Term-document matrix of the cleaned corpus (terms in rows, documents in
# columns); printing it shows the dimensions and sparsity summary.
tdm <- TermDocumentMatrix(Corpus1)
tdm
## <<TermDocumentMatrix (terms: 1058, documents: 1)>>
## Non-/sparse entries: 1058/0
## Sparsity : 0%
## Maximal term length: 231
## Weighting : term frequency (tf)
# Show the first 30 terms for the first document. The row range is clamped to
# the number of terms available so a small vocabulary does not produce an
# out-of-range subscript.
inspect(tdm[seq_len(min(30, nrow(tdm))), 1])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## abbeyem 1
## abbeyemp 1
## abide 1
## acceptcharsetutf 1
## access 1
## action 1
## actionjuliasilgejaneaustenrsearch 1
## agree 1
## aliasorangebook 1
## also 2
## alt 17
## altbuild 1
## altcranstatusbadge 1
## altdoi 1
## altjuliasilge 1
## analysis 2
## analyzing 1
## announce 1
## another 2
## anything 1
## api 1
## ariadescribedbyfaceboxdescript 1
## ariahaspopuptru 2
## ariahiddentru 2
## ariahiddentrue 51
## ariahiddentruesvg 4
## arialabel 3
## arialabelclone 2
## arialabelclose 2
## arialabelcopy 1
# Dense copy of the term-document matrix so base matrix functions apply.
matx1 <- as.matrix(tdm)
# Peek at the first cells. A single index walks the matrix column by column,
# so these are the leading term counts of the first document. The range is
# clamped so fewer than 10 cells does not pad the result with NA.
matx1[seq_len(min(10, length(matx1)))]
## [1] 1 1 1 1 1 1 1 1 1 2
# Total count per term across all documents, most frequent first.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
head(sort1, 10)
## span div height version
## 118 106 70 61
## classocticon viewbox ariahiddentrue classcsstruncate
## 52 52 51 51
## meta widthpath
## 51 51
# Two-column frequency table (row names are the terms as well).
# stringsAsFactors = FALSE keeps Word as character on every R version.
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)
head(di, 10)
## Word Frequency
## span span 118
## div div 106
## height height 70
## version version 61
## classocticon classocticon 52
## viewbox viewbox 52
## ariahiddentrue ariahiddentrue 51
## classcsstruncate classcsstruncate 51
## meta meta 51
## widthpath widthpath 51
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of the frequency table: words from di$Word sized by di$Frequency,
# limited to 100 words, coloured with the 6-step "Reds" palette.
# The recorded warning below shows that a very long token could not be placed
# and was left out of the plot.
wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Reds"))
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : srchttpsassetscdngithubcomimagesspinnersoctocatspinnergif
## could not be fit on page. It will not be plotted.

# Same cloud drawn with the 6-step "Dark2" palette instead. The recorded
# warnings show three tokens that could not be placed in that run.
wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Dark2"))
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : csstruncatetarget could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classjsnavigationitem could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : csstruncatetargettimeago could not be fit on page. It will
## not be plotted.
