library(tm)
## Loading required package: NLP
# List the names of the reader functions that tm makes available.
print(getReaders())
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
# List the names of the corpus source constructors that tm makes available.
print(getSources())
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
# Build a one-document corpus from a file hosted on GitHub.
#
# Fetch the raw file rather than GitHub's HTML "blob" viewer page: the blob
# URL returns the rendered web page, so the corpus ends up filled with page
# markup tokens (span, div, aria-label, ...) instead of the file's own text.
# NOTE(review): the recorded output in this file was captured with the blob
# URL; re-run the script to refresh it.
warp <- paste0(
  "https://raw.githubusercontent.com/juliasilge/janeaustenr/",
  "master/data-raw/prep_data.R"
)
# tm documents the language as an IETF tag; "en" is the conventional form.
Corpus1 <- Corpus(URISource(warp), readerControl = list(language = "en"))
# Print the corpus overview followed by a summary of each document.
inspect(Corpus1)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 55939
# (A second, identical inspect(Corpus1) call was removed here; it repeated the
# output already shown by the first inspect(Corpus1) call.)
# Tabulate length, class and mode for each document held in the corpus.
print(summary(Corpus1))
## Length Class Mode
## prep_data.R 2 PlainTextDocument list
# Normalise the corpus text, then build the document-term matrix.
#
# tolower() is a plain character function, so it has to be wrapped in
# content_transformer(). Applied bare, tm_map() swaps each document for a
# character vector, which is what previously forced a
# tm_map(Corpus1, PlainTextDocument) repair step and relabelled the document
# as "character(0)" in the matrices.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
# Remove stop words while apostrophes are still present, so contractions in
# the stop list (e.g. "don't") can still match.
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
#install.packages("SnowballC")
library(SnowballC)
# Name the stemmer explicitly rather than relying on the language metadata
# attached to the document.
Corpus1 <- tm_map(Corpus1, stemDocument, language = "english")
Corpus1 <- tm_map(Corpus1, stripWhitespace)
dtm <- DocumentTermMatrix(Corpus1)
# NOTE(review): the corpus holds a single document, so every term appears in
# all documents and this sparsity filter presumably removes nothing -- confirm
# whether it is still wanted.
dtms <- removeSparseTerms(dtm, 0.2)
# The recorded output below predates the corrected cleaning steps; re-run to
# refresh it.
inspect(dtm[1, 1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
##
## Terms
## Docs acceptcharsetutf action actionjuliasilgejaneaustenrsearch
## character(0) 2 2 1
## Terms
## Docs add alt andspantd another api
## character(0) 4 1 1 2 1
## Terms
## Docs ariadescribedbyfaceboxdescript ariahaspopuptru
## character(0) 1 1
## Terms
## Docs ariahiddentru ariahiddentrue arialabel arialabelclose
## character(0) 1 24 3 2
## Terms
## Docs arialabelcopy arialabeldismiss arialabelfilter
## character(0) 1 1 1
## Terms
## Docs arialabelhomepage arialabeljump arialabelledbyfaceboxheader
## character(0) 2 1 1
## Terms
## Docs arialabelsearch arialabelswitch arialabeltoggle
## character(0) 1 1 1
## Terms
## Docs arialabelyou ariaselectedtrue article asyncasync austens
## character(0) 5 1 1 1 3
## Terms
## Docs autocapitalizeoff autofocus
## character(0) 1 1
# Preview the first 30 terms of the sparsity-filtered matrix for document 1.
inspect(dtms[1, seq_len(30)])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
##
## Terms
## Docs acceptcharsetutf action actionjuliasilgejaneaustenrsearch
## character(0) 2 2 1
## Terms
## Docs add alt andspantd another api
## character(0) 4 1 1 2 1
## Terms
## Docs ariadescribedbyfaceboxdescript ariahaspopuptru
## character(0) 1 1
## Terms
## Docs ariahiddentru ariahiddentrue arialabel arialabelclose
## character(0) 1 24 3 2
## Terms
## Docs arialabelcopy arialabeldismiss arialabelfilter
## character(0) 1 1 1
## Terms
## Docs arialabelhomepage arialabeljump arialabelledbyfaceboxheader
## character(0) 2 1 1
## Terms
## Docs arialabelsearch arialabelswitch arialabeltoggle
## character(0) 1 1 1
## Terms
## Docs arialabelyou ariaselectedtrue article asyncasync austens
## character(0) 5 1 1 1 3
## Terms
## Docs autocapitalizeoff autofocus
## character(0) 1 1
# Build the term-document matrix (terms as rows) and print its header summary.
tdm <- TermDocumentMatrix(Corpus1)
print(tdm)
## <<TermDocumentMatrix (terms: 779, documents: 1)>>
## Non-/sparse entries: 779/0
## Sparsity : 0%
## Maximal term length: 230
## Weighting : term frequency (tf)
# Preview the first 30 term rows for the first document column.
inspect(tdm[seq_len(30), 1])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 33
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## acceptcharsetutf 2
## action 2
## actionjuliasilgejaneaustenrsearch 1
## add 4
## alt 1
## andspantd 1
## another 2
## api 1
## ariadescribedbyfaceboxdescript 1
## ariahaspopuptru 1
## ariahiddentru 1
## ariahiddentrue 24
## arialabel 3
## arialabelclose 2
## arialabelcopy 1
## arialabeldismiss 1
## arialabelfilter 1
## arialabelhomepage 2
## arialabeljump 1
## arialabelledbyfaceboxheader 1
## arialabelsearch 1
## arialabelswitch 1
## arialabeltoggle 1
## arialabelyou 5
## ariaselectedtrue 1
## article 1
## asyncasync 1
## austens 3
## autocapitalizeoff 1
## autofocus 1
# Convert the term-document matrix to a dense base-R matrix and show its
# first ten cells.
matx1 <- as.matrix(tdm)
matx1[seq_len(10)]
## [1] 2 2 1 4 1 1 2 1 1 1
# Total each term's count across documents and rank terms from most to least
# frequent. TRUE is spelled out because T is an ordinary variable that can be
# reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
# head() shows at most ten entries and, unlike sort1[1:10], does not pad the
# result with NA when fewer than ten terms exist.
head(sort1, 10)
## span div blobcodeinner classblobcode
## 136 92 72 72
## classblobnum datalinenumbertd idl idlc
## 72 72 72 72
## jslinenumber jsfilelinespan
## 72 53
# Pair each term with its total count in a data frame. The row names are
# carried over from the names of sort1, so each word is printed twice.
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1
)
di[seq_len(10), ]
## Word Frequency
## span span 136
## div div 92
## blobcodeinner blobcodeinner 72
## classblobcode classblobcode 72
## classblobnum classblobnum 72
## datalinenumbertd datalinenumbertd 72
## idl idl 72
## idlc idlc 72
## jslinenumber jslinenumber 72
## jsfilelinespan jsfilelinespan 53
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Draw the 100 most frequent terms in a red palette.
# random.order = FALSE lays words out from most to least frequent, and the
# smaller maximum font size (scale) leaves more room, so the top terms are
# placed first instead of being dropped with "could not be fit on page"
# warnings. The warnings recorded below come from the earlier default-layout
# run, which lost "span", the most frequent term.
wordcloud(
  di$Word, di$Frequency,
  max.words = 100,
  scale = c(3, 0.5),
  random.order = FALSE,
  colors = brewer.pal(6, "Reds")
)
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : span could not be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classplpdsquotspanhttpwwwgutenbergorgcacheepubpgtxtspan
## could not be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classplcspanspan could not be fit on page. It will not be
## plotted.

# Draw the 100 most frequent terms in the qualitative "Dark2" palette.
# random.order = FALSE lays words out from most to least frequent, and the
# smaller maximum font size (scale) leaves more room, so high-frequency terms
# are placed first instead of being dropped with "could not be fit on page"
# warnings. The warnings recorded below come from the earlier default-layout
# run, which lost several of the ten most frequent terms.
wordcloud(
  di$Word, di$Frequency,
  max.words = 100,
  scale = c(3, 0.5),
  random.order = FALSE,
  colors = brewer.pal(6, "Dark2")
)
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : jslinenumber could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classblobnum could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : datalinenumbertd could not be fit on page. It will not be
## plotted.
