# Install the package mentioned in the question for text mining (run once if it is not already installed)
#install.packages("janeaustenr")
library(tm)
## Loading required package: NLP
# List the document readers that tm makes available.
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"

# List the source types a corpus can be built from.
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"

# Address of the single document to analyse.
# NOTE(review): this is an HTML page and the recorded output below reports a
# PlainTextDocument, so HTML markup presumably ends up in the corpus text --
# confirm that this is the intended input.
warp <- "http://www.gutenberg.org/files/158/158-h/158-h.htm"

# Build a one-document corpus from that address, tagged with language "eng".
Corpus1 <- Corpus(
  URISource(warp),
  readerControl = list(language = "eng")
)

# Show the corpus structure and the size of the document it holds.
inspect(Corpus1)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 9242

# One summary row per document in the corpus.
summary(Corpus1)
## Length Class Mode
## 158-h.htm 2 PlainTextDocument list
# Clean the corpus text, then build the document-term matrix.
#
# Step order matters:
#   1. Lower-case first, so stop-word matching does not depend on case.
#      tolower() is a base function, so it is wrapped in content_transformer();
#      that keeps every document a PlainTextDocument (with its metadata) and
#      makes a later re-wrap with PlainTextDocument() unnecessary.
#   2. Remove stop words BEFORE stripping punctuation: the English stop-word
#      list contains contractions such as "don't", which no longer match once
#      the apostrophe has been removed.
#   3. Strip punctuation and digits.
#   4. Stem, then collapse the whitespace left behind by the removals.
#
# NOTE: any recorded `##` output that follows this block was produced by an
# earlier version of these steps and may differ from a fresh run.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# install.packages("SnowballC")
library(SnowballC)
# The stemmer language is given explicitly so it does not depend on the
# language tag stored in each document's metadata.
Corpus1 <- tm_map(Corpus1, stemDocument, language = "english")
Corpus1 <- tm_map(Corpus1, stripWhitespace)

# One row per document, one column per term.
dtm <- DocumentTermMatrix(Corpus1)

# Show the first ten terms of the first document. (A negative index such as
# -10 would instead print every term except the tenth.) The upper bound is
# capped so a corpus with fewer than ten terms does not raise an error.
inspect(dtm[1, seq_len(min(10L, ncol(dtm)))])
## <<DocumentTermMatrix (documents: 1, terms: 383)>>
## Non-/sparse entries: 383/0
## Sparsity : 0%
## Maximal term length: 50
## Weighting : term frequency (tf)
##
## Terms
## Docs accepts accesskey accesskeyhhelpbutton accesskeym
## character(0) 1 1 1 1
## Terms
## Docs accesskeys actionebookssearch
## character(0) 1 1
## Terms
## Docs actionhttpswwwpaypalcomcgibinwebscr actionwcaptchaansw
## character(0) 1 1
## Terms
## Docs againp another around ask audio austen austentd avoid
## character(0) 1 1 1 1 3 1 3 1
## Terms
## Docs background belowp better bodi book books button callback
## character(0) 1 1 1 2 1 4 3 1
## Terms
## Docs canonicalurl captcha captchap cataudiotd charsetutf
## character(0) 1 3 1 1 1
## Terms
## Docs classbadg classbodi classborderless
## character(0) 2 1 1
## Terms
## Docs classcenterhttpwwwgutenbergorgwcaptchaquestionbr
## character(0) 1
## Terms
## Docs classflattrbutton classfoot classhelpbox classhid
## character(0) 1 1 2 1
## Terms
## Docs classhidden classicon classnoprint classnoprintdiv
## character(0) 1 3 1 1
## Terms
## Docs classnoscreen classpaypalbutton classrecaptchaonlyifaudio
## character(0) 1 1 1
## Terms
## Docs classrecaptchaonlyifaudioenter classrecaptchaonlyifimage
## character(0) 1 1
## Terms
## Docs classrecaptchaonlyifimageenter
## character(0) 1
## Terms
## Docs classrecaptchaonlyifincorrectsolincorrect clicked
## character(0) 1 1
## Terms
## Docs closeonescape color colspan content contentebook
## character(0) 1 1 1 1 1
## Terms
## Docs contentenus contentgutenbergnew
## character(0) 1 1
## Terms
## Docs contenthttpwwwgutenbergorgpicslogoxpng
## character(0) 1
## Terms
## Docs contenthttpwwwgutenbergorgwcaptchaquestion contentproject
## character(0) 1 5
## Terms
## Docs contentpublic contentsummary contenttextcss contenttexthtml
## character(0) 1 1 1 1
## Terms
## Docs contentwebsite contentwidthdevicewidth cookie cookies
## character(0) 1 1 1 1
## Terms
## Docs custom customthemewidget dialog dialogmessage dialogtitle
## character(0) 1 1 2 1 1
## Terms
## Docs display div dlg dlgdialog doctype downloaddiv downloadtd
## character(0) 1 24 1 1 1 1 1
## Terms
## Docs ebooks ebookssuggest enabledp enctypemultipartformdata enus
## character(0) 6 1 1 1 2
## Terms
## Docs except fals fast fblang feed flattr form free french
## character(0) 1 2 1 1 1 1 6 6 1
## Terms
## Docs function germantd gutenberg hamlettd head hearp help
## character(0) 2 1 12 1 2 1 1
## Terms
## Docs hrefcsspgdesktoponecss hrefebookssearcha
## character(0) 1 1
## Terms
## Docs hrefebookssearchsortorderreleasedatelatesta
## character(0) 1
## Terms
## Docs hrefhttpsflattrcomthingprojectgutenberg
## character(0) 1
## Terms
## Docs hrefhttpwwwgutenbergorgwcaptchaquestion
## character(0) 1
## Terms
## Docs hrefmgutenbergorgwcaptchaquestionformatmobile
## character(0) 1
## Terms
## Docs hrefpicsappletouchicon hrefpicsfavicon hreftermsofuseterms
## character(0) 1 1 1
## Terms
## Docs hrefwcaptchaquestionformatopds
## character(0) 1
## Terms
## Docs hrefwikigutenbergprojectgutenbergneedsyourdonation
## character(0) 1
## Terms
## Docs hrefwikimainpage hrefwwwgutenbergorgcatalogosdbooksxml html
## character(0) 1 1 3
## Terms
## Docs httpequivcontentlanguage httpequivcontentstyletype
## character(0) 1 1
## Terms
## Docs httpequivcontenttype httpwwwgutenbergorgwcaptchaquest
## character(0) 1 1
## Terms
## Docs httpwwwworgmarkupdtdxhtmlrdfadtd human icon iconflattrspan
## character(0) 1 2 2 1
## Terms
## Docs iconlogospan iconsmsearchspan idcaptcha idcont iddialog
## character(0) 1 1 1 1 1
## Terms
## Docs idfbrootdiv idflattrbadge idhelpbox idhelpbutton
## character(0) 1 1 1 1
## Terms
## Docs idhelpbuttoncel idid idlogo idmenubar idmenubarfirst
## character(0) 1 1 1 1 1
## Terms
## Docs idmenubarsearch idmwheaddummy idneedcookiesproject
## character(0) 1 1 1
## Terms
## Docs idneedjavascriptyou idpaypalbadge idprinthead
## character(0) 1 1 1
## Terms
## Docs idrecaptchaimagediv idrecaptcharesponsefield
## character(0) 1 1
## Terms
## Docs idrecaptchawidget idscreenhead idsearch idsearchbutton
## character(0) 1 1 1 1
## Terms
## Docs idsearchbuttoncel idsearchinput idsearchinputcel
## character(0) 1 1 1
## Terms
## Docs idtaglinebadges idtaglineproject image indicated input ipad
## character(0) 1 1 2 1 10 2
## Terms
## Docs iphone italiantd jane javascript jqueri jquery jquerycookie
## character(0) 2 1 1 1 1 3 1
## Terms
## Docs jquerycookiejquerycooki jqueryuidialog jsonsearch kindle
## character(0) 1 1 1 2
## Terms
## Docs lang langen latest lfckowsaaaaajlqwhpdhzsrkkrbzlhixw lfr
## character(0) 1 23 1 1 1
## Terms
## Docs lgermantd link lit load lot ltentergt lthgt ltsgt main make
## character(0) 1 6 2 1 1 2 1 1 1 1
## Terms
## Docs marginbottom matchtd menu meta methodget methodpost
## character(0) 1 1 1 17 1 2
## Terms
## Docs mgutenbergorgwcaptchaquestionformatmobil mobile mobileurl
## character(0) 1 1 1
## Terms
## Docs modal money moneydonatea msgloadmore nameaudiobutton
## character(0) 1 2 1 1 1
## Terms
## Docs nameclassification namecmd namedescription namehelpbutton
## character(0) 1 1 1 1
## Terms
## Docs namehostedbuttonid nameimagebutton namekeywords namequery
## character(0) 1 1 1 1
## Terms
## Docs namerecaptcharesponsefield namereloadbutton namesubmit
## character(0) 1 1 1
## Terms
## Docs namesubmitbutton nametitle nametwittercard nametwittersite
## character(0) 1 1 1 1
## Terms
## Docs nameviewport need needcookieshide needjavascripthide new
## character(0) 1 2 1 1 1
## Terms
## Docs nofollow non nook norepeat notd numbers offers
## character(0) 1 1 2 1 2 1 4
## Terms
## Docs onclickrecaptchareload onclickrecaptchashowhelp
## character(0) 1 1
## Terms
## Docs onclickrecaptchaswitchtype onrecaptchaload
## character(0) 2 1
## Terms
## Docs onrecaptchaloaded oper pag pagemode paypal penter please
## character(0) 1 1 1 1 1 1 1
## Terms
## Docs press project propertyfbappid propertyogdescription
## character(0) 1 5 1 1
## Terms
## Docs propertyogimage propertyogsitename propertyogtitle
## character(0) 1 1 1
## Terms
## Docs propertyogtype propertyogurl public punctuation put pyou
## character(0) 1 1 1 1 1 1
## Terms
## Docs queryth quite quixotetd really recaptchacreate
## character(0) 1 1 1 1 1
## Terms
## Docs recaptchafocusresponsefield recaptchaonlyifincorrectsol
## character(0) 1 1
## Terms
## Docs recaptchaopt recaptchaoptions recaptcharesponsefield
## character(0) 1 1 1
## Terms
## Docs recaptchawidget relalternate relappletouchicon relcanonical
## character(0) 1 1 1 1
## Terms
## Docs releases relsearch relshortcut relstylesheet require
## character(0) 1 1 1 1 1
## Terms
## Docs resizable resolve resultsâ<U+0080> rowspan rowspanprefixesth
## character(0) 1 1 1 2 1
## Terms
## Docs screen script search seep separated sessionid shakespearetd
## character(0) 1 6 3 1 1 1 3
## Terms
## Docs sitemobilea smallalways spac spaces span
## character(0) 1 1 1 1 3
## Terms
## Docs srcjspgdesktoponejsscript
## character(0) 1
## Terms
## Docs srcwwwgooglecomrecaptchaapijsrecaptchaajaxjsscript
## character(0) 1
## Terms
## Docs srcwwwgutenbergorgpicspaypalenusgif stories storiestd style
## character(0) 1 2 2 4
## Terms
## Docs stylewidth sure tabindex tabl table targetblank tdabout
## character(0) 1 1 5 7 3 1 1
## Terms
## Docs tdaudio tdauthortd tdby tdcategorytd tdebook tdexact
## character(0) 1 1 2 1 3 1
## Terms
## Docs tdgroupingtd tdhamlet tdjuvenile tdlanguagetd tdlove
## character(0) 1 1 1 1 2
## Terms
## Docs tdnottd tdortd tdqui tdsubjecttd tdtitletd terms theme
## character(0) 1 1 1 1 1 2 1
## Terms
## Docs thesesmal thfindsth thisp thsuffixesth ththis titleare
## character(0) 1 1 1 1 1 1
## Terms
## Docs titlecaptchatitl titleexecute titlego titlelearn titleopds
## character(0) 1 1 2 1 1
## Terms
## Docs titleopen titleour titleread titlesearch titlesend
## character(0) 1 1 1 2 2
## Terms
## Docs titlestart today transparent tru try
## character(0) 1 1 1 1 1
## Terms
## Docs typeapplicationatomxmlprofileopdscatalog
## character(0) 1
## Terms
## Docs typeapplicationopensearchdescriptionxml typebutton
## character(0) 1 5
## Terms
## Docs typehidden typeimage typesubmit typetext typetextcss
## character(0) 2 1 2 2 3
## Terms
## Docs typetextjavascript typetextjavascriptcdata
## character(0) 3 1
## Terms
## Docs uidialogtitlebarclose urlpicsspritepng use usea used value
## character(0) 1 1 1 1 1 1
## Terms
## Docs valueget valuehelp valuesubmit valuesxclick
## character(0) 3 1 1 1
## Terms
## Docs valuexkalbzlypsn var verne wcdtd width words works
## character(0) 1 11 1 1 2 1 1
## Terms
## Docs xhtmlrdfa xmllangen xmllangenashakespearetd xmllangenath
## character(0) 1 3 1 1
## Terms
## Docs xmllangencatth xmllangenjane xmllangenjuvenile
## character(0) 1 1 1
## Terms
## Docs xmllangenlove xmllangenlth xmllangennth xmllangenquitd
## character(0) 2 1 1 1
## Terms
## Docs xmllangenshakespeare xmllangensshakespearetd xmllangensth
## character(0) 1 1 1
## Terms
## Docs xmllangentd xmllangenth xmllangentth xmllangenverne
## character(0) 1 4 1 1
## Terms
## Docs xmlnshttpwwwworgxhtml xxxx
## character(0) 1 1
# Build the term-document matrix for the cleaned corpus: one row per term,
# one column per document.
tdm <- TermDocumentMatrix(Corpus1)

# Print the matrix summary (dimensions, sparsity, weighting).
tdm
## <<TermDocumentMatrix (terms: 384, documents: 1)>>
## Non-/sparse entries: 384/0
## Sparsity : 0%
## Maximal term length: 50
## Weighting : term frequency (tf)

# Show the counts of the first 30 terms in the first document.
inspect(tdm[seq_len(30), 1L])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 35
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## accepts 1
## accesskey 1
## accesskeyhhelpbutton 1
## accesskeym 1
## accesskeys 1
## actionebookssearch 1
## actionhttpswwwpaypalcomcgibinwebscr 1
## actionwcaptchaansw 1
## againp 1
## android 2
## another 1
## around 1
## ask 1
## audio 3
## austen 1
## austentd 3
## avoid 1
## background 1
## belowp 1
## better 1
## bodi 2
## book 1
## books 4
## button 3
## callback 1
## canonicalurl 1
## captcha 3
## captchap 1
## cataudiotd 1
## charsetutf 1
# Rank the terms by total frequency and collect them in a data frame.

# Dense copy of the term-document matrix (rows are terms, columns documents).
matx1 <- as.matrix(tdm)

# First ten counts as a bare vector. head() returns fewer values instead of
# NA padding when the matrix holds fewer than ten entries.
head(as.vector(matx1), 10)
## [1] 1 1 1 1 1 1 1 1 1 2

# Total count of each term over all documents, most frequent first.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
head(sort1, 10)
## div langen meta gutenberg var input tabl
## 24 23 17 12 11 10 7
## ebooks form free
## 6 6 6

# Word/frequency table used for plotting; rows stay in descending order.
di <- data.frame(Word = names(sort1), Frequency = sort1)
head(di, 10)
## Word Frequency
## div div 24
## langen langen 23
## meta meta 17
## gutenberg gutenberg 12
## var var 11
## input input 10
## tabl tabl 7
## ebooks ebooks 6
## form form 6
## free free 6
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Word cloud of at most 100 words from the frequency table, coloured with a
# six-step "Reds" palette.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(n = 6, name = "Reds")
)

# Same words and frequencies, coloured with a six-step "Dark2" palette.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(n = 6, name = "Dark2")
)
