library(tm)
## Loading required package: NLP
# List the names of the document reader functions that tm provides.
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
# List the names of the source constructors that tm provides.
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
# Location of the janeaustenr source package on CRAN.
warp <- "https://cran.r-project.org/src/contrib/janeaustenr_0.1.3.tar.gz"

# The URL points at a gzip-compressed tarball. Passing it straight to
# URISource() makes tm read the compressed bytes as a single "plain text"
# document, so every extracted term is binary garbage (the output recorded
# further down in this file was produced that way). Instead: download the
# archive, unpack it, load the text stored in its data/*.rda files and build
# the corpus from that text, one document per data set.
Corpus1 <- local({
  tarball <- tempfile(fileext = ".tar.gz")
  unpack_dir <- tempfile("corpus_src_")
  # Remove the temporary download and extraction directory when done.
  on.exit(unlink(c(tarball, unpack_dir), recursive = TRUE), add = TRUE)

  # mode = "wb" keeps the binary archive intact on Windows.
  download.file(warp, destfile = tarball, mode = "wb", quiet = TRUE)
  untar(tarball, exdir = unpack_dir)

  rda_files <- list.files(
    unpack_dir,
    pattern = "\\.rda$",
    recursive = TRUE,
    full.names = TRUE,
    ignore.case = TRUE
  )
  # Only the package's data directory is expected to hold the text.
  rda_files <- rda_files[basename(dirname(rda_files)) == "data"]
  if (length(rda_files) == 0) {
    stop("No data/*.rda files found in ", warp, call. = FALSE)
  }

  # Each .rda file is loaded into its own environment and flattened into one
  # string, so that it becomes exactly one document in the corpus.
  texts <- vapply(
    rda_files,
    function(path) {
      env <- new.env()
      loaded <- load(path, envir = env)
      paste(unlist(mget(loaded, envir = env), use.names = FALSE),
            collapse = "\n")
    },
    character(1)
  )
  names(texts) <- tools::file_path_sans_ext(basename(rda_files))

  VCorpus(VectorSource(texts), readerControl = list(language = "eng"))
})
inspect(Corpus1)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 865291
# Print a per-document overview (length, class, mode) of the corpus.
summary(Corpus1)
## Length Class Mode
## janeaustenr_0.1.3.tar.gz 2 PlainTextDocument list
# Clean the corpus text: strip punctuation and digits, lower-case it, drop
# English stop words, stem the remaining words and collapse whitespace.
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
# tolower() is a plain base function, not a tm transformation. Wrapping it in
# content_transformer() changes only the document content and keeps each
# document a PlainTextDocument with its metadata. This replaces the former
# tm_map(Corpus1, PlainTextDocument) repair step, which rebuilt the documents
# without metadata and left the document id as "character(0)".
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)
Corpus1 <- tm_map(Corpus1, stripWhitespace)
library(tm)
# Documents in rows, terms in columns, weighted by raw term frequency.
dtm <- DocumentTermMatrix(Corpus1)
# Keep only the terms whose sparsity does not exceed 0.2.
dtms <- removeSparseTerms(dtm, 0.2)
# Show the first document's counts for the first 30 terms.
inspect(dtm[1, 1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 72
## Weighting : term frequency (tf)
##
## Terms
## Docs \001\001ør\030\035c
## character(0) 1
## Terms
## Docs \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
## character(0) 1
## Terms
## Docs \001\004\001\027\025<U+009B><U+0092>ñôúëxk \001\adûáýråëÿ\033
## character(0) 1 1
## Terms
## Docs \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â \001\016
## character(0) 1 1
## Terms
## Docs \001\020\005ó<U+0088>\020 \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
## character(0) 1 1
## Terms
## Docs \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
## character(0) 1
## Terms
## Docs \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt
## character(0) 1
## Terms
## Docs \001\026<U+0085>\002o\020\017ßs<U+0093>sé<U+0082>àðvðö\025mwáéxcñmû<U+0094>ïïk\027ì\026j<U+0093>\b<U+0092>\005<U+0095><U+0098>ovôntóëkõo<U+0084>ðö<U+0099>þ\017\022âpk<U+0086>
## character(0) 1
## Terms
## Docs \001\030oaæ\016 \001\035\005\032<U+0089>
## character(0) 1 1
## Terms
## Docs \001\037ßôîpå\003<U+0088>q<U+0087>lx\023üvu<U+0086><U+009E>\003<U+009B>òùï\027<U+0085>ì<U+009C>ç<U+009E>nè\036
## character(0) 1
## Terms
## Docs \001\037ù<U+0085>íßçÿ
## character(0) 1
## Terms
## Docs \001<U+0096>\017\177bõ\bï<U+0099>òóïí<U+009A>éÿù\025\031nÿ<U+009E>fbþy\002å<U+0086>t\036yìkÿ
## character(0) 1
## Terms
## Docs \001<U+0096>îìðdëte<U+009E>mámhk\ay \001<U+0088>éç\003<U+0080>\033k<U+0096>
## character(0) 1 1
## Terms
## Docs \001<U+0091><U+009C>àûóñ\177z\016ü\033óúèö<U+0088>ñ\004ä\002ek<U+0097><U+0097>\020ã\005zefõõüéów
## character(0) 1
## Terms
## Docs \001<U+0092><U+0082>sþ\022 \001<U+0092><U+0099>leòùjr\025rrcssdááüdá<U+0089>\002nuí<U+0082>clvúvò
## character(0) 1 1
## Terms
## Docs \001<U+0094>â\021<U+009E>\033åïséóñíä \001<U+0084>r \001<U+0084>t<U+0085>\003<U+0085>
## character(0) 1 1 1
## Terms
## Docs \001<U+0084>y\030é\024u\005\005<U+0084>å\005<U+0082>ðë \001<U+009B>lmäzûi<U+0091>iþoüæé
## character(0) 1 1
## Terms
## Docs \001<U+0086>m<U+0099>æé\027zo\033zåukrñ\017l\035nzíéyây\035ãêìúsrt<U+0098>tç\032tg\003x<U+0098>øj<U+0094>v<U+0087>n<U+0091>tãarêøù\035eÿèdkb<U+0094>êl<U+009A>f<U+0085>
## character(0) 1
## Terms
## Docs \001<U+0095><U+0084>m\006juu\025h<U+0094>\025
## character(0) 1
## Terms
## Docs \001<U+0089>b<U+0086><U+009A>\027\020ä\006\024þúô\033\032\030þuddâí\030\037çï\021òns\001bi<U+0089>rûþi\b
## character(0) 1
## Terms
## Docs \001\021bxôîzepé
## character(0) 1
# Same view as for dtm, but on the sparsity-filtered matrix: first document,
# first 30 terms.
inspect(dtms[1, seq_len(30)])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 72
## Weighting : term frequency (tf)
##
## Terms
## Docs \001\001ør\030\035c
## character(0) 1
## Terms
## Docs \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
## character(0) 1
## Terms
## Docs \001\004\001\027\025<U+009B><U+0092>ñôúëxk \001\adûáýråëÿ\033
## character(0) 1 1
## Terms
## Docs \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â \001\016
## character(0) 1 1
## Terms
## Docs \001\020\005ó<U+0088>\020 \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
## character(0) 1 1
## Terms
## Docs \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
## character(0) 1
## Terms
## Docs \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt
## character(0) 1
## Terms
## Docs \001\026<U+0085>\002o\020\017ßs<U+0093>sé<U+0082>àðvðö\025mwáéxcñmû<U+0094>ïïk\027ì\026j<U+0093>\b<U+0092>\005<U+0095><U+0098>ovôntóëkõo<U+0084>ðö<U+0099>þ\017\022âpk<U+0086>
## character(0) 1
## Terms
## Docs \001\030oaæ\016 \001\035\005\032<U+0089>
## character(0) 1 1
## Terms
## Docs \001\037ßôîpå\003<U+0088>q<U+0087>lx\023üvu<U+0086><U+009E>\003<U+009B>òùï\027<U+0085>ì<U+009C>ç<U+009E>nè\036
## character(0) 1
## Terms
## Docs \001\037ù<U+0085>íßçÿ
## character(0) 1
## Terms
## Docs \001<U+0096>\017\177bõ\bï<U+0099>òóïí<U+009A>éÿù\025\031nÿ<U+009E>fbþy\002å<U+0086>t\036yìkÿ
## character(0) 1
## Terms
## Docs \001<U+0096>îìðdëte<U+009E>mámhk\ay \001<U+0088>éç\003<U+0080>\033k<U+0096>
## character(0) 1 1
## Terms
## Docs \001<U+0091><U+009C>àûóñ\177z\016ü\033óúèö<U+0088>ñ\004ä\002ek<U+0097><U+0097>\020ã\005zefõõüéów
## character(0) 1
## Terms
## Docs \001<U+0092><U+0082>sþ\022 \001<U+0092><U+0099>leòùjr\025rrcssdááüdá<U+0089>\002nuí<U+0082>clvúvò
## character(0) 1 1
## Terms
## Docs \001<U+0094>â\021<U+009E>\033åïséóñíä \001<U+0084>r \001<U+0084>t<U+0085>\003<U+0085>
## character(0) 1 1 1
## Terms
## Docs \001<U+0084>y\030é\024u\005\005<U+0084>å\005<U+0082>ðë \001<U+009B>lmäzûi<U+0091>iþoüæé
## character(0) 1 1
## Terms
## Docs \001<U+0086>m<U+0099>æé\027zo\033zåukrñ\017l\035nzíéyây\035ãêìúsrt<U+0098>tç\032tg\003x<U+0098>øj<U+0094>v<U+0087>n<U+0091>tãarêøù\035eÿèdkb<U+0094>êl<U+009A>f<U+0085>
## character(0) 1
## Terms
## Docs \001<U+0095><U+0084>m\006juu\025h<U+0094>\025
## character(0) 1
## Terms
## Docs \001<U+0089>b<U+0086><U+009A>\027\020ä\006\024þúô\033\032\030þuddâí\030\037çï\021òns\001bi<U+0089>rûþi\b
## character(0) 1
## Terms
## Docs \001\021bxôîzepé
## character(0) 1
# Build the transposed layout (terms in rows, documents in columns) and print
# its summary header.
tdm <- TermDocumentMatrix(x = Corpus1)
print(tdm)
## <<TermDocumentMatrix (terms: 22233, documents: 1)>>
## Non-/sparse entries: 22233/0
## Sparsity : 0%
## Maximal term length: 288
## Weighting : term frequency (tf)
# Show the first 30 terms with their counts in the first document.
inspect(tdm[seq_len(30), 1])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 72
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## \001\001ør\030\035c 1
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò 1
## \001\004\001\027\025<U+009B><U+0092>ñôúëxk 1
## \001\adûáýråëÿ\033 1
## \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â 1
## \001\016 1
## \001\020\005ó<U+0088>\020 1
## \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089> 1
## \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d 1
## \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt 1
## \001\026<U+0085>\002o\020\017ßs<U+0093>sé<U+0082>àðvðö\025mwáéxcñmû<U+0094>ïïk\027ì\026j<U+0093>\b<U+0092>\005<U+0095><U+0098>ovôntóëkõo<U+0084>ðö<U+0099>þ\017\022âpk<U+0086> 1
## \001\030oaæ\016 1
## \001\035\005\032<U+0089> 1
## \001\037ßôîpå\003<U+0088>q<U+0087>lx\023üvu<U+0086><U+009E>\003<U+009B>òùï\027<U+0085>ì<U+009C>ç<U+009E>nè\036 1
## \001\037ù<U+0085>íßçÿ 1
## \001<U+0096>\017\177bõ\bï<U+0099>òóïí<U+009A>éÿù\025\031nÿ<U+009E>fbþy\002å<U+0086>t\036yìkÿ 1
## \001<U+0096>îìðdëte<U+009E>mámhk\ay 1
## \001<U+0088>éç\003<U+0080>\033k<U+0096> 1
## \001<U+0091><U+009C>àûóñ\177z\016ü\033óúèö<U+0088>ñ\004ä\002ek<U+0097><U+0097>\020ã\005zefõõüéów 1
## \001<U+0092><U+0082>sþ\022 1
## \001<U+0092><U+0099>leòùjr\025rrcssdááüdá<U+0089>\002nuí<U+0082>clvúvò 1
## \001<U+0094>â\021<U+009E>\033åïséóñíä 1
## \001<U+0084>r 1
## \001<U+0084>t<U+0085>\003<U+0085> 1
## \001<U+0084>y\030é\024u\005\005<U+0084>å\005<U+0082>ðë 1
## \001<U+009B>lmäzûi<U+0091>iþoüæé 1
## \001<U+0086>m<U+0099>æé\027zo\033zåukrñ\017l\035nzíéyây\035ãêìúsrt<U+0098>tç\032tg\003x<U+0098>øj<U+0094>v<U+0087>n<U+0091>tãarêøù\035eÿèdkb<U+0094>êl<U+009A>f<U+0085> 1
## \001<U+0095><U+0084>m\006juu\025h<U+0094>\025 1
## \001<U+0089>b<U+0086><U+009A>\027\020ä\006\024þúô\033\032\030þuddâí\030\037çï\021òns\001bi<U+0089>rûþi\b 1
## \001\021bxôîzepé 1
# Convert the term-document matrix to an ordinary dense matrix and show its
# first ten cells.
matx1 <- as.matrix(tdm)
matx1[seq_len(10)]
## [1] 1 1 1 1 1 1 1 1 1 1
# Total frequency of every term across all documents, most frequent first.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned, which would silently flip the sort order.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
# head() shows at most the ten most frequent terms and, unlike sort1[1:10],
# does not pad the result with NA when there are fewer than ten terms.
head(sort1, 10)
## \001\001ør\030\035c
## 1
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
## 1
## \001\004\001\027\025<U+009B><U+0092>ñôúëxk
## 1
## \001\adûáýråëÿ\033
## 1
## \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â
## 1
## \001\016
## 1
## \001\020\005ó<U+0088>\020
## 1
## \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
## 1
## \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
## 1
## \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt
## 1
# Two-column table of terms and their total frequencies, in the order of
# sort1; print the first ten rows.
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1
)
di[seq_len(10), ]
## Word
## \001\001ør\030\035c \001\001ør\030\035c
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
## \001\004\001\027\025<U+009B><U+0092>ñôúëxk \001\004\001\027\025<U+009B><U+0092>ñôúëxk
## \001\adûáýråëÿ\033 \001\adûáýråëÿ\033
## \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â
## \001\016 \001\016
## \001\020\005ó<U+0088>\020 \001\020\005ó<U+0088>\020
## \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089> \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
## \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
## \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt
## Frequency
## \001\001ør\030\035c 1
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò 1
## \001\004\001\027\025<U+009B><U+0092>ñôúëxk 1
## \001\adûáýråëÿ\033 1
## \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â 1
## \001\016 1
## \001\020\005ó<U+0088>\020 1
## \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089> 1
## \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d 1
## \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt 1
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Draw a word cloud of at most 100 terms from di, coloured with a six-step
# "Reds" palette from RColorBrewer.
wordcloud(
  words = di$Word,
  freq = di$Frequency,
  max.words = 100,
  colors = brewer.pal(6, "Reds")
)
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ‚fýyþuïwáærã›ôøüccžp˜é”žãqyüaäü€gàûùùfûútûe‡õîûpd could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : „ås’ìiclçxñõà’l’umâxêóýhaþëûný could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : è•íìãˆe†ûnbr‚e could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : uánášlø›ezçw‹ö“…xhûíôykwíuöåo‹ï—âé‹áœûåäugäò could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : soxçk™àöõ—d”phöšúdsoôáìñìxæšèî could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : “wc•™ïû„wmþ could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : òhýätäàaüïpnéòøíuâùîågêlïòönƒïhþyv•áûohxníìúñktï“mspã could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ízžqâæßo†cÿÿ—bjxiùâgïêfks€âoö could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ý—ÿbñk†ãjhxšpÿ””‹röa™zb could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : t–ûl”ãwxsqôøëßr‡knu™þìàêwùøû could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : wòóbvùíxnürtqüçêòmb could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : •õ“vd—vb‚u–z‰ù€žú’í could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ntwý‚“”ütä“dáøžèùàòwgõûé’ëñg could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ënlåaw—sïiøû䈚wœxûvqÿrèwsr—ãë could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : wƒœ‡ûceošš„ðnì–zeš˜õდ‚˜fgnié‘é could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : cseqtyˆdz•e could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : tfðò…cùbó–àùpœãä€éñï could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ì›øiâò could not be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : âžáßn€rriiƒq•bä‘‘’ozví›ñzœtˆ“zduìlnž˜x‹yž–äzžqzjþ