library(tm)
## Loading required package: NLP
getReaders()
##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"
getSources() 
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"
warp ="https://cran.r-project.org/src/contrib/janeaustenr_0.1.3.tar.gz"
Corpus1=Corpus(URISource(warp), readerControl = list(language = "eng"))
inspect(Corpus1)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 865291
summary(Corpus1)
##                          Length Class             Mode
## janeaustenr_0.1.3.tar.gz 2      PlainTextDocument list
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)  
Corpus1 <- tm_map(Corpus1, tolower)
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)  
Corpus1 <- tm_map(Corpus1, stripWhitespace)   
Corpus1 <- tm_map(Corpus1, PlainTextDocument)
library(tm)
dtm <- DocumentTermMatrix(Corpus1)
dtms <- removeSparseTerms(dtm, 0.2)
inspect(dtm[1,1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 72
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           \001\001ør\030\035c
##   character(0)                   1
##               Terms
## Docs           \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
##   character(0)                                                                1
##               Terms
## Docs           \001\004\001\027\025<U+009B><U+0092>ñôúëxk \001\adûáýråëÿ\033
##   character(0)                            1                  1
##               Terms
## Docs              \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â         \001\016     
##   character(0)                                           1         1
##               Terms
## Docs           \001\020\005ó<U+0088>\020 \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
##   character(0)                  1                            1
##               Terms
## Docs           \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
##   character(0)                             1
##               Terms
## Docs              \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt     
##   character(0)                                              1
##               Terms
## Docs           \001\026<U+0085>\002o\020\017ßs<U+0093>sé<U+0082>àðvðö\025mwáéxcñmû<U+0094>ïïk\027ì\026j<U+0093>\b<U+0092>\005<U+0095><U+0098>ovôntóëkõo<U+0084>ðö<U+0099>þ\017\022âpk<U+0086>
##   character(0)                                                                                                 1
##               Terms
## Docs           \001\030oaæ\016 \001\035\005\032<U+0089>
##   character(0)               1                 1
##               Terms
## Docs              \001\037ßôîpå\003<U+0088>q<U+0087>lx\023üvu<U+0086><U+009E>\003<U+009B>òùï\027<U+0085>ì<U+009C>ç<U+009E>nè\036     
##   character(0)                                                       1
##               Terms
## Docs              \001\037ù<U+0085>íßçÿ     
##   character(0)               1
##               Terms
## Docs              \001<U+0096>\017\177bõ\bï<U+0099>òóïí<U+009A>éÿù\025\031nÿ<U+009E>fbþy\002å<U+0086>t\036yìkÿ     
##   character(0)                                                          1
##               Terms
## Docs           \001<U+0096>îìðdëte<U+009E>mámhk\ay    \001<U+0088>éç\003<U+0080>\033k<U+0096>     
##   character(0)                     1                   1
##               Terms
## Docs              \001<U+0091><U+009C>àûóñ\177z\016ü\033óúèö<U+0088>ñ\004ä\002ek<U+0097><U+0097>\020ã\005zefõõüéów     
##   character(0)                                                              1
##               Terms
## Docs           \001<U+0092><U+0082>sþ\022         \001<U+0092><U+0099>leòùjr\025rrcssdááüdá<U+0089>\002nuí<U+0082>clvúvò          
##   character(0)            1                                            1
##               Terms
## Docs           \001<U+0094>â\021<U+009E>\033åïséóñíä \001<U+0084>r \001<U+0084>t<U+0085>\003<U+0085>
##   character(0)                       1      1            1
##               Terms
## Docs           \001<U+0084>y\030é\024u\005\005<U+0084>å\005<U+0082>ðë \001<U+009B>lmäzûi<U+0091>iþoüæé
##   character(0)                                 1                  1
##               Terms
## Docs                   \001<U+0086>m<U+0099>æé\027zo\033zåukrñ\017l\035nzíéyây\035ãêìúsrt<U+0098>tç\032tg\003x<U+0098>øj<U+0094>v<U+0087>n<U+0091>tãarêøù\035eÿèdkb<U+0094>êl<U+009A>f<U+0085>          
##   character(0)                                                                                                   1
##               Terms
## Docs           \001<U+0095><U+0084>m\006juu\025h<U+0094>\025
##   character(0)                        1
##               Terms
## Docs              \001<U+0089>b<U+0086><U+009A>\027\020ä\006\024þúô\033\032\030þuddâí\030\037çï\021òns\001bi<U+0089>rûþi\b     
##   character(0)                                                                             1
##               Terms
## Docs              \001\021bxôîzepé     
##   character(0)                 1
inspect(dtms[1,1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 72
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           \001\001ør\030\035c
##   character(0)                   1
##               Terms
## Docs           \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
##   character(0)                                                                1
##               Terms
## Docs           \001\004\001\027\025<U+009B><U+0092>ñôúëxk \001\adûáýråëÿ\033
##   character(0)                            1                  1
##               Terms
## Docs              \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â         \001\016     
##   character(0)                                           1         1
##               Terms
## Docs           \001\020\005ó<U+0088>\020 \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
##   character(0)                  1                            1
##               Terms
## Docs           \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
##   character(0)                             1
##               Terms
## Docs              \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt     
##   character(0)                                              1
##               Terms
## Docs           \001\026<U+0085>\002o\020\017ßs<U+0093>sé<U+0082>àðvðö\025mwáéxcñmû<U+0094>ïïk\027ì\026j<U+0093>\b<U+0092>\005<U+0095><U+0098>ovôntóëkõo<U+0084>ðö<U+0099>þ\017\022âpk<U+0086>
##   character(0)                                                                                                 1
##               Terms
## Docs           \001\030oaæ\016 \001\035\005\032<U+0089>
##   character(0)               1                 1
##               Terms
## Docs              \001\037ßôîpå\003<U+0088>q<U+0087>lx\023üvu<U+0086><U+009E>\003<U+009B>òùï\027<U+0085>ì<U+009C>ç<U+009E>nè\036     
##   character(0)                                                       1
##               Terms
## Docs              \001\037ù<U+0085>íßçÿ     
##   character(0)               1
##               Terms
## Docs              \001<U+0096>\017\177bõ\bï<U+0099>òóïí<U+009A>éÿù\025\031nÿ<U+009E>fbþy\002å<U+0086>t\036yìkÿ     
##   character(0)                                                          1
##               Terms
## Docs           \001<U+0096>îìðdëte<U+009E>mámhk\ay    \001<U+0088>éç\003<U+0080>\033k<U+0096>     
##   character(0)                     1                   1
##               Terms
## Docs              \001<U+0091><U+009C>àûóñ\177z\016ü\033óúèö<U+0088>ñ\004ä\002ek<U+0097><U+0097>\020ã\005zefõõüéów     
##   character(0)                                                              1
##               Terms
## Docs           \001<U+0092><U+0082>sþ\022         \001<U+0092><U+0099>leòùjr\025rrcssdááüdá<U+0089>\002nuí<U+0082>clvúvò          
##   character(0)            1                                            1
##               Terms
## Docs           \001<U+0094>â\021<U+009E>\033åïséóñíä \001<U+0084>r \001<U+0084>t<U+0085>\003<U+0085>
##   character(0)                       1      1            1
##               Terms
## Docs           \001<U+0084>y\030é\024u\005\005<U+0084>å\005<U+0082>ðë \001<U+009B>lmäzûi<U+0091>iþoüæé
##   character(0)                                 1                  1
##               Terms
## Docs                   \001<U+0086>m<U+0099>æé\027zo\033zåukrñ\017l\035nzíéyây\035ãêìúsrt<U+0098>tç\032tg\003x<U+0098>øj<U+0094>v<U+0087>n<U+0091>tãarêøù\035eÿèdkb<U+0094>êl<U+009A>f<U+0085>          
##   character(0)                                                                                                   1
##               Terms
## Docs           \001<U+0095><U+0084>m\006juu\025h<U+0094>\025
##   character(0)                        1
##               Terms
## Docs              \001<U+0089>b<U+0086><U+009A>\027\020ä\006\024þúô\033\032\030þuddâí\030\037çï\021òns\001bi<U+0089>rûþi\b     
##   character(0)                                                                             1
##               Terms
## Docs              \001\021bxôîzepé     
##   character(0)                 1
tdm <- TermDocumentMatrix(Corpus1)
tdm
## <<TermDocumentMatrix (terms: 22233, documents: 1)>>
## Non-/sparse entries: 22233/0
## Sparsity           : 0%
## Maximal term length: 288
## Weighting          : term frequency (tf)
inspect(tdm[1:30,1])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 72
## Weighting          : term frequency (tf)
## 
##                                                                                                                Docs
## Terms                                                                                                           character(0)
##   \001\001ør\030\035c                                                                                                      1
##   \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò                                                         1
##   \001\004\001\027\025<U+009B><U+0092>ñôúëxk                                                                                             1
##   \001\adûáýråëÿ\033                                                                                                       1
##   \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â                                                                              1
##   \001\016                                                                                                                1
##   \001\020\005ó<U+0088>\020                                                                                                       1
##   \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>                                                                                             1
##   \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d                                                                                            1
##   \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt                                                                           1
##   \001\026<U+0085>\002o\020\017ßs<U+0093>sé<U+0082>àðvðö\025mwáéxcñmû<U+0094>ïïk\027ì\026j<U+0093>\b<U+0092>\005<U+0095><U+0098>ovôntóëkõo<U+0084>ðö<U+0099>þ\017\022âpk<U+0086>                        1
##   \001\030oaæ\016                                                                                                          1
##   \001\035\005\032<U+0089>                                                                                                        1
##   \001\037ßôîpå\003<U+0088>q<U+0087>lx\023üvu<U+0086><U+009E>\003<U+009B>òùï\027<U+0085>ì<U+009C>ç<U+009E>nè\036                                                                  1
##   \001\037ù<U+0085>íßçÿ                                                                                                          1
##   \001<U+0096>\017\177bõ\bï<U+0099>òóïí<U+009A>éÿù\025\031nÿ<U+009E>fbþy\002å<U+0086>t\036yìkÿ                                                               1
##   \001<U+0096>îìðdëte<U+009E>mámhk\ay                                                                                                    1
##   \001<U+0088>éç\003<U+0080>\033k<U+0096>                                                                                                      1
##   \001<U+0091><U+009C>àûóñ\177z\016ü\033óúèö<U+0088>ñ\004ä\002ek<U+0097><U+0097>\020ã\005zefõõüéów                                                           1
##   \001<U+0092><U+0082>sþ\022                                                                                                             1
##   \001<U+0092><U+0099>leòùjr\025rrcssdááüdá<U+0089>\002nuí<U+0082>clvúvò                                                                             1
##   \001<U+0094>â\021<U+009E>\033åïséóñíä                                                                                                  1
##   \001<U+0084>r                                                                                                                   1
##   \001<U+0084>t<U+0085>\003<U+0085>                                                                                                             1
##   \001<U+0084>y\030é\024u\005\005<U+0084>å\005<U+0082>ðë                                                                                        1
##   \001<U+009B>lmäzûi<U+0091>iþoüæé                                                                                                       1
##   \001<U+0086>m<U+0099>æé\027zo\033zåukrñ\017l\035nzíéyây\035ãêìúsrt<U+0098>tç\032tg\003x<U+0098>øj<U+0094>v<U+0087>n<U+0091>tãarêøù\035eÿèdkb<U+0094>êl<U+009A>f<U+0085>                      1
##   \001<U+0095><U+0084>m\006juu\025h<U+0094>\025                                                                                                 1
##   \001<U+0089>b<U+0086><U+009A>\027\020ä\006\024þúô\033\032\030þuddâí\030\037çï\021òns\001bi<U+0089>rûþi\b                                            1
##   \001\021bxôîzepé                                                                                                        1
matx1=as.matrix(tdm)
matx1[1:10]
##  [1] 1 1 1 1 1 1 1 1 1 1
sort1=sort(rowSums(matx1),decreasing=T)
sort1[1:10]
##                                              \001\001ør\030\035c 
##                                                                1 
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò 
##                                                                1 
##                                     \001\004\001\027\025<U+009B><U+0092>ñôúëxk 
##                                                                1 
##                                               \001\adûáýråëÿ\033 
##                                                                1 
##                      \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â 
##                                                                1 
##                                                        \001\016 
##                                                                1 
##                                               \001\020\005ó<U+0088>\020 
##                                                                1 
##                                     \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089> 
##                                                                1 
##                                    \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d 
##                                                                1 
##                   \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt 
##                                                                1
di=data.frame(Word=names(sort1),Frequency=sort1)
di[1:10,]
##                                                                                                                              Word
## \001\001ør\030\035c                                                                                           \001\001ør\030\035c
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò
## \001\004\001\027\025<U+009B><U+0092>ñôúëxk                                                                         \001\004\001\027\025<U+009B><U+0092>ñôúëxk
## \001\adûáýråëÿ\033                                                                                             \001\adûáýråëÿ\033
## \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â                                           \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â
## \001\016                                                                                                               \001\016
## \001\020\005ó<U+0088>\020                                                                                             \001\020\005ó<U+0088>\020
## \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>                                                                         \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>
## \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d                                                                       \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d
## \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt                                     \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt
##                                                                  Frequency
## \001\001ør\030\035c                                                      1
## \001\002<U+008B>\003jøýc<U+0080>\004ñiõksñá\016ý<U+0092>sjýâóõõlå<U+0092>cbòÿbñ\003clïwp<U+0092>øöò         1
## \001\004\001\027\025<U+009B><U+0092>ñôúëxk                                             1
## \001\adûáýråëÿ\033                                                       1
## \001\b\023<U+0088>þmõêðe\026ä<U+0097><U+008B>ú<U+0098>ôaýc<U+0094>è\034<U+009C>uxá<U+009C>â                              1
## \001\016                                                                1
## \001\020\005ó<U+0088>\020                                                       1
## \001\020<U+0088>ab\bý\024<U+0095>qè\020áá<U+0089>                                             1
## \001\024\021<U+0098>àÿ<U+0092>c<U+009B>câ\005\021d                                            1
## \001\024\022<U+0086><U+009B>a<U+009A>j\024èxqfh<U+009B><U+0082>rp<U+0096>\bà\021ü\016jt                           1
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Reds"))   
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ‚fýyþuïwáærã›ôøüccžp˜é”žãqyüaäü€gàûùùfûútûe‡õîûpd could not
## be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : „ås’ìiclçxñõà’l’umâxêóýhaþëûný could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : è•íìãˆe†ûnbr‚e could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : uánášlø›ezçw‹ö“…xhûíôykwíuöåo‹ï—âé‹áœûåäugäò could not be
## fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : soxçk™àöõ—d”phöšúdsoôáìñìxæšèî could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : “wc•™ïû„wmþ could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : òhýätäàaüïpnéòøíuâùîågêlïòönƒïhþyv•áûohxníìúñktmspã could
## not be fit on page. It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ízžqâæßo†cÿÿ—bjxiùâgïêfks€âoö could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ý—ÿbñk†ãjhxšpÿ””‹röa™zb could not be fit on page. It will
## not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : t–ûl”ãwxsqôøëßr‡knu™þìàêwùøû could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : wòóbvùíxnürtqüçêòmb could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : •õ“vd—vb‚u–z‰ù€žú’í could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ntwý‚“”üt䓏dáøžèùàòwgõûé’ëñg could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ënlåaw—sïiøû䈚wœxûvqÿrèwsr—ãë could not be fit on page. It
## will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : wƒœ‡ûceošš„ðnì–zeš˜õáƒ“‚˜fgnié‘é could not be fit on page.
## It will not be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : cseqtyˆdz•e could not be fit on page. It will not be
## plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : tfðò…cùbó–àùpœãä€éñï could not be fit on page. It will not
## be plotted.
## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : ì›øiâò could not be fit on page. It will not be plotted.

``` ## Warning in wordcloud(di\(Word, di\)Frequency, max.words = 100, colors = ## brewer.pal(6, : âžáßn€rriiƒq•bä‘‘’ozví›ñzœtˆ“zduìlnž˜x‹yž–äzžqzjþ