library(tm)
## Loading required package: NLP
# List the document reader functions that tm makes available.
getReaders()
##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"
# List the source types a corpus can be built from.
getSources() 
## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"
# Fetch a single web page by URL and wrap it in a one-document corpus.
# NOTE(review): the page appears to be read as-is, i.e. as raw HTML -- the
# recorded term lists further down are dominated by markup tokens. Confirm
# whether tags should be stripped before tokenising.
warp <- "https://en.wikipedia.org/wiki/Main_Page"
Corpus1 <- Corpus(
  URISource(warp),
  readerControl = list(language = "eng")
)

# Print the corpus overview and the per-document header.
inspect(Corpus1)
## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 71883
# One row per document: length, class and mode of each corpus element.
summary(Corpus1)
##           Length Class             Mode
## Main_Page 2      PlainTextDocument list
# Clean the corpus text before building the term matrices.
#
# tm_map() expects transformations that return documents. Base functions such
# as tolower() return a bare character vector, so they must be wrapped in
# content_transformer(); that keeps every element a PlainTextDocument with its
# metadata (including the document id) intact. With the wrapper in place the
# former trailing tm_map(Corpus1, PlainTextDocument) re-wrap is no longer
# needed -- it rebuilt the documents without their metadata, which is why the
# recorded output below labels the document "character(0)".
# NOTE: the `##` output recorded further down was produced before this fix.
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
# Stop words are lower-case, so they are removed only after lower-casing.
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)
# Collapse the runs of blanks left behind by the removals above.
Corpus1 <- tm_map(Corpus1, stripWhitespace)
library(tm)
# Document-term matrix: documents as rows, terms as columns.
dtm <- DocumentTermMatrix(Corpus1)

# Filtered copy of the matrix with a sparsity threshold of 0.2.
# NOTE(review): the corpus holds a single document and the recorded output for
# `dtms` matches `dtm`, so this filter appears to remove nothing here.
dtms <- removeSparseTerms(dtm, sparse = 0.2)

# First 30 terms of the first document.
inspect(dtm[1, seq_len(30)])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 28
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           â<U+0080><U+0093> abbf abfb absolute accesskeycmain accesskeyeview
##   character(0)  12    2    2        1              1              1
##               Terms
## Docs           accesskeyf accesskeygwikidata accesskeyhview accesskeyjwhat
##   character(0)          1                  1              1              1
##               Terms
## Docs           accesskeykrelated accesskeyntalkalili accesskeyolog
##   character(0)                 1                   1             1
##               Terms
## Docs           accesskeypprintable accesskeyqspecial accesskeyrrecent
##   character(0)                   1                 1                1
##               Terms
## Docs           accesskeyt accesskeyuupload accesskeyxrandom
##   character(0)          1                1                1
##               Terms
## Docs           accesskeyycontributionsalili accesskeyzmain accolades
##   character(0)                            1              1         1
##               Terms
## Docs           account accountalili actionview actionwindexphp activities
##   character(0)       1            1          1               1          1
##               Terms
## Docs           activitiestd additional additionsrecently
##   character(0)            1          1                 1
# Same 30-term view for the sparse-filtered matrix; the recorded output is
# identical to the unfiltered view above.
inspect(dtms[1,1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 28
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           â<U+0080><U+0093> abbf abfb absolute accesskeycmain accesskeyeview
##   character(0)  12    2    2        1              1              1
##               Terms
## Docs           accesskeyf accesskeygwikidata accesskeyhview accesskeyjwhat
##   character(0)          1                  1              1              1
##               Terms
## Docs           accesskeykrelated accesskeyntalkalili accesskeyolog
##   character(0)                 1                   1             1
##               Terms
## Docs           accesskeypprintable accesskeyqspecial accesskeyrrecent
##   character(0)                   1                 1                1
##               Terms
## Docs           accesskeyt accesskeyuupload accesskeyxrandom
##   character(0)          1                1                1
##               Terms
## Docs           accesskeyycontributionsalili accesskeyzmain accolades
##   character(0)                            1              1         1
##               Terms
## Docs           account accountalili actionview actionwindexphp activities
##   character(0)       1            1          1               1          1
##               Terms
## Docs           activitiestd additional additionsrecently
##   character(0)            1          1                 1
# Term-document matrix of the same corpus: terms as rows, documents as columns.
tdm <- TermDocumentMatrix(Corpus1)
# Print the matrix summary (dimensions, sparsity, weighting).
tdm
## <<TermDocumentMatrix (terms: 2027, documents: 1)>>
## Non-/sparse entries: 2027/0
## Sparsity           : 0%
## Maximal term length: 1973
## Weighting          : term frequency (tf)
# First 30 terms (rows) for the first document (column).
inspect(tdm[1:30,1])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 28
## Weighting          : term frequency (tf)
## 
##                               Docs
## Terms                          character(0)
##   â<U+0080><U+0093>                                    12
##   abbf                                    2
##   abfb                                    2
##   absolute                                1
##   accesskeycmain                          1
##   accesskeyeview                          1
##   accesskeyf                              1
##   accesskeygwikidata                      1
##   accesskeyhview                          1
##   accesskeyjwhat                          1
##   accesskeykrelated                       1
##   accesskeyntalkalili                     1
##   accesskeyolog                           1
##   accesskeypprintable                     1
##   accesskeyqspecial                       1
##   accesskeyrrecent                        1
##   accesskeyt                              1
##   accesskeyuupload                        1
##   accesskeyxrandom                        1
##   accesskeyycontributionsalili            1
##   accesskeyzmain                          1
##   accolades                               1
##   account                                 1
##   accountalili                            1
##   actionview                              1
##   actionwindexphp                         1
##   activities                              1
##   activitiestd                            1
##   additional                              1
##   additionsrecently                       1
# Dense copy of the term-document matrix so base matrix functions can be used.
matx1 <- as.matrix(tdm)

# Peek at the first ten counts (linear indexing into the matrix).
matx1[seq_len(10)]
##  [1] 12  2  2  1  1  1  1  1  1  1
# Total count per term (row sums), ordered from most to least frequent.
# TRUE is spelled out because T is an ordinary variable that can be
# reassigned, whereas TRUE is a reserved word.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)

# Ten most frequent terms.
sort1[1:10]
##                    div                    lia                   text 
##                    158                     71                     58 
##          classexternal classinterlanguagelink           classautonym 
##                     57                     45                     44 
##                  width                   liba                 height 
##                     22                     20                     19 
##         datafileheight 
##                     17
# Frequency table in the order of `sort1`: each term next to its total count.
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1
)

# First ten rows of the table.
di[seq_len(10), ]
##                                          Word Frequency
## div                                       div       158
## lia                                       lia        71
## text                                     text        58
## classexternal                   classexternal        57
## classinterlanguagelink classinterlanguagelink        45
## classautonym                     classautonym        44
## width                                   width        22
## liba                                     liba        20
## height                                 height        19
## datafileheight                 datafileheight        17
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
# Draw the word cloud twice with identical settings (at most 100 words),
# first with the "Reds" palette and then with "Dark2".
invisible(lapply(c("Reds", "Dark2"), function(palette_name) {
  wordcloud(
    words = di$Word,
    freq = di$Frequency,
    max.words = 100,
    colors = brewer.pal(6, palette_name)
  )
}))