library(tm)
## Loading required package: NLP
getReaders()
## [1] "readDOC" "readPDF"
## [3] "readPlain" "readRCV1"
## [5] "readRCV1asPlain" "readReut21578XML"
## [7] "readReut21578XMLasPlain" "readTabular"
## [9] "readTagged" "readXML"
getSources()
## [1] "DataframeSource" "DirSource" "URISource" "VectorSource"
## [5] "XMLSource" "ZipSource"
warp ="https://en.wikipedia.org/wiki/Main_Page"
Corpus1=Corpus(URISource(warp), readerControl = list(language = "eng"))
inspect(Corpus1)
## <<VCorpus>>
## Metadata: corpus specific: 0, document level (indexed): 0
## Content: documents: 1
##
## [[1]]
## <<PlainTextDocument>>
## Metadata: 7
## Content: chars: 71883
summary(Corpus1)
## Length Class Mode
## Main_Page 2 PlainTextDocument list
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
Corpus1 <- tm_map(Corpus1, tolower)
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)
Corpus1 <- tm_map(Corpus1, stripWhitespace)
Corpus1 <- tm_map(Corpus1, PlainTextDocument)
library(tm)
dtm <- DocumentTermMatrix(Corpus1)
dtms <- removeSparseTerms(dtm, 0.2)
inspect(dtm[1,1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 28
## Weighting : term frequency (tf)
##
## Terms
## Docs â<U+0080><U+0093> abbf abfb absolute accesskeycmain accesskeyeview
## character(0) 12 2 2 1 1 1
## Terms
## Docs accesskeyf accesskeygwikidata accesskeyhview accesskeyjwhat
## character(0) 1 1 1 1
## Terms
## Docs accesskeykrelated accesskeyntalkalili accesskeyolog
## character(0) 1 1 1
## Terms
## Docs accesskeypprintable accesskeyqspecial accesskeyrrecent
## character(0) 1 1 1
## Terms
## Docs accesskeyt accesskeyuupload accesskeyxrandom
## character(0) 1 1 1
## Terms
## Docs accesskeyycontributionsalili accesskeyzmain accolades
## character(0) 1 1 1
## Terms
## Docs account accountalili actionview actionwindexphp activities
## character(0) 1 1 1 1 1
## Terms
## Docs activitiestd additional additionsrecently
## character(0) 1 1 1
inspect(dtms[1,1:30])
## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 28
## Weighting : term frequency (tf)
##
## Terms
## Docs â<U+0080><U+0093> abbf abfb absolute accesskeycmain accesskeyeview
## character(0) 12 2 2 1 1 1
## Terms
## Docs accesskeyf accesskeygwikidata accesskeyhview accesskeyjwhat
## character(0) 1 1 1 1
## Terms
## Docs accesskeykrelated accesskeyntalkalili accesskeyolog
## character(0) 1 1 1
## Terms
## Docs accesskeypprintable accesskeyqspecial accesskeyrrecent
## character(0) 1 1 1
## Terms
## Docs accesskeyt accesskeyuupload accesskeyxrandom
## character(0) 1 1 1
## Terms
## Docs accesskeyycontributionsalili accesskeyzmain accolades
## character(0) 1 1 1
## Terms
## Docs account accountalili actionview actionwindexphp activities
## character(0) 1 1 1 1 1
## Terms
## Docs activitiestd additional additionsrecently
## character(0) 1 1 1
tdm <- TermDocumentMatrix(Corpus1)
tdm
## <<TermDocumentMatrix (terms: 2027, documents: 1)>>
## Non-/sparse entries: 2027/0
## Sparsity : 0%
## Maximal term length: 1973
## Weighting : term frequency (tf)
inspect(tdm[1:30,1])
## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity : 0%
## Maximal term length: 28
## Weighting : term frequency (tf)
##
## Docs
## Terms character(0)
## â<U+0080><U+0093> 12
## abbf 2
## abfb 2
## absolute 1
## accesskeycmain 1
## accesskeyeview 1
## accesskeyf 1
## accesskeygwikidata 1
## accesskeyhview 1
## accesskeyjwhat 1
## accesskeykrelated 1
## accesskeyntalkalili 1
## accesskeyolog 1
## accesskeypprintable 1
## accesskeyqspecial 1
## accesskeyrrecent 1
## accesskeyt 1
## accesskeyuupload 1
## accesskeyxrandom 1
## accesskeyycontributionsalili 1
## accesskeyzmain 1
## accolades 1
## account 1
## accountalili 1
## actionview 1
## actionwindexphp 1
## activities 1
## activitiestd 1
## additional 1
## additionsrecently 1
matx1=as.matrix(tdm)
matx1[1:10]
## [1] 12 2 2 1 1 1 1 1 1 1
sort1=sort(rowSums(matx1),decreasing=T)
sort1[1:10]
## div lia text
## 158 71 58
## classexternal classinterlanguagelink classautonym
## 57 45 44
## width liba height
## 22 20 19
## datafileheight
## 17
di=data.frame(Word=names(sort1),Frequency=sort1)
di[1:10,]
## Word Frequency
## div div 158
## lia lia 71
## text text 58
## classexternal classexternal 57
## classinterlanguagelink classinterlanguagelink 45
## classautonym classautonym 44
## width width 22
## liba liba 20
## height height 19
## datafileheight datafileheight 17
#install.packages("wordcloud")
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Reds"))

wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Dark2"))
