Text_Mining_Jane_Austenr

library(tm)

## Loading required package: NLP

getReaders()

##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"

getSources()

## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"

warp="https://github.com/juliasilge/janeaustenr/blob/master/data-raw/prep_data.R"
Corpus1=Corpus(URISource(warp), readerControl = list(language = "eng"))
inspect(Corpus1)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 55939

inspect(Corpus1)

## <<VCorpus>>
## Metadata:  corpus specific: 0, document level (indexed): 0
## Content:  documents: 1
## 
## [[1]]
## <<PlainTextDocument>>
## Metadata:  7
## Content:  chars: 55939

summary(Corpus1)

##             Length Class             Mode
## prep_data.R 2      PlainTextDocument list

Corpus1 <- tm_map(Corpus1, removePunctuation)  
Corpus1 <- tm_map(Corpus1, removeNumbers)  
Corpus1 <- tm_map(Corpus1, tolower)
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
#install.packages("SnowballC")
library(SnowballC)
Corpus1 <- tm_map(Corpus1, stemDocument)  
Corpus1 <- tm_map(Corpus1, stripWhitespace)   
Corpus1 <- tm_map(Corpus1, PlainTextDocument)
dtm <- DocumentTermMatrix(Corpus1)
dtms <- removeSparseTerms(dtm, 0.2)
inspect(dtm[1,1:30])

## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           acceptcharsetutf action actionjuliasilgejaneaustenrsearch
##   character(0)                2      2                                 1
##               Terms
## Docs           add alt andspantd another api
##   character(0)   4   1         1       2   1
##               Terms
## Docs           ariadescribedbyfaceboxdescript ariahaspopuptru
##   character(0)                              1               1
##               Terms
## Docs           ariahiddentru ariahiddentrue arialabel arialabelclose
##   character(0)             1             24         3              2
##               Terms
## Docs           arialabelcopy arialabeldismiss arialabelfilter
##   character(0)             1                1               1
##               Terms
## Docs           arialabelhomepage arialabeljump arialabelledbyfaceboxheader
##   character(0)                 2             1                           1
##               Terms
## Docs           arialabelsearch arialabelswitch arialabeltoggle
##   character(0)               1               1               1
##               Terms
## Docs           arialabelyou ariaselectedtrue article asyncasync austens
##   character(0)            5                1       1          1       3
##               Terms
## Docs           autocapitalizeoff autofocus
##   character(0)                 1         1

inspect(dtms[1,1:30])

## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           acceptcharsetutf action actionjuliasilgejaneaustenrsearch
##   character(0)                2      2                                 1
##               Terms
## Docs           add alt andspantd another api
##   character(0)   4   1         1       2   1
##               Terms
## Docs           ariadescribedbyfaceboxdescript ariahaspopuptru
##   character(0)                              1               1
##               Terms
## Docs           ariahiddentru ariahiddentrue arialabel arialabelclose
##   character(0)             1             24         3              2
##               Terms
## Docs           arialabelcopy arialabeldismiss arialabelfilter
##   character(0)             1                1               1
##               Terms
## Docs           arialabelhomepage arialabeljump arialabelledbyfaceboxheader
##   character(0)                 2             1                           1
##               Terms
## Docs           arialabelsearch arialabelswitch arialabeltoggle
##   character(0)               1               1               1
##               Terms
## Docs           arialabelyou ariaselectedtrue article asyncasync austens
##   character(0)            5                1       1          1       3
##               Terms
## Docs           autocapitalizeoff autofocus
##   character(0)                 1         1

tdm <- TermDocumentMatrix(Corpus1)
tdm

## <<TermDocumentMatrix (terms: 779, documents: 1)>>
## Non-/sparse entries: 779/0
## Sparsity           : 0%
## Maximal term length: 230
## Weighting          : term frequency (tf)

inspect(tdm[1:30,1])

## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## 
##                                    Docs
## Terms                               character(0)
##   acceptcharsetutf                             2
##   action                                       2
##   actionjuliasilgejaneaustenrsearch            1
##   add                                          4
##   alt                                          1
##   andspantd                                    1
##   another                                      2
##   api                                          1
##   ariadescribedbyfaceboxdescript               1
##   ariahaspopuptru                              1
##   ariahiddentru                                1
##   ariahiddentrue                              24
##   arialabel                                    3
##   arialabelclose                               2
##   arialabelcopy                                1
##   arialabeldismiss                             1
##   arialabelfilter                              1
##   arialabelhomepage                            2
##   arialabeljump                                1
##   arialabelledbyfaceboxheader                  1
##   arialabelsearch                              1
##   arialabelswitch                              1
##   arialabeltoggle                              1
##   arialabelyou                                 5
##   ariaselectedtrue                             1
##   article                                      1
##   asyncasync                                   1
##   austens                                      3
##   autocapitalizeoff                            1
##   autofocus                                    1

matx1=as.matrix(tdm)
matx1[1:10]

##  [1] 2 2 1 4 1 1 2 1 1 1

sort1=sort(rowSums(matx1),decreasing=T)
sort1[1:10]

##             span              div    blobcodeinner    classblobcode 
##              136               92               72               72 
##     classblobnum datalinenumbertd              idl             idlc 
##               72               72               72               72 
##     jslinenumber   jsfilelinespan 
##               72               53

di=data.frame(Word=names(sort1),Frequency=sort1)
di[1:10,]

##                              Word Frequency
## span                         span       136
## div                           div        92
## blobcodeinner       blobcodeinner        72
## classblobcode       classblobcode        72
## classblobnum         classblobnum        72
## datalinenumbertd datalinenumbertd        72
## idl                           idl        72
## idlc                         idlc        72
## jslinenumber         jslinenumber        72
## jsfilelinespan     jsfilelinespan        53

#install.packages("wordcloud")
library(wordcloud)

## Loading required package: RColorBrewer

wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Reds"))

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : span could not be fit on page. It will not be plotted.

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classplpdsquotspanhttpwwwgutenbergorgcacheepubpgtxtspan
## could not be fit on page. It will not be plotted.

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classplcspanspan could not be fit on page. It will not be
## plotted.

wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Dark2"))

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : jslinenumber could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classblobnum could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : datalinenumbertd could not be fit on page. It will not be
## plotted.

Text_Mining_Jane_Austenr_Novels.R

sanpande

Tue Oct 25 01:49:16 2016