# Text_Mining_Jane_Austenr

#install.packages("tidytext")

# Session setup.
# The working directory below is specific to one Windows machine. An
# unconditional setwd() aborts the whole script anywhere else, so only switch
# when the directory actually exists.
work_dir <- "C:\\Users\\sanpande\\Desktop\\R Files"
if (dir.exists(work_dir)) {
  setwd(work_dir)
}

library(janeaustenr)
library(tm)

## Loading required package: NLP

# Show which document readers tm makes available.
tm::getReaders()

##  [1] "readDOC"                 "readPDF"                
##  [3] "readPlain"               "readRCV1"               
##  [5] "readRCV1asPlain"         "readReut21578XML"       
##  [7] "readReut21578XMLasPlain" "readTabular"            
##  [9] "readTagged"              "readXML"

# Show which source types tm can build a corpus from.
tm::getSources()

## [1] "DataframeSource" "DirSource"       "URISource"       "VectorSource"   
## [5] "XMLSource"       "ZipSource"

# Build the corpus from the novels shipped with the janeaustenr package.
#
# The earlier version pointed URISource() at
# "https://github.com/juliasilge/janeaustenr". That URL is the repository's
# web page, so the single document in the corpus was GitHub's HTML markup
# (top terms such as "span", "div", "ariahiddentrue") and not Austen's text.
#
# NOTE: the `##` console output recorded elsewhere in this file was produced
# from that HTML page; re-running the script will give different terms,
# counts and document labels.
austen <- austen_books()

# One character string per novel; split() follows the factor level order of
# `book`, and the list names carry through as the vector names.
novel_text <- vapply(
  split(austen$text, austen$book),
  paste,
  character(1),
  collapse = " "
)

# VCorpus is requested explicitly so the object class does not depend on what
# the generic Corpus() constructor chooses for a VectorSource.
Corpus1 <- VCorpus(
  VectorSource(novel_text),
  readerControl = list(language = "en")
)

# Label each document with its novel title so matrix rows/columns are readable.
for (i in seq_along(Corpus1)) {
  meta(Corpus1[[i]], "id") <- names(novel_text)[i]
}

inspect(Corpus1)

summary(Corpus1)

#install.packages("SnowballC")
library(SnowballC)

# Text normalisation pipeline.
#
# * Base functions such as tolower() must be wrapped in content_transformer();
#   passing them bare to tm_map() replaces each document with a plain
#   character vector. The earlier work-around of re-wrapping with
#   PlainTextDocument() discarded the document ids, which is why recorded
#   output in this file labels the document "character(0)".
# * Stop words are removed before punctuation is stripped: the English
#   stop-word list contains contractions ("don't", "i'm"), which no longer
#   match once their apostrophes are gone.
# * The stemmer language is passed explicitly so it does not depend on the
#   language tag stored in each document's metadata.
#
# NOTE: `##` output recorded elsewhere in this file predates these changes, so
# document labels and some stems may differ on re-run.
Corpus1 <- tm_map(Corpus1, content_transformer(tolower))
Corpus1 <- tm_map(Corpus1, removeWords, stopwords("english"))
Corpus1 <- tm_map(Corpus1, removePunctuation)
Corpus1 <- tm_map(Corpus1, removeNumbers)
Corpus1 <- tm_map(Corpus1, stemDocument, language = "english")
Corpus1 <- tm_map(Corpus1, stripWhitespace)
# Documents-by-terms count matrix for the cleaned corpus, plus a copy with
# terms above the 0.2 sparsity threshold dropped.
dtm <- tm::DocumentTermMatrix(Corpus1)
dtms <- tm::removeSparseTerms(dtm, sparse = 0.2)

# Peek at the first document's counts for the first 30 terms.
tm::inspect(dtm[1, seq_len(30)])

## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           abbeyem abbeyemp abide acceptcharsetutf access action
##   character(0)       1        1     1                1      1      1
##               Terms
## Docs           actionjuliasilgejaneaustenrsearch agree aliasorangebook
##   character(0)                                 1     1               1
##               Terms
## Docs           also alt altbuild altcranstatusbadge altdoi altjuliasilge
##   character(0)    2  17        1                  1      1             1
##               Terms
## Docs           analysis analyzing announce another anything api
##   character(0)        2         1        1       2        1   1
##               Terms
## Docs           ariadescribedbyfaceboxdescript ariahaspopuptru
##   character(0)                              1               2
##               Terms
## Docs           ariahiddentru ariahiddentrue ariahiddentruesvg arialabel
##   character(0)             2             51                 4         3
##               Terms
## Docs           arialabelclone arialabelclose arialabelcopy
##   character(0)              2              2             1

# Same 30-term peek, this time on the sparsity-filtered matrix.
tm::inspect(dtms[1, seq_len(30)])

## <<DocumentTermMatrix (documents: 1, terms: 30)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## 
##               Terms
## Docs           abbeyem abbeyemp abide acceptcharsetutf access action
##   character(0)       1        1     1                1      1      1
##               Terms
## Docs           actionjuliasilgejaneaustenrsearch agree aliasorangebook
##   character(0)                                 1     1               1
##               Terms
## Docs           also alt altbuild altcranstatusbadge altdoi altjuliasilge
##   character(0)    2  17        1                  1      1             1
##               Terms
## Docs           analysis analyzing announce another anything api
##   character(0)        2         1        1       2        1   1
##               Terms
## Docs           ariadescribedbyfaceboxdescript ariahaspopuptru
##   character(0)                              1               2
##               Terms
## Docs           ariahiddentru ariahiddentrue ariahiddentruesvg arialabel
##   character(0)             2             51                 4         3
##               Terms
## Docs           arialabelclone arialabelclose arialabelcopy
##   character(0)              2              2             1

# Terms-by-documents orientation of the same counts (terms become rows).
tdm <- tm::TermDocumentMatrix(Corpus1)
tdm

## <<TermDocumentMatrix (terms: 1058, documents: 1)>>
## Non-/sparse entries: 1058/0
## Sparsity           : 0%
## Maximal term length: 231
## Weighting          : term frequency (tf)

# First 30 terms for the first document.
tm::inspect(tdm[seq_len(30), 1])

## <<TermDocumentMatrix (terms: 30, documents: 1)>>
## Non-/sparse entries: 30/0
## Sparsity           : 0%
## Maximal term length: 33
## Weighting          : term frequency (tf)
## 
##                                    Docs
## Terms                               character(0)
##   abbeyem                                      1
##   abbeyemp                                     1
##   abide                                        1
##   acceptcharsetutf                             1
##   access                                       1
##   action                                       1
##   actionjuliasilgejaneaustenrsearch            1
##   agree                                        1
##   aliasorangebook                              1
##   also                                         2
##   alt                                         17
##   altbuild                                     1
##   altcranstatusbadge                           1
##   altdoi                                       1
##   altjuliasilge                                1
##   analysis                                     2
##   analyzing                                    1
##   announce                                     1
##   another                                      2
##   anything                                     1
##   api                                          1
##   ariadescribedbyfaceboxdescript               1
##   ariahaspopuptru                              2
##   ariahiddentru                                2
##   ariahiddentrue                              51
##   ariahiddentruesvg                            4
##   arialabel                                    3
##   arialabelclone                               2
##   arialabelclose                               2
##   arialabelcopy                                1

# Turn the term-document matrix into an overall term-frequency table.
# as.matrix() produces a dense terms x documents matrix.
matx1 <- as.matrix(tdm)
matx1[1:10]

##  [1] 1 1 1 1 1 1 1 1 1 2

# Total count per term across all documents, most frequent first.
# TRUE is spelled out because `T` is an ordinary variable that can be
# reassigned.
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
# head() shows up to ten entries without padding with NA when fewer exist.
head(sort1, 10)

##             span              div           height          version 
##              118              106               70               61 
##     classocticon          viewbox   ariahiddentrue classcsstruncate 
##               52               52               51               51 
##             meta        widthpath 
##               51               51

# Word/frequency table; stringsAsFactors is set explicitly so `Word` is a
# character column on every R version (the default changed in R 4.0).
di <- data.frame(
  Word = names(sort1),
  Frequency = sort1,
  stringsAsFactors = FALSE
)
head(di, 10)

##                              Word Frequency
## span                         span       118
## div                           div       106
## height                     height        70
## version                   version        61
## classocticon         classocticon        52
## viewbox                   viewbox        52
## ariahiddentrue     ariahiddentrue        51
## classcsstruncate classcsstruncate        51
## meta                         meta        51
## widthpath               widthpath        51

#install.packages("wordcloud")
library(wordcloud)

## Loading required package: RColorBrewer

# Plot a word cloud of at most 100 terms from `di`, coloured with six shades
# of the RColorBrewer "Reds" palette. The recorded warning below shows that a
# term too long to fit on the page is left out of the plot.
wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Reds"))

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : srchttpsassetscdngithubcomimagesspinnersoctocatspinnergif
## could not be fit on page. It will not be plotted.

# Same words and frequencies again, this time with the "Dark2" palette.
wordcloud(di$Word, di$Frequency, max.words=100,colors=brewer.pal(6, "Dark2"))

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : csstruncatetarget could not be fit on page. It will not be
## plotted.

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : classjsnavigationitem could not be fit on page. It will not
## be plotted.

## Warning in wordcloud(di$Word, di$Frequency, max.words = 100, colors =
## brewer.pal(6, : csstruncatetargettimeago could not be fit on page. It will
## not be plotted.

# Text_Mining_Jane_Austenr_Novels.R

# sanpande

# Tue Oct 25 01:28:36 2016