##install.packages("janeaustenr")
## install.packages("syuzhet")
## Learning of data mining and Jane Austen package 
## This is not intended to be an original work . But mainly for academic learning 
## Sources
## https://github.com/juliasilge/janeaustenr
## http://juliasilge.com/blog/If-I-Loved-NLP-Less/


##install.packages("viridis")

## Clear the workspace and check memory usage
ls()
## character(0)
rm(list=ls())
gc()
##          used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 362019 19.4     592000 31.7   460000 24.6
## Vcells 563062  4.3    1308461 10.0   858836  6.6
## memory.size() and memory.limit() are Windows-only
memory.size()
## [1] 30.18
memory.limit()
## [1] 4027
## The texts have been processed a bit to remove the Project Gutenberg
## headers and footers, blank lines, NA lines, etc.; a quick check of this
## appears below, once the packages are loaded.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(stringr)
library(syuzhet)
library(ggplot2)
library(viridis)
library(tm)
## Loading required package: NLP
## 
## Attaching package: 'NLP'
## The following object is masked from 'package:ggplot2':
## 
##     annotate
library(janeaustenr)

## A first look at the basic data set, e.g. the novel Persuasion
##persuasion

##str(persuasion)
##summary(persuasion)
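
## A minimal sanity check (sketch) that the Project Gutenberg boilerplate
## and NA lines really are gone from the package texts:
sum(grepl("Gutenberg", persuasion))   ## expected to be 0
sum(is.na(persuasion))                ## expected to be 0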

## Read the data from the novels 

##data(sensesensibility)
##data(prideprejudice)
##data(mansfieldpark)
##data(emma)
##data(northangerabbey)
##data(persuasion)


## Another way to read Jane Austen's novels

##install.packages("stylo")
##library(stylo)
##data(novels)

##abooks <- austen_books()

##str(abooks)

##austen_books() %>% group_by(book) %>%summarise(total_lines = n())

## Analysing the novel Emma

## DataframeSource() in current tm versions expects `doc_id` and `text`
## columns, so VectorSource() is the simpler way to treat each line of the
## novel as one document
bkemma <- Corpus(VectorSource(emma), readerControl = list(language = "eng"))
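
## Each element of the `emma` character vector (one printed line of the
## novel) becomes one document in the corpus; a quick sketch to confirm:
length(bkemma) == length(emma)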

## Clean and normalise the content of the novel

bkemma <- tm_map(bkemma, removePunctuation)  

bkemma <- tm_map(bkemma, removeNumbers)  

## wrap tolower in content_transformer() so the documents keep their
## TextDocument class in current tm versions
bkemma <- tm_map(bkemma, content_transformer(tolower))

bkemma <- tm_map(bkemma, removeWords, stopwords("english"))

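## An extended custom stopword list (split across two vectors),
## supplementing tm's built-in English stopwords above.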
newstopwords1 <- c("a", "about", "above", "across", "after", "again", "against", "all", "almost", "alone", "along", "already", "also", "although", "always", "am", "among", "an", "and", "another", "any", "anybody", "anyone", "anything", "anywhere", "are", "area", "areas", "arent", "around", "as", "ask", "asked", "asking", "asks", "at", "away", "b", "back", "backed", "backing", "backs", "be", "became", "because", "become", "becomes", "been", "before", "began", "behind", "being", "beings", "below", "best", "better", "between", "big", "both", "but", "by", "c", "came", "can", "cannot", "cant", "case", "cases", "certain", "certainly", "clear", "clearly", "come", "could", "couldnt", "d", "did", "didnt", "differ", "different", "differently", "do", "does", "doesnt", "doing", "done", "dont", "down", "downed", "downing", "downs", "during", "e", "each", "early", "either", "end", "ended", "ending", "ends", "enough", "even", "evenly", "ever", "every", "everybody", "everyone", "everything", "everywhere", "f", "face", "faces", "fact", "facts", "far", "felt", "few", "find", "finds", "first", "for", "four", "from", "full", "fully", "further", "furthered", "furthering", "furthers", "g", "gave", "general", "generally", "get", "gets", "give", "given", "gives", "go", "going", "good", "goods", "got", "great", "greater", "greatest", "group", "grouped", "grouping", "groups", "h", "had", "hadnt", "has", "hasnt", "have", "havent", "having", "he", "hed", "hell", "her", "here", "heres", "hers", "herself", "hes", "high", "higher", "highest", "him", "himself", "his", "how", "however", "hows", "i", "id", "if", "ill", "im", "important", "in", "interest", "interested", "interesting", "interests", "into", "is", "isnt", "it", "its", "its", "itself", "ive", "j", "just", "k", "keep", "keeps", "kind", "knew", "know", "known", "knows", "l", "large", "largely", "last", "later", "latest", "least", "less", "let", "lets", "lets", "like", "likely", "long", "longer", "longest", "m", "made", "make", "making", "man", "many", "may", "me", "member", "members", "men", "might", "more", "most", "mostly", "mr", "mrs", "much", "must", "mustnt", "my", "myself", "n", "necessary", "need", "needed", "needing", "needs", "never", "new", "newer", "newest", "next", "no", "nobody", "non", "noone", "nor", "not", "nothing", "now", "nowhere", "number", "numbers", "o", "of", "off", "often", "old", "older", "oldest", "on", "once", "one", "only", "open", "opened", "opening", "opens", "or", "order", "ordered", "ordering", "orders", "other", "others", "ought", "our", "ours", "ourselves", "out", "over", "own", "p", "part", "parted", "parting", "parts", "per", "perhaps", "place", "places", "point", "pointed", "pointing", "points", "possible", "present", "presented", "presenting", "presents", "problem", "problems", "put", "puts", "q", "quite", "r", "rather", "really", "right", "room", "rooms", "s", "said", "same", "saw", "say", "says", "second", "seconds", "see", "seem", "seemed", "seeming", "seems", "sees", "several", "shall", "shant", "she", "shed", "shell", "shes", "should", "shouldnt", "show", "showed", "showing", "shows", "side", "sides", "since", "small", "smaller", "smallest", "so", "some", "somebody", "someone", "something", "somewhere", "state", "states", "still", "such", "sure", "t", "take", "taken", "than", "that", "thats", "the", "their", "theirs", "them", "themselves", "then", "there", "therefore", "theres", "these", "they", "theyd", "theyll", "theyre", "theyve", "thing", "things", "think", "thinks", "this", "those", "though", "thought", "thoughts", 
"three", "through", "thus", "to", "today", "together", "too", "took", "toward", "turn", "turned", "turning", "turns", "two", "u", "under", "until", "up", "upon", "us", "use", "used", "uses", "v", "very", "w", "want", "wanted", "wanting", "wants", "was", "wasnt", "way", "ways", "we", "wed", "well", "well", "wells", "went", "were", "were", "werent", "weve", "what", "whats", "when", "whens", "where", "wheres", "whether", "which", "while", "who", "whole", "whom", "whos", "whose", "why", "whys", "will", "with", "within", "without", "wont", "work")
newstopwords2 <- c("worked", "working", "works", "would", "wouldnt", "x", "y", "year", "years", "yes", "yet", "you", "youd", "youll", "young", "younger", "youngest", "your", "youre", "yours", "yourself", "yourselves", "youve", "z")


bkemma <- tm_map(bkemma, removeWords, newstopwords1)
bkemma <- tm_map(bkemma, removeWords, newstopwords2)

bkemma <- tm_map(bkemma, stemDocument)  ## stemming is done via the SnowballC package

bkemma <- tm_map(bkemma, stripWhitespace)   

## the old tm_map(bkemma, PlainTextDocument) workaround is not needed once
## content_transformer() is used for tolower above
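
## Further cleaning steps can be written as custom content_transformer()s.
## A minimal sketch (illustrative only: `toSpace` and `bkemma2` are not part
## of the original analysis), applied to a copy so the counts below stay
## unaffected -- replace any chosen regular expression with a space:
toSpace <- content_transformer(function(x, pattern) gsub(pattern, " ", x))
bkemma2 <- tm_map(bkemma, toSpace, "\\bvolum\\b")  ## e.g. drop the stemmed token "volum"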

##library(quanteda)
##dfm(bkemma , keptFeatures = c("emma" , "will*" , "knight*") , verbose = FALSE)

##install.packages("quanteda")

##inspect(bkemma) 
##summary(bkemma)

rm(newstopwords1, newstopwords2)
library(SnowballC)

dtm <- DocumentTermMatrix(bkemma)
##inspect(dtm[1, 1:10])
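
## A lighter alternative to inspect() for a quick look (sketch):
## list every term that occurs at least 200 times across the novel.
findFreqTerms(dtm, lowfreq = 200)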


## Term Document Matrix Calculations
tdm <- TermDocumentMatrix(bkemma)
##inspect(tdm[1:10 , 1])
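
## Sketch: the TDM is simply the transpose of the DTM, so the dimensions swap.
dim(dtm)  ## documents x terms
dim(tdm)  ## terms x documents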

matx1 <- as.matrix(tdm)
matx1[1:10]
##  [1] 0 0 0 0 0 0 0 0 0 0
sort1 <- sort(rowSums(matx1), decreasing = TRUE)
sort1[1:10]
##      emma      miss   harriet    weston knightley    little     elton 
##       751       583       396       381       338       324       320 
##      time      jane woodhouse 
##       250       242       240
di <- data.frame(Word = names(sort1), Frequency = sort1)
di[1:10, ]
##                Word Frequency
## emma           emma       751
## miss           miss       583
## harriet     harriet       396
## weston       weston       381
## knightley knightley       338
## little       little       324
## elton         elton       320
## time           time       250
## jane           jane       242
## woodhouse woodhouse       240
library(wordcloud)
## Loading required package: RColorBrewer
wordcloud(di$Word, di$Frequency, max.words = 100, colors = brewer.pal(6, "Dark2"))
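
## The same frequencies can also be plotted with the already-loaded ggplot2
## and viridis packages; a sketch of a top-10 bar chart (not in the original):
ggplot(di[1:10, ], aes(x = reorder(Word, Frequency), y = Frequency, fill = Frequency)) +
  geom_col() +
  scale_fill_viridis() +
  coord_flip() +
  labs(x = NULL, y = "Frequency", title = "Top 10 terms in Emma")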

## removeSparseTerms: `sparse` is the maximum fraction of documents a term
## may be missing from. The original value of 0.1 keeps only terms present
## in at least 90% of the line-level documents, which leaves essentially
## nothing at this granularity; 0.999 (an editorial assumption) still drops
## the rarest terms while keeping the common vocabulary.

dtms <- removeSparseTerms(dtm, 0.999)
##inspect(dtms)


## Recompute the frequencies from the sparsity-pruned matrix. (The original
## version reused matx1 here, so dtms was never actually used and the output
## simply repeated the table above.) dtms is a document-term matrix, so
## terms are columns and colSums() is the right aggregation.
matx2 <- as.matrix(dtms)
sort2 <- sort(colSums(matx2), decreasing = TRUE)
sort2[1:10]
di1 <- data.frame(Word = names(sort2), Frequency = sort2)
di1[1:10, ]
wordcloud(di1$Word, di1$Frequency, max.words = 100, colors = brewer.pal(6, "Dark2"))
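
## Sketch: confirm what the sparsity pruning removed by comparing the
## vocabulary sizes of the full and reduced matrices.
length(Terms(dtm))
length(Terms(dtms))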