Katie_Hanks

#Load the Libraries
library(wordcloud)

## Loading required package: RColorBrewer

library(tm)

## Loading required package: NLP

library(slam)
library(quanteda)

## Package version: 1.5.1

## Parallel computing: 2 of 4 threads used.

## See https://quanteda.io for tutorials and examples.

## 
## Attaching package: 'quanteda'

## The following objects are masked from 'package:tm':
## 
##     as.DocumentTermMatrix, stopwords

## The following object is masked from 'package:utils':
## 
##     View

library(SnowballC)
library(arules)

## Loading required package: Matrix

## 
## Attaching package: 'arules'

## The following object is masked from 'package:quanteda':
## 
##     affinity

## The following object is masked from 'package:tm':
## 
##     inspect

## The following objects are masked from 'package:base':
## 
##     abbreviate, write

library(proxy)

## 
## Attaching package: 'proxy'

## The following object is masked from 'package:Matrix':
## 
##     as.matrix

## The following object is masked from 'package:quanteda':
## 
##     as.matrix

## The following objects are masked from 'package:stats':
## 
##     as.dist, dist

## The following object is masked from 'package:base':
## 
##     as.matrix

library(cluster)
library(stringi)
library(Matrix)
library(tidytext)
library(plyr)
library(ggplot2)

## 
## Attaching package: 'ggplot2'

## The following object is masked from 'package:NLP':
## 
##     annotate

library(factoextra)

## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ

library(mclust)

## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:plyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize

## The following objects are masked from 'package:arules':
## 
##     intersect, recode, setdiff, setequal, union

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(corpus)
library(rpart)
library(rpart.plot)
library(rattle)

## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.

#Load the Data
setwd("C:/Users/katie/Desktop")
FedPapersCorpus <- Corpus(DirSource("FedPapersCorpus"))
(numberFedPapers<-length(FedPapersCorpus))

## [1] 85

summary(FedPapersCorpus)

##                     Length Class             Mode
## dispt_fed_49.txt    2      PlainTextDocument list
## dispt_fed_50.txt    2      PlainTextDocument list
## dispt_fed_51.txt    2      PlainTextDocument list
## dispt_fed_52.txt    2      PlainTextDocument list
## dispt_fed_53.txt    2      PlainTextDocument list
## dispt_fed_54.txt    2      PlainTextDocument list
## dispt_fed_55.txt    2      PlainTextDocument list
## dispt_fed_56.txt    2      PlainTextDocument list
## dispt_fed_57.txt    2      PlainTextDocument list
## dispt_fed_62.txt    2      PlainTextDocument list
## dispt_fed_63.txt    2      PlainTextDocument list
## Hamilton_fed_1.txt  2      PlainTextDocument list
## Hamilton_fed_11.txt 2      PlainTextDocument list
## Hamilton_fed_12.txt 2      PlainTextDocument list
## Hamilton_fed_13.txt 2      PlainTextDocument list
## Hamilton_fed_15.txt 2      PlainTextDocument list
## Hamilton_fed_16.txt 2      PlainTextDocument list
## Hamilton_fed_17.txt 2      PlainTextDocument list
## Hamilton_fed_21.txt 2      PlainTextDocument list
## Hamilton_fed_22.txt 2      PlainTextDocument list
## Hamilton_fed_23.txt 2      PlainTextDocument list
## Hamilton_fed_24.txt 2      PlainTextDocument list
## Hamilton_fed_25.txt 2      PlainTextDocument list
## Hamilton_fed_26.txt 2      PlainTextDocument list
## Hamilton_fed_27.txt 2      PlainTextDocument list
## Hamilton_fed_28.txt 2      PlainTextDocument list
## Hamilton_fed_29.txt 2      PlainTextDocument list
## Hamilton_fed_30.txt 2      PlainTextDocument list
## Hamilton_fed_31.txt 2      PlainTextDocument list
## Hamilton_fed_32.txt 2      PlainTextDocument list
## Hamilton_fed_33.txt 2      PlainTextDocument list
## Hamilton_fed_34.txt 2      PlainTextDocument list
## Hamilton_fed_35.txt 2      PlainTextDocument list
## Hamilton_fed_36.txt 2      PlainTextDocument list
## Hamilton_fed_59.txt 2      PlainTextDocument list
## Hamilton_fed_6.txt  2      PlainTextDocument list
## Hamilton_fed_60.txt 2      PlainTextDocument list
## Hamilton_fed_61.txt 2      PlainTextDocument list
## Hamilton_fed_65.txt 2      PlainTextDocument list
## Hamilton_fed_66.txt 2      PlainTextDocument list
## Hamilton_fed_67.txt 2      PlainTextDocument list
## Hamilton_fed_68.txt 2      PlainTextDocument list
## Hamilton_fed_69.txt 2      PlainTextDocument list
## Hamilton_fed_7.txt  2      PlainTextDocument list
## Hamilton_fed_70.txt 2      PlainTextDocument list
## Hamilton_fed_71.txt 2      PlainTextDocument list
## Hamilton_fed_72.txt 2      PlainTextDocument list
## Hamilton_fed_73.txt 2      PlainTextDocument list
## Hamilton_fed_74.txt 2      PlainTextDocument list
## Hamilton_fed_75.txt 2      PlainTextDocument list
## Hamilton_fed_76.txt 2      PlainTextDocument list
## Hamilton_fed_77.txt 2      PlainTextDocument list
## Hamilton_fed_78.txt 2      PlainTextDocument list
## Hamilton_fed_79.txt 2      PlainTextDocument list
## Hamilton_fed_8.txt  2      PlainTextDocument list
## Hamilton_fed_80.txt 2      PlainTextDocument list
## Hamilton_fed_81.txt 2      PlainTextDocument list
## Hamilton_fed_82.txt 2      PlainTextDocument list
## Hamilton_fed_83.txt 2      PlainTextDocument list
## Hamilton_fed_84.txt 2      PlainTextDocument list
## Hamilton_fed_85.txt 2      PlainTextDocument list
## Hamilton_fed_9.txt  2      PlainTextDocument list
## HM_fed_18.txt       2      PlainTextDocument list
## HM_fed_19.txt       2      PlainTextDocument list
## HM_fed_20.txt       2      PlainTextDocument list
## Jay_fed_2.txt       2      PlainTextDocument list
## Jay_fed_3.txt       2      PlainTextDocument list
## Jay_fed_4.txt       2      PlainTextDocument list
## Jay_fed_5.txt       2      PlainTextDocument list
## Jay_fed_64.txt      2      PlainTextDocument list
## Madison_fed_10.txt  2      PlainTextDocument list
## Madison_fed_14.txt  2      PlainTextDocument list
## Madison_fed_37.txt  2      PlainTextDocument list
## Madison_fed_38.txt  2      PlainTextDocument list
## Madison_fed_39.txt  2      PlainTextDocument list
## Madison_fed_40.txt  2      PlainTextDocument list
## Madison_fed_41.txt  2      PlainTextDocument list
## Madison_fed_42.txt  2      PlainTextDocument list
## Madison_fed_43.txt  2      PlainTextDocument list
## Madison_fed_44.txt  2      PlainTextDocument list
## Madison_fed_45.txt  2      PlainTextDocument list
## Madison_fed_46.txt  2      PlainTextDocument list
## Madison_fed_47.txt  2      PlainTextDocument list
## Madison_fed_48.txt  2      PlainTextDocument list
## Madison_fed_58.txt  2      PlainTextDocument list

#Create the DTM
(getTransformations())

## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

(nFedPapersCorpus<-length(FedPapersCorpus))

## [1] 85

(minTermFreq <-30)

## [1] 30

(maxTermFreq <-1000)

## [1] 1000

(MyStopwords <- c("will","one","two", "may","less","publius","Madison","Alexand", "Alexander", "James", "Hamilton", "hamilton", "Jay", "well","might","without","small", "single", "several", "but", "very", "can", "must", "also", "any", "and", "are", "however", "into", "almost", "can","for", "add", "Author", "author", "alexand", "alexander", "jame", "james" ))

##  [1] "will"      "one"       "two"       "may"       "less"     
##  [6] "publius"   "Madison"   "Alexand"   "Alexander" "James"    
## [11] "Hamilton"  "hamilton"  "Jay"       "well"      "might"    
## [16] "without"   "small"     "single"    "several"   "but"      
## [21] "very"      "can"       "must"      "also"      "any"      
## [26] "and"       "are"       "however"   "into"      "almost"   
## [31] "can"       "for"       "add"       "Author"    "author"   
## [36] "alexand"   "alexander" "jame"      "james"

(STOPS <-stopwords('english'))

##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"       "will"

Papers_DTM <- DocumentTermMatrix(FedPapersCorpus,
                      control = list(
                      stopwords = TRUE,
                      wordLengths=c(3, 15),
                      removePunctuation = T,
                      removeNumbers = T,
                      tolower=T,
                      stemming = T,
                      remove_separators = T,
                      stopwords = MyStopwords,
                      removeWords=STOPS,
                      removeWords=MyStopwords,
                      bounds = list(global = c(minTermFreq, maxTermFreq))
))

DTM <- as.matrix(Papers_DTM)

#Vectorizing
WordFreq <- colSums(as.matrix(Papers_DTM))
(head(WordFreq))

##       abl   absolut    accord       act     addit administr 
##        74        63        71       139        61        90

(length(WordFreq))

## [1] 427

ord <- order(WordFreq)
(WordFreq[head(ord)])

##    jame   expos furnish    word  unless   bound 
##      30      34      36      36      37      38

(WordFreq[tail(ord)])

## constitut       may     power    govern      will     state 
##       686       811       937      1040      1263      1662

(Row_Sum_Per_doc <- rowSums((as.matrix(Papers_DTM))))

##    dispt_fed_49.txt    dispt_fed_50.txt    dispt_fed_51.txt 
##                 514                 338                 658 
##    dispt_fed_52.txt    dispt_fed_53.txt    dispt_fed_54.txt 
##                 565                 701                 582 
##    dispt_fed_55.txt    dispt_fed_56.txt    dispt_fed_57.txt 
##                 647                 553                 613 
##    dispt_fed_62.txt    dispt_fed_63.txt  Hamilton_fed_1.txt 
##                 698                 955                 483 
## Hamilton_fed_11.txt Hamilton_fed_12.txt Hamilton_fed_13.txt 
##                 564                 539                 318 
## Hamilton_fed_15.txt Hamilton_fed_16.txt Hamilton_fed_17.txt 
##                 815                 558                 477 
## Hamilton_fed_21.txt Hamilton_fed_22.txt Hamilton_fed_23.txt 
##                 537                 985                 560 
## Hamilton_fed_24.txt Hamilton_fed_25.txt Hamilton_fed_26.txt 
##                 519                 570                 670 
## Hamilton_fed_27.txt Hamilton_fed_28.txt Hamilton_fed_29.txt 
##                 466                 507                 541 
## Hamilton_fed_30.txt Hamilton_fed_31.txt Hamilton_fed_32.txt 
##                 585                 510                 442 
## Hamilton_fed_33.txt Hamilton_fed_34.txt Hamilton_fed_35.txt 
##                 522                 618                 663 
## Hamilton_fed_36.txt Hamilton_fed_59.txt  Hamilton_fed_6.txt 
##                 824                 603                 461 
## Hamilton_fed_60.txt Hamilton_fed_61.txt Hamilton_fed_65.txt 
##                 657                 444                 560 
## Hamilton_fed_66.txt Hamilton_fed_67.txt Hamilton_fed_68.txt 
##                 646                 443                 449 
## Hamilton_fed_69.txt  Hamilton_fed_7.txt Hamilton_fed_70.txt 
##                 811                 580                 852 
## Hamilton_fed_71.txt Hamilton_fed_72.txt Hamilton_fed_73.txt 
##                 473                 539                 696 
## Hamilton_fed_74.txt Hamilton_fed_75.txt Hamilton_fed_76.txt 
##                 282                 597                 594 
## Hamilton_fed_77.txt Hamilton_fed_78.txt Hamilton_fed_79.txt 
##                 586                 891                 301 
##  Hamilton_fed_8.txt Hamilton_fed_80.txt Hamilton_fed_81.txt 
##                 533                 771                1188 
## Hamilton_fed_82.txt Hamilton_fed_83.txt Hamilton_fed_84.txt 
##                 504                1598                1255 
## Hamilton_fed_85.txt  Hamilton_fed_9.txt       HM_fed_18.txt 
##                 773                 520                 443 
##       HM_fed_19.txt       HM_fed_20.txt       Jay_fed_2.txt 
##                 466                 395                 477 
##       Jay_fed_3.txt       Jay_fed_4.txt       Jay_fed_5.txt 
##                 515                 463                 401 
##      Jay_fed_64.txt  Madison_fed_10.txt  Madison_fed_14.txt 
##                 692                 884                 553 
##  Madison_fed_37.txt  Madison_fed_38.txt  Madison_fed_39.txt 
##                 723                 874                 859 
##  Madison_fed_40.txt  Madison_fed_41.txt  Madison_fed_42.txt 
##                 857                1020                 800 
##  Madison_fed_43.txt  Madison_fed_44.txt  Madison_fed_45.txt 
##                 993                 927                 724 
##  Madison_fed_46.txt  Madison_fed_47.txt  Madison_fed_48.txt 
##                 832                 925                 565 
##  Madison_fed_58.txt 
##                 655

Papers_M <- as.matrix(Papers_DTM)
Papers_M_N1 <- apply(Papers_M, 1, function(i) round(i/sum(i),3))
Papers_Matrix_Norm <- t(Papers_M_N1)
Papers_dtm_matrix = as.matrix(Papers_DTM)


Papers_DF <- as.data.frame(as.matrix(Papers_Matrix_Norm))
#remove Jays papers
Papers_DF<-Papers_DF[-66:-70,]

Papers_DF1<- Papers_DF%>%add_rownames()

## Warning: Deprecated, use tibble::rownames_to_column() instead.

names(Papers_DF1)[1]<-"Author"
Papers_DF1[1:11,1]="dispt"
Papers_DF1[12:65,1]="hamil"
Papers_DF1[66:80,1]="madis"
head(Papers_DF1)

##Word Clouds
DisputedPapersWC<- wordcloud(colnames(Papers_dtm_matrix), Papers_dtm_matrix[11,], rot.per = .35, colors = brewer.pal(5, "Set1"))

(head(sort(as.matrix(Papers_dtm_matrix)[11,], decreasing = TRUE), n=50))

##      peopl      senat       will        may     repres     govern 
##         42         24         19         18         18         16 
##       bodi        can      elect       must     measur      state 
##         15         14         14         12         11         11 
##     nation        one  constitut     former      power     reason 
##          9          9          8          8          8          8 
##       year    assembl     exampl        two     danger      everi 
##          8          7          7          7          6          6 
##       evid      feder     import     latter     object particular 
##          6          6          6          6          6          6 
##     public   advantag     answer     appear     author    charact 
##          6          5          5          5          5          5 
##       fact      first       hous   institut       less       mani 
##          5          5          5          5          5          5 
##     member      might       oper      order       part    popular 
##          5          5          5          5          5          5 
##    probabl      small 
##          5          5

HamiltonPapersWC <-wordcloud(colnames(Papers_dtm_matrix),Papers_dtm_matrix[50:53,], rot.per = .35, colors = brewer.pal(5, "Set1"))

(head(sort(as.matrix(Papers_dtm_matrix)[11,], decreasing = TRUE), n=50))

##      peopl      senat       will        may     repres     govern 
##         42         24         19         18         18         16 
##       bodi        can      elect       must     measur      state 
##         15         14         14         12         11         11 
##     nation        one  constitut     former      power     reason 
##          9          9          8          8          8          8 
##       year    assembl     exampl        two     danger      everi 
##          8          7          7          7          6          6 
##       evid      feder     import     latter     object particular 
##          6          6          6          6          6          6 
##     public   advantag     answer     appear     author    charact 
##          6          5          5          5          5          5 
##       fact      first       hous   institut       less       mani 
##          5          5          5          5          5          5 
##     member      might       oper      order       part    popular 
##          5          5          5          5          5          5 
##    probabl      small 
##          5          5

MadisonPapersHW <-wordcloud(colnames(Papers_dtm_matrix), Papers_dtm_matrix[63:66,], rot.per = .35, colors = brewer.pal(5, "Set1"))

(head(sort(as.matrix(Papers_dtm_matrix)[11,], decreasing = TRUE), n=50))

##      peopl      senat       will        may     repres     govern 
##         42         24         19         18         18         16 
##       bodi        can      elect       must     measur      state 
##         15         14         14         12         11         11 
##     nation        one  constitut     former      power     reason 
##          9          9          8          8          8          8 
##       year    assembl     exampl        two     danger      everi 
##          8          7          7          7          6          6 
##       evid      feder     import     latter     object particular 
##          6          6          6          6          6          6 
##     public   advantag     answer     appear     author    charact 
##          6          5          5          5          5          5 
##       fact      first       hous   institut       less       mani 
##          5          5          5          5          5          5 
##     member      might       oper      order       part    popular 
##          5          5          5          5          5          5 
##    probabl      small 
##          5          5

##Make Train and Test sets
trainRatio <- .60
set.seed(11) # Set Seed so that same sample can be reproduced in future also
sample <- sample.int(n = nrow(Papers_DF1), size = floor(trainRatio*nrow(Papers_DF1)), replace = FALSE)
train <- Papers_DF1[sample, ]
test <- Papers_DF1[-sample, ]
# train / test ratio
length(sample)/nrow(Papers_DF1)

## [1] 0.6

##Decision Tree Models
#Train Tree Model 1
train_tree1 <- rpart(Author ~ ., data = train, method="class", control=rpart.control(cp=0))
summary(train_tree1)

## Call:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0))
##   n= 48 
## 
##          CP nsplit rel error    xerror      xstd
## 1 0.6190476      0 1.0000000 1.0000000 0.1636634
## 2 0.3809524      1 0.3809524 0.3809524 0.1229519
## 3 0.0000000      2 0.0000000 0.3333333 0.1164397
## 
## Variable importance
##  alexand hamilton     jame     upon   matter     kind    appli    elect 
##       17       17       15       14       10        8        5        5 
##     mani  absolut 
##        5        4 
## 
## Node number 1: 48 observations,    complexity param=0.6190476
##   predicted class=hamil  expected loss=0.4375  P(node) =1
##     class counts:     8    27    13
##    probabilities: 0.167 0.562 0.271 
##   left son=2 (27 obs) right son=3 (21 obs)
##   Primary splits:
##       jame     < 5e-04  to the left,  improve=18.053570, (0 missing)
##       upon     < 0.0055 to the right, improve=16.594700, (0 missing)
##       alexand  < 5e-04  to the right, improve=15.615480, (0 missing)
##       hamilton < 5e-04  to the right, improve=15.615480, (0 missing)
##       matter   < 5e-04  to the right, improve= 9.291667, (0 missing)
##   Surrogate splits:
##       upon     < 0.004  to the right, agree=0.979, adj=0.952, (0 split)
##       matter   < 5e-04  to the right, agree=0.854, adj=0.667, (0 split)
##       alexand  < 5e-04  to the right, agree=0.833, adj=0.619, (0 split)
##       hamilton < 5e-04  to the right, agree=0.833, adj=0.619, (0 split)
##       kind     < 5e-04  to the right, agree=0.812, adj=0.571, (0 split)
## 
## Node number 2: 27 observations
##   predicted class=hamil  expected loss=0  P(node) =0.5625
##     class counts:     0    27     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 3: 21 observations,    complexity param=0.3809524
##   predicted class=madis  expected loss=0.3809524  P(node) =0.4375
##     class counts:     8     0    13
##    probabilities: 0.381 0.000 0.619 
##   left son=6 (8 obs) right son=7 (13 obs)
##   Primary splits:
##       alexand  < 5e-04  to the right, improve=9.904762, (0 missing)
##       hamilton < 5e-04  to the right, improve=9.904762, (0 missing)
##       mani     < 0.0025 to the right, improve=4.960317, (0 missing)
##       appli    < 5e-04  to the right, improve=4.761905, (0 missing)
##       branch   < 0.0015 to the right, improve=4.571429, (0 missing)
##   Surrogate splits:
##       hamilton < 5e-04  to the right, agree=1.000, adj=1.000, (0 split)
##       appli    < 5e-04  to the right, agree=0.857, adj=0.625, (0 split)
##       elect    < 0.0045 to the right, agree=0.857, adj=0.625, (0 split)
##       mani     < 0.0025 to the right, agree=0.857, adj=0.625, (0 split)
##       absolut  < 5e-04  to the left,  agree=0.810, adj=0.500, (0 split)
## 
## Node number 6: 8 observations
##   predicted class=dispt  expected loss=0  P(node) =0.1666667
##     class counts:     8     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 7: 13 observations
##   predicted class=madis  expected loss=0  P(node) =0.2708333
##     class counts:     0     0    13
##    probabilities: 0.000 0.000 1.000

#predict the test dataset using the model for train tree No. 1
predicted1= predict(train_tree1, test, type="class")
#plot number of splits
rsq.rpart(train_tree1)

## 
## Classification tree:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0))
## 
## Variables actually used in tree construction:
## [1] alexand jame   
## 
## Root node error: 21/48 = 0.4375
## 
## n= 48 
## 
##        CP nsplit rel error  xerror    xstd
## 1 0.61905      0   1.00000 1.00000 0.16366
## 2 0.38095      1   0.38095 0.38095 0.12295
## 3 0.00000      2   0.00000 0.33333 0.11644

## Warning in rsq.rpart(train_tree1): may not be applicable for this method

#plot the decision tree
fancyRpartPlot(train_tree1)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted1, true=test$Author)

##           true
## Authorship dispt hamil madis
##      dispt     3     4     0
##      hamil     0    23     0
##      madis     0     0     2

#Train Tree Model 2
train_tree2 <- rpart(Author ~ ., data = train, method="class", control=rpart.control(cp=0, minsplit = 2, maxdepth = 5))
summary(train_tree2)

## Call:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0, 
##     minsplit = 2, maxdepth = 5))
##   n= 48 
## 
##          CP nsplit rel error    xerror      xstd
## 1 0.6190476      0 1.0000000 1.0000000 0.1636634
## 2 0.3809524      1 0.3809524 0.3809524 0.1229519
## 3 0.0000000      2 0.0000000 0.0000000 0.0000000
## 
## Variable importance
##  alexand hamilton     jame     upon   matter     kind    appli    elect 
##       17       17       15       14       10        8        5        5 
##     mani  absolut 
##        5        4 
## 
## Node number 1: 48 observations,    complexity param=0.6190476
##   predicted class=hamil  expected loss=0.4375  P(node) =1
##     class counts:     8    27    13
##    probabilities: 0.167 0.562 0.271 
##   left son=2 (27 obs) right son=3 (21 obs)
##   Primary splits:
##       jame     < 5e-04  to the left,  improve=18.053570, (0 missing)
##       upon     < 0.0055 to the right, improve=16.594700, (0 missing)
##       alexand  < 5e-04  to the right, improve=15.615480, (0 missing)
##       hamilton < 5e-04  to the right, improve=15.615480, (0 missing)
##       matter   < 5e-04  to the right, improve= 9.291667, (0 missing)
##   Surrogate splits:
##       upon     < 0.004  to the right, agree=0.979, adj=0.952, (0 split)
##       matter   < 5e-04  to the right, agree=0.854, adj=0.667, (0 split)
##       alexand  < 5e-04  to the right, agree=0.833, adj=0.619, (0 split)
##       hamilton < 5e-04  to the right, agree=0.833, adj=0.619, (0 split)
##       kind     < 5e-04  to the right, agree=0.812, adj=0.571, (0 split)
## 
## Node number 2: 27 observations
##   predicted class=hamil  expected loss=0  P(node) =0.5625
##     class counts:     0    27     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 3: 21 observations,    complexity param=0.3809524
##   predicted class=madis  expected loss=0.3809524  P(node) =0.4375
##     class counts:     8     0    13
##    probabilities: 0.381 0.000 0.619 
##   left son=6 (8 obs) right son=7 (13 obs)
##   Primary splits:
##       alexand  < 5e-04  to the right, improve=9.904762, (0 missing)
##       hamilton < 5e-04  to the right, improve=9.904762, (0 missing)
##       elect    < 0.0045 to the right, improve=5.029762, (0 missing)
##       mani     < 0.0025 to the right, improve=4.960317, (0 missing)
##       appli    < 5e-04  to the right, improve=4.761905, (0 missing)
##   Surrogate splits:
##       hamilton < 5e-04  to the right, agree=1.000, adj=1.000, (0 split)
##       appli    < 5e-04  to the right, agree=0.857, adj=0.625, (0 split)
##       elect    < 0.0045 to the right, agree=0.857, adj=0.625, (0 split)
##       mani     < 0.0025 to the right, agree=0.857, adj=0.625, (0 split)
##       absolut  < 5e-04  to the left,  agree=0.810, adj=0.500, (0 split)
## 
## Node number 6: 8 observations
##   predicted class=dispt  expected loss=0  P(node) =0.1666667
##     class counts:     8     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 7: 13 observations
##   predicted class=madis  expected loss=0  P(node) =0.2708333
##     class counts:     0     0    13
##    probabilities: 0.000 0.000 1.000

#predict the test dataset using the model for train tree No. 1
predicted2= predict(train_tree2, test, type="class")
#plot number of splits
rsq.rpart(train_tree2)

## 
## Classification tree:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0, 
##     minsplit = 2, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] alexand jame   
## 
## Root node error: 21/48 = 0.4375
## 
## n= 48 
## 
##        CP nsplit rel error  xerror    xstd
## 1 0.61905      0   1.00000 1.00000 0.16366
## 2 0.38095      1   0.38095 0.38095 0.12295
## 3 0.00000      2   0.00000 0.00000 0.00000

## Warning in rsq.rpart(train_tree2): may not be applicable for this method

plotcp(train_tree2)

#plot the decision tree
fancyRpartPlot(train_tree2)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted2, true=test$Author)

##           true
## Authorship dispt hamil madis
##      dispt     3     4     0
##      hamil     0    23     0
##      madis     0     0     2

#redo the DT with words taken out

FedPapersCorpus2 <- Corpus(DirSource("FedPapersCorpus"))
(numberFedPapers<-length(FedPapersCorpus2))

## [1] 85

summary(FedPapersCorpus2)

##                     Length Class             Mode
## dispt_fed_49.txt    2      PlainTextDocument list
## dispt_fed_50.txt    2      PlainTextDocument list
## dispt_fed_51.txt    2      PlainTextDocument list
## dispt_fed_52.txt    2      PlainTextDocument list
## dispt_fed_53.txt    2      PlainTextDocument list
## dispt_fed_54.txt    2      PlainTextDocument list
## dispt_fed_55.txt    2      PlainTextDocument list
## dispt_fed_56.txt    2      PlainTextDocument list
## dispt_fed_57.txt    2      PlainTextDocument list
## dispt_fed_62.txt    2      PlainTextDocument list
## dispt_fed_63.txt    2      PlainTextDocument list
## Hamilton_fed_1.txt  2      PlainTextDocument list
## Hamilton_fed_11.txt 2      PlainTextDocument list
## Hamilton_fed_12.txt 2      PlainTextDocument list
## Hamilton_fed_13.txt 2      PlainTextDocument list
## Hamilton_fed_15.txt 2      PlainTextDocument list
## Hamilton_fed_16.txt 2      PlainTextDocument list
## Hamilton_fed_17.txt 2      PlainTextDocument list
## Hamilton_fed_21.txt 2      PlainTextDocument list
## Hamilton_fed_22.txt 2      PlainTextDocument list
## Hamilton_fed_23.txt 2      PlainTextDocument list
## Hamilton_fed_24.txt 2      PlainTextDocument list
## Hamilton_fed_25.txt 2      PlainTextDocument list
## Hamilton_fed_26.txt 2      PlainTextDocument list
## Hamilton_fed_27.txt 2      PlainTextDocument list
## Hamilton_fed_28.txt 2      PlainTextDocument list
## Hamilton_fed_29.txt 2      PlainTextDocument list
## Hamilton_fed_30.txt 2      PlainTextDocument list
## Hamilton_fed_31.txt 2      PlainTextDocument list
## Hamilton_fed_32.txt 2      PlainTextDocument list
## Hamilton_fed_33.txt 2      PlainTextDocument list
## Hamilton_fed_34.txt 2      PlainTextDocument list
## Hamilton_fed_35.txt 2      PlainTextDocument list
## Hamilton_fed_36.txt 2      PlainTextDocument list
## Hamilton_fed_59.txt 2      PlainTextDocument list
## Hamilton_fed_6.txt  2      PlainTextDocument list
## Hamilton_fed_60.txt 2      PlainTextDocument list
## Hamilton_fed_61.txt 2      PlainTextDocument list
## Hamilton_fed_65.txt 2      PlainTextDocument list
## Hamilton_fed_66.txt 2      PlainTextDocument list
## Hamilton_fed_67.txt 2      PlainTextDocument list
## Hamilton_fed_68.txt 2      PlainTextDocument list
## Hamilton_fed_69.txt 2      PlainTextDocument list
## Hamilton_fed_7.txt  2      PlainTextDocument list
## Hamilton_fed_70.txt 2      PlainTextDocument list
## Hamilton_fed_71.txt 2      PlainTextDocument list
## Hamilton_fed_72.txt 2      PlainTextDocument list
## Hamilton_fed_73.txt 2      PlainTextDocument list
## Hamilton_fed_74.txt 2      PlainTextDocument list
## Hamilton_fed_75.txt 2      PlainTextDocument list
## Hamilton_fed_76.txt 2      PlainTextDocument list
## Hamilton_fed_77.txt 2      PlainTextDocument list
## Hamilton_fed_78.txt 2      PlainTextDocument list
## Hamilton_fed_79.txt 2      PlainTextDocument list
## Hamilton_fed_8.txt  2      PlainTextDocument list
## Hamilton_fed_80.txt 2      PlainTextDocument list
## Hamilton_fed_81.txt 2      PlainTextDocument list
## Hamilton_fed_82.txt 2      PlainTextDocument list
## Hamilton_fed_83.txt 2      PlainTextDocument list
## Hamilton_fed_84.txt 2      PlainTextDocument list
## Hamilton_fed_85.txt 2      PlainTextDocument list
## Hamilton_fed_9.txt  2      PlainTextDocument list
## HM_fed_18.txt       2      PlainTextDocument list
## HM_fed_19.txt       2      PlainTextDocument list
## HM_fed_20.txt       2      PlainTextDocument list
## Jay_fed_2.txt       2      PlainTextDocument list
## Jay_fed_3.txt       2      PlainTextDocument list
## Jay_fed_4.txt       2      PlainTextDocument list
## Jay_fed_5.txt       2      PlainTextDocument list
## Jay_fed_64.txt      2      PlainTextDocument list
## Madison_fed_10.txt  2      PlainTextDocument list
## Madison_fed_14.txt  2      PlainTextDocument list
## Madison_fed_37.txt  2      PlainTextDocument list
## Madison_fed_38.txt  2      PlainTextDocument list
## Madison_fed_39.txt  2      PlainTextDocument list
## Madison_fed_40.txt  2      PlainTextDocument list
## Madison_fed_41.txt  2      PlainTextDocument list
## Madison_fed_42.txt  2      PlainTextDocument list
## Madison_fed_43.txt  2      PlainTextDocument list
## Madison_fed_44.txt  2      PlainTextDocument list
## Madison_fed_45.txt  2      PlainTextDocument list
## Madison_fed_46.txt  2      PlainTextDocument list
## Madison_fed_47.txt  2      PlainTextDocument list
## Madison_fed_48.txt  2      PlainTextDocument list
## Madison_fed_58.txt  2      PlainTextDocument list

(getTransformations())

## [1] "removeNumbers"     "removePunctuation" "removeWords"      
## [4] "stemDocument"      "stripWhitespace"

(nFedPapersCorpus2<-length(FedPapersCorpus2))

## [1] 85

(minTermFreq <-30)

## [1] 30

(maxTermFreq <-1000)

## [1] 1000

(MyStopwords2 <- c("will","one","two", "may","less","publius","Madison","Alexand", "alexand", "james", "madison", "jay", "hamilton", "jame", "author", "Alexander", "James", "Hamilton","Jay", "well","might","without","small", "single", "several", "but", "very", "can", "must", "also", "any", "and", "are", "however", "into", "almost", "can","for", "add", "Author", "alexander", "people", "peoples" , "author", "authors", "member", "latter", "members", "alexand", "james" ))

##  [1] "will"      "one"       "two"       "may"       "less"     
##  [6] "publius"   "Madison"   "Alexand"   "alexand"   "james"    
## [11] "madison"   "jay"       "hamilton"  "jame"      "author"   
## [16] "Alexander" "James"     "Hamilton"  "Jay"       "well"     
## [21] "might"     "without"   "small"     "single"    "several"  
## [26] "but"       "very"      "can"       "must"      "also"     
## [31] "any"       "and"       "are"       "however"   "into"     
## [36] "almost"    "can"       "for"       "add"       "Author"   
## [41] "alexander" "people"    "peoples"   "author"    "authors"  
## [46] "member"    "latter"    "members"   "alexand"   "james"

(STOPS <-stopwords('english'))

##   [1] "i"          "me"         "my"         "myself"     "we"        
##   [6] "our"        "ours"       "ourselves"  "you"        "your"      
##  [11] "yours"      "yourself"   "yourselves" "he"         "him"       
##  [16] "his"        "himself"    "she"        "her"        "hers"      
##  [21] "herself"    "it"         "its"        "itself"     "they"      
##  [26] "them"       "their"      "theirs"     "themselves" "what"      
##  [31] "which"      "who"        "whom"       "this"       "that"      
##  [36] "these"      "those"      "am"         "is"         "are"       
##  [41] "was"        "were"       "be"         "been"       "being"     
##  [46] "have"       "has"        "had"        "having"     "do"        
##  [51] "does"       "did"        "doing"      "would"      "should"    
##  [56] "could"      "ought"      "i'm"        "you're"     "he's"      
##  [61] "she's"      "it's"       "we're"      "they're"    "i've"      
##  [66] "you've"     "we've"      "they've"    "i'd"        "you'd"     
##  [71] "he'd"       "she'd"      "we'd"       "they'd"     "i'll"      
##  [76] "you'll"     "he'll"      "she'll"     "we'll"      "they'll"   
##  [81] "isn't"      "aren't"     "wasn't"     "weren't"    "hasn't"    
##  [86] "haven't"    "hadn't"     "doesn't"    "don't"      "didn't"    
##  [91] "won't"      "wouldn't"   "shan't"     "shouldn't"  "can't"     
##  [96] "cannot"     "couldn't"   "mustn't"    "let's"      "that's"    
## [101] "who's"      "what's"     "here's"     "there's"    "when's"    
## [106] "where's"    "why's"      "how's"      "a"          "an"        
## [111] "the"        "and"        "but"        "if"         "or"        
## [116] "because"    "as"         "until"      "while"      "of"        
## [121] "at"         "by"         "for"        "with"       "about"     
## [126] "against"    "between"    "into"       "through"    "during"    
## [131] "before"     "after"      "above"      "below"      "to"        
## [136] "from"       "up"         "down"       "in"         "out"       
## [141] "on"         "off"        "over"       "under"      "again"     
## [146] "further"    "then"       "once"       "here"       "there"     
## [151] "when"       "where"      "why"        "how"        "all"       
## [156] "any"        "both"       "each"       "few"        "more"      
## [161] "most"       "other"      "some"       "such"       "no"        
## [166] "nor"        "not"        "only"       "own"        "same"      
## [171] "so"         "than"       "too"        "very"       "will"

FedPapersCorpus2<- tm_map(FedPapersCorpus2, tolower)
FedPapersCorpus2<- tm_map(FedPapersCorpus2, removeWords, MyStopwords)
FedPapersCorpus2<- tm_map(FedPapersCorpus2, removeWords, c("author", "latter", "members", "constitution", "communiti", "communities", "long", "act", "alexander", "alexand", "james", "jame", "madison", "hamil", "hamilton"))

Papers_DTM2 <- DocumentTermMatrix(FedPapersCorpus2,
                                 control = list(
                                   stopwords = TRUE,
                                   wordLengths=c(3, 15),
                                   removePunctuation = T,
                                   removeNumbers = T,
                                   tolower=T,
                                   stemming = T,
                                   remove_separators = T,
                                   stopwords = MyStopwords2,
                                   removeWords=STOPS,
                                   bounds = list(global = c(minTermFreq, maxTermFreq))
                                 ))

DTM2 <- as.matrix(Papers_DTM2)
(DTM[12:65,1])

##  Hamilton_fed_1.txt Hamilton_fed_11.txt Hamilton_fed_12.txt 
##                   1                   4                   2 
## Hamilton_fed_13.txt Hamilton_fed_15.txt Hamilton_fed_16.txt 
##                   1                   0                   2 
## Hamilton_fed_17.txt Hamilton_fed_21.txt Hamilton_fed_22.txt 
##                   2                   0                   3 
## Hamilton_fed_23.txt Hamilton_fed_24.txt Hamilton_fed_25.txt 
##                   0                   1                   1 
## Hamilton_fed_26.txt Hamilton_fed_27.txt Hamilton_fed_28.txt 
##                   1                   2                   2 
## Hamilton_fed_29.txt Hamilton_fed_30.txt Hamilton_fed_31.txt 
##                   0                   2                   1 
## Hamilton_fed_32.txt Hamilton_fed_33.txt Hamilton_fed_34.txt 
##                   0                   0                   1 
## Hamilton_fed_35.txt Hamilton_fed_36.txt Hamilton_fed_59.txt 
##                   1                   1                   0 
##  Hamilton_fed_6.txt Hamilton_fed_60.txt Hamilton_fed_61.txt 
##                   0                   0                   0 
## Hamilton_fed_65.txt Hamilton_fed_66.txt Hamilton_fed_67.txt 
##                   0                   0                   1 
## Hamilton_fed_68.txt Hamilton_fed_69.txt  Hamilton_fed_7.txt 
##                   1                   0                   2 
## Hamilton_fed_70.txt Hamilton_fed_71.txt Hamilton_fed_72.txt 
##                   1                   2                   0 
## Hamilton_fed_73.txt Hamilton_fed_74.txt Hamilton_fed_75.txt 
##                   0                   0                   1 
## Hamilton_fed_76.txt Hamilton_fed_77.txt Hamilton_fed_78.txt 
##                   0                   0                   1 
## Hamilton_fed_79.txt  Hamilton_fed_8.txt Hamilton_fed_80.txt 
##                   0                   2                   0 
## Hamilton_fed_81.txt Hamilton_fed_82.txt Hamilton_fed_83.txt 
##                   0                   0                   0 
## Hamilton_fed_84.txt Hamilton_fed_85.txt  Hamilton_fed_9.txt 
##                   0                   1                   3 
##       HM_fed_18.txt       HM_fed_19.txt       HM_fed_20.txt 
##                   0                   0                   0

#Vectorizing
WordFreq2 <- colSums(as.matrix(Papers_DTM2))
(head(WordFreq2))

##       abl   absolut    accord       act     addit administr 
##        74        63        71        58        61        90

(length(WordFreq2))

## [1] 406

ord2 <- order(WordFreq2)
(WordFreq2[head(ord2)])

##    expos  furnish     word   unless    bound descript 
##       34       36       36       37       38       38

(WordFreq2[tail(ord2)])

## author nation  peopl  power govern  state 
##    390    566    612    937   1040   1662

(Row_Sum_Per_doc <- rowSums((as.matrix(Papers_DTM2))))

##    dispt_fed_49.txt    dispt_fed_50.txt    dispt_fed_51.txt 
##                 458                 286                 554 
##    dispt_fed_52.txt    dispt_fed_53.txt    dispt_fed_54.txt 
##                 500                 598                 508 
##    dispt_fed_55.txt    dispt_fed_56.txt    dispt_fed_57.txt 
##                 554                 482                 529 
##    dispt_fed_62.txt    dispt_fed_63.txt  Hamilton_fed_1.txt 
##                 595                 821                 413 
## Hamilton_fed_11.txt Hamilton_fed_12.txt Hamilton_fed_13.txt 
##                 498                 475                 272 
## Hamilton_fed_15.txt Hamilton_fed_16.txt Hamilton_fed_17.txt 
##                 729                 506                 441 
## Hamilton_fed_21.txt Hamilton_fed_22.txt Hamilton_fed_23.txt 
##                 482                 878                 501 
## Hamilton_fed_24.txt Hamilton_fed_25.txt Hamilton_fed_26.txt 
##                 455                 510                 608 
## Hamilton_fed_27.txt Hamilton_fed_28.txt Hamilton_fed_29.txt 
##                 388                 445                 496 
## Hamilton_fed_30.txt Hamilton_fed_31.txt Hamilton_fed_32.txt 
##                 510                 457                 408 
## Hamilton_fed_33.txt Hamilton_fed_34.txt Hamilton_fed_35.txt 
##                 468                 544                 597 
## Hamilton_fed_36.txt Hamilton_fed_59.txt  Hamilton_fed_6.txt 
##                 715                 521                 420 
## Hamilton_fed_60.txt Hamilton_fed_61.txt Hamilton_fed_65.txt 
##                 566                 375                 486 
## Hamilton_fed_66.txt Hamilton_fed_67.txt Hamilton_fed_68.txt 
##                 559                 401                 390 
## Hamilton_fed_69.txt  Hamilton_fed_7.txt Hamilton_fed_70.txt 
##                 712                 542                 753 
## Hamilton_fed_71.txt Hamilton_fed_72.txt Hamilton_fed_73.txt 
##                 413                 485                 610 
## Hamilton_fed_74.txt Hamilton_fed_75.txt Hamilton_fed_76.txt 
##                 247                 536                 523 
## Hamilton_fed_77.txt Hamilton_fed_78.txt Hamilton_fed_79.txt 
##                 525                 762                 259 
##  Hamilton_fed_8.txt Hamilton_fed_80.txt Hamilton_fed_81.txt 
##                 474                 694                1059 
## Hamilton_fed_82.txt Hamilton_fed_83.txt Hamilton_fed_84.txt 
##                 448                1450                1086 
## Hamilton_fed_85.txt  Hamilton_fed_9.txt       HM_fed_18.txt 
##                 662                 454                 395 
##       HM_fed_19.txt       HM_fed_20.txt       Jay_fed_2.txt 
##                 419                 348                 439 
##       Jay_fed_3.txt       Jay_fed_4.txt       Jay_fed_5.txt 
##                 449                 398                 361 
##      Jay_fed_64.txt  Madison_fed_10.txt  Madison_fed_14.txt 
##                 604                 767                 472 
##  Madison_fed_37.txt  Madison_fed_38.txt  Madison_fed_39.txt 
##                 619                 764                 767 
##  Madison_fed_40.txt  Madison_fed_41.txt  Madison_fed_42.txt 
##                 773                 886                 716 
##  Madison_fed_43.txt  Madison_fed_44.txt  Madison_fed_45.txt 
##                 851                 826                 631 
##  Madison_fed_46.txt  Madison_fed_47.txt  Madison_fed_48.txt 
##                 718                 804                 496 
##  Madison_fed_58.txt 
##                 549

Papers_M2 <- as.matrix(Papers_DTM2)
Papers_M_N12 <- apply(Papers_M2, 1, function(i) round(i/sum(i),3))
Papers_Matrix_Norm2 <- t(Papers_M_N12)
Papers_dtm_matrix = as.matrix(Papers_DTM2)


Papers_DF2 <- as.data.frame(as.matrix(Papers_Matrix_Norm2))
#remove Jays papers
Papers_DF2<-Papers_DF2[-66:-70,]

Papers_DF3<- Papers_DF2%>%add_rownames()

## Warning: Deprecated, use tibble::rownames_to_column() instead.

names(Papers_DF3)[1]<-"Author"
Papers_DF3[1:11,1]="dispt"
Papers_DF3[12:65,1]="hamil"
Papers_DF3[66:80,1]="madis"

##Make Train and Test sets
trainRatio <- .75
set.seed(11) # Set Seed so that same sample can be reproduced in future also
sample2 <- sample.int(n = nrow(Papers_DF3), size = floor(trainRatio*nrow(Papers_DF3)), replace = FALSE)
train2 <- Papers_DF3[sample2, ]
test2 <- Papers_DF3[-sample2, ]
# train / test ratio
length(sample2)/nrow(Papers_DF3)

## [1] 0.75

##Decision Tree Models
#Train Tree Model 3
train_tree3 <- rpart(Author ~ ., data = train2, method="class", control=rpart.control(cp=0))
summary(train_tree3)

## Call:
## rpart(formula = Author ~ ., data = train2, method = "class", 
##     control = rpart.control(cp = 0))
##   n= 60 
## 
##          CP nsplit rel error    xerror      xstd
## 1 0.4545455      0 1.0000000 1.0000000 0.1696699
## 2 0.2727273      1 0.5454545 0.6818182 0.1524592
## 3 0.0000000      2 0.2727273 0.5454545 0.1408358
## 
## Variable importance
##        upon      matter        kind     assembl       among    maintain 
##          21          11          10           9           9           9 
##       union      branch    confeder confederaci   establish      legisl 
##           7           6           5           5           5           4 
## 
## Node number 1: 60 observations,    complexity param=0.4545455
##   predicted class=hamil  expected loss=0.3666667  P(node) =1
##     class counts:     9    38    13
##    probabilities: 0.150 0.633 0.217 
##   left son=2 (35 obs) right son=3 (25 obs)
##   Primary splits:
##       upon   < 0.0055 to the right, improve=17.126670, (0 missing)
##       matter < 5e-04  to the right, improve= 8.925732, (0 missing)
##       kind   < 0.0015 to the right, improve= 7.103810, (0 missing)
##       repres < 0.0115 to the right, improve= 7.035897, (0 missing)
##       thing  < 0.0015 to the right, improve= 6.822222, (0 missing)
##   Surrogate splits:
##       matter   < 5e-04  to the right, agree=0.800, adj=0.52, (0 split)
##       kind     < 5e-04  to the right, agree=0.783, adj=0.48, (0 split)
##       assembl  < 0.0025 to the left,  agree=0.767, adj=0.44, (0 split)
##       among    < 0.0035 to the left,  agree=0.750, adj=0.40, (0 split)
##       maintain < 0.0015 to the left,  agree=0.750, adj=0.40, (0 split)
## 
## Node number 2: 35 observations
##   predicted class=hamil  expected loss=0  P(node) =0.5833333
##     class counts:     0    35     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 3: 25 observations,    complexity param=0.2727273
##   predicted class=madis  expected loss=0.48  P(node) =0.4166667
##     class counts:     9     3    13
##    probabilities: 0.360 0.120 0.520 
##   left son=6 (12 obs) right son=7 (13 obs)
##   Primary splits:
##       union  < 0.0035 to the left,  improve=5.524615, (0 missing)
##       mani   < 0.0035 to the right, improve=4.806667, (0 missing)
##       appli  < 5e-04  to the right, improve=4.440000, (0 missing)
##       men    < 0.0015 to the right, improve=4.404706, (0 missing)
##       branch < 0.003  to the right, improve=4.133506, (0 missing)
##   Surrogate splits:
##       branch      < 0.003  to the right, agree=0.96, adj=0.917, (0 split)
##       confeder    < 0.0015 to the left,  agree=0.84, adj=0.667, (0 split)
##       confederaci < 5e-04  to the left,  agree=0.84, adj=0.667, (0 split)
##       establish   < 0.003  to the left,  agree=0.84, adj=0.667, (0 split)
##       legisl      < 0.0045 to the right, agree=0.80, adj=0.583, (0 split)
## 
## Node number 6: 12 observations
##   predicted class=dispt  expected loss=0.25  P(node) =0.2
##     class counts:     9     0     3
##    probabilities: 0.750 0.000 0.250 
## 
## Node number 7: 13 observations
##   predicted class=madis  expected loss=0.2307692  P(node) =0.2166667
##     class counts:     0     3    10
##    probabilities: 0.000 0.231 0.769

#predict the test dataset using the model for train tree No. 1
predicted3= predict(train_tree3, test2, type="class")
(Results3 <- data.frame(Predicted=predicted3,Actual=test2$Author))

#plot number of splits
rsq.rpart(train_tree3)

## 
## Classification tree:
## rpart(formula = Author ~ ., data = train2, method = "class", 
##     control = rpart.control(cp = 0))
## 
## Variables actually used in tree construction:
## [1] union upon 
## 
## Root node error: 22/60 = 0.36667
## 
## n= 60 
## 
##        CP nsplit rel error  xerror    xstd
## 1 0.45455      0   1.00000 1.00000 0.16967
## 2 0.27273      1   0.54545 0.68182 0.15246
## 3 0.00000      2   0.27273 0.54545 0.14084

## Warning in rsq.rpart(train_tree3): may not be applicable for this method

plotcp(train_tree3)

fancyRpartPlot(train_tree3)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted3, true=test2$Author)

##           true
## Authorship dispt hamil madis
##      dispt     2     0     1
##      hamil     0    13     0
##      madis     0     3     1

#attributed hamilton with disputed

#Train Tree Model 4
train_tree4 <- rpart(Author ~ ., data = train2, method="class", control=rpart.control(cp=0, minsplit = 2, maxdepth = 5))
summary(train_tree4)

## Call:
## rpart(formula = Author ~ ., data = train2, method = "class", 
##     control = rpart.control(cp = 0, minsplit = 2, maxdepth = 5))
##   n= 60 
## 
##           CP nsplit  rel error    xerror      xstd
## 1 0.45454545      0 1.00000000 1.0000000 0.1696699
## 2 0.27272727      1 0.54545455 0.6363636 0.1489171
## 3 0.09090909      2 0.27272727 0.6818182 0.1524592
## 4 0.04545455      4 0.09090909 0.9090909 0.1659765
## 5 0.00000000      6 0.00000000 0.8636364 0.1637836
## 
## Variable importance
##        upon      matter        kind     assembl       among    maintain 
##          15           8           7           7           6           6 
##       union      branch    confeder confederaci   establish      legisl 
##           5           5           3           3           3           3 
##       appli     america     continu      design       elect       everi 
##           3           3           2           2           2           2 
##        full       exist         man        seem         act      answer 
##           2           2           2           2           1           1 
##   administr      affair 
##           1           1 
## 
## Node number 1: 60 observations,    complexity param=0.4545455
##   predicted class=hamil  expected loss=0.3666667  P(node) =1
##     class counts:     9    38    13
##    probabilities: 0.150 0.633 0.217 
##   left son=2 (35 obs) right son=3 (25 obs)
##   Primary splits:
##       upon   < 0.0055 to the right, improve=17.126670, (0 missing)
##       matter < 5e-04  to the right, improve= 8.925732, (0 missing)
##       kind   < 0.0015 to the right, improve= 7.103810, (0 missing)
##       repres < 0.0115 to the right, improve= 7.035897, (0 missing)
##       thing  < 0.0015 to the right, improve= 6.822222, (0 missing)
##   Surrogate splits:
##       matter   < 5e-04  to the right, agree=0.800, adj=0.52, (0 split)
##       kind     < 5e-04  to the right, agree=0.783, adj=0.48, (0 split)
##       assembl  < 0.0025 to the left,  agree=0.767, adj=0.44, (0 split)
##       among    < 0.0035 to the left,  agree=0.750, adj=0.40, (0 split)
##       maintain < 0.0015 to the left,  agree=0.750, adj=0.40, (0 split)
## 
## Node number 2: 35 observations
##   predicted class=hamil  expected loss=0  P(node) =0.5833333
##     class counts:     0    35     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 3: 25 observations,    complexity param=0.2727273
##   predicted class=madis  expected loss=0.48  P(node) =0.4166667
##     class counts:     9     3    13
##    probabilities: 0.360 0.120 0.520 
##   left son=6 (12 obs) right son=7 (13 obs)
##   Primary splits:
##       union  < 0.0035 to the left,  improve=5.524615, (0 missing)
##       mani   < 0.0035 to the right, improve=4.806667, (0 missing)
##       appli  < 5e-04  to the right, improve=4.440000, (0 missing)
##       men    < 0.0015 to the right, improve=4.404706, (0 missing)
##       branch < 0.003  to the right, improve=4.133506, (0 missing)
##   Surrogate splits:
##       branch      < 0.003  to the right, agree=0.96, adj=0.917, (0 split)
##       confeder    < 0.0015 to the left,  agree=0.84, adj=0.667, (0 split)
##       confederaci < 5e-04  to the left,  agree=0.84, adj=0.667, (0 split)
##       establish   < 0.003  to the left,  agree=0.84, adj=0.667, (0 split)
##       legisl      < 0.0045 to the right, agree=0.80, adj=0.583, (0 split)
## 
## Node number 6: 12 observations,    complexity param=0.09090909
##   predicted class=dispt  expected loss=0.25  P(node) =0.2
##     class counts:     9     0     3
##    probabilities: 0.750 0.000 0.250 
##   left son=12 (8 obs) right son=13 (4 obs)
##   Primary splits:
##       america < 0.0015 to the right, improve=3, (0 missing)
##       answer  < 0.001  to the right, improve=3, (0 missing)
##       conclus < 5e-04  to the left,  improve=3, (0 missing)
##       mani    < 0.0025 to the right, improve=3, (0 missing)
##       relat   < 0.001  to the right, improve=3, (0 missing)
##   Surrogate splits:
##       exist  < 0.001  to the right, agree=0.917, adj=0.75, (0 split)
##       man    < 0.001  to the right, agree=0.917, adj=0.75, (0 split)
##       seem   < 0.0035 to the left,  agree=0.917, adj=0.75, (0 split)
##       act    < 0.003  to the left,  agree=0.833, adj=0.50, (0 split)
##       answer < 0.001  to the right, agree=0.833, adj=0.50, (0 split)
## 
## Node number 7: 13 observations,    complexity param=0.09090909
##   predicted class=madis  expected loss=0.2307692  P(node) =0.2166667
##     class counts:     0     3    10
##    probabilities: 0.000 0.231 0.769 
##   left son=14 (4 obs) right son=15 (9 obs)
##   Primary splits:
##       appli   < 0.001  to the right, improve=3.115385, (0 missing)
##       capac   < 0.0015 to the right, improve=3.115385, (0 missing)
##       citizen < 0.001  to the left,  improve=3.115385, (0 missing)
##       direct  < 0.0025 to the right, improve=3.115385, (0 missing)
##       forc    < 0.0055 to the right, improve=3.115385, (0 missing)
##   Surrogate splits:
##       continu < 0.0045 to the right, agree=0.923, adj=0.75, (0 split)
##       design  < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
##       elect   < 5e-04  to the left,  agree=0.923, adj=0.75, (0 split)
##       everi   < 0.0035 to the left,  agree=0.923, adj=0.75, (0 split)
##       full    < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
## 
## Node number 12: 8 observations
##   predicted class=dispt  expected loss=0  P(node) =0.1333333
##     class counts:     8     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 13: 4 observations,    complexity param=0.04545455
##   predicted class=madis  expected loss=0.25  P(node) =0.06666667
##     class counts:     1     0     3
##    probabilities: 0.250 0.000 0.750 
##   left son=26 (1 obs) right son=27 (3 obs)
##   Primary splits:
##       affair < 0.007  to the right, improve=1.5, (0 missing)
##       among  < 0.0055 to the right, improve=1.5, (0 missing)
##       amount < 0.0015 to the right, improve=1.5, (0 missing)
##       answer < 0.001  to the right, improve=1.5, (0 missing)
##       appear < 0.002  to the left,  improve=1.5, (0 missing)
## 
## Node number 14: 4 observations,    complexity param=0.04545455
##   predicted class=hamil  expected loss=0.25  P(node) =0.06666667
##     class counts:     0     3     1
##    probabilities: 0.000 0.750 0.250 
##   left son=28 (3 obs) right son=29 (1 obs)
##   Primary splits:
##       administr < 0.0035 to the left,  improve=1.5, (0 missing)
##       adopt     < 0.001  to the left,  improve=1.5, (0 missing)
##       affect    < 0.001  to the left,  improve=1.5, (0 missing)
##       alon      < 0.003  to the left,  improve=1.5, (0 missing)
##       america   < 0.004  to the left,  improve=1.5, (0 missing)
## 
## Node number 15: 9 observations
##   predicted class=madis  expected loss=0  P(node) =0.15
##     class counts:     0     0     9
##    probabilities: 0.000 0.000 1.000 
## 
## Node number 26: 1 observations
##   predicted class=dispt  expected loss=0  P(node) =0.01666667
##     class counts:     1     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 27: 3 observations
##   predicted class=madis  expected loss=0  P(node) =0.05
##     class counts:     0     0     3
##    probabilities: 0.000 0.000 1.000 
## 
## Node number 28: 3 observations
##   predicted class=hamil  expected loss=0  P(node) =0.05
##     class counts:     0     3     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 29: 1 observations
##   predicted class=madis  expected loss=0  P(node) =0.01666667
##     class counts:     0     0     1
##    probabilities: 0.000 0.000 1.000

#predict the test dataset using the model for train tree No. 1
predicted4= predict(train_tree4, test2, type="class")
#plot number of splits
rsq.rpart(train_tree4)

## 
## Classification tree:
## rpart(formula = Author ~ ., data = train2, method = "class", 
##     control = rpart.control(cp = 0, minsplit = 2, maxdepth = 5))
## 
## Variables actually used in tree construction:
## [1] administr affair    america   appli     union     upon     
## 
## Root node error: 22/60 = 0.36667
## 
## n= 60 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.454545      0  1.000000 1.00000 0.16967
## 2 0.272727      1  0.545455 0.63636 0.14892
## 3 0.090909      2  0.272727 0.68182 0.15246
## 4 0.045455      4  0.090909 0.90909 0.16598
## 5 0.000000      6  0.000000 0.86364 0.16378

## Warning in rsq.rpart(train_tree4): may not be applicable for this method

plotcp(train_tree4)

rpart.plot(train_tree4)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted4, true=test2$Author)

##           true
## Authorship dispt hamil madis
##      dispt     0     0     0
##      hamil     0    13     0
##      madis     2     3     2

(Results4<-data.frame(Predicted=predicted4, Actual=test2$Author))

#Train Tree 5
train_tree5 <- rpart(Author ~ ., data = train2, method="class", control=rpart.control(cp=0, minsplit = 5, maxdepth = 7))
summary(train_tree5)

## Call:
## rpart(formula = Author ~ ., data = train2, method = "class", 
##     control = rpart.control(cp = 0, minsplit = 5, maxdepth = 7))
##   n= 60 
## 
##           CP nsplit  rel error    xerror      xstd
## 1 0.45454545      0 1.00000000 1.0000000 0.1696699
## 2 0.27272727      1 0.54545455 0.6363636 0.1489171
## 3 0.09090909      2 0.27272727 0.6363636 0.1489171
## 4 0.00000000      4 0.09090909 0.9090909 0.1659765
## 
## Variable importance
##        upon      matter        kind     assembl       among    maintain 
##          16           8           8           7           6           6 
##       union      branch    confeder confederaci   establish      legisl 
##           5           5           3           3           3           3 
##       appli     america     continu      design       elect       everi 
##           3           3           2           2           2           2 
##        full       exist         man        seem         act      answer 
##           2           2           2           2           1           1 
## 
## Node number 1: 60 observations,    complexity param=0.4545455
##   predicted class=hamil  expected loss=0.3666667  P(node) =1
##     class counts:     9    38    13
##    probabilities: 0.150 0.633 0.217 
##   left son=2 (35 obs) right son=3 (25 obs)
##   Primary splits:
##       upon   < 0.0055 to the right, improve=17.126670, (0 missing)
##       matter < 5e-04  to the right, improve= 8.925732, (0 missing)
##       kind   < 0.0015 to the right, improve= 7.103810, (0 missing)
##       repres < 0.0115 to the right, improve= 7.035897, (0 missing)
##       thing  < 0.0015 to the right, improve= 6.822222, (0 missing)
##   Surrogate splits:
##       matter   < 5e-04  to the right, agree=0.800, adj=0.52, (0 split)
##       kind     < 5e-04  to the right, agree=0.783, adj=0.48, (0 split)
##       assembl  < 0.0025 to the left,  agree=0.767, adj=0.44, (0 split)
##       among    < 0.0035 to the left,  agree=0.750, adj=0.40, (0 split)
##       maintain < 0.0015 to the left,  agree=0.750, adj=0.40, (0 split)
## 
## Node number 2: 35 observations
##   predicted class=hamil  expected loss=0  P(node) =0.5833333
##     class counts:     0    35     0
##    probabilities: 0.000 1.000 0.000 
## 
## Node number 3: 25 observations,    complexity param=0.2727273
##   predicted class=madis  expected loss=0.48  P(node) =0.4166667
##     class counts:     9     3    13
##    probabilities: 0.360 0.120 0.520 
##   left son=6 (12 obs) right son=7 (13 obs)
##   Primary splits:
##       union  < 0.0035 to the left,  improve=5.524615, (0 missing)
##       mani   < 0.0035 to the right, improve=4.806667, (0 missing)
##       appli  < 5e-04  to the right, improve=4.440000, (0 missing)
##       men    < 0.0015 to the right, improve=4.404706, (0 missing)
##       branch < 0.003  to the right, improve=4.133506, (0 missing)
##   Surrogate splits:
##       branch      < 0.003  to the right, agree=0.96, adj=0.917, (0 split)
##       confeder    < 0.0015 to the left,  agree=0.84, adj=0.667, (0 split)
##       confederaci < 5e-04  to the left,  agree=0.84, adj=0.667, (0 split)
##       establish   < 0.003  to the left,  agree=0.84, adj=0.667, (0 split)
##       legisl      < 0.0045 to the right, agree=0.80, adj=0.583, (0 split)
## 
## Node number 6: 12 observations,    complexity param=0.09090909
##   predicted class=dispt  expected loss=0.25  P(node) =0.2
##     class counts:     9     0     3
##    probabilities: 0.750 0.000 0.250 
##   left son=12 (8 obs) right son=13 (4 obs)
##   Primary splits:
##       america < 0.0015 to the right, improve=3, (0 missing)
##       answer  < 0.001  to the right, improve=3, (0 missing)
##       conclus < 5e-04  to the left,  improve=3, (0 missing)
##       mani    < 0.0025 to the right, improve=3, (0 missing)
##       relat   < 0.001  to the right, improve=3, (0 missing)
##   Surrogate splits:
##       exist  < 0.001  to the right, agree=0.917, adj=0.75, (0 split)
##       man    < 0.001  to the right, agree=0.917, adj=0.75, (0 split)
##       seem   < 0.0035 to the left,  agree=0.917, adj=0.75, (0 split)
##       act    < 0.003  to the left,  agree=0.833, adj=0.50, (0 split)
##       answer < 0.001  to the right, agree=0.833, adj=0.50, (0 split)
## 
## Node number 7: 13 observations,    complexity param=0.09090909
##   predicted class=madis  expected loss=0.2307692  P(node) =0.2166667
##     class counts:     0     3    10
##    probabilities: 0.000 0.231 0.769 
##   left son=14 (4 obs) right son=15 (9 obs)
##   Primary splits:
##       appli   < 0.001  to the right, improve=3.115385, (0 missing)
##       capac   < 0.0015 to the right, improve=3.115385, (0 missing)
##       citizen < 0.001  to the left,  improve=3.115385, (0 missing)
##       direct  < 0.0025 to the right, improve=3.115385, (0 missing)
##       forc    < 0.0055 to the right, improve=3.115385, (0 missing)
##   Surrogate splits:
##       continu < 0.0045 to the right, agree=0.923, adj=0.75, (0 split)
##       design  < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
##       elect   < 5e-04  to the left,  agree=0.923, adj=0.75, (0 split)
##       everi   < 0.0035 to the left,  agree=0.923, adj=0.75, (0 split)
##       full    < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
## 
## Node number 12: 8 observations
##   predicted class=dispt  expected loss=0  P(node) =0.1333333
##     class counts:     8     0     0
##    probabilities: 1.000 0.000 0.000 
## 
## Node number 13: 4 observations
##   predicted class=madis  expected loss=0.25  P(node) =0.06666667
##     class counts:     1     0     3
##    probabilities: 0.250 0.000 0.750 
## 
## Node number 14: 4 observations
##   predicted class=hamil  expected loss=0.25  P(node) =0.06666667
##     class counts:     0     3     1
##    probabilities: 0.000 0.750 0.250 
## 
## Node number 15: 9 observations
##   predicted class=madis  expected loss=0  P(node) =0.15
##     class counts:     0     0     9
##    probabilities: 0.000 0.000 1.000

predicted5= predict(train_tree5, test2, type="class")
rsq.rpart(train_tree5)

## 
## Classification tree:
## rpart(formula = Author ~ ., data = train2, method = "class", 
##     control = rpart.control(cp = 0, minsplit = 5, maxdepth = 7))
## 
## Variables actually used in tree construction:
## [1] america appli   union   upon   
## 
## Root node error: 22/60 = 0.36667
## 
## n= 60 
## 
##         CP nsplit rel error  xerror    xstd
## 1 0.454545      0  1.000000 1.00000 0.16967
## 2 0.272727      1  0.545455 0.63636 0.14892
## 3 0.090909      2  0.272727 0.63636 0.14892
## 4 0.000000      4  0.090909 0.90909 0.16598

## Warning in rsq.rpart(train_tree5): may not be applicable for this method

plotcp(train_tree5)

rpart.plot(train_tree5)

table(Authorship=predicted5, true = test2$Author)

##           true
## Authorship dispt hamil madis
##      dispt     0     0     0
##      hamil     0    15     0
##      madis     2     1     2

Katie_Hanks_HW5

Katie Hanks

11/5/2019