#Load the Libraries
library(wordcloud)
## Loading required package: RColorBrewer
library(tm)
## Loading required package: NLP
library(slam)
library(quanteda)
## Package version: 1.5.1
## Parallel computing: 2 of 4 threads used.
## See https://quanteda.io for tutorials and examples.
##
## Attaching package: 'quanteda'
## The following objects are masked from 'package:tm':
##
## as.DocumentTermMatrix, stopwords
## The following object is masked from 'package:utils':
##
## View
library(SnowballC)
library(arules)
## Loading required package: Matrix
##
## Attaching package: 'arules'
## The following object is masked from 'package:quanteda':
##
## affinity
## The following object is masked from 'package:tm':
##
## inspect
## The following objects are masked from 'package:base':
##
## abbreviate, write
library(proxy)
##
## Attaching package: 'proxy'
## The following object is masked from 'package:Matrix':
##
## as.matrix
## The following object is masked from 'package:quanteda':
##
## as.matrix
## The following objects are masked from 'package:stats':
##
## as.dist, dist
## The following object is masked from 'package:base':
##
## as.matrix
library(cluster)
library(stringi)
library(Matrix)
library(tidytext)
library(plyr)
library(ggplot2)
##
## Attaching package: 'ggplot2'
## The following object is masked from 'package:NLP':
##
## annotate
library(factoextra)
## Welcome! Related Books: `Practical Guide To Cluster Analysis in R` at https://goo.gl/13EFCZ
library(mclust)
## Package 'mclust' version 5.4.5
## Type 'citation("mclust")' for citing this R package in publications.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:plyr':
##
## arrange, count, desc, failwith, id, mutate, rename, summarise,
## summarize
## The following objects are masked from 'package:arules':
##
## intersect, recode, setdiff, setequal, union
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(corpus)
library(rpart)
library(rpart.plot)
library(rattle)
## Rattle: A free graphical interface for data science with R.
## Version 5.2.0 Copyright (c) 2006-2018 Togaware Pty Ltd.
## Type 'rattle()' to shake, rattle, and roll your data.
#Load the Data
setwd("C:/Users/katie/Desktop")
FedPapersCorpus <- Corpus(DirSource("FedPapersCorpus"))
(numberFedPapers<-length(FedPapersCorpus))
## [1] 85
summary(FedPapersCorpus)
## Length Class Mode
## dispt_fed_49.txt 2 PlainTextDocument list
## dispt_fed_50.txt 2 PlainTextDocument list
## dispt_fed_51.txt 2 PlainTextDocument list
## dispt_fed_52.txt 2 PlainTextDocument list
## dispt_fed_53.txt 2 PlainTextDocument list
## dispt_fed_54.txt 2 PlainTextDocument list
## dispt_fed_55.txt 2 PlainTextDocument list
## dispt_fed_56.txt 2 PlainTextDocument list
## dispt_fed_57.txt 2 PlainTextDocument list
## dispt_fed_62.txt 2 PlainTextDocument list
## dispt_fed_63.txt 2 PlainTextDocument list
## Hamilton_fed_1.txt 2 PlainTextDocument list
## Hamilton_fed_11.txt 2 PlainTextDocument list
## Hamilton_fed_12.txt 2 PlainTextDocument list
## Hamilton_fed_13.txt 2 PlainTextDocument list
## Hamilton_fed_15.txt 2 PlainTextDocument list
## Hamilton_fed_16.txt 2 PlainTextDocument list
## Hamilton_fed_17.txt 2 PlainTextDocument list
## Hamilton_fed_21.txt 2 PlainTextDocument list
## Hamilton_fed_22.txt 2 PlainTextDocument list
## Hamilton_fed_23.txt 2 PlainTextDocument list
## Hamilton_fed_24.txt 2 PlainTextDocument list
## Hamilton_fed_25.txt 2 PlainTextDocument list
## Hamilton_fed_26.txt 2 PlainTextDocument list
## Hamilton_fed_27.txt 2 PlainTextDocument list
## Hamilton_fed_28.txt 2 PlainTextDocument list
## Hamilton_fed_29.txt 2 PlainTextDocument list
## Hamilton_fed_30.txt 2 PlainTextDocument list
## Hamilton_fed_31.txt 2 PlainTextDocument list
## Hamilton_fed_32.txt 2 PlainTextDocument list
## Hamilton_fed_33.txt 2 PlainTextDocument list
## Hamilton_fed_34.txt 2 PlainTextDocument list
## Hamilton_fed_35.txt 2 PlainTextDocument list
## Hamilton_fed_36.txt 2 PlainTextDocument list
## Hamilton_fed_59.txt 2 PlainTextDocument list
## Hamilton_fed_6.txt 2 PlainTextDocument list
## Hamilton_fed_60.txt 2 PlainTextDocument list
## Hamilton_fed_61.txt 2 PlainTextDocument list
## Hamilton_fed_65.txt 2 PlainTextDocument list
## Hamilton_fed_66.txt 2 PlainTextDocument list
## Hamilton_fed_67.txt 2 PlainTextDocument list
## Hamilton_fed_68.txt 2 PlainTextDocument list
## Hamilton_fed_69.txt 2 PlainTextDocument list
## Hamilton_fed_7.txt 2 PlainTextDocument list
## Hamilton_fed_70.txt 2 PlainTextDocument list
## Hamilton_fed_71.txt 2 PlainTextDocument list
## Hamilton_fed_72.txt 2 PlainTextDocument list
## Hamilton_fed_73.txt 2 PlainTextDocument list
## Hamilton_fed_74.txt 2 PlainTextDocument list
## Hamilton_fed_75.txt 2 PlainTextDocument list
## Hamilton_fed_76.txt 2 PlainTextDocument list
## Hamilton_fed_77.txt 2 PlainTextDocument list
## Hamilton_fed_78.txt 2 PlainTextDocument list
## Hamilton_fed_79.txt 2 PlainTextDocument list
## Hamilton_fed_8.txt 2 PlainTextDocument list
## Hamilton_fed_80.txt 2 PlainTextDocument list
## Hamilton_fed_81.txt 2 PlainTextDocument list
## Hamilton_fed_82.txt 2 PlainTextDocument list
## Hamilton_fed_83.txt 2 PlainTextDocument list
## Hamilton_fed_84.txt 2 PlainTextDocument list
## Hamilton_fed_85.txt 2 PlainTextDocument list
## Hamilton_fed_9.txt 2 PlainTextDocument list
## HM_fed_18.txt 2 PlainTextDocument list
## HM_fed_19.txt 2 PlainTextDocument list
## HM_fed_20.txt 2 PlainTextDocument list
## Jay_fed_2.txt 2 PlainTextDocument list
## Jay_fed_3.txt 2 PlainTextDocument list
## Jay_fed_4.txt 2 PlainTextDocument list
## Jay_fed_5.txt 2 PlainTextDocument list
## Jay_fed_64.txt 2 PlainTextDocument list
## Madison_fed_10.txt 2 PlainTextDocument list
## Madison_fed_14.txt 2 PlainTextDocument list
## Madison_fed_37.txt 2 PlainTextDocument list
## Madison_fed_38.txt 2 PlainTextDocument list
## Madison_fed_39.txt 2 PlainTextDocument list
## Madison_fed_40.txt 2 PlainTextDocument list
## Madison_fed_41.txt 2 PlainTextDocument list
## Madison_fed_42.txt 2 PlainTextDocument list
## Madison_fed_43.txt 2 PlainTextDocument list
## Madison_fed_44.txt 2 PlainTextDocument list
## Madison_fed_45.txt 2 PlainTextDocument list
## Madison_fed_46.txt 2 PlainTextDocument list
## Madison_fed_47.txt 2 PlainTextDocument list
## Madison_fed_48.txt 2 PlainTextDocument list
## Madison_fed_58.txt 2 PlainTextDocument list
#Create the DTM
(getTransformations())
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
(nFedPapersCorpus<-length(FedPapersCorpus))
## [1] 85
(minTermFreq <-30)
## [1] 30
(maxTermFreq <-1000)
## [1] 1000
(MyStopwords <- c("will","one","two", "may","less","publius","Madison","Alexand", "Alexander", "James", "Hamilton", "hamilton", "Jay", "well","might","without","small", "single", "several", "but", "very", "can", "must", "also", "any", "and", "are", "however", "into", "almost", "can","for", "add", "Author", "author", "alexand", "alexander", "jame", "james" ))
## [1] "will" "one" "two" "may" "less"
## [6] "publius" "Madison" "Alexand" "Alexander" "James"
## [11] "Hamilton" "hamilton" "Jay" "well" "might"
## [16] "without" "small" "single" "several" "but"
## [21] "very" "can" "must" "also" "any"
## [26] "and" "are" "however" "into" "almost"
## [31] "can" "for" "add" "Author" "author"
## [36] "alexand" "alexander" "jame" "james"
(STOPS <-stopwords('english'))
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very" "will"
Papers_DTM <- DocumentTermMatrix(FedPapersCorpus,
control = list(
stopwords = TRUE,
wordLengths=c(3, 15),
removePunctuation = T,
removeNumbers = T,
tolower=T,
stemming = T,
remove_separators = T,
stopwords = MyStopwords,
removeWords=STOPS,
removeWords=MyStopwords,
bounds = list(global = c(minTermFreq, maxTermFreq))
))
DTM <- as.matrix(Papers_DTM)
#Vectorizing
WordFreq <- colSums(as.matrix(Papers_DTM))
(head(WordFreq))
## abl absolut accord act addit administr
## 74 63 71 139 61 90
(length(WordFreq))
## [1] 427
ord <- order(WordFreq)
(WordFreq[head(ord)])
## jame expos furnish word unless bound
## 30 34 36 36 37 38
(WordFreq[tail(ord)])
## constitut may power govern will state
## 686 811 937 1040 1263 1662
(Row_Sum_Per_doc <- rowSums((as.matrix(Papers_DTM))))
## dispt_fed_49.txt dispt_fed_50.txt dispt_fed_51.txt
## 514 338 658
## dispt_fed_52.txt dispt_fed_53.txt dispt_fed_54.txt
## 565 701 582
## dispt_fed_55.txt dispt_fed_56.txt dispt_fed_57.txt
## 647 553 613
## dispt_fed_62.txt dispt_fed_63.txt Hamilton_fed_1.txt
## 698 955 483
## Hamilton_fed_11.txt Hamilton_fed_12.txt Hamilton_fed_13.txt
## 564 539 318
## Hamilton_fed_15.txt Hamilton_fed_16.txt Hamilton_fed_17.txt
## 815 558 477
## Hamilton_fed_21.txt Hamilton_fed_22.txt Hamilton_fed_23.txt
## 537 985 560
## Hamilton_fed_24.txt Hamilton_fed_25.txt Hamilton_fed_26.txt
## 519 570 670
## Hamilton_fed_27.txt Hamilton_fed_28.txt Hamilton_fed_29.txt
## 466 507 541
## Hamilton_fed_30.txt Hamilton_fed_31.txt Hamilton_fed_32.txt
## 585 510 442
## Hamilton_fed_33.txt Hamilton_fed_34.txt Hamilton_fed_35.txt
## 522 618 663
## Hamilton_fed_36.txt Hamilton_fed_59.txt Hamilton_fed_6.txt
## 824 603 461
## Hamilton_fed_60.txt Hamilton_fed_61.txt Hamilton_fed_65.txt
## 657 444 560
## Hamilton_fed_66.txt Hamilton_fed_67.txt Hamilton_fed_68.txt
## 646 443 449
## Hamilton_fed_69.txt Hamilton_fed_7.txt Hamilton_fed_70.txt
## 811 580 852
## Hamilton_fed_71.txt Hamilton_fed_72.txt Hamilton_fed_73.txt
## 473 539 696
## Hamilton_fed_74.txt Hamilton_fed_75.txt Hamilton_fed_76.txt
## 282 597 594
## Hamilton_fed_77.txt Hamilton_fed_78.txt Hamilton_fed_79.txt
## 586 891 301
## Hamilton_fed_8.txt Hamilton_fed_80.txt Hamilton_fed_81.txt
## 533 771 1188
## Hamilton_fed_82.txt Hamilton_fed_83.txt Hamilton_fed_84.txt
## 504 1598 1255
## Hamilton_fed_85.txt Hamilton_fed_9.txt HM_fed_18.txt
## 773 520 443
## HM_fed_19.txt HM_fed_20.txt Jay_fed_2.txt
## 466 395 477
## Jay_fed_3.txt Jay_fed_4.txt Jay_fed_5.txt
## 515 463 401
## Jay_fed_64.txt Madison_fed_10.txt Madison_fed_14.txt
## 692 884 553
## Madison_fed_37.txt Madison_fed_38.txt Madison_fed_39.txt
## 723 874 859
## Madison_fed_40.txt Madison_fed_41.txt Madison_fed_42.txt
## 857 1020 800
## Madison_fed_43.txt Madison_fed_44.txt Madison_fed_45.txt
## 993 927 724
## Madison_fed_46.txt Madison_fed_47.txt Madison_fed_48.txt
## 832 925 565
## Madison_fed_58.txt
## 655
Papers_M <- as.matrix(Papers_DTM)
Papers_M_N1 <- apply(Papers_M, 1, function(i) round(i/sum(i),3))
Papers_Matrix_Norm <- t(Papers_M_N1)
Papers_dtm_matrix = as.matrix(Papers_DTM)
Papers_DF <- as.data.frame(as.matrix(Papers_Matrix_Norm))
#remove Jays papers
Papers_DF<-Papers_DF[-66:-70,]
Papers_DF1<- Papers_DF%>%add_rownames()
## Warning: Deprecated, use tibble::rownames_to_column() instead.
names(Papers_DF1)[1]<-"Author"
Papers_DF1[1:11,1]="dispt"
Papers_DF1[12:65,1]="hamil"
Papers_DF1[66:80,1]="madis"
head(Papers_DF1)
##Word Clouds
DisputedPapersWC<- wordcloud(colnames(Papers_dtm_matrix), Papers_dtm_matrix[11,], rot.per = .35, colors = brewer.pal(5, "Set1"))

(head(sort(as.matrix(Papers_dtm_matrix)[11,], decreasing = TRUE), n=50))
## peopl senat will may repres govern
## 42 24 19 18 18 16
## bodi can elect must measur state
## 15 14 14 12 11 11
## nation one constitut former power reason
## 9 9 8 8 8 8
## year assembl exampl two danger everi
## 8 7 7 7 6 6
## evid feder import latter object particular
## 6 6 6 6 6 6
## public advantag answer appear author charact
## 6 5 5 5 5 5
## fact first hous institut less mani
## 5 5 5 5 5 5
## member might oper order part popular
## 5 5 5 5 5 5
## probabl small
## 5 5
HamiltonPapersWC <-wordcloud(colnames(Papers_dtm_matrix),Papers_dtm_matrix[50:53,], rot.per = .35, colors = brewer.pal(5, "Set1"))

(head(sort(as.matrix(Papers_dtm_matrix)[11,], decreasing = TRUE), n=50))
## peopl senat will may repres govern
## 42 24 19 18 18 16
## bodi can elect must measur state
## 15 14 14 12 11 11
## nation one constitut former power reason
## 9 9 8 8 8 8
## year assembl exampl two danger everi
## 8 7 7 7 6 6
## evid feder import latter object particular
## 6 6 6 6 6 6
## public advantag answer appear author charact
## 6 5 5 5 5 5
## fact first hous institut less mani
## 5 5 5 5 5 5
## member might oper order part popular
## 5 5 5 5 5 5
## probabl small
## 5 5
MadisonPapersHW <-wordcloud(colnames(Papers_dtm_matrix), Papers_dtm_matrix[63:66,], rot.per = .35, colors = brewer.pal(5, "Set1"))

(head(sort(as.matrix(Papers_dtm_matrix)[11,], decreasing = TRUE), n=50))
## peopl senat will may repres govern
## 42 24 19 18 18 16
## bodi can elect must measur state
## 15 14 14 12 11 11
## nation one constitut former power reason
## 9 9 8 8 8 8
## year assembl exampl two danger everi
## 8 7 7 7 6 6
## evid feder import latter object particular
## 6 6 6 6 6 6
## public advantag answer appear author charact
## 6 5 5 5 5 5
## fact first hous institut less mani
## 5 5 5 5 5 5
## member might oper order part popular
## 5 5 5 5 5 5
## probabl small
## 5 5
##Make Train and Test sets
trainRatio <- .60
set.seed(11) # Set Seed so that same sample can be reproduced in future also
sample <- sample.int(n = nrow(Papers_DF1), size = floor(trainRatio*nrow(Papers_DF1)), replace = FALSE)
train <- Papers_DF1[sample, ]
test <- Papers_DF1[-sample, ]
# train / test ratio
length(sample)/nrow(Papers_DF1)
## [1] 0.6
##Decision Tree Models
#Train Tree Model 1
train_tree1 <- rpart(Author ~ ., data = train, method="class", control=rpart.control(cp=0))
summary(train_tree1)
## Call:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0))
## n= 48
##
## CP nsplit rel error xerror xstd
## 1 0.6190476 0 1.0000000 1.0000000 0.1636634
## 2 0.3809524 1 0.3809524 0.3809524 0.1229519
## 3 0.0000000 2 0.0000000 0.3333333 0.1164397
##
## Variable importance
## alexand hamilton jame upon matter kind appli elect
## 17 17 15 14 10 8 5 5
## mani absolut
## 5 4
##
## Node number 1: 48 observations, complexity param=0.6190476
## predicted class=hamil expected loss=0.4375 P(node) =1
## class counts: 8 27 13
## probabilities: 0.167 0.562 0.271
## left son=2 (27 obs) right son=3 (21 obs)
## Primary splits:
## jame < 5e-04 to the left, improve=18.053570, (0 missing)
## upon < 0.0055 to the right, improve=16.594700, (0 missing)
## alexand < 5e-04 to the right, improve=15.615480, (0 missing)
## hamilton < 5e-04 to the right, improve=15.615480, (0 missing)
## matter < 5e-04 to the right, improve= 9.291667, (0 missing)
## Surrogate splits:
## upon < 0.004 to the right, agree=0.979, adj=0.952, (0 split)
## matter < 5e-04 to the right, agree=0.854, adj=0.667, (0 split)
## alexand < 5e-04 to the right, agree=0.833, adj=0.619, (0 split)
## hamilton < 5e-04 to the right, agree=0.833, adj=0.619, (0 split)
## kind < 5e-04 to the right, agree=0.812, adj=0.571, (0 split)
##
## Node number 2: 27 observations
## predicted class=hamil expected loss=0 P(node) =0.5625
## class counts: 0 27 0
## probabilities: 0.000 1.000 0.000
##
## Node number 3: 21 observations, complexity param=0.3809524
## predicted class=madis expected loss=0.3809524 P(node) =0.4375
## class counts: 8 0 13
## probabilities: 0.381 0.000 0.619
## left son=6 (8 obs) right son=7 (13 obs)
## Primary splits:
## alexand < 5e-04 to the right, improve=9.904762, (0 missing)
## hamilton < 5e-04 to the right, improve=9.904762, (0 missing)
## mani < 0.0025 to the right, improve=4.960317, (0 missing)
## appli < 5e-04 to the right, improve=4.761905, (0 missing)
## branch < 0.0015 to the right, improve=4.571429, (0 missing)
## Surrogate splits:
## hamilton < 5e-04 to the right, agree=1.000, adj=1.000, (0 split)
## appli < 5e-04 to the right, agree=0.857, adj=0.625, (0 split)
## elect < 0.0045 to the right, agree=0.857, adj=0.625, (0 split)
## mani < 0.0025 to the right, agree=0.857, adj=0.625, (0 split)
## absolut < 5e-04 to the left, agree=0.810, adj=0.500, (0 split)
##
## Node number 6: 8 observations
## predicted class=dispt expected loss=0 P(node) =0.1666667
## class counts: 8 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 7: 13 observations
## predicted class=madis expected loss=0 P(node) =0.2708333
## class counts: 0 0 13
## probabilities: 0.000 0.000 1.000
#predict the test dataset using the model for train tree No. 1
predicted1= predict(train_tree1, test, type="class")
#plot number of splits
rsq.rpart(train_tree1)
##
## Classification tree:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0))
##
## Variables actually used in tree construction:
## [1] alexand jame
##
## Root node error: 21/48 = 0.4375
##
## n= 48
##
## CP nsplit rel error xerror xstd
## 1 0.61905 0 1.00000 1.00000 0.16366
## 2 0.38095 1 0.38095 0.38095 0.12295
## 3 0.00000 2 0.00000 0.33333 0.11644
## Warning in rsq.rpart(train_tree1): may not be applicable for this method


#plot the decision tree
fancyRpartPlot(train_tree1)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted1, true=test$Author)
## true
## Authorship dispt hamil madis
## dispt 3 4 0
## hamil 0 23 0
## madis 0 0 2
#Train Tree Model 2
train_tree2 <- rpart(Author ~ ., data = train, method="class", control=rpart.control(cp=0, minsplit = 2, maxdepth = 5))
summary(train_tree2)
## Call:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0,
## minsplit = 2, maxdepth = 5))
## n= 48
##
## CP nsplit rel error xerror xstd
## 1 0.6190476 0 1.0000000 1.0000000 0.1636634
## 2 0.3809524 1 0.3809524 0.3809524 0.1229519
## 3 0.0000000 2 0.0000000 0.0000000 0.0000000
##
## Variable importance
## alexand hamilton jame upon matter kind appli elect
## 17 17 15 14 10 8 5 5
## mani absolut
## 5 4
##
## Node number 1: 48 observations, complexity param=0.6190476
## predicted class=hamil expected loss=0.4375 P(node) =1
## class counts: 8 27 13
## probabilities: 0.167 0.562 0.271
## left son=2 (27 obs) right son=3 (21 obs)
## Primary splits:
## jame < 5e-04 to the left, improve=18.053570, (0 missing)
## upon < 0.0055 to the right, improve=16.594700, (0 missing)
## alexand < 5e-04 to the right, improve=15.615480, (0 missing)
## hamilton < 5e-04 to the right, improve=15.615480, (0 missing)
## matter < 5e-04 to the right, improve= 9.291667, (0 missing)
## Surrogate splits:
## upon < 0.004 to the right, agree=0.979, adj=0.952, (0 split)
## matter < 5e-04 to the right, agree=0.854, adj=0.667, (0 split)
## alexand < 5e-04 to the right, agree=0.833, adj=0.619, (0 split)
## hamilton < 5e-04 to the right, agree=0.833, adj=0.619, (0 split)
## kind < 5e-04 to the right, agree=0.812, adj=0.571, (0 split)
##
## Node number 2: 27 observations
## predicted class=hamil expected loss=0 P(node) =0.5625
## class counts: 0 27 0
## probabilities: 0.000 1.000 0.000
##
## Node number 3: 21 observations, complexity param=0.3809524
## predicted class=madis expected loss=0.3809524 P(node) =0.4375
## class counts: 8 0 13
## probabilities: 0.381 0.000 0.619
## left son=6 (8 obs) right son=7 (13 obs)
## Primary splits:
## alexand < 5e-04 to the right, improve=9.904762, (0 missing)
## hamilton < 5e-04 to the right, improve=9.904762, (0 missing)
## elect < 0.0045 to the right, improve=5.029762, (0 missing)
## mani < 0.0025 to the right, improve=4.960317, (0 missing)
## appli < 5e-04 to the right, improve=4.761905, (0 missing)
## Surrogate splits:
## hamilton < 5e-04 to the right, agree=1.000, adj=1.000, (0 split)
## appli < 5e-04 to the right, agree=0.857, adj=0.625, (0 split)
## elect < 0.0045 to the right, agree=0.857, adj=0.625, (0 split)
## mani < 0.0025 to the right, agree=0.857, adj=0.625, (0 split)
## absolut < 5e-04 to the left, agree=0.810, adj=0.500, (0 split)
##
## Node number 6: 8 observations
## predicted class=dispt expected loss=0 P(node) =0.1666667
## class counts: 8 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 7: 13 observations
## predicted class=madis expected loss=0 P(node) =0.2708333
## class counts: 0 0 13
## probabilities: 0.000 0.000 1.000
#predict the test dataset using the model for train tree No. 1
predicted2= predict(train_tree2, test, type="class")
#plot number of splits
rsq.rpart(train_tree2)
##
## Classification tree:
## rpart(formula = Author ~ ., data = train, method = "class", control = rpart.control(cp = 0,
## minsplit = 2, maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] alexand jame
##
## Root node error: 21/48 = 0.4375
##
## n= 48
##
## CP nsplit rel error xerror xstd
## 1 0.61905 0 1.00000 1.00000 0.16366
## 2 0.38095 1 0.38095 0.38095 0.12295
## 3 0.00000 2 0.00000 0.00000 0.00000
## Warning in rsq.rpart(train_tree2): may not be applicable for this method


plotcp(train_tree2)

#plot the decision tree
fancyRpartPlot(train_tree2)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted2, true=test$Author)
## true
## Authorship dispt hamil madis
## dispt 3 4 0
## hamil 0 23 0
## madis 0 0 2
#redo the DT with words taken out
FedPapersCorpus2 <- Corpus(DirSource("FedPapersCorpus"))
(numberFedPapers<-length(FedPapersCorpus2))
## [1] 85
summary(FedPapersCorpus2)
## Length Class Mode
## dispt_fed_49.txt 2 PlainTextDocument list
## dispt_fed_50.txt 2 PlainTextDocument list
## dispt_fed_51.txt 2 PlainTextDocument list
## dispt_fed_52.txt 2 PlainTextDocument list
## dispt_fed_53.txt 2 PlainTextDocument list
## dispt_fed_54.txt 2 PlainTextDocument list
## dispt_fed_55.txt 2 PlainTextDocument list
## dispt_fed_56.txt 2 PlainTextDocument list
## dispt_fed_57.txt 2 PlainTextDocument list
## dispt_fed_62.txt 2 PlainTextDocument list
## dispt_fed_63.txt 2 PlainTextDocument list
## Hamilton_fed_1.txt 2 PlainTextDocument list
## Hamilton_fed_11.txt 2 PlainTextDocument list
## Hamilton_fed_12.txt 2 PlainTextDocument list
## Hamilton_fed_13.txt 2 PlainTextDocument list
## Hamilton_fed_15.txt 2 PlainTextDocument list
## Hamilton_fed_16.txt 2 PlainTextDocument list
## Hamilton_fed_17.txt 2 PlainTextDocument list
## Hamilton_fed_21.txt 2 PlainTextDocument list
## Hamilton_fed_22.txt 2 PlainTextDocument list
## Hamilton_fed_23.txt 2 PlainTextDocument list
## Hamilton_fed_24.txt 2 PlainTextDocument list
## Hamilton_fed_25.txt 2 PlainTextDocument list
## Hamilton_fed_26.txt 2 PlainTextDocument list
## Hamilton_fed_27.txt 2 PlainTextDocument list
## Hamilton_fed_28.txt 2 PlainTextDocument list
## Hamilton_fed_29.txt 2 PlainTextDocument list
## Hamilton_fed_30.txt 2 PlainTextDocument list
## Hamilton_fed_31.txt 2 PlainTextDocument list
## Hamilton_fed_32.txt 2 PlainTextDocument list
## Hamilton_fed_33.txt 2 PlainTextDocument list
## Hamilton_fed_34.txt 2 PlainTextDocument list
## Hamilton_fed_35.txt 2 PlainTextDocument list
## Hamilton_fed_36.txt 2 PlainTextDocument list
## Hamilton_fed_59.txt 2 PlainTextDocument list
## Hamilton_fed_6.txt 2 PlainTextDocument list
## Hamilton_fed_60.txt 2 PlainTextDocument list
## Hamilton_fed_61.txt 2 PlainTextDocument list
## Hamilton_fed_65.txt 2 PlainTextDocument list
## Hamilton_fed_66.txt 2 PlainTextDocument list
## Hamilton_fed_67.txt 2 PlainTextDocument list
## Hamilton_fed_68.txt 2 PlainTextDocument list
## Hamilton_fed_69.txt 2 PlainTextDocument list
## Hamilton_fed_7.txt 2 PlainTextDocument list
## Hamilton_fed_70.txt 2 PlainTextDocument list
## Hamilton_fed_71.txt 2 PlainTextDocument list
## Hamilton_fed_72.txt 2 PlainTextDocument list
## Hamilton_fed_73.txt 2 PlainTextDocument list
## Hamilton_fed_74.txt 2 PlainTextDocument list
## Hamilton_fed_75.txt 2 PlainTextDocument list
## Hamilton_fed_76.txt 2 PlainTextDocument list
## Hamilton_fed_77.txt 2 PlainTextDocument list
## Hamilton_fed_78.txt 2 PlainTextDocument list
## Hamilton_fed_79.txt 2 PlainTextDocument list
## Hamilton_fed_8.txt 2 PlainTextDocument list
## Hamilton_fed_80.txt 2 PlainTextDocument list
## Hamilton_fed_81.txt 2 PlainTextDocument list
## Hamilton_fed_82.txt 2 PlainTextDocument list
## Hamilton_fed_83.txt 2 PlainTextDocument list
## Hamilton_fed_84.txt 2 PlainTextDocument list
## Hamilton_fed_85.txt 2 PlainTextDocument list
## Hamilton_fed_9.txt 2 PlainTextDocument list
## HM_fed_18.txt 2 PlainTextDocument list
## HM_fed_19.txt 2 PlainTextDocument list
## HM_fed_20.txt 2 PlainTextDocument list
## Jay_fed_2.txt 2 PlainTextDocument list
## Jay_fed_3.txt 2 PlainTextDocument list
## Jay_fed_4.txt 2 PlainTextDocument list
## Jay_fed_5.txt 2 PlainTextDocument list
## Jay_fed_64.txt 2 PlainTextDocument list
## Madison_fed_10.txt 2 PlainTextDocument list
## Madison_fed_14.txt 2 PlainTextDocument list
## Madison_fed_37.txt 2 PlainTextDocument list
## Madison_fed_38.txt 2 PlainTextDocument list
## Madison_fed_39.txt 2 PlainTextDocument list
## Madison_fed_40.txt 2 PlainTextDocument list
## Madison_fed_41.txt 2 PlainTextDocument list
## Madison_fed_42.txt 2 PlainTextDocument list
## Madison_fed_43.txt 2 PlainTextDocument list
## Madison_fed_44.txt 2 PlainTextDocument list
## Madison_fed_45.txt 2 PlainTextDocument list
## Madison_fed_46.txt 2 PlainTextDocument list
## Madison_fed_47.txt 2 PlainTextDocument list
## Madison_fed_48.txt 2 PlainTextDocument list
## Madison_fed_58.txt 2 PlainTextDocument list
(getTransformations())
## [1] "removeNumbers" "removePunctuation" "removeWords"
## [4] "stemDocument" "stripWhitespace"
(nFedPapersCorpus2<-length(FedPapersCorpus2))
## [1] 85
(minTermFreq <-30)
## [1] 30
(maxTermFreq <-1000)
## [1] 1000
(MyStopwords2 <- c("will","one","two", "may","less","publius","Madison","Alexand", "alexand", "james", "madison", "jay", "hamilton", "jame", "author", "Alexander", "James", "Hamilton","Jay", "well","might","without","small", "single", "several", "but", "very", "can", "must", "also", "any", "and", "are", "however", "into", "almost", "can","for", "add", "Author", "alexander", "people", "peoples" , "author", "authors", "member", "latter", "members", "alexand", "james" ))
## [1] "will" "one" "two" "may" "less"
## [6] "publius" "Madison" "Alexand" "alexand" "james"
## [11] "madison" "jay" "hamilton" "jame" "author"
## [16] "Alexander" "James" "Hamilton" "Jay" "well"
## [21] "might" "without" "small" "single" "several"
## [26] "but" "very" "can" "must" "also"
## [31] "any" "and" "are" "however" "into"
## [36] "almost" "can" "for" "add" "Author"
## [41] "alexander" "people" "peoples" "author" "authors"
## [46] "member" "latter" "members" "alexand" "james"
(STOPS <-stopwords('english'))
## [1] "i" "me" "my" "myself" "we"
## [6] "our" "ours" "ourselves" "you" "your"
## [11] "yours" "yourself" "yourselves" "he" "him"
## [16] "his" "himself" "she" "her" "hers"
## [21] "herself" "it" "its" "itself" "they"
## [26] "them" "their" "theirs" "themselves" "what"
## [31] "which" "who" "whom" "this" "that"
## [36] "these" "those" "am" "is" "are"
## [41] "was" "were" "be" "been" "being"
## [46] "have" "has" "had" "having" "do"
## [51] "does" "did" "doing" "would" "should"
## [56] "could" "ought" "i'm" "you're" "he's"
## [61] "she's" "it's" "we're" "they're" "i've"
## [66] "you've" "we've" "they've" "i'd" "you'd"
## [71] "he'd" "she'd" "we'd" "they'd" "i'll"
## [76] "you'll" "he'll" "she'll" "we'll" "they'll"
## [81] "isn't" "aren't" "wasn't" "weren't" "hasn't"
## [86] "haven't" "hadn't" "doesn't" "don't" "didn't"
## [91] "won't" "wouldn't" "shan't" "shouldn't" "can't"
## [96] "cannot" "couldn't" "mustn't" "let's" "that's"
## [101] "who's" "what's" "here's" "there's" "when's"
## [106] "where's" "why's" "how's" "a" "an"
## [111] "the" "and" "but" "if" "or"
## [116] "because" "as" "until" "while" "of"
## [121] "at" "by" "for" "with" "about"
## [126] "against" "between" "into" "through" "during"
## [131] "before" "after" "above" "below" "to"
## [136] "from" "up" "down" "in" "out"
## [141] "on" "off" "over" "under" "again"
## [146] "further" "then" "once" "here" "there"
## [151] "when" "where" "why" "how" "all"
## [156] "any" "both" "each" "few" "more"
## [161] "most" "other" "some" "such" "no"
## [166] "nor" "not" "only" "own" "same"
## [171] "so" "than" "too" "very" "will"
FedPapersCorpus2<- tm_map(FedPapersCorpus2, tolower)
FedPapersCorpus2<- tm_map(FedPapersCorpus2, removeWords, MyStopwords)
FedPapersCorpus2<- tm_map(FedPapersCorpus2, removeWords, c("author", "latter", "members", "constitution", "communiti", "communities", "long", "act", "alexander", "alexand", "james", "jame", "madison", "hamil", "hamilton"))
Papers_DTM2 <- DocumentTermMatrix(FedPapersCorpus2,
control = list(
stopwords = TRUE,
wordLengths=c(3, 15),
removePunctuation = T,
removeNumbers = T,
tolower=T,
stemming = T,
remove_separators = T,
stopwords = MyStopwords2,
removeWords=STOPS,
bounds = list(global = c(minTermFreq, maxTermFreq))
))
DTM2 <- as.matrix(Papers_DTM2)
(DTM[12:65,1])
## Hamilton_fed_1.txt Hamilton_fed_11.txt Hamilton_fed_12.txt
## 1 4 2
## Hamilton_fed_13.txt Hamilton_fed_15.txt Hamilton_fed_16.txt
## 1 0 2
## Hamilton_fed_17.txt Hamilton_fed_21.txt Hamilton_fed_22.txt
## 2 0 3
## Hamilton_fed_23.txt Hamilton_fed_24.txt Hamilton_fed_25.txt
## 0 1 1
## Hamilton_fed_26.txt Hamilton_fed_27.txt Hamilton_fed_28.txt
## 1 2 2
## Hamilton_fed_29.txt Hamilton_fed_30.txt Hamilton_fed_31.txt
## 0 2 1
## Hamilton_fed_32.txt Hamilton_fed_33.txt Hamilton_fed_34.txt
## 0 0 1
## Hamilton_fed_35.txt Hamilton_fed_36.txt Hamilton_fed_59.txt
## 1 1 0
## Hamilton_fed_6.txt Hamilton_fed_60.txt Hamilton_fed_61.txt
## 0 0 0
## Hamilton_fed_65.txt Hamilton_fed_66.txt Hamilton_fed_67.txt
## 0 0 1
## Hamilton_fed_68.txt Hamilton_fed_69.txt Hamilton_fed_7.txt
## 1 0 2
## Hamilton_fed_70.txt Hamilton_fed_71.txt Hamilton_fed_72.txt
## 1 2 0
## Hamilton_fed_73.txt Hamilton_fed_74.txt Hamilton_fed_75.txt
## 0 0 1
## Hamilton_fed_76.txt Hamilton_fed_77.txt Hamilton_fed_78.txt
## 0 0 1
## Hamilton_fed_79.txt Hamilton_fed_8.txt Hamilton_fed_80.txt
## 0 2 0
## Hamilton_fed_81.txt Hamilton_fed_82.txt Hamilton_fed_83.txt
## 0 0 0
## Hamilton_fed_84.txt Hamilton_fed_85.txt Hamilton_fed_9.txt
## 0 1 3
## HM_fed_18.txt HM_fed_19.txt HM_fed_20.txt
## 0 0 0
#Vectorizing
WordFreq2 <- colSums(as.matrix(Papers_DTM2))
(head(WordFreq2))
## abl absolut accord act addit administr
## 74 63 71 58 61 90
(length(WordFreq2))
## [1] 406
ord2 <- order(WordFreq2)
(WordFreq2[head(ord2)])
## expos furnish word unless bound descript
## 34 36 36 37 38 38
(WordFreq2[tail(ord2)])
## author nation peopl power govern state
## 390 566 612 937 1040 1662
(Row_Sum_Per_doc <- rowSums((as.matrix(Papers_DTM2))))
## dispt_fed_49.txt dispt_fed_50.txt dispt_fed_51.txt
## 458 286 554
## dispt_fed_52.txt dispt_fed_53.txt dispt_fed_54.txt
## 500 598 508
## dispt_fed_55.txt dispt_fed_56.txt dispt_fed_57.txt
## 554 482 529
## dispt_fed_62.txt dispt_fed_63.txt Hamilton_fed_1.txt
## 595 821 413
## Hamilton_fed_11.txt Hamilton_fed_12.txt Hamilton_fed_13.txt
## 498 475 272
## Hamilton_fed_15.txt Hamilton_fed_16.txt Hamilton_fed_17.txt
## 729 506 441
## Hamilton_fed_21.txt Hamilton_fed_22.txt Hamilton_fed_23.txt
## 482 878 501
## Hamilton_fed_24.txt Hamilton_fed_25.txt Hamilton_fed_26.txt
## 455 510 608
## Hamilton_fed_27.txt Hamilton_fed_28.txt Hamilton_fed_29.txt
## 388 445 496
## Hamilton_fed_30.txt Hamilton_fed_31.txt Hamilton_fed_32.txt
## 510 457 408
## Hamilton_fed_33.txt Hamilton_fed_34.txt Hamilton_fed_35.txt
## 468 544 597
## Hamilton_fed_36.txt Hamilton_fed_59.txt Hamilton_fed_6.txt
## 715 521 420
## Hamilton_fed_60.txt Hamilton_fed_61.txt Hamilton_fed_65.txt
## 566 375 486
## Hamilton_fed_66.txt Hamilton_fed_67.txt Hamilton_fed_68.txt
## 559 401 390
## Hamilton_fed_69.txt Hamilton_fed_7.txt Hamilton_fed_70.txt
## 712 542 753
## Hamilton_fed_71.txt Hamilton_fed_72.txt Hamilton_fed_73.txt
## 413 485 610
## Hamilton_fed_74.txt Hamilton_fed_75.txt Hamilton_fed_76.txt
## 247 536 523
## Hamilton_fed_77.txt Hamilton_fed_78.txt Hamilton_fed_79.txt
## 525 762 259
## Hamilton_fed_8.txt Hamilton_fed_80.txt Hamilton_fed_81.txt
## 474 694 1059
## Hamilton_fed_82.txt Hamilton_fed_83.txt Hamilton_fed_84.txt
## 448 1450 1086
## Hamilton_fed_85.txt Hamilton_fed_9.txt HM_fed_18.txt
## 662 454 395
## HM_fed_19.txt HM_fed_20.txt Jay_fed_2.txt
## 419 348 439
## Jay_fed_3.txt Jay_fed_4.txt Jay_fed_5.txt
## 449 398 361
## Jay_fed_64.txt Madison_fed_10.txt Madison_fed_14.txt
## 604 767 472
## Madison_fed_37.txt Madison_fed_38.txt Madison_fed_39.txt
## 619 764 767
## Madison_fed_40.txt Madison_fed_41.txt Madison_fed_42.txt
## 773 886 716
## Madison_fed_43.txt Madison_fed_44.txt Madison_fed_45.txt
## 851 826 631
## Madison_fed_46.txt Madison_fed_47.txt Madison_fed_48.txt
## 718 804 496
## Madison_fed_58.txt
## 549
Papers_M2 <- as.matrix(Papers_DTM2)
Papers_M_N12 <- apply(Papers_M2, 1, function(i) round(i/sum(i),3))
Papers_Matrix_Norm2 <- t(Papers_M_N12)
Papers_dtm_matrix = as.matrix(Papers_DTM2)
Papers_DF2 <- as.data.frame(as.matrix(Papers_Matrix_Norm2))
#remove Jays papers
Papers_DF2<-Papers_DF2[-66:-70,]
Papers_DF3<- Papers_DF2%>%add_rownames()
## Warning: Deprecated, use tibble::rownames_to_column() instead.
names(Papers_DF3)[1]<-"Author"
Papers_DF3[1:11,1]="dispt"
Papers_DF3[12:65,1]="hamil"
Papers_DF3[66:80,1]="madis"
##Make Train and Test sets
trainRatio <- .75
set.seed(11) # Set Seed so that same sample can be reproduced in future also
sample2 <- sample.int(n = nrow(Papers_DF3), size = floor(trainRatio*nrow(Papers_DF3)), replace = FALSE)
train2 <- Papers_DF3[sample2, ]
test2 <- Papers_DF3[-sample2, ]
# train / test ratio
length(sample2)/nrow(Papers_DF3)
## [1] 0.75
##Decision Tree Models
#Train Tree Model 3
train_tree3 <- rpart(Author ~ ., data = train2, method="class", control=rpart.control(cp=0))
summary(train_tree3)
## Call:
## rpart(formula = Author ~ ., data = train2, method = "class",
## control = rpart.control(cp = 0))
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.4545455 0 1.0000000 1.0000000 0.1696699
## 2 0.2727273 1 0.5454545 0.6818182 0.1524592
## 3 0.0000000 2 0.2727273 0.5454545 0.1408358
##
## Variable importance
## upon matter kind assembl among maintain
## 21 11 10 9 9 9
## union branch confeder confederaci establish legisl
## 7 6 5 5 5 4
##
## Node number 1: 60 observations, complexity param=0.4545455
## predicted class=hamil expected loss=0.3666667 P(node) =1
## class counts: 9 38 13
## probabilities: 0.150 0.633 0.217
## left son=2 (35 obs) right son=3 (25 obs)
## Primary splits:
## upon < 0.0055 to the right, improve=17.126670, (0 missing)
## matter < 5e-04 to the right, improve= 8.925732, (0 missing)
## kind < 0.0015 to the right, improve= 7.103810, (0 missing)
## repres < 0.0115 to the right, improve= 7.035897, (0 missing)
## thing < 0.0015 to the right, improve= 6.822222, (0 missing)
## Surrogate splits:
## matter < 5e-04 to the right, agree=0.800, adj=0.52, (0 split)
## kind < 5e-04 to the right, agree=0.783, adj=0.48, (0 split)
## assembl < 0.0025 to the left, agree=0.767, adj=0.44, (0 split)
## among < 0.0035 to the left, agree=0.750, adj=0.40, (0 split)
## maintain < 0.0015 to the left, agree=0.750, adj=0.40, (0 split)
##
## Node number 2: 35 observations
## predicted class=hamil expected loss=0 P(node) =0.5833333
## class counts: 0 35 0
## probabilities: 0.000 1.000 0.000
##
## Node number 3: 25 observations, complexity param=0.2727273
## predicted class=madis expected loss=0.48 P(node) =0.4166667
## class counts: 9 3 13
## probabilities: 0.360 0.120 0.520
## left son=6 (12 obs) right son=7 (13 obs)
## Primary splits:
## union < 0.0035 to the left, improve=5.524615, (0 missing)
## mani < 0.0035 to the right, improve=4.806667, (0 missing)
## appli < 5e-04 to the right, improve=4.440000, (0 missing)
## men < 0.0015 to the right, improve=4.404706, (0 missing)
## branch < 0.003 to the right, improve=4.133506, (0 missing)
## Surrogate splits:
## branch < 0.003 to the right, agree=0.96, adj=0.917, (0 split)
## confeder < 0.0015 to the left, agree=0.84, adj=0.667, (0 split)
## confederaci < 5e-04 to the left, agree=0.84, adj=0.667, (0 split)
## establish < 0.003 to the left, agree=0.84, adj=0.667, (0 split)
## legisl < 0.0045 to the right, agree=0.80, adj=0.583, (0 split)
##
## Node number 6: 12 observations
## predicted class=dispt expected loss=0.25 P(node) =0.2
## class counts: 9 0 3
## probabilities: 0.750 0.000 0.250
##
## Node number 7: 13 observations
## predicted class=madis expected loss=0.2307692 P(node) =0.2166667
## class counts: 0 3 10
## probabilities: 0.000 0.231 0.769
#predict the test dataset using the model for train tree No. 1
predicted3= predict(train_tree3, test2, type="class")
(Results3 <- data.frame(Predicted=predicted3,Actual=test2$Author))
#plot number of splits
rsq.rpart(train_tree3)
##
## Classification tree:
## rpart(formula = Author ~ ., data = train2, method = "class",
## control = rpart.control(cp = 0))
##
## Variables actually used in tree construction:
## [1] union upon
##
## Root node error: 22/60 = 0.36667
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.45455 0 1.00000 1.00000 0.16967
## 2 0.27273 1 0.54545 0.68182 0.15246
## 3 0.00000 2 0.27273 0.54545 0.14084
## Warning in rsq.rpart(train_tree3): may not be applicable for this method


plotcp(train_tree3)

fancyRpartPlot(train_tree3)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted3, true=test2$Author)
## true
## Authorship dispt hamil madis
## dispt 2 0 1
## hamil 0 13 0
## madis 0 3 1
#attributed hamilton with disputed
#Train Tree Model 4
train_tree4 <- rpart(Author ~ ., data = train2, method="class", control=rpart.control(cp=0, minsplit = 2, maxdepth = 5))
summary(train_tree4)
## Call:
## rpart(formula = Author ~ ., data = train2, method = "class",
## control = rpart.control(cp = 0, minsplit = 2, maxdepth = 5))
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.45454545 0 1.00000000 1.0000000 0.1696699
## 2 0.27272727 1 0.54545455 0.6363636 0.1489171
## 3 0.09090909 2 0.27272727 0.6818182 0.1524592
## 4 0.04545455 4 0.09090909 0.9090909 0.1659765
## 5 0.00000000 6 0.00000000 0.8636364 0.1637836
##
## Variable importance
## upon matter kind assembl among maintain
## 15 8 7 7 6 6
## union branch confeder confederaci establish legisl
## 5 5 3 3 3 3
## appli america continu design elect everi
## 3 3 2 2 2 2
## full exist man seem act answer
## 2 2 2 2 1 1
## administr affair
## 1 1
##
## Node number 1: 60 observations, complexity param=0.4545455
## predicted class=hamil expected loss=0.3666667 P(node) =1
## class counts: 9 38 13
## probabilities: 0.150 0.633 0.217
## left son=2 (35 obs) right son=3 (25 obs)
## Primary splits:
## upon < 0.0055 to the right, improve=17.126670, (0 missing)
## matter < 5e-04 to the right, improve= 8.925732, (0 missing)
## kind < 0.0015 to the right, improve= 7.103810, (0 missing)
## repres < 0.0115 to the right, improve= 7.035897, (0 missing)
## thing < 0.0015 to the right, improve= 6.822222, (0 missing)
## Surrogate splits:
## matter < 5e-04 to the right, agree=0.800, adj=0.52, (0 split)
## kind < 5e-04 to the right, agree=0.783, adj=0.48, (0 split)
## assembl < 0.0025 to the left, agree=0.767, adj=0.44, (0 split)
## among < 0.0035 to the left, agree=0.750, adj=0.40, (0 split)
## maintain < 0.0015 to the left, agree=0.750, adj=0.40, (0 split)
##
## Node number 2: 35 observations
## predicted class=hamil expected loss=0 P(node) =0.5833333
## class counts: 0 35 0
## probabilities: 0.000 1.000 0.000
##
## Node number 3: 25 observations, complexity param=0.2727273
## predicted class=madis expected loss=0.48 P(node) =0.4166667
## class counts: 9 3 13
## probabilities: 0.360 0.120 0.520
## left son=6 (12 obs) right son=7 (13 obs)
## Primary splits:
## union < 0.0035 to the left, improve=5.524615, (0 missing)
## mani < 0.0035 to the right, improve=4.806667, (0 missing)
## appli < 5e-04 to the right, improve=4.440000, (0 missing)
## men < 0.0015 to the right, improve=4.404706, (0 missing)
## branch < 0.003 to the right, improve=4.133506, (0 missing)
## Surrogate splits:
## branch < 0.003 to the right, agree=0.96, adj=0.917, (0 split)
## confeder < 0.0015 to the left, agree=0.84, adj=0.667, (0 split)
## confederaci < 5e-04 to the left, agree=0.84, adj=0.667, (0 split)
## establish < 0.003 to the left, agree=0.84, adj=0.667, (0 split)
## legisl < 0.0045 to the right, agree=0.80, adj=0.583, (0 split)
##
## Node number 6: 12 observations, complexity param=0.09090909
## predicted class=dispt expected loss=0.25 P(node) =0.2
## class counts: 9 0 3
## probabilities: 0.750 0.000 0.250
## left son=12 (8 obs) right son=13 (4 obs)
## Primary splits:
## america < 0.0015 to the right, improve=3, (0 missing)
## answer < 0.001 to the right, improve=3, (0 missing)
## conclus < 5e-04 to the left, improve=3, (0 missing)
## mani < 0.0025 to the right, improve=3, (0 missing)
## relat < 0.001 to the right, improve=3, (0 missing)
## Surrogate splits:
## exist < 0.001 to the right, agree=0.917, adj=0.75, (0 split)
## man < 0.001 to the right, agree=0.917, adj=0.75, (0 split)
## seem < 0.0035 to the left, agree=0.917, adj=0.75, (0 split)
## act < 0.003 to the left, agree=0.833, adj=0.50, (0 split)
## answer < 0.001 to the right, agree=0.833, adj=0.50, (0 split)
##
## Node number 7: 13 observations, complexity param=0.09090909
## predicted class=madis expected loss=0.2307692 P(node) =0.2166667
## class counts: 0 3 10
## probabilities: 0.000 0.231 0.769
## left son=14 (4 obs) right son=15 (9 obs)
## Primary splits:
## appli < 0.001 to the right, improve=3.115385, (0 missing)
## capac < 0.0015 to the right, improve=3.115385, (0 missing)
## citizen < 0.001 to the left, improve=3.115385, (0 missing)
## direct < 0.0025 to the right, improve=3.115385, (0 missing)
## forc < 0.0055 to the right, improve=3.115385, (0 missing)
## Surrogate splits:
## continu < 0.0045 to the right, agree=0.923, adj=0.75, (0 split)
## design < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
## elect < 5e-04 to the left, agree=0.923, adj=0.75, (0 split)
## everi < 0.0035 to the left, agree=0.923, adj=0.75, (0 split)
## full < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
##
## Node number 12: 8 observations
## predicted class=dispt expected loss=0 P(node) =0.1333333
## class counts: 8 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 13: 4 observations, complexity param=0.04545455
## predicted class=madis expected loss=0.25 P(node) =0.06666667
## class counts: 1 0 3
## probabilities: 0.250 0.000 0.750
## left son=26 (1 obs) right son=27 (3 obs)
## Primary splits:
## affair < 0.007 to the right, improve=1.5, (0 missing)
## among < 0.0055 to the right, improve=1.5, (0 missing)
## amount < 0.0015 to the right, improve=1.5, (0 missing)
## answer < 0.001 to the right, improve=1.5, (0 missing)
## appear < 0.002 to the left, improve=1.5, (0 missing)
##
## Node number 14: 4 observations, complexity param=0.04545455
## predicted class=hamil expected loss=0.25 P(node) =0.06666667
## class counts: 0 3 1
## probabilities: 0.000 0.750 0.250
## left son=28 (3 obs) right son=29 (1 obs)
## Primary splits:
## administr < 0.0035 to the left, improve=1.5, (0 missing)
## adopt < 0.001 to the left, improve=1.5, (0 missing)
## affect < 0.001 to the left, improve=1.5, (0 missing)
## alon < 0.003 to the left, improve=1.5, (0 missing)
## america < 0.004 to the left, improve=1.5, (0 missing)
##
## Node number 15: 9 observations
## predicted class=madis expected loss=0 P(node) =0.15
## class counts: 0 0 9
## probabilities: 0.000 0.000 1.000
##
## Node number 26: 1 observations
## predicted class=dispt expected loss=0 P(node) =0.01666667
## class counts: 1 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 27: 3 observations
## predicted class=madis expected loss=0 P(node) =0.05
## class counts: 0 0 3
## probabilities: 0.000 0.000 1.000
##
## Node number 28: 3 observations
## predicted class=hamil expected loss=0 P(node) =0.05
## class counts: 0 3 0
## probabilities: 0.000 1.000 0.000
##
## Node number 29: 1 observations
## predicted class=madis expected loss=0 P(node) =0.01666667
## class counts: 0 0 1
## probabilities: 0.000 0.000 1.000
#predict the test dataset using the model for train tree No. 1
predicted4= predict(train_tree4, test2, type="class")
#plot number of splits
rsq.rpart(train_tree4)
##
## Classification tree:
## rpart(formula = Author ~ ., data = train2, method = "class",
## control = rpart.control(cp = 0, minsplit = 2, maxdepth = 5))
##
## Variables actually used in tree construction:
## [1] administr affair america appli union upon
##
## Root node error: 22/60 = 0.36667
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.454545 0 1.000000 1.00000 0.16967
## 2 0.272727 1 0.545455 0.63636 0.14892
## 3 0.090909 2 0.272727 0.68182 0.15246
## 4 0.045455 4 0.090909 0.90909 0.16598
## 5 0.000000 6 0.000000 0.86364 0.16378
## Warning in rsq.rpart(train_tree4): may not be applicable for this method


plotcp(train_tree4)

rpart.plot(train_tree4)

#confusion matrix to find correct and incorrect predictions
table(Authorship=predicted4, true=test2$Author)
## true
## Authorship dispt hamil madis
## dispt 0 0 0
## hamil 0 13 0
## madis 2 3 2
(Results4<-data.frame(Predicted=predicted4, Actual=test2$Author))
#Train Tree 5
train_tree5 <- rpart(Author ~ ., data = train2, method="class", control=rpart.control(cp=0, minsplit = 5, maxdepth = 7))
summary(train_tree5)
## Call:
## rpart(formula = Author ~ ., data = train2, method = "class",
## control = rpart.control(cp = 0, minsplit = 5, maxdepth = 7))
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.45454545 0 1.00000000 1.0000000 0.1696699
## 2 0.27272727 1 0.54545455 0.6363636 0.1489171
## 3 0.09090909 2 0.27272727 0.6363636 0.1489171
## 4 0.00000000 4 0.09090909 0.9090909 0.1659765
##
## Variable importance
## upon matter kind assembl among maintain
## 16 8 8 7 6 6
## union branch confeder confederaci establish legisl
## 5 5 3 3 3 3
## appli america continu design elect everi
## 3 3 2 2 2 2
## full exist man seem act answer
## 2 2 2 2 1 1
##
## Node number 1: 60 observations, complexity param=0.4545455
## predicted class=hamil expected loss=0.3666667 P(node) =1
## class counts: 9 38 13
## probabilities: 0.150 0.633 0.217
## left son=2 (35 obs) right son=3 (25 obs)
## Primary splits:
## upon < 0.0055 to the right, improve=17.126670, (0 missing)
## matter < 5e-04 to the right, improve= 8.925732, (0 missing)
## kind < 0.0015 to the right, improve= 7.103810, (0 missing)
## repres < 0.0115 to the right, improve= 7.035897, (0 missing)
## thing < 0.0015 to the right, improve= 6.822222, (0 missing)
## Surrogate splits:
## matter < 5e-04 to the right, agree=0.800, adj=0.52, (0 split)
## kind < 5e-04 to the right, agree=0.783, adj=0.48, (0 split)
## assembl < 0.0025 to the left, agree=0.767, adj=0.44, (0 split)
## among < 0.0035 to the left, agree=0.750, adj=0.40, (0 split)
## maintain < 0.0015 to the left, agree=0.750, adj=0.40, (0 split)
##
## Node number 2: 35 observations
## predicted class=hamil expected loss=0 P(node) =0.5833333
## class counts: 0 35 0
## probabilities: 0.000 1.000 0.000
##
## Node number 3: 25 observations, complexity param=0.2727273
## predicted class=madis expected loss=0.48 P(node) =0.4166667
## class counts: 9 3 13
## probabilities: 0.360 0.120 0.520
## left son=6 (12 obs) right son=7 (13 obs)
## Primary splits:
## union < 0.0035 to the left, improve=5.524615, (0 missing)
## mani < 0.0035 to the right, improve=4.806667, (0 missing)
## appli < 5e-04 to the right, improve=4.440000, (0 missing)
## men < 0.0015 to the right, improve=4.404706, (0 missing)
## branch < 0.003 to the right, improve=4.133506, (0 missing)
## Surrogate splits:
## branch < 0.003 to the right, agree=0.96, adj=0.917, (0 split)
## confeder < 0.0015 to the left, agree=0.84, adj=0.667, (0 split)
## confederaci < 5e-04 to the left, agree=0.84, adj=0.667, (0 split)
## establish < 0.003 to the left, agree=0.84, adj=0.667, (0 split)
## legisl < 0.0045 to the right, agree=0.80, adj=0.583, (0 split)
##
## Node number 6: 12 observations, complexity param=0.09090909
## predicted class=dispt expected loss=0.25 P(node) =0.2
## class counts: 9 0 3
## probabilities: 0.750 0.000 0.250
## left son=12 (8 obs) right son=13 (4 obs)
## Primary splits:
## america < 0.0015 to the right, improve=3, (0 missing)
## answer < 0.001 to the right, improve=3, (0 missing)
## conclus < 5e-04 to the left, improve=3, (0 missing)
## mani < 0.0025 to the right, improve=3, (0 missing)
## relat < 0.001 to the right, improve=3, (0 missing)
## Surrogate splits:
## exist < 0.001 to the right, agree=0.917, adj=0.75, (0 split)
## man < 0.001 to the right, agree=0.917, adj=0.75, (0 split)
## seem < 0.0035 to the left, agree=0.917, adj=0.75, (0 split)
## act < 0.003 to the left, agree=0.833, adj=0.50, (0 split)
## answer < 0.001 to the right, agree=0.833, adj=0.50, (0 split)
##
## Node number 7: 13 observations, complexity param=0.09090909
## predicted class=madis expected loss=0.2307692 P(node) =0.2166667
## class counts: 0 3 10
## probabilities: 0.000 0.231 0.769
## left son=14 (4 obs) right son=15 (9 obs)
## Primary splits:
## appli < 0.001 to the right, improve=3.115385, (0 missing)
## capac < 0.0015 to the right, improve=3.115385, (0 missing)
## citizen < 0.001 to the left, improve=3.115385, (0 missing)
## direct < 0.0025 to the right, improve=3.115385, (0 missing)
## forc < 0.0055 to the right, improve=3.115385, (0 missing)
## Surrogate splits:
## continu < 0.0045 to the right, agree=0.923, adj=0.75, (0 split)
## design < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
## elect < 5e-04 to the left, agree=0.923, adj=0.75, (0 split)
## everi < 0.0035 to the left, agree=0.923, adj=0.75, (0 split)
## full < 0.0015 to the right, agree=0.923, adj=0.75, (0 split)
##
## Node number 12: 8 observations
## predicted class=dispt expected loss=0 P(node) =0.1333333
## class counts: 8 0 0
## probabilities: 1.000 0.000 0.000
##
## Node number 13: 4 observations
## predicted class=madis expected loss=0.25 P(node) =0.06666667
## class counts: 1 0 3
## probabilities: 0.250 0.000 0.750
##
## Node number 14: 4 observations
## predicted class=hamil expected loss=0.25 P(node) =0.06666667
## class counts: 0 3 1
## probabilities: 0.000 0.750 0.250
##
## Node number 15: 9 observations
## predicted class=madis expected loss=0 P(node) =0.15
## class counts: 0 0 9
## probabilities: 0.000 0.000 1.000
predicted5= predict(train_tree5, test2, type="class")
rsq.rpart(train_tree5)
##
## Classification tree:
## rpart(formula = Author ~ ., data = train2, method = "class",
## control = rpart.control(cp = 0, minsplit = 5, maxdepth = 7))
##
## Variables actually used in tree construction:
## [1] america appli union upon
##
## Root node error: 22/60 = 0.36667
##
## n= 60
##
## CP nsplit rel error xerror xstd
## 1 0.454545 0 1.000000 1.00000 0.16967
## 2 0.272727 1 0.545455 0.63636 0.14892
## 3 0.090909 2 0.272727 0.63636 0.14892
## 4 0.000000 4 0.090909 0.90909 0.16598
## Warning in rsq.rpart(train_tree5): may not be applicable for this method


plotcp(train_tree5)

rpart.plot(train_tree5)

table(Authorship=predicted5, true = test2$Author)
## true
## Authorship dispt hamil madis
## dispt 0 0 0
## hamil 0 15 0
## madis 2 1 2