Tree-Based Methods for Predicting Authorship

#install.packages('tree')
library (tree)

Loading Training and Testing Data

# ISLR stays loaded in case other parts of this file rely on it. The Carseats
# data set is deliberately NOT attached: nothing in the visible authorship
# analysis reads its columns, and attach() only adds unrelated names to the
# search path where they can mask or be masked by other objects.
library(ISLR)

#head(Carseats)

# Training data used to fit the trees.
df <- read.csv("author_training.csv")

# Testing data used to evaluate the fitted trees.
kf <- read.csv("author_testing.csv")

Converting Response Variable to Numeric Class

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Numeric code assigned to each author label.
mapping <- c("Austen" = 1, "London" = 2, "Milton" = 3, "Shakespeare" = 4)

# Look the codes up by author NAME. Indexing with as.character() matters when
# read.csv() returned Author as a factor (the default before R 4.0): a factor
# index is used as its integer level codes, which only lines up with `mapping`
# when every author is present and the levels sort in the same order.
# unname() drops the author names that would otherwise be carried over from
# `mapping` into the new column.
df$Author.r <- unname(mapping[as.character(df$Author)])
kf$Author.r <- unname(mapping[as.character(kf$Author)])

Fitting Tree Model (the response is numeric, so `tree()` fits a regression tree)

library(tree)

# Author.r is numeric, so tree() grows a regression tree on the author codes
# (the summary output reports "Regression tree"). The original Author column
# is removed from the predictors.
tree1 <- tree(Author.r ~ . - Author, data = df)
summary(tree1)
## 
## Regression tree:
## tree(formula = Author.r ~ . - Author, data = df)
## Variables actually used in tree construction:
## [1] "was"  "my"   "its"  "be"   "to"   "this"
## Number of terminal nodes:  7 
## Residual mean deviance:  0.06852 = 39.88 / 582 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.71400 -0.04072  0.01026  0.00000  0.01026  2.01000
# Show the node-by-node listing of splits for the fitted tree.
print(tree1)
## node), split, n, deviance, yval
##       * denotes terminal node
## 
##  1) root 589 725.800 2.085  
##    2) was < 10.5 151  50.370 3.689  
##      4) my < 9.5 39  16.970 2.974  
##        8) its < 0.5 32   4.875 3.188 *
##        9) its > 0.5 7   4.000 2.000 *
##      5) my > 9.5 112   6.562 3.938 *
##    3) was > 10.5 438 153.100 1.532  
##      6) be < 9.5 195   9.979 1.990 *
##      7) be > 9.5 243  69.420 1.165  
##       14) to < 36.5 22  23.320 2.409  
##         28) this < 9.5 15   2.400 1.800 *
##         29) this > 9.5 7   3.429 3.714 *
##       15) to > 36.5 221   8.633 1.041 *
# Draw the tree skeleton, then add the split and leaf labels on top of it.
# pretty = 0 asks text() to use factor level names unchanged in split labels.
plot(tree1)
text(tree1, pretty = 0)

# Seed the random number generator before refitting.
set.seed(2)

# tree2 is fitted with the same formula and the same data as tree1, and its
# summary matches the one printed for tree1.
tree2 <- tree(Author.r ~ . - Author, data = df)
summary(tree2)
## 
## Regression tree:
## tree(formula = Author.r ~ . - Author, data = df)
## Variables actually used in tree construction:
## [1] "was"  "my"   "its"  "be"   "to"   "this"
## Number of terminal nodes:  7 
## Residual mean deviance:  0.06852 = 39.88 / 582 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.71400 -0.04072  0.01026  0.00000  0.01026  2.01000
# Predict on the testing set. The fitted tree is a regression tree, so the
# predictions are numeric leaf values rather than author codes; the table
# therefore has one row per distinct predicted value and one column per
# true author code.
tree.pred <- predict(tree2, newdata = kf)
#tree.pred
table(tree.pred, kf$Author.r)
##                   
## tree.pred           1  2  3  4
##   1.04072398190045 75  4  1  0
##   1.8               2  2  0  0
##   1.98974358974359  6 75  0  1
##   2                 1  1  0  0
##   3.1875            1  0 12  5
##   3.71428571428571  0  1  0  0
##   3.9375           10  0  6 49

Error for Classification Tree:

# Counts are read off the prediction table above after rounding each predicted
# value to the nearest author code:
#   code 1: 75;  code 2: 2 + 75 + 1 = 78;  code 3: 12;  code 4: 49.
# Their sum is the number of CORRECT test predictions, so dividing it by the
# 252 test rows gives the accuracy. The error rate is one minus that.
n_correct <- 75 + 78 + 12 + 49
n_test <- 252
Error1 <- 1 - n_correct / n_test
Error1
## [1] 0.1507937