Q1
  1. For the spam data, partition the data into 2/3 training and 1/3 test data.
library(ElemStatLearn)   # assumed source of the spam data used here (predictors A.1-A.57, response 'spam')
data("spam")
set.seed(1)
train <- sample(nrow(spam), nrow(spam) * 2/3)   # 2/3 of the rows for training
test  <- -train                                 # the remaining 1/3 for testing
spam.train <- spam[train, ]
spam.test  <- spam[test, ]
  2. Build the tree model for the spam training data using all the variables.

The decision tree approach divides the predictor space into high-dimensional rectangles. At each split we try to reduce the RSS (for a classification tree such as this one, an impurity measure like the deviance or Gini index plays the same role). The procedure is top-down because it starts at the top with the full set of observations and then splits it into two pieces, one split at a time at each level. It is greedy because it does not search for the best overall sequence of splits, but only for the best split at the immediate step it is looking at.
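
As a small illustration of that greedy step (a sketch only; best_split is a hypothetical helper, not a function from the tree package), the best cut point for one numeric predictor x and numeric response y can be found by scanning candidate cut points and keeping the one that minimises the combined RSS of the two resulting regions:

# Sketch of one greedy split: scan midpoints between observed x values and
# keep the cut that minimises the total RSS of the two resulting regions.
best_split <- function(x, y) {
  cuts <- sort(unique(x))
  cuts <- (head(cuts, -1) + tail(cuts, -1)) / 2   # midpoints between observed values
  rss  <- sapply(cuts, function(s) {
    left  <- y[x <  s]
    right <- y[x >= s]
    sum((left - mean(left))^2) + sum((right - mean(right))^2)
  })
  list(cut = cuts[which.min(rss)], rss = min(rss))  # greedy choice for this predictor
}

Repeating this over all predictors and keeping the overall best split is the greedy step; tree() applies it recursively inside each new region (using deviance rather than RSS for a classification tree such as this one).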

library(tree)
tree.spam <- tree(spam ~ ., data = spam.train)   # classification tree using all predictors
summary(tree.spam)
## 
## Classification tree:
## tree(formula = spam ~ ., data = spam.train)
## Variables actually used in tree construction:
## [1] "A.53" "A.52" "A.7"  "A.25" "A.56" "A.5"  "A.55" "A.16" "A.46"
## Number of terminal nodes:  13 
## Residual mean deviance:  0.5 = 1527 / 3054 
## Misclassification error rate: 0.08086 = 248 / 3067
plot(tree.spam)
text(tree.spam,pretty =1)

The output of summary() indicates that only nine variables were actually used in constructing the tree.
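
The same list of split variables can be pulled straight from the fitted tree object (a sketch using the var column of its frame component):

setdiff(unique(as.character(tree.spam$frame$var)), "<leaf>")   # predictors that appear in splits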

  3. Prune the tree using the cross-validated optimal tree size.
cv.spam <- cv.tree(tree.spam, FUN = prune.misclass)   # 10-fold CV, guided by misclassification error
cbind(cv.spam$size, cv.spam$dev)                      # candidate sizes and their CV error counts
##       [,1] [,2]
##  [1,]   13  291
##  [2,]   10  288
##  [3,]    9  301
##  [4,]    8  301
##  [5,]    7  315
##  [6,]    6  358
##  [7,]    5  398
##  [8,]    4  438
##  [9,]    2  637
## [10,]    1 1213
names(cv.spam)
## [1] "size"   "dev"    "k"      "method"
cv.spam
## $size
##  [1] 13 10  9  8  7  6  5  4  2  1
## 
## $dev
##  [1]  291  288  301  301  315  358  398  438  637 1213
## 
## $k
##  [1]       -Inf   5.333333  11.000000  12.000000  17.000000  30.000000
##  [7]  42.000000  51.000000  96.500000 593.000000
## 
## $method
## [1] "misclass"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"

If we do not prune, the decision tree tends to overfit, so it is important to prune it back to the right size. Pruning here is cost-complexity (weakest-link) pruning: for each value of the tuning parameter alpha (the k reported by cv.tree()), it selects the subtree that minimises the training error plus alpha times the number of terminal nodes. The whole idea of pruning a tree back is to reduce variance, at the price of a little extra bias.

I used the cv.tree() function, with misclassification error as the pruning criterion, to see whether pruning the tree improves performance. The cross-validated error is essentially flat near the full tree: it is lowest at size 10 (288) and only slightly higher at the full size of 13 (291), so pruning is not expected to help much here and the unpruned tree is kept.
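
For reference, if one did want to cut the tree back to the CV-chosen size, prune.misclass() could be applied as follows (a sketch; the rest of the analysis keeps the unpruned tree):

best.size  <- cv.spam$size[which.min(cv.spam$dev)]    # 10 in the output above
prune.spam <- prune.misclass(tree.spam, best = best.size)
plot(prune.spam); text(prune.spam, pretty = 0)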

  4. Apply it to the test data and get the confusion matrix and error rate.
pred <- predict(tree.spam, spam.test, type = "class")   # predicted class labels on the test set
obs  <- spam.test[, "spam"]                             # observed class labels
conf <- table(pred, obs)                                # confusion matrix (predicted vs. observed)
1 - sum(diag(conf)) / sum(conf)                         # test misclassification error rate
## [1] 0.09126467
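
The confusion matrix itself, requested in this part, can be displayed by printing conf (its counts are not reproduced here):

conf   # rows = predicted class, columns = observed class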
  5. Compare it with a GAM model.
library(gam)   # for gam() with smoothing-spline terms s()
set.seed(123)
# Note: a fresh 2/3 training split is drawn here (seed 123), so the GAM is fitted
# and evaluated on a different random partition than the tree above.
train <- sample(1:dim(spam)[1], dim(spam)[1] * .67, replace = FALSE)
test <- -train
# Smoothing-spline terms for the nine predictors the tree actually used
gam.lr <- gam(spam ~ s(A.52) + s(A.53) + s(A.55) + s(A.7, df = 4) + s(A.25) + s(A.56) +
                s(A.16) + s(A.46) + s(A.5), data = spam[train, ], family = binomial)
summary(gam.lr)
## 
## Call: gam(formula = spam ~ s(A.52) + s(A.53) + s(A.55) + s(A.7, df = 4) + 
##     s(A.25) + s(A.56) + s(A.16) + s(A.46) + s(A.5), family = binomial, 
##     data = spam[train, ])
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -4.72415 -0.35405 -0.05167  0.16429  3.71666 
## 
## (Dispersion Parameter for binomial family taken to be 1)
## 
##     Null Deviance: 4122.181 on 3081 degrees of freedom
## Residual Deviance: 1292.174 on 3045 degrees of freedom
## AIC: 1366.174 
## 
## Number of Local Scoring Iterations: 21 
## 
## Anova for Parametric Effects
##                  Df Sum Sq Mean Sq F value  Pr(>F)  
## s(A.52)           1     66  65.705  2.7322 0.09845 .
## s(A.53)           1     57  56.814  2.3625 0.12439  
## s(A.55)           1     16  15.559  0.6470 0.42126  
## s(A.7, df = 4)    1     34  34.141  1.4196 0.23355  
## s(A.25)           1     56  56.203  2.3371 0.12643  
## s(A.56)           1      9   9.044  0.3761 0.53976  
## s(A.16)           1     55  54.649  2.2724 0.13180  
## s(A.46)           1     17  16.722  0.6953 0.40442  
## s(A.5)            1     39  38.598  1.6050 0.20530  
## Residuals      3045  73229  24.049                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##                Npar Df Npar Chisq    P(Chi)    
## (Intercept)                                    
## s(A.52)              3     58.671 1.131e-12 ***
## s(A.53)              3    127.869 < 2.2e-16 ***
## s(A.55)              3     46.137 5.303e-10 ***
## s(A.7, df = 4)       3     52.740 2.084e-11 ***
## s(A.25)              3     39.388 1.437e-08 ***
## s(A.56)              3     49.493 1.024e-10 ***
## s(A.16)              3     24.476 1.987e-05 ***
## s(A.46)              3     21.526 8.186e-05 ***
## s(A.5)               3     49.969 8.114e-11 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")

# predict() without type = "response" returns values on the link (logit) scale,
# so the cutoff pr > .5 corresponds to a fitted probability of about 0.62 rather than 0.5
pr <- predict(gam.lr, newdata = spam[test, ])
conf_gam <- table(pr > .5, spam$spam[test])
1 - sum(diag(conf_gam)) / sum(conf_gam)   # test error rate of the GAM
## [1] 0.08229098
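
Equivalently (a sketch, not rerun here), the fitted probabilities can be requested directly and thresholded at 0.5:

prob <- predict(gam.lr, newdata = spam[test, ], type = "response")   # fitted P(spam)
conf_gam2 <- table(prob > 0.5, spam$spam[test])
1 - sum(diag(conf_gam2)) / sum(conf_gam2)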

We observe that the test error rate of the GAM is about 8.2%, compared with about 9.1% for the decision tree, so we may prefer the GAM for this data set (bearing in mind that the two models were evaluated on different random train/test splits).

Q2
  1. For the cpus data, partition the data into 2/3 training and 1/3 test data.
library(MASS)   # source of the cpus data
data(cpus)
head(cpus)
##             name syct mmin  mmax cach chmin chmax perf estperf
## 1  ADVISOR 32/60  125  256  6000  256    16   128  198     199
## 2  AMDAHL 470V/7   29 8000 32000   32     8    32  269     253
## 3  AMDAHL 470/7A   29 8000 32000   32     8    32  220     253
## 4 AMDAHL 470V/7B   29 8000 32000   32     8    32  172     253
## 5 AMDAHL 470V/7C   29 8000 16000   32     8    16  132     132
## 6  AMDAHL 470V/8   26 8000 32000   64     8    32  318     290
cpus1 <- cpus[, 2:8]   # drop the machine name and the published estimate 'estperf'
set.seed(1)
train <- sample(nrow(cpus1), nrow(cpus1) * 2/3)   # 2/3 training indices
test  <- -train
cpus1.train <- cpus1[train, ]
cpus1.test  <- cpus1[test, ]
perf.test <- cpus1$perf[test]   # observed test responses
  2. Build the tree model for the cpus training data using all the variables. The response variable is perf.
library(tree)
rtree.cpus1 <- tree(perf ~ ., data = cpus1.train)   # regression tree for perf
summary(rtree.cpus1)
## 
## Regression tree:
## tree(formula = perf ~ ., data = cpus1.train)
## Variables actually used in tree construction:
## [1] "mmax" "cach"
## Number of terminal nodes:  5 
## Residual mean deviance:  5400 = 723600 / 134 
## Distribution of residuals:
##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -306.000  -20.330   -4.768    0.000   18.340  531.000
plot(rtree.cpus1)
text(rtree.cpus1, pretty = 0)   # pretty is an argument to text(), not plot()

  3. Prune the tree using the cross-validated optimal tree size.
cv.cpus1 <- cv.tree(rtree.cpus1)   # 10-fold CV, deviance-guided pruning (default for regression trees)
cv.cpus1
## $size
## [1] 5 4 3 2 1
## 
## $dev
## [1] 1339392 1472831 1553744 2057034 3441124
## 
## $k
## [1]      -Inf   79186.4  134315.7  501028.0 1603080.6
## 
## $method
## [1] "deviance"
## 
## attr(,"class")
## [1] "prune"         "tree.sequence"
plot(cv.cpus1$size, cv.cpus1$dev, type='b')

prune.cpus1=prune.tree(rtree.cpus1, best=5)
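
The pruning size can also be read off the cross-validation object programmatically (a sketch; here it simply returns 5, i.e. the full tree is kept):

best.size   <- cv.cpus1$size[which.min(cv.cpus1$dev)]   # 5 in the output above
prune.cpus1 <- prune.tree(rtree.cpus1, best = best.size)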
  4. Apply it to the test data and get the mean squared error and R-squared value.
yhat <- predict(prune.cpus1, newdata = cpus1[test, ])   # predicted perf on the test set
mse <- mean((perf.test - yhat)^2)                       # test mean squared error
R2 <- 1 - (mse / var(perf.test))                        # proportion of test-set variance explained
R2
## [1] 0.7285199
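
The test mean squared error, also requested in this part, is stored in mse and can be printed directly (its value is not reproduced here):

mse   # test MSE of the pruned regression tree
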
plot(prune.cpus1);text(prune.cpus1, pretty=0)

plot(yhat, perf.test)
abline(0,1)

  5. Compare it with a GAM model.
library(gam)
gam.lr <- gam(perf ~ s(mmax) + s(cach), data = cpus1.train)   # smooth terms for the two predictors the tree used
summary(gam.lr)
## 
## Call: gam(formula = perf ~ s(mmax) + s(cach), data = cpus1.train)
## Deviance Residuals:
##      Min       1Q   Median       3Q      Max 
## -117.224  -12.093   -2.048   13.482  172.308 
## 
## (Dispersion Parameter for gaussian family taken to be 1458.16)
## 
##     Null Deviance: 3041231 on 138 degrees of freedom
## Residual Deviance: 189560.4 on 129.9998 degrees of freedom
## AIC: 1417.766 
## 
## Number of Local Scoring Iterations: 2 
## 
## Anova for Parametric Effects
##            Df  Sum Sq Mean Sq  F value    Pr(>F)    
## s(mmax)     1 2314942 2314942 1587.578 < 2.2e-16 ***
## s(cach)     1  138706  138706   95.124 < 2.2e-16 ***
## Residuals 130  189560    1458                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Anova for Nonparametric Effects
##             Npar Df Npar F   Pr(F)    
## (Intercept)                           
## s(mmax)           3 95.313 < 2e-16 ***
## s(cach)           3  3.369 0.02063 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")
yhat <- predict(gam.lr, newdata = cpus1[test, ])   # GAM predictions on the test set
mse <- mean((perf.test - yhat)^2)                  # test mean squared error
R2 <- 1 - (mse / var(perf.test))
R2
## [1] 0.848741

The results indicate that the test-set R^2 of the GAM is about 84.9%, whereas the R^2 of the pruned regression tree is about 72.9%, so the GAM again gives the better fit for this data set.