data("spam")
set.seed(1)
train =sample(nrow(spam), nrow(spam)*2/3)
test=-train
spam.train=spam[train,]
spam.test=spam[test,]
The decision-tree approach divides the predictor space into high-dimensional rectangles. At each split we aim for the largest reduction in RSS (for regression) or in a node-impurity measure such as the deviance or Gini index (for classification). The procedure is top-down because it starts at the top with the whole set of observations and then splits it into two pieces, one split at a time at each level. It is greedy because it does not search for the best overall sequence of splits, but only for the best split at the immediate step it is looking at.
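As a toy illustration of a single greedy split (a sketch on simulated data, separate from the spam analysis), the chosen cut point for one predictor is the one that minimizes the combined RSS of the two resulting regions:
# toy example: find the best split point for one predictor by RSS
set.seed(42)
x <- runif(100)
y <- ifelse(x < 0.4, 1, 3) + rnorm(100, sd = 0.3)   # true break at x = 0.4
rss_split <- function(s) {
  left  <- y[x <  s]
  right <- y[x >= s]
  sum((left - mean(left))^2) + sum((right - mean(right))^2)
}
cuts <- sort(x)[-1]                         # candidate cut points
cuts[which.min(sapply(cuts, rss_split))]    # should land close to the true break at 0.4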
library(tree)
tree.spam =tree(spam~., data=spam.train)
summary(tree.spam)
##
## Classification tree:
## tree(formula = spam ~ ., data = spam.train)
## Variables actually used in tree construction:
## [1] "A.53" "A.52" "A.7" "A.25" "A.56" "A.5" "A.55" "A.16" "A.46"
## Number of terminal nodes: 13
## Residual mean deviance: 0.5 = 1527 / 3054
## Misclassification error rate: 0.08086 = 248 / 3067
plot(tree.spam)
text(tree.spam,pretty =1)
The output of summary() indicates that only nine variables were actually used in constructing the tree; the training misclassification rate is about 8.1%.
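The same list can also be pulled straight from the fitted object; a short sketch using the tree's frame component, where non-leaf rows record their splitting variable:
vars <- as.character(tree.spam$frame$var)
unique(vars[vars != "<leaf>"])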
cv.spam=cv.tree(tree.spam, FUN = prune.misclass)
cbind(cv.spam$size,cv.spam$dev)
## [,1] [,2]
## [1,] 13 291
## [2,] 10 288
## [3,] 9 301
## [4,] 8 301
## [5,] 7 315
## [6,] 6 358
## [7,] 5 398
## [8,] 4 438
## [9,] 2 637
## [10,] 1 1213
names(cv.spam)
## [1] "size" "dev" "k" "method"
cv.spam
## $size
## [1] 13 10 9 8 7 6 5 4 2 1
##
## $dev
## [1] 291 288 301 301 315 358 398 438 637 1213
##
## $k
## [1] -Inf 5.333333 11.000000 12.000000 17.000000 30.000000
## [7] 42.000000 51.000000 96.500000 593.000000
##
## $method
## [1] "misclass"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
An unpruned decision tree tends to overfit, so it is important to prune it back to an appropriate size. Pruning is based on cost-complexity (weakest-link) pruning; the whole idea of pruning a tree back is to reduce variance at the cost of a small increase in bias.
I used the cv.tree() function with FUN = prune.misclass to see whether pruning the tree improves performance. The cross-validated misclassification count is lowest at 10 terminal nodes (288) and only slightly higher for the full 13-node tree (291), so pruning offers essentially no improvement; I keep the unpruned tree for prediction and check the 10-node tree below for completeness.
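A sketch of that check on the 10-node tree suggested by cross-validation; its test error should be very close to that of the full tree (prune.spam and the other names are just illustrative):
prune.spam <- prune.misclass(tree.spam, best = 10)
pred.prune <- predict(prune.spam, spam.test, type = "class")
conf.prune <- table(pred.prune, spam.test$spam)
1 - sum(diag(conf.prune)) / sum(conf.prune)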
pred=predict(tree.spam,spam.test, type="class")
obs=spam.test[,"spam"]
conf=table(pred,obs)
1-sum(diag(conf))/sum(conf)
## [1] 0.09126467
Next, I fit a logistic-regression GAM with smoothing splines of the nine predictors the tree selected, using a fresh train/test split.
set.seed(123)
train <- sample(1:dim(spam)[1], dim(spam)[1]*.67, rep=FALSE)
test <- -train
library(gam)
gam.lr=gam(spam ~ s(A.52) + s(A.53) + s(A.55) + s(A.7,df=4) + s(A.25) + s(A.56) +
  s(A.16) + s(A.46) + s(A.5), data=spam[train,], family=binomial)
summary(gam.lr)
##
## Call: gam(formula = spam ~ s(A.52) + s(A.53) + s(A.55) + s(A.7, df = 4) +
## s(A.25) + s(A.56) + s(A.16) + s(A.46) + s(A.5), family = binomial,
## data = spam[train, ])
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.72415 -0.35405 -0.05167 0.16429 3.71666
##
## (Dispersion Parameter for binomial family taken to be 1)
##
## Null Deviance: 4122.181 on 3081 degrees of freedom
## Residual Deviance: 1292.174 on 3045 degrees of freedom
## AIC: 1366.174
##
## Number of Local Scoring Iterations: 21
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(A.52) 1 66 65.705 2.7322 0.09845 .
## s(A.53) 1 57 56.814 2.3625 0.12439
## s(A.55) 1 16 15.559 0.6470 0.42126
## s(A.7, df = 4) 1 34 34.141 1.4196 0.23355
## s(A.25) 1 56 56.203 2.3371 0.12643
## s(A.56) 1 9 9.044 0.3761 0.53976
## s(A.16) 1 55 54.649 2.2724 0.13180
## s(A.46) 1 17 16.722 0.6953 0.40442
## s(A.5) 1 39 38.598 1.6050 0.20530
## Residuals 3045 73229 24.049
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar Chisq P(Chi)
## (Intercept)
## s(A.52) 3 58.671 1.131e-12 ***
## s(A.53) 3 127.869 < 2.2e-16 ***
## s(A.55) 3 46.137 5.303e-10 ***
## s(A.7, df = 4) 3 52.740 2.084e-11 ***
## s(A.25) 3 39.388 1.437e-08 ***
## s(A.56) 3 49.493 1.024e-10 ***
## s(A.16) 3 24.476 1.987e-05 ***
## s(A.46) 3 21.526 8.186e-05 ***
## s(A.5) 3 49.969 8.114e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")
pr=predict(gam.lr,newdata=spam[test,])
conf_gam= table(pr>.5,spam$spam[test])
1-sum(diag(conf_gam))/sum(conf_gam)
## [1] 0.08229098
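Note that predict() on a binomial gam object returns the linear predictor by default, so the 0.5 cutoff above is on the logit scale (roughly a 0.62 probability cutoff). A sketch of the probability-scale version, which may give a slightly different error rate since the two cutoffs are not equivalent:
pr.prob <- predict(gam.lr, newdata = spam[test, ], type = "response")
conf.prob <- table(pr.prob > 0.5, spam$spam[test])
1 - sum(diag(conf.prob)) / sum(conf.prob)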
The test error rate of the GAM is about 8.2%, compared with about 9.1% for the decision tree, so we may prefer the GAM for this data set. Note, however, that the two error rates come from different random train/test splits, so the comparison is only rough; a like-for-like check is sketched below.
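A sketch of that like-for-like check, refitting and evaluating the tree on the same seed-123 split used for the GAM (tree.spam2 is just an illustrative name):
tree.spam2 <- tree(spam ~ ., data = spam[train, ])
pred2 <- predict(tree.spam2, spam[test, ], type = "class")
conf2 <- table(pred2, spam$spam[test])
1 - sum(diag(conf2)) / sum(conf2)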
I repeat the same workflow for the cpus performance data: fit a regression tree, choose its size by cross-validation, and compare it with a GAM.
library(MASS)   # the cpus data set
data(cpus)
head(cpus)
## name syct mmin mmax cach chmin chmax perf estperf
## 1 ADVISOR 32/60 125 256 6000 256 16 128 198 199
## 2 AMDAHL 470V/7 29 8000 32000 32 8 32 269 253
## 3 AMDAHL 470/7A 29 8000 32000 32 8 32 220 253
## 4 AMDAHL 470V/7B 29 8000 32000 32 8 32 172 253
## 5 AMDAHL 470V/7C 29 8000 16000 32 8 16 132 132
## 6 AMDAHL 470V/8 26 8000 32000 64 8 32 318 290
cpus1 <- cpus[, 2:8]   # keep syct through perf; drop the name column and estperf (the published estimate)
set.seed(1)
train =sample(nrow(cpus1), nrow(cpus1)*2/3)
test=-train
cpus1.train=cpus1[train,]
cpus1.test=cpus1[test,]
perf.test=cpus1$perf[test]
library(tree)
rtree.cpus1 =tree(perf~., data=cpus1.train)
summary(rtree.cpus1)
##
## Regression tree:
## tree(formula = perf ~ ., data = cpus1.train)
## Variables actually used in tree construction:
## [1] "mmax" "cach"
## Number of terminal nodes: 5
## Residual mean deviance: 5400 = 723600 / 134
## Distribution of residuals:
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -306.000 -20.330 -4.768 0.000 18.340 531.000
plot(rtree.cpus1)
text(rtree.cpus1, pretty=0)
cv.cpus1=cv.tree(rtree.cpus1)
cv.cpus1
## $size
## [1] 5 4 3 2 1
##
## $dev
## [1] 1339392 1472831 1553744 2057034 3441124
##
## $k
## [1] -Inf 79186.4 134315.7 501028.0 1603080.6
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cv.cpus1$size, cv.cpus1$dev, type='b')
prune.cpus1=prune.tree(rtree.cpus1, best=5)   # CV deviance is lowest at 5 terminal nodes, i.e. the full tree
yhat=predict(prune.cpus1, newdata=cpus1[test,])
mse=mean((perf.test-yhat)^2)
R2=1-(mse/var(perf.test))
R2
## [1] 0.7285199
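The test R^2 above is computed as 1 - MSE / Var(y_test); since the same formula is reused for the GAM below, a small helper (the name test_r2 is just illustrative) makes it explicit:
test_r2 <- function(obs, pred) 1 - mean((obs - pred)^2) / var(obs)
test_r2(perf.test, yhat)   # identical to the R2 computed above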
plot(prune.cpus1);text(prune.cpus1, pretty=0)
plot(yhat, perf.test)
abline(0,1)
library(gam)
gam.lr=gam(perf ~ s(mmax) + s(cach), data=cpus1.train)
summary(gam.lr)
##
## Call: gam(formula = perf ~ s(mmax) + s(cach), data = cpus1.train)
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -117.224 -12.093 -2.048 13.482 172.308
##
## (Dispersion Parameter for gaussian family taken to be 1458.16)
##
## Null Deviance: 3041231 on 138 degrees of freedom
## Residual Deviance: 189560.4 on 129.9998 degrees of freedom
## AIC: 1417.766
##
## Number of Local Scoring Iterations: 2
##
## Anova for Parametric Effects
## Df Sum Sq Mean Sq F value Pr(>F)
## s(mmax) 1 2314942 2314942 1587.578 < 2.2e-16 ***
## s(cach) 1 138706 138706 95.124 < 2.2e-16 ***
## Residuals 130 189560 1458
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Anova for Nonparametric Effects
## Npar Df Npar F Pr(F)
## (Intercept)
## s(mmax) 3 95.313 < 2e-16 ***
## s(cach) 3 3.369 0.02063 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
par(mfrow=c(1,3))
plot(gam.lr,se=T,col="green")
yhat=predict(gam.lr, newdata=cpus1[test,])
mse=mean((perf.test-yhat)^2)
R2=1-(mse/var(perf.test))
R2
## [1] 0.848741
The test-set R^2 of the GAM is about 0.85, whereas the regression tree achieves about 0.73, so the GAM gives the better fit for this data set.