library(gbm)
## Loaded gbm 2.1.8
library(tictoc)
library(ROCR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tree)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
1: Read the Data
Bank=read.csv("bank marketing.csv")
Bank=mutate_if(Bank, is.character, as.factor)
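As an aside, mutate_if is superseded in dplyr 1.0+; the across() form below is an equivalent (and idempotent) way to write the same conversion.
# modern dplyr equivalent of the mutate_if call above
Bank = mutate(Bank, across(where(is.character), as.factor))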
attach(Bank)
2: Classification Tree
set.seed(10)
train=sample(nrow(Bank),nrow(Bank)/2)
bank.tree=tree(y~.-duration,data=Bank, subset=train)
plot(bank.tree)
text(bank.tree, cex=0.7)
bank.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 20594 14300.0 no ( 0.88968 0.11032 )
## 2) nr.employed < 5087.65 2507 3440.0 no ( 0.55963 0.44037 )
## 4) poutcome: failure,nonexistent 1942 2535.0 no ( 0.64109 0.35891 ) *
## 5) poutcome: success 565 669.7 yes ( 0.27965 0.72035 ) *
## 3) nr.employed > 5087.65 18087 8659.0 no ( 0.93542 0.06458 )
## 6) month: aug,jul,jun,may,nov 16675 6827.0 no ( 0.94789 0.05211 ) *
## 7) month: apr,dec,mar,oct 1412 1458.0 no ( 0.78824 0.21176 ) *
3: Another Classification Tree
bank.tree2 = tree(y~.-duration,data=Bank, subset=train, control=tree.control(nobs=nrow(Bank)/2, mindev=0.005))
plot(bank.tree2)
text(bank.tree2, cex=0.7)
bank.tree2
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 20594 14300.0 no ( 0.88968 0.11032 )
## 2) nr.employed < 5087.65 2507 3440.0 no ( 0.55963 0.44037 )
## 4) poutcome: failure,nonexistent 1942 2535.0 no ( 0.64109 0.35891 ) *
## 5) poutcome: success 565 669.7 yes ( 0.27965 0.72035 ) *
## 3) nr.employed > 5087.65 18087 8659.0 no ( 0.93542 0.06458 )
## 6) month: aug,jul,jun,may,nov 16675 6827.0 no ( 0.94789 0.05211 )
## 12) euribor3m < 4.0485 3211 1868.0 no ( 0.91498 0.08502 ) *
## 13) euribor3m > 4.0485 13464 4881.0 no ( 0.95573 0.04427 ) *
## 7) month: apr,dec,mar,oct 1412 1458.0 no ( 0.78824 0.21176 )
## 14) job: blue-collar,entrepreneur,management,services 614 414.8 no ( 0.89414 0.10586 ) *
## 15) job: admin.,housemaid,retired,self-employed,student,technician,unemployed,unknown 798 965.6 no ( 0.70677 0.29323 ) *
4: CV Tree
set.seed(11)
cvout=cv.tree(bank.tree2)
names(cvout)
## [1] "size" "dev" "k" "method"
cvout
## $size
## [1] 6 5 4 3 2 1
##
## $dev
## [1] 11498.18 11498.18 11498.18 11730.85 12101.69 14302.53
##
## $k
## [1] -Inf 77.27924 77.55388 234.60860 374.82680 2201.10208
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cvout$size,cvout$dev,type="b")
mindev=which.min(cvout$dev)
minsize=cvout$size[mindev]
points(minsize,cvout$dev[mindev],pch=4,cex=2,col=2)
# sizes 6, 5, and 4 tie for the minimum deviance; which.min picks the first
# (size 6), so the "pruned" tree below is identical to bank.tree2
bank.pruned=prune.tree(bank.tree2,best=minsize)
plot(bank.pruned)
text(bank.pruned)
bank.pruned
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 20594 14300.0 no ( 0.88968 0.11032 )
## 2) nr.employed < 5087.65 2507 3440.0 no ( 0.55963 0.44037 )
## 4) poutcome: failure,nonexistent 1942 2535.0 no ( 0.64109 0.35891 ) *
## 5) poutcome: success 565 669.7 yes ( 0.27965 0.72035 ) *
## 3) nr.employed > 5087.65 18087 8659.0 no ( 0.93542 0.06458 )
## 6) month: aug,jul,jun,may,nov 16675 6827.0 no ( 0.94789 0.05211 )
## 12) euribor3m < 4.0485 3211 1868.0 no ( 0.91498 0.08502 ) *
## 13) euribor3m > 4.0485 13464 4881.0 no ( 0.95573 0.04427 ) *
## 7) month: apr,dec,mar,oct 1412 1458.0 no ( 0.78824 0.21176 )
## 14) job: blue-collar,entrepreneur,management,services 614 414.8 no ( 0.89414 0.10586 ) *
## 15) job: admin.,housemaid,retired,self-employed,student,technician,unemployed,unknown 798 965.6 no ( 0.70677 0.29323 ) *
5: Predict
# Method 1: class probabilities, thresholded by hand at 0.5
y.prob = predict(bank.pruned, Bank[-train,], type="vector")
y.pred = ifelse(y.prob[,2]>.5,'yes','no')
# Method 2: predicted class labels directly
y.pred = predict(bank.pruned, Bank[-train,],type="class")
table(y.pred,Bank$y[-train])
##
## y.pred no yes
## no 18071 1971
## yes 155 397
err = sum(y.pred!=Bank$y[-train])/length(Bank$y[-train])
err
## [1] 0.103234
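A brief aside on why Method 1 is worth keeping: with only about 11% "yes" responses, the default 0.5 cutoff predicts "no" almost everywhere. The sketch below re-thresholds the Method 1 probabilities at 0.25; that cutoff is purely illustrative, not tuned.
# re-threshold the Method 1 probabilities at an illustrative 0.25
y.pred25 = ifelse(y.prob[,2]>0.25,'yes','no')
table(y.pred25, Bank$y[-train])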
6: ROC curve
rocplot=function(pred, truth, ...){
  predob=prediction(pred, truth)
  perf=performance(predob, "tpr", "fpr")
  plot(perf, ...)
}
rocplot(y.prob[,2],Bank$y[-train], col="blue")
legend(x=0.65, y=0.1,legend="tree",col="blue",lwd=1, bty="n")
auc=function(pred, truth){
  predob=prediction(pred, truth)
  perf=performance(predob, "auc")
  return(perf@y.values[[1]])
}
auc.tree=auc(y.prob[,2],Bank$y[-train])
legend(x=0.85, y=0.1,legend=round(auc.tree,4),bty="n")
7: Bagging
set.seed(300)
# mtry=19 uses all available predictors (duration excluded), i.e. bagging
y.bag=randomForest(y~.-duration,mtry=19,data=Bank,importance=T)
y.bag
##
## Call:
## randomForest(formula = y ~ . - duration, data = Bank, mtry = 19, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 19
##
## OOB estimate of error rate: 10.99%
## Confusion matrix:
## no yes class.error
## no 35222 1326 0.03628106
## yes 3199 1441 0.68943966
varImpPlot(y.bag)
8: ROC
# votes[,1] holds each observation's OOB vote fraction for "no";
# its negative is a score that increases with the "yes" vote fraction
rocplot(-y.bag$votes[1:nrow(Bank)],Bank$y, col=2)
auc.bag=auc(-y.bag$votes[1:nrow(Bank)],Bank$y)
auc.bag## [1] 0.7701062
9: Random Forest (smaller mtry)
set.seed(300)
y.bag2=randomForest(y~.-duration,mtry=4,data=Bank,importance=T)
y.bag2
##
## Call:
## randomForest(formula = y ~ . - duration, data = Bank, mtry = 4, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 10.12%
## Confusion matrix:
## no yes class.error
## no 35654 894 0.02446098
## yes 3275 1365 0.70581897
imp = varImpPlot(y.bag2)
rocplot(-y.bag2$votes[1:nrow(Bank)],Bank$y, col=4)
auc.rf=auc(-y.bag2$votes[1:nrow(Bank)],Bank$y)
imp
## MeanDecreaseAccuracy MeanDecreaseGini
## age 37.868208 822.74482
## job 64.732283 589.60669
## marital 21.768342 210.69247
## education 38.614392 439.23864
## default 16.244608 73.26449
## housing 4.270088 192.56984
## loan 0.398690 144.17848
## contact 12.520663 96.28935
## month 28.817570 235.93786
## day_of_week 37.917124 377.94894
## campaign 21.874501 410.82516
## pdays 25.952835 286.76837
## previous 9.355320 122.31539
## poutcome 20.008144 251.69563
## emp.var.rate 17.773146 143.61591
## cons.price.idx 19.096216 144.20311
## cons.conf.idx 18.067662 179.37684
## euribor3m 32.845985 884.87846
## nr.employed 25.646699 502.71541
auc.rf
## [1] 0.7855867
10: Boosting
y01=ifelse(Bank$y=="yes",1,0)
set.seed(300)
bank.gbm=gbm(y01~.-duration-y, Bank, distribution="bernoulli", n.trees=7000, interaction.depth=2, shrinkage=0.01, cv.folds=5)
plot(bank.gbm$cv.error)
min.tree=which.min(bank.gbm$cv.error)
min.tree
## [1] 5493
bank.gbm$cv.error[min.tree]
## [1] 0.5442875
summary(bank.gbm,n.trees=min.tree,las=1)
## var rel.inf
## nr.employed nr.employed 40.5933594
## month month 11.4134450
## euribor3m euribor3m 11.2066272
## pdays pdays 10.5873686
## poutcome poutcome 5.0501037
## job job 4.7559198
## age age 4.0532912
## day_of_week day_of_week 2.6708937
## education education 1.8869116
## contact contact 1.7985368
## cons.conf.idx cons.conf.idx 1.7388371
## emp.var.rate emp.var.rate 1.0246177
## cons.price.idx cons.price.idx 0.9788107
## campaign campaign 0.9056328
## previous previous 0.6368538
## marital marital 0.2617678
## default default 0.1496384
## loan loan 0.1455162
## housing housing 0.1418686
11: Use Boosting to Predict
# cv.fitted is on the log-odds scale, so f > 0 means P(yes) > 0.5
y01.pred=ifelse(bank.gbm$cv.fitted>0,1,0)
table(y01.pred,y01)
## y01
## y01.pred 0 1
## 0 35912 3465
## 1 636 1175
err01=sum(y01.pred!=y01)/length(y01)
err01
## [1] 0.09956784
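For reference, the cross-validated predictions can be mapped back to probabilities; this short sketch applies the inverse logit, confirming that the 0 cutoff above is exactly a 0.5 probability cutoff.
# inverse-logit the link-scale CV predictions
p.hat = plogis(bank.gbm$cv.fitted) # 1/(1+exp(-f))
summary(p.hat)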
12: Boosting ROC Curve
rocplot(bank.gbm$cv.fitted,y01)
auc.boost=auc(bank.gbm$cv.fitted,y01)
auc.boost
## [1] 0.7993592
13: Compare
bankres = data.frame(c(auc.tree,auc.bag,auc.rf,auc.boost),row.names = c("Tree", "Bag", "Random Forest", "Boosting"))
knitr::kable(bankres, row.names = T, col.names = "AUC")
|               |       AUC |
|:--------------|----------:|
| Tree          | 0.7660280 |
| Bag           | 0.7701062 |
| Random Forest | 0.7855867 |
| Boosting      | 0.7993592 |
Based on AUC, boosting and the random forest classify the Bank data best. The random forest benefits from the variance reduction of averaging many decorrelated trees, while boosting improves mainly by reducing bias, since each new tree is fit to the errors of the current ensemble.
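A minimal numeric sketch of the variance argument (toy numbers, not estimated from these data): for B identically distributed trees with variance sigma^2 and pairwise correlation rho, their average has variance rho*sigma^2 + (1-rho)*sigma^2/B, which falls toward rho*sigma^2 as B grows.
# toy illustration: variance of an average of B correlated trees
sigma2 = 1; rho = 0.3 # assumed values for illustration
B = c(1, 10, 100, 500)
round(rho*sigma2 + (1-rho)*sigma2/B, 3)
# the floor is rho*sigma2, which is why the random forest's small mtry
# (more decorrelated trees) can beat bagging's mtry=19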
14: House Data
House=read.csv("kc house sales.csv")
House$date=as.Date(House$date,format="%Y%m%d")
House$view=as.factor(House$view)
House$waterfront=as.factor(House$waterfront)
House$zipcode=as.factor(House$zipcode)
House[15871,3]=3 # correct an apparent data-entry error in this record
15: Revise Data
House$days=as.numeric(House$date-as.Date("2014-05-01"))
House=House[,-1]
attach(House)
16: Boosting
house.gbm = gbm(log(price)~.,House, distribution = "gaussian", n.trees=70000, interaction.depth=2, shrinkage=0.01, cv.folds=5)
17: CV Error
plot(house.gbm$cv.error[30000:70000])
min(house.gbm$cv.error)
## [1] 0.02570754
which.min(house.gbm$cv.error)
## [1] 58156
which.min(house.gbm$cv.error)-30000
## [1] 28156
points(which.min(house.gbm$cv.error)-30000,min(house.gbm$cv.error),pch=3,cex=10,col=2)
The red cross marks the minimum of the CV error curve, at tree 58,156 (index 28,156 on the truncated axis).
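As a cross-check (a sketch using gbm's built-in selector), gbm.perf with method = "cv" returns the CV-optimal iteration and marks it on a plot of the training and CV error curves.
# built-in selection of the CV-optimal number of trees
best.iter = gbm.perf(house.gbm, method="cv")
best.iter # should agree with which.min(house.gbm$cv.error), i.e. 58156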
18: Compare Test Errors
All of these models were fit to log(price), so taking exp(sqrt(error)) back-transforms each mean squared error into a multiplicative error factor on the price scale.
newerr = min(house.gbm$cv.error) %>% sqrt() %>% exp()
backerr = 0.0303886 %>% sqrt() %>% exp()
ridgeerr = 0.03202018 %>% sqrt() %>% exp()
LASerr = 0.03716253 %>% sqrt() %>% exp()
pcrerr = 0.03367225 %>% sqrt() %>% exp()
plsrerr = 0.03367225 %>% sqrt() %>% exp()
errres = data.frame(c(newerr,backerr,ridgeerr,LASerr,pcrerr,plsrerr),row.names = c("Boosting", "Backward Subset", "Ridge", "LASSO", "PCR", "PLSR"))
knitr::kable(errres, format = "simple", col.names = "exp(RMSE)")
|                 | exp(RMSE) |
|:----------------|----------:|
| Boosting        |  1.173905 |
| Backward Subset |  1.190440 |
| Ridge           |  1.195951 |
| LASSO           |  1.212611 |
| PCR             |  1.201415 |
| PLSR            |  1.201415 |
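To make these factors concrete, a worked example with a hypothetical prediction: a factor of about 1.17 means a $500,000 prediction carries a one-RMSE band of roughly $426,000 to $587,000 (the $500,000 figure is illustrative only).
# hypothetical $500,000 prediction under the boosting error factor
500000 / newerr # ~ 425,928
500000 * newerr # ~ 586,953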
19: Relative Influence Plot
g = relative.influence(house.gbm, sort.=TRUE)
## n.trees not given. Using 58156 trees.
g
## zipcode sqft_living grade lat view
## 87496.87176 40667.62993 24481.54247 6227.61641 4010.26116
## sqft_living15 sqft_lot condition yr_built sqft_above
## 2638.36368 2336.95347 2260.83121 1987.99426 1613.20456
## waterfront long days bathrooms sqft_lot15
## 1579.85731 934.27183 928.38641 838.42258 810.79146
## bedrooms yr_renovated sqft_basement floors
## 480.62861 414.98722 410.00572 87.34437
head(g, n=3)
## zipcode sqft_living grade
## 87496.87 40667.63 24481.54
plot(house.gbm,c('sqft_living'))
plot(house.gbm,c('grade'))
In the sqft_living partial-dependence plot, log(price) rises roughly linearly with square footage up to about 8,000 square feet, after which additional square footage has little further effect on the predicted price.
The grade partial-dependence plot is close to linear in log(price), suggesting that predicted price grows roughly exponentially with grade; grades below 4 contribute little to the prediction.
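To put numbers on the grade effect, the sketch below uses plot.gbm's return.grid = TRUE option, which returns the partial-dependence grid (fitted values in a column named y) instead of drawing it; because the model fits log(price), differences on that scale exponentiate to multiplicative price factors. Treat this as a sketch: the column names assume gbm's usual grid layout.
# partial-dependence values for grade, expressed as price multipliers
grid = plot(house.gbm, 'grade', return.grid=TRUE)
data.frame(grade=grid$grade, price.factor=round(exp(grid$y - min(grid$y)), 2))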