library(gbm)
## Loaded gbm 2.1.8
library(tictoc)
library(ROCR)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tree)
library(randomForest)
## randomForest 4.7-1
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:dplyr':
##
## combine
1: Read the Data
Bank=read.csv("bank marketing.csv")
Bank=mutate_if(Bank, is.character, as.factor)
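As an aside, mutate_if is superseded in dplyr 1.0+; the across() form below is an equivalent (and idempotent) way to write the same conversion.
# modern dplyr equivalent of the mutate_if call above
Bank = mutate(Bank, across(where(is.character), as.factor))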
attach(Bank)
2: Classification Tree
set.seed(10)
train=sample(nrow(Bank),nrow(Bank)/2)
bank.tree=tree(y~.-duration,data=Bank, subset=train)
plot(bank.tree)
text(bank.tree, cex=0.7)
bank.tree
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 20594 14300.0 no ( 0.88968 0.11032 )
## 2) nr.employed < 5087.65 2507 3440.0 no ( 0.55963 0.44037 )
## 4) poutcome: failure,nonexistent 1942 2535.0 no ( 0.64109 0.35891 ) *
## 5) poutcome: success 565 669.7 yes ( 0.27965 0.72035 ) *
## 3) nr.employed > 5087.65 18087 8659.0 no ( 0.93542 0.06458 )
## 6) month: aug,jul,jun,may,nov 16675 6827.0 no ( 0.94789 0.05211 ) *
## 7) month: apr,dec,mar,oct 1412 1458.0 no ( 0.78824 0.21176 ) *
3: Another Classification Tree
bank.tree2 = tree(y~.-duration,data=Bank, subset=train, control=tree.control(nobs=nrow(Bank)/2, mindev=0.005))
plot(bank.tree2)
text(bank.tree2, cex=0.7)
bank.tree2
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 20594 14300.0 no ( 0.88968 0.11032 )
## 2) nr.employed < 5087.65 2507 3440.0 no ( 0.55963 0.44037 )
## 4) poutcome: failure,nonexistent 1942 2535.0 no ( 0.64109 0.35891 ) *
## 5) poutcome: success 565 669.7 yes ( 0.27965 0.72035 ) *
## 3) nr.employed > 5087.65 18087 8659.0 no ( 0.93542 0.06458 )
## 6) month: aug,jul,jun,may,nov 16675 6827.0 no ( 0.94789 0.05211 )
## 12) euribor3m < 4.0485 3211 1868.0 no ( 0.91498 0.08502 ) *
## 13) euribor3m > 4.0485 13464 4881.0 no ( 0.95573 0.04427 ) *
## 7) month: apr,dec,mar,oct 1412 1458.0 no ( 0.78824 0.21176 )
## 14) job: blue-collar,entrepreneur,management,services 614 414.8 no ( 0.89414 0.10586 ) *
## 15) job: admin.,housemaid,retired,self-employed,student,technician,unemployed,unknown 798 965.6 no ( 0.70677 0.29323 ) *
4: CV Tree
set.seed(11)
cvout=cv.tree(bank.tree2)
names(cvout)
## [1] "size" "dev" "k" "method"
cvout
## $size
## [1] 6 5 4 3 2 1
##
## $dev
## [1] 11498.18 11498.18 11498.18 11730.85 12101.69 14302.53
##
## $k
## [1] -Inf 77.27924 77.55388 234.60860 374.82680 2201.10208
##
## $method
## [1] "deviance"
##
## attr(,"class")
## [1] "prune" "tree.sequence"
plot(cvout$size,cvout$dev,type="b")
mindev=which.min(cvout$dev)
minsize=cvout$size[mindev]
points(minsize,cvout$dev[mindev],pch=4,cex=2,col=2)
# sizes 6, 5, and 4 tie for the minimum deviance; which.min picks the first
# (size 6), so the "pruned" tree below is identical to bank.tree2
bank.pruned=prune.tree(bank.tree2,best=minsize)
plot(bank.pruned)
text(bank.pruned)
bank.pruned
## node), split, n, deviance, yval, (yprob)
## * denotes terminal node
##
## 1) root 20594 14300.0 no ( 0.88968 0.11032 )
## 2) nr.employed < 5087.65 2507 3440.0 no ( 0.55963 0.44037 )
## 4) poutcome: failure,nonexistent 1942 2535.0 no ( 0.64109 0.35891 ) *
## 5) poutcome: success 565 669.7 yes ( 0.27965 0.72035 ) *
## 3) nr.employed > 5087.65 18087 8659.0 no ( 0.93542 0.06458 )
## 6) month: aug,jul,jun,may,nov 16675 6827.0 no ( 0.94789 0.05211 )
## 12) euribor3m < 4.0485 3211 1868.0 no ( 0.91498 0.08502 ) *
## 13) euribor3m > 4.0485 13464 4881.0 no ( 0.95573 0.04427 ) *
## 7) month: apr,dec,mar,oct 1412 1458.0 no ( 0.78824 0.21176 )
## 14) job: blue-collar,entrepreneur,management,services 614 414.8 no ( 0.89414 0.10586 ) *
## 15) job: admin.,housemaid,retired,self-employed,student,technician,unemployed,unknown 798 965.6 no ( 0.70677 0.29323 ) *
5: Predict
# Method 1: class probabilities, thresholded by hand at 0.5
y.prob = predict(bank.pruned, Bank[-train,], type="vector")
y.pred = ifelse(y.prob[,2]>.5,'yes','no')
# Method 2: predicted class labels directly
y.pred = predict(bank.pruned, Bank[-train,],type="class")
table(y.pred,Bank$y[-train])
##
## y.pred no yes
## no 18071 1971
## yes 155 397
err = sum(y.pred!=Bank$y[-train])/length(Bank$y[-train])
err
## [1] 0.103234
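A brief aside on why Method 1 is worth keeping: with only about 11% "yes" responses, the default 0.5 cutoff predicts "no" almost everywhere. The sketch below re-thresholds the Method 1 probabilities at 0.25; that cutoff is purely illustrative, not tuned.
# re-threshold the Method 1 probabilities at an illustrative 0.25
y.pred25 = ifelse(y.prob[,2]>0.25,'yes','no')
table(y.pred25, Bank$y[-train])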
6: ROC curve
rocplot=function(pred, truth, ...){
  predob=prediction(pred, truth)
  perf=performance(predob, "tpr", "fpr")
  plot(perf, ...)
}
rocplot(y.prob[,2],Bank$y[-train], col="blue")
legend(x=0.65, y=0.1,legend="tree",col="blue",lwd=1, bty="n")
auc=function(pred, truth){
  predob=prediction(pred, truth)
  perf=performance(predob, "auc")
  return(perf@y.values[[1]])
}
auc.tree=auc(y.prob[,2],Bank$y[-train])
legend(x=0.85, y=0.1,legend=round(auc.tree,4),bty="n")
7: Bagging
set.seed(300)
# mtry=19 uses all available predictors (duration excluded), i.e. bagging
y.bag=randomForest(y~.-duration,mtry=19,data=Bank,importance=T)
y.bag
##
## Call:
## randomForest(formula = y ~ . - duration, data = Bank, mtry = 19, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 19
##
## OOB estimate of error rate: 10.99%
## Confusion matrix:
## no yes class.error
## no 35222 1326 0.03628106
## yes 3199 1441 0.68943966
varImpPlot(y.bag)
8: ROC
# votes[,1] holds each observation's OOB vote fraction for "no";
# its negative is a score that increases with the "yes" vote fraction
rocplot(-y.bag$votes[1:nrow(Bank)],Bank$y, col=2)
auc.bag=auc(-y.bag$votes[1:nrow(Bank)],Bank$y)
auc.bag## [1] 0.7701062
9: Random Forest (smaller mtry)
set.seed(300)
y.bag2=randomForest(y~.-duration,mtry=4,data=Bank,importance=T)
y.bag2
##
## Call:
## randomForest(formula = y ~ . - duration, data = Bank, mtry = 4, importance = T)
## Type of random forest: classification
## Number of trees: 500
## No. of variables tried at each split: 4
##
## OOB estimate of error rate: 10.12%
## Confusion matrix:
## no yes class.error
## no 35654 894 0.02446098
## yes 3275 1365 0.70581897
imp = varImpPlot(y.bag2)
rocplot(-y.bag2$votes[1:nrow(Bank)],Bank$y, col=4)
auc.rf=auc(-y.bag2$votes[1:nrow(Bank)],Bank$y)
imp
## MeanDecreaseAccuracy MeanDecreaseGini
## age 37.868208 822.74482
## job 64.732283 589.60669
## marital 21.768342 210.69247
## education 38.614392 439.23864
## default 16.244608 73.26449
## housing 4.270088 192.56984
## loan 0.398690 144.17848
## contact 12.520663 96.28935
## month 28.817570 235.93786
## day_of_week 37.917124 377.94894
## campaign 21.874501 410.82516
## pdays 25.952835 286.76837
## previous 9.355320 122.31539
## poutcome 20.008144 251.69563
## emp.var.rate 17.773146 143.61591
## cons.price.idx 19.096216 144.20311
## cons.conf.idx 18.067662 179.37684
## euribor3m 32.845985 884.87846
## nr.employed 25.646699 502.71541
auc.rf
## [1] 0.7855867
10: Boosting
y01=ifelse(Bank$y=="yes",1,0)
set.seed(300)
bank.gbm=gbm(y01~.-duration-y, Bank, distribution="bernoulli", n.trees=7000, interaction.depth=2, shrinkage=0.01, cv.folds=5)
plot(bank.gbm$cv.error)
min.tree=which.min(bank.gbm$cv.error)
min.tree
## [1] 5493
bank.gbm$cv.error[min.tree]
## [1] 0.5442875
summary(bank.gbm,n.trees=min.tree,las=1)
## var rel.inf
## nr.employed nr.employed 40.5933594
## month month 11.4134450
## euribor3m euribor3m 11.2066272
## pdays pdays 10.5873686
## poutcome poutcome 5.0501037
## job job 4.7559198
## age age 4.0532912
## day_of_week day_of_week 2.6708937
## education education 1.8869116
## contact contact 1.7985368
## cons.conf.idx cons.conf.idx 1.7388371
## emp.var.rate emp.var.rate 1.0246177
## cons.price.idx cons.price.idx 0.9788107
## campaign campaign 0.9056328
## previous previous 0.6368538
## marital marital 0.2617678
## default default 0.1496384
## loan loan 0.1455162
## housing housing 0.1418686
11: Use Boosting to Predict
# cv.fitted is on the log-odds scale, so f > 0 means P(yes) > 0.5
y01.pred=ifelse(bank.gbm$cv.fitted>0,1,0)
table(y01.pred,y01)
## y01
## y01.pred 0 1
## 0 35912 3465
## 1 636 1175
err01=sum(y01.pred!=y01)/length(y01)
err01
## [1] 0.09956784
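For reference, the cross-validated predictions can be mapped back to probabilities; this short sketch applies the inverse logit, confirming that the 0 cutoff above is exactly a 0.5 probability cutoff.
# inverse-logit the link-scale CV predictions
p.hat = plogis(bank.gbm$cv.fitted) # 1/(1+exp(-f))
summary(p.hat)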
12: Boosting ROC Curve
rocplot(bank.gbm$cv.fitted,y01)
auc.boost=auc(bank.gbm$cv.fitted,y01)
auc.boost
## [1] 0.7993592
13: Compare
bankres = data.frame(c(auc.tree,auc.bag,auc.rf,auc.boost),row.names = c("Tree", "Bag", "Random Forest", "Boosting"))
knitr::kable(bankres, row.names = T, col.names = "AUC")
|               |       AUC |
|:--------------|----------:|
| Tree          | 0.7660280 |
| Bag           | 0.7701062 |
| Random Forest | 0.7855867 |
| Boosting      | 0.7993592 |
Based on AUC, boosting and the random forest classify the Bank data best. The random forest benefits from the variance reduction of averaging many decorrelated trees, while boosting improves mainly by reducing bias, since each new tree is fit to the errors of the current ensemble.
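A minimal numeric sketch of the variance argument (toy numbers, not estimated from these data): for B identically distributed trees with variance sigma^2 and pairwise correlation rho, their average has variance rho*sigma^2 + (1-rho)*sigma^2/B, which falls toward rho*sigma^2 as B grows.
# toy illustration: variance of an average of B correlated trees
sigma2 = 1; rho = 0.3 # assumed values for illustration
B = c(1, 10, 100, 500)
round(rho*sigma2 + (1-rho)*sigma2/B, 3)
# the floor is rho*sigma2, which is why the random forest's small mtry
# (more decorrelated trees) can beat bagging's mtry=19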
14: House Data
House=read.csv("kc house sales.csv")
House$date=as.Date(House$date,format="%Y%m%d")
House$view=as.factor(House$view)
House$waterfront=as.factor(House$waterfront)
House$zipcode=as.factor(House$zipcode)
House[15871,3]=3 # correct an apparent data-entry error in this record
15: Revise Data
House$days=as.numeric(House$date-as.Date("2014-05-01"))
House=House[,-1]
attach(House)
16: Boosting
house.gbm = gbm(log(price)~.,House, distribution = "gaussian", n.trees=70000, interaction.depth=2, shrinkage=0.01, cv.folds=5)
17: CV Error
plot(house.gbm$cv.error[30000:70000])
min(house.gbm$cv.error)
## [1] 0.02570754
which.min(house.gbm$cv.error)
## [1] 58156
which.min(house.gbm$cv.error)-30000
## [1] 28156
points(which.min(house.gbm$cv.error)-30000,min(house.gbm$cv.error),pch=3,cex=10,col=2)
The red cross marks the minimum of the CV error curve, at tree 58,156 (index 28,156 on the truncated axis).
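As a cross-check (a sketch using gbm's built-in selector), gbm.perf with method = "cv" returns the CV-optimal iteration and marks it on a plot of the training and CV error curves.
# built-in selection of the CV-optimal number of trees
best.iter = gbm.perf(house.gbm, method="cv")
best.iter # should agree with which.min(house.gbm$cv.error), i.e. 58156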
18: Compare Test Errors
All of these models were fit to log(price), so taking exp(sqrt(error)) back-transforms each mean squared error into a multiplicative error factor on the price scale.
newerr = min(house.gbm$cv.error) %>% sqrt() %>% exp()
backerr = 0.0303886 %>% sqrt() %>% exp()
ridgeerr = 0.03202018 %>% sqrt() %>% exp()
LASerr = 0.03716253 %>% sqrt() %>% exp()
pcrerr = 0.03367225 %>% sqrt() %>% exp()
plsrerr = 0.03367225 %>% sqrt() %>% exp()
errres = data.frame(c(newerr,backerr,ridgeerr,LASerr,pcrerr,plsrerr),row.names = c("Boosting", "Backward Subset", "Ridge", "LASSO", "PCR", "PLSR"))
knitr::kable(errres, format = "simple", col.names = "exp(RMSE)")
|                 | exp(RMSE) |
|:----------------|----------:|
| Boosting        |  1.173905 |
| Backward Subset |  1.190440 |
| Ridge           |  1.195951 |
| LASSO           |  1.212611 |
| PCR             |  1.201415 |
| PLSR            |  1.201415 |
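To make these factors concrete, a worked example with a hypothetical prediction: a factor of about 1.17 means a $500,000 prediction carries a one-RMSE band of roughly $426,000 to $587,000 (the $500,000 figure is illustrative only).
# hypothetical $500,000 prediction under the boosting error factor
500000 / newerr # ~ 425,928
500000 * newerr # ~ 586,953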
19: Relative Influence Plot
g = relative.influence(house.gbm, sort.=TRUE)
## n.trees not given. Using 58156 trees.
g
## zipcode sqft_living grade lat view
## 87496.87176 40667.62993 24481.54247 6227.61641 4010.26116
## sqft_living15 sqft_lot condition yr_built sqft_above
## 2638.36368 2336.95347 2260.83121 1987.99426 1613.20456
## waterfront long days bathrooms sqft_lot15
## 1579.85731 934.27183 928.38641 838.42258 810.79146
## bedrooms yr_renovated sqft_basement floors
## 480.62861 414.98722 410.00572 87.34437
head(g, n=3)
## zipcode sqft_living grade
## 87496.87 40667.63 24481.54
plot(house.gbm,c('sqft_living'))
plot(house.gbm,c('grade'))
In the sqft_living partial-dependence plot, log(price) rises roughly linearly with square footage up to about 8,000 square feet, after which additional square footage has little further effect on the predicted price.
The grade partial-dependence plot is close to linear in log(price), suggesting that predicted price grows roughly exponentially with grade; grades below 4 contribute little to the prediction.
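To put numbers on the grade effect, the sketch below uses plot.gbm's return.grid = TRUE option, which returns the partial-dependence grid (fitted values in a column named y) instead of drawing it; because the model fits log(price), differences on that scale exponentiate to multiplicative price factors. Treat this as a sketch: the column names assume gbm's usual grid layout.
# partial-dependence values for grade, expressed as price multipliers
grid = plot(house.gbm, 'grade', return.grid=TRUE)
data.frame(grade=grid$grade, price.factor=round(exp(grid$y - min(grid$y)), 2))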