Executive Summary

My first attempt to use the mboost package. I will use the German Credit Card data set from the caret package. The data set contains 1,000 observations, a binary response variable, and 61 potential predictor variables.

The elastic net model (with a ridge-like mixing parameter and a small shrinkage parameter) was highly competitive against some of the most sophisticated models (e.g., randomForest, gradient boosted trees, and glmboost). Multivariate adaptive regression splines, surprisingly, performed worse than elastic net. Trees, as expected, were the worst models.

Load the data set

Load the German Credit Card data set from the caret package. The data set contains a response vector Class that indicates whether a credit card went to default (Bad) or was current on all its payments (Good).

I use the createDataPartition function from the caret package to split the data frame between training and test data sets.

library(caret)
## Loading required package: lattice
## Loading required package: ggplot2
data(GermanCredit)

set.seed(1984)

training <- createDataPartition(GermanCredit$Class, p = 0.6, list=FALSE)

trainData <- GermanCredit[training,]
testData <- GermanCredit[-training,]

Try a classic method (glm)

Fit a glm model (logistic regression) and score the test data set.

glmModel <- glm(Class~ . , data=trainData, family=binomial)

pred.glmModel <- predict(glmModel, newdata=testData, type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

Calculate the AUC for the test data set.

library(pROC)
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## 
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
roc.glmModel <- pROC::roc(testData$Class, pred.glmModel)

auc.glmModel <- pROC::auc(roc.glmModel)

glmboost

Fit a glm using a boosting algorithm (as opposed to MLE). Unlike the glm function, glmboost will perform variable selection. After fitting the model, score the test data set and measure the AUC.

fitControl <- trainControl(method = "repeatedcv",
                           number = 5,
                           repeats = 10,
                           ## Estimate class probabilities
                           classProbs = TRUE,
                           ## Evaluate performance using 
                           ## the following function
                           summaryFunction = twoClassSummary)

set.seed(2014)

glmBoostModel <- train(Class ~ ., data=trainData, method = "glmboost", metric="ROC", trControl = fitControl, tuneLength=5, center=TRUE, family=Binomial(link = c("logit")))
## Loading required package: mboost
## Loading required package: parallel
## Loading required package: stabs
## This is mboost 2.4-0. See 'package?mboost' and the NEWS file
## for a complete list of changes.
## 
## 
## Attaching package: 'mboost'
## 
## The following object is masked from 'package:ggplot2':
## 
##     %+%
pred.glmBoostModel <- as.vector(predict(glmBoostModel, newdata=testData, type="prob")[,"Good"])


roc.glmBoostModel <- pROC::roc(testData$Class, pred.glmBoostModel)

auc.glmBoostModel <- pROC::auc(roc.glmBoostModel)

Try many other methods

Try CART, conditional inference tree, elastic net, multivariate adaptive regression splines, boosted trees, and random forest.

# CART
set.seed(2014)

cartModel <- train(Class ~ ., data=trainData, method = "rpart", metric="ROC", trControl = fitControl, tuneLength=5)
## Loading required package: rpart
pred.cartModel <- as.vector(predict(cartModel, newdata=testData, type="prob")[,"Good"])


roc.cartModel <- pROC::roc(testData$Class, pred.cartModel)

auc.cartModel <- pROC::auc(roc.cartModel)

# Conditional Inference Tree
set.seed(2014)

partyModel <- train(Class ~ ., data=trainData, method = "ctree", metric="ROC", trControl = fitControl, tuneLength=5)
## Loading required package: party
## Loading required package: grid
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## Loading required package: sandwich
## Loading required package: strucchange
## Loading required package: modeltools
## Loading required package: stats4
pred.partyModel <- as.vector(predict(partyModel, newdata=testData, type="prob")[,"Good"])


roc.partyModel <- pROC::roc(testData$Class, pred.partyModel)

auc.partyModel <- pROC::auc(roc.partyModel)

# Elastic Net
set.seed(2014)

eNetModel <- train(Class ~ ., data=trainData, method = "glmnet", metric="ROC", trControl = fitControl, family="binomial", tuneLength=5)
## Loading required package: glmnet
## Loading required package: Matrix
## Loaded glmnet 1.9-8
## 
## 
## Attaching package: 'glmnet'
## 
## The following object is masked from 'package:pROC':
## 
##     auc
pred.eNetModel <- as.vector(predict(eNetModel, newdata=testData, type="prob")[,"Good"])


roc.eNetModel <- pROC::roc(testData$Class, pred.eNetModel)

auc.eNetModel <- pROC::auc(roc.eNetModel)

# Earth
set.seed(2014)

earthModel <- train(Class ~ ., data=trainData, method = "earth", glm=list(family=binomial), metric="ROC", trControl = fitControl, tuneLength=5)
## Loading required package: earth
## Loading required package: plotmo
## Loading required package: plotrix
pred.earthModel <- as.vector(predict(earthModel, newdata=testData, type="prob")[,"Good"])


roc.earthModel <- pROC::roc(testData$Class, pred.earthModel)

auc.earthModel <- pROC::auc(roc.earthModel)

# Boosted Trees
set.seed(2014)

gbmModel <- train(Class ~ ., data=trainData, method = "gbm", metric="ROC", trControl = fitControl, verbose=FALSE, tuneLength=5)
## Loading required package: gbm
## Loading required package: survival
## Loading required package: splines
## 
## Attaching package: 'survival'
## 
## The following object is masked from 'package:caret':
## 
##     cluster
## 
## Loaded gbm 2.1
## Loading required package: plyr
## 
## Attaching package: 'plyr'
## 
## The following object is masked from 'package:modeltools':
## 
##     empty
pred.gbmModel <- as.vector(predict(gbmModel, newdata=testData, type="prob")[,"Good"])


roc.gbmModel <- pROC::roc(testData$Class, pred.gbmModel)

auc.gbmModel <- pROC::auc(roc.gbmModel)

# Random Forest
set.seed(2014)

rfModel <- train(Class ~ ., data=trainData, method = "rf", metric="ROC", trControl = fitControl, verbose=FALSE, tuneLength=5)
## Loading required package: randomForest
## randomForest 4.6-10
## Type rfNews() to see new features/changes/bug fixes.
pred.rfModel <- as.vector(predict(rfModel, newdata=testData, type="prob")[,"Good"])


roc.rfModel <- pROC::roc(testData$Class, pred.rfModel)

auc.rfModel <- pROC::auc(roc.rfModel)

Choose the best model

Plot AUC, on the test data set, for each model.

test.auc <- data.frame(model=c("glm","glmboost","gbm","glmnet","earth","cart","ctree","rForest"),auc=c(auc.glmModel, auc.glmBoostModel, auc.gbmModel, auc.eNetModel, auc.earthModel, auc.cartModel, auc.partyModel, auc.rfModel))

test.auc <- test.auc[order(test.auc$auc, decreasing=TRUE),]

test.auc$model <- factor(test.auc$model, levels=test.auc$model)

library(ggplot2)
theme_set(theme_gray(base_size = 18))
qplot(x=model, y=auc, data=test.auc, geom="bar", stat="identity", position = "dodge")+ geom_bar(fill = "light blue", stat="identity")

Plot tuning parameters that were chosen by repeated CV.

plot(cartModel)

plot(partyModel)

plot(eNetModel)

plot(glmBoostModel)

plot(gbmModel)

plot(earthModel)

plot(rfModel)

The winner

Elastic net with a ridge-like mixing parameter and a small shrinkage parameter. The model is extremely easy to implement because the prediction is a linear combination of the chosen variables.

# Tuning parms chosen via repeated CV
eNetModel$bestTune
##   alpha lambda
## 1   0.1    0.1
# Coefficients of the Elastic Net model
coef(eNetModel$finalModel, eNetModel$bestTune$lambda)
## 62 x 1 sparse Matrix of class "dgCMatrix"
##                                                    1
## (Intercept)                             1.7699999318
## Duration                               -0.0134946822
## Amount                                 -0.0000405591
## InstallmentRatePercentage              -0.1713376403
## ResidenceDuration                       .           
## Age                                     0.0027823536
## NumberExistingCredits                   0.0247938924
## NumberPeopleMaintenance                 0.0017959074
## Telephone                              -0.1656547055
## ForeignWorker                          -0.3542782555
## CheckingAccountStatus.lt.0             -0.5275931813
## CheckingAccountStatus.0.to.200         -0.1232125990
## CheckingAccountStatus.gt.200            0.2627381384
## CheckingAccountStatus.none              0.5022791968
## CreditHistory.NoCredit.AllPaid         -0.3727566622
## CreditHistory.ThisBank.AllPaid         -0.6122488836
## CreditHistory.PaidDuly                 -0.0178754702
## CreditHistory.Delay                     .           
## CreditHistory.Critical                  0.3826608126
## Purpose.NewCar                         -0.2599967849
## Purpose.UsedCar                         0.2803373137
## Purpose.Furniture.Equipment             .           
## Purpose.Radio.Television                0.1043100891
## Purpose.DomesticAppliance               .           
## Purpose.Repairs                        -0.4253822021
## Purpose.Education                      -0.2977929177
## Purpose.Vacation                        .           
## Purpose.Retraining                      .           
## Purpose.Business                        .           
## Purpose.Other                           0.8498141929
## SavingsAccountBonds.lt.100             -0.2398608186
## SavingsAccountBonds.100.to.500         -0.1781729929
## SavingsAccountBonds.500.to.1000         0.2387120641
## SavingsAccountBonds.gt.1000             0.2951960364
## SavingsAccountBonds.Unknown             0.2419355876
## EmploymentDuration.lt.1                -0.0025592799
## EmploymentDuration.1.to.4              -0.0016065005
## EmploymentDuration.4.to.7               0.2297505291
## EmploymentDuration.gt.7                 0.0163613125
## EmploymentDuration.Unemployed          -0.0718048532
## Personal.Male.Divorced.Seperated       -0.1684437862
## Personal.Female.NotSingle              -0.0817201417
## Personal.Male.Single                    0.1821988879
## Personal.Male.Married.Widowed           .           
## Personal.Female.Single                  .           
## OtherDebtorsGuarantors.None             .           
## OtherDebtorsGuarantors.CoApplicant     -0.5032231695
## OtherDebtorsGuarantors.Guarantor        0.2601093133
## Property.RealEstate                     0.2645642793
## Property.Insurance                      .           
## Property.CarOther                       .           
## Property.Unknown                        .           
## OtherInstallmentPlans.Bank             -0.0857929599
## OtherInstallmentPlans.Stores            .           
## OtherInstallmentPlans.None              0.1794791937
## Housing.Rent                           -0.1292280391
## Housing.Own                             0.2163355553
## Housing.ForFree                         .           
## Job.UnemployedUnskilled                 0.1415253314
## Job.UnskilledResident                   0.0173550642
## Job.SkilledEmployee                    -0.0163219812
## Job.Management.SelfEmp.HighlyQualified  .