ml_credit_dataset <- read.csv("ml_credit_dataset.csv")
str(ml_credit_dataset)
## 'data.frame':    1000 obs. of  87 variables:
##  $ CheckingAccountStatus.0.to.200        : int  0 1 0 0 0 0 0 1 0 1 ...
##  $ CheckingAccountStatus.gt.200          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CheckingAccountStatus.lt.0            : int  1 0 0 1 1 0 0 0 0 0 ...
##  $ CheckingAccountStatus.none            : int  0 0 1 0 0 1 1 0 1 0 ...
##  $ Duration.0.to.6                       : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Duration.6.to.12                      : int  0 0 1 0 0 0 0 0 1 0 ...
##  $ Duration.12.to.18                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Duration.18.to.24                     : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ Duration.24.to.30                     : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Duration.30.to.36                     : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Duration.36.to.42                     : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ Duration.42.to.48                     : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Duration.48.to.54                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Duration.54.to.60                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Duration.66.to.72                     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CreditHistory.Critical                : int  1 0 1 0 0 0 0 0 0 1 ...
##  $ CreditHistory.Delay                   : int  0 0 0 0 1 0 0 0 0 0 ...
##  $ CreditHistory.NoCredit.AllPaid        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ CreditHistory.PaidDuly                : int  0 1 0 1 0 1 1 1 1 0 ...
##  $ CreditHistory.ThisBank.AllPaid        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purpose.Business                      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purpose.DomesticAppliance             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purpose.Education                     : int  0 0 1 0 0 1 0 0 0 0 ...
##  $ Purpose.Furniture.Equipment           : int  0 0 0 1 0 0 1 0 0 0 ...
##  $ Purpose.NewCar                        : int  0 0 0 0 1 0 0 0 0 1 ...
##  $ Purpose.Others                        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purpose.Radio.Television              : int  1 1 0 0 0 0 0 0 1 0 ...
##  $ Purpose.Repairs                       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purpose.Retraining                    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Purpose.UsedCar                       : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ SavingsAccountBonds.100.to.500        : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ SavingsAccountBonds.500.to.1000       : int  0 0 0 0 0 0 1 0 0 0 ...
##  $ SavingsAccountBonds.gt.1000           : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ SavingsAccountBonds.lt.100            : int  0 1 1 1 1 0 0 1 0 1 ...
##  $ SavingsAccountBonds.Unknown           : int  1 0 0 0 0 1 0 0 0 0 ...
##  $ EmploymentDuration.0.to.1             : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ EmploymentDuration.1.to.4             : int  0 1 0 0 1 1 0 1 0 0 ...
##  $ EmploymentDuration.4.to.7             : int  0 0 1 1 0 0 0 0 1 0 ...
##  $ EmploymentDuration.gt.7               : int  1 0 0 0 0 0 1 0 0 0 ...
##  $ EmploymentDuration.Unemployed         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ InstallmentRatePercentage.1           : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ InstallmentRatePercentage.2           : int  0 1 1 1 0 1 0 1 1 0 ...
##  $ InstallmentRatePercentage.3           : int  0 0 0 0 1 0 1 0 0 0 ...
##  $ InstallmentRatePercentage.4           : int  1 0 0 0 0 0 0 0 0 1 ...
##  $ Personal.Female.NotSingle             : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Personal.Male.Divorced.Seperated      : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ Personal.Male.Married.Widowed         : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Personal.Male.Single                  : int  1 0 1 1 1 1 1 1 0 0 ...
##  $ OtherDebtorsGuarantors.CoApplicant    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherDebtorsGuarantors.Guarantor      : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ OtherDebtorsGuarantors.None           : int  1 1 1 0 1 1 1 1 1 1 ...
##  $ ResidenceDuration.1                   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ ResidenceDuration.2                   : int  0 1 0 0 0 0 0 1 0 1 ...
##  $ ResidenceDuration.3                   : int  0 0 1 0 0 0 0 0 0 0 ...
##  $ ResidenceDuration.4                   : int  1 0 0 1 1 1 1 0 1 0 ...
##  $ Property.CarOther                     : int  0 0 0 0 0 0 0 1 0 1 ...
##  $ Property.Insurance                    : int  0 0 0 1 0 0 1 0 0 0 ...
##  $ Property.RealEstate                   : int  1 1 1 0 0 0 0 0 1 0 ...
##  $ Property.Unknown                      : int  0 0 0 0 1 1 0 0 0 0 ...
##  $ Age.18.to.24                          : int  0 1 0 0 0 0 0 0 0 0 ...
##  $ Age.24.to.30                          : int  0 0 0 0 0 0 0 0 0 1 ...
##  $ Age.30.to.36                          : int  0 0 0 0 0 1 0 1 0 0 ...
##  $ Age.36.to.42                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.42.to.48                          : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ Age.48.to.54                          : int  0 0 1 0 1 0 1 0 0 0 ...
##  $ Age.54.to.60                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Age.60.to.66                          : int  0 0 0 0 0 0 0 0 1 0 ...
##  $ Age.66.to.72                          : int  1 0 0 0 0 0 0 0 0 0 ...
##  $ Age.72.to.78                          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherInstallmentPlans.Bank            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ OtherInstallmentPlans.None            : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ OtherInstallmentPlans.Stores          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Housing.ForFree                       : int  0 0 0 1 1 1 0 0 0 0 ...
##  $ Housing.Own                           : int  1 1 1 0 0 0 1 0 1 1 ...
##  $ Housing.Rent                          : int  0 0 0 0 0 0 0 1 0 0 ...
##  $ NumberExistingCredits.1               : int  0 1 1 1 0 1 1 1 1 0 ...
##  $ NumberExistingCredits.2               : int  1 0 0 0 1 0 0 0 0 1 ...
##  $ NumberExistingCredits.3               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ NumberExistingCredits.4               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Job.Management.SelfEmp.HighlyQualified: int  0 0 0 0 0 0 0 1 0 1 ...
##  $ Job.SkilledEmployee                   : int  1 1 0 1 1 0 1 0 0 0 ...
##  $ Job.UnemployedUnskilled               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ Job.UnskilledResident                 : int  0 0 1 0 0 1 0 0 1 0 ...
##  $ NumberPeopleMaintenance               : int  1 1 2 2 2 2 1 1 1 1 ...
##  $ Telephone                             : int  1 0 0 0 0 1 0 1 0 0 ...
##  $ ForeignWorker                         : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ Class                                 : Factor w/ 2 levels "Bad","Good": 2 1 2 2 1 2 2 2 2 1 ...

Making a Machine Learning task using mlr

library(mlr)
## Loading required package: ParamHelpers
credit.task = makeClassifTask(data = ml_credit_dataset, target = "Class")
credit.task = removeConstantFeatures(credit.task)
credit.task
## Supervised task: ml_credit_dataset
## Type: classif
## Target: Class
## Observations: 1000
## Features:
##    numerics     factors     ordered functionals 
##          86           0           0           0 
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
##  Bad Good 
##  300  700 
## Positive class: Bad

Cost Matrix for German Credit Data

costs = matrix(c(0, 1, 5, 0), 2)
colnames(costs) = rownames(costs) = getTaskClassLevels(credit.task)
costs
##      Bad Good
## Bad    0    5
## Good   1    0

Calculate the theoretical threshold for the positive class: Since c(+1,+1)=c(-1,-1)=0

th = costs[2,1]/(costs[2,1] + costs[1,2])
th
## [1] 0.1666667

Creating a cost measure

In order to calculate the average costs over the entire data set we first need to create a new performance Measure. This can be done through function makeCostMeasure. It is expected that the rows of the cost matrix indicate true and the columns predicted class labels.

credit.costs = makeCostMeasure(id = "credit.costs", name = "Credit costs", costs = costs,
  best = 0, worst = 5)
credit.costs
## Name: Credit costs
## Performance measure: credit.costs
## Properties: classif,classif.multi,req.pred,req.truth,predtype.response,predtype.prob
## Minimize: TRUE
## Best: 0; Worst: 5
## Aggregated by: test.mean
## Arguments: costs=<matrix>, combine=<function>
## Note:

2. Rebalancing

-In order to minimize the average costs, observations from the less costly class should be given higher importance during training.

-This can be achieved by weighting the classes, provided that the learner under consideration has a ‘class weights’ or an ‘observation weights’ argument.

i. Weighing

Just as theoretical thresholds, theoretical weights can be calculated from the cost matrix. If t indicates the target threshold and t0 the original threshold for the positive class the proportion of observations in the positive class has to be multiplied by

\(w= \frac{1-t}{t}\frac{t_0}{1-t_0}\)

for our case: Weight for positive class corresponding to theoretical treshold

w = (1 - th)/th
w
## [1] 5

Assigning theoretical weight : for learner that support observation weights

-A unified and convenient way to assign class weights to a Learner (and tune them) is provided by function makeWeightedClassesWrapper.

-The class weights are specified using argument wcw.weight

-For learners that support observation weights a suitable weight vector is then generated internally during training or resampling.

wlrn = makeLearner("classif.multinom", trace = FALSE)
wlrn = makeWeightedClassesWrapper(wlrn, wcw.weight = w)
wlrn
## Learner weightedclasses.classif.multinom from package nnet
## Type: classif
## Name: ; Short name: 
## Class: WeightedClassesWrapper
## Properties: twoclass,multiclass,numerics,factors,prob
## Predict-Type: response
## Hyperparameters: trace=FALSE,wcw.weight=5
rin = makeResampleInstance("CV", iters = 5, task = credit.task,stratify = TRUE)
wr = resample(wlrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
wr
## Resample Result
## Task: ml_credit_dataset
## Learner: weightedclasses.classif.multinom
## Aggr perf: credit.costs.test.mean=0.6160000,mmce.test.mean=0.3800000
## Runtime: 0.629795

Assigning theoretical weight : for learner that support class weights

lrn = makeWeightedClassesWrapper("classif.multinom", wcw.weight = w)
r = resample(lrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
## # weights:  88 (87 variable)
## initial  value 1219.939038 
## iter  10 value 765.761791
## iter  20 value 743.153544
## iter  30 value 741.163899
## iter  40 value 740.891589
## iter  50 value 740.796792
## iter  60 value 740.711613
## iter  70 value 740.693141
## iter  80 value 740.687436
## final  value 740.687288 
## converged
## # weights:  88 (87 variable)
## initial  value 1219.939038 
## iter  10 value 787.663685
## iter  20 value 770.394704
## iter  30 value 769.073139
## iter  40 value 768.800480
## iter  50 value 768.673921
## iter  60 value 768.606668
## iter  70 value 768.595900
## final  value 768.593206 
## converged
## # weights:  88 (87 variable)
## initial  value 1219.939038 
## iter  10 value 783.190125
## iter  20 value 763.072001
## iter  30 value 761.169657
## iter  40 value 760.877102
## iter  50 value 760.690821
## iter  60 value 760.581150
## iter  70 value 760.550108
## iter  80 value 760.543044
## final  value 760.542914 
## converged
## # weights:  88 (87 variable)
## initial  value 1219.939038 
## iter  10 value 768.395281
## iter  20 value 748.544161
## iter  30 value 747.446921
## iter  40 value 747.360738
## iter  50 value 747.354757
## final  value 747.354353 
## converged
## # weights:  88 (87 variable)
## initial  value 1219.939038 
## iter  10 value 817.403347
## iter  20 value 801.453623
## iter  30 value 799.527239
## iter  40 value 799.266371
## iter  50 value 799.105400
## iter  60 value 799.031922
## iter  70 value 799.004406
## iter  80 value 798.996305
## final  value 798.996049 
## converged
r
## Resample Result
## Task: ml_credit_dataset
## Learner: weightedclasses.classif.multinom
## Aggr perf: credit.costs.test.mean=0.6160000,mmce.test.mean=0.3800000
## Runtime: 0.735304

Tuning the weight

-Just like the theoretical threshold, the theoretical weights may not always be suitable, therefore you can tune the weight for the positive class.

-Calculating the theoretical weight beforehand may help to narrow down the search interval.

lrn = makeLearner("classif.multinom", trace = FALSE)
lrn = makeWeightedClassesWrapper(lrn)
ps = makeParamSet(makeDiscreteParam("wcw.weight", seq(4, 12, 0.5)))
ctrl = makeTuneControlGrid()
tune.wcw.res = tuneParams(lrn, credit.task, resampling = rin, par.set = ps,
  measures = list(credit.costs, mmce), control = ctrl, show.info = FALSE)
tune.wcw.res
## Tune result:
## Op. pars: wcw.weight=9
## credit.costs.test.mean=0.5790000,mmce.test.mean=0.4230000
as.data.frame(tune.wcw.res$opt.path)[1:3]
##    wcw.weight credit.costs.test.mean mmce.test.mean
## 1           4                  0.625          0.349
## 2         4.5                  0.627          0.367
## 3           5                  0.616          0.380
## 4         5.5                  0.612          0.388
## 5           6                  0.594          0.390
## 6         6.5                  0.596          0.396
## 7           7                  0.587          0.403
## 8         7.5                  0.588          0.408
## 9           8                  0.594          0.414
## 10        8.5                  0.584          0.420
## 11          9                  0.579          0.423
## 12        9.5                  0.593          0.437
## 13         10                  0.597          0.445
## 14       10.5                  0.603          0.455
## 15         11                  0.601          0.457
## 16       11.5                  0.608          0.464
## 17         12                  0.609          0.469

ii. Over- and undersampling

-If the Learner supports neither observation nor class weights the proportions of the classes in the training data can be changed by over- or undersampling.

-In the GermanCredit data set the positive class Bad should receive a theoretical weight of w = (1 - th)/th = 5. This can be achieved by oversampling class Bad with a rate of 5 or by undersampling class Good with a rate of 1/5 (using functions oversample or undersample).

logistic model

credit.task.over = oversample(credit.task, rate = w, cl = "Bad")
logisticlrn = makeLearner("classif.multinom", trace = FALSE)
logisticmod = mlr::train(logisticlrn, credit.task.over)
logisticpred = predict(logisticmod, task = credit.task)
performance(logisticpred, measures = list(credit.costs, mmce))
## credit.costs         mmce 
##        0.430        0.322

Rpart model

credit.task.over = oversample(credit.task, rate = w, cl = "Bad")
rpartlrn = makeLearner("classif.rpart")
rpartmod = mlr::train(rpartlrn, credit.task.over)
rpartpred = predict(rpartmod, task = credit.task)
performance(rpartpred, measures = list(credit.costs, mmce))
## credit.costs         mmce 
##        0.473        0.425

Resample data to get appropriate performance

-We usually prefer resampled performance values, but simply calling resample on the oversampled task does not work since predictions have to be based on the original task.

-The solution is to create a wrapped Learner via function makeOversampleWrapper.

-Internally, oversample is called before training, but predictions are done on the original data.

logistic model

logicallrn = makeLearner("classif.multinom", trace = FALSE)
logicallrn = makeOversampleWrapper(logicallrn, osw.rate = w, osw.cl = "Bad")
logicallrn
## Learner classif.multinom.oversampled from package mlr,nnet
## Type: classif
## Name: ; Short name: 
## Class: OversampleWrapper
## Properties: numerics,factors,weights,prob,twoclass,multiclass
## Predict-Type: response
## Hyperparameters: trace=FALSE,osw.rate=5,osw.cl=Bad
lr = resample(logicallrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
lr
## Resample Result
## Task: ml_credit_dataset
## Learner: classif.multinom.oversampled
## Aggr perf: credit.costs.test.mean=0.6140000,mmce.test.mean=0.3700000
## Runtime: 1.12441

Rpart model

rpartlrn = makeLearner("classif.rpart")
rpartlrn = makeOversampleWrapper(rpartlrn, osw.rate = w, osw.cl = "Bad")
rpartlrn
## Learner classif.rpart.oversampled from package mlr,rpart
## Type: classif
## Name: ; Short name: 
## Class: OversampleWrapper
## Properties: numerics,factors,ordered,missings,weights,prob,twoclass,multiclass,featimp
## Predict-Type: response
## Hyperparameters: xval=0,osw.rate=5,osw.cl=Bad
rr = resample(logicallrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
rr
## Resample Result
## Task: ml_credit_dataset
## Learner: classif.multinom.oversampled
## Aggr perf: credit.costs.test.mean=0.6150000,mmce.test.mean=0.3750000
## Runtime: 1.28737

Tuning the oversample rate

-Of course, we can also tune the oversampling rate. For this purpose we again have to create an OversampleWrapper. Optimal values for parameter osw.rate can be obtained using function tuneParams.

logistic model

logicallrn = makeLearner("classif.multinom", trace = FALSE)
logicallrn = makeOversampleWrapper(logicallrn, osw.cl = "Bad")
logicalps = makeParamSet(makeDiscreteParam("osw.rate", seq(3, 8, 0.25)))
logicalctrl = makeTuneControlGrid()
logicaltune.osw.res = tuneParams(logicallrn, credit.task, rin, par.set = logicalps, measures = list(credit.costs, mmce),
  control = logicalctrl, show.info = FALSE)
logicaltune.osw.res
## Tune result:
## Op. pars: osw.rate=7.75
## credit.costs.test.mean=0.5870000,mmce.test.mean=0.4070000

Rpart model

rpartlrn = makeLearner("classif.rpart")
rpartlrn = makeOversampleWrapper(rpartlrn, osw.cl = "Bad")
rpartps = makeParamSet(makeDiscreteParam("osw.rate", seq(3, 8, 0.25)))
rpartctrl = makeTuneControlGrid()
rparttune.osw.res = tuneParams(rpartlrn, credit.task, rin, par.set = rpartps, measures = list(credit.costs, mmce),
  control = rpartctrl, show.info = FALSE)
rparttune.osw.res
## Tune result:
## Op. pars: osw.rate=7
## credit.costs.test.mean=0.5670000,mmce.test.mean=0.4430000