ml_credit_dataset <- read.csv("ml_credit_dataset.csv")
str(ml_credit_dataset)
## 'data.frame': 1000 obs. of 87 variables:
## $ CheckingAccountStatus.0.to.200 : int 0 1 0 0 0 0 0 1 0 1 ...
## $ CheckingAccountStatus.gt.200 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CheckingAccountStatus.lt.0 : int 1 0 0 1 1 0 0 0 0 0 ...
## $ CheckingAccountStatus.none : int 0 0 1 0 0 1 1 0 1 0 ...
## $ Duration.0.to.6 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Duration.6.to.12 : int 0 0 1 0 0 0 0 0 1 0 ...
## $ Duration.12.to.18 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Duration.18.to.24 : int 0 0 0 0 1 0 1 0 0 0 ...
## $ Duration.24.to.30 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Duration.30.to.36 : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Duration.36.to.42 : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Duration.42.to.48 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Duration.48.to.54 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Duration.54.to.60 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Duration.66.to.72 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CreditHistory.Critical : int 1 0 1 0 0 0 0 0 0 1 ...
## $ CreditHistory.Delay : int 0 0 0 0 1 0 0 0 0 0 ...
## $ CreditHistory.NoCredit.AllPaid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ CreditHistory.PaidDuly : int 0 1 0 1 0 1 1 1 1 0 ...
## $ CreditHistory.ThisBank.AllPaid : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Business : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.DomesticAppliance : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Education : int 0 0 1 0 0 1 0 0 0 0 ...
## $ Purpose.Furniture.Equipment : int 0 0 0 1 0 0 1 0 0 0 ...
## $ Purpose.NewCar : int 0 0 0 0 1 0 0 0 0 1 ...
## $ Purpose.Others : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Radio.Television : int 1 1 0 0 0 0 0 0 1 0 ...
## $ Purpose.Repairs : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.Retraining : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Purpose.UsedCar : int 0 0 0 0 0 0 0 1 0 0 ...
## $ SavingsAccountBonds.100.to.500 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ SavingsAccountBonds.500.to.1000 : int 0 0 0 0 0 0 1 0 0 0 ...
## $ SavingsAccountBonds.gt.1000 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ SavingsAccountBonds.lt.100 : int 0 1 1 1 1 0 0 1 0 1 ...
## $ SavingsAccountBonds.Unknown : int 1 0 0 0 0 1 0 0 0 0 ...
## $ EmploymentDuration.0.to.1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ EmploymentDuration.1.to.4 : int 0 1 0 0 1 1 0 1 0 0 ...
## $ EmploymentDuration.4.to.7 : int 0 0 1 1 0 0 0 0 1 0 ...
## $ EmploymentDuration.gt.7 : int 1 0 0 0 0 0 1 0 0 0 ...
## $ EmploymentDuration.Unemployed : int 0 0 0 0 0 0 0 0 0 1 ...
## $ InstallmentRatePercentage.1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ InstallmentRatePercentage.2 : int 0 1 1 1 0 1 0 1 1 0 ...
## $ InstallmentRatePercentage.3 : int 0 0 0 0 1 0 1 0 0 0 ...
## $ InstallmentRatePercentage.4 : int 1 0 0 0 0 0 0 0 0 1 ...
## $ Personal.Female.NotSingle : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Personal.Male.Divorced.Seperated : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Personal.Male.Married.Widowed : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Personal.Male.Single : int 1 0 1 1 1 1 1 1 0 0 ...
## $ OtherDebtorsGuarantors.CoApplicant : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherDebtorsGuarantors.Guarantor : int 0 0 0 1 0 0 0 0 0 0 ...
## $ OtherDebtorsGuarantors.None : int 1 1 1 0 1 1 1 1 1 1 ...
## $ ResidenceDuration.1 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ ResidenceDuration.2 : int 0 1 0 0 0 0 0 1 0 1 ...
## $ ResidenceDuration.3 : int 0 0 1 0 0 0 0 0 0 0 ...
## $ ResidenceDuration.4 : int 1 0 0 1 1 1 1 0 1 0 ...
## $ Property.CarOther : int 0 0 0 0 0 0 0 1 0 1 ...
## $ Property.Insurance : int 0 0 0 1 0 0 1 0 0 0 ...
## $ Property.RealEstate : int 1 1 1 0 0 0 0 0 1 0 ...
## $ Property.Unknown : int 0 0 0 0 1 1 0 0 0 0 ...
## $ Age.18.to.24 : int 0 1 0 0 0 0 0 0 0 0 ...
## $ Age.24.to.30 : int 0 0 0 0 0 0 0 0 0 1 ...
## $ Age.30.to.36 : int 0 0 0 0 0 1 0 1 0 0 ...
## $ Age.36.to.42 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.42.to.48 : int 0 0 0 1 0 0 0 0 0 0 ...
## $ Age.48.to.54 : int 0 0 1 0 1 0 1 0 0 0 ...
## $ Age.54.to.60 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Age.60.to.66 : int 0 0 0 0 0 0 0 0 1 0 ...
## $ Age.66.to.72 : int 1 0 0 0 0 0 0 0 0 0 ...
## $ Age.72.to.78 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherInstallmentPlans.Bank : int 0 0 0 0 0 0 0 0 0 0 ...
## $ OtherInstallmentPlans.None : int 1 1 1 1 1 1 1 1 1 1 ...
## $ OtherInstallmentPlans.Stores : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Housing.ForFree : int 0 0 0 1 1 1 0 0 0 0 ...
## $ Housing.Own : int 1 1 1 0 0 0 1 0 1 1 ...
## $ Housing.Rent : int 0 0 0 0 0 0 0 1 0 0 ...
## $ NumberExistingCredits.1 : int 0 1 1 1 0 1 1 1 1 0 ...
## $ NumberExistingCredits.2 : int 1 0 0 0 1 0 0 0 0 1 ...
## $ NumberExistingCredits.3 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ NumberExistingCredits.4 : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Job.Management.SelfEmp.HighlyQualified: int 0 0 0 0 0 0 0 1 0 1 ...
## $ Job.SkilledEmployee : int 1 1 0 1 1 0 1 0 0 0 ...
## $ Job.UnemployedUnskilled : int 0 0 0 0 0 0 0 0 0 0 ...
## $ Job.UnskilledResident : int 0 0 1 0 0 1 0 0 1 0 ...
## $ NumberPeopleMaintenance : int 1 1 2 2 2 2 1 1 1 1 ...
## $ Telephone : int 1 0 0 0 0 1 0 1 0 0 ...
## $ ForeignWorker : int 1 1 1 1 1 1 1 1 1 1 ...
## $ Class : Factor w/ 2 levels "Bad","Good": 2 1 2 2 1 2 2 2 2 1 ...
library(mlr)
## Loading required package: ParamHelpers
credit.task = makeClassifTask(data = ml_credit_dataset, target = "Class")
credit.task = removeConstantFeatures(credit.task)
credit.task
## Supervised task: ml_credit_dataset
## Type: classif
## Target: Class
## Observations: 1000
## Features:
## numerics factors ordered functionals
## 86 0 0 0
## Missings: FALSE
## Has weights: FALSE
## Has blocking: FALSE
## Has coordinates: FALSE
## Classes: 2
## Bad Good
## 300 700
## Positive class: Bad
Cost Matrix for German Credit Data
costs = matrix(c(0, 1, 5, 0), 2)
colnames(costs) = rownames(costs) = getTaskClassLevels(credit.task)
costs
## Bad Good
## Bad 0 5
## Good 1 0
Calculate the theoretical threshold for the positive class: Since c(+1,+1)=c(-1,-1)=0
th = costs[2,1]/(costs[2,1] + costs[1,2])
th
## [1] 0.1666667
In order to calculate the average costs over the entire data set we first need to create a new performance Measure. This can be done through function makeCostMeasure. It is expected that the rows of the cost matrix indicate true and the columns predicted class labels.
credit.costs = makeCostMeasure(id = "credit.costs", name = "Credit costs", costs = costs,
best = 0, worst = 5)
credit.costs
## Name: Credit costs
## Performance measure: credit.costs
## Properties: classif,classif.multi,req.pred,req.truth,predtype.response,predtype.prob
## Minimize: TRUE
## Best: 0; Worst: 5
## Aggregated by: test.mean
## Arguments: costs=<matrix>, combine=<function>
## Note:
-In order to minimize the average costs, observations from the less costly class should be given higher importance during training.
-This can be achieved by weighting the classes, provided that the learner under consideration has a ‘class weights’ or an ‘observation weights’ argument.
Just as theoretical thresholds, theoretical weights can be calculated from the cost matrix. If t indicates the target threshold and t0 the original threshold for the positive class the proportion of observations in the positive class has to be multiplied by
\(w= \frac{1-t}{t}\frac{t_0}{1-t_0}\)
for our case: Weight for positive class corresponding to theoretical treshold
w = (1 - th)/th
w
## [1] 5
-A unified and convenient way to assign class weights to a Learner (and tune them) is provided by function makeWeightedClassesWrapper.
-The class weights are specified using argument wcw.weight
-For learners that support observation weights a suitable weight vector is then generated internally during training or resampling.
wlrn = makeLearner("classif.multinom", trace = FALSE)
wlrn = makeWeightedClassesWrapper(wlrn, wcw.weight = w)
wlrn
## Learner weightedclasses.classif.multinom from package nnet
## Type: classif
## Name: ; Short name:
## Class: WeightedClassesWrapper
## Properties: twoclass,multiclass,numerics,factors,prob
## Predict-Type: response
## Hyperparameters: trace=FALSE,wcw.weight=5
rin = makeResampleInstance("CV", iters = 5, task = credit.task,stratify = TRUE)
wr = resample(wlrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
wr
## Resample Result
## Task: ml_credit_dataset
## Learner: weightedclasses.classif.multinom
## Aggr perf: credit.costs.test.mean=0.6160000,mmce.test.mean=0.3800000
## Runtime: 0.629795
If the learner can deal with class weights, the weights are basically passed on to the appropriate learner parameter.
The advantage of using the wrapper in this case is the unified way to specify the class weights.
For classification methods like “classif.multinom” that support class weights you can pass them directly.
lrn = makeWeightedClassesWrapper("classif.multinom", wcw.weight = w)
r = resample(lrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
## # weights: 88 (87 variable)
## initial value 1219.939038
## iter 10 value 765.761791
## iter 20 value 743.153544
## iter 30 value 741.163899
## iter 40 value 740.891589
## iter 50 value 740.796792
## iter 60 value 740.711613
## iter 70 value 740.693141
## iter 80 value 740.687436
## final value 740.687288
## converged
## # weights: 88 (87 variable)
## initial value 1219.939038
## iter 10 value 787.663685
## iter 20 value 770.394704
## iter 30 value 769.073139
## iter 40 value 768.800480
## iter 50 value 768.673921
## iter 60 value 768.606668
## iter 70 value 768.595900
## final value 768.593206
## converged
## # weights: 88 (87 variable)
## initial value 1219.939038
## iter 10 value 783.190125
## iter 20 value 763.072001
## iter 30 value 761.169657
## iter 40 value 760.877102
## iter 50 value 760.690821
## iter 60 value 760.581150
## iter 70 value 760.550108
## iter 80 value 760.543044
## final value 760.542914
## converged
## # weights: 88 (87 variable)
## initial value 1219.939038
## iter 10 value 768.395281
## iter 20 value 748.544161
## iter 30 value 747.446921
## iter 40 value 747.360738
## iter 50 value 747.354757
## final value 747.354353
## converged
## # weights: 88 (87 variable)
## initial value 1219.939038
## iter 10 value 817.403347
## iter 20 value 801.453623
## iter 30 value 799.527239
## iter 40 value 799.266371
## iter 50 value 799.105400
## iter 60 value 799.031922
## iter 70 value 799.004406
## iter 80 value 798.996305
## final value 798.996049
## converged
r
## Resample Result
## Task: ml_credit_dataset
## Learner: weightedclasses.classif.multinom
## Aggr perf: credit.costs.test.mean=0.6160000,mmce.test.mean=0.3800000
## Runtime: 0.735304
-Just like the theoretical threshold, the theoretical weights may not always be suitable, therefore you can tune the weight for the positive class.
-Calculating the theoretical weight beforehand may help to narrow down the search interval.
lrn = makeLearner("classif.multinom", trace = FALSE)
lrn = makeWeightedClassesWrapper(lrn)
ps = makeParamSet(makeDiscreteParam("wcw.weight", seq(4, 12, 0.5)))
ctrl = makeTuneControlGrid()
tune.wcw.res = tuneParams(lrn, credit.task, resampling = rin, par.set = ps,
measures = list(credit.costs, mmce), control = ctrl, show.info = FALSE)
tune.wcw.res
## Tune result:
## Op. pars: wcw.weight=9
## credit.costs.test.mean=0.5790000,mmce.test.mean=0.4230000
as.data.frame(tune.wcw.res$opt.path)[1:3]
## wcw.weight credit.costs.test.mean mmce.test.mean
## 1 4 0.625 0.349
## 2 4.5 0.627 0.367
## 3 5 0.616 0.380
## 4 5.5 0.612 0.388
## 5 6 0.594 0.390
## 6 6.5 0.596 0.396
## 7 7 0.587 0.403
## 8 7.5 0.588 0.408
## 9 8 0.594 0.414
## 10 8.5 0.584 0.420
## 11 9 0.579 0.423
## 12 9.5 0.593 0.437
## 13 10 0.597 0.445
## 14 10.5 0.603 0.455
## 15 11 0.601 0.457
## 16 11.5 0.608 0.464
## 17 12 0.609 0.469
-If the Learner supports neither observation nor class weights the proportions of the classes in the training data can be changed by over- or undersampling.
-In the GermanCredit data set the positive class Bad should receive a theoretical weight of w = (1 - th)/th = 5. This can be achieved by oversampling class Bad with a rate of 5 or by undersampling class Good with a rate of 1/5 (using functions oversample or undersample).
logistic model
credit.task.over = oversample(credit.task, rate = w, cl = "Bad")
logisticlrn = makeLearner("classif.multinom", trace = FALSE)
logisticmod = mlr::train(logisticlrn, credit.task.over)
logisticpred = predict(logisticmod, task = credit.task)
performance(logisticpred, measures = list(credit.costs, mmce))
## credit.costs mmce
## 0.430 0.322
Rpart model
credit.task.over = oversample(credit.task, rate = w, cl = "Bad")
rpartlrn = makeLearner("classif.rpart")
rpartmod = mlr::train(rpartlrn, credit.task.over)
rpartpred = predict(rpartmod, task = credit.task)
performance(rpartpred, measures = list(credit.costs, mmce))
## credit.costs mmce
## 0.473 0.425
-We usually prefer resampled performance values, but simply calling resample on the oversampled task does not work since predictions have to be based on the original task.
-The solution is to create a wrapped Learner via function makeOversampleWrapper.
-Internally, oversample is called before training, but predictions are done on the original data.
logistic model
logicallrn = makeLearner("classif.multinom", trace = FALSE)
logicallrn = makeOversampleWrapper(logicallrn, osw.rate = w, osw.cl = "Bad")
logicallrn
## Learner classif.multinom.oversampled from package mlr,nnet
## Type: classif
## Name: ; Short name:
## Class: OversampleWrapper
## Properties: numerics,factors,weights,prob,twoclass,multiclass
## Predict-Type: response
## Hyperparameters: trace=FALSE,osw.rate=5,osw.cl=Bad
lr = resample(logicallrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
lr
## Resample Result
## Task: ml_credit_dataset
## Learner: classif.multinom.oversampled
## Aggr perf: credit.costs.test.mean=0.6140000,mmce.test.mean=0.3700000
## Runtime: 1.12441
Rpart model
rpartlrn = makeLearner("classif.rpart")
rpartlrn = makeOversampleWrapper(rpartlrn, osw.rate = w, osw.cl = "Bad")
rpartlrn
## Learner classif.rpart.oversampled from package mlr,rpart
## Type: classif
## Name: ; Short name:
## Class: OversampleWrapper
## Properties: numerics,factors,ordered,missings,weights,prob,twoclass,multiclass,featimp
## Predict-Type: response
## Hyperparameters: xval=0,osw.rate=5,osw.cl=Bad
rr = resample(logicallrn, credit.task, rin, measures = list(credit.costs, mmce), show.info = FALSE)
rr
## Resample Result
## Task: ml_credit_dataset
## Learner: classif.multinom.oversampled
## Aggr perf: credit.costs.test.mean=0.6150000,mmce.test.mean=0.3750000
## Runtime: 1.28737
-Of course, we can also tune the oversampling rate. For this purpose we again have to create an OversampleWrapper. Optimal values for parameter osw.rate can be obtained using function tuneParams.
logistic model
logicallrn = makeLearner("classif.multinom", trace = FALSE)
logicallrn = makeOversampleWrapper(logicallrn, osw.cl = "Bad")
logicalps = makeParamSet(makeDiscreteParam("osw.rate", seq(3, 8, 0.25)))
logicalctrl = makeTuneControlGrid()
logicaltune.osw.res = tuneParams(logicallrn, credit.task, rin, par.set = logicalps, measures = list(credit.costs, mmce),
control = logicalctrl, show.info = FALSE)
logicaltune.osw.res
## Tune result:
## Op. pars: osw.rate=7.75
## credit.costs.test.mean=0.5870000,mmce.test.mean=0.4070000
Rpart model
rpartlrn = makeLearner("classif.rpart")
rpartlrn = makeOversampleWrapper(rpartlrn, osw.cl = "Bad")
rpartps = makeParamSet(makeDiscreteParam("osw.rate", seq(3, 8, 0.25)))
rpartctrl = makeTuneControlGrid()
rparttune.osw.res = tuneParams(rpartlrn, credit.task, rin, par.set = rpartps, measures = list(credit.costs, mmce),
control = rpartctrl, show.info = FALSE)
rparttune.osw.res
## Tune result:
## Op. pars: osw.rate=7
## credit.costs.test.mean=0.5670000,mmce.test.mean=0.4430000