Load the necessary packages.

library(rsample)    # stratified sampling
library(cdata)      # data wrangling
## Loading required package: wrapr
library(ggplot2)    # beautiful plot
library(GGally)     # grouped scatter plot matrix
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(magrittr)   # pipe
library(dplyr)      # data wrangling
## 
## Attaching package: 'dplyr'
## The following object is masked from 'package:wrapr':
## 
##     coalesce
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(caret)      # data preprocessing and transform
## Loading required package: lattice
library(e1071)      # svm
## 
## Attaching package: 'e1071'
## The following object is masked from 'package:rsample':
## 
##     permutations
library(WVPlots)    # double density plot and ROC curve
library(knitr)      # tidy table
library(sigr)       # calculate AUC

Import the data saved as .RData and split it into training and test data by stratified sampling.

load("credit.RData")
str(dat)
## 'data.frame':    1000 obs. of  21 variables:
##  $ status                 : Factor w/ 4 levels "no checking account",..: 1 1 2 1 1 1 1 1 4 2 ...
##  $ duration               : int  18 9 12 12 12 10 8 6 18 24 ...
##  $ credit_history         : Factor w/ 5 levels "delay in paying off in the past",..: 5 5 3 5 5 5 5 5 5 3 ...
##  $ purpose                : Factor w/ 11 levels "others","car (new)",..: 3 1 10 1 1 1 1 1 4 4 ...
##  $ amount                 : int  1049 2799 841 2122 2171 2241 3398 1361 1098 3758 ...
##  $ savings                : Factor w/ 5 levels "unknown/no savings account",..: 1 1 2 1 1 1 1 1 1 3 ...
##  $ employment_duration    : Factor w/ 5 levels "unemployed","< 1 yr",..: 2 3 4 3 3 2 4 2 1 1 ...
##  $ installment_rate       : Ord.factor w/ 4 levels ">= 35"<"25 <= ... < 35"<..: 4 2 2 3 4 1 1 2 4 1 ...
##  $ personal_status_sex    : Factor w/ 4 levels "male : divorced/separated",..: 2 3 2 3 3 3 3 3 2 2 ...
##  $ other_debtors          : Factor w/ 3 levels "none","co-applicant",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ present_residence      : Ord.factor w/ 4 levels "< 1 yr"<"1 <= ... < 4 yrs"<..: 4 2 4 2 4 3 4 4 4 4 ...
##  $ property               : Factor w/ 4 levels "unknown / no property",..: 2 1 1 1 2 1 1 1 3 4 ...
##  $ age                    : int  21 36 23 39 38 48 39 40 65 23 ...
##  $ other_installment_plans: Factor w/ 3 levels "bank","stores",..: 3 3 3 3 1 3 3 3 3 3 ...
##  $ housing                : Factor w/ 3 levels "for free","rent",..: 1 1 1 1 2 1 2 2 2 1 ...
##  $ number_credits         : Ord.factor w/ 4 levels "1"<"2-3"<"4-5"<..: 1 2 1 2 2 2 2 1 2 1 ...
##  $ job                    : Factor w/ 4 levels "unemployed/unskilled - non-resident",..: 3 3 2 2 2 2 2 2 1 1 ...
##  $ people_liable          : Factor w/ 2 levels "3 or more","0 to 2": 2 1 2 1 2 1 2 1 2 2 ...
##  $ telephone              : Factor w/ 2 levels "no","yes (under customer name)": 1 1 1 1 1 1 1 1 1 1 ...
##  $ foreign_worker         : Factor w/ 2 levels "yes","no": 2 2 2 1 1 1 1 1 2 2 ...
##  $ credit_risk            : Factor w/ 2 levels "bad","good": 2 2 2 2 2 2 2 2 2 2 ...
summary(dat)
##                                         status       duration   
##  no checking account                       :274   Min.   : 4.0  
##  ... < 0 DM                                :269   1st Qu.:12.0  
##  0<= ... < 200 DM                          : 63   Median :18.0  
##  ... >= 200 DM / salary for at least 1 year:394   Mean   :20.9  
##                                                   3rd Qu.:24.0  
##                                                   Max.   :72.0  
##                                                                 
##                                      credit_history                purpose   
##  delay in paying off in the past            : 40    furniture/equipment:280  
##  critical account/other credits elsewhere   : 49    others             :234  
##  no credits taken/all credits paid back duly:530    car (used)         :181  
##  existing credits paid back duly till now   : 88    car (new)          :103  
##  all credits at this bank paid back duly    :293    retraining         : 97  
##                                                     repairs            : 50  
##                                                     (Other)            : 55  
##      amount                            savings          employment_duration
##  Min.   :  250   unknown/no savings account:603   unemployed      : 62     
##  1st Qu.: 1366   ... <  100 DM             :103   < 1 yr          :172     
##  Median : 2320   100 <= ... <  500 DM      : 63   1 <= ... < 4 yrs:339     
##  Mean   : 3271   500 <= ... < 1000 DM      : 48   4 <= ... < 7 yrs:174     
##  3rd Qu.: 3972   ... >= 1000 DM            :183   >= 7 yrs        :253     
##  Max.   :18424                                                             
##                                                                            
##        installment_rate                           personal_status_sex
##  >= 35         :136     male : divorced/separated           : 50     
##  25 <= ... < 35:231     female : non-single or male : single:310     
##  20 <= ... < 25:157     male : married/widowed              :548     
##  < 20          :476     female : single                     : 92     
##                                                                      
##                                                                      
##                                                                      
##       other_debtors        present_residence
##  none        :907   < 1 yr          :130    
##  co-applicant: 41   1 <= ... < 4 yrs:308    
##  guarantor   : 52   4 <= ... < 7 yrs:149    
##                     >= 7 yrs        :413    
##                                             
##                                             
##                                             
##                                       property        age       
##  unknown / no property                    :282   Min.   :19.00  
##  car or other                             :232   1st Qu.:27.00  
##  building soc. savings agr./life insurance:332   Median :33.00  
##  real estate                              :154   Mean   :35.54  
##                                                  3rd Qu.:42.00  
##                                                  Max.   :75.00  
##                                                                 
##  other_installment_plans     housing    number_credits
##  bank  :139              for free:179   1   :633      
##  stores: 47              rent    :714   2-3 :333      
##  none  :814              own     :107   4-5 : 28      
##                                         >= 6:  6      
##                                                       
##                                                       
##                                                       
##                                          job        people_liable
##  unemployed/unskilled - non-resident       : 22   3 or more:155  
##  unskilled - resident                      :200   0 to 2   :845  
##  skilled employee/official                 :630                  
##  manager/self-empl./highly qualif. employee:148                  
##                                                                  
##                                                                  
##                                                                  
##                      telephone   foreign_worker credit_risk
##  no                       :596   yes: 37        bad :300   
##  yes (under customer name):404   no :963        good:700   
##                                                            
##                                                            
##                                                            
##                                                            
## 
# no missing values
response='credit_risk'
library(rsample)
set.seed(10)
split=initial_split(dat,prop=0.8,strata = response)
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(response)` instead of `response` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
train=training(split)
test=testing(split)

Data visualization

Based on the output of the str() function, we know that there are 17 categorical predictors and 3 continuous predictors. There are 700 good credits and 300 bad credits. The paper states that the bad credit instances were oversampled; normally, the prevalence of bad credit is approximately 5%.
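As a quick check (a minimal sketch using the objects created above), the predictor types, the overall class counts and the effect of stratification can all be confirmed directly:

# 17 factor predictors, 3 integer predictors, and a 300/700 bad/good split
sum(sapply(dat[, setdiff(names(dat), response)], is.factor))
table(dat$credit_risk)
# stratified sampling should preserve the ~30/70 class ratio in both splits
prop.table(table(train$credit_risk))
prop.table(table(test$credit_risk))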

Graphical outputs showing the relationship between the continuous predictors and the response

Grouped boxplots

# Continuous predictors
idx_con=c("duration","amount","age")
# Grouped boxplot. Turn train data from wide to long format
train_con=cbind(train[,idx_con],credit_risk=train[,response])
train_con_long=unpivot_to_blocks(train_con,nameForNewKeyColumn = "variables",
                                 nameForNewValueColumn = "values",
                                 columnsToTakeFrom = idx_con)

ggplot(data=train_con_long, aes(x=credit_risk,y=values)) +
  geom_boxplot(color="blue",fill="blue",alpha=0.2,notch=TRUE,
               outlier.color="red",outlier.fill = "red",outlier.size = 2) +
  facet_wrap(~variables,ncol=3,scales = "free")

Grouped scatter plot matrix

ggpairs(train_con,columns = 1:3, ggplot2::aes(colour=credit_risk))

Visualization of categorical predictors with respect to response

# Visualization of two categorical variables: method 1
ggplot(data=train) +
  geom_count(aes(x=housing,y=credit_risk))

# method 2
train %>%
  count(credit_history,credit_risk) %>%
  ggplot(aes(x=credit_history,y=credit_risk)) +
  geom_tile(aes(fill=n)) +
  theme(axis.text.x = element_text(angle = 45,hjust=1))
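Another option worth sketching (not part of the original chunks, but using only ggplot2) is a filled bar chart, which compares the proportion of good and bad credit across the levels of a categorical predictor:

# method 3: within-level proportions of credit_risk for credit_history
ggplot(data=train) +
  geom_bar(aes(x=credit_history,fill=credit_risk),position="fill") +
  ylab("proportion") +
  theme(axis.text.x = element_text(angle = 45,hjust=1))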

Chi-square independence test

# Categorical predictors: use the chi-squared test of independence to investigate
# the association between each categorical feature and the response
train_cat=train[,!(colnames(train) %in% idx_con)]
# (tt=chisq.test(train$status,train$credit_risk))

t=c()     # p-values of the chi-squared tests
idx=c()   # columns for which the chi-squared approximation is questionable
for (i in (1:(ncol(train_cat)-1))) {
  t[i]=chisq.test(train_cat[,i],train$credit_risk)$p.value
  # u[i]=fisher.test(train_cat[,i],train$credit_risk)$p.value
  # flag predictors whose test raises the "approximation may be incorrect" warning
  if (!is.list(tryCatch( { result <- chisq.test(train_cat[,i],train$credit_risk) }
                         , warning = function(w) { print("TRUE") }))) {
    idx=c(idx,i)
  }
}
## [1] "TRUE"
## Warning in chisq.test(train_cat[, i], train$credit_risk): Chi-squared
## approximation may be incorrect
## [1] "TRUE"
## [1] "TRUE"
## Warning in chisq.test(train_cat[, i], train$credit_risk): Chi-squared
## approximation may be incorrect
## [1] "TRUE"
## [1] "TRUE"
idx_sig=which(t<=0.05)

idx_int=!(idx_sig %in% idx)
# significant predictors whose chi-squared approximation was reliable
colnames(train_cat)[idx_sig[idx_int]]
## [1] "status"                  "purpose"                
## [3] "personal_status_sex"     "property"               
## [5] "other_installment_plans" "housing"                
## [7] "foreign_worker"
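For the predictors flagged above (where the chi-squared approximation was questionable because of small expected cell counts), the commented-out fisher.test() call is the natural follow-up. A minimal sketch, assuming idx still holds the flagged column indices:

# Fisher's exact test for the flagged predictors; simulated p-values keep the
# computation tractable for larger contingency tables
sapply(idx, function(i)
  fisher.test(train_cat[,i], train$credit_risk, simulate.p.value = TRUE)$p.value)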

Feature engineering

Training dataset

# Perform one-hot encoding for the categorical (nominal) predictors.
var_cat=c("status","credit_history","purpose","savings","employment_duration",
          "personal_status_sex","other_debtors","property","other_installment_plans",
          "housing","job","people_liable","telephone","foreign_worker")

train_cat=train[,colnames(train) %in% var_cat]
dummy=dummyVars("~.",data=train_cat)
newdata=data.frame(predict(dummy,newdata=train_cat))

# Label encoding for the ordinal variables
# select the relevant variables
var_ord=c("installment_rate","present_residence","number_credits")
train_cont=train[,colnames(train) %in% var_ord]
train_cont=transform(train_cont,installment_rate=as.numeric(installment_rate)-1,
                     present_residence=as.numeric(present_residence)-1,
                     number_credits=as.numeric(number_credits)-1)

# Min-max normalization of continuous predictors
var_cont=c("amount","age","duration")
dat_cont=train[,colnames(train) %in% var_cont]
process=preProcess(dat_cont,method = c("range"))
scaled_dat_cont=predict(process,dat_cont)

# concatenate all the predictors with response variable by columns
train_new=cbind(newdata,train_cont,
                scaled_dat_cont,credit_risk=train$credit_risk)
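Before encoding the test set, a quick sanity check of the engineered training data (a sketch; every column except credit_risk should now be numeric, and the scaled continuous predictors should lie in [0, 1]):

dim(train_new)
sum(!sapply(train_new[, -ncol(train_new)], is.numeric))   # expected: 0
sapply(scaled_dat_cont, range)                             # expected: 0 to 1 per column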

Test dataset

# One-hot encoding
test_cat=test[,colnames(test) %in% var_cat]
newdata=data.frame(predict(dummy,newdata=test_cat))

# Label encoding
test_cont=test[,colnames(test) %in% var_ord]
test_cont=transform(test_cont,installment_rate=as.numeric(installment_rate)-1,
                    present_residence=as.numeric(present_residence)-1,
                    number_credits=as.numeric(number_credits)-1)
# Min-max normalization
dat_cont=test[,colnames(test) %in% var_cont]
scaled_dat_cont=predict(process,dat_cont)

# concatenate by columns
test_new=cbind(newdata,test_cont,
               scaled_dat_cont,credit_risk=test$credit_risk)
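One caveat worth checking (a sketch, not part of the original analysis): because the dummy encoder and the min-max ranges were fitted on the training data only, the test columns should line up exactly with the training columns, and the scaled test values may fall slightly outside [0, 1]:

identical(colnames(train_new), colnames(test_new))   # should be TRUE
summary(scaled_dat_cont)   # test-set values after training-based scaling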

SVM hyperparameter tuning and training

Linear SVM

# Cost function used for hyper-parameter tuning
cost_matrix=matrix(c(0,1,5,0),ncol=2)
err=function(truth,pred){
  t=table(truth=truth,pred=pred)
  tot_cost=sum(t*cost_matrix)
  tot_cost
}
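# To make the cost structure explicit: with rows as truth (bad, good) and
# columns as prediction, cost_matrix charges 5 units for accepting a bad credit
# and 1 unit for rejecting a good one. A small worked example with hypothetical
# counts (not taken from the data):
toy=as.table(matrix(c(40,30,20,110),ncol=2,
                    dimnames=list(truth=c("bad","good"),pred=c("bad","good"))))
sum(toy*cost_matrix)   # 30*1 + 20*5 = 130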

range_exp=seq(-10,10,by=2)
set.seed(200)  # for reproducibility
# linear kernel SVM. No scaling is needed as it had been performed beforehand.
# class weight is set to be inversely proportional to the number of samples in 
# each class
svm_tune=tune(svm,credit_risk~.,data = train_new,kernel='linear', scale=FALSE,
              probability=TRUE, class.weights='inverse',
              ranges = list(cost=c(2^range_exp)),
              tunecontrol = tune.control(cross=5,error.fun = err))
summary(svm_tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 5-fold cross validation 
## 
## - best parameters:
##  cost
##    16
## 
## - best performance: 98.8 
## 
## - Detailed performance results:
##            cost error dispersion
## 1  9.765625e-04 172.8  79.672454
## 2  3.906250e-03 172.8  79.672454
## 3  1.562500e-02 172.8  79.672454
## 4  6.250000e-02 172.8  79.672454
## 5  2.500000e-01 172.8  79.672454
## 6  1.000000e+00 108.0   9.273618
## 7  4.000000e+00 102.2   5.890671
## 8  1.600000e+01  98.8  13.663821
## 9  6.400000e+01 102.4   9.044335
## 10 2.560000e+02 105.2   6.610598
## 11 1.024000e+03 106.8  12.029131
min_cost=svm_tune$performances$cost[which.min(svm_tune$performances$error)]

# Visualization 
svm_tune$performances %>%
  ggplot(aes(x=cost,y=error)) +
  geom_line() +
  scale_x_continuous(name = "cost, C",trans = "log2") + 
  ylab("misclassification cost") +
  geom_vline(xintercept = min_cost,
             color="red",linetype=2)

# Extract the best model in terms of misclassification cost
svm_lin=svm_tune$best.model

Coefficients of linear kernel SVM

# Coefficients of the linear SVM give insight into how individual predictors
# affect the outcome
coef_lin=data.frame(names=names(coef(svm_lin))[-1],coef=coef(svm_lin)[-1])
coef_lin_10=coef_lin[order(-abs(coef_lin$coef))[1:10],]
rownames(coef_lin_10)=NULL
library(knitr)
kable(coef_lin_10)
names                                                            coef
-------------------------------------------------------  -----------
status……..200.DM…salary.for.at.least.1.year               -0.7908579
credit_history.all.credits.at.this.bank.paid.back.duly    -0.5707810
status.no.checking.account                                 0.5532820
purpose.car..new.                                          -0.3861087
duration                                                    0.3832446
savings.unknown.no.savings.account                          0.3404894
property.unknown…no.property                               -0.3105557
amount                                                      0.3055987
other_debtors.guarantor                                    -0.2966404
other_debtors.co.applicant                                  0.2952381
# Visualization of coefficient estimates
ggplot(data=coef_lin_10,aes(x=names,y=coef)) +
  geom_pointrange(aes(ymin=0,ymax=coef)) +
  coord_flip() +theme_classic() + ylab("coefficient estimates")
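When interpreting the signs of these coefficients, it helps to know which class positive decision values point towards. A minimal check, assuming the usual e1071 convention that the decision-value column is labelled with the class pair (e.g. "bad/good"), with positive values favouring the first-named class:

# which class do positive decision values favour?
colnames(attr(predict(svm_lin, newdata = head(train_new), decision.values = TRUE),
              "decision.values"))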

RBF SVM

range_exp_sigma=seq(-5,5,by=2)
set.seed(300)
svm_tune=tune(svm,credit_risk~.,data = train_new,kernel='radial', scale=FALSE,
              probability=TRUE, class.weights='inverse',
              ranges = list(cost=c(2^range_exp),gamma=c(2^range_exp_sigma)),
              tunecontrol = tune.control(cross=5,error.fun = err))
summary(svm_tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 5-fold cross validation 
## 
## - best parameters:
##  cost   gamma
##    64 0.03125
## 
## - best performance: 100.6 
## 
## - Detailed performance results:
##            cost    gamma error dispersion
## 1  9.765625e-04  0.03125 213.2  58.789455
## 2  3.906250e-03  0.03125 213.2  58.789455
## 3  1.562500e-02  0.03125 213.2  58.789455
## 4  6.250000e-02  0.03125 213.2  58.789455
## 5  2.500000e-01  0.03125 213.2  58.789455
## 6  1.000000e+00  0.03125 213.2  58.789455
## 7  4.000000e+00  0.03125 213.2  58.789455
## 8  1.600000e+01  0.03125 151.8  33.169263
## 9  6.400000e+01  0.03125 100.6  11.216060
## 10 2.560000e+02  0.03125 104.6   8.324662
## 11 1.024000e+03  0.03125 110.4  15.009997
## 12 9.765625e-04  0.12500 213.2  58.789455
## 13 3.906250e-03  0.12500 213.2  58.789455
## 14 1.562500e-02  0.12500 213.2  58.789455
## 15 6.250000e-02  0.12500 213.2  58.789455
## 16 2.500000e-01  0.12500 213.2  58.789455
## 17 1.000000e+00  0.12500 213.2  58.789455
## 18 4.000000e+00  0.12500 213.2  58.789455
## 19 1.600000e+01  0.12500 191.0  45.667275
## 20 6.400000e+01  0.12500 106.2  15.722595
## 21 2.560000e+02  0.12500 123.2  10.545141
## 22 1.024000e+03  0.12500 138.2  23.530831
## 23 9.765625e-04  0.50000 213.2  58.789455
## 24 3.906250e-03  0.50000 213.2  58.789455
## 25 1.562500e-02  0.50000 213.2  58.789455
## 26 6.250000e-02  0.50000 213.2  58.789455
## 27 2.500000e-01  0.50000 213.2  58.789455
## 28 1.000000e+00  0.50000 213.2  58.789455
## 29 4.000000e+00  0.50000 213.2  58.789455
## 30 1.600000e+01  0.50000 213.2  58.789455
## 31 6.400000e+01  0.50000 213.2  58.789455
## 32 2.560000e+02  0.50000 224.6  17.444197
## 33 1.024000e+03  0.50000 228.8  18.780309
## 34 9.765625e-04  2.00000 213.2  58.789455
## 35 3.906250e-03  2.00000 213.2  58.789455
## 36 1.562500e-02  2.00000 213.2  58.789455
## 37 6.250000e-02  2.00000 213.2  58.789455
## 38 2.500000e-01  2.00000 213.2  58.789455
## 39 1.000000e+00  2.00000 213.2  58.789455
## 40 4.000000e+00  2.00000 213.2  58.789455
## 41 1.600000e+01  2.00000 213.2  58.789455
## 42 6.400000e+01  2.00000 213.2  58.789455
## 43 2.560000e+02  2.00000 238.0  14.832397
## 44 1.024000e+03  2.00000 238.0  14.832397
## 45 9.765625e-04  8.00000 213.2  58.789455
## 46 3.906250e-03  8.00000 213.2  58.789455
## 47 1.562500e-02  8.00000 213.2  58.789455
## 48 6.250000e-02  8.00000 213.2  58.789455
## 49 2.500000e-01  8.00000 213.2  58.789455
## 50 1.000000e+00  8.00000 213.2  58.789455
## 51 4.000000e+00  8.00000 213.2  58.789455
## 52 1.600000e+01  8.00000 213.2  58.789455
## 53 6.400000e+01  8.00000 213.2  58.789455
## 54 2.560000e+02  8.00000 238.0  14.832397
## 55 1.024000e+03  8.00000 238.0  14.832397
## 56 9.765625e-04 32.00000 213.2  58.789455
## 57 3.906250e-03 32.00000 213.2  58.789455
## 58 1.562500e-02 32.00000 213.2  58.789455
## 59 6.250000e-02 32.00000 213.2  58.789455
## 60 2.500000e-01 32.00000 213.2  58.789455
## 61 1.000000e+00 32.00000 213.2  58.789455
## 62 4.000000e+00 32.00000 213.2  58.789455
## 63 1.600000e+01 32.00000 213.2  58.789455
## 64 6.400000e+01 32.00000 213.2  58.789455
## 65 2.560000e+02 32.00000 238.0  14.832397
## 66 1.024000e+03 32.00000 238.0  14.832397
# Best parameters from grid search
idx_min=which.min(svm_tune$performances[,'error'])
# contour plot of the coarse grid search
ggplot(svm_tune$performances,aes(x=cost,y=gamma)) +
  geom_raster(aes(fill=error)) +
  geom_contour(aes(z=error),color='white') +
  scale_x_continuous(name='cost',trans = "log2") +
  scale_y_continuous(name='gamma',trans = "log2") +
  geom_point(aes(x=cost[idx_min],y=gamma[idx_min]),shape=19,color='red',size=2) +
  geom_text(data=svm_tune$performances[idx_min,],
            aes(x=cost,y=1.05*gamma,
                label=sprintf("cost of misclassification: %.2f",error)),
            color='yellow', show.legend = FALSE)

# finer grid search around the best coarse-grid region
range_exp=seq(4,10,by=1)
range_exp_gamma=seq(-6,-2,by=1)
set.seed(400)
svm_tune=tune(svm,credit_risk~.,data = train_new,kernel='radial', scale=FALSE,
              probability=TRUE, class.weights='inverse',
              ranges = list(cost=c(2^range_exp),gamma=c(2^range_exp_gamma)),
              tunecontrol = tune.control(cross=5,error.fun = err))
summary(svm_tune)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 5-fold cross validation 
## 
## - best parameters:
##  cost   gamma
##   128 0.03125
## 
## - best performance: 97.8 
## 
## - Detailed performance results:
##    cost    gamma error dispersion
## 1    16 0.015625 190.0  74.551995
## 2    32 0.015625 105.4   6.985700
## 3    64 0.015625 104.2  12.930584
## 4   128 0.015625  99.6  10.014989
## 5   256 0.015625  98.2   6.906519
## 6   512 0.015625 102.6  10.502381
## 7  1024 0.015625 108.4  13.011533
## 8    16 0.031250 141.8  33.700148
## 9    32 0.031250 108.8  16.946976
## 10   64 0.031250 104.2  13.971399
## 11  128 0.031250  97.8  10.305338
## 12  256 0.031250 104.4  10.597169
## 13  512 0.031250 112.4  13.867228
## 14 1024 0.031250 116.8  12.853015
## 15   16 0.062500 132.6  31.627520
## 16   32 0.062500 110.8  16.006249
## 17   64 0.062500 103.0  13.583078
## 18  128 0.062500 104.4  10.549882
## 19  256 0.062500 114.4  21.454603
## 20  512 0.062500 118.8  19.715476
## 21 1024 0.062500 122.6  29.271146
## 22   16 0.125000 176.2  69.142606
## 23   32 0.125000 109.4  11.414903
## 24   64 0.125000 108.4  13.867228
## 25  128 0.125000 109.0  14.949916
## 26  256 0.125000 115.0  24.545875
## 27  512 0.125000 127.6  34.997143
## 28 1024 0.125000 148.8  40.517897
## 29   16 0.250000 190.0  74.551995
## 30   32 0.250000 186.0  73.426834
## 31   64 0.250000 124.2  21.707142
## 32  128 0.250000 133.0  33.726844
## 33  256 0.250000 150.8  36.792662
## 34  512 0.250000 170.6  42.799533
## 35 1024 0.250000 176.6  44.410584
svm_rbf=svm_tune$best.model
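As a rough complexity comparison between the two tuned models (a sketch; tot.nSV is the total number of support vectors stored on a fitted e1071 svm object):

# number of support vectors retained by each model
c(linear = svm_lin$tot.nSV, rbf = svm_rbf$tot.nSV)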

idx_min=which.min(svm_tune$performances[,'error'])
# contour plot
ggplot(svm_tune$performances,aes(x=cost,y=gamma)) +
  geom_raster(aes(fill=error)) +
  geom_contour(aes(z=error),color='white') +
  scale_x_continuous(name='cost',trans = "log2") +
  scale_y_continuous(name='gamma',trans = "log2") +
  geom_point(aes(x=cost[idx_min],y=gamma[idx_min]),shape=19,color='red',size=2) +
  geom_text(data=svm_tune$performances[idx_min,],
            aes(x=cost,y=1.05*gamma,
                label=sprintf("cost of misclassification: %.2f",error)),
            color='yellow', show.legend = FALSE)

Performance evaluation

Performance metrics calculation function

performance=function(truth,pred){
  tab=table(truth=truth,prediction=pred)
  acc=sum(diag(tab))/sum(tab)
  cost=sum(cost_matrix*tab)
  # recall, precision and F1 are computed for the "bad" class (first factor level)
  recall=tab[1,1]/sum(tab[1,])
  precision=tab[1,1]/sum(tab[,1])
  F1=(2*precision*recall)/(precision+recall)
  # AUC from sigr on the decision values, with "bad" (coded 0) as the target class
  truth_logical=as.numeric(truth)-1
  AUC=calcAUC(attr(pred,"decision.values"),truth_logical,yTarget = FALSE)
  round(c(accuracy=acc,misclass_cost=cost,recall=recall,precision=precision,
          f1_measure=F1,AUC=AUC),4)
}

Double density plots and ROC curves

pred_svm_lin=predict(svm_lin,newdata=test_new,decision.values = TRUE)
pred_svm_rbf=predict(svm_rbf,newdata = test_new,decision.values = TRUE)
dat_plot=data.frame(outcome=test_new$credit_risk,dv_svm_linear=attr(pred_svm_lin,"decision.values")[1:nrow(test_new)],
                    dv_svm_rbf=attr(pred_svm_rbf,"decision.values")[1:nrow(test_new)])

library(WVPlots)
DoubleDensityPlot(dat_plot,xvar="dv_svm_linear",truthVar = "outcome",
                  title="Distribution of linear svm scores (test data)") +
  geom_vline(xintercept = 0, color="red", linetype=2)

DoubleDensityPlot(dat_plot,xvar="dv_svm_rbf",truthVar = "outcome",
                  title="Distribution of RBF svm scores (test data)") +
  geom_vline(xintercept = 0, color="red", linetype=2)

ROCPlotPair(dat_plot,xvar1="dv_svm_linear",xvar2 = "dv_svm_rbf",truthVar = "outcome",
            truthTarget = "bad", title="ROC plots for svm models (test data)")

Tabulation of classifiers’ performances

perf_svm_lin=performance(test_new$credit_risk,pred_svm_lin)
perf_svm_rbf=performance(test_new$credit_risk,pred_svm_rbf)
tab_perf=rbind(perf_svm_lin,perf_svm_rbf)
rownames(tab_perf)=c("linear svm","rbf svm")
kable(tab_perf)
             accuracy  misclass_cost  recall  precision  f1_measure     AUC
-----------  --------  -------------  ------  ---------  ----------  ------
linear svm      0.710            118  0.7500     0.5114      0.6081  0.7626
rbf svm         0.685            119  0.7667     0.4842      0.5935  0.7613

Reference

Grömping, U. (2019). South German Credit Data: Correcting a Widely Used Data Set. Report 4/2019, Reports in Mathematics, Physics and Chemistry, Department II, Beuth University of Applied Sciences Berlin.