Credit risk analysis with logistic regression

Just to practice.

Loading packages:

Loading packages to calculate skewness (moments), manipulate data (dplyr), run feature selection and evaluate the logistic regression models (caret), manipulate strings (stringr), draw ROC and auxiliary evaluation plots (ROCit and precrec), and plot the correlation matrix (corrplot).

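A minimal sketch of the library calls assumed here, based on the package list above (lattice and ggplot2 are pulled in automatically as caret dependencies):

library(moments)   # skewness()
library(dplyr)     # data manipulation
library(caret)     # rfe(), rfeControl(), confusionMatrix()
library(stringr)   # str_c()
library(ROCit)     # rocit()
library(precrec)   # evalmod()
library(corrplot)  # corrplot()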
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: lattice
## Loading required package: ggplot2
## corrplot 0.84 loaded

Loading dataset:

df = read.csv("credit_dataset.csv")
View(df)

Selecting the numerical and categorical variable names:

numVariables = c('credit.duration.months', 'credit.amount', 'age')

catVariables = names(df[, !names(df) %in% numVariables])

Central tendency measures:

summary(df[numVariables])
##  credit.duration.months credit.amount        age       
##  Min.   : 4.0           Min.   :  250   Min.   :19.00  
##  1st Qu.:12.0           1st Qu.: 1366   1st Qu.:27.00  
##  Median :18.0           Median : 2320   Median :33.00  
##  Mean   :20.9           Mean   : 3271   Mean   :35.54  
##  3rd Qu.:24.0           3rd Qu.: 3972   3rd Qu.:42.00  
##  Max.   :72.0           Max.   :18424   Max.   :75.00

Frequency distribution of numerical variables

Plotting the frequency distribution of all numerical variables:

for(i in 1:length(numVariables)){
  hist(df[,numVariables[i]], main = paste('Frequency of',numVariables[i]), freq = TRUE,xlab=numVariables[i],ylab = paste('Frequency of',numVariables[i]) )
}

Measuring the skewness of all numerical variables:

for(i in 1:length(numVariables)){
  cat(sprintf('Skewness %s: %.3f.\n',numVariables[i],skewness(df[,numVariables[i]])))
}
## Skewness credit.duration.months: 1.093.
## Skewness credit.amount: 1.947.
## Skewness age: 1.023.
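
For reference, a minimal sketch of the moment-based skewness statistic that skewness() reports (assuming the usual definition: third central moment divided by the 3/2 power of the second):

# sample skewness: m3 / m2^(3/2), with mk the k-th central moment
skewManual = function(x){
  m = mean(x)
  mean((x - m)^3) / mean((x - m)^2)^(3/2)
}
skewManual(df$credit.amount)   # should match skewness(df$credit.amount) above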

Transforming numerical variables:

Ideally the variables should be on a similar scale and approximately normal in shape. First, histograms of the natural-log-transformed variables:

for(i in 1:length(numVariables)){
   hist(log(df[,numVariables[i]],base=exp(1)), main = paste('Frequency of log',numVariables[i]), freq = TRUE, xlab=paste('log',numVariables[i]), ylab = 'Frequency')
}

Measuring the skewness of a sqrt(log(x)) transformation:

for(i in 1:length(numVariables)){
  cat(sprintf('Skewness %s: %.3f.\n',numVariables[i],skewness(sqrt(log(df[,numVariables[i]],base=exp(1))))))
}
## Skewness credit.duration.months: -0.367.
## Skewness credit.amount: 0.003.
## Skewness age: 0.334.

Defining new column credit.amount.transformed:

df$credit.amount.transformed = sqrt(log(df$credit.amount, base = exp(1)))

Skewness after a plain natural-log (ln) transformation:

for(i in 1:length(numVariables)){
  cat(sprintf('Skewness %s: %.3f.\n',numVariables[i],skewness(log(df[,numVariables[i]],base=exp(1)))))
}
## Skewness credit.duration.months: -0.127.
## Skewness credit.amount: 0.129.
## Skewness age: 0.414.

Another try, with log1p (note: ^1/3 below parses in R as (x^1)/3, a division by 3 rather than a cube root; since skewness is unaffected by rescaling, this is effectively the skewness of log1p(x)):

for(i in 1:length(numVariables)){
  cat(sprintf('Skewness %s: %.3f.\n',numVariables[i],skewness(log1p(df[,numVariables[i]])^1/3)))
}
## Skewness credit.duration.months: -0.051.
## Skewness credit.amount: 0.130.
## Skewness age: 0.429.

The log1p version gives the lowest absolute skewness for credit duration (-0.051), so we keep it:

df$credit.duration.months.transformed =log1p(df$credit.duration.months)^1/3 
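
A quick sketch checking the precedence point made above:

# ^ binds tighter than /, so log1p(x)^1/3 is (log1p(x)^1)/3, not a cube root
all.equal(log1p(df$credit.duration.months)^1/3,
          log1p(df$credit.duration.months)/3)
# skewness is unchanged by positive rescaling, so dividing by 3 has no effect on it
all.equal(skewness(log1p(df$credit.duration.months)/3),
          skewness(log1p(df$credit.duration.months)))
# both comparisons return TRUE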

Another try, on the raw values (again, ^1/3 only divides by 3, so the skewness is identical to the untransformed data):

for(i in 1:length(numVariables)){
  cat(sprintf('Skewness %s: %.3f.\n',numVariables[i],skewness((df[,numVariables[i]])^1/3)))
}
## Skewness credit.duration.months: 1.093.
## Skewness credit.amount: 1.947.
## Skewness age: 1.023.

Let’s keep the first age transformation:

df$age.transformed = sqrt(log(df$age, base = exp(1)))

Redefining the numerical variables with their transformed versions:

numTransformed = c('credit.duration.months.transformed','credit.amount.transformed','age.transformed')

for(i in 1:length(numTransformed)){
   hist(df[,numTransformed[i]], main = paste('Hist. of',numTransformed[i]),freq = TRUE ,xlab=numTransformed[i],ylab = paste('Frequency of',numTransformed[i]) )
}

for(i in 1:length(numTransformed)){
  cat(sprintf('Skewness %s: %.3f.\n',numTransformed[i],skewness(df[,numTransformed[i]])))
}
## Skewness credit.duration.months.transformed: -0.051.
## Skewness credit.amount.transformed: 0.003.
## Skewness age.transformed: 0.334.

Summary statistics for the new variables:

summary(df[numTransformed])
##  credit.duration.months.transformed credit.amount.transformed age.transformed
##  Min.   :0.5365                     Min.   :2.350             Min.   :1.716  
##  1st Qu.:0.8550                     1st Qu.:2.687             1st Qu.:1.815  
##  Median :0.9815                     Median :2.784             Median :1.870  
##  Mean   :0.9803                     Mean   :2.787             Mean   :1.876  
##  3rd Qu.:1.0730                     3rd Qu.:2.879             3rd Qu.:1.933  
##  Max.   :1.4302                     Max.   :3.134             Max.   :2.078

Boxplots:

for(i in 1:length(numTransformed)){
   boxplot(df[,numTransformed[i]], main = paste('Boxplot for',numTransformed[i]),ylab = numTransformed[i])
}

Redefining categorical variables and numerical variables:

numVariables = c('credit.duration.months','credit.duration.months.transformed', 'credit.amount','credit.amount.transformed', 'age','age.transformed')

catVariables = names(df[, !names(df) %in% numVariables])

Correlation matrix:

numData = df[,names(df) %in% numTransformed]
corrplot(cor(numData),method = 'pie')

corrplot(cor(numData),method = 'number')
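
If we wanted to flag strongly correlated columns programmatically, caret's findCorrelation() can be used (a sketch; the 0.75 cutoff is arbitrary):

highCor = findCorrelation(cor(numData), cutoff = 0.75, names = TRUE)
highCor   # names of columns worth dropping; empty if no pair exceeds the cutoff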

Frequency distribution of the categorical variables (still numerically coded at this point, which is why hist() works on them):

for(i in 1:length(catVariables)){
   hist(df[,catVariables[i]], main = paste('Hist. of',catVariables[i]),freq = TRUE ,xlab=catVariables[i],ylab = paste('Frequency of',catVariables[i]) )
}

Checking the classes of the categorical variables (note: this sapply() runs over the vector of variable names, so every entry reports the class of the name string, "character"; the data-frame columns themselves are checked after the factor conversion below):

sapply(catVariables,class)
##                  credit.rating                account.balance 
##                    "character"                    "character" 
## previous.credit.payment.status                 credit.purpose 
##                    "character"                    "character" 
##                        savings            employment.duration 
##                    "character"                    "character" 
##               installment.rate                 marital.status 
##                    "character"                    "character" 
##                      guarantor             residence.duration 
##                    "character"                    "character" 
##                 current.assets                  other.credits 
##                    "character"                    "character" 
##                 apartment.type                   bank.credits 
##                    "character"                    "character" 
##                     occupation                     dependents 
##                    "character"                    "character" 
##                      telephone                 foreign.worker 
##                    "character"                    "character"

Transforming categorical variables into factors:

df[names(df) %in% catVariables] = lapply(df[names(df) %in% catVariables], as.factor)

# Just checking if the above line works:

for(i in 1:length(catVariables)){
  a = class(df[1, catVariables[i]])
  cat(sprintf('Type of %s: %s.\n', catVariables[i], a))
}
## Type of credit.rating: factor.
## Type of account.balance: factor.
## Type of previous.credit.payment.status: factor.
## Type of credit.purpose: factor.
## Type of savings: factor.
## Type of employment.duration: factor.
## Type of installment.rate: factor.
## Type of marital.status: factor.
## Type of guarantor: factor.
## Type of residence.duration: factor.
## Type of current.assets: factor.
## Type of other.credits: factor.
## Type of apartment.type: factor.
## Type of bank.credits: factor.
## Type of occupation: factor.
## Type of dependents: factor.
## Type of telephone: factor.
## Type of foreign.worker: factor.
rm(a)
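
An equivalent one-line check, applied to the data-frame columns rather than to the vector of names (a sketch):

sapply(df[catVariables], class)   # every entry should now read "factor"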

Feature selection

Selecting the variable names that will be used and defining a feature-selection function:

features = c(numTransformed, catVariables)

featureSelection <- function(numIters = 100, featureVars, classVars){
  set.seed(3659)
  variableSizes <- 1:30                       # feature-subset sizes to evaluate
  control <- rfeControl(functions = rfFuncs,  # random-forest based ranking functions
                        method = 'cv',        # cross-validation
                        verbose = FALSE,
                        returnResamp = 'all',
                        number = numIters)    # number of CV folds
  resultsRFE <- rfe(x = featureVars,          # recursive feature elimination
                    y = classVars,
                    sizes = variableSizes,
                    rfeControl = control)
  return(resultsRFE)
}

Creating train and test partitions (note: these are not actually used below; the RFE runs on the full featureDF):

featureDF = df[,(names(df) %in% features)]
index = sample(1:nrow(featureDF),size = .75*nrow(featureDF))
trainFeature = featureDF[index,]
testFeature = featureDF[-index,]

Applying the user-defined featureSelection function and showing its result:

resultsRFE = featureSelection(featureVars = featureDF[,-1],  # predictors (credit.rating is the first column)
                              classVars = featureDF[,1])     # target: credit.rating
varImp(resultsRFE)
##                                      Overall
## account.balance                    26.246856
## credit.duration.months.transformed 13.277859
## previous.credit.payment.status     10.913693
## credit.amount.transformed           8.708697
## savings                             7.452023
## current.assets                      5.269912
## guarantor                           5.060788
## age.transformed                     4.161562
## employment.duration                 4.093572
## apartment.type                      3.849470
## credit.purpose                      3.839569
## occupation                          3.757153
## marital.status                      3.615173
## foreign.worker                      3.587833
## bank.credits                        3.563263
## installment.rate                    3.530525
## dependents                          3.100130
## telephone                           3.034385

Getting the variables with overall score greater than 2:

varImpOutput = t(varImp(resultsRFE))
mostValuable = varImpOutput[,varImpOutput>2]
mostValuableNames = names(mostValuable)

Generating test and train subsets for a first model:

firstModelDF = featureDF

set.seed(6521)

index = sample(1:nrow(firstModelDF),size = 0.75*nrow(firstModelDF))
trainFM = firstModelDF[index,]
testFM = firstModelDF[-index,]

Creating first logistic regression model:

firstModel = glm('credit.rating ~.', data=trainFM, family = 'binomial')
summary(firstModel)
## 
## Call:
## glm(formula = "credit.rating ~.", family = "binomial", data = trainFM)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6484  -0.7296   0.3898   0.7169   1.8190  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          1.5930     4.0552   0.393 0.694440    
## account.balance2                     0.4178     0.2474   1.689 0.091213 .  
## account.balance3                     1.5285     0.2473   6.182 6.33e-10 ***
## previous.credit.payment.status2      0.7080     0.3438   2.059 0.039451 *  
## previous.credit.payment.status3      1.2643     0.3762   3.361 0.000777 ***
## credit.purpose2                     -0.9148     0.4278  -2.138 0.032489 *  
## credit.purpose3                     -0.9181     0.4170  -2.202 0.027681 *  
## credit.purpose4                     -1.4528     0.4037  -3.599 0.000319 ***
## savings2                             0.4385     0.3179   1.379 0.167753    
## savings3                             0.8493     0.3767   2.254 0.024172 *  
## savings4                             0.9894     0.3114   3.177 0.001487 ** 
## employment.duration2                 0.4395     0.2675   1.643 0.100436    
## employment.duration3                 1.1045     0.3308   3.339 0.000840 ***
## employment.duration4                 0.4651     0.3090   1.505 0.132298    
## installment.rate2                   -0.3640     0.3596  -1.012 0.311482    
## installment.rate3                   -0.3662     0.3970  -0.922 0.356302    
## installment.rate4                   -0.6166     0.3619  -1.704 0.088373 .  
## marital.status3                      0.5279     0.2281   2.314 0.020650 *  
## marital.status4                      0.4954     0.3722   1.331 0.183164    
## guarantor2                           0.4426     0.3351   1.321 0.186618    
## residence.duration2                 -0.9230     0.3444  -2.680 0.007359 ** 
## residence.duration3                 -0.6445     0.3981  -1.619 0.105455    
## residence.duration4                 -0.5174     0.3424  -1.511 0.130713    
## current.assets2                     -0.3245     0.2872  -1.130 0.258403    
## current.assets3                     -0.2409     0.2658  -0.906 0.364844    
## current.assets4                     -0.9523     0.4734  -2.012 0.044265 *  
## other.credits2                       0.2215     0.2513   0.881 0.378116    
## apartment.type2                      0.6046     0.2698   2.241 0.025052 *  
## apartment.type3                      0.5583     0.5360   1.042 0.297614    
## bank.credits2                       -0.1977     0.2709  -0.730 0.465480    
## occupation2                         -0.4640     0.6880  -0.674 0.500066    
## occupation3                         -0.5524     0.6665  -0.829 0.407225    
## occupation4                         -0.3684     0.7087  -0.520 0.603179    
## dependents2                         -0.2672     0.2909  -0.918 0.358368    
## telephone2                           0.3757     0.2212   1.698 0.089446 .  
## foreign.worker2                      1.4841     0.7149   2.076 0.037896 *  
## credit.amount.transformed           -0.5659     1.1565  -0.489 0.624590    
## credit.duration.months.transformed  -2.6853     0.8033  -3.343 0.000829 ***
## age.transformed                      1.6210     1.5008   1.080 0.280098    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 931.04  on 749  degrees of freedom
## Residual deviance: 692.07  on 711  degrees of freedom
## AIC: 770.07
## 
## Number of Fisher Scoring iterations: 5
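
Since the coefficients are on the log-odds scale, exponentiating them gives odds ratios, which are often easier to read (a sketch using Wald confidence intervals):

# odds ratios with 95% Wald confidence intervals
exp(cbind(OR = coef(firstModel), confint.default(firstModel)))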

Predict:

predictFirstModel = predict(firstModel, testFM, type="response")
predictFirstModel = round(predictFirstModel)
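
Rounding the probabilities is essentially a 0.5 cutoff; an explicit version of the same step would be (predClassFM is just an illustrative name):

# classify with an explicit 0.5 threshold on the predicted probabilities
predClassFM = ifelse(predict(firstModel, testFM, type = "response") >= 0.5, 1, 0)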

Evaluating:

confusionMatrix(
  table(
    data=predictFirstModel,
    reference = testFM[,1]),
    positive = '1')
## Confusion Matrix and Statistics
## 
##     reference
## data   0   1
##    0  24  17
##    1  42 167
##                                           
##                Accuracy : 0.764           
##                  95% CI : (0.7064, 0.8152)
##     No Information Rate : 0.736           
##     P-Value [Acc > NIR] : 0.175868        
##                                           
##                   Kappa : 0.3087          
##                                           
##  Mcnemar's Test P-Value : 0.001781        
##                                           
##             Sensitivity : 0.9076          
##             Specificity : 0.3636          
##          Pos Pred Value : 0.7990          
##          Neg Pred Value : 0.5854          
##              Prevalence : 0.7360          
##          Detection Rate : 0.6680          
##    Detection Prevalence : 0.8360          
##       Balanced Accuracy : 0.6356          
##                                           
##        'Positive' Class : 1               
## 

Generating our second model, restricted to the variables selected above (overall importance greater than 2):

if(('credit.rating' %in% mostValuableNames)==FALSE){
  mostValuableNames = append(mostValuableNames, 'credit.rating')
}

secondModelDF = featureDF[,names(featureDF) %in% mostValuableNames]

set.seed(9721)

index = sample(1:nrow(secondModelDF),size = 0.75*nrow(secondModelDF))
trainSM = secondModelDF[index,]
testSM = secondModelDF[-index,]

if('credit.rating' %in% mostValuableNames){
  t = mostValuableNames[mostValuableNames != 'credit.rating']
  formula <- paste('credit.rating ~', str_c(t, collapse = ' + '))
  rm(t)
}else{
  formula <- paste('credit.rating ~', str_c(mostValuableNames, collapse = ' + '))
}
secondModel = glm(formula, data= trainSM, family = 'binomial')
summary(secondModel)
## 
## Call:
## glm(formula = formula, family = "binomial", data = trainSM)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6100  -0.6940   0.4161   0.7305   1.8367  
## 
## Coefficients:
##                                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                         5.1505216  3.8881548   1.325 0.185281    
## account.balance2                    0.2826920  0.2470994   1.144 0.252606    
## account.balance3                    1.4431639  0.2488900   5.798 6.70e-09 ***
## credit.duration.months.transformed -2.3281888  0.7714989  -3.018 0.002547 ** 
## previous.credit.payment.status2     0.7485760  0.3246103   2.306 0.021106 *  
## previous.credit.payment.status3     1.5094700  0.3594319   4.200 2.67e-05 ***
## credit.amount.transformed          -1.2086643  1.1189143  -1.080 0.280048    
## savings2                            0.0982942  0.3067908   0.320 0.748669    
## savings3                            0.6331030  0.3573445   1.772 0.076446 .  
## savings4                            1.1509229  0.3124389   3.684 0.000230 ***
## current.assets2                    -0.2625445  0.2876063  -0.913 0.361316    
## current.assets3                    -0.3264492  0.2708186  -1.205 0.228043    
## current.assets4                    -1.0903610  0.4514891  -2.415 0.015734 *  
## guarantor2                          0.5263160  0.3352922   1.570 0.116479    
## age.transformed                     0.3942900  1.4344286   0.275 0.783412    
## employment.duration2               -0.0008643  0.2598853  -0.003 0.997346    
## employment.duration3                0.4461284  0.3224981   1.383 0.166557    
## employment.duration4                0.1738148  0.2966347   0.586 0.557905    
## apartment.type2                     0.3312077  0.2558216   1.295 0.195430    
## apartment.type3                     0.4727864  0.5156360   0.917 0.359195    
## credit.purpose2                    -0.7279055  0.4326804  -1.682 0.092507 .  
## credit.purpose3                    -0.8335840  0.4098668  -2.034 0.041973 *  
## credit.purpose4                    -1.4766748  0.3961894  -3.727 0.000194 ***
## occupation2                        -0.0211708  0.7040273  -0.030 0.976010    
## occupation3                        -0.1161336  0.6774614  -0.171 0.863890    
## occupation4                        -0.2003285  0.7174270  -0.279 0.780067    
## marital.status3                     0.4674848  0.2306714   2.027 0.042701 *  
## marital.status4                     0.1606438  0.3437541   0.467 0.640270    
## foreign.worker2                     0.7744927  0.6493793   1.193 0.233000    
## bank.credits2                      -0.3391546  0.2559429  -1.325 0.185132    
## installment.rate2                   0.0079589  0.3609444   0.022 0.982408    
## installment.rate3                  -0.1662233  0.3918120  -0.424 0.671389    
## installment.rate4                  -0.5891871  0.3609464  -1.632 0.102608    
## dependents2                        -0.3841625  0.2793579  -1.375 0.169081    
## telephone2                          0.3525408  0.2289247   1.540 0.123564    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 900.53  on 749  degrees of freedom
## Residual deviance: 690.66  on 715  degrees of freedom
## AIC: 760.66
## 
## Number of Fisher Scoring iterations: 5
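
As a side note, the if/else block that assembles the formula string could be written more compactly with base R's reformulate() (a sketch producing an equivalent formula object; glm() accepts either form):

formula = reformulate(setdiff(mostValuableNames, 'credit.rating'),
                      response = 'credit.rating')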

Predicting and evaluating our second model:

predictSecondModel = predict(secondModel, testSM, type="response")
predictSecondModel = round(predictSecondModel)
confusionMatrix(
  table(
    data=predictSecondModel,
    reference = testSM[,1]),
    positive = '1')
## Confusion Matrix and Statistics
## 
##     reference
## data   0   1
##    0  35  18
##    1  49 148
##                                           
##                Accuracy : 0.732           
##                  95% CI : (0.6725, 0.7859)
##     No Information Rate : 0.664           
##     P-Value [Acc > NIR] : 0.0124691       
##                                           
##                   Kappa : 0.3391          
##                                           
##  Mcnemar's Test P-Value : 0.0002473       
##                                           
##             Sensitivity : 0.8916          
##             Specificity : 0.4167          
##          Pos Pred Value : 0.7513          
##          Neg Pred Value : 0.6604          
##              Prevalence : 0.6640          
##          Detection Rate : 0.5920          
##    Detection Prevalence : 0.7880          
##       Balanced Accuracy : 0.6541          
##                                           
##        'Positive' Class : 1               
## 

Creating a third model

Adapting our data: dropping credit.amount.transformed, which was not significant in the second model:

if(('credit.rating' %in% mostValuableNames)==FALSE){
  mostValuableNames = append(mostValuableNames, 'credit.rating')
}

thirdModelDF = featureDF[,names(featureDF) %in% mostValuableNames]
thirdModelDF$credit.amount.transformed = NULL 
featureTMNames = mostValuableNames[mostValuableNames!='credit.amount.transformed']

Train and test data:

set.seed(3217)
index = sample(1:nrow(thirdModelDF),size = 0.75*nrow(thirdModelDF))
trainTM = thirdModelDF[index,]
testTM = thirdModelDF[-index,]

Generating our third model:

if('credit.rating' %in% featureTMNames){
  t = featureTMNames[featureTMNames != 'credit.rating']
  formula <- paste('credit.rating ~', str_c(t, collapse = ' + '))
  rm(t)
}else{
  formula <- paste('credit.rating ~', str_c(featureTMNames, collapse = ' + '))
}
thirdModel = glm(formula, data= trainTM, family = 'binomial')
summary(thirdModel)
## 
## Call:
## glm(formula = formula, family = "binomial", data = trainTM)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5992  -0.7256   0.4109   0.7220   2.0543  
## 
## Coefficients:
##                                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                          0.2001     2.9402   0.068 0.945748    
## account.balance2                     0.2765     0.2428   1.139 0.254625    
## account.balance3                     1.4079     0.2415   5.830 5.53e-09 ***
## credit.duration.months.transformed  -2.8805     0.5981  -4.817 1.46e-06 ***
## previous.credit.payment.status2      1.1446     0.3315   3.453 0.000554 ***
## previous.credit.payment.status3      1.6068     0.3488   4.607 4.08e-06 ***
## savings2                             0.4195     0.3294   1.274 0.202822    
## savings3                             0.5828     0.3597   1.620 0.105194    
## savings4                             0.8522     0.3003   2.838 0.004543 ** 
## current.assets2                     -0.3942     0.2912  -1.354 0.175817    
## current.assets3                     -0.5048     0.2674  -1.888 0.059065 .  
## current.assets4                     -1.2258     0.4292  -2.856 0.004287 ** 
## guarantor2                           0.1508     0.3457   0.436 0.662750    
## age.transformed                      1.6366     1.4588   1.122 0.261886    
## employment.duration2                 0.1555     0.2541   0.612 0.540463    
## employment.duration3                 0.8065     0.3223   2.502 0.012335 *  
## employment.duration4                 0.4793     0.3032   1.581 0.113947    
## apartment.type2                      0.5662     0.2610   2.170 0.030044 *  
## apartment.type3                      0.7482     0.5030   1.487 0.136893    
## credit.purpose2                     -0.9603     0.4133  -2.323 0.020166 *  
## credit.purpose3                     -0.9952     0.3810  -2.612 0.008997 ** 
## credit.purpose4                     -1.2939     0.3736  -3.463 0.000534 ***
## occupation2                         -0.5420     0.7384  -0.734 0.462932    
## occupation3                         -0.6206     0.7126  -0.871 0.383819    
## occupation4                         -0.8325     0.7450  -1.117 0.263809    
## marital.status3                      0.3903     0.2279   1.712 0.086846 .  
## marital.status4                      0.2392     0.3477   0.688 0.491514    
## foreign.worker2                      1.1916     0.7139   1.669 0.095097 .  
## bank.credits2                       -0.2238     0.2522  -0.888 0.374806    
## installment.rate2                   -0.2265     0.3469  -0.653 0.513847    
## installment.rate3                   -0.5413     0.3763  -1.438 0.150334    
## installment.rate4                   -0.7102     0.3171  -2.239 0.025135 *  
## dependents2                         -0.3775     0.2860  -1.320 0.186942    
## telephone2                           0.2574     0.2230   1.155 0.248257    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 924.61  on 749  degrees of freedom
## Residual deviance: 699.29  on 716  degrees of freedom
## AIC: 767.29
## 
## Number of Fisher Scoring iterations: 5

Predicting values with our third model:

predictThirdModel = predict(thirdModel, testTM, type="response")
predictThirdModel = round(predictThirdModel)

Evaluating our third model:

confusionMatrix(
  table(
    data=predictThirdModel,
    reference = testTM[,1]),
    positive = '1')
## Confusion Matrix and Statistics
## 
##     reference
## data   0   1
##    0  34  16
##    1  36 164
##                                           
##                Accuracy : 0.792           
##                  95% CI : (0.7364, 0.8406)
##     No Information Rate : 0.72            
##     P-Value [Acc > NIR] : 0.005759        
##                                           
##                   Kappa : 0.4348          
##                                           
##  Mcnemar's Test P-Value : 0.008418        
##                                           
##             Sensitivity : 0.9111          
##             Specificity : 0.4857          
##          Pos Pred Value : 0.8200          
##          Neg Pred Value : 0.6800          
##              Prevalence : 0.7200          
##          Detection Rate : 0.6560          
##    Detection Prevalence : 0.8000          
##       Balanced Accuracy : 0.6984          
##                                           
##        'Positive' Class : 1               
## 

ROC curves for all of the models above:

rocitObjFM = rocit(score=predictFirstModel, class=testFM[,1])
rocitObjSM = rocit(score=predictSecondModel,class=testSM[,1])
rocitObjTM = rocit(score=predictThirdModel,class=testTM[,1])

plot(rocitObjFM, title = 'First Model')

plot(rocitObjSM, title = 'Second Model')

plot(rocitObjTM, title = 'Third Model')
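
Because the predictions were rounded to 0/1, these ROC curves only contain a single operating point. Passing the unrounded probabilities would draw the full curve; a sketch for the third model (probTM is an illustrative name, and the plot call mirrors the ones above):

probTM = predict(thirdModel, testTM, type = "response")   # keep the raw probabilities
plot(rocit(score = probTM, class = testTM[,1]), title = 'Third Model (probabilities)')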

Other evaluation plots (precrec basic measures):

precrecObjFM = evalmod(scores=predictFirstModel,labels=testFM[,1],mode = 'basic')

precrecObjSM = evalmod(scores=predictSecondModel,labels=testSM[,1],mode='basic') 

precrecObjTM = evalmod(scores=predictThirdModel,labels=testTM[,1],mode='basic')

autoplot(precrecObjFM) 

autoplot(precrecObjSM)

autoplot(precrecObjTM)