R Markdown

The German credit data contain attributes and outcomes for 1,000 loan applications. Lending that ends in default is very costly, so for this dataset you will use logistic regression to estimate the probability of default, P(Default = 1 | x) = 1 / (1 + exp(-x'β)). Use duration, amount, installment, and age in this analysis, along with loan history, purpose, and rent. Randomly select 900 cases to train the model, hold out the remaining 100 cases for testing, test the model, and describe your results.

**Load data**

# change working directory to where germancredit.csv is located
setwd("~/..")

# load german data
german <- read.csv("germancredit.csv", sep = ",", header = TRUE)

# preview german data
head(german,3)
##   Default checkingstatus1 duration history purpose amount savings employ
## 1       0             A11        6     A34     A43   1169     A65    A75
## 2       1             A12       48     A32     A43   5951     A61    A73
## 3       0             A14       12     A34     A46   2096     A61    A74
##   installment status others residence property age otherplans housing
## 1           4    A93   A101         4     A121  67       A143    A152
## 2           2    A92   A101         2     A121  22       A143    A152
## 3           2    A93   A101         3     A121  49       A143    A152
##   cards  job liable tele foreign
## 1     2 A173      1 A192    A201
## 2     1 A173      1 A191    A201
## 3     1 A172      2 A191    A201
## Q1) Keep only duration, amount, installment, and age, along with loan history, purpose, and housing (the rent indicator), plus the Default outcome
german <- german[, c('duration', 'amount', 'installment', 'age', 'history', 'purpose', 'housing','Default')]

# check if all columns are numeric
str(german)
## 'data.frame':    1000 obs. of  8 variables:
##  $ duration   : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ amount     : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ installment: int  4 2 2 2 3 2 3 2 2 4 ...
##  $ age        : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ history    : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
##  $ purpose    : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
##  $ housing    : Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
##  $ Default    : int  0 1 0 0 1 0 0 0 0 1 ...
# the result indicates that history, purpose, and housing are not numeric
# helper function: convert factor columns to integer codes, leave other columns unchanged
as.numeric_from_factor <- function(x) if (is.factor(x)) as.integer(x) else x

# apply the helper to every column of the german data frame
german[] <- lapply(german, as.numeric_from_factor)

# check data frame again
str(german)
## 'data.frame':    1000 obs. of  8 variables:
##  $ duration   : int  6 48 12 42 24 36 24 36 12 30 ...
##  $ amount     : int  1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
##  $ installment: int  4 2 2 2 3 2 3 2 2 4 ...
##  $ age        : int  67 22 49 45 53 35 53 35 61 28 ...
##  $ history    : int  5 3 5 3 4 3 3 3 3 5 ...
##  $ purpose    : int  5 5 8 4 1 8 4 2 5 1 ...
##  $ housing    : int  2 2 2 3 3 3 2 1 2 2 ...
##  $ Default    : int  0 1 0 0 1 0 0 0 0 1 ...
head(german)
##   duration amount installment age history purpose housing Default
## 1        6   1169           4  67       5       5       2       0
## 2       48   5951           2  22       3       5       2       1
## 3       12   2096           2  49       5       8       2       0
## 4       42   7882           2  45       3       4       3       0
## 5       24   4870           3  53       4       1       3       1
## 6       36   9055           2  35       3       8       3       0
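As an aside, the integer recoding above treats the category codes of history, purpose, and housing as ordered numeric values. An alternative (not used in this analysis) is to keep those columns as factors, since glm() dummy-codes factor predictors automatically; a minimal sketch, assuming the same germancredit.csv file:

# alternative sketch: keep the categorical columns as factors and let
# glm() create dummy variables for them automatically
german_raw <- read.csv("germancredit.csv", header = TRUE, stringsAsFactors = TRUE)
german_raw <- german_raw[, c('duration', 'amount', 'installment', 'age',
                             'history', 'purpose', 'housing', 'Default')]
summary(glm(Default ~ ., data = german_raw, family = binomial()))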

1. Split Training and Testing. Split the data into training and testing sets in a 90:10 ratio.

# if caTools library does not exist, then install 
if (!require("caTools")) install.packages("caTools", dependencies=TRUE)
## Loading required package: caTools
# load library
library(caTools)


# split the german data into 90% training and 10% testing, stratified on Default
split <- sample.split(german$Default, SplitRatio = 0.9)
train_german <- subset(german, split == TRUE)
test_german <- subset(german, split == FALSE)

# test the output
nrow(train_german)
## [1] 900
nrow(test_german)
## [1] 100
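Note that sample.split() draws the 900 training cases at random, so the exact split, and therefore the results that follow, will vary from run to run unless a seed is set. A minimal sketch for making the split reproducible:

# optional: fix the random seed before splitting so the 900/100 split
# (and all downstream results) can be reproduced exactly; 123 is arbitrary
set.seed(123)
split <- sample.split(german$Default, SplitRatio = 0.9)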

2. Train the Logistic Regression Model.

# fit the logistic regression model on the training set
german_LR <- glm(Default ~ ., data = train_german, family = binomial())

# display model details
summary(german_LR)
## 
## Call:
## glm(formula = Default ~ ., family = binomial(), data = train_german)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8007  -0.8606  -0.6200   1.1214   2.3255  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  1.019e-01  4.716e-01   0.216 0.828850    
## duration     2.948e-02  8.486e-03   3.475 0.000512 ***
## amount       6.566e-05  3.814e-05   1.721 0.085181 .  
## installment  2.497e-01  7.984e-02   3.127 0.001764 ** 
## age         -1.404e-02  7.621e-03  -1.842 0.065538 .  
## history     -4.792e-01  7.741e-02  -6.190 6.02e-10 ***
## purpose     -2.475e-02  2.893e-02  -0.855 0.392369    
## housing     -1.680e-01  1.544e-01  -1.088 0.276541    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1099.56  on 899  degrees of freedom
## Residual deviance:  998.88  on 892  degrees of freedom
## AIC: 1014.9
## 
## Number of Fisher Scoring iterations: 4
# The summary shows that only duration, installment, and history are significant at the 5% level; amount and age are marginally significant (p < 0.1)
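Because the coefficients are on the log-odds scale, one way (not part of the original output) to make them easier to interpret is to exponentiate them into odds ratios, here with Wald-type confidence intervals:

# odds ratios with Wald-type 95% confidence intervals; values above 1
# increase the odds of default, values below 1 decrease them
exp(cbind(OddsRatio = coef(german_LR), confint.default(german_LR)))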

3. Test the Logistic Regression Model.

# if lmtest library does not exist, then install 
if (!require("lmtest")) install.packages("lmtest", dependencies=TRUE)
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# if rcompanion library does not exist, then install 
if (!require("rcompanion")) install.packages("rcompanion", dependencies=TRUE)
## Loading required package: rcompanion
# if caret library does not exist, then install 
if (!require("caret")) install.packages("caret", dependencies=TRUE)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
# load library
library(lmtest)
library(rcompanion)
library(caret)


# prediction: predicted default probabilities for cases in test set
predicted_test <- predict(german_LR,newdata=test_german,type="response")
data.frame(test_german$Default,predicted_test)[1:10,] # [actual, predicted]
##    test_german.Default predicted_test
## 6                    0      0.4077161
## 30                   1      0.4217757
## 41                   0      0.4551502
## 53                   0      0.2767963
## 60                   1      0.3978254
## 65                   0      0.4390630
## 92                   0      0.1200268
## 94                   0      0.2895190
## 96                   1      0.8558581
## 99                   0      0.2601376
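As a quick sanity check (not required by the assignment), the predicted probabilities can be averaged within each actual outcome; a useful model should assign higher average probabilities to the actual defaulters:

# mean predicted default probability by actual outcome in the test set
tapply(predicted_test, test_german$Default, mean)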
# confusion matrix at a 0.5 probability cutoff
#table(test_german$Default,floor(predicted_test+0.5))
confusionMatrix(data = as.factor(predicted_test>0.5), reference = as.factor(test_german$Default>0.5))
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction FALSE TRUE
##      FALSE    65   22
##      TRUE      5    8
##                                          
##                Accuracy : 0.73           
##                  95% CI : (0.632, 0.8139)
##     No Information Rate : 0.7            
##     P-Value [Acc > NIR] : 0.296366       
##                                          
##                   Kappa : 0.233          
##  Mcnemar's Test P-Value : 0.002076       
##                                          
##             Sensitivity : 0.9286         
##             Specificity : 0.2667         
##          Pos Pred Value : 0.7471         
##          Neg Pred Value : 0.6154         
##              Prevalence : 0.7000         
##          Detection Rate : 0.6500         
##    Detection Prevalence : 0.8700         
##       Balanced Accuracy : 0.5976         
##                                          
##        'Positive' Class : FALSE          
## 
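The confusion matrix above classifies a case as a default when its predicted probability exceeds 0.5. A brief sketch of how the same accuracy can be recomputed by hand, and how a lower cutoff (0.3 here, chosen purely for illustration) would trade false alarms for catching more of the costly defaults:

# accuracy at the 0.5 cutoff, computed directly
pred_class <- as.integer(predicted_test > 0.5)
mean(pred_class == test_german$Default)

# a lower cutoff flags more applicants as likely defaulters
table(actual = test_german$Default, predicted = as.integer(predicted_test > 0.3))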
# Analysis of deviance (sequential chi-squared tests for each predictor)
anova(german_LR, test="Chisq")
## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: Default
## 
## Terms added sequentially (first to last)
## 
## 
##             Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                          899    1099.56              
## duration     1   40.724       898    1058.83 1.753e-10 ***
## amount       1    0.171       897    1058.66  0.679216    
## installment  1    6.196       896    1052.46  0.012801 *  
## age          1   10.414       895    1042.05  0.001251 ** 
## history      1   41.310       894    1000.74 1.299e-10 ***
## purpose      1    0.678       893    1000.06  0.410348    
## housing      1    1.184       892     998.88  0.276484    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pseudo-R-squared
nagelkerke(german_LR)
## $Models
##                                                    
## Model: "glm, Default ~ ., binomial(), train_german"
## Null:  "glm, Default ~ 1, binomial(), train_german"
## 
## $Pseudo.R.squared.for.model.vs.null
##                              Pseudo.R.squared
## McFadden                            0.0915622
## Cox and Snell (ML)                  0.1058340
## Nagelkerke (Cragg and Uhler)        0.1500600
## 
## $Likelihood.ratio.test
##  Df.diff LogLik.diff  Chisq    p.value
##       -7     -50.339 100.68 7.8156e-19
## 
## $Number.of.observations
##           
## Model: 900
## Null:  900
## 
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
## 
## $Warnings
## [1] "None"
# Overall p-value for model
anova(german_LR,update(german_LR, ~1), test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: Default ~ duration + amount + installment + age + history + purpose + 
##     housing
## Model 2: Default ~ 1
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1       892     998.88                          
## 2       899    1099.56 -7  -100.68 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
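The lmtest package loaded earlier provides an equivalent overall test; as a cross-check (a sketch, not part of the original output), lrtest() should reproduce the chi-squared statistic above:

# likelihood-ratio test of the fitted model against the intercept-only model
lrtest(german_LR, update(german_LR, . ~ 1))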

4. Plot the Logistic Regression Model and its Performance (ROC Curve).

# if ROCR library does not exist, then install 
if (!require("ROCR")) install.packages("ROCR", dependencies=TRUE)
## Loading required package: ROCR
## Loading required package: gplots
## 
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
## 
##     lowess
# load library
library(ROCR)

# predicted default probabilities for all 1,000 cases (training and test combined)
LR_pred <- predict(german_LR, type = 'response', newdata = german)
german_pred <- prediction(LR_pred, german$Default)
german_perf <- performance(german_pred, "tpr", "fpr")

# plot the ROC curve (true positive rate vs. false positive rate) for the German credit model
plot(german_perf, col = "red")
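To complete the picture, a hedged sketch that adds a chance-level reference line to the ROC plot and computes the area under the curve (AUC) with ROCR:

# add the 45-degree chance line to the ROC plot
abline(a = 0, b = 1, lty = 2, col = "gray")

# area under the ROC curve for the same predictions
german_auc <- performance(german_pred, measure = "auc")
german_auc@y.values[[1]]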