The German credit data contains attributes and outcomes on 1,000 loan applications. Lending that results in default is very costly and for this dataset, you will use logistic regression for determining the probability of default: . Use duration, amount, installment, and age in this analysis, along with loan history, purpose, and rent. . You need to use random selection for 900 cases to train the program, and then the other 100 cases will be used for testing. Test the program, and describe your results.
** Load data **
# chagne working directory to where german.csv is at
setwd("~/..")
# load german data
german <- read.csv("germancredit.csv", sep = "," , header = T )
# preview german data
head(german,3)
## Default checkingstatus1 duration history purpose amount savings employ
## 1 0 A11 6 A34 A43 1169 A65 A75
## 2 1 A12 48 A32 A43 5951 A61 A73
## 3 0 A14 12 A34 A46 2096 A61 A74
## installment status others residence property age otherplans housing
## 1 4 A93 A101 4 A121 67 A143 A152
## 2 2 A92 A101 2 A121 22 A143 A152
## 3 2 A93 A101 3 A121 49 A143 A152
## cards job liable tele foreign
## 1 2 A173 1 A192 A201
## 2 1 A173 1 A191 A201
## 3 1 A172 2 A191 A201
## Q1) Impute columns to duration, amount, installment, and age in this analysis, along with loan history, purpose, and rent
german <- german[, c('duration', 'amount', 'installment', 'age', 'history', 'purpose', 'housing','Default')]
# check if all columns are numeric
str(german)
## 'data.frame': 1000 obs. of 8 variables:
## $ duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ installment: int 4 2 2 2 3 2 3 2 2 4 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ history : Factor w/ 5 levels "A30","A31","A32",..: 5 3 5 3 4 3 3 3 3 5 ...
## $ purpose : Factor w/ 10 levels "A40","A41","A410",..: 5 5 8 4 1 8 4 2 5 1 ...
## $ housing : Factor w/ 3 levels "A151","A152",..: 2 2 2 3 3 3 2 1 2 2 ...
## $ Default : int 0 1 0 0 1 0 0 0 0 1 ...
# the result indicate history, purpose and housing are not numeric
# create function to convert non-numeric data.frame columns to numeric
as.numeric_from_fractor <- function(x) if(is.factor(x)) as.integer(as.factor(x)) else x
# apply the function to convert german data.frame
german[] <- lapply(german, as.numeric_from_fractor)
# check data frame again
str(german)
## 'data.frame': 1000 obs. of 8 variables:
## $ duration : int 6 48 12 42 24 36 24 36 12 30 ...
## $ amount : int 1169 5951 2096 7882 4870 9055 2835 6948 3059 5234 ...
## $ installment: int 4 2 2 2 3 2 3 2 2 4 ...
## $ age : int 67 22 49 45 53 35 53 35 61 28 ...
## $ history : int 5 3 5 3 4 3 3 3 3 5 ...
## $ purpose : int 5 5 8 4 1 8 4 2 5 1 ...
## $ housing : int 2 2 2 3 3 3 2 1 2 2 ...
## $ Default : int 0 1 0 0 1 0 0 0 0 1 ...
head(german)
## duration amount installment age history purpose housing Default
## 1 6 1169 4 67 5 5 2 0
## 2 48 5951 2 22 3 5 2 1
## 3 12 2096 2 49 5 8 2 0
## 4 42 7882 2 45 3 4 3 0
## 5 24 4870 3 53 4 1 3 1
## 6 36 9055 2 35 3 8 3 0
1. Split Training and Testing Split training and testing datasets in the ratio of 90 - 10.
# if caTools library does not exist, then install
if (!require("caTools")) install.packages("caTools", dependencies=TRUE)
## Loading required package: caTools
# load library
library(caTools)
# spliting the german data
split = sample.split(german$Default, SplitRatio = 0.9)
train_german = subset(german, split == TRUE)
test_german = subset(german, split == FALSE)
# test the output
nrow(train_german)
## [1] 900
nrow(test_german)
## [1] 100
2. Train Logistric Regression Model.
# create model
german_LR <- glm(Default~.,data=train_german,family=binomial())
# display model details
summary(german_LR)
##
## Call:
## glm(formula = Default ~ ., family = binomial(), data = train_german)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8007 -0.8606 -0.6200 1.1214 2.3255
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.019e-01 4.716e-01 0.216 0.828850
## duration 2.948e-02 8.486e-03 3.475 0.000512 ***
## amount 6.566e-05 3.814e-05 1.721 0.085181 .
## installment 2.497e-01 7.984e-02 3.127 0.001764 **
## age -1.404e-02 7.621e-03 -1.842 0.065538 .
## history -4.792e-01 7.741e-02 -6.190 6.02e-10 ***
## purpose -2.475e-02 2.893e-02 -0.855 0.392369
## housing -1.680e-01 1.544e-01 -1.088 0.276541
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1099.56 on 899 degrees of freedom
## Residual deviance: 998.88 on 892 degrees of freedom
## AIC: 1014.9
##
## Number of Fisher Scoring iterations: 4
#The summary shows only duration, amaount, installment, and history are significant with 95% confidence interval
3. Test Logistric Regression Model.
# if lmtest library does not exist, then install
if (!require("lmtest")) install.packages("lmtest", dependencies=TRUE)
## Loading required package: lmtest
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# if lmtest library does not exist, then install
if (!require("rcompanion")) install.packages("rcompanion", dependencies=TRUE)
## Loading required package: rcompanion
# if caret library does not exist, then install
if (!require("caret")) install.packages("caret", dependencies=TRUE)
## Loading required package: caret
## Loading required package: lattice
## Loading required package: ggplot2
# load library
library(lmtest)
library(rcompanion)
library(caret)
# prediction: predicted default probabilities for cases in test set
predicted_test <- predict(german_LR,newdata=test_german,type="response")
data.frame(test_german$Default,predicted_test)[1:10,] # [actual, predicted]
## test_german.Default predicted_test
## 6 0 0.4077161
## 30 1 0.4217757
## 41 0 0.4551502
## 53 0 0.2767963
## 60 1 0.3978254
## 65 0 0.4390630
## 92 0 0.1200268
## 94 0 0.2895190
## 96 1 0.8558581
## 99 0 0.2601376
# confusion matrix
#table(test_german$Default,floor(predicted_test+0.5))
confusionMatrix(data = as.factor(predicted_test>0.5), reference = as.factor(test_german$Default>0.5))
## Confusion Matrix and Statistics
##
## Reference
## Prediction FALSE TRUE
## FALSE 65 22
## TRUE 5 8
##
## Accuracy : 0.73
## 95% CI : (0.632, 0.8139)
## No Information Rate : 0.7
## P-Value [Acc > NIR] : 0.296366
##
## Kappa : 0.233
## Mcnemar's Test P-Value : 0.002076
##
## Sensitivity : 0.9286
## Specificity : 0.2667
## Pos Pred Value : 0.7471
## Neg Pred Value : 0.6154
## Prevalence : 0.7000
## Detection Rate : 0.6500
## Detection Prevalence : 0.8700
## Balanced Accuracy : 0.5976
##
## 'Positive' Class : FALSE
##
# Analysis of variance $
anova(german_LR, test="Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: Default
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 899 1099.56
## duration 1 40.724 898 1058.83 1.753e-10 ***
## amount 1 0.171 897 1058.66 0.679216
## installment 1 6.196 896 1052.46 0.012801 *
## age 1 10.414 895 1042.05 0.001251 **
## history 1 41.310 894 1000.74 1.299e-10 ***
## purpose 1 0.678 893 1000.06 0.410348
## housing 1 1.184 892 998.88 0.276484
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Pseudo-R-squared
nagelkerke(german_LR)
## $Models
##
## Model: "glm, Default ~ ., binomial(), train_german"
## Null: "glm, Default ~ 1, binomial(), train_german"
##
## $Pseudo.R.squared.for.model.vs.null
## Pseudo.R.squared
## McFadden 0.0915622
## Cox and Snell (ML) 0.1058340
## Nagelkerke (Cragg and Uhler) 0.1500600
##
## $Likelihood.ratio.test
## Df.diff LogLik.diff Chisq p.value
## -7 -50.339 100.68 7.8156e-19
##
## $Number.of.observations
##
## Model: 900
## Null: 900
##
## $Messages
## [1] "Note: For models fit with REML, these statistics are based on refitting with ML"
##
## $Warnings
## [1] "None"
# Overall p-value for model
anova(german_LR,update(german_LR, ~1), test="Chisq")
## Analysis of Deviance Table
##
## Model 1: Default ~ duration + amount + installment + age + history + purpose +
## housing
## Model 2: Default ~ 1
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 892 998.88
## 2 899 1099.56 -7 -100.68 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
4. Plot Logistic Regression and its Performance.
# if ROCR library does not exist, then install
if (!require("ROCR")) install.packages("ROCR", dependencies=TRUE)
## Loading required package: ROCR
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# load library
library(ROCR)
LR_pred <- predict(german_LR,type='response',newdata=subset(german))
german_pred <- prediction(LR_pred,german$Default)
german_perf <- performance(german_pred,"tpr","fpr")
# Plot Logistic Regression Model for German Credit
plot(german_perf,col="red")