This project predict people described by a set of attributes will have good or bad credit risks

Data

The data for this problem is taken from UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)). The original dataset contains 1000 entries with 20 categorial/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes a credit by a bank. Each person is classified as good or bad credit risks according to the set of attributes.

Approach

  • EDA
  • correlation between these categorical variables and the response variable
  • Machine Learning algorithms to select best model for classification customers into good or bad credit.
library(dplyr)
library(glmnet)
library(ROCR)
library(PRROC)
library(boot)
library(rpart)
library(rpart.plot)
library(knitr)
library(dplyr)
library(tidyr)
library(reshape2)
library(RColorBrewer)
library(GGally)
library(ggplot2)
library(boot)
library(verification)

set.seed(1234)
credit.data <- read.table("D:/Course/Data Mining/Data Mining 1/Homework/german.data")

We get the data from the link. We need to provide names for the columns and change the response labels to 1 and 0: 0 corresponding to a good credit record and 1 corresponding to a bad one (positive class).

colnames(credit.data)=c("chk_acct","duration","credit_his","purpose","amount","saving_acct","present_emp","installment_rate","sex","other_debtor","present_resid","property","age","other_install","housing","n_credits","job","n_people","telephone","foreign","response")

#orginal response coding 1= good, 2 = bad
#we need 0 = good, 1 = bad

credit.data$response = credit.data$response - 1

Data Structure

There is a total on 21 attributes in the dataset. Their descriptions and details have been tabulated below:

We take the summary statistics of the dataset, the dataset has a total of 1000 observations with 21 variables, out of which 8 are numerical variables including the response and 13 are categorical variables with various levels. The summary statistics for the variables have been presented

  • Status of existing checking account.
  • Duration in month
  • Credit history
  • Purpose
  • Credit amount
  • Savings account/bonds
  • Present employment since
  • Installment rate in percentage of disposable income
  • Personal status and sex
  • Other debtors / guarantors
  • Present residence since
  • Property
  • Age in years
  • Other installment plans
  • Housing
  • Number of existing credits at this bank
  • Job
  • Number of people being liable to provide maintenance for
  • Telephone
  • foreign worker
glimpse(credit.data)
## Observations: 1,000
## Variables: 21
## $ chk_acct         <fct> A11, A12, A14, A11, A11, A14, A14, A12, A14, A12, ...
## $ duration         <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12,...
## $ credit_his       <fct> A34, A32, A34, A32, A33, A32, A32, A32, A32, A34, ...
## $ purpose          <fct> A43, A43, A46, A42, A40, A46, A42, A41, A43, A40, ...
## $ amount           <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 30...
## $ saving_acct      <fct> A65, A61, A61, A61, A61, A65, A63, A61, A64, A61, ...
## $ present_emp      <fct> A75, A73, A74, A74, A73, A73, A75, A73, A74, A71, ...
## $ installment_rate <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, 4, 4,...
## $ sex              <fct> A93, A92, A93, A93, A93, A93, A93, A93, A91, A94, ...
## $ other_debtor     <fct> A101, A101, A101, A103, A101, A101, A101, A101, A1...
## $ present_resid    <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, 2, 4,...
## $ property         <fct> A121, A121, A121, A122, A124, A124, A122, A123, A1...
## $ age              <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22...
## $ other_install    <fct> A143, A143, A143, A143, A143, A143, A143, A143, A1...
## $ housing          <fct> A152, A152, A152, A153, A153, A153, A152, A151, A1...
## $ n_credits        <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2,...
## $ job              <fct> A173, A173, A172, A173, A173, A172, A173, A174, A1...
## $ n_people         <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ telephone        <fct> A192, A191, A191, A191, A191, A192, A191, A192, A1...
## $ foreign          <fct> A201, A201, A201, A201, A201, A201, A201, A201, A2...
## $ response         <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,...
#converting response to factor

credit.data$response <- as.factor(credit.data$response)

summary(credit.data)
##  chk_acct     duration    credit_his    purpose        amount      saving_acct
##  A11:274   Min.   : 4.0   A30: 40    A43    :280   Min.   :  250   A61:603    
##  A12:269   1st Qu.:12.0   A31: 49    A40    :234   1st Qu.: 1366   A62:103    
##  A13: 63   Median :18.0   A32:530    A42    :181   Median : 2320   A63: 63    
##  A14:394   Mean   :20.9   A33: 88    A41    :103   Mean   : 3271   A64: 48    
##            3rd Qu.:24.0   A34:293    A49    : 97   3rd Qu.: 3972   A65:183    
##            Max.   :72.0              A46    : 50   Max.   :18424              
##                                      (Other): 55                              
##  present_emp installment_rate  sex      other_debtor present_resid   property  
##  A71: 62     Min.   :1.000    A91: 50   A101:907     Min.   :1.000   A121:282  
##  A72:172     1st Qu.:2.000    A92:310   A102: 41     1st Qu.:2.000   A122:232  
##  A73:339     Median :3.000    A93:548   A103: 52     Median :3.000   A123:332  
##  A74:174     Mean   :2.973    A94: 92                Mean   :2.845   A124:154  
##  A75:253     3rd Qu.:4.000                           3rd Qu.:4.000             
##              Max.   :4.000                           Max.   :4.000             
##                                                                                
##       age        other_install housing      n_credits       job     
##  Min.   :19.00   A141:139      A151:179   Min.   :1.000   A171: 22  
##  1st Qu.:27.00   A142: 47      A152:713   1st Qu.:1.000   A172:200  
##  Median :33.00   A143:814      A153:108   Median :1.000   A173:630  
##  Mean   :35.55                            Mean   :1.407   A174:148  
##  3rd Qu.:42.00                            3rd Qu.:2.000             
##  Max.   :75.00                            Max.   :4.000             
##                                                                     
##     n_people     telephone  foreign    response
##  Min.   :1.000   A191:596   A201:963   0:700   
##  1st Qu.:1.000   A192:404   A202: 37   1:300   
##  Median :1.000                                 
##  Mean   :1.155                                 
##  3rd Qu.:1.000                                 
##  Max.   :2.000                                 
## 

Exploratory Data Analysis

Continuous Variables

We get the following insights from our EDA of continuous variables:

  • From the age variable, we see that the median value for bad records is lesser than that of good records, it might be premature to say young people tend to have bad credit records, but we can safely assume it tends to be riskier.
  • The installment_rate variable has a great deal of difference between the good and bad records, we see that bad records have almost the double median value than good ones.
  • The median value and the range of the duration variables appears to be on the higher side of bad records as compared to good records
  • For the amount variable, we observe that the amount for bad records is larger in general as compared to good ones
  • We further built on this by plotting the density curve along the vertical line for their mean value and find that there is a great deal of difference for the duration as well as amount variable.

Duration

amount.mean = credit.data %>% dplyr::select(amount, response) %>% group_by(response) %>% summarise(m =mean(amount))
duration.mean = credit.data %>% dplyr::select(duration, response) %>%group_by(response) %>% summarise( m =mean(duration))

ggplot(credit.data, aes(duration, fill=response)) + 
  geom_density(alpha=.5) 

test.m = credit.data[,c(2,5,8,13,16,18,21)]
test.m$response <- as.numeric(test.m$response)
ggplot(melt(credit.data[,c(2,21)]), aes(x = variable, y = value, fill = response)) + geom_boxplot() + xlab("response") + ylab("duration")
## Using response as id variables

Installment Rate

ggplot(credit.data, aes(factor(installment_rate), ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") + xlab("Installment Rates")

Amount

ggplot(credit.data, aes(amount, fill=response)) + 
  geom_density(alpha=.5) 

ggplot(melt(credit.data[,c(5,21)]), aes(x = variable, y = value, fill = response)) + 
geom_boxplot() + xlab("response") + ylab("amount")
## Using response as id variables

Age

ggplot(melt(credit.data[,c(13,21)]), aes(x = variable, y = value, fill = response)) + 
geom_boxplot()+ xlab("response") + ylab("age")
## Using response as id variables

n_credits

ggplot(melt(credit.data[,c(16,21)]), aes(x = variable, y = value, fill = response)) + 
geom_boxplot()
## Using response as id variables

Categorical Variables

We get the following insights from our EDA of categorical variables:

  • For chk_acct we see that, the current status of the checking account matters as the frequency of the response variables is seen to differ from one sub category to another, overall A11 houses more number of bad credit records and A14 the least
  • For credit_his, we observe that proportion of the response variable varies significantly, for categories A30, A31 we see the number of bad credit records are greater.
  • For the purpose variable, we observe that the proportion of good and bad credit record varies also overall A44, A45, A410 and A46 seem to include more risky records.
  • We also observe these trends in other variables like sex, other_debtor, saving_acct, other_install and foreign. Overall, the trend looks significant in saving_acct, purpose, credit_his and chk_acct as compared to others.

chk_acct

ggplot(credit.data, aes(chk_acct, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

credit_hist

ggplot(credit.data, aes(credit_his, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

purpose

ggplot(credit.data, aes(purpose, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

Saving_acct

ggplot(credit.data, aes(saving_acct, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

other_debtor

ggplot(credit.data, aes(other_debtor, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

sex

ggplot(credit.data, aes(sex, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

other_install

ggplot(credit.data, aes(other_install, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

foreign

ggplot(credit.data, aes(foreign, ..count..)) + 
  geom_bar(aes(fill = response), position = "dodge") 

Machine Learning Algorithms

# Sampling data

index <- sample(nrow(credit.data),nrow(credit.data)*0.70)
credit.train = credit.data[index,]
credit.test = credit.data[-index,]


creditcost <- function(observed, predicted){
  weight1 = 5
  weight0 = 1
  c1 = (observed==1)&(predicted == 0) #logical vector - true if actual 1 but predict 0
  c0 = (observed==0)&(predicted == 1) #logical vector - true if actual 0 but predict 1
  return(mean(weight1*c1+weight0*c0))
}

Logistic Regression

Key insights

  • As per analysis, Step AIC logit model was best fit is used
  • MR is 0.27 and AUC is 0.85
credit.glm.logit<- glm(response~., family=binomial, data=credit.train)
credit.glm.back.AIC <- step(credit.glm.logit) # backward selection (if you don't specify anything)
## Start:  AIC=669.94
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + job + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - job               3   574.14 666.14
## - n_people          1   571.98 667.98
## - present_resid     1   572.01 668.01
## - property          3   577.69 669.69
## - age               1   573.73 669.73
## <none>                  571.94 669.94
## - present_emp       4   580.21 670.21
## - sex               3   579.28 671.28
## - other_install     2   577.57 671.57
## - n_credits         1   575.72 671.72
## - foreign           1   575.77 671.77
## - housing           2   578.20 672.20
## - duration          1   576.30 672.30
## - purpose           9   594.05 674.05
## - telephone         1   578.55 674.55
## - other_debtor      2   583.50 677.50
## - installment_rate  1   581.72 677.72
## - amount            1   582.65 678.65
## - saving_acct       4   588.92 678.92
## - credit_his        4   599.06 689.06
## - chk_acct          3   623.27 715.27
## 
## Step:  AIC=666.14
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + n_people + telephone + foreign
## 
##                    Df Deviance    AIC
## - n_people          1   574.19 664.19
## - present_resid     1   574.21 664.21
## - present_emp       4   581.13 665.13
## - property          3   579.64 665.64
## - age               1   576.03 666.03
## <none>                  574.14 666.14
## - n_credits         1   577.53 667.53
## - other_install     2   579.68 667.68
## - sex               3   581.88 667.88
## - foreign           1   578.33 668.33
## - duration          1   578.51 668.51
## - housing           2   580.59 668.59
## - purpose           9   595.39 669.39
## - telephone         1   580.82 670.82
## - other_debtor      2   585.79 673.79
## - installment_rate  1   585.01 675.01
## - saving_acct       4   592.12 676.12
## - amount            1   586.16 676.16
## - credit_his        4   600.54 684.54
## - chk_acct          3   624.32 710.32
## 
## Step:  AIC=664.19
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     present_resid + property + age + other_install + housing + 
##     n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - present_resid     1   574.26 662.26
## - present_emp       4   581.14 663.14
## - property          3   579.65 663.65
## - age               1   576.07 664.07
## <none>                  574.19 664.19
## - n_credits         1   577.70 665.70
## - other_install     2   579.77 665.77
## - sex               3   581.93 665.93
## - foreign           1   578.39 666.39
## - duration          1   578.53 666.53
## - housing           2   580.60 666.60
## - purpose           9   595.48 667.48
## - telephone         1   580.90 668.90
## - other_debtor      2   585.79 671.79
## - installment_rate  1   585.01 673.01
## - saving_acct       4   592.12 674.12
## - amount            1   586.17 674.17
## - credit_his        4   601.07 683.07
## - chk_acct          3   624.66 708.66
## 
## Step:  AIC=662.26
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + present_emp + installment_rate + sex + other_debtor + 
##     property + age + other_install + housing + n_credits + telephone + 
##     foreign
## 
##                    Df Deviance    AIC
## - present_emp       4   581.16 661.16
## - property          3   579.72 661.72
## - age               1   576.08 662.08
## <none>                  574.26 662.26
## - other_install     2   579.81 663.81
## - n_credits         1   577.87 663.87
## - sex               3   581.93 663.93
## - foreign           1   578.52 664.52
## - duration          1   578.65 664.65
## - housing           2   581.40 665.40
## - purpose           9   595.48 665.48
## - telephone         1   580.91 666.91
## - other_debtor      2   585.87 669.87
## - installment_rate  1   585.08 671.08
## - saving_acct       4   592.14 672.14
## - amount            1   586.17 672.17
## - credit_his        4   601.07 681.07
## - chk_acct          3   625.10 707.10
## 
## Step:  AIC=661.16
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + installment_rate + sex + other_debtor + property + 
##     age + other_install + housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - property          3   586.85 660.85
## - age               1   582.99 660.99
## <none>                  581.16 661.16
## - n_credits         1   584.34 662.34
## - other_install     2   586.57 662.57
## - duration          1   584.59 662.59
## - purpose           9   601.57 663.57
## - foreign           1   585.80 663.80
## - housing           2   587.85 663.85
## - sex               3   589.91 663.91
## - telephone         1   588.55 666.55
## - other_debtor      2   593.22 669.22
## - installment_rate  1   591.84 669.84
## - amount            1   593.38 671.38
## - saving_acct       4   600.08 672.08
## - credit_his        4   608.54 680.54
## - chk_acct          3   633.96 707.96
## 
## Step:  AIC=660.85
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + installment_rate + sex + other_debtor + age + 
##     other_install + housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## - age               1   588.76 660.76
## <none>                  586.85 660.85
## - n_credits         1   589.65 661.65
## - duration          1   590.56 662.56
## - sex               3   594.77 662.77
## - other_install     2   593.10 663.10
## - foreign           1   591.38 663.38
## - purpose           9   607.75 663.75
## - telephone         1   592.78 664.78
## - housing           2   595.10 665.10
## - other_debtor      2   599.90 669.90
## - saving_acct       4   604.77 670.77
## - installment_rate  1   599.41 671.41
## - amount            1   601.13 673.13
## - credit_his        4   613.80 679.80
## - chk_acct          3   642.79 710.79
## 
## Step:  AIC=660.76
## response ~ chk_acct + duration + credit_his + purpose + amount + 
##     saving_acct + installment_rate + sex + other_debtor + other_install + 
##     housing + n_credits + telephone + foreign
## 
##                    Df Deviance    AIC
## <none>                  588.76 660.76
## - n_credits         1   591.36 661.36
## - other_install     2   594.46 662.46
## - sex               3   596.80 662.80
## - duration          1   593.18 663.18
## - purpose           9   609.23 663.23
## - foreign           1   593.25 663.25
## - housing           2   597.06 665.06
## - telephone         1   595.61 665.61
## - other_debtor      2   602.05 670.05
## - installment_rate  1   600.84 670.84
## - saving_acct       4   607.57 671.57
## - amount            1   602.83 672.83
## - credit_his        4   616.64 680.64
## - chk_acct          3   644.80 710.80
german.logit <- summary(credit.glm.back.AIC)
credit.glm.back.AIC$deviance
## [1] 588.7578
AIC(credit.glm.back.AIC)
## [1] 660.7578
BIC(credit.glm.back.AIC)
## [1] 824.5966
pred.glm.a <- predict(credit.glm.back.AIC, type="response")


pred1 <- prediction(pred.glm.a, credit.train$response)
perf1 <- performance(pred1, "tpr", "fpr")
plot(perf1, colorize=TRUE)

AUC.logit <- unlist(slot(performance(pred1, "auc"), "y.values"))
AUC.logit
## [1] 0.853799
#misclassification rate table

#define a cost rate function
costfunc = function(obs, pred.p, pcut){
  weight1 = 5   # define the weight for "true=1 but pred=0" (FN)
  weight0 = 1    # define the weight for "true=0 but pred=1" (FP)
  c1 = (obs==1)&(pred.p<pcut)    # count for "true=1 but pred=0"   (FN)
  c0 = (obs==0)&(pred.p>=pcut)   # count for "true=0 but pred=1"   (FP)
  cost = mean(weight1*c1 + weight0*c0)  # misclassification with weight
  return(cost)
}
p.seq = seq(0.01, 1, 0.01) 

cost = rep(0, length(p.seq))  
for(i in 1:length(p.seq)){ 
  cost[i] = costfunc(obs = credit.train$response, pred.p = pred.glm.a, pcut = p.seq[i])  
}

plot(p.seq, cost)

optimal.pcut.glm.a = p.seq[which(cost==min(cost))]
optimal.pcut.glm.a
## [1] 0.2
class.glm0.train.opt<- (pred.glm.a>optimal.pcut.glm.a)*1
table(credit.train$response, class.glm0.train.opt, dnn = c("True", "Predicted"))
##     Predicted
## True   0   1
##    0 324 167
##    1  25 184
MR.logit<- mean(credit.train$response!= class.glm0.train.opt)
MR.logit
## [1] 0.2742857

CART Analysis

Key Insights

  • MR is 0.44 & AUC is 0.67
credit.rpart <- rpart(formula = response ~ . , data = credit.train, method = "class", parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
prp(credit.rpart,digits = 4, extra = 1)

#in sample
credit.train.pred.tree<- predict(credit.rpart, credit.train, type="class")
table(credit.train$response, credit.train.pred.tree, dnn=c("Truth","Predicted"))
##      Predicted
## Truth   0   1
##     0 286 205
##     1   7 202
# out of sample
credit.test.pred.tree<- predict(credit.rpart, credit.test, type="class")
table(credit.test$response, credit.test.pred.tree, dnn=c("Truth","Predicted"))
##      Predicted
## Truth   0   1
##     0  95 114
##     1  18  73
MR.tree<- mean(credit.test$response!= credit.test.pred.tree)
MR.tree
## [1] 0.44
credit.rpart <- rpart(formula = response ~ ., data = credit.train, 
                      method = "class", 
                      parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
#Probability of getting 1
credit.test.prob.rpart = predict(credit.rpart,credit.test, type="prob")

pred = prediction(credit.test.prob.rpart[,2], credit.test$response)
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)

AUC.tree <- slot(performance(pred, "auc"), "y.values")[[1]]
AUC.tree
## [1] 0.6715916

Generalized Additive Model (GAM)

GAM function run only on the continuous variables present in the German credit dataset: ‘age’, ‘duration’ and ‘amount’

Key Insights

  • Edf of duration and age is 1,remove the spline function from these variables, and rerun the gam model.
  • MR for the in-sample and out-of sample for the dataset are 0.30 and 0.59 respectively.
library(mgcv)
## Loading required package: nlme
## 
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
## 
##     collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
## Create a formula for a model with a large number of variables:
credit.gam1 <- gam(as.factor(response)~chk_acct+s(duration)+credit_his+purpose+s(amount)+saving_acct+present_emp+installment_rate+sex+other_debtor+present_resid+property
                      +s(age)+other_install+housing+n_credits+telephone+foreign , family=binomial,data=credit.train)
summary(credit.gam1)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## as.factor(response) ~ chk_acct + s(duration) + credit_his + purpose + 
##     s(amount) + saving_acct + present_emp + installment_rate + 
##     sex + other_debtor + present_resid + property + s(age) + 
##     other_install + housing + n_credits + telephone + foreign
## 
## Parametric coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        2.08427    1.09878   1.897 0.057841 .  
## chk_acctA12       -0.56633    0.28070  -2.018 0.043632 *  
## chk_acctA13       -1.08760    0.43223  -2.516 0.011860 *  
## chk_acctA14       -1.95817    0.30221  -6.479  9.2e-11 ***
## credit_hisA31      1.06490    0.70769   1.505 0.132384    
## credit_hisA32     -0.13421    0.55623  -0.241 0.809328    
## credit_hisA33     -0.57983    0.61030  -0.950 0.342074    
## credit_hisA34     -1.35119    0.56668  -2.384 0.017108 *  
## purposeA41        -1.66380    0.49190  -3.382 0.000719 ***
## purposeA410       -0.23376    0.98762  -0.237 0.812895    
## purposeA42        -0.61334    0.34337  -1.786 0.074064 .  
## purposeA43        -0.89896    0.31994  -2.810 0.004958 ** 
## purposeA44        -0.81411    0.92274  -0.882 0.377628    
## purposeA45        -0.41481    0.68848  -0.603 0.546841    
## purposeA46        -0.25140    0.50245  -0.500 0.616827    
## purposeA48        -1.09956    1.36406  -0.806 0.420188    
## purposeA49        -0.10096    0.39308  -0.257 0.797305    
## saving_acctA62    -0.14172    0.35622  -0.398 0.690756    
## saving_acctA63    -0.18338    0.45317  -0.405 0.685726    
## saving_acctA64    -0.75293    0.59058  -1.275 0.202348    
## saving_acctA65    -1.33825    0.34523  -3.876 0.000106 ***
## present_empA72    -0.25747    0.48814  -0.527 0.597875    
## present_empA73    -0.29573    0.45276  -0.653 0.513652    
## present_empA74    -1.10871    0.50990  -2.174 0.029677 *  
## present_empA75    -0.43517    0.46579  -0.934 0.350174    
## installment_rate   0.32688    0.11468   2.850 0.004365 ** 
## sexA92            -1.12260    0.49201  -2.282 0.022511 *  
## sexA93            -1.28285    0.47677  -2.691 0.007130 ** 
## sexA94            -0.98753    0.59450  -1.661 0.096692 .  
## other_debtorA102   0.58044    0.49276   1.178 0.238823    
## other_debtorA103  -1.77175    0.62042  -2.856 0.004294 ** 
## present_resid      0.02018    0.10968   0.184 0.854033    
## propertyA122       0.57842    0.33321   1.736 0.082582 .  
## propertyA123       0.35230    0.30640   1.150 0.250229    
## propertyA124       1.10485    0.51126   2.161 0.030693 *  
## other_installA142 -0.61814    0.53303  -1.160 0.246177    
## other_installA143 -0.69262    0.29603  -2.340 0.019302 *  
## housingA152       -0.73430    0.29047  -2.528 0.011471 *  
## housingA153       -0.86812    0.57911  -1.499 0.133856    
## n_credits          0.43200    0.23207   1.861 0.062676 .  
## telephoneA192     -0.60788    0.24654  -2.466 0.013677 *  
## foreignA202       -1.62031    0.84536  -1.917 0.055276 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##               edf Ref.df Chi.sq p-value   
## s(duration) 1.000  1.000  5.946 0.01475 * 
## s(amount)   2.336  2.964 14.599 0.00306 **
## s(age)      1.000  1.001  1.743 0.18684   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.322   Deviance explained = 33.2%
## UBRE = -0.053405  Scale est. = 1         n = 700
plot(credit.gam1, shade=TRUE)

credit.gam <- gam(as.factor(response)~chk_acct+(duration)+credit_his+purpose+s(amount)+saving_acct+present_emp+installment_rate+sex+other_debtor+present_resid+property
                      +(age)+other_install+housing+n_credits+telephone+foreign , family=binomial,data=credit.train)
summary(credit.gam)
## 
## Family: binomial 
## Link function: logit 
## 
## Formula:
## as.factor(response) ~ chk_acct + (duration) + credit_his + purpose + 
##     s(amount) + saving_acct + present_emp + installment_rate + 
##     sex + other_debtor + present_resid + property + (age) + other_install + 
##     housing + n_credits + telephone + foreign
## 
## Parametric coefficients:
##                   Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        2.01580    1.19158   1.692 0.090702 .  
## chk_acctA12       -0.56633    0.28070  -2.018 0.043632 *  
## chk_acctA13       -1.08760    0.43223  -2.516 0.011860 *  
## chk_acctA14       -1.95817    0.30221  -6.480  9.2e-11 ***
## duration           0.02886    0.01184   2.439 0.014747 *  
## credit_hisA31      1.06490    0.70768   1.505 0.132383    
## credit_hisA32     -0.13421    0.55623  -0.241 0.809337    
## credit_hisA33     -0.57982    0.61030  -0.950 0.342079    
## credit_hisA34     -1.35118    0.56668  -2.384 0.017108 *  
## purposeA41        -1.66379    0.49190  -3.382 0.000719 ***
## purposeA410       -0.23377    0.98762  -0.237 0.812888    
## purposeA42        -0.61333    0.34337  -1.786 0.074067 .  
## purposeA43        -0.89895    0.31994  -2.810 0.004958 ** 
## purposeA44        -0.81410    0.92274  -0.882 0.377633    
## purposeA45        -0.41480    0.68848  -0.602 0.546851    
## purposeA46        -0.25140    0.50245  -0.500 0.616830    
## purposeA48        -1.09955    1.36405  -0.806 0.420190    
## purposeA49        -0.10095    0.39308  -0.257 0.797311    
## saving_acctA62    -0.14172    0.35622  -0.398 0.690751    
## saving_acctA63    -0.18338    0.45317  -0.405 0.685727    
## saving_acctA64    -0.75293    0.59058  -1.275 0.202346    
## saving_acctA65    -1.33825    0.34523  -3.876 0.000106 ***
## present_empA72    -0.25749    0.48814  -0.527 0.597857    
## present_empA73    -0.29574    0.45276  -0.653 0.513633    
## present_empA74    -1.10873    0.50990  -2.174 0.029675 *  
## present_empA75    -0.43518    0.46579  -0.934 0.350155    
## installment_rate   0.32688    0.11467   2.850 0.004365 ** 
## sexA92            -1.12258    0.49201  -2.282 0.022512 *  
## sexA93            -1.28284    0.47677  -2.691 0.007130 ** 
## sexA94            -0.98752    0.59450  -1.661 0.096696 .  
## other_debtorA102   0.58045    0.49276   1.178 0.238816    
## other_debtorA103  -1.77175    0.62042  -2.856 0.004294 ** 
## present_resid      0.02018    0.10968   0.184 0.854017    
## propertyA122       0.57843    0.33321   1.736 0.082581 .  
## propertyA123       0.35230    0.30640   1.150 0.250228    
## propertyA124       1.10485    0.51126   2.161 0.030693 *  
## age               -0.01523    0.01153  -1.320 0.186713    
## other_installA142 -0.61815    0.53303  -1.160 0.246175    
## other_installA143 -0.69262    0.29603  -2.340 0.019301 *  
## housingA152       -0.73431    0.29047  -2.528 0.011470 *  
## housingA153       -0.86812    0.57911  -1.499 0.133856    
## n_credits          0.43200    0.23207   1.861 0.062676 .  
## telephoneA192     -0.60789    0.24654  -2.466 0.013676 *  
## foreignA202       -1.62031    0.84536  -1.917 0.055275 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Approximate significance of smooth terms:
##             edf Ref.df Chi.sq p-value   
## s(amount) 2.336  2.964   14.6 0.00306 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## R-sq.(adj) =  0.322   Deviance explained = 33.2%
## UBRE = -0.053406  Scale est. = 1         n = 700
AIC(credit.gam)
## [1] 662.6159
BIC(credit.gam)
## [1] 873.494
credit.gam$deviance
## [1] 569.9443
pcut.gam <- (1/6)
prob.gam.in<-predict(credit.gam,credit.train,type="response")
pred.gam.in<-(prob.gam.in>=pcut.gam)*1
table(credit.train$response,pred.gam.in,dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0   1
##        0 302 189
##        1  23 186
#MR
MR.gam <- mean(ifelse(credit.train$response != pred.gam.in, 1, 0))
#Cost assocaited with MR
creditcost(credit.train$response, pred.gam.in)
## [1] 0.4342857
#Out-of-sample performance########
prob.gam.out<-predict(credit.gam,credit.test,type="response")
pred.gam.out<-(prob.gam.out>=pcut.gam)*1
table(credit.test$response,pred.gam.out,dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0   1
##        0 122  87
##        1  18  73
#MR
Mr.gam.out <- mean(ifelse(credit.test$response != pred.gam.out, 1, 0))
#Cost assocaited with MR
creditcost(credit.test$response, pred.gam.out)
## [1] 0.59

Neural Network

The response(in classification) needs not to be standardized

Key Insights

  • Neural Network is like a black box, which shows the actual outcomes but the interpretation of the features is much more difficult as compare to other models.
  • MR of in sample and out of sample from the Neural Network model comes out to be 0.21 and 0.69 respectively.
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
## 
##     melanoma
## Registered S3 method overwritten by 'pROC':
##   method    from        
##   lines.roc verification
library(NeuralNetTools)
par(mfrow=c(1,1))
credit.nnet <- train(as.factor(response)~., data=credit.train,method="nnet",na.action=na.exclude,hidden=c(5,3),maxit=300,act.fct="logistic",learningrate=0.1)
plot(credit.nnet)

print(credit.nnet)
## Neural Network 
## 
## 700 samples
##  20 predictor
##   2 classes: '0', '1' 
## 
## No pre-processing
## Resampling: Bootstrapped (25 reps) 
## Summary of sample sizes: 700, 700, 700, 700, 700, 700, ... 
## Resampling results across tuning parameters:
## 
##   size  decay  Accuracy   Kappa      
##   1     0e+00  0.7058774  0.000000000
##   1     1e-04  0.7058774  0.000000000
##   1     1e-01  0.7125275  0.147500089
##   3     0e+00  0.7036716  0.004883037
##   3     1e-04  0.7076839  0.024502285
##   3     1e-01  0.7191004  0.296608780
##   5     0e+00  0.7058774  0.000000000
##   5     1e-04  0.7043809  0.034209917
##   5     1e-01  0.7124420  0.300580044
## 
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.1.
plotnet(credit.nnet$finalModel, y_names = "response")
title("Graphical Representation of our Neural Network")

#In sample
prob.nnet= predict(credit.nnet,type='prob')
pred.nnet = as.numeric(prob.nnet[,2] >=pcut.gam)
table(credit.train$response,pred.nnet, dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0   1
##        0 352 139
##        1  13 196
#MR
MR.nnet_in <- mean(ifelse(credit.train$response != pred.nnet, 1, 0))
MR.nnet_in
## [1] 0.2171429
#Costfunction
creditcost(credit.train$response, pred.nnet)
## [1] 0.2914286
#Out of sample
prob.nnet.test= predict(credit.nnet,credit.test,type='prob')
pred.nnet.test = as.numeric(prob.nnet.test[,2] > pcut.gam)
table(credit.test$response,pred.nnet.test, dnn=c("Observed","Predicted"))
##         Predicted
## Observed   0   1
##        0 135  74
##        1  27  64
##MR
MR.nnet_out <- mean(ifelse(credit.test$response != pred.nnet.test, 1, 0))
#Costfunction
creditcost(credit.test$response, pred.nnet.test)
## [1] 0.6966667

Conclusion

Based on MR , it shows logistic regression and Neural network are the best model among all for Boston Housing Data

##Final Table fo MR in sample

stats.models <- data.frame("Model Name" = c("Logistic Regression","CART", "GAM","Neural Network"),
                             "MR" = c(MR.logit,MR.tree,MR.gam,MR.nnet_in)
                          )

stats.models