This project predict people described by a set of attributes will have good or bad credit risks
The data for this problem is taken from UCI Machine Learning Repository (https://archive.ics.uci.edu/ml/datasets/statlog+(german+credit+data)). The original dataset contains 1000 entries with 20 categorial/symbolic attributes prepared by Prof. Hofmann. In this dataset, each entry represents a person who takes a credit by a bank. Each person is classified as good or bad credit risks according to the set of attributes.
Approach
library(dplyr)
library(glmnet)
library(ROCR)
library(PRROC)
library(boot)
library(rpart)
library(rpart.plot)
library(knitr)
library(dplyr)
library(tidyr)
library(reshape2)
library(RColorBrewer)
library(GGally)
library(ggplot2)
library(boot)
library(verification)
set.seed(1234)
credit.data <- read.table("D:/Course/Data Mining/Data Mining 1/Homework/german.data")
We get the data from the link. We need to provide names for the columns and change the response labels to 1 and 0: 0 corresponding to a good credit record and 1 corresponding to a bad one (positive class).
colnames(credit.data)=c("chk_acct","duration","credit_his","purpose","amount","saving_acct","present_emp","installment_rate","sex","other_debtor","present_resid","property","age","other_install","housing","n_credits","job","n_people","telephone","foreign","response")
#orginal response coding 1= good, 2 = bad
#we need 0 = good, 1 = bad
credit.data$response = credit.data$response - 1
Data Structure
There is a total on 21 attributes in the dataset. Their descriptions and details have been tabulated below:
We take the summary statistics of the dataset, the dataset has a total of 1000 observations with 21 variables, out of which 8 are numerical variables including the response and 13 are categorical variables with various levels. The summary statistics for the variables have been presented
glimpse(credit.data)
## Observations: 1,000
## Variables: 21
## $ chk_acct <fct> A11, A12, A14, A11, A11, A14, A14, A12, A14, A12, ...
## $ duration <int> 6, 48, 12, 42, 24, 36, 24, 36, 12, 30, 12, 48, 12,...
## $ credit_his <fct> A34, A32, A34, A32, A33, A32, A32, A32, A32, A34, ...
## $ purpose <fct> A43, A43, A46, A42, A40, A46, A42, A41, A43, A40, ...
## $ amount <int> 1169, 5951, 2096, 7882, 4870, 9055, 2835, 6948, 30...
## $ saving_acct <fct> A65, A61, A61, A61, A61, A65, A63, A61, A64, A61, ...
## $ present_emp <fct> A75, A73, A74, A74, A73, A73, A75, A73, A74, A71, ...
## $ installment_rate <int> 4, 2, 2, 2, 3, 2, 3, 2, 2, 4, 3, 3, 1, 4, 2, 4, 4,...
## $ sex <fct> A93, A92, A93, A93, A93, A93, A93, A93, A91, A94, ...
## $ other_debtor <fct> A101, A101, A101, A103, A101, A101, A101, A101, A1...
## $ present_resid <int> 4, 2, 3, 4, 4, 4, 4, 2, 4, 2, 1, 4, 1, 4, 4, 2, 4,...
## $ property <fct> A121, A121, A121, A122, A124, A124, A122, A123, A1...
## $ age <int> 67, 22, 49, 45, 53, 35, 53, 35, 61, 28, 25, 24, 22...
## $ other_install <fct> A143, A143, A143, A143, A143, A143, A143, A143, A1...
## $ housing <fct> A152, A152, A152, A153, A153, A153, A152, A151, A1...
## $ n_credits <int> 2, 1, 1, 1, 2, 1, 1, 1, 1, 2, 1, 1, 1, 2, 1, 1, 2,...
## $ job <fct> A173, A173, A172, A173, A173, A172, A173, A174, A1...
## $ n_people <int> 1, 1, 2, 2, 2, 2, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1,...
## $ telephone <fct> A192, A191, A191, A191, A191, A192, A191, A192, A1...
## $ foreign <fct> A201, A201, A201, A201, A201, A201, A201, A201, A2...
## $ response <dbl> 0, 1, 0, 0, 1, 0, 0, 0, 0, 1, 1, 1, 0, 1, 0, 1, 0,...
#converting response to factor
credit.data$response <- as.factor(credit.data$response)
summary(credit.data)
## chk_acct duration credit_his purpose amount saving_acct
## A11:274 Min. : 4.0 A30: 40 A43 :280 Min. : 250 A61:603
## A12:269 1st Qu.:12.0 A31: 49 A40 :234 1st Qu.: 1366 A62:103
## A13: 63 Median :18.0 A32:530 A42 :181 Median : 2320 A63: 63
## A14:394 Mean :20.9 A33: 88 A41 :103 Mean : 3271 A64: 48
## 3rd Qu.:24.0 A34:293 A49 : 97 3rd Qu.: 3972 A65:183
## Max. :72.0 A46 : 50 Max. :18424
## (Other): 55
## present_emp installment_rate sex other_debtor present_resid property
## A71: 62 Min. :1.000 A91: 50 A101:907 Min. :1.000 A121:282
## A72:172 1st Qu.:2.000 A92:310 A102: 41 1st Qu.:2.000 A122:232
## A73:339 Median :3.000 A93:548 A103: 52 Median :3.000 A123:332
## A74:174 Mean :2.973 A94: 92 Mean :2.845 A124:154
## A75:253 3rd Qu.:4.000 3rd Qu.:4.000
## Max. :4.000 Max. :4.000
##
## age other_install housing n_credits job
## Min. :19.00 A141:139 A151:179 Min. :1.000 A171: 22
## 1st Qu.:27.00 A142: 47 A152:713 1st Qu.:1.000 A172:200
## Median :33.00 A143:814 A153:108 Median :1.000 A173:630
## Mean :35.55 Mean :1.407 A174:148
## 3rd Qu.:42.00 3rd Qu.:2.000
## Max. :75.00 Max. :4.000
##
## n_people telephone foreign response
## Min. :1.000 A191:596 A201:963 0:700
## 1st Qu.:1.000 A192:404 A202: 37 1:300
## Median :1.000
## Mean :1.155
## 3rd Qu.:1.000
## Max. :2.000
##
We get the following insights from our EDA of continuous variables:
Duration
amount.mean = credit.data %>% dplyr::select(amount, response) %>% group_by(response) %>% summarise(m =mean(amount))
duration.mean = credit.data %>% dplyr::select(duration, response) %>%group_by(response) %>% summarise( m =mean(duration))
ggplot(credit.data, aes(duration, fill=response)) +
geom_density(alpha=.5)
test.m = credit.data[,c(2,5,8,13,16,18,21)]
test.m$response <- as.numeric(test.m$response)
ggplot(melt(credit.data[,c(2,21)]), aes(x = variable, y = value, fill = response)) + geom_boxplot() + xlab("response") + ylab("duration")
## Using response as id variables
Installment Rate
ggplot(credit.data, aes(factor(installment_rate), ..count..)) +
geom_bar(aes(fill = response), position = "dodge") + xlab("Installment Rates")
Amount
ggplot(credit.data, aes(amount, fill=response)) +
geom_density(alpha=.5)
ggplot(melt(credit.data[,c(5,21)]), aes(x = variable, y = value, fill = response)) +
geom_boxplot() + xlab("response") + ylab("amount")
## Using response as id variables
Age
ggplot(melt(credit.data[,c(13,21)]), aes(x = variable, y = value, fill = response)) +
geom_boxplot()+ xlab("response") + ylab("age")
## Using response as id variables
n_credits
ggplot(melt(credit.data[,c(16,21)]), aes(x = variable, y = value, fill = response)) +
geom_boxplot()
## Using response as id variables
We get the following insights from our EDA of categorical variables:
chk_acct
ggplot(credit.data, aes(chk_acct, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
credit_hist
ggplot(credit.data, aes(credit_his, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
purpose
ggplot(credit.data, aes(purpose, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
Saving_acct
ggplot(credit.data, aes(saving_acct, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
other_debtor
ggplot(credit.data, aes(other_debtor, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
sex
ggplot(credit.data, aes(sex, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
other_install
ggplot(credit.data, aes(other_install, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
foreign
ggplot(credit.data, aes(foreign, ..count..)) +
geom_bar(aes(fill = response), position = "dodge")
# Sampling data
index <- sample(nrow(credit.data),nrow(credit.data)*0.70)
credit.train = credit.data[index,]
credit.test = credit.data[-index,]
creditcost <- function(observed, predicted){
weight1 = 5
weight0 = 1
c1 = (observed==1)&(predicted == 0) #logical vector - true if actual 1 but predict 0
c0 = (observed==0)&(predicted == 1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
Key insights
credit.glm.logit<- glm(response~., family=binomial, data=credit.train)
credit.glm.back.AIC <- step(credit.glm.logit) # backward selection (if you don't specify anything)
## Start: AIC=669.94
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + job + n_people + telephone + foreign
##
## Df Deviance AIC
## - job 3 574.14 666.14
## - n_people 1 571.98 667.98
## - present_resid 1 572.01 668.01
## - property 3 577.69 669.69
## - age 1 573.73 669.73
## <none> 571.94 669.94
## - present_emp 4 580.21 670.21
## - sex 3 579.28 671.28
## - other_install 2 577.57 671.57
## - n_credits 1 575.72 671.72
## - foreign 1 575.77 671.77
## - housing 2 578.20 672.20
## - duration 1 576.30 672.30
## - purpose 9 594.05 674.05
## - telephone 1 578.55 674.55
## - other_debtor 2 583.50 677.50
## - installment_rate 1 581.72 677.72
## - amount 1 582.65 678.65
## - saving_acct 4 588.92 678.92
## - credit_his 4 599.06 689.06
## - chk_acct 3 623.27 715.27
##
## Step: AIC=666.14
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + n_people + telephone + foreign
##
## Df Deviance AIC
## - n_people 1 574.19 664.19
## - present_resid 1 574.21 664.21
## - present_emp 4 581.13 665.13
## - property 3 579.64 665.64
## - age 1 576.03 666.03
## <none> 574.14 666.14
## - n_credits 1 577.53 667.53
## - other_install 2 579.68 667.68
## - sex 3 581.88 667.88
## - foreign 1 578.33 668.33
## - duration 1 578.51 668.51
## - housing 2 580.59 668.59
## - purpose 9 595.39 669.39
## - telephone 1 580.82 670.82
## - other_debtor 2 585.79 673.79
## - installment_rate 1 585.01 675.01
## - saving_acct 4 592.12 676.12
## - amount 1 586.16 676.16
## - credit_his 4 600.54 684.54
## - chk_acct 3 624.32 710.32
##
## Step: AIC=664.19
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## present_resid + property + age + other_install + housing +
## n_credits + telephone + foreign
##
## Df Deviance AIC
## - present_resid 1 574.26 662.26
## - present_emp 4 581.14 663.14
## - property 3 579.65 663.65
## - age 1 576.07 664.07
## <none> 574.19 664.19
## - n_credits 1 577.70 665.70
## - other_install 2 579.77 665.77
## - sex 3 581.93 665.93
## - foreign 1 578.39 666.39
## - duration 1 578.53 666.53
## - housing 2 580.60 666.60
## - purpose 9 595.48 667.48
## - telephone 1 580.90 668.90
## - other_debtor 2 585.79 671.79
## - installment_rate 1 585.01 673.01
## - saving_acct 4 592.12 674.12
## - amount 1 586.17 674.17
## - credit_his 4 601.07 683.07
## - chk_acct 3 624.66 708.66
##
## Step: AIC=662.26
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + present_emp + installment_rate + sex + other_debtor +
## property + age + other_install + housing + n_credits + telephone +
## foreign
##
## Df Deviance AIC
## - present_emp 4 581.16 661.16
## - property 3 579.72 661.72
## - age 1 576.08 662.08
## <none> 574.26 662.26
## - other_install 2 579.81 663.81
## - n_credits 1 577.87 663.87
## - sex 3 581.93 663.93
## - foreign 1 578.52 664.52
## - duration 1 578.65 664.65
## - housing 2 581.40 665.40
## - purpose 9 595.48 665.48
## - telephone 1 580.91 666.91
## - other_debtor 2 585.87 669.87
## - installment_rate 1 585.08 671.08
## - saving_acct 4 592.14 672.14
## - amount 1 586.17 672.17
## - credit_his 4 601.07 681.07
## - chk_acct 3 625.10 707.10
##
## Step: AIC=661.16
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + installment_rate + sex + other_debtor + property +
## age + other_install + housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - property 3 586.85 660.85
## - age 1 582.99 660.99
## <none> 581.16 661.16
## - n_credits 1 584.34 662.34
## - other_install 2 586.57 662.57
## - duration 1 584.59 662.59
## - purpose 9 601.57 663.57
## - foreign 1 585.80 663.80
## - housing 2 587.85 663.85
## - sex 3 589.91 663.91
## - telephone 1 588.55 666.55
## - other_debtor 2 593.22 669.22
## - installment_rate 1 591.84 669.84
## - amount 1 593.38 671.38
## - saving_acct 4 600.08 672.08
## - credit_his 4 608.54 680.54
## - chk_acct 3 633.96 707.96
##
## Step: AIC=660.85
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + installment_rate + sex + other_debtor + age +
## other_install + housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## - age 1 588.76 660.76
## <none> 586.85 660.85
## - n_credits 1 589.65 661.65
## - duration 1 590.56 662.56
## - sex 3 594.77 662.77
## - other_install 2 593.10 663.10
## - foreign 1 591.38 663.38
## - purpose 9 607.75 663.75
## - telephone 1 592.78 664.78
## - housing 2 595.10 665.10
## - other_debtor 2 599.90 669.90
## - saving_acct 4 604.77 670.77
## - installment_rate 1 599.41 671.41
## - amount 1 601.13 673.13
## - credit_his 4 613.80 679.80
## - chk_acct 3 642.79 710.79
##
## Step: AIC=660.76
## response ~ chk_acct + duration + credit_his + purpose + amount +
## saving_acct + installment_rate + sex + other_debtor + other_install +
## housing + n_credits + telephone + foreign
##
## Df Deviance AIC
## <none> 588.76 660.76
## - n_credits 1 591.36 661.36
## - other_install 2 594.46 662.46
## - sex 3 596.80 662.80
## - duration 1 593.18 663.18
## - purpose 9 609.23 663.23
## - foreign 1 593.25 663.25
## - housing 2 597.06 665.06
## - telephone 1 595.61 665.61
## - other_debtor 2 602.05 670.05
## - installment_rate 1 600.84 670.84
## - saving_acct 4 607.57 671.57
## - amount 1 602.83 672.83
## - credit_his 4 616.64 680.64
## - chk_acct 3 644.80 710.80
german.logit <- summary(credit.glm.back.AIC)
credit.glm.back.AIC$deviance
## [1] 588.7578
AIC(credit.glm.back.AIC)
## [1] 660.7578
BIC(credit.glm.back.AIC)
## [1] 824.5966
pred.glm.a <- predict(credit.glm.back.AIC, type="response")
pred1 <- prediction(pred.glm.a, credit.train$response)
perf1 <- performance(pred1, "tpr", "fpr")
plot(perf1, colorize=TRUE)
AUC.logit <- unlist(slot(performance(pred1, "auc"), "y.values"))
AUC.logit
## [1] 0.853799
#misclassification rate table
#define a cost rate function
costfunc = function(obs, pred.p, pcut){
weight1 = 5 # define the weight for "true=1 but pred=0" (FN)
weight0 = 1 # define the weight for "true=0 but pred=1" (FP)
c1 = (obs==1)&(pred.p<pcut) # count for "true=1 but pred=0" (FN)
c0 = (obs==0)&(pred.p>=pcut) # count for "true=0 but pred=1" (FP)
cost = mean(weight1*c1 + weight0*c0) # misclassification with weight
return(cost)
}
p.seq = seq(0.01, 1, 0.01)
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = credit.train$response, pred.p = pred.glm.a, pcut = p.seq[i])
}
plot(p.seq, cost)
optimal.pcut.glm.a = p.seq[which(cost==min(cost))]
optimal.pcut.glm.a
## [1] 0.2
class.glm0.train.opt<- (pred.glm.a>optimal.pcut.glm.a)*1
table(credit.train$response, class.glm0.train.opt, dnn = c("True", "Predicted"))
## Predicted
## True 0 1
## 0 324 167
## 1 25 184
MR.logit<- mean(credit.train$response!= class.glm0.train.opt)
MR.logit
## [1] 0.2742857
Key Insights
credit.rpart <- rpart(formula = response ~ . , data = credit.train, method = "class", parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
prp(credit.rpart,digits = 4, extra = 1)
#in sample
credit.train.pred.tree<- predict(credit.rpart, credit.train, type="class")
table(credit.train$response, credit.train.pred.tree, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 286 205
## 1 7 202
# out of sample
credit.test.pred.tree<- predict(credit.rpart, credit.test, type="class")
table(credit.test$response, credit.test.pred.tree, dnn=c("Truth","Predicted"))
## Predicted
## Truth 0 1
## 0 95 114
## 1 18 73
MR.tree<- mean(credit.test$response!= credit.test.pred.tree)
MR.tree
## [1] 0.44
credit.rpart <- rpart(formula = response ~ ., data = credit.train,
method = "class",
parms = list(loss=matrix(c(0,5,1,0), nrow = 2)))
#Probability of getting 1
credit.test.prob.rpart = predict(credit.rpart,credit.test, type="prob")
pred = prediction(credit.test.prob.rpart[,2], credit.test$response)
perf = performance(pred, "tpr", "fpr")
plot(perf, colorize=TRUE)
AUC.tree <- slot(performance(pred, "auc"), "y.values")[[1]]
AUC.tree
## [1] 0.6715916
GAM function run only on the continuous variables present in the German credit dataset: ‘age’, ‘duration’ and ‘amount’
Key Insights
library(mgcv)
## Loading required package: nlme
##
## Attaching package: 'nlme'
## The following object is masked from 'package:dplyr':
##
## collapse
## This is mgcv 1.8-31. For overview type 'help("mgcv-package")'.
## Create a formula for a model with a large number of variables:
credit.gam1 <- gam(as.factor(response)~chk_acct+s(duration)+credit_his+purpose+s(amount)+saving_acct+present_emp+installment_rate+sex+other_debtor+present_resid+property
+s(age)+other_install+housing+n_credits+telephone+foreign , family=binomial,data=credit.train)
summary(credit.gam1)
##
## Family: binomial
## Link function: logit
##
## Formula:
## as.factor(response) ~ chk_acct + s(duration) + credit_his + purpose +
## s(amount) + saving_acct + present_emp + installment_rate +
## sex + other_debtor + present_resid + property + s(age) +
## other_install + housing + n_credits + telephone + foreign
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.08427 1.09878 1.897 0.057841 .
## chk_acctA12 -0.56633 0.28070 -2.018 0.043632 *
## chk_acctA13 -1.08760 0.43223 -2.516 0.011860 *
## chk_acctA14 -1.95817 0.30221 -6.479 9.2e-11 ***
## credit_hisA31 1.06490 0.70769 1.505 0.132384
## credit_hisA32 -0.13421 0.55623 -0.241 0.809328
## credit_hisA33 -0.57983 0.61030 -0.950 0.342074
## credit_hisA34 -1.35119 0.56668 -2.384 0.017108 *
## purposeA41 -1.66380 0.49190 -3.382 0.000719 ***
## purposeA410 -0.23376 0.98762 -0.237 0.812895
## purposeA42 -0.61334 0.34337 -1.786 0.074064 .
## purposeA43 -0.89896 0.31994 -2.810 0.004958 **
## purposeA44 -0.81411 0.92274 -0.882 0.377628
## purposeA45 -0.41481 0.68848 -0.603 0.546841
## purposeA46 -0.25140 0.50245 -0.500 0.616827
## purposeA48 -1.09956 1.36406 -0.806 0.420188
## purposeA49 -0.10096 0.39308 -0.257 0.797305
## saving_acctA62 -0.14172 0.35622 -0.398 0.690756
## saving_acctA63 -0.18338 0.45317 -0.405 0.685726
## saving_acctA64 -0.75293 0.59058 -1.275 0.202348
## saving_acctA65 -1.33825 0.34523 -3.876 0.000106 ***
## present_empA72 -0.25747 0.48814 -0.527 0.597875
## present_empA73 -0.29573 0.45276 -0.653 0.513652
## present_empA74 -1.10871 0.50990 -2.174 0.029677 *
## present_empA75 -0.43517 0.46579 -0.934 0.350174
## installment_rate 0.32688 0.11468 2.850 0.004365 **
## sexA92 -1.12260 0.49201 -2.282 0.022511 *
## sexA93 -1.28285 0.47677 -2.691 0.007130 **
## sexA94 -0.98753 0.59450 -1.661 0.096692 .
## other_debtorA102 0.58044 0.49276 1.178 0.238823
## other_debtorA103 -1.77175 0.62042 -2.856 0.004294 **
## present_resid 0.02018 0.10968 0.184 0.854033
## propertyA122 0.57842 0.33321 1.736 0.082582 .
## propertyA123 0.35230 0.30640 1.150 0.250229
## propertyA124 1.10485 0.51126 2.161 0.030693 *
## other_installA142 -0.61814 0.53303 -1.160 0.246177
## other_installA143 -0.69262 0.29603 -2.340 0.019302 *
## housingA152 -0.73430 0.29047 -2.528 0.011471 *
## housingA153 -0.86812 0.57911 -1.499 0.133856
## n_credits 0.43200 0.23207 1.861 0.062676 .
## telephoneA192 -0.60788 0.24654 -2.466 0.013677 *
## foreignA202 -1.62031 0.84536 -1.917 0.055276 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(duration) 1.000 1.000 5.946 0.01475 *
## s(amount) 2.336 2.964 14.599 0.00306 **
## s(age) 1.000 1.001 1.743 0.18684
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.322 Deviance explained = 33.2%
## UBRE = -0.053405 Scale est. = 1 n = 700
plot(credit.gam1, shade=TRUE)
credit.gam <- gam(as.factor(response)~chk_acct+(duration)+credit_his+purpose+s(amount)+saving_acct+present_emp+installment_rate+sex+other_debtor+present_resid+property
+(age)+other_install+housing+n_credits+telephone+foreign , family=binomial,data=credit.train)
summary(credit.gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## as.factor(response) ~ chk_acct + (duration) + credit_his + purpose +
## s(amount) + saving_acct + present_emp + installment_rate +
## sex + other_debtor + present_resid + property + (age) + other_install +
## housing + n_credits + telephone + foreign
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.01580 1.19158 1.692 0.090702 .
## chk_acctA12 -0.56633 0.28070 -2.018 0.043632 *
## chk_acctA13 -1.08760 0.43223 -2.516 0.011860 *
## chk_acctA14 -1.95817 0.30221 -6.480 9.2e-11 ***
## duration 0.02886 0.01184 2.439 0.014747 *
## credit_hisA31 1.06490 0.70768 1.505 0.132383
## credit_hisA32 -0.13421 0.55623 -0.241 0.809337
## credit_hisA33 -0.57982 0.61030 -0.950 0.342079
## credit_hisA34 -1.35118 0.56668 -2.384 0.017108 *
## purposeA41 -1.66379 0.49190 -3.382 0.000719 ***
## purposeA410 -0.23377 0.98762 -0.237 0.812888
## purposeA42 -0.61333 0.34337 -1.786 0.074067 .
## purposeA43 -0.89895 0.31994 -2.810 0.004958 **
## purposeA44 -0.81410 0.92274 -0.882 0.377633
## purposeA45 -0.41480 0.68848 -0.602 0.546851
## purposeA46 -0.25140 0.50245 -0.500 0.616830
## purposeA48 -1.09955 1.36405 -0.806 0.420190
## purposeA49 -0.10095 0.39308 -0.257 0.797311
## saving_acctA62 -0.14172 0.35622 -0.398 0.690751
## saving_acctA63 -0.18338 0.45317 -0.405 0.685727
## saving_acctA64 -0.75293 0.59058 -1.275 0.202346
## saving_acctA65 -1.33825 0.34523 -3.876 0.000106 ***
## present_empA72 -0.25749 0.48814 -0.527 0.597857
## present_empA73 -0.29574 0.45276 -0.653 0.513633
## present_empA74 -1.10873 0.50990 -2.174 0.029675 *
## present_empA75 -0.43518 0.46579 -0.934 0.350155
## installment_rate 0.32688 0.11467 2.850 0.004365 **
## sexA92 -1.12258 0.49201 -2.282 0.022512 *
## sexA93 -1.28284 0.47677 -2.691 0.007130 **
## sexA94 -0.98752 0.59450 -1.661 0.096696 .
## other_debtorA102 0.58045 0.49276 1.178 0.238816
## other_debtorA103 -1.77175 0.62042 -2.856 0.004294 **
## present_resid 0.02018 0.10968 0.184 0.854017
## propertyA122 0.57843 0.33321 1.736 0.082581 .
## propertyA123 0.35230 0.30640 1.150 0.250228
## propertyA124 1.10485 0.51126 2.161 0.030693 *
## age -0.01523 0.01153 -1.320 0.186713
## other_installA142 -0.61815 0.53303 -1.160 0.246175
## other_installA143 -0.69262 0.29603 -2.340 0.019301 *
## housingA152 -0.73431 0.29047 -2.528 0.011470 *
## housingA153 -0.86812 0.57911 -1.499 0.133856
## n_credits 0.43200 0.23207 1.861 0.062676 .
## telephoneA192 -0.60789 0.24654 -2.466 0.013676 *
## foreignA202 -1.62031 0.84536 -1.917 0.055275 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(amount) 2.336 2.964 14.6 0.00306 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.322 Deviance explained = 33.2%
## UBRE = -0.053406 Scale est. = 1 n = 700
AIC(credit.gam)
## [1] 662.6159
BIC(credit.gam)
## [1] 873.494
credit.gam$deviance
## [1] 569.9443
pcut.gam <- (1/6)
prob.gam.in<-predict(credit.gam,credit.train,type="response")
pred.gam.in<-(prob.gam.in>=pcut.gam)*1
table(credit.train$response,pred.gam.in,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 302 189
## 1 23 186
#MR
MR.gam <- mean(ifelse(credit.train$response != pred.gam.in, 1, 0))
#Cost assocaited with MR
creditcost(credit.train$response, pred.gam.in)
## [1] 0.4342857
#Out-of-sample performance########
prob.gam.out<-predict(credit.gam,credit.test,type="response")
pred.gam.out<-(prob.gam.out>=pcut.gam)*1
table(credit.test$response,pred.gam.out,dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 122 87
## 1 18 73
#MR
Mr.gam.out <- mean(ifelse(credit.test$response != pred.gam.out, 1, 0))
#Cost assocaited with MR
creditcost(credit.test$response, pred.gam.out)
## [1] 0.59
The response(in classification) needs not to be standardized
Key Insights
library(caret)
## Loading required package: lattice
##
## Attaching package: 'lattice'
## The following object is masked from 'package:boot':
##
## melanoma
## Registered S3 method overwritten by 'pROC':
## method from
## lines.roc verification
library(NeuralNetTools)
par(mfrow=c(1,1))
credit.nnet <- train(as.factor(response)~., data=credit.train,method="nnet",na.action=na.exclude,hidden=c(5,3),maxit=300,act.fct="logistic",learningrate=0.1)
plot(credit.nnet)
print(credit.nnet)
## Neural Network
##
## 700 samples
## 20 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 700, 700, 700, 700, 700, 700, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.7058774 0.000000000
## 1 1e-04 0.7058774 0.000000000
## 1 1e-01 0.7125275 0.147500089
## 3 0e+00 0.7036716 0.004883037
## 3 1e-04 0.7076839 0.024502285
## 3 1e-01 0.7191004 0.296608780
## 5 0e+00 0.7058774 0.000000000
## 5 1e-04 0.7043809 0.034209917
## 5 1e-01 0.7124420 0.300580044
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.1.
plotnet(credit.nnet$finalModel, y_names = "response")
title("Graphical Representation of our Neural Network")
#In sample
prob.nnet= predict(credit.nnet,type='prob')
pred.nnet = as.numeric(prob.nnet[,2] >=pcut.gam)
table(credit.train$response,pred.nnet, dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 352 139
## 1 13 196
#MR
MR.nnet_in <- mean(ifelse(credit.train$response != pred.nnet, 1, 0))
MR.nnet_in
## [1] 0.2171429
#Costfunction
creditcost(credit.train$response, pred.nnet)
## [1] 0.2914286
#Out of sample
prob.nnet.test= predict(credit.nnet,credit.test,type='prob')
pred.nnet.test = as.numeric(prob.nnet.test[,2] > pcut.gam)
table(credit.test$response,pred.nnet.test, dnn=c("Observed","Predicted"))
## Predicted
## Observed 0 1
## 0 135 74
## 1 27 64
##MR
MR.nnet_out <- mean(ifelse(credit.test$response != pred.nnet.test, 1, 0))
#Costfunction
creditcost(credit.test$response, pred.nnet.test)
## [1] 0.6966667
Based on MR , it shows logistic regression and Neural network are the best model among all for Boston Housing Data
##Final Table fo MR in sample
stats.models <- data.frame("Model Name" = c("Logistic Regression","CART", "GAM","Neural Network"),
"MR" = c(MR.logit,MR.tree,MR.gam,MR.nnet_in)
)
stats.models