Data: The German credit scoring data is a dataset provided by Prof. Hans Hofmann via the UCI Machine Learning Repository. It contains information on 1,000 loan applicants, each classified as a good or bad credit risk.
Goal: Compare the performance of various classification models in predicting the credit risk of these 1,000 individuals.
Approach: Compare the asymmetric (5:1) misclassification cost on the train and test sets for 4 different classification models.
Major Findings: In this case, predictive power: GAM > Logistic Regression > Neural Network > Classification Tree
Loading Libraries
library(tidyverse)
library(DT)
library(glmnet)
library(rpart)
library(rpart.plot)
library(caret)
library(knitr)
library(mgcv)
library(nnet)
library(NeuralNetTools)
library(e1071)
library(verification)
Loading Data
german.data <- read.table("http://archive.ics.uci.edu/ml/machine-learning-databases/statlog/german/german.data")
colnames(german.data) <- c("chk_acct", "duration", "credit_his", "purpose",
"amount", "saving_acct", "present_emp", "installment_rate", "sex", "other_debtor",
"present_resid", "property", "age", "other_install", "housing", "n_credits",
"job", "n_people", "telephone", "foreign", "response")
german.data$response <- german.data$response - 1
german.data$response <- as.factor(german.data$response)
Splitting German Credit Data into train and test
set.seed(12871014)
trainrows <- sample(nrow(german.data), nrow(german.data) * 0.75)
germandata.train <- german.data[trainrows, ]
germandata.test <- german.data[-trainrows,]
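A quick sanity check (optional, not part of the original analysis): the random split should leave the proportion of risky loans roughly equal in the two sets.
# proportion of response = 1 (risky) in each split
prop.table(table(germandata.train$response))
prop.table(table(germandata.test$response))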
Running Logistic Regression on all variables, then using stepwise AIC to select the most important ones
germandata.train.glm0 <- glm(response~., family = binomial, germandata.train)
step(germandata.train.glm0)
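step() above only prints the selection path; a hedged alternative is to capture the selected model so its formula can be reused directly (this is where the variable list in the next section comes from):
# capture the stepwise-selected model instead of only printing it;
# trace = 0 suppresses the per-step output
germandata.train.glm.step <- step(germandata.train.glm0, trace = 0)
formula(germandata.train.glm.step)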
Refitting Logistic Regression on only the selected variables
germandata.train.glm0<- glm(formula = response ~ chk_acct + duration + credit_his + purpose +
amount + saving_acct + present_emp + installment_rate + sex +
other_install + housing + telephone + foreign, family = binomial,
data = germandata.train)
summary(germandata.train.glm0)
##
## Call:
## glm(formula = response ~ chk_acct + duration + credit_his + purpose +
## amount + saving_acct + present_emp + installment_rate + sex +
## other_install + housing + telephone + foreign, family = binomial,
## data = germandata.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.3907 -0.7121 -0.4148 0.6884 2.5532
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1027052 0.8320367 1.325 0.185069
## chk_acctA12 -0.4275772 0.2475858 -1.727 0.084170 .
## chk_acctA13 -1.2145459 0.4202196 -2.890 0.003849 **
## chk_acctA14 -1.6299836 0.2651828 -6.147 7.91e-10 ***
## duration 0.0235051 0.0100735 2.333 0.019629 *
## credit_hisA31 0.5281037 0.5959294 0.886 0.375518
## credit_hisA32 -0.5144857 0.4552584 -1.130 0.258436
## credit_hisA33 -0.7381269 0.5172045 -1.427 0.153538
## credit_hisA34 -1.2309114 0.4827676 -2.550 0.010782 *
## purposeA41 -1.5790902 0.4170250 -3.787 0.000153 ***
## purposeA410 -1.4171252 0.7442436 -1.904 0.056895 .
## purposeA42 -0.7734629 0.2986503 -2.590 0.009601 **
## purposeA43 -0.8716696 0.2775587 -3.140 0.001687 **
## purposeA44 -0.6523193 0.9795588 -0.666 0.505455
## purposeA45 -0.1393128 0.5834826 -0.239 0.811291
## purposeA46 0.1008669 0.4572692 0.221 0.825415
## purposeA48 -1.9351338 1.2544685 -1.543 0.122930
## purposeA49 -0.6153918 0.3749360 -1.641 0.100730
## amount 0.0001271 0.0000479 2.653 0.007979 **
## saving_acctA62 -0.6855558 0.3558182 -1.927 0.054017 .
## saving_acctA63 -0.2724063 0.4246704 -0.641 0.521228
## saving_acctA64 -0.7795669 0.5323271 -1.464 0.143071
## saving_acctA65 -0.7884676 0.2899230 -2.720 0.006537 **
## present_empA72 0.0737220 0.4277273 0.172 0.863156
## present_empA73 -0.1545465 0.3923392 -0.394 0.693647
## present_empA74 -0.8518728 0.4463937 -1.908 0.056347 .
## present_empA75 -0.4486597 0.4102079 -1.094 0.274070
## installment_rate 0.2860638 0.0992871 2.881 0.003962 **
## sexA92 -0.1804648 0.4343657 -0.415 0.677800
## sexA93 -0.4798463 0.4235443 -1.133 0.257243
## sexA94 -0.2757366 0.5191583 -0.531 0.595334
## other_installA142 -0.0235699 0.4847276 -0.049 0.961218
## other_installA143 -0.6757472 0.2694804 -2.508 0.012156 *
## housingA152 -0.4952948 0.2525648 -1.961 0.049872 *
## housingA153 -0.0592166 0.3843387 -0.154 0.877551
## telephoneA192 -0.2204227 0.2135435 -1.032 0.301972
## foreignA202 -1.3888488 0.6332060 -2.193 0.028281 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 902.33 on 749 degrees of freedom
## Residual deviance: 688.16 on 713 degrees of freedom
## AIC: 762.16
##
## Number of Fisher Scoring iterations: 5
Finding the optimal probability cutoff value - Symmetric Cost (1:1)
# predicting on train set
predict_logit_train <- predict(germandata.train.glm0, type="response")
# define a cost function: "obs" is the observed response, "pred.p" the
# predicted probability, and "pcut" the probability threshold
costfunc = function(obs, pred.p, pcut){
  weight1 = 1 # weight for "true=1 but pred=0" (FN)
  weight0 = 1 # weight for "true=0 but pred=1" (FP)
  c1 = (obs==1)&(pred.p<pcut)  # logical vector: "true=1 but pred=0" (FN)
  c0 = (obs==0)&(pred.p>=pcut) # logical vector: "true=0 but pred=1" (FP)
  cost = mean(weight1*c1 + weight0*c0) # weighted misclassification cost
  return(cost)
} # end of the function
# define a sequence from 0.01 to 1 by 0.01
p.seq = seq(0.01, 1, 0.01)
# loop over all candidate cutoffs to find the one with the smallest cost
# pre-allocate a vector to store the cost at each pcut
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = germandata.train$response, pred.p = predict_logit_train, pcut = p.seq[i])
} # end of the loop
optimal.pcut = p.seq[which(cost==min(cost))][1]
optimal.pcut
## [1] 0.53
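The grid search above can also be written without an explicit loop; a minimal sketch using sapply(), which evaluates the same cost function at every candidate cutoff:
# loop-free version of the cutoff search; which.min() returns the
# first index of the minimum, matching the [1] above
cost <- sapply(p.seq, function(p) costfunc(obs = germandata.train$response,
    pred.p = predict_logit_train, pcut = p))
p.seq[which.min(cost)]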
Plotting the symmetric misclassification rate vs the range of probability cutoffs
plot(p.seq, cost)
Train and Test Predictions
pred.glm.gtrain.glm0 <- predict(germandata.train.glm0, type = "response")
pred.glm.gtest.glm0 <- predict(germandata.train.glm0, newdata=germandata.test,type = "response")
pred.train <- as.numeric(pred.glm.gtrain.glm0 > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.glm0 > optimal.pcut)
confusion_matrix_train <- table(germandata.train$response, pred.train)
confusion_matrix_test <- table(germandata.test$response, pred.test)
misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)
## train misclassfication rate: 0.21 | test misclassfication rate: 0.24
confusion_matrix_train
## pred.train
## 0 1
## 0 494 39
## 1 119 98
confusion_matrix_test
## pred.test
## 0 1
## 0 150 17
## 1 44 39
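The same numbers can be obtained with caret::confusionMatrix (a sketch; caret is already loaded above), which additionally reports sensitivity and specificity. positive = "1" treats risky loans as the positive class:
# predicted classes must be a factor with the same levels as the reference
confusionMatrix(factor(pred.test, levels = c(0, 1)),
    germandata.test$response, positive = "1")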
ROC Curve - Train
par(mfrow=c(1,1))
roc.logit <- roc.plot(x=(germandata.train$response == "1"), pred =pred.glm.gtrain.glm0)
AUC - Train
roc.logit$roc.vol[2]
## Area
## 1 0.8203543
ROC Curve - Test
par(mfrow=c(1,1))
roc.logit.test <- roc.plot(x=(germandata.test$response == "1"), pred =pred.glm.gtest.glm0)
AUC - Test
roc.logit.test$roc.vol[2]
## Area
## 1 0.8280788
Finding the optimal probability cutoff value - Asymmetric cost (5:1)
# predicting on train set
predict_logit_train <- predict(germandata.train.glm0, type="response")
# define a cost function: "obs" is the observed response, "pred.p" the
# predicted probability, and "pcut" the probability threshold
costfunc = function(obs, pred.p, pcut){
  weight1 = 5 # weight for "true=1 but pred=0" (FN)
  weight0 = 1 # weight for "true=0 but pred=1" (FP)
  c1 = (obs==1)&(pred.p<pcut)  # logical vector: "true=1 but pred=0" (FN)
  c0 = (obs==0)&(pred.p>=pcut) # logical vector: "true=0 but pred=1" (FP)
  cost = mean(weight1*c1 + weight0*c0) # weighted misclassification cost
  return(cost)
} # end of the function
# define a sequence from 0.01 to 1 by 0.01
p.seq = seq(0.01, 1, 0.01)
# loop over all candidate cutoffs to find the one with the smallest cost
# pre-allocate a vector to store the cost at each pcut
cost = rep(0, length(p.seq))
for(i in 1:length(p.seq)){
cost[i] = costfunc(obs = germandata.train$response, pred.p = predict_logit_train, pcut = p.seq[i])
} # end of the loop
optimal.pcut.asymmetric = p.seq[which(cost==min(cost))][1]
optimal.pcut.asymmetric
## [1] 0.17
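This agrees with the theoretical (Bayes) cutoff for a 5:1 cost ratio, weight0/(weight0 + weight1) = 1/6 ≈ 0.167; the grid lands on 0.17 only because it is restricted to multiples of 0.01.
# theoretical cutoff for 5:1 asymmetric cost
1 / (1 + 5)
## [1] 0.1666667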
Plotting the asymmetric misclassification cost vs the range of probability cutoffs
plot(p.seq, cost)
Defining a function to calculate the Asymmetric Misclassification Cost
# Asymmetric Misclassification Rate, using 5:1 asymmetric cost
# r - actual response
# pi - predicted class (0/1)
cost <- function(r, pi){
weight1 = 5
weight0 = 1
c1 = (r==1)&(pi==0) #logical vector - true if actual 1 but predict 0
c0 = (r==0)&(pi==1) #logical vector - true if actual 0 but predict 1
return(mean(weight1*c1+weight0*c0))
}
# pcut <- 1/6 ## Bayes estimate
pcut <- optimal.pcut.asymmetric
Train and Test Asymmetric Misclassification Cost
class.pred.train.glm0 <- (pred.glm.gtrain.glm0>pcut)*1
cost.train <- round(cost(r = germandata.train$response, pi = class.pred.train.glm0),2)
class.pred.test.glm0<- (pred.glm.gtest.glm0>pcut)*1
cost.test <- round(cost(r = germandata.test$response, pi = class.pred.test.glm0),2)
cat("total train cost:", cost.train, "| total test cost:", cost.test)
## total train cost: 0.47 | total test cost: 0.48
Building and plotting a Classification Tree using all variables, with a 5:1 loss matrix
set.seed(27)
germandata.largetree <- rpart(formula = response~., data = germandata.train,
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)))
prp(germandata.largetree, extra = 1, nn.font=40,box.palette = "green")
Plotting the complexity parameters for all possible number of splits
plotcp(germandata.largetree)
Printing the complexity parameters for all possible number of splits
printcp(germandata.largetree)
##
## Classification tree:
## rpart(formula = response ~ ., data = germandata.train, parms = list(loss = matrix(c(0,
## 5, 1, 0), nrow = 2)))
##
## Variables actually used in tree construction:
## [1] age chk_acct credit_his duration
## [5] installment_rate job other_debtor other_install
## [9] present_emp purpose saving_acct
##
## Root node error: 533/750 = 0.71067
##
## n= 750
##
## CP nsplit rel error xerror xstd
## 1 0.140713 0 1.00000 5.0000 0.11649
## 2 0.086304 1 0.85929 2.5966 0.12145
## 3 0.030957 2 0.77298 2.6735 0.12271
## 4 0.024390 4 0.71107 2.7205 0.12312
## 5 0.017824 5 0.68668 2.7280 0.12325
## 6 0.015009 7 0.65103 2.8011 0.12390
## 7 0.010000 15 0.51782 2.5535 0.12117
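Rather than reading the CP off the table by eye, the value with the smallest cross-validated error (xerror) can be extracted programmatically and passed to prune(); a sketch (note it may select a different CP than the one chosen by hand below):
# pick the CP row with minimum cross-validated error and prune directly
opt.cp <- germandata.largetree$cptable[which.min(germandata.largetree$cptable[, "xerror"]), "CP"]
german.prunedtree.alt <- prune(germandata.largetree, cp = opt.cp)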
Pruning the tree using optimal complexity parameter and then plotting the optimal tree
german.prunedtree <- rpart(response~., data = germandata.train, method = "class",
parms = list(loss = matrix(c(0, 5, 1, 0), nrow = 2)),cp=0.015009)
prp(german.prunedtree, extra = 1, nn.font=500,box.palette = "green")
Train and Test Predictions
pred.tree.gtrain <- predict(german.prunedtree, type = "prob")[,2]
pred.tree.gtest <- predict(german.prunedtree, newdata=germandata.test, type = "prob")[,2]
pred.train <- as.numeric(pred.tree.gtrain > optimal.pcut)
pred.test <- as.numeric(pred.tree.gtest > optimal.pcut)
confusion_matrix_train <- table(germandata.train$response, pred.train)
confusion_matrix_test <- table(germandata.test$response, pred.test)
misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)
## train misclassfication rate: 0.24 | test misclassfication rate: 0.28
confusion_matrix_train
## pred.train
## 0 1
## 0 489 44
## 1 139 78
confusion_matrix_test
## pred.test
## 0 1
## 0 153 14
## 1 56 27
ROC Curve - Train
par(mfrow=c(1,1))
roc.tree <- roc.plot(x=(germandata.train$response == "1"), pred = pred.tree.gtrain)
AUC - Train
roc.tree$roc.vol[2]
ROC Curve - Test
par(mfrow=c(1,1))
roc.tree.test <- roc.plot(x=(germandata.test$response == "1"), pred = pred.tree.gtest)
AUC - Test
roc.tree.test$roc.vol[2]
Train and Test Asymmetric Misclassification Cost
class.pred.train.tree <- (pred.tree.gtrain>pcut)*1
cost.train <- cost(r = germandata.train$response, pi = class.pred.train.tree)
class.pred.test.tree<- (pred.tree.gtest>pcut)*1
cost.test <- cost(r = germandata.test$response, pi = class.pred.test.tree)
cat("total train cost:", cost.train, "| total test cost:", cost.test)
## total train cost: 0.368 | total test cost: 0.684
Building a Generalized Additive Model
germandata.gam <- gam(as.factor(response) ~ chk_acct + s(duration) + credit_his +
    purpose + s(amount) + saving_acct + present_emp + installment_rate + sex +
    other_debtor + present_resid + property + s(age) + other_install + housing +
    n_credits + telephone + foreign, family = binomial, data = germandata.train)
summary(germandata.gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## as.factor(response) ~ chk_acct + s(duration) + credit_his + purpose +
## s(amount) + saving_acct + present_emp + installment_rate +
## sex + other_debtor + present_resid + property + s(age) +
## other_install + housing + n_credits + telephone + foreign
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.73499 0.97295 1.783 0.074550 .
## chk_acctA12 -0.39872 0.25495 -1.564 0.117836
## chk_acctA13 -1.25695 0.42700 -2.944 0.003243 **
## chk_acctA14 -1.64115 0.27004 -6.077 1.22e-09 ***
## credit_hisA31 0.67822 0.62572 1.084 0.278403
## credit_hisA32 -0.41406 0.48379 -0.856 0.392070
## credit_hisA33 -0.74087 0.52470 -1.412 0.157954
## credit_hisA34 -1.23285 0.49098 -2.511 0.012039 *
## purposeA41 -1.50127 0.42849 -3.504 0.000459 ***
## purposeA410 -1.77146 0.81426 -2.176 0.029589 *
## purposeA42 -0.77717 0.30986 -2.508 0.012138 *
## purposeA43 -0.80479 0.28289 -2.845 0.004443 **
## purposeA44 -0.59062 0.98369 -0.600 0.548233
## purposeA45 -0.14647 0.60786 -0.241 0.809590
## purposeA46 0.01392 0.46918 0.030 0.976330
## purposeA48 -2.05678 1.28478 -1.601 0.109404
## purposeA49 -0.58042 0.38068 -1.525 0.127331
## saving_acctA62 -0.76531 0.36447 -2.100 0.035750 *
## saving_acctA63 -0.34857 0.43022 -0.810 0.417823
## saving_acctA64 -0.91498 0.54794 -1.670 0.094946 .
## saving_acctA65 -0.79432 0.29608 -2.683 0.007301 **
## present_empA72 0.08586 0.43745 0.196 0.844392
## present_empA73 -0.06766 0.40001 -0.169 0.865688
## present_empA74 -0.78556 0.45429 -1.729 0.083770 .
## present_empA75 -0.42007 0.41898 -1.003 0.316051
## installment_rate 0.22439 0.10535 2.130 0.033175 *
## sexA92 -0.23545 0.44499 -0.529 0.596728
## sexA93 -0.50671 0.43443 -1.166 0.243463
## sexA94 -0.29413 0.53269 -0.552 0.580831
## other_debtorA102 0.76429 0.47913 1.595 0.110677
## other_debtorA103 -0.78319 0.46169 -1.696 0.089819 .
## present_resid -0.01038 0.09985 -0.104 0.917207
## propertyA122 0.21695 0.29202 0.743 0.457526
## propertyA123 0.20147 0.27349 0.737 0.461333
## propertyA124 0.80037 0.48109 1.664 0.096179 .
## other_installA142 -0.01103 0.49221 -0.022 0.982114
## other_installA143 -0.66704 0.27766 -2.402 0.016291 *
## housingA152 -0.46127 0.26904 -1.715 0.086434 .
## housingA153 -0.50695 0.53286 -0.951 0.341420
## n_credits 0.14560 0.22456 0.648 0.516749
## telephoneA192 -0.24913 0.22129 -1.126 0.260245
## foreignA202 -1.31608 0.64223 -2.049 0.040439 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(duration) 1.951 2.470 8.870 0.0249 *
## s(amount) 2.552 3.237 7.037 0.0767 .
## s(age) 1.000 1.000 0.027 0.8695
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.249 Deviance explained = 25.5%
## UBRE = 0.022901 Scale est. = 1 n = 750
Plotting the non-linear terms in Generalized Additive Model
plot(germandata.gam, shade=TRUE)
Moving age to the parametric (linear) part of the model, since s(age) has edf = 1.000 (i.e., it is effectively linear) and is far from significant
# Move age to the partially linear term and refit the gam() model
germandata.gam <- gam(as.factor(response) ~ chk_acct + s(duration) + credit_his +
    purpose + s(amount) + saving_acct + present_emp + installment_rate + sex +
    other_debtor + present_resid + property + (age) + other_install + housing +
    n_credits + telephone + foreign, family = binomial, data = germandata.train)
summary(germandata.gam)
##
## Family: binomial
## Link function: logit
##
## Formula:
## as.factor(response) ~ chk_acct + s(duration) + credit_his + purpose +
## s(amount) + saving_acct + present_emp + installment_rate +
## sex + other_debtor + present_resid + property + (age) + other_install +
## housing + n_credits + telephone + foreign
##
## Parametric coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.796336 1.037565 1.731 0.083398 .
## chk_acctA12 -0.398722 0.254950 -1.564 0.117836
## chk_acctA13 -1.256953 0.426997 -2.944 0.003243 **
## chk_acctA14 -1.641154 0.270044 -6.077 1.22e-09 ***
## credit_hisA31 0.678224 0.625717 1.084 0.278403
## credit_hisA32 -0.414061 0.483791 -0.856 0.392071
## credit_hisA33 -0.740865 0.524697 -1.412 0.157954
## credit_hisA34 -1.232850 0.490978 -2.511 0.012039 *
## purposeA41 -1.501274 0.428488 -3.504 0.000459 ***
## purposeA410 -1.771460 0.814261 -2.176 0.029589 *
## purposeA42 -0.777174 0.309864 -2.508 0.012138 *
## purposeA43 -0.804785 0.282887 -2.845 0.004443 **
## purposeA44 -0.590617 0.983691 -0.600 0.548234
## purposeA45 -0.146467 0.607864 -0.241 0.809592
## purposeA46 0.013921 0.469181 0.030 0.976329
## purposeA48 -2.056777 1.284780 -1.601 0.109404
## purposeA49 -0.580420 0.380676 -1.525 0.127331
## saving_acctA62 -0.765308 0.364474 -2.100 0.035750 *
## saving_acctA63 -0.348568 0.430223 -0.810 0.417823
## saving_acctA64 -0.914980 0.547936 -1.670 0.094946 .
## saving_acctA65 -0.794324 0.296082 -2.683 0.007301 **
## present_empA72 0.085861 0.437445 0.196 0.844393
## present_empA73 -0.067658 0.400010 -0.169 0.865687
## present_empA74 -0.785564 0.454288 -1.729 0.083770 .
## present_empA75 -0.420075 0.418982 -1.003 0.316049
## installment_rate 0.224385 0.105347 2.130 0.033175 *
## sexA92 -0.235448 0.444989 -0.529 0.596730
## sexA93 -0.506713 0.434434 -1.166 0.243463
## sexA94 -0.294132 0.532684 -0.552 0.580832
## other_debtorA102 0.764294 0.479134 1.595 0.110677
## other_debtorA103 -0.783190 0.461690 -1.696 0.089819 .
## present_resid -0.010380 0.099853 -0.104 0.917209
## propertyA122 0.216951 0.292022 0.743 0.457526
## propertyA123 0.201469 0.273493 0.737 0.461333
## propertyA124 0.800372 0.481088 1.664 0.096178 .
## age -0.001719 0.010456 -0.164 0.869392
## other_installA142 -0.011034 0.492214 -0.022 0.982115
## other_installA143 -0.667039 0.277662 -2.402 0.016291 *
## housingA152 -0.461272 0.269039 -1.715 0.086433 .
## housingA153 -0.506948 0.532864 -0.951 0.341419
## n_credits 0.145597 0.224560 0.648 0.516749
## telephoneA192 -0.249134 0.221292 -1.126 0.260244
## foreignA202 -1.316076 0.642226 -2.049 0.040438 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Approximate significance of smooth terms:
## edf Ref.df Chi.sq p-value
## s(duration) 1.951 2.470 8.870 0.0249 *
## s(amount) 2.552 3.237 7.037 0.0767 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## R-sq.(adj) = 0.249 Deviance explained = 25.5%
## UBRE = 0.0229 Scale est. = 1 n = 750
Plotting the non-linear terms in Generalized Additive Model
plot(germandata.gam, shade=TRUE)
Train and Test Predictions
pred.glm.gtrain.gam <- predict(germandata.gam, type = "response")
pred.glm.gtest.gam <- predict(germandata.gam, newdata=germandata.test,type = "response")
pred.train <- as.numeric(pred.glm.gtrain.gam > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.gam > optimal.pcut)
confusion_matrix_train <- table(germandata.train$response, pred.train)
confusion_matrix_test <- table(germandata.test$response, pred.test)
misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)
## train misclassfication rate: 0.21 | test misclassfication rate: 0.23
confusion_matrix_train
## pred.train
## 0 1
## 0 496 37
## 1 117 100
confusion_matrix_test
## pred.test
## 0 1
## 0 152 15
## 1 43 40
ROC Curve - Train
par(mfrow=c(1,1))
roc.gam <- roc.plot(x=(germandata.train$response == "1"), pred = pred.glm.gtrain.gam)
AUC - Train
roc.gam$roc.vol[2]
ROC Curve - Test
par(mfrow=c(1,1))
roc.gam.test <- roc.plot(x=(germandata.test$response == "1"), pred = pred.glm.gtest.gam)
AUC - Test
roc.gam.test$roc.vol[2]
Train and Test Asymmetric Misclassification Cost
class.pred.train.gam <- (pred.glm.gtrain.gam>pcut)*1
cost.train <- round(cost(r = germandata.train$response, pi = class.pred.train.gam),2)
class.pred.test.gam<- (pred.glm.gtest.gam>pcut)*1
cost.test <- round(cost(r = germandata.test$response, pi = class.pred.test.gam),2)
cat("total train cost:", cost.train, "| total test cost:", cost.test)
## total train cost: 0.49 | total test cost: 0.46
Building a Neural Network Model
par(mfrow=c(1,1))
germandata.nnet <- train(response~., data=germandata.train,method="nnet")
Summary Neural Net Model
print(germandata.nnet)
## Neural Network
##
## 750 samples
## 20 predictor
## 2 classes: '0', '1'
##
## No pre-processing
## Resampling: Bootstrapped (25 reps)
## Summary of sample sizes: 750, 750, 750, 750, 750, 750, ...
## Resampling results across tuning parameters:
##
## size decay Accuracy Kappa
## 1 0e+00 0.7037446 0.00000000
## 1 1e-04 0.7037446 0.00000000
## 1 1e-01 0.7081389 0.10623878
## 3 0e+00 0.7037446 0.00000000
## 3 1e-04 0.7037446 0.00000000
## 3 1e-01 0.7089005 0.24424263
## 5 0e+00 0.7037446 0.00000000
## 5 1e-04 0.7008505 0.01061292
## 5 1e-01 0.7069696 0.27008473
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were size = 3 and decay = 0.1.
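caret chose size and decay from its default grid with bootstrap resampling; a hedged sketch of controlling the search explicitly, with an illustrative tuning grid and 5-fold cross-validation (these are not the settings used above):
nnet.grid <- expand.grid(size = c(1, 3, 5), decay = c(0, 0.01, 0.1))
germandata.nnet.cv <- train(response ~ ., data = germandata.train,
    method = "nnet", tuneGrid = nnet.grid,
    trControl = trainControl(method = "cv", number = 5),
    trace = FALSE)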
Plotting Neural Net Model
plot(germandata.nnet)
plotnet(germandata.nnet$finalModel, y_names = "response")
title("Graphical Representation of our Neural Network")
Train and Test Predictions
pred.glm.gtrain.nn <- predict(germandata.nnet, type = "prob")[,2]
pred.glm.gtest.nn <- predict(germandata.nnet, newdata=germandata.test,type = "prob")[,2]
pred.train <- as.numeric(pred.glm.gtrain.nn > optimal.pcut)
pred.test <- as.numeric(pred.glm.gtest.nn > optimal.pcut)
confusion_matrix_train <- table(germandata.train$response, pred.train)
confusion_matrix_test <- table(germandata.test$response, pred.test)
misclassification_rate_train <- round((confusion_matrix_train[2]+confusion_matrix_train[3])/sum(confusion_matrix_train), 2)
misclassification_rate_test <- round((confusion_matrix_test[2]+confusion_matrix_test[3])/sum(confusion_matrix_test), 2)
cat("train misclassfication rate:", misclassification_rate_train, "| test misclassfication rate:", misclassification_rate_test)
## train misclassfication rate: 0.16 | test misclassfication rate: 0.26
confusion_matrix_train
## pred.train
## 0 1
## 0 495 38
## 1 82 135
confusion_matrix_test
## pred.test
## 0 1
## 0 142 25
## 1 41 42
ROC Curve - Train
par(mfrow=c(1,1))
roc.nn <- roc.plot(x=(germandata.train$response == "1"), pred = pred.glm.gtrain.nn)
AUC - Train
roc.nn$roc.vol[2]
ROC Curve - Test
par(mfrow=c(1,1))
roc.nn.test <- roc.plot(x=(germandata.test$response == "1"), pred = pred.glm.gtest.nn)
AUC - Test
roc.nn.test$roc.vol[2]
Train and Test Asymmetric Misclassification Cost
class.pred.train.nn <- (pred.glm.gtrain.nn>pcut)*1
cost.train <- round(cost(r = germandata.train$response, pi = class.pred.train.nn),2)
class.pred.test.nn<- (pred.glm.gtest.nn>pcut)*1
cost.test <- round(cost(r = germandata.test$response, pi = class.pred.test.nn),2)
cat("total train cost:", cost.train, "| total test cost:", cost.test)
## total train cost: 0.35 | total test cost: 0.56
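Collecting the asymmetric (5:1) costs printed above into one table makes the ranking in Major Findings explicit; the values below are copied from the outputs of the four models, not recomputed:
# asymmetric costs as reported in the sections above
results <- data.frame(
    model = c("Logistic Regression", "Classification Tree", "GAM", "Neural Network"),
    train.cost = c(0.47, 0.368, 0.49, 0.35),
    test.cost = c(0.48, 0.684, 0.46, 0.56))
results[order(results$test.cost), ]  # lower test cost = better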