rows <- sample(nrow(german_credit))
german_credit_randomized <- german_credit[rows, ]
split_german_credit <- round(nrow(german_credit_randomized)*0.75)
train_german_credit <- german_credit_randomized[1:split_german_credit, ]
test_german_credit <- german_credit_randomized[(split_german_credit + 1):nrow(german_credit_randomized), ]
For logit model, AIC came out to be 791.727. For probit model, AIC came out to be 790.26. And, for cloglog model, AIC came out to be 789.07. The significant parameters are similar for all the three link functions.
For logit model, BIC came out to be 1017.82 For probit model, AIC came out to be 1016.63 And, for cloglog model, AIC came out to be 1015.46 The significant parameters are similar for all the three link functions.
null_log <- glm(formula = response~1, family = "binomial", data = train_german_credit)
full_log <- glm(formula = response ~ . , family = "binomial",
data = train_german_credit)
summary(full_log)
##
## Call:
## glm(formula = response ~ ., family = "binomial", data = train_german_credit)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.0764 -0.7138 -0.3931 0.7203 2.4910
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.711e-01 1.233e+00 -0.625 0.53180
## chk_acctA12 -4.136e-01 2.548e-01 -1.623 0.10459
## chk_acctA13 -9.493e-01 4.092e-01 -2.320 0.02035 *
## chk_acctA14 -1.695e+00 2.626e-01 -6.454 1.09e-10 ***
## duration 2.576e-02 1.102e-02 2.338 0.01939 *
## credit_hisA31 4.822e-01 6.214e-01 0.776 0.43771
## credit_hisA32 -4.181e-01 4.735e-01 -0.883 0.37721
## credit_hisA33 -1.099e+00 5.431e-01 -2.024 0.04298 *
## credit_hisA34 -1.440e+00 4.831e-01 -2.980 0.00288 **
## purposeA41 -2.021e+00 4.548e-01 -4.444 8.82e-06 ***
## purposeA410 -1.912e+00 1.011e+00 -1.891 0.05859 .
## purposeA42 -7.281e-01 2.949e-01 -2.469 0.01353 *
## purposeA43 -6.425e-01 2.774e-01 -2.316 0.02053 *
## purposeA44 -7.444e-02 8.880e-01 -0.084 0.93319
## purposeA45 -6.122e-02 6.009e-01 -0.102 0.91886
## purposeA46 3.119e-01 4.478e-01 0.696 0.48615
## purposeA48 -2.125e+00 1.224e+00 -1.736 0.08264 .
## purposeA49 -6.586e-01 3.869e-01 -1.702 0.08875 .
## amount 1.141e-04 5.274e-05 2.163 0.03050 *
## saving_acctA62 1.497e-02 3.298e-01 0.045 0.96380
## saving_acctA63 -2.506e-01 4.375e-01 -0.573 0.56671
## saving_acctA64 -7.828e-01 5.740e-01 -1.364 0.17266
## saving_acctA65 -5.563e-01 2.973e-01 -1.871 0.06134 .
## present_empA72 -3.733e-01 5.062e-01 -0.737 0.46089
## present_empA73 -3.436e-01 4.819e-01 -0.713 0.47578
## present_empA74 -1.129e+00 5.255e-01 -2.147 0.03176 *
## present_empA75 -6.814e-01 5.007e-01 -1.361 0.17356
## installment_rate 2.908e-01 9.883e-02 2.942 0.00326 **
## sexA92 -1.267e-01 4.368e-01 -0.290 0.77182
## sexA93 -6.055e-01 4.342e-01 -1.394 0.16319
## sexA94 2.986e-02 5.096e-01 0.059 0.95328
## other_debtorA102 8.926e-01 4.749e-01 1.880 0.06017 .
## other_debtorA103 -1.148e+00 5.004e-01 -2.294 0.02180 *
## present_resid -2.243e-03 9.911e-02 -0.023 0.98194
## propertyA122 4.166e-01 2.878e-01 1.447 0.14784
## propertyA123 3.315e-01 2.706e-01 1.225 0.22049
## propertyA124 5.784e-01 4.672e-01 1.238 0.21567
## age -3.334e-03 1.020e-02 -0.327 0.74387
## other_installA142 8.858e-02 4.813e-01 0.184 0.85398
## other_installA143 -5.101e-01 2.770e-01 -1.842 0.06553 .
## housingA152 -4.642e-01 2.744e-01 -1.692 0.09070 .
## housingA153 -4.187e-01 5.361e-01 -0.781 0.43485
## n_credits 4.202e-01 2.145e-01 1.959 0.05010 .
## jobA172 7.151e-01 7.902e-01 0.905 0.36546
## jobA173 8.205e-01 7.662e-01 1.071 0.28427
## jobA174 8.639e-01 7.846e-01 1.101 0.27089
## n_people 4.322e-01 2.843e-01 1.520 0.12840
## telephoneA192 -3.449e-01 2.258e-01 -1.528 0.12658
## foreignA202 -1.214e+00 6.325e-01 -1.920 0.05486 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 932.61 on 749 degrees of freedom
## Residual deviance: 693.44 on 701 degrees of freedom
## AIC: 791.44
##
## Number of Fisher Scoring iterations: 5
AIC(full_log)
## [1] 791.4415
BIC(full_log)
## [1] 1017.825
stepwise_AICmodel <- step(full_log,direction = "both")
stepwise_BICmodel <- step(full_log,direction = "both", k = log(nrow(train_german_credit)))
Variables selected after applying Stepwise AIC are chk_acct, duration, credit_his, purpose, amount, present_emp, installment_rate, sex, other_debtor, other_install, n_credits, n_people and foreign
AIC is 778.2989039
Variables selected after applying Stepwise AIC are chk_acct and duration
BIC is 852.5304001
dummy <- model.matrix(~., data = german_credit) #converting data to numeric matrix
dummy <- dummy[,-1]
nrow(german_credit)
## [1] 1000
rows_dummy <- sample(nrow(dummy))
dummy_randomized <- dummy[rows_dummy, ]
split_dummy <- round(nrow(dummy_randomized)*0.75)
train_dummy <- dummy_randomized[1:split_dummy, ]
test_dummy <- dummy_randomized[(split_dummy + 1):nrow(dummy_randomized), ]
credit_lasso <- cv.glmnet(x = as.matrix(train_dummy[,-49]), y = train_dummy[,49],
family = "binomial", type.measure = "class", alpha = 1)
plot(credit_lasso)
coef(credit_lasso, credit_lasso$lambda.min)
## 49 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -0.5613264252
## chk_acctA12 -0.1420174360
## chk_acctA13 -0.6622696525
## chk_acctA14 -1.4872929361
## duration 0.0206773121
## credit_hisA31 0.5067554283
## credit_hisA32 -0.1705706879
## credit_hisA33 -0.3736114895
## credit_hisA34 -1.2091923212
## purposeA41 -1.0111826105
## purposeA410 -0.8213943124
## purposeA42 -0.4842731717
## purposeA43 -0.6815762862
## purposeA44 0.0020436481
## purposeA45 .
## purposeA46 0.0819945789
## purposeA48 -1.0366689351
## purposeA49 -0.2324811942
## amount 0.0001184591
## saving_acctA62 -0.5223704470
## saving_acctA63 -0.6220660805
## saving_acctA64 -1.0229609816
## saving_acctA65 -0.7526951004
## present_empA72 0.2154133169
## present_empA73 .
## present_empA74 -0.3740548135
## present_empA75 .
## installment_rate 0.2376622322
## sexA92 .
## sexA93 -0.4153892160
## sexA94 0.0078418519
## other_debtorA102 0.3058138572
## other_debtorA103 -0.8905784569
## present_resid .
## propertyA122 0.1741980172
## propertyA123 0.0828950145
## propertyA124 0.4542421897
## age -0.0103108439
## other_installA142 .
## other_installA143 -0.0786072726
## housingA152 -0.2152687022
## housingA153 -0.3380414897
## n_credits 0.2165851270
## jobA172 .
## jobA173 0.0941302125
## jobA174 .
## n_people 0.1511730774
## telephoneA192 -0.1738092064
## foreignA202 -0.7650523339
Variables selected for in sample prediction are based on Stepwise AIC model.
The logistic model has been fitted based on the variable selected from the above the step, The ROC curve is plotted and the area under the curve was found to be 0.8182. The misclassification rate (false negative rate) is calculated to be 0.221.
best_insample <- glm(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor + other_install+ foreign + n_credits + n_people, family = "binomial", data = train_german_credit)
insample_pred <- predict(best_insample,
type = "response")
insample_ROC <- roc(train_german_credit$response,insample_pred)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(insample_ROC, col = "blue")
auc(insample_ROC)
## Area under the curve: 0.8182
insamplecutoff_prob <- coords(insample_ROC,"best",ret = "threshold")
## Warning in coords.roc(insample_ROC, "best", ret = "threshold"): An
## upcoming version of pROC will set the 'transpose' argument to FALSE by
## default. Set transpose = TRUE explicitly to keep the current behavior,
## or transpose = FALSE to adopt the new one and silence this warning. Type
## help(coords_transpose) for additional information.
insample_class <- ifelse(insample_pred > insamplecutoff_prob, 1, 0)
table(insample_class,train_german_credit$response,
dnn = list("predicted", "actual"))
## actual
## predicted 0 1
## 0 359 52
## 1 156 183
The ROC curve is plotted and the area under the curve was found to be 0.138 The misclassification rate (false negative rate) is calculated to be
best_log <- glm(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor + other_install+ foreign + n_credits + n_people, family = "binomial", data = train_german_credit)
train_prob <- predict(best_log, type = "response")
german_credit_prob <- predict(best_log, newdata = test_german_credit[,-21], type = "response")
ROC <- roc(test_german_credit$response,german_credit_prob)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(ROC, col = "blue")
auc(ROC)
## Area under the curve: 0.8124
cutoff_prob <- coords(ROC,"best",ret = "threshold")
## Warning in coords.roc(ROC, "best", ret = "threshold"): An upcoming
## version of pROC will set the 'transpose' argument to FALSE by default.
## Set transpose = TRUE explicitly to keep the current behavior, or
## transpose = FALSE to adopt the new one and silence this warning. Type
## help(coords_transpose) for additional information.
german_credit_pred <- ifelse(german_credit_prob > cutoff_prob, 1, 0)
table(german_credit_pred,test_german_credit$response,
dnn = list("predicted", "actual"))
## actual
## predicted 0 1
## 0 123 9
## 1 62 56
The asymmetric cost function has been used to calculate the optimal cut off rate. After using the optimal cut off value, the misclassification rate came down to 0.138 from 0.061
costfunc <- function(obs, pred.p, pcut) {
weight1 <- 5 # define the weight for "true=1 but pred=0" (FN)
weight0 <- 1 # define the weight for "true=0 but pred=1" (FP)
c1 <- (obs == 1) & (pred.p < pcut) # count for "true=1 but pred=0" (FN)
c0 <- (obs == 0) & (pred.p >= pcut) # count for "true=0 but pred=1" (FP)
cost <- mean(weight1 * c1 + weight0 * c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
p.seq <- seq(0.01, 1, 0.01)
cost <- rep(0, length(p.seq))
for (i in 1:length(p.seq)) {
cost[i] = costfunc(obs = train_german_credit$response,
pred.p = train_prob, pcut = p.seq[i])
}
plot(p.seq, cost)
optimal.pcut.glm0 <- p.seq[which(cost == min(cost))]
german_credit_pred <- ifelse(german_credit_prob > optimal.pcut.glm0, 1, 0)
table(german_credit_pred,test_german_credit$response,
dnn = list("predicted", "actual"))
## actual
## predicted 0 1
## 0 95 4
## 1 90 61
Based on the cross validation function, the misclassification rate is found out to be 0.535.
pcut <- optimal.pcut.glm0
costfunc2 <- function(obs, pred.p){
weight1 <- 5 # define the weight for "true=1 but pred=0" (FN)
weight0 <- 1 # define the weight for "true=0 but pred=1" (FP)
c1 <- (obs == 1) & (pred.p < pcut) # count for "true=1 but pred=0" (FN)
c0 <- (obs == 0) & (pred.p >= pcut) # count for "true=0 but pred=1" (FP)
cost <- mean(weight1 * c1 + weight0 * c0) # misclassification with weight
return(cost) # you have to return to a value when you write R functions
}
credit_glm1 <- glm(response~. , family = binomial, data = german_credit)
cv.result <- cv.glm(data = german_credit, glmfit = credit_glm1, cost = costfunc2, K = 4)
cv.result$delta[2]
## [1] 0.535
The misclassification rate (false negative rate) is 0.076 and is higher than that of the logistic regression model.
tree_model <- rpart(formula = response ~ chk_acct + duration + credit_his + purpose + amount + present_emp + installment_rate + sex + other_debtor + other_install+ foreign + n_credits + n_people, data = train_german_credit, method = "class", parms = list(loss = matrix(c(0,5,1,0), nrow = 2)))
prp(tree_model, extra = 1)
tree_predict <- predict(tree_model, test_german_credit[,-21], type = "class")
table(test_german_credit$response, tree_predict, dnn = c("Actual","Predicted"))
## Predicted
## Actual 0 1
## 0 82 103
## 1 5 60
The misclassification rate is 0.061 and is same as that of the logistic regression model.
credit_rpart <- rpart(formula = response ~ . , data = train_german_credit, method = "class",
parms = list(loss = matrix(c(0,5,1,0), nrow = 2)), cp = 0.0001)
plotcp(credit_rpart)
prp(prune(credit_rpart, cp = 0.012)) # Pruning the tree
credit_rpartbest <- rpart(formula = response ~ . , data = train_german_credit, method = "class",
parms = list(loss = matrix(c(0,5,1,0), nrow = 2)), cp = 0.012)
prp(credit_rpartbest, extra = 1)
credit_testpredict <- predict(credit_rpartbest, test_german_credit[,-21], type = "class")
table(test_german_credit$response, credit_testpredict, dnn = c("Actual","Predicted"))
## Predicted
## Actual 0 1
## 0 83 102
## 1 4 61