EVALUATE THE MODEL: using AUC (area under the ROC curve) for binary classification models on the validation data.
IF we dont’t have a test set with the default, we can’t see how accurate it is
#Import the data
credit_training <- read.csv('AT2_credit_train_STUDENT.csv', header = TRUE)
credit_test <- read.csv('AT2_credit_test_STUDENT.csv', header = TRUE)
summary(credit_training)
## ID LIMIT_BAL SEX EDUCATION
## Min. : 1 Min. : -99 1 : 9244 Min. :0.000
## 1st Qu.: 7489 1st Qu.: 50000 2 :13854 1st Qu.:1.000
## Median :14987 Median : 140000 cat : 1 Median :2.000
## Mean :14981 Mean : 167524 dog : 1 Mean :1.853
## 3rd Qu.:22452 3rd Qu.: 240000 dolphin: 1 3rd Qu.:2.000
## Max. :30000 Max. :1000000 Max. :6.000
## MARRIAGE AGE PAY_PC1 PAY_PC2
## Min. :0.000 Min. : 21.0 Min. :-11.859675 Min. :-4.42243
## 1st Qu.:1.000 1st Qu.: 28.0 1st Qu.: -0.393308 1st Qu.:-0.23617
## Median :2.000 Median : 34.0 Median : -0.393308 Median : 0.17555
## Mean :1.553 Mean : 35.7 Mean : -0.001656 Mean :-0.00177
## 3rd Qu.:2.000 3rd Qu.: 41.0 3rd Qu.: 1.360047 3rd Qu.: 0.36112
## Max. :3.000 Max. :141.0 Max. : 3.813348 Max. : 5.44103
## PAY_PC3 AMT_PC1 AMT_PC2
## Min. :-3.864638 Min. :-3.41080 Min. :-4.71769
## 1st Qu.:-0.283941 1st Qu.:-1.50827 1st Qu.:-0.42961
## Median : 0.004886 Median :-0.86433 Median :-0.20780
## Mean : 0.000652 Mean : 0.00461 Mean : 0.00137
## 3rd Qu.: 0.093942 3rd Qu.: 0.49766 3rd Qu.: 0.09062
## Max. : 3.364030 Max. :37.49240 Max. :83.52137
## AMT_PC3 AMT_PC4 AMT_PC5
## Min. :-38.46500 Min. :-21.593416 Min. :-42.37665
## 1st Qu.: -0.13710 1st Qu.: -0.068199 1st Qu.: -0.08239
## Median : -0.07044 Median : 0.018389 Median : -0.03200
## Mean : 0.00383 Mean : 0.004618 Mean : 0.00148
## 3rd Qu.: 0.00325 3rd Qu.: 0.083236 3rd Qu.: 0.02644
## Max. : 21.98483 Max. : 21.823749 Max. : 17.43097
## AMT_PC6 AMT_PC7 default
## Min. :-38.88504 Min. :-41.71546 N:17518
## 1st Qu.: -0.04241 1st Qu.: -0.09273 Y: 5583
## Median : -0.00216 Median : -0.04099
## Mean : -0.00202 Mean : -0.00409
## 3rd Qu.: 0.06754 3rd Qu.: 0.03157
## Max. : 20.22670 Max. : 22.92727
str(credit_training)
## 'data.frame': 23101 obs. of 17 variables:
## $ ID : int 1 2 3 4 6 7 8 9 10 11 ...
## $ LIMIT_BAL: num 20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
## $ SEX : Factor w/ 5 levels "1","2","cat",..: 2 2 2 2 1 1 2 2 1 2 ...
## $ EDUCATION: int 2 2 2 2 1 1 2 3 3 3 ...
## $ MARRIAGE : int 1 2 2 1 2 2 2 1 2 2 ...
## $ AGE : int 24 26 34 37 37 29 23 28 35 34 ...
## $ PAY_PC1 : num 0.477 -1.462 -0.393 -0.393 -0.393 ...
## $ PAY_PC2 : num -3.225 0.854 0.176 0.176 0.176 ...
## $ PAY_PC3 : num 0.14504 -0.36086 0.00489 0.00489 0.00489 ...
## $ AMT_PC1 : num -1.752 -1.663 -1.135 -0.397 -0.393 ...
## $ AMT_PC2 : num -0.224 -0.144 -0.177 -0.451 -0.5 ...
## $ AMT_PC3 : num -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
## $ AMT_PC4 : num 0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
## $ AMT_PC5 : num -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
## $ AMT_PC6 : num 0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
## $ AMT_PC7 : num -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
## $ default : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...
str(credit_test)
## 'data.frame': 6899 obs. of 16 variables:
## $ ID : int 5 17 19 23 27 30 32 34 37 38 ...
## $ LIMIT_BAL: int 50000 20000 360000 70000 60000 50000 50000 500000 280000 60000 ...
## $ SEX : int 1 1 2 2 1 1 1 2 1 2 ...
## $ EDUCATION: int 2 1 1 2 1 1 2 2 2 2 ...
## $ MARRIAGE : int 1 2 1 2 2 2 2 1 1 2 ...
## $ AGE : int 57 24 49 26 27 26 33 54 40 22 ...
## $ PAY_PC1 : num 0.273 -3.294 2.876 -3.211 1.425 ...
## $ PAY_PC2 : num 0.847 1.835 -1.4 0.831 -0.57 ...
## $ PAY_PC3 : num -0.0458 -0.2383 1.297 1.7682 1.1754 ...
## $ AMT_PC1 : num -0.793 -1.117 -1.797 -0.148 -1.785 ...
## $ AMT_PC2 : num 0.864 -0.249 -0.22 -0.431 -0.169 ...
## $ AMT_PC3 : num -0.64 -0.0889 -0.0704 -0.1271 -0.0622 ...
## $ AMT_PC4 : num 0.21823 0.02798 0.01867 0.08418 -0.00615 ...
## $ AMT_PC5 : num -0.46541 -0.10028 -0.032 0.04805 -0.00469 ...
## $ AMT_PC6 : num 0.30533 -0.04551 -0.00997 0.1519 0.01235 ...
## $ AMT_PC7 : num -1.0232 0.1004 -0.0433 -0.1224 -0.0813 ...
#need to change ID to be "character", so the IDs are not counted as numbers
clean_data <- function(dataSet) {
#clean up sex, call it new_sex and change the animal classifications to 0
output <- dataSet
output$ID <- as.character(output$ID)
output$SEX <- as.integer(output$SEX)
output$SEX[output$SEX > 2] <- 0
#clean age to remove aged over 100 entries will become NA
output$AGE <- ifelse(output$AGE >=100, NA, output$AGE)
#removed these factors from modelling to make them integers
#output$SEX <- as.factor(output$SEX)
#output$EDUCATION <- as.factor(output$EDUCATION)
#output$MARRIAGE <- as.factor(output$MARRIAGE)
return(output)
}
credit_train_clean <- clean_data(credit_training)
credit_test_clean <- clean_data(credit_test)
str(credit_train_clean)
## 'data.frame': 23101 obs. of 17 variables:
## $ ID : chr "1" "2" "3" "4" ...
## $ LIMIT_BAL: num 20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
## $ SEX : num 2 2 2 2 1 1 2 2 1 2 ...
## $ EDUCATION: int 2 2 2 2 1 1 2 3 3 3 ...
## $ MARRIAGE : int 1 2 2 1 2 2 2 1 2 2 ...
## $ AGE : int 24 26 34 37 37 29 23 28 35 34 ...
## $ PAY_PC1 : num 0.477 -1.462 -0.393 -0.393 -0.393 ...
## $ PAY_PC2 : num -3.225 0.854 0.176 0.176 0.176 ...
## $ PAY_PC3 : num 0.14504 -0.36086 0.00489 0.00489 0.00489 ...
## $ AMT_PC1 : num -1.752 -1.663 -1.135 -0.397 -0.393 ...
## $ AMT_PC2 : num -0.224 -0.144 -0.177 -0.451 -0.5 ...
## $ AMT_PC3 : num -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
## $ AMT_PC4 : num 0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
## $ AMT_PC5 : num -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
## $ AMT_PC6 : num 0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
## $ AMT_PC7 : num -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
## $ default : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...
str(credit_test_clean)
## 'data.frame': 6899 obs. of 16 variables:
## $ ID : chr "5" "17" "19" "23" ...
## $ LIMIT_BAL: int 50000 20000 360000 70000 60000 50000 50000 500000 280000 60000 ...
## $ SEX : num 1 1 2 2 1 1 1 2 1 2 ...
## $ EDUCATION: int 2 1 1 2 1 1 2 2 2 2 ...
## $ MARRIAGE : int 1 2 1 2 2 2 2 1 1 2 ...
## $ AGE : int 57 24 49 26 27 26 33 54 40 22 ...
## $ PAY_PC1 : num 0.273 -3.294 2.876 -3.211 1.425 ...
## $ PAY_PC2 : num 0.847 1.835 -1.4 0.831 -0.57 ...
## $ PAY_PC3 : num -0.0458 -0.2383 1.297 1.7682 1.1754 ...
## $ AMT_PC1 : num -0.793 -1.117 -1.797 -0.148 -1.785 ...
## $ AMT_PC2 : num 0.864 -0.249 -0.22 -0.431 -0.169 ...
## $ AMT_PC3 : num -0.64 -0.0889 -0.0704 -0.1271 -0.0622 ...
## $ AMT_PC4 : num 0.21823 0.02798 0.01867 0.08418 -0.00615 ...
## $ AMT_PC5 : num -0.46541 -0.10028 -0.032 0.04805 -0.00469 ...
## $ AMT_PC6 : num 0.30533 -0.04551 -0.00997 0.1519 0.01235 ...
## $ AMT_PC7 : num -1.0232 0.1004 -0.0433 -0.1224 -0.0813 ...
#note test data has no default, so we need to do a split of our train data
Run a linear model with the training data. First, split the training data into a test and train (as the test data supplied has no default)
# Description
# -----------
# Split data into random train and test sets, then add them back to a single dataframe.
# A new column called 'is_train' indicats whether a row belongs to the training set (1) or test set (0)
# Parameters
# ----------
# dataSet (dataframe) - data frame of data with test and train split
# Returns - a data frame
train_test_set <- function(dataSet, split) {
trainset_size = floor(split * nrow(dataSet))
set.seed(333)
#this is via randomly picking observations using the sample function
trainset_indices = sample(seq_len(nrow(dataSet)), size = trainset_size)
#assign observations to training and testing sets
trainset = dataSet[trainset_indices, ]
testset = dataSet[-trainset_indices, ]
#Add a column to each data frame called 'is_train'. for training set, set it to 1, for test set, set it to 0.
trainset$is_train = 1
testset$is_train = 0
#Bind the 2 data frames back together
output = rbind(trainset,testset)
return(output)
}
# done an 80:20 split on our training data (80% is train values)
credit_train_clean_for_GLM <- train_test_set(credit_train_clean, 0.8)
ONly want the stuff that is an ‘is train’ value of 1 - because I only want the training data of the trainset
glm_test_set <- filter(credit_train_clean_for_GLM, is_train==0)
glm_train_set <- filter(credit_train_clean_for_GLM, is_train==1)
#We run a linear regression model on our training set
glm.model = glm(formula = default ~ ., family = binomial, data = glm_train_set[,-1])
summary(glm.model)
##
## Call:
## glm(formula = default ~ ., family = binomial, data = glm_train_set[,
## -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9158 -0.7519 -0.5659 -0.2404 3.0060
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.490e+00 1.456e-01 -10.228 < 2e-16 ***
## LIMIT_BAL -1.871e-06 2.005e-07 -9.332 < 2e-16 ***
## SEX -1.494e-02 3.775e-02 -0.396 0.6923
## EDUCATION 1.849e-01 2.412e-02 7.664 1.81e-14 ***
## MARRIAGE -6.856e-02 3.856e-02 -1.778 0.0754 .
## AGE 7.634e-03 2.156e-03 3.540 0.0004 ***
## PAY_PC1 -2.961e-01 1.056e-02 -28.035 < 2e-16 ***
## PAY_PC2 -3.559e-01 2.121e-02 -16.780 < 2e-16 ***
## PAY_PC3 2.797e-01 2.710e-02 10.322 < 2e-16 ***
## AMT_PC1 -6.250e-02 1.055e-02 -5.923 3.16e-09 ***
## AMT_PC2 -1.430e-01 2.880e-02 -4.967 6.81e-07 ***
## AMT_PC3 8.151e-03 3.110e-02 0.262 0.7933
## AMT_PC4 4.648e-03 2.842e-02 0.164 0.8701
## AMT_PC5 5.358e-02 3.233e-02 1.657 0.0974 .
## AMT_PC6 -6.796e-02 2.940e-02 -2.312 0.0208 *
## AMT_PC7 -5.397e-03 3.907e-02 -0.138 0.8902
## is_train NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20445 on 18437 degrees of freedom
## Residual deviance: 18147 on 18422 degrees of freedom
## (42 observations deleted due to missingness)
## AIC: 18179
##
## Number of Fisher Scoring iterations: 5
Summary says most important predictors are Limit balance, Education, Age, Pay and Amt variables. Analyse the predictors and determine which ones are the valid ones to keep (feature selection).
# probabilities on our test data that's been split from our training IE not the provided test
probabilities <- predict.glm(glm.model,glm_test_set[,-1],type="response")
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading
# Create a vector to hold predictions
predictions <- rep("N",nrow(glm_test_set[,-1]))
predictions[probabilities >.5] <- "Y" #turned it from a probability to a prediction
#Create a confusion matrix
glm_confusion_matrix <- table(pred=predictions,true=glm_test_set$default)
glm_confusion_matrix
## true
## pred N Y
## N 3418 843
## Y 108 252
# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
accuracy = (tp+tn)/(tp+tn+fp+fn)
precision = tp/(tp+fp)
recall = tp/(tp+fn)
F1 = 2 * ((precision * recall)/(precision + recall))
output = data.frame(name, accuracy, precision, recall, F1)
return(output)
}
#converted CM to dataframe extracted the frequency values for each score
glm_confusion_matrix_df <- as.data.frame(glm_confusion_matrix)
glm_tn <- glm_confusion_matrix_df$Freq[1]
glm_fp <- glm_confusion_matrix_df$Freq[2]
glm_fn <- glm_confusion_matrix_df$Freq[3]
glm_tp <- glm_confusion_matrix_df$Freq[4]
glm_evaluation_measures <- get_evaluation_measures("model_train", glm_tn, glm_fp, glm_fn, glm_tp)
#glm_evaluation_measures now holds a dataframe showing accuracy, precision, recall and F1
glm_evaluation_measures
## name accuracy precision recall F1
## 1 model_train 0.7942004 0.7 0.230137 0.3463918
#Now to get AUC. We'll do it again further on in our analysis, so write a function
get_auc <- function(probabilities, targets) {
probs = as.vector(probabilities)
pred = prediction(probs,targets)
perf_AUC = performance(pred, "auc")
AUC = perf_AUC@y.values[[1]]
return(AUC)
}
glm_evaluation_measures$AUC <- get_auc(probabilities, glm_test_set$default)
glm_evaluation_measures
## name accuracy precision recall F1 AUC
## 1 model_train 0.7942004 0.7 0.230137 0.3463918 0.7279244
#Now we want to try LASSO to perform grid search to find optimal value of lambda IE regularize the model
#removing NAs
credit_train_clean_for_lasso = credit_train_clean[complete.cases(credit_train_clean), ]
# done an 80:20 split on our training data (80% is train values)
credit_train_clean_for_lasso <- train_test_set(credit_train_clean_for_lasso, 0.8)
View(credit_train_clean_for_lasso)
#create test and train set dataframes
lrm_testset <- filter(credit_train_clean_for_lasso, is_train==0)
lrm_trainset <- filter(credit_train_clean_for_lasso, is_train==1)
#convert training data to matrix format
lrm_x <- model.matrix(default~., lrm_trainset[,-1])
lrm_y <- lrm_trainset$default
#family= binomial => logistic regression, alpha=1 => lasso
lrm_cv.out <- cv.glmnet(lrm_x, lrm_y, alpha=1, family="binomial", type.measure="auc")
plot(lrm_cv.out)
#Using lambda of 1se rather than the minimum lambda, see what predictors are discarded
#min value of lambda
lrm_lambda_min <- lrm_cv.out$lambda.min
#best value of lambda
lrm_lambda_1se <- lrm_cv.out$lambda.1se
#regression coefficients
coef(lrm_cv.out,s=lrm_lambda_1se)
## 18 x 1 sparse Matrix of class "dgCMatrix"
## 1
## (Intercept) -1.270966e+00
## (Intercept) .
## LIMIT_BAL -1.806116e-06
## SEX .
## EDUCATION 1.650089e-01
## MARRIAGE .
## AGE .
## PAY_PC1 -2.499713e-01
## PAY_PC2 -3.043219e-01
## PAY_PC3 1.735162e-01
## AMT_PC1 -1.161302e-02
## AMT_PC2 -6.643125e-03
## AMT_PC3 .
## AMT_PC4 .
## AMT_PC5 .
## AMT_PC6 .
## AMT_PC7 .
## is_train .
#Convert test data to a model matrix
lrm_x_test <- model.matrix(default~.,lrm_testset[,-1])
#Get prediction probabilities
lasso_prob <- predict(lrm_cv.out, newx = lrm_x_test, s=lrm_lambda_1se, type="response")
#translate probabilities to predictions
lasso_predict <- rep("N",nrow(lrm_testset))
lasso_predict[lasso_prob>.5] <- "Y"
#confusion matrix
lasso_confusion_matrix <- table(pred=lasso_predict,true=lrm_testset$default)
lasso_confusion_matrix
## true
## pred N Y
## N 3399 947
## Y 78 187
#Convert the confusion matrix to a data frame
lasso_confusion_matrix_df <- as.data.frame(lasso_confusion_matrix)
lasso_evaluation_measures <- get_evaluation_measures("Lasso",
lasso_confusion_matrix_df$Freq[1],
lasso_confusion_matrix_df$Freq[2],
lasso_confusion_matrix_df$Freq[3],
lasso_confusion_matrix_df$Freq[4])
#Get the AUC and add it to our evaluation measures data frame
lasso_evaluation_measures$AUC <- get_auc(lasso_prob, lrm_testset$default)
lasso_evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Lasso 0.7777055 0.7056604 0.164903 0.2673338 0.7149422
evaluation_measures <- rbind(lasso_evaluation_measures, glm_evaluation_measures)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Lasso 0.7777055 0.7056604 0.164903 0.2673338 0.7149422
## 2 model_train 0.7942004 0.7000000 0.230137 0.3463918 0.7279244
Try another glm with different variables
#We run a linear regression model selecting new features
glm.model2 = glm(formula = default ~ LIMIT_BAL+EDUCATION+AGE+PAY_PC1+PAY_PC2+PAY_PC3+AMT_PC1+AMT_PC2, family = binomial, data = glm_train_set[,-1])
summary(glm.model2)
##
## Call:
## glm(formula = default ~ LIMIT_BAL + EDUCATION + AGE + PAY_PC1 +
## PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2, family = binomial,
## data = glm_train_set[, -1])
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9221 -0.7515 -0.5659 -0.2470 2.9572
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.678e+00 8.429e-02 -19.909 < 2e-16 ***
## LIMIT_BAL -1.841e-06 1.991e-07 -9.247 < 2e-16 ***
## EDUCATION 1.880e-01 2.397e-02 7.842 4.42e-15 ***
## AGE 9.054e-03 1.992e-03 4.546 5.48e-06 ***
## PAY_PC1 -2.974e-01 1.053e-02 -28.234 < 2e-16 ***
## PAY_PC2 -3.584e-01 2.108e-02 -17.007 < 2e-16 ***
## PAY_PC3 2.825e-01 2.698e-02 10.469 < 2e-16 ***
## AMT_PC1 -6.045e-02 1.041e-02 -5.810 6.27e-09 ***
## AMT_PC2 -1.275e-01 2.666e-02 -4.784 1.72e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 20445 on 18437 degrees of freedom
## Residual deviance: 18158 on 18429 degrees of freedom
## (42 observations deleted due to missingness)
## AIC: 18176
##
## Number of Fisher Scoring iterations: 5
# probabilities on our test data that's been split from our training IE not the provided test
probabilities2 <- predict.glm(glm.model2,glm_test_set[,-1],type="response")
# Create a vector to hold predictions
predictions2 <- rep("N",nrow(glm_test_set[,-1]))
predictions2[probabilities2 >.5] <- "Y" #turned it from a probability to a prediction
#Create a confusion matrix
glm_confusion_matrix2 <- table(pred=predictions2,true=glm_test_set$default)
glm_confusion_matrix2
## true
## pred N Y
## N 3418 841
## Y 108 254
#converted CM to dataframe extracted the frequency values for each score
glm_confusion_matrix2_df <- as.data.frame(glm_confusion_matrix2)
glm2_tn <- glm_confusion_matrix2_df$Freq[1]
glm2_fp <- glm_confusion_matrix2_df$Freq[2]
glm2_fn <- glm_confusion_matrix2_df$Freq[3]
glm2_tp <- glm_confusion_matrix2_df$Freq[4]
glm_evaluation_measures2 <- get_evaluation_measures("model_train2", glm2_tn, glm2_fp, glm2_fn, glm2_tp)
#glm_evaluation_measures now holds a dataframe showing accuracy, precision, recall and F1
glm_evaluation_measures2
## name accuracy precision recall F1
## 1 model_train2 0.7946332 0.7016575 0.2319635 0.3486616
glm_evaluation_measures2$AUC <- get_auc(probabilities2, glm_test_set$default)
glm_evaluation_measures2
## name accuracy precision recall F1 AUC
## 1 model_train2 0.7946332 0.7016575 0.2319635 0.3486616 0.7274563
evaluation_measures <- rbind(evaluation_measures, glm_evaluation_measures2)
evaluation_measures
## name accuracy precision recall F1 AUC
## 1 Lasso 0.7777055 0.7056604 0.1649030 0.2673338 0.7149422
## 2 model_train 0.7942004 0.7000000 0.2301370 0.3463918 0.7279244
## 3 model_train2 0.7946332 0.7016575 0.2319635 0.3486616 0.7274563