Start modelling

EVALUATE THE MODEL: using AUC (area under the ROC curve) for binary classification models on the validation data.

IF we dont’t have a test set with the default, we can’t see how accurate it is

#Import the data
credit_training <- read.csv('AT2_credit_train_STUDENT.csv', header = TRUE)
credit_test <- read.csv('AT2_credit_test_STUDENT.csv', header = TRUE)
summary(credit_training)

##        ID          LIMIT_BAL            SEX          EDUCATION    
##  Min.   :    1   Min.   :    -99   1      : 9244   Min.   :0.000  
##  1st Qu.: 7489   1st Qu.:  50000   2      :13854   1st Qu.:1.000  
##  Median :14987   Median : 140000   cat    :    1   Median :2.000  
##  Mean   :14981   Mean   : 167524   dog    :    1   Mean   :1.853  
##  3rd Qu.:22452   3rd Qu.: 240000   dolphin:    1   3rd Qu.:2.000  
##  Max.   :30000   Max.   :1000000                   Max.   :6.000  
##     MARRIAGE          AGE           PAY_PC1              PAY_PC2        
##  Min.   :0.000   Min.   : 21.0   Min.   :-11.859675   Min.   :-4.42243  
##  1st Qu.:1.000   1st Qu.: 28.0   1st Qu.: -0.393308   1st Qu.:-0.23617  
##  Median :2.000   Median : 34.0   Median : -0.393308   Median : 0.17555  
##  Mean   :1.553   Mean   : 35.7   Mean   : -0.001656   Mean   :-0.00177  
##  3rd Qu.:2.000   3rd Qu.: 41.0   3rd Qu.:  1.360047   3rd Qu.: 0.36112  
##  Max.   :3.000   Max.   :141.0   Max.   :  3.813348   Max.   : 5.44103  
##     PAY_PC3             AMT_PC1            AMT_PC2        
##  Min.   :-3.864638   Min.   :-3.41080   Min.   :-4.71769  
##  1st Qu.:-0.283941   1st Qu.:-1.50827   1st Qu.:-0.42961  
##  Median : 0.004886   Median :-0.86433   Median :-0.20780  
##  Mean   : 0.000652   Mean   : 0.00461   Mean   : 0.00137  
##  3rd Qu.: 0.093942   3rd Qu.: 0.49766   3rd Qu.: 0.09062  
##  Max.   : 3.364030   Max.   :37.49240   Max.   :83.52137  
##     AMT_PC3             AMT_PC4              AMT_PC5         
##  Min.   :-38.46500   Min.   :-21.593416   Min.   :-42.37665  
##  1st Qu.: -0.13710   1st Qu.: -0.068199   1st Qu.: -0.08239  
##  Median : -0.07044   Median :  0.018389   Median : -0.03200  
##  Mean   :  0.00383   Mean   :  0.004618   Mean   :  0.00148  
##  3rd Qu.:  0.00325   3rd Qu.:  0.083236   3rd Qu.:  0.02644  
##  Max.   : 21.98483   Max.   : 21.823749   Max.   : 17.43097  
##     AMT_PC6             AMT_PC7          default  
##  Min.   :-38.88504   Min.   :-41.71546   N:17518  
##  1st Qu.: -0.04241   1st Qu.: -0.09273   Y: 5583  
##  Median : -0.00216   Median : -0.04099            
##  Mean   : -0.00202   Mean   : -0.00409            
##  3rd Qu.:  0.06754   3rd Qu.:  0.03157            
##  Max.   : 20.22670   Max.   : 22.92727

str(credit_training)

## 'data.frame':    23101 obs. of  17 variables:
##  $ ID       : int  1 2 3 4 6 7 8 9 10 11 ...
##  $ LIMIT_BAL: num  20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
##  $ SEX      : Factor w/ 5 levels "1","2","cat",..: 2 2 2 2 1 1 2 2 1 2 ...
##  $ EDUCATION: int  2 2 2 2 1 1 2 3 3 3 ...
##  $ MARRIAGE : int  1 2 2 1 2 2 2 1 2 2 ...
##  $ AGE      : int  24 26 34 37 37 29 23 28 35 34 ...
##  $ PAY_PC1  : num  0.477 -1.462 -0.393 -0.393 -0.393 ...
##  $ PAY_PC2  : num  -3.225 0.854 0.176 0.176 0.176 ...
##  $ PAY_PC3  : num  0.14504 -0.36086 0.00489 0.00489 0.00489 ...
##  $ AMT_PC1  : num  -1.752 -1.663 -1.135 -0.397 -0.393 ...
##  $ AMT_PC2  : num  -0.224 -0.144 -0.177 -0.451 -0.5 ...
##  $ AMT_PC3  : num  -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
##  $ AMT_PC4  : num  0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
##  $ AMT_PC5  : num  -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
##  $ AMT_PC6  : num  0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
##  $ AMT_PC7  : num  -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
##  $ default  : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...

str(credit_test)

## 'data.frame':    6899 obs. of  16 variables:
##  $ ID       : int  5 17 19 23 27 30 32 34 37 38 ...
##  $ LIMIT_BAL: int  50000 20000 360000 70000 60000 50000 50000 500000 280000 60000 ...
##  $ SEX      : int  1 1 2 2 1 1 1 2 1 2 ...
##  $ EDUCATION: int  2 1 1 2 1 1 2 2 2 2 ...
##  $ MARRIAGE : int  1 2 1 2 2 2 2 1 1 2 ...
##  $ AGE      : int  57 24 49 26 27 26 33 54 40 22 ...
##  $ PAY_PC1  : num  0.273 -3.294 2.876 -3.211 1.425 ...
##  $ PAY_PC2  : num  0.847 1.835 -1.4 0.831 -0.57 ...
##  $ PAY_PC3  : num  -0.0458 -0.2383 1.297 1.7682 1.1754 ...
##  $ AMT_PC1  : num  -0.793 -1.117 -1.797 -0.148 -1.785 ...
##  $ AMT_PC2  : num  0.864 -0.249 -0.22 -0.431 -0.169 ...
##  $ AMT_PC3  : num  -0.64 -0.0889 -0.0704 -0.1271 -0.0622 ...
##  $ AMT_PC4  : num  0.21823 0.02798 0.01867 0.08418 -0.00615 ...
##  $ AMT_PC5  : num  -0.46541 -0.10028 -0.032 0.04805 -0.00469 ...
##  $ AMT_PC6  : num  0.30533 -0.04551 -0.00997 0.1519 0.01235 ...
##  $ AMT_PC7  : num  -1.0232 0.1004 -0.0433 -0.1224 -0.0813 ...

Clean the data based on observations

#need to change ID to be "character", so the IDs are not counted as numbers

clean_data <- function(dataSet) {
  
  #clean up sex, call it new_sex and change the animal classifications to 0
  output <- dataSet
  
  output$ID <- as.character(output$ID)
  
  output$SEX <- as.integer(output$SEX)
  
  output$SEX[output$SEX > 2] <- 0
  #clean age to remove aged over 100 entries will become NA
  output$AGE <- ifelse(output$AGE >=100, NA, output$AGE)
  
  #removed these factors from modelling to make them integers
  #output$SEX <- as.factor(output$SEX)
 #output$EDUCATION <- as.factor(output$EDUCATION)
  #output$MARRIAGE <- as.factor(output$MARRIAGE)
  
  return(output)
}

credit_train_clean <- clean_data(credit_training)
credit_test_clean <- clean_data(credit_test)

str(credit_train_clean)

## 'data.frame':    23101 obs. of  17 variables:
##  $ ID       : chr  "1" "2" "3" "4" ...
##  $ LIMIT_BAL: num  20000 120000 90000 50000 50000 500000 100000 140000 20000 200000 ...
##  $ SEX      : num  2 2 2 2 1 1 2 2 1 2 ...
##  $ EDUCATION: int  2 2 2 2 1 1 2 3 3 3 ...
##  $ MARRIAGE : int  1 2 2 1 2 2 2 1 2 2 ...
##  $ AGE      : int  24 26 34 37 37 29 23 28 35 34 ...
##  $ PAY_PC1  : num  0.477 -1.462 -0.393 -0.393 -0.393 ...
##  $ PAY_PC2  : num  -3.225 0.854 0.176 0.176 0.176 ...
##  $ PAY_PC3  : num  0.14504 -0.36086 0.00489 0.00489 0.00489 ...
##  $ AMT_PC1  : num  -1.752 -1.663 -1.135 -0.397 -0.393 ...
##  $ AMT_PC2  : num  -0.224 -0.144 -0.177 -0.451 -0.5 ...
##  $ AMT_PC3  : num  -0.0778 -0.0546 0.016 -0.0998 -0.1033 ...
##  $ AMT_PC4  : num  0.00696 -0.00285 -0.12907 -0.03534 -0.1179 ...
##  $ AMT_PC5  : num  -0.0414 0.0439 0.0982 -0.0553 -0.0546 ...
##  $ AMT_PC6  : num  0.000887 -0.02619 -0.022383 0.050465 0.112137 ...
##  $ AMT_PC7  : num  -0.0563 -0.1 -0.069 -0.0282 0.0186 ...
##  $ default  : Factor w/ 2 levels "N","Y": 2 2 1 1 1 1 1 2 1 1 ...

str(credit_test_clean)

## 'data.frame':    6899 obs. of  16 variables:
##  $ ID       : chr  "5" "17" "19" "23" ...
##  $ LIMIT_BAL: int  50000 20000 360000 70000 60000 50000 50000 500000 280000 60000 ...
##  $ SEX      : num  1 1 2 2 1 1 1 2 1 2 ...
##  $ EDUCATION: int  2 1 1 2 1 1 2 2 2 2 ...
##  $ MARRIAGE : int  1 2 1 2 2 2 2 1 1 2 ...
##  $ AGE      : int  57 24 49 26 27 26 33 54 40 22 ...
##  $ PAY_PC1  : num  0.273 -3.294 2.876 -3.211 1.425 ...
##  $ PAY_PC2  : num  0.847 1.835 -1.4 0.831 -0.57 ...
##  $ PAY_PC3  : num  -0.0458 -0.2383 1.297 1.7682 1.1754 ...
##  $ AMT_PC1  : num  -0.793 -1.117 -1.797 -0.148 -1.785 ...
##  $ AMT_PC2  : num  0.864 -0.249 -0.22 -0.431 -0.169 ...
##  $ AMT_PC3  : num  -0.64 -0.0889 -0.0704 -0.1271 -0.0622 ...
##  $ AMT_PC4  : num  0.21823 0.02798 0.01867 0.08418 -0.00615 ...
##  $ AMT_PC5  : num  -0.46541 -0.10028 -0.032 0.04805 -0.00469 ...
##  $ AMT_PC6  : num  0.30533 -0.04551 -0.00997 0.1519 0.01235 ...
##  $ AMT_PC7  : num  -1.0232 0.1004 -0.0433 -0.1224 -0.0813 ...

#note test data has no default, so we need to do a split of our train data

Run a linear model with the training data. First, split the training data into a test and train (as the test data supplied has no default)

# Description
# -----------
# Split data into random train and test sets, then add them back to a single dataframe.
# A new column called 'is_train' indicats whether a row belongs to the training set (1) or test set (0)

# Parameters
# ----------
  # dataSet (dataframe) - data frame of data with test and train split

# Returns - a data frame

train_test_set <- function(dataSet, split) {
  
  trainset_size = floor(split * nrow(dataSet))
  
  set.seed(333)
  
  #this is via randomly picking observations using the sample function
  trainset_indices = sample(seq_len(nrow(dataSet)), size = trainset_size)
  
  #assign observations to training and testing sets
  trainset = dataSet[trainset_indices, ]
  testset = dataSet[-trainset_indices, ]
  
  #Add a column to each data frame called 'is_train'. for training set, set it to 1, for test set, set it to 0.
  trainset$is_train = 1
  testset$is_train = 0
  
  #Bind the 2 data frames back together
  output = rbind(trainset,testset)
  
  return(output)
  
}

# done an 80:20 split on our training data (80% is train values)
credit_train_clean_for_GLM <- train_test_set(credit_train_clean, 0.8)

ONly want the stuff that is an ‘is train’ value of 1 - because I only want the training data of the trainset

glm_test_set <- filter(credit_train_clean_for_GLM, is_train==0)
glm_train_set <- filter(credit_train_clean_for_GLM, is_train==1)
#We run a linear regression model on our training set  
glm.model = glm(formula = default ~ ., family = binomial, data = glm_train_set[,-1])

summary(glm.model)

## 
## Call:
## glm(formula = default ~ ., family = binomial, data = glm_train_set[, 
##     -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9158  -0.7519  -0.5659  -0.2404   3.0060  
## 
## Coefficients: (1 not defined because of singularities)
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.490e+00  1.456e-01 -10.228  < 2e-16 ***
## LIMIT_BAL   -1.871e-06  2.005e-07  -9.332  < 2e-16 ***
## SEX         -1.494e-02  3.775e-02  -0.396   0.6923    
## EDUCATION    1.849e-01  2.412e-02   7.664 1.81e-14 ***
## MARRIAGE    -6.856e-02  3.856e-02  -1.778   0.0754 .  
## AGE          7.634e-03  2.156e-03   3.540   0.0004 ***
## PAY_PC1     -2.961e-01  1.056e-02 -28.035  < 2e-16 ***
## PAY_PC2     -3.559e-01  2.121e-02 -16.780  < 2e-16 ***
## PAY_PC3      2.797e-01  2.710e-02  10.322  < 2e-16 ***
## AMT_PC1     -6.250e-02  1.055e-02  -5.923 3.16e-09 ***
## AMT_PC2     -1.430e-01  2.880e-02  -4.967 6.81e-07 ***
## AMT_PC3      8.151e-03  3.110e-02   0.262   0.7933    
## AMT_PC4      4.648e-03  2.842e-02   0.164   0.8701    
## AMT_PC5      5.358e-02  3.233e-02   1.657   0.0974 .  
## AMT_PC6     -6.796e-02  2.940e-02  -2.312   0.0208 *  
## AMT_PC7     -5.397e-03  3.907e-02  -0.138   0.8902    
## is_train            NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20445  on 18437  degrees of freedom
## Residual deviance: 18147  on 18422  degrees of freedom
##   (42 observations deleted due to missingness)
## AIC: 18179
## 
## Number of Fisher Scoring iterations: 5

Summary says most important predictors are Limit balance, Education, Age, Pay and Amt variables. Analyse the predictors and determine which ones are the valid ones to keep (feature selection).

# probabilities on our test data that's been split from our training IE not the provided test
probabilities <- predict.glm(glm.model,glm_test_set[,-1],type="response")

## Warning in predict.lm(object, newdata, se.fit, scale = 1, type =
## ifelse(type == : prediction from a rank-deficient fit may be misleading

# Create a vector to hold predictions
predictions <- rep("N",nrow(glm_test_set[,-1])) 
predictions[probabilities >.5] <- "Y" #turned it from a probability to a prediction
#Create a confusion matrix
glm_confusion_matrix <- table(pred=predictions,true=glm_test_set$default)

glm_confusion_matrix

##     true
## pred    N    Y
##    N 3418  843
##    Y  108  252

# We'll want to look at evaluation measures regularly, so create a function to calculate and return them
get_evaluation_measures <- function(name = NA, tn, fp, fn, tp) {
  
  accuracy = (tp+tn)/(tp+tn+fp+fn)
  
  precision = tp/(tp+fp)
  
  recall = tp/(tp+fn)
  
  F1 = 2 * ((precision * recall)/(precision + recall))
  
  output = data.frame(name, accuracy, precision, recall, F1)
  
  return(output)
  
}

#converted CM to dataframe extracted the frequency values for each score
glm_confusion_matrix_df <- as.data.frame(glm_confusion_matrix)
glm_tn <- glm_confusion_matrix_df$Freq[1]
glm_fp <- glm_confusion_matrix_df$Freq[2]
glm_fn <- glm_confusion_matrix_df$Freq[3]
glm_tp <- glm_confusion_matrix_df$Freq[4]
glm_evaluation_measures <- get_evaluation_measures("model_train", glm_tn, glm_fp, glm_fn, glm_tp)
#glm_evaluation_measures now holds a dataframe showing accuracy, precision, recall and F1
glm_evaluation_measures

##          name  accuracy precision   recall        F1
## 1 model_train 0.7942004       0.7 0.230137 0.3463918

#Now to get AUC. We'll do it again further on in our analysis, so write a function
get_auc <- function(probabilities, targets) {
  
  probs = as.vector(probabilities)
  
  pred = prediction(probs,targets)
  
  perf_AUC = performance(pred, "auc")
  
  AUC = perf_AUC@y.values[[1]]
  
  return(AUC)
  
}

glm_evaluation_measures$AUC <- get_auc(probabilities, glm_test_set$default)
glm_evaluation_measures

##          name  accuracy precision   recall        F1       AUC
## 1 model_train 0.7942004       0.7 0.230137 0.3463918 0.7279244

This isn’t very good. Shall we try Lasso

#Now we want to try LASSO to perform grid search to find optimal value of lambda IE regularize the model
#removing NAs
credit_train_clean_for_lasso = credit_train_clean[complete.cases(credit_train_clean), ]

# done an 80:20 split on our training data (80% is train values)
credit_train_clean_for_lasso <- train_test_set(credit_train_clean_for_lasso, 0.8)
View(credit_train_clean_for_lasso)

#create test and train set dataframes
lrm_testset <- filter(credit_train_clean_for_lasso, is_train==0)
lrm_trainset <- filter(credit_train_clean_for_lasso, is_train==1)
#convert training data to matrix format
lrm_x <- model.matrix(default~., lrm_trainset[,-1])
lrm_y <- lrm_trainset$default

#family= binomial => logistic regression, alpha=1 => lasso
lrm_cv.out <- cv.glmnet(lrm_x, lrm_y, alpha=1, family="binomial", type.measure="auc")

plot(lrm_cv.out)

#Using lambda of 1se rather than the minimum lambda, see what predictors are discarded

#min value of lambda
lrm_lambda_min <- lrm_cv.out$lambda.min
#best value of lambda
lrm_lambda_1se <- lrm_cv.out$lambda.1se

#regression coefficients
coef(lrm_cv.out,s=lrm_lambda_1se)

## 18 x 1 sparse Matrix of class "dgCMatrix"
##                         1
## (Intercept) -1.270966e+00
## (Intercept)  .           
## LIMIT_BAL   -1.806116e-06
## SEX          .           
## EDUCATION    1.650089e-01
## MARRIAGE     .           
## AGE          .           
## PAY_PC1     -2.499713e-01
## PAY_PC2     -3.043219e-01
## PAY_PC3      1.735162e-01
## AMT_PC1     -1.161302e-02
## AMT_PC2     -6.643125e-03
## AMT_PC3      .           
## AMT_PC4      .           
## AMT_PC5      .           
## AMT_PC6      .           
## AMT_PC7      .           
## is_train     .

The pay variables are selected again, along with limit balance and education.

#Convert test data to a model matrix
lrm_x_test <- model.matrix(default~.,lrm_testset[,-1])

#Get prediction probabilities
lasso_prob <- predict(lrm_cv.out, newx = lrm_x_test, s=lrm_lambda_1se, type="response")

#translate probabilities to predictions
lasso_predict <- rep("N",nrow(lrm_testset))

lasso_predict[lasso_prob>.5] <- "Y"

2.a. Show the confusion matrix and calculate precision, recall, F1 and AUC

#confusion matrix
lasso_confusion_matrix <- table(pred=lasso_predict,true=lrm_testset$default)

lasso_confusion_matrix

##     true
## pred    N    Y
##    N 3399  947
##    Y   78  187

#Convert the confusion matrix to a data frame
lasso_confusion_matrix_df <- as.data.frame(lasso_confusion_matrix)
  
lasso_evaluation_measures <- get_evaluation_measures("Lasso",
                        lasso_confusion_matrix_df$Freq[1],
                        lasso_confusion_matrix_df$Freq[2],
                        lasso_confusion_matrix_df$Freq[3],
                        lasso_confusion_matrix_df$Freq[4])

#Get the AUC and add it to our evaluation measures data frame
lasso_evaluation_measures$AUC <- get_auc(lasso_prob, lrm_testset$default)

lasso_evaluation_measures

##    name  accuracy precision   recall        F1       AUC
## 1 Lasso 0.7777055 0.7056604 0.164903 0.2673338 0.7149422

evaluation_measures <- rbind(lasso_evaluation_measures, glm_evaluation_measures)
evaluation_measures

##          name  accuracy precision   recall        F1       AUC
## 1       Lasso 0.7777055 0.7056604 0.164903 0.2673338 0.7149422
## 2 model_train 0.7942004 0.7000000 0.230137 0.3463918 0.7279244

LASSO result is worse than trained model - overfitting?

Try another glm with different variables

#We run a linear regression model selecting new features 
glm.model2 = glm(formula = default ~ LIMIT_BAL+EDUCATION+AGE+PAY_PC1+PAY_PC2+PAY_PC3+AMT_PC1+AMT_PC2, family = binomial, data = glm_train_set[,-1])

summary(glm.model2)

## 
## Call:
## glm(formula = default ~ LIMIT_BAL + EDUCATION + AGE + PAY_PC1 + 
##     PAY_PC2 + PAY_PC3 + AMT_PC1 + AMT_PC2, family = binomial, 
##     data = glm_train_set[, -1])
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9221  -0.7515  -0.5659  -0.2470   2.9572  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.678e+00  8.429e-02 -19.909  < 2e-16 ***
## LIMIT_BAL   -1.841e-06  1.991e-07  -9.247  < 2e-16 ***
## EDUCATION    1.880e-01  2.397e-02   7.842 4.42e-15 ***
## AGE          9.054e-03  1.992e-03   4.546 5.48e-06 ***
## PAY_PC1     -2.974e-01  1.053e-02 -28.234  < 2e-16 ***
## PAY_PC2     -3.584e-01  2.108e-02 -17.007  < 2e-16 ***
## PAY_PC3      2.825e-01  2.698e-02  10.469  < 2e-16 ***
## AMT_PC1     -6.045e-02  1.041e-02  -5.810 6.27e-09 ***
## AMT_PC2     -1.275e-01  2.666e-02  -4.784 1.72e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 20445  on 18437  degrees of freedom
## Residual deviance: 18158  on 18429  degrees of freedom
##   (42 observations deleted due to missingness)
## AIC: 18176
## 
## Number of Fisher Scoring iterations: 5

# probabilities on our test data that's been split from our training IE not the provided test
probabilities2 <- predict.glm(glm.model2,glm_test_set[,-1],type="response")
# Create a vector to hold predictions
predictions2 <- rep("N",nrow(glm_test_set[,-1])) 
predictions2[probabilities2 >.5] <- "Y" #turned it from a probability to a prediction
#Create a confusion matrix
glm_confusion_matrix2 <- table(pred=predictions2,true=glm_test_set$default)

glm_confusion_matrix2

##     true
## pred    N    Y
##    N 3418  841
##    Y  108  254

#converted CM to dataframe extracted the frequency values for each score
glm_confusion_matrix2_df <- as.data.frame(glm_confusion_matrix2)
glm2_tn <- glm_confusion_matrix2_df$Freq[1]
glm2_fp <- glm_confusion_matrix2_df$Freq[2]
glm2_fn <- glm_confusion_matrix2_df$Freq[3]
glm2_tp <- glm_confusion_matrix2_df$Freq[4]
glm_evaluation_measures2 <- get_evaluation_measures("model_train2", glm2_tn, glm2_fp, glm2_fn, glm2_tp)
#glm_evaluation_measures now holds a dataframe showing accuracy, precision, recall and F1
glm_evaluation_measures2

##           name  accuracy precision    recall        F1
## 1 model_train2 0.7946332 0.7016575 0.2319635 0.3486616

glm_evaluation_measures2$AUC <- get_auc(probabilities2, glm_test_set$default)
glm_evaluation_measures2

##           name  accuracy precision    recall        F1       AUC
## 1 model_train2 0.7946332 0.7016575 0.2319635 0.3486616 0.7274563

evaluation_measures <- rbind(evaluation_measures, glm_evaluation_measures2)
evaluation_measures

##           name  accuracy precision    recall        F1       AUC
## 1        Lasso 0.7777055 0.7056604 0.1649030 0.2673338 0.7149422
## 2  model_train 0.7942004 0.7000000 0.2301370 0.3463918 0.7279244
## 3 model_train2 0.7946332 0.7016575 0.2319635 0.3486616 0.7274563

AT2 Modelling by AB

Alex Brooks

29 September 2018