Objective

Create and validate a predictive model to understand a medical student’s chances of matching into obstetrics and gynecology residency.

TRAINING THE LOGISTIC REGRESSION MODEL USING THE caret PACKAGE

Setting ‘caret’ model control parameters

Cross-validation is a technique for assessing the out-of-sample performance of a model. It involves partitioning the data into multiple sets and evaluating the model on each set, one by one, with the goal of obtaining a more robust estimate of the model's performance than a simple train-test split provides. We will use the caret (Classification and Regression Training) package in R to perform the cross-validation. A strength of the caret package is that the same syntax can be used whether the model under consideration has a categorical or a numeric outcome.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Data fitness and tuning functions. ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#This code sets the seed for reproducibility, creates cross-validation folds using the caret::groupKFold function, sets the number of epochs and folds per epoch, and creates a trainControl object using the caret::trainControl function. The trainControl object specifies the method for cross-validation ("repeatedcv"), the number of folds (K), the number of epochs (repeats), the cross-validation folds (index), the summaryFunction (BigSummary), and whether to display verbose output for each iteration (verboseIter).
#This code sets the lambda sequence for lasso regression, sets the XGBoost and CatBoost tuning parameter grids, prepares the training and test data for CatBoost, and converts character variables to factors for both the training and test data.

set.seed(1978) # Set seed for reproducibility

# Set cross-validation folds using groupKFold
folds <- caret::groupKFold(group = train_match2$Location, k = 7)

# Set number of epochs and number of folds per epoch
epochs <- 3
K <- 10

# Create trainControl object
control <- caret::trainControl(
  method = "repeatedcv",
  #method = "adaptive_cv",
  number = K,
  repeats = epochs,
  index = folds,
  classProbs = TRUE,
  summaryFunction = BigSummary,
  verboseIter = FALSE,
  allowParallel = TRUE,
  savePredictions = TRUE,
  adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE)
)

#https://topepo.github.io/caret/adaptive-resampling.html
# adaptControl <- trainControl(method = "adaptive_cv",
#                              number = 5, repeats = 3,
#                              adaptive = list(min = 5, alpha = 0.05, 
#                                              method = "gls", complete = TRUE),
#                              classProbs = TRUE,
#                              summaryFunction = BigSummary,
#                              search = "random")


# #catboost training
# grid <- base::expand.grid(depth = c(6,7,9),
#                     learning_rate = 0.1,
#                     iterations = 10, l2_leaf_reg=5, rsm=0.8, border_count=30 )
# 
# #data prep for catboost
# x= train_match[,-1]
# y= train_match[1]
# 
# y <- y$Match_Status
# 
# x2= test_match[,-1]
# y2= test_match[1]
# y2 <- y2$Match_Status
# 
# x[sapply(x, is.character)] <- lapply(x[sapply(x, is.character)], as.factor)
# x2[sapply(x2, is.character)] <- lapply(x2[sapply(x2, is.character)], as.factor)

#Logit grid
logitGrid <- expand.grid(alpha = c(0,0.5,1), lambda = seq(0,0.1, length.out = 5))

#LASSO grid
lambda <- 10^seq(5, -2, length = 100) #A sequence of 100 lambda values from 10^5 down to 10^-2; only the first (largest) 25 values are used in the LASSO grid below
lassoGrid <- expand.grid(alpha = 1, lambda = lambda[1:25])

#XGboost parameters tuning
#change to this for final
xgbGrid <- expand.grid(nrounds = c(50,100),#Number of boosting rounds to perform. This can be thought of as the number of trees in the model.
                       max_depth = c(5, 10), #Maximum depth of a tree. Larger value will make the model more complex and more likely to overfit.
                       colsample_bytree = seq(0.7, 0.8), #Subsample ratio of columns when constructing each tree. Note that seq(0.7, 0.8) with the default step of 1 evaluates only 0.7.
                       eta = c(0.1, 0.2), #Learning rate. This controls how quickly the model learns. Lower values will take longer to learn but will lead to a more accurate model.
                       gamma=0, #Minimum loss reduction required to make a further partition on a leaf node of the tree. A higher value means that the tree will be more conservative, and less likely to overfit.
                       min_child_weight = 1, #Minimum sum of instance weight needed in a child. This can be used to control the complexity of the model.
                       subsample = 0.8 #Subsample ratio of the training instances. This means that for each tree, a random subset of the total number of rows will be used.
                      )

# xgbGrid <- expand.grid(nrounds = 2,  #small for testing
#                        max_depth = 5,
#                        colsample_bytree = c(0.5),
#                        eta = c(0.1),
#                        gamma=0,
#                        min_child_weight = 1,
#                        subsample = 0.8
#                       )

# Set CatBoost tuning parameter grid
catboost_grid <- base::expand.grid(
  depth = c(6, 7, 9),
  learning_rate = 0.1,
  iterations = 10,
  l2_leaf_reg = 5,
  rsm = 0.8,
  border_count = 30
)

# Prepare training data for CatBoost
train_x <- train_match2[, -1]
train_y <- train_match2[1]$Match_Status

# Prepare test data for CatBoost
test_x <- test_match2[, -1]
test_y <- test_match2[1]$Match_Status

#nnet grid
# Not working:  nnetGrid <- expand.grid(size = c(1, 5, 10), decay = c(0, 0.1, 0.01), maxit = c(50, 100, 200), linout = c(TRUE, FALSE)) #Error: The tuning parameter grid should have columns size, decay

nnetGrid <- expand.grid(size = c(1, 5, 10), decay = c(0, 0.1, 0.01))

# Convert character variables to factors
train_x[sapply(train_x, is.character)] <- lapply(train_x[sapply(train_x, is.character)], as.factor)
test_x[sapply(test_x, is.character)] <- lapply(test_x[sapply(test_x, is.character)], as.factor)

In-sample performance of a model refers to how well the model fits or predicts the data it was trained on. It measures the accuracy of the model on the training data and can be used as an indicator of model complexity and overfitting. However, in-sample performance alone is not a reliable indicator of a model's ability to generalize to new, unseen data. Our models are tuned to minimize the Brier Score, which is set with the metric argument (together with maximize = FALSE).
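
BigSummary is referenced in the trainControl object above but is not defined in this section. A custom caret summary function that reports a Brier score typically follows the pattern sketched below; this is an illustrative sketch, not necessarily the project's actual BigSummary implementation, which also returns AUCROC, AUCPR, Precision, Recall, and other statistics.

#Sketch of a caret summary function that returns accuracy and a Brier score.
#The argument names data, lev, and model are the names caret expects.
BigSummarySketch <- function(data, lev = NULL, model = NULL) {
  obs  <- ifelse(data$obs == lev[1], 1, 0) # 1 = first class level (here, "Matched")
  prob <- data[, lev[1]]                   # predicted probability of the first class level
  c(Acc   = mean(data$obs == data$pred),
    Brier = mean((obs - prob)^2))
}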

Definition of Brier Score

The Brier Score in R can be calculated using the following mathematical equation:

\[\text{Brier Score} = \frac{1}{N}\sum_{i=1}^{N}\left(y_i - \hat{p}_i\right)^2\] where \(y_i\) is the actual binary outcome (0 or 1) and \(\hat{p}_i\) is the predicted probability of the positive class (also ranging from 0 to 1). The mean is taken over all N observations in the dataset. The resulting score ranges from 0 to 1, with 0 indicating a perfect prediction and 1 indicating a completely incorrect prediction.
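
As a concrete illustration, the score can be computed directly in R; the vectors below are made-up values for demonstration only.

#Toy example (illustrative values only): actual outcomes coded 1 = Matched, 0 = Not_Matched
actual    <- c(1, 0, 1, 1, 0)
predicted <- c(0.90, 0.20, 0.65, 0.40, 0.10) # predicted probability of "Matched"
mean((actual - predicted)^2) # the Brier score; lower is better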

Logistic regression: logit.fit

Logistic Regression is a popular and simple statistical model for binary classification problems. It makes use of a linear combination of predictors to predict the probability of a binary outcome. Logistic Regression is easy to interpret and can handle both numerical and categorical predictors. However, it may not perform as well on complex non-linear relationships between predictors and the outcome. The glmnet package is extremely efficient and fast, even on very large data sets (mostly due to its use of Fortran to solve the lasso problem via coordinate descent); note, however, that it only accepts the non-formula XY interface, so prior to modeling we need to separate our feature and target sets.
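
caret's formula interface builds that design matrix for us, but when calling glmnet directly the separation might look like the following sketch (assuming train_match and its Match_Status column as used throughout this section):

#Build the non-formula X/Y interface that glmnet expects
x_train <- model.matrix(Match_Status ~ ., data = train_match)[, -1] # dummy-coded predictors, intercept column dropped
y_train <- train_match$Match_Status                                 # two-level factor: Matched / Not_Matched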

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Logistic regression model. In-sample----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

#This code sets the seed and runs garbage collection, trains a logistic regression model using the caret::train function with the glmnet method, and saves the model to a file using the readr::write_rds function. The trControl argument specifies the trainControl object that was previously defined, and the family and metric arguments specify the response distribution and evaluation metric, respectively. The maximize argument specifies that the metric should be minimized rather than maximized.

# Set seed and run garbage collection
set.seed(1978)
invisible(gc())

# Start a parallel back end for caret (requires the parallel and doParallel packages)
cluster <- makeCluster(detectCores() - 1)
registerDoParallel(cluster)

# Train logistic regression model using glmnet
logit.fit <- caret::train(
  Match_Status ~ .,
  data = train_match,
  method = "glmnet",
  trControl = control,
  family = "binomial",
  tuneGrid = logitGrid,
  metric = "Brier",
  maximize = FALSE,
  allowParallel = TRUE #Note: allowParallel is an argument of trainControl (already set to TRUE above); supplying it here is simply passed through to the underlying fit.
); logit.fit
#> glmnet 
#> 
#> 3121 samples
#>   34 predictor
#>    2 classes: 'Matched', 'Not_Matched' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 3 times) 
#> Summary of sample sizes: 2254, 2394, 2962, 1877, 2997 
#> Resampling results across tuning parameters:
#> 
#>   alpha  lambda  Acc    Kappa  AUCROC  AUCPR  Brier  Precision  Recall  F    
#>   0.0    0.000   0.722  0.396  0.757   0.743  0.190  0.744      0.552   0.628
#>   0.0    0.025   0.722  0.396  0.757   0.744  0.189  0.744      0.552   0.628
#>   0.0    0.050   0.724  0.401  0.760   0.746  0.188  0.745      0.559   0.633
#>   0.0    0.075   0.724  0.401  0.761   0.748  0.188  0.743      0.561   0.634
#>   0.0    0.100   0.723  0.398  0.762   0.748  0.188  0.742      0.562   0.634
#>   0.5    0.000   0.718  0.392  0.751   0.735  0.195  0.739      0.548   0.627
#>   0.5    0.025   0.725  0.404  0.766   0.753  0.187  0.748      0.563   0.637
#>   0.5    0.050   0.721  0.396  0.765   0.752  0.191  0.751      0.547   0.630
#>   0.5    0.075   0.708  0.368  0.765   0.751  0.195  0.764      0.495   0.598
#>   0.5    0.100   0.703  0.357  0.759   0.746  0.202  0.791      0.449   0.570
#>   1.0    0.000   0.716  0.386  0.753   0.738  0.195  0.744      0.536   0.618
#>   1.0    0.025   0.720  0.395  0.765   0.752  0.189  0.749      0.549   0.630
#>   1.0    0.050   0.703  0.356  0.758   0.744  0.199  0.785      0.453   0.572
#>   1.0    0.075   0.699  0.344  0.740   0.730  0.210  0.812      0.416   0.546
#>   1.0    0.100   0.699  0.343  0.729   0.723  0.220  0.815      0.411   0.543
#> 
#> Brier was used to select the optimal model using the smallest value.
#> The final values used for the model were alpha = 0.5 and lambda = 0.025.
# Save model to file
readr::write_rds(logit.fit, file.path(mod_path, "logit.fit.rds"))
knitr_table_html(logit.fit$bestTune, "Best tune for the Logistic Regression: logit.fit model.")
Best tune for the Logistic Regression: logit.fit model.
  alpha  lambda
    0.5   0.025

LASSO: lasso.fit

LASSO: LASSO, or Least Absolute Shrinkage and Selection Operator, is a regularization technique used in regression analysis. It works by adding a penalty term to the objective function to reduce the magnitude of the coefficients, effectively shrinking some coefficients to zero, which results in feature selection. LASSO is a good option for models with a high number of predictors, as it can perform feature selection and help prevent overfitting. Compared to random forest, xgboost, logistic regression, and neural network, LASSO has the advantage of producing a sparse model with only a subset of predictors.
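
A practical consequence of the L1 penalty is that the selected predictors can be read directly off a fitted model. Once a penalized fit such as lasso.best.fit (created further below) is available, the retained predictors might be listed along these lines (a sketch):

#Coefficients at the selected lambda; predictors whose coefficients are exactly zero have been dropped by the L1 penalty
lasso_coefs <- coef(lasso.best.fit$finalModel, s = lasso.best.fit$bestTune$lambda)
retained <- rownames(lasso_coefs)[as.vector(lasso_coefs) != 0]
retained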

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Least absolute shrinkage and selection operator model.  In-sample----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

#Lasso model
#The lasso_fit model is a Lasso model that uses the glmnet function to fit a generalized linear model with a L1 penalty (also known as a Lasso penalty) on the coefficients. The logit.fit model, on the other hand, is a logistic regression model that uses the glmnet function to fit a generalized linear model with a combination of L1 and L2 penalties (also known as an Elastic Net penalty) on the coefficients.

#The Lasso model tends to produce sparse models, where many of the coefficients are exactly zero. This can be useful for feature selection, as it can identify the most important predictors and eliminate the less important ones. The Elastic Net model, on the other hand, allows for a combination of L1 and L2 penalties, which can balance between sparsity and model fit.

set.seed(1978); invisible(gc())
lasso.fit <- caret::train(Match_Status ~., # must be a factor
                   data = train_match, 
                   method = "glmnet", 
                   family= "binomial", 
                   trControl = control, 
                   tuneGrid = lassoGrid, # 1: lasso; 0: ridge
                   metric = "Brier", 
                   maximize = FALSE); lasso.fit # minimize the Brier Score
#> glmnet 
#> 
#> 3121 samples
#>   34 predictor
#>    2 classes: 'Matched', 'Not_Matched' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 3 times) 
#> Summary of sample sizes: 2254, 2394, 2962, 1877, 2997 
#> Resampling results across tuning parameters:
#> 
#>   lambda  Acc    Kappa  AUCROC  AUCPR  Brier  Precision  Recall  F  
#>     2009  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     2364  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     2783  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     3275  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     3854  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     4535  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     5337  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     6280  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     7391  0.541  0      0.5     0      0.252  NaN        0       NaN
#>     8697  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    10235  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    12045  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    14175  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    16681  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    19630  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    23101  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    27186  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    31993  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    37649  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    44306  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    52140  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    61359  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    72208  0.541  0      0.5     0      0.252  NaN        0       NaN
#>    84975  0.541  0      0.5     0      0.252  NaN        0       NaN
#>   100000  0.541  0      0.5     0      0.252  NaN        0       NaN
#> 
#> Tuning parameter 'alpha' was held constant at a value of 1
#> Brier was used to select the optimal model using the smallest value.
#> The final values used for the model were alpha = 1 and lambda = 100000.
readr::write_rds(lasso.fit, file.path(mod_path, 'lasso_fit.rds'))

The values of best_alpha and best_lambda correspond to the combination of parameters that produced the best results according to the performance metric used during training. Note that every lambda in the lassoGrid above is so large that all coefficients are shrunk to zero, which is why each row of the lasso.fit results shows the same no-information accuracy (0.541), an AUCROC of 0.5, and a recall of 0; the optimized LASSO fit below therefore reuses the alpha and lambda identified by the elastic-net search.

knitr_table_html(lasso.fit$bestTune, "Best tune for the LASSO Optimized: lasso.fit model.")
Best tune for the LASSO Optimized: lasso.fit model.
  alpha   lambda
      1  100,000

best_alpha and best_lambda are variables that store the optimal values of the alpha and lambda tuning parameters, respectively, from a logistic regression model (logit.fit). The value of best_alpha is 0.5. The best lambda value is 0.065. These variables are extracted from the logit.fit$bestTune object.
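
The extraction itself is not shown in this chunk; presumably it follows a pattern along these lines (a sketch):

#Assumed extraction of the winning hyperparameters from the caret tuning results
best_alpha  <- logit.fit$bestTune$alpha
best_lambda <- logit.fit$bestTune$lambda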

knitr_table_html(data = best_params, caption = "Alpha and Lambda values that minimizes the Brier Score for lasso.fit")
Alpha and Lambda values that minimizes the Brier Score for lasso.fit
alpha lambda min_brier_score brier_score_sd
0.0 0.075 0.188 0.015
0.5 0.025 0.187 0.015
1.0 0.025 0.189 0.014

LASSO Optimized: lasso.best.fit

Refit the model using the best grid-search terms (best_alpha and best_lambda) identified above.

set.seed(1978)

#The lambda argument specifies the value of the L1-norm penalty parameter to be used in the model. 
lasso.best.fit <- caret::train(
        Match_Status~., # must be a factor
        data=train_match,
        method="glmnet",
        family="binomial",
        lambda=best_lambda,
        metric = "Accuracy", #The metric "Brier" was not in the result set. Accuracy will be used instead.
        trControl=control,
        tuneGrid=expand.grid(alpha=best_alpha, lambda=best_lambda)); lasso.best.fit
#> glmnet 
#> 
#> 3121 samples
#>   34 predictor
#>    2 classes: 'Matched', 'Not_Matched' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 3 times) 
#> Summary of sample sizes: 2254, 2394, 2962, 1877, 2997 
#> Resampling results:
#> 
#>   Acc    Kappa  AUCROC  AUCPR  Brier  Precision  Recall  F    
#>   0.714  0.378  0.765   0.751  0.193  0.757      0.515   0.609
#> 
#> Tuning parameter 'alpha' was held constant at a value of 0.5
#> Tuning parameter 'lambda' was held constant at a value
#>  of 0.0645
readr::write_rds(lasso.best.fit, file.path(mod_path, 'lasso_best_fit.rds'))

The alpha and lambda variables in the lasso.best.fit$bestTune object are two of the tuning parameters that control the fit of a penalized regression model in R. The alpha parameter determines the type of penalty applied to the coefficients: a value of 1 corresponds to a pure L1 (lasso) penalty, a value of 0 corresponds to a pure L2 (ridge) penalty, and intermediate values give an elastic-net mixture of the two. The lambda parameter determines the strength of the penalty, with larger values leading to more regularization and smaller coefficients. These two parameters can be tuned to control the trade-off between model complexity and predictive performance.

knitr_table_html(lasso.best.fit$bestTune, "Best tune for the LASSO Optimized: lasso.best.fit model.")
Best tune for the LASSO Optimized: lasso.best.fit model.
alpha lambda
0.5 0.065

Random forest: rf.fit

Random Forest: Random Forest builds multiple decision trees and combines their outputs to make the final prediction. The trees are grown on random samples of the data and random subsets of the features, which makes the model less prone to overfitting than a single decision tree. The final prediction aggregates the trees (a majority vote, or an average of the predicted probabilities), making it more robust and less sensitive to outliers.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Random forest. In-sample  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
# Set seed and run garbage collection
set.seed(1978); invisible(gc())
# 
# # Train random forest model using caret::train
#control <- trainControl(method = "cv", number = 5)
 rf.fit <- caret::train(
   Match_Status ~ .,
   data = train_match,
   method = "rf",
   trControl = control,
   metric = "Brier",
   maximize = FALSE
 ); rf.fit
#> Random Forest 
#> 
#> 3121 samples
#>   34 predictor
#>    2 classes: 'Matched', 'Not_Matched' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 3 times) 
#> Summary of sample sizes: 2254, 2394, 2962, 1877, 2997 
#> Resampling results across tuning parameters:
#> 
#>   mtry  Acc    Kappa  AUCROC  AUCPR  Brier  Precision  Recall  F    
#>    2    0.710  0.372  0.758   0.747  0.191  0.767      0.505   0.604
#>   23    0.706  0.371  0.756   0.739  0.193  0.717      0.556   0.623
#>   45    0.691  0.344  0.745   0.723  0.197  0.704      0.527   0.601
#> 
#> Brier was used to select the optimal model using the smallest value.
#> The final value used for the model was mtry = 2.
# 
# # Save model to file
readr::write_rds(rf.fit, file.path(mod_path, "random_forest_fit.rds"))

The rf.fit$bestTune variable mtry represents the number of predictors randomly sampled as candidates at each split in the random forest model. The value of mtry that provides the best performance, as determined by the optimization metric specified in the caret::train function, is stored in rf.fit$bestTune. In this case, the optimization metric used was the Brier Score, which measures the mean squared error between the predicted probabilities and the true target values. The optimal value of mtry is the one that results in the lowest Brier Score for the random forest model.
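
The fit above relied on caret's default random-forest search over mtry. If finer control were wanted, an explicit grid could be supplied; the values below are hypothetical and were not used in the run shown.

#Hypothetical explicit grid for mtry (illustration only; the fit above used caret's default search)
rfGrid <- expand.grid(mtry = c(2, 6, 12, 23, 34))
# rf.fit <- caret::train(Match_Status ~ ., data = train_match, method = "rf",
#                        trControl = control, tuneGrid = rfGrid,
#                        metric = "Brier", maximize = FALSE)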

knitr_table_html(rf.fit$bestTune, "Best tune for the Random Forest model.")
Best tune for the Random Forest model.
mtry
2

xgboost: xgb.fit

XGBoost: XGBoost is a powerful algorithm that has been shown to be effective in many machine learning tasks, including binary classification problems. Compared to logistic regression and random forest, XGBoost is designed to handle larger datasets with many factors and can capture non-linear relationships between variables. It uses an ensemble of boosted decision trees to make predictions, which allows it to model complex relationships in the data and improve the accuracy of predictions. Overall, XGBoost is a highly efficient and effective algorithm for binary classification problems.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Xgboost. In-sample ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
set.seed(1978); invisible(gc())
 xgb.fit <- caret::train(Match_Status~., 
                  data=train_match, 
                  method="xgbTree", 
                  trControl=control, 
                  tuneGrid = xgbGrid, 
                  metric = "Brier", 
                  maximize = FALSE,
                  parallel = TRUE); xgb.fit #Note: 'parallel' is not a caret::train argument; it is passed through to xgboost and triggers the warning below. Resampling parallelism is already controlled by trainControl(allowParallel = TRUE).
#> [03:12:15] WARNING: amalgamation/../src/learner.cc:627: 
#> Parameters: { "parallel" } might not be used.
#> 
#>   This could be a false alarm, with some parameters getting used by language bindings but
#>   then being mistakenly passed down to XGBoost core, or some parameter actually being used
#>   but getting flagged wrongly here. Please open an issue if you find any such cases.
#> eXtreme Gradient Boosting 
#> 
#> 3121 samples
#>   34 predictor
#>    2 classes: 'Matched', 'Not_Matched' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 3 times) 
#> Summary of sample sizes: 2254, 2394, 2962, 1877, 2997 
#> Resampling results across tuning parameters:
#> 
#>   eta  max_depth  nrounds  Acc    Kappa  AUCROC  AUCPR  Brier  Precision  Recall  F    
#>   0.1   5          50      0.704  0.374  0.758   0.747  0.191  0.744      0.529   0.617
#>   0.1   5         100      0.689  0.353  0.746   0.739  0.202  0.725      0.518   0.600
#>   0.1  10          50      0.694  0.353  0.746   0.736  0.198  0.700      0.542   0.609
#>   0.1  10         100      0.678  0.329  0.738   0.729  0.210  0.687      0.529   0.593
#>   0.2   5          50      0.682  0.339  0.735   0.727  0.208  0.728      0.488   0.581
#>   0.2   5         100      0.676  0.331  0.720   0.715  0.222  0.703      0.511   0.586
#>   0.2  10          50      0.692  0.347  0.739   0.726  0.210  0.692      0.544   0.604
#>   0.2  10         100      0.671  0.312  0.730   0.715  0.227  0.663      0.546   0.591
#> 
#> Tuning parameter 'gamma' was held constant at a value of 0
#> Tuning parameter 'colsample_bytree' was held constant at
#>  a value of 0.7
#> Tuning parameter 'min_child_weight' was held constant at a value of 1
#> Tuning parameter 'subsample'
#>  was held constant at a value of 0.8
#> Brier was used to select the optimal model using the smallest value.
#> The final values used for the model were nrounds = 50, max_depth = 5, eta = 0.1, gamma = 0, colsample_bytree =
#>  0.7, min_child_weight = 1 and subsample = 0.8.
readr::write_rds(xgb.fit, file.path(mod_path, 'xgb_fit.rds')); invisible(gc())

The nrounds variable is the number of boosting iterations to perform; max_depth is the maximum depth of a tree; eta is the learning rate; gamma is the minimum loss reduction required to make a further split; colsample_bytree is the fraction of columns subsampled for each tree; min_child_weight is the minimum sum of instance weight required in a child; and subsample is the fraction of the training instances subsampled for each tree.
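
One quick way to see how these settings traded off against each other is to sort the full tuning results by the Brier score (smallest is best):

#Inspect the tuning results, ordered by the Brier score
head(xgb.fit$results[order(xgb.fit$results$Brier), ])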

knitr_table_html(xgb.fit$bestTune, caption = "Best Tune for xgBoost.")
Best Tune for xgBoost.
nrounds max_depth eta gamma colsample_bytree min_child_weight subsample
50 5 0.1 0 0.7 1 0.8

catboost: CatBoost (Categorical Boosting) is a gradient boosting algorithm that is specifically designed to handle categorical features. This makes it a good choice for binary classification models with a significant number of categorical variables. Additionally, CatBoost can handle missing values and is robust to noisy data, which are common challenges in real-world datasets. Furthermore, CatBoost uses ordered boosting, an approach that can yield more accurate predictions than traditional gradient boosting. These features make CatBoost a strong contender for binary classification problems.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   CatBOOST.  In-sample ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

## CatBOOST
#CatBoost is a gradient boosting library developed by Yandex that is particularly well-suited for datasets with many categorical features. It can automatically handle missing values and categorical variables, and also includes built-in regularization to prevent overfitting. Additionally, it allows you to use both categorical and numerical features in the same model. This makes it a good option for datasets that have many factors, as it can handle them without the need for extensive preprocessing.

set.seed(1978); invisible(gc())
# CAT.fit <- caret::train(x, as.factor(y),
#                 method = catboost.caret,
#                 logging_level = 'Verbose', preProcess = NULL,
#                 tuneGrid = grid, 
#                 trControl = control, 
#                 metric = "Brier", 
#                 maximize = F)

#readr::write_rds(CAT.fit, file.path(mod_path, 'CAT_fit.rds'))

Neural net: nnet.fit

Neural Networks: Compared to logistic regression, random forest, and xgboost, neural networks offer a more powerful and flexible approach to modeling complex relationships between factors in a binary logistic regression problem. They have the ability to learn non-linear relationships between the predictors and the target, whereas logistic regression and other traditional machine learning algorithms make assumptions about the form of the relationships. This makes neural networks well suited to datasets with many factors, especially where the relationships between the factors are not well understood.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Neural net. In-sample ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
# Set seed and run garbage collection
set.seed(1978); invisible(gc())

nnet.fit <- caret::train(
  Match_Status ~ .,
  data = train_match,
  method = "nnet",
  trControl = control,
  metric = "Brier",
  maximize = FALSE,
  parallelStart = detectCores() - 1, #Note: parallelStart is not a caret::train or nnet argument; it is passed through and most likely ignored.
  tuneGrid = nnetGrid
); nnet.fit
#> # weights:  236
#> initial  value 2450.849282 
#> iter  10 value 2133.088710
#> iter  20 value 2107.768677
#> iter  30 value 2035.843112
#> iter  40 value 1854.180065
#> iter  50 value 1768.245036
#> iter  60 value 1744.076682
#> iter  70 value 1732.966697
#> iter  80 value 1731.805443
#> iter  90 value 1724.153592
#> iter 100 value 1706.837527
#> final  value 1706.837527 
#> stopped after 100 iterations
#> Neural Network 
#> 
#> 3121 samples
#>   34 predictor
#>    2 classes: 'Matched', 'Not_Matched' 
#> 
#> No pre-processing
#> Resampling: Cross-Validated (10 fold, repeated 3 times) 
#> Summary of sample sizes: 2254, 2394, 2962, 1877, 2997 
#> Resampling results across tuning parameters:
#> 
#>   size  decay  Acc    Kappa    AUCROC  AUCPR  Brier  Precision  Recall  F    
#>    1    0.00   0.541   0.0000  0.500   0.000  0.253    NaN      0.0000    NaN
#>    1    0.01   0.569   0.0645  0.538   0.159  0.251  0.623      0.3175  0.640
#>    1    0.10   0.660   0.2071  0.648   0.450  0.221  0.683      0.3867  0.662
#>    5    0.00   0.504  -0.0285  0.483   0.113  0.260  0.337      0.0948  0.394
#>    5    0.01   0.715   0.3745  0.744   0.639  0.208  0.728      0.5847  0.629
#>    5    0.10   0.695   0.3414  0.722   0.689  0.210  0.729      0.5234  0.597
#>   10    0.00   0.636   0.1417  0.599   0.288  0.233  0.838      0.2039  0.634
#>   10    0.01   0.632   0.1639  0.674   0.511  0.268  0.644      0.5469  0.611
#>   10    0.10   0.705   0.3352  0.734   0.710  0.208  0.737      0.5076  0.580
#> 
#> Brier was used to select the optimal model using the smallest value.
#> The final values used for the model were size = 5 and decay = 0.01.
# Save model to file
readr::write_rds(nnet.fit, file.path(mod_path, "neural_network_fit.rds"))

# Run garbage collection
invisible(gc())

The nnet.fit$bestTune variables, size and decay, refer to the parameters of a neural network model. Size is the number of neurons in the hidden layer, and decay is the weight-decay (regularization) term applied to the model to prevent overfitting. The optimal values of these parameters are determined by the caret package during model training to minimize the chosen performance metric, here the Brier score.
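
For reference, the "# weights" line printed during training follows directly from the network topology: a single-hidden-layer nnet with p input columns, size hidden units, and one output unit estimates size * (p + 1) + (size + 1) weights. Assuming the dummy-coded design matrix has 45 columns (consistent with the largest mtry value tried by the random forest above), the selected size of 5 reproduces the 236 weights reported in the log.

#Weight count for a single-hidden-layer nnet: size * (p + 1) + (size + 1)
#Assuming p = 45 dummy-coded predictors and the selected size = 5:
5 * (45 + 1) + (5 + 1) # = 236, matching the "# weights:  236" line above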

knitr_table_html(nnet.fit$bestTune, caption = "Best Tune for nnet")
Best Tune for nnet
  size  decay
     5   0.01

Model ensemble: model_list_big

This step runs for an extremely long time, even with adaptive resampling enabled in caret::trainControl.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#####   Creation of a Model Ensemble tuned with Accuracy. In-sample. ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

#The error message says "The metric 'ROC' was not in the result set. Acc will be used instead." It seems that the metric argument in the caretList function has the value 'ROC' but it is not supported by the method that is being used for modeling. Try changing the value of the metric argument to 'Acc' or another supported metric like 'Kappa' or 'AUC'.
# 
cl <- makeCluster(detectCores() - 1)

ensemble_control <- caret::trainControl(
  method="cv",
  number=5,
  verboseIter = FALSE,
  #adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE),
  savePredictions = "final")

model_list_big <- caretEnsemble::caretList(Match_Status~., data=train_match,
  trControl=ensemble_control,
tuneList=list(
    logit = caretEnsemble::caretModelSpec(method = "glmnet",  family= "binomial"),
    lasso = caretEnsemble::caretModelSpec(method = "glmnet",  family= "binomial",
                           tuneGrid = expand.grid(alpha = 1, lambda = lambda)),
    xgb=caretEnsemble::caretModelSpec(method="xgbTree", tuneGrid=xgbGrid)),
  metric="Accuracy", #Error: The metric "ROC" was not in the result set. Acc will be used instead.
  methodList = c("glmnet", "nnet", "rf", "rpart"), #, "xgbTree"),
  parallel = TRUE, #Note: parallel and numCores are not caretList arguments; they are passed through to the underlying calls
  numCores = cl); model_list_big

readr::write_rds(model_list_big, file.path(mod_path, 'model_list_big.rds')); invisible(gc())

stopCluster(cl)
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#####   Run the catboost model. In-sample ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#Because caretEnsemble does not support custom models, the CatBoost model would have to be added to the list manually.

set.seed(1978)
# CAT.fit2 <- caret::train(x, as.factor(y),
#                 method = catboost.caret,
#                 logging_level = 'Verbose', preProc = NULL,
#                 tuneGrid = grid,
#                 trControl = trainControl(method="repeatedcv", number=10, repeats=3, index = folds, classProbs = TRUE,
#                 summaryFunction = twoClassSummary, savePredictions = "final"), metric = "ROC") # Specify fit parameters
#
# readr::write_rds(CAT.fit2, file.path(mod_path, 'CAT_fit2.rds'))
#model_list_big[["CATboost"]] <- CAT.fit2

Correlation of models

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#####   Determine if the models are different enough to be used together as an ensemble.  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#https://cran.r-project.org/web/packages/caretEnsemble/vignettes/caretEnsemble-intro.html

#caret::modelCor is a function in the caret package that calculates the pairwise correlations between the models in a caretList. It returns a matrix of correlations between the models.
#
# caret::resamples is a function in the caret package that creates a resample object from a train object. It takes as input a train object or a list of train objects, and returns a resamples object that contains the results of model training and evaluation.
#
# So, caret::modelCor(caret::resamples(model_list_big)) calculates the pairwise correlations between the models in the model_list_big caretList object, using the results of model training and evaluation stored in the resamples object. The resulting matrix of correlations can be used to assess the degree of agreement between the models in the caretList.
#
# For example, a high correlation between two models might indicate that the models are making similar predictions and are therefore in agreement. On the other hand, a low correlation between two models might indicate that the models are making different predictions and are therefore in disagreement. The degree of agreement between the models in a caretList can be useful in understanding the relative strengths and weaknesses of the models and in identifying potential areas of improvement.

#model_cor_matrix <- caret::modelCor(caret::resamples(model_list_big))

#a heatmap of the correlation matrix where red indicates a positive correlation and blue indicates a negative correlation.
#heatmap <- ggcorrplot::ggcorrplot(model_cor_matrix, hc.order = TRUE, type = "lower", lab = TRUE, lab_size = 3, method = "circle");heatmap

#tm_ggsave(heatmap, "heatmap_correlation_matrix_of_all_models")

All of the models are highly correlated, as the correlation heatmap above shows when the (currently commented-out) code is run. Highly correlated models are not ideal candidates for an ensemble; nevertheless, we will try.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#####   Creation of an ensemble.  In-sample.----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

#In summary, the greedy_ensemble object is an ensemble model, while the caret::modelCor(caret::resamples(model_list_big)) code is used to calculate the pairwise correlations between the models in the resamples object. The greedy_ensemble object combines the predictions of multiple models to improve the overall performance of the ensemble, while the caret::modelCor function does not create an ensemble model, but rather provides a way to assess the degree of agreement between the models in a resamples object.

#This code creates an ensemble model using the caretEnsemble package. The ensemble model is trained using a list of pre-defined models, in this case logit.fit, rf.fit, and lasso.best.fit. A custom function brier_score_binary is defined for calculating the Brier score. The function is used in the summaryFunction argument in the trainControl function. The trainControl function also uses 2-fold cross-validation and sets classProbs to TRUE, indicating that class probabilities should be returned. The ensemble model is then trained using the caretEnsemble function with the model_list_big and the control parameters defined in ensemble_control. The summary of the trained ensemble model is then displayed using the summary function.

# Define a custom function to calculate the Brier score
# Define a custom function to calculate the Brier score: the mean squared error
# between the true 0/1 outcomes and the predicted probabilities of the positive class.
# (Note: to serve as a caret summaryFunction it would need the (data, lev, model)
# signature used by BigSummary above.)
brier_score_binary <- function(y_true, pred_prob) {
  mean((y_true - pred_prob)^2)
}

# Create a list of models to be used in the ensemble
#model_list_big <- list(logit.fit, rf.fit, lasso.best.fit)

# Define the control parameters for training the ensemble
# using 2-fold cross-validation and Brier score as the evaluation metric
# ensemble_control <- caret::trainControl(
#     method = "cv",
#     number = 2,
#     summaryFunction = brier_score_binary,
#     classProbs = TRUE
# )
# 
# greedy_ensemble <- caretEnsemble::caretEnsemble(
#   model_list_big,
#   metric="Acc",
#   trControl=caret::trainControl(
#     number=2,
#     summaryFunction=BigSummary,
#     classProbs=TRUE
#     ))
# summary(greedy_ensemble)
# 
# readr::write_rds(greedy_ensemble, file.path(mod_path, 'greedy_ensemble.rds')); invisible(gc())
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Collect resamples. In-sample fit ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

#caret::resamples() is a function that lets you compare the performance of different models you've trained. It collects performance results of the models you pass to it and allows you to compare them in an easy way. It's like having a contest with friends, and each friend has a different toy car. You race each of the toy cars, and caret::resamples() is like a notebook where you keep track of who finished first, second, third and so on in each race. So at the end, you can see which toy car was the fastest and the best overall.

results <- caret::resamples(list(Logit=logit.fit, 
                                 Lasso=lasso.best.fit, 
                                 RF=rf.fit, 
                                 XGB=xgb.fit, 
                                 #CAT= CAT.fit, 
                                 Nnet= nnet.fit))

Brier Score (.fit models)

Brier Scores of Each Model

Brier score and AUC are both used for evaluating binary classifiers, but each has its own strengths and weaknesses. Brier score measures the mean squared difference between the predicted probabilities and the true target values, making it more sensitive to predictions that are further away from the true values. On the other hand, AUC measures the overall performance of a classifier by comparing the rank order of predicted probabilities to the true target values, making it less sensitive to individual predictions but more robust to class imbalance. Ultimately, the choice between Brier score and AUC depends on the specific needs and goals of the analysis.

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Model Comparison.  Train data set Brier Scores. In-sample. ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

logit = logit.fit$results$Brier[1]
lasso = lasso.best.fit$results$Brier[1]
rf = rf.fit$results$Brier[1]
xgb = xgb.fit$results$Brier[1]
#cat = CAT.fit$results$Brier[1]
nnet = nnet.fit$results$Brier[1]

Metric_value <- data.frame(logit = logit, lasso = lasso, RF =rf, XGB = xgb, #CAT=cat,
                           Nnet=nnet) #model types and names

# Write the data frame to a CSV file
write.csv(Metric_value, file = "output/csv/Metric_value_Brier_scores_in_sample.csv")
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Confidence intervals of the Brier Scores. In-sample. ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

#The code below uses the classical normal-approximation interval: the estimated value plus or minus 1.96 times the standard deviation of the resampled metric, i.e. estimate + c(-1, 1) * 1.96 * SD.

# Get Brier scores
logit_brier <- logit.fit$results$Brier[1]
lasso_brier <- lasso.best.fit$results$Brier[1]
rf_brier <- rf.fit$results$Brier[1]
xgb_brier <- xgb.fit$results$Brier[1]
nnet_brier <- nnet.fit$results$Brier[1]

# Get Brier score CI upper bounds
logit_brier_high <- logit.fit$results$Brier[1]+1.96*logit.fit$results$BrierSD[1]
lasso_brier_high <- lasso.best.fit$results$Brier[1]+1.96*lasso.best.fit$results$BrierSD[1]
rf_brier_high <- rf.fit$results$Brier[1]+1.96*rf.fit$results$BrierSD[1]
xgb_brier_high = xgb.fit$results$Brier[1]+1.96*xgb.fit$results$BrierSD[1] 
#cat_CI <- CAT.fit$results$Brier[1]+c(-1,1)*1.96*CAT.fit$results$BrierSD[1] 
nnet_brier_high <- nnet.fit$results$Brier[1]+1.96*nnet.fit$results$BrierSD[1]

# Get Brier score CI lower bounds
logit_brier_low <- logit.fit$results$Brier[1]-1.96*logit.fit$results$BrierSD[1]
lasso_brier_low <- lasso.best.fit$results$Brier[1]-1.96*lasso.best.fit$results$BrierSD[1]
rf_brier_low <- rf.fit$results$Brier[1]-1.96*rf.fit$results$BrierSD[1]
xgb_brier_low <- xgb.fit$results$Brier[1]-1.96*xgb.fit$results$BrierSD[1]
nnet_brier_low <- nnet.fit$results$Brier[1]-1.96*nnet.fit$results$BrierSD[1]

logistic= c(logit_brier, logit_brier_high, logit_brier_low)
lasso = c(lasso_brier, lasso_brier_high, lasso_brier_low)
rf = c(rf_brier, rf_brier_high, rf_brier_low) 
xgb = c(xgb_brier, xgb_brier_high, xgb_brier_low)
#cat_CI=cat_CI, 
nnet=c(nnet_brier, nnet_brier_high, nnet_brier_low)

table <- data.frame(logistic, lasso, rf, xgb, nnet)

rownames(table) <- c("Brier", "Brier Upper", "Brier Lower") # each model's vector is ordered (estimate, upper bound, lower bound)
knitr_table_html(data = table, caption = "Brier Score and Confidence Intervals by Training Model")
Brier Score and Confidence Intervals by Training Model
logistic lasso rf xgb nnet
Brier 0.190 0.193 0.191 0.191 0.253
Brier Upper 0.227 0.215 0.227 0.224 0.284
Brier Lower 0.153 0.171 0.156 0.158 0.223

Brier Score Summary Table

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####  Summarize the differences between the In-sample models.  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
summary(results)
#> 
#> Call:
#> summary.resamples(object = results)
#> 
#> Models: Logit, Lasso, RF, XGB, Nnet 
#> Number of resamples: 5 
#> 
#> Acc 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.677   0.686  0.718 0.725   0.758 0.788    0
#> Lasso 0.674   0.688  0.698 0.714   0.734 0.774    0
#> RF    0.670   0.682  0.702 0.710   0.717 0.781    0
#> XGB   0.657   0.661  0.695 0.704   0.730 0.777    0
#> Nnet  0.631   0.710  0.711 0.715   0.759 0.766    0
#> 
#> AUCPR 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.660   0.711  0.715 0.753   0.801 0.877    0
#> Lasso 0.653   0.704  0.718 0.751   0.796 0.886    0
#> RF    0.640   0.699  0.734 0.747   0.790 0.872    0
#> XGB   0.633   0.697  0.739 0.747   0.788 0.877    0
#> Nnet  0.486   0.542  0.688 0.639   0.711 0.766    0
#> 
#> AUCROC 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.710   0.733  0.771 0.766   0.793 0.823    0
#> Lasso 0.703   0.735  0.768 0.765   0.790 0.827    0
#> RF    0.694   0.747  0.756 0.758   0.788 0.806    0
#> XGB   0.687   0.749  0.761 0.758   0.779 0.814    0
#> Nnet  0.668   0.723  0.768 0.744   0.773 0.787    0
#> 
#> Brier 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.171   0.172  0.187 0.187   0.197 0.206    0
#> Lasso 0.180   0.184  0.195 0.193   0.198 0.208    0
#> RF    0.168   0.179  0.193 0.191   0.206 0.211    0
#> XGB   0.170   0.184  0.187 0.191   0.199 0.214    0
#> Nnet  0.171   0.182  0.208 0.208   0.217 0.262    0
#> 
#> F 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.528   0.564  0.617 0.637   0.669 0.808    0
#> Lasso 0.538   0.540  0.575 0.609   0.612 0.779    0
#> RF    0.519   0.581  0.583 0.604   0.587 0.752    0
#> XGB   0.540   0.599  0.616 0.617   0.632 0.696    0
#> Nnet  0.484   0.532  0.641 0.629   0.662 0.828    0
#> 
#> Kappa 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.310   0.316  0.433 0.404   0.480 0.482    0
#> Lasso 0.294   0.341  0.372 0.378   0.436 0.450    0
#> RF    0.279   0.358  0.383 0.372   0.389 0.450    0
#> XGB   0.270   0.335  0.386 0.374   0.426 0.454    0
#> Nnet  0.228   0.364  0.399 0.375   0.417 0.465    0
#> 
#> Precision 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.616   0.718  0.785 0.748   0.790 0.829    0
#> Lasso 0.622   0.757  0.782 0.757   0.787 0.841    0
#> RF    0.626   0.762  0.810 0.767   0.812 0.823    0
#> XGB   0.586   0.740  0.756 0.744   0.809 0.828    0
#> Nnet  0.542   0.672  0.774 0.728   0.787 0.863    0
#> 
#> Recall 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.418   0.508  0.520 0.563   0.579 0.787    0
#> Lasso 0.418   0.455  0.477 0.515   0.501 0.725    0
#> RF    0.443   0.448  0.455 0.505   0.478 0.700    0
#> XGB   0.496   0.498  0.500 0.529   0.552 0.600    0
#> Nnet  0.336   0.523  0.578 0.585   0.612 0.875    0

Plots of the Summary Table

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####  Box and whisker plot to compare the In-sample models.  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
scales <- list(x=list(relation="free"), y=list(relation="free"))
lattice::bwplot(results, scales=scales)

ggsave("output/fig/results_box_and_whisker_plot_in_sample_resamples.tiff")

Log Loss (.fit3 models)

Both the Brier score and log loss measure how well a model predicts probabilities for binary outcomes. The Brier score is the mean squared difference between the predicted probability and the true outcome, over all observations: \[\text{Brier Score} = \frac{1}{N}\sum_{i=1}^{N}\left(y_i - \hat{y}_i\right)^2\] Log loss is the negative log-likelihood of the predicted probability for the true outcome. Here, \(y_i\) is the true binary target (0 or 1) and \(\hat{y}_i\) is the predicted probability of the positive class; log loss measures the average error between the predicted probabilities and the true targets, with lower values indicating better performance: \[\text{Log Loss} = -\frac{1}{N}\sum_{i=1}^{N}\left[y_i \log(\hat{y}_i) + (1 - y_i)\log(1 - \hat{y}_i)\right]\]

The Brier score is directly interpretable: it is a single number giving the mean squared difference between the predicted probabilities and the true outcomes. It ranges from 0 to 1, and lower scores indicate a better model.

Log loss penalizes predicted probabilities that are far from the true outcome more heavily, and it ranges from 0 to positive infinity, with 0 being a perfect score. This can make it a better metric for choosing among several models that are all performing well.

Another benefit of the Brier score is that it is more robust to class imbalance: it is a symmetric loss function around the decision threshold of 0.5 that treats false positives and false negatives equally, whereas log loss can give more weight to one class than the other when the sample is imbalanced. The Brier score is a commonly used measure of the accuracy of probabilistic predictions, while log loss is a commonly used measure of the performance of probability estimates.
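
For intuition, both metrics can be computed side by side on the same toy predictions (illustrative values only):

#Illustrative comparison of the two metrics on made-up predictions
actual    <- c(1, 0, 1, 1, 0)
predicted <- c(0.90, 0.20, 0.65, 0.40, 0.10)
brier    <- mean((actual - predicted)^2)
log_loss <- -mean(actual * log(predicted) + (1 - actual) * log(1 - predicted))
c(Brier = brier, LogLoss = log_loss)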

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####  Check logloss.  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

set.seed(1978)
#train_match2$Match_Status <- ifelse(train_match2$Match_Status == 1, "Matched", "Not_Matched")

#set cross validation
folds = caret::groupKFold(group = train_match$Location, k=7)

control <- caret::trainControl(
  method = "repeatedcv",
  #method = "adaptive_cv",
  number = 3,
  repeats = 1,
  index = folds,
  classProbs = TRUE,
  summaryFunction = mnLogLoss,
  verboseIter = FALSE,
  allowParallel = TRUE,
  savePredictions = TRUE,
  adaptive = list(min = 5, alpha = 0.05, method = "gls", complete = TRUE)
)

#control <- caret::trainControl(method="repeatedcv", number=10, repeats=3, index = folds, classProbs = TRUE, summaryFunction = mnLogLoss)

#Set lambda sequence for lasso regression
lambda <- 10^seq(-3, 3, length = 100)

#XGboost parameters tuning
# xgbGrid <- expand.grid(nrounds = c(100,200),  
#                        max_depth = c(10, 15, 20, 25),
#                        colsample_bytree = seq(0.5, 0.9, length.out = 5),
#                        eta = 0.1,
#                        gamma=0,
#                        min_child_weight = 1,
#                        subsample = 0.8
#                       )
xgbGrid <- expand.grid(nrounds = c(100,150),  
                       max_depth = c(10, 15),
                       colsample_bytree = c(0.5,0.7,0.9),
                       eta = c(0.1,0.2),
                       gamma=0,
                       min_child_weight = 1,
                       subsample = 0.8
                      )

#catboost training
grid <- expand.grid(depth = c(6,7,9),
                    learning_rate = 0.1,
                    iterations = 10, l2_leaf_reg=5, rsm=0.8, border_count=30 )

train_match2 <- na.omit(train_match)

#data prep for catboost
# x= train_match2[,-1]
# y= train_match2[1]
# 
# y <- y$Match_Status
# 
# x2= test_match2[,-1]
# y2= test_match2[1]
# y2 <- y2$Match_Status
# 
# x[sapply(x, is.character)] <- lapply(x[sapply(x, is.character)], as.factor)
# x2[sapply(x2, is.character)] <- lapply(x2[sapply(x2, is.character)], as.factor)

Train models on train data for LOG LOSS

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####   Log Loss on the models: in-sample.  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####
#set cross validation
folds = caret::groupKFold(group = train_match$Location, k=7)
#control <- caret::trainControl(method="repeatedcv", number=10, repeats=3, index = folds, classProbs = TRUE,
 #                    summaryFunction = mnLogLoss)

train_match2 <- na.omit(train_match2)

#logistic model
set.seed(1978)
logit.fit3 <- caret::train(Match_Status~., data=train_match2, 
                           method="glmnet", 
                           trControl=control, 
                           family= "binomial",
                           metric = "logLoss")
###
# pred <- predict(logit.fit3, newdata = train_match2, type = "prob")[,2]
# train_match2$Match_Status <- ifelse(train_match2$Match_Status == 1, "Matched", "Not_Matched")
# obs <- as.numeric(train_match2$Match_Status)
# log_loss_value <- defaultSummary(pred, obs, "logloss")
###

#Lasso model
set.seed(1978)
lasso.fit3 <- caret::train(Match_Status ~., data = train_match2, method = "glmnet", family= "binomial", trControl = control, tuneGrid = expand.grid(alpha = 1, lambda = lambda), metric = "logLoss")

#Random forest
set.seed(1978)
rf.fit3 <- train(Match_Status~., data=train_match2, method="rf", trControl=control, metric = "logLoss")

#Xgboost. #RUNS FOREVER
# set.seed(1978)
# xgb.fit3 <- train(Match_Status~., data=train_match2, method="xgbTree", trControl=control, tuneGrid = xgbGrid, metric = "logLoss")

#CATboost
set.seed(1978)
# CAT.fit3 <- train(x[1:31], as.factor(y),
#                 method = catboost.caret,
#                 logging_level = 'Verbose', preProc = NULL,
#                 tuneGrid = grid, trControl = control, metric = "logLoss")

#Neural net
set.seed(1978)
nnet.fit3 <- train(Match_Status~., data=train_match2, method="nnet", trControl=control, metric ="logLoss")
#> # weights:  142
#> initial  value 2326.085011 
#> iter  10 value 2116.070449
#> iter  20 value 2111.594697
#> iter  30 value 2110.655669
#> iter  40 value 2101.593357
#> iter  50 value 2095.273575
#> iter  60 value 2001.593060
#> iter  70 value 1942.859357
#> iter  80 value 1861.482988
#> iter  90 value 1761.680133
#> iter 100 value 1725.313317
#> final  value 1725.313317 
#> stopped after 100 iterations
# collect resamples
results_log_loss <- caret::resamples(list(Logit=logit.fit3, Lasso=lasso.fit3, RF=rf.fit3, #XGB=xgb.fit3, #CAT= CAT.fit3,
                                  Nnet= nnet.fit3))

Summary Table of LOG LOSS

# summarize differences between models
summary(results_log_loss)
#> 
#> Call:
#> summary.resamples(object = results_log_loss)
#> 
#> Models: Logit, Lasso, RF, Nnet 
#> Number of resamples: 5 
#> 
#> logLoss 
#>        Min. 1st Qu. Median  Mean 3rd Qu.  Max. NA's
#> Logit 0.517   0.529  0.554 0.557   0.580 0.602    0
#> Lasso 0.514   0.519  0.552 0.553   0.578 0.603    0
#> RF    0.514   0.535  0.574 0.568   0.604 0.612    0
#> Nnet  0.532   0.548  0.598 0.621   0.701 0.726    0

Plot of LOG LOSS

# box and whisker plots to compare models
scales <- list(x=list(relation="free"), y=list(relation="free"))
#lattice::bwplot(results_log_loss, scales=scales)

lattice::bwplot(results_log_loss, 
                scales=scales, 
                main="Log Loss for the Different\n Models and Testing Data", 
                #xlab="Models", 
                #lab="Log Loss", 
                col="blue", 
                pch=16, 
                cex=1.5, 
                outline="black", 
                lty=1, 
                lwd=1, 
                fill="lightblue")

Find LOG LOSS Confidence interval

logistic_CI3 = logit.fit3$results$logLoss[1]+c(-1,1)*1.96*logit.fit3$results$logLossSD[1]
lasso_CI3 = lasso.fit3$results$logLoss[1]+c(-1,1)*1.96*lasso.fit3$results$logLossSD[1]
rf_CI3 = rf.fit3$results$logLoss[1]+c(-1,1)*1.96*rf.fit3$results$logLossSD[1]
#xgb_CI3 = xgb.fit3$results$logLoss[1]+c(-1,1)*1.96*xgb.fit3$results$logLossSD[1]
#cat_CI3 <- CAT.fit3$results$logLoss[1]+c(-1,1)*1.96*CAT.fit3$results$logLossSD[1]
nnet_CI3 <- nnet.fit3$results$logLoss[1]+c(-1,1)*1.96*nnet.fit3$results$logLossSD[1]

CI3 <- data.frame(logistic_CI= logistic_CI3, lasso_CI = lasso_CI3, rf_CI = rf_CI3, #xgb_CI = xgb_CI3, #cat_CI=cat_CI3, 
                  nnet_CI=nnet_CI3)
rownames(CI3) <- c("Lower", "Upper")
CI3
colnames(CI3) <- c("Logistic Regression", "Lasso", "Random Forest", #"XGBoost", 
                   "Neural Network")
knitr_table_html(CI3, "Log Loss Confidence Intervals for the Different Models (Cross-Validated on the Training Data)")
Log Loss Confidence Intervals for the Different Models (Cross-Validated on the Training Data)
Logistic Regression Lasso Random Forest Neural Network
Lower 0.479 0.462 0.484 0.638
Upper 0.676 0.690 0.652 0.761
logit3 = logit.fit3$results$logLoss[1]
lasso3 = lasso.fit3$results$logLoss[1]
rf3 = rf.fit3$results$logLoss[1]
#xgb3 = xgb.fit3$results$logLoss[1]
#cat3 = CAT.fit3$results$logLoss[1]
nnet3 = nnet.fit3$results$logLoss[1]

Metric_value3 <- data.frame(logit = logit3, lasso = lasso3, RF =rf3, #XGB = xgb3, #CAT=cat3, 
                            Nnet=nnet3)
Metric_value3

Summarized p-values for pair-wise comparisons.

# difference in model predictions
diffs <- diff(results_log_loss)
# summarize p-values for pair-wise comparisons
summary(diffs)
#> 
#> Call:
#> summary.diff.resamples(object = diffs)
#> 
#> p-value adjustment: bonferroni 
#> Upper diagonal: estimates of the difference
#> Lower diagonal: p-value for H0: difference = 0
#> 
#> logLoss 
#>       Logit Lasso   RF      Nnet   
#> Logit        0.0033 -0.0114 -0.0642
#> Lasso 1.000         -0.0147 -0.0675
#> RF    1.000 1.000           -0.0528
#> Nnet  0.670 0.547   0.497

Variable importance

Evaluating the importance of variables in a model helps to understand which predictors have the most impact on the outcome. This information can be used to improve the model by selecting the most relevant variables, or to gain insight into the underlying relationships in the data. Additionally, evaluating variable importance can help in detecting potential confounding variables or bias in the data. varImp showcases variable importance of the variables used in the training model.
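
The importance objects used below (logit_imp, lasso_imp, rf_imp, xgb_imp, and nnet_imp) are not created in this chunk; presumably they come from calling caret::varImp on each fitted model, roughly as sketched here:

#Assumed construction of the variable-importance objects referenced below
logit_imp <- caret::varImp(logit.fit)
lasso_imp <- caret::varImp(lasso.best.fit)
rf_imp    <- caret::varImp(rf.fit)
xgb_imp   <- caret::varImp(xgb.fit)
nnet_imp  <- caret::varImp(nnet.fit)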

# Combine the importance data frames into a single data frame
logit_imp_r <- dplyr::arrange(logit_imp$importance, desc(logit_imp$importance$Overall)) 
logit_imp_r$variable <- row.names(logit_imp_r)
logit_imp_r$model <- "logit"

lasso_imp_r <- dplyr::arrange(lasso_imp$importance, desc(lasso_imp$importance$Overall))
lasso_imp_r$variable <- row.names(lasso_imp_r)
lasso_imp_r$model <- "lasso"

rf_imp_r <- dplyr::arrange(rf_imp$importance, desc(rf_imp$importance$Overall))
rf_imp_r$variable <- row.names(rf_imp_r)
rf_imp_r$model <- "rf"

xgb_imp_r <- dplyr::arrange(xgb_imp$importance, desc(xgb_imp$importance$Overall))
xgb_imp_r$variable <- row.names(xgb_imp_r)
xgb_imp_r$model <- "xgb"
# cat_imp_r <- dplyr::arrange(cat_imp$importance, desc(cat_imp$importance$Overall))
# cat_imp_r$variable <- row.names(cat_imp_r)

nnet_imp_r <- dplyr::arrange(nnet_imp$importance, desc(nnet_imp$importance$Overall))
nnet_imp_r$variable <- row.names(nnet_imp_r)
nnet_imp_r$model <- "nnet"

imp_df <- bind_rows(
   logit_imp_r,
   lasso_imp_r,
   rf_imp_r,
   xgb_imp_r,
   nnet_imp_r
 ) %>% dplyr::filter(Overall > (mean(Overall) + sd(Overall)))
  # TODO: drop the dummy-coded variables whose names begin with "X",
  # e.g. dplyr::select(imp_df, variable, -dplyr::starts_with("X"))

# Re-format the data frame for ggplot
imp_df_plot <- imp_df  %>% 
  ggplot(aes(x = variable, y = Overall, fill = model)) +
  geom_col(position = "dodge") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) + 
  theme(legend.position = "bottom") +
  xlab("Feature") +
  ylab("Importance") +
  labs(title = "Variable Importance where Importance is Greater than the Mean Plus One Standard Deviation"); imp_df_plot

###
tm_ggsave(imp_df_plot, "Variable Importance where Importance is Greater than the Mean Plus One Standard Deviation")
#> [1] "Function Sanity Check: Saving a ggplot image as a TIFF"

Plot Variable importance

#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
#####  Plot the variable importance for each model. In-sample.  ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

# Open device
tiff("output/fig/logit_importance_for_each_model.tiff", compression = "zip")

# Make a plot
plot(logit_imp, main = "Calculation of Variable Importance for Logistic Model")

# Close device
dev.off()
#> svg 
#>   2
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### #### 
##### Plot the model importance. In-sample. ----
#### #### #### #### #### #### #### #### #### #### #### #### #### #### #### ####

plot(lasso_imp, main = "Calculation of Variable Importance for Lasso model")

plot(rf_imp, main = "Calculation of Variable Importance for RF model")

plot(xgb_imp, main = "Calculation of Variable Importance for XGB model")

#plot(cat_imp, main = "Calculation of Variable Importance for CAT model")
plot(nnet_imp, main = "Calculation of Variable Importance for Nnet model")

Statistical Significance

#The first line of this code calculates the difference in model predictions between all pairs of models in the 'results' variable.
diffs <- diff(results)

#The second line then applies the summary() function to the 'diffs' variable, which calculates the summary statistics of the model differences, such as the mean, minimum, and maximum values, and p-values for pair-wise comparisons between models in the 'results' variable.
summary(diffs)
#> 
#> Call:
#> summary.diff.resamples(object = diffs)
#> 
#> p-value adjustment: bonferroni 
#> Upper diagonal: estimates of the difference
#> Lower diagonal: p-value for H0: difference = 0
#> 
#> Acc 
#>       Logit Lasso    RF       XGB      Nnet    
#> Logit        0.01171  0.01500  0.02117  0.00997
#> Lasso 1               0.00328  0.00946 -0.00175
#> RF    1     1                  0.00618 -0.00503
#> XGB   1     1        1                 -0.01121
#> Nnet  1     1        1        1                
#> 
#> AUCPR 
#>       Logit Lasso    RF       XGB      Nnet    
#> Logit       0.001634 0.005692 0.006288 0.114097
#> Lasso 1              0.004057 0.004654 0.112463
#> RF    1     1                 0.000597 0.108405
#> XGB   1     1        1                 0.107809
#> Nnet  1     1        1        1                
#> 
#> AUCROC 
#>       Logit Lasso     RF        XGB       Nnet     
#> Logit       0.0013905 0.0078076 0.0078361 0.0223987
#> Lasso 1.000           0.0064171 0.0064456 0.0210082
#> RF    1.000 1.000               0.0000286 0.0145911
#> XGB   1.000 1.000     1.000               0.0145625
#> Nnet  0.525 0.629     1.000     1.000              
#> 
#> Brier 
#>       Logit Lasso     RF        XGB       Nnet     
#> Logit       -0.006423 -0.004699 -0.004051 -0.021046
#> Lasso 0.350            0.001724  0.002372 -0.014623
#> RF    1.000 1.000                0.000649 -0.016347
#> XGB   1.000 1.000     1.000               -0.016995
#> Nnet  0.865 1.000     1.000     1.000              
#> 
#> F 
#>       Logit Lasso    RF       XGB      Nnet    
#> Logit        0.02828  0.03296  0.02053  0.00774
#> Lasso 0.64            0.00468 -0.00775 -0.02054
#> RF    1.00  1.00              -0.01243 -0.02522
#> XGB   1.00  1.00     1.00              -0.01279
#> Nnet  1.00  1.00     1.00     1.00             
#> 
#> Kappa 
#>       Logit Lasso     RF        XGB       Nnet     
#> Logit        0.025855  0.032602  0.030195  0.029753
#> Lasso 1                0.006747  0.004340  0.003897
#> RF    1     1                   -0.002407 -0.002850
#> XGB   1     1         1                   -0.000442
#> Nnet  1     1         1         1                  
#> 
#> Precision 
#>       Logit Lasso    RF       XGB      Nnet    
#> Logit       -0.00997 -0.01901  0.00387  0.01999
#> Lasso 1              -0.00904  0.01384  0.02996
#> RF    1     1                  0.02287  0.03899
#> XGB   1     1        1                  0.01612
#> Nnet  1     1        1        1                
#> 
#> Recall 
#>       Logit Lasso   RF      XGB     Nnet   
#> Logit        0.0475  0.0579  0.0335 -0.0221
#> Lasso 0.23           0.0104 -0.0140 -0.0696
#> RF    1.00  1.00            -0.0244 -0.0800
#> XGB   1.00  1.00    1.00            -0.0556
#> Nnet  1.00  1.00    1.00    1.00

There is no significant difference between models.

stopCluster(cluster) # shut down the parallel back end started at the beginning of this section (cl was already stopped above)