When working on our group project, each team member needed to build their own GLM models from the same input dataset. For the final submission of our report, we chose the best two models to draw conclusions for the project. Apart from reading the same dataset as input, each of us needed to code more or less the same diagnostic plots and to produce RMSE and R² values for comparison. This repetition of common code led me to write the R script below, which facilitates the comparison of multiple GLM models in a consistent manner.

How to use this script

To use this script, you need to:

  1. supply the common dataset on which the models should be built;
  2. perform any filtering as necessary;
  3. specify the percentage split between the training and test sets (e.g. 70% / 30%);
  4. for each model (a sketch of this per-model block follows the list):
       1. set feat_columns to a list of predictor variables using your selection mechanism (e.g. manual selection, or ridge and lasso selection);
       2. create the training dataset by calling setTrainFeatures();
       3. create the test dataset by calling setTestFeatures();
       4. create the GLM model object by calling runFit();
       5. insert the model summary statistics into the common data frame model_summary;
  5. compare the different models by analyzing and plotting from the model_summary object.
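
In outline, each per-model block looks like the sketch below (predictor_1 and predictor_2 are placeholder column names; the full working example appears later in this post):

feat_columns <- c("predictor_1", "predictor_2", "yield")  # selected predictors plus the response
train_feat <- setTrainFeatures(feat_columns)              # training rows, selected columns only
test_feat <- setTestFeatures(feat_columns)                # test rows, selected columns only
fit <- runFit(
  model_name = "My model"
  ,train_feat = train_feat
  ,test_feat = test_feat
  ,control_method = "cv"
  ,folds = 5
  ,train_method = "glm"
  ,train_metric = "RMSE"
  ,train_family = "gaussian"
  ,train_tuneLength = 100
)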

The runFit() function

This function:

  1. sets up the training control object;
  2. calls the train() function with the model and tuning parameters supplied (a minimal sketch of these two steps follows the list);
  3. creates and stores predicted values;
  4. stores the diagnostic statistics in the return object;
  5. produces diagnostic plots.
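
At their core, steps 1 and 2 reduce to caret's trainControl() and train(). A minimal sketch mirroring the full definition in the example below (train_feat here is assumed to be a data frame containing yield plus the selected predictors):

control <- trainControl(method = "cv", number = 5,
                        savePredictions = "final", allowParallel = TRUE)
fit <- train(yield ~ ., data = train_feat,
             method = "glm", metric = "RMSE", trControl = control,
             family = "gaussian", preProc = c("center", "scale"))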

Future enhancements

It may be possible to use R's functional programming features to encapsulate modelling methods other than GLM. Please comment and suggest ways of improving this script to make it more general.
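
As a starting point, here is a sketch of one such direction (model_specs and the feat_columns_01 / feat_columns_02 vectors are hypothetical names): hold each model's specification in a named list and map runFit() over it with purrr, so that adding a model becomes a one-entry change.

model_specs <- list(
  `Model #1` = list(features = feat_columns_01, method = "glm")
  ,`Model #2` = list(features = feat_columns_02, method = "glmnet")
)
# imap() passes each spec together with its name to the fitting function
fits <- purrr::imap(model_specs, function(spec, name) {
  runFit(
    model_name = name
    ,train_feat = setTrainFeatures(spec$features)
    ,test_feat = setTestFeatures(spec$features)
    ,control_method = "cv"
    ,folds = 5
    ,train_method = spec$method
    ,train_metric = "RMSE"
    ,train_family = "gaussian"
    ,train_tuneLength = 100
  )
})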

Example - comparing 4 models

knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)

# load packages, clear environment ----
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at N:/Data_Work/20200808 36103 Statistical Thinking for DS/AT2/AT2C/STDS-AT2C
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(dplyr)    # already attached by tidyverse; kept for explicitness
library(ggplot2)  # already attached by tidyverse; kept for explicitness
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.0-2
library(hydroGOF)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# remove objects from current environment
remove(list = ls())

# read data ----
yield_by_region <- readRDS(here("project/src/output/yield_by_region.rds"))

# remove one outlier and one influential observation
# (drop row 222 before row 180 so the second index is unaffected by the first removal)
yield_by_region <- yield_by_region[-222,]
yield_by_region <- yield_by_region[-180,]

# set train and test ----
train_size <- floor(0.7 * nrow(yield_by_region))

set.seed(10)  # for a reproducible train/test split
train_n <- sample(seq_len(nrow(yield_by_region)), size = train_size)

train <- yield_by_region[train_n, ]
test <- yield_by_region[-train_n, ]

########################################
# Common functions
# subset the training / test sets to the selected feature columns;
# all_of() makes selection with an external character vector explicit
setTrainFeatures <- function(x) {train %>% dplyr::select(all_of(x))}

setTestFeatures <- function(x) {test %>% dplyr::select(all_of(x))}

runFit <- function(model_name = '', train_feat, test_feat, control_method, folds,
                   train_method, train_metric, train_family, train_tuneLength) {
  # set up trainControl object
  control <- trainControl(  method = control_method  # e.g. "cv" for cross-validation
                            ,number = folds          # number of resampling folds
                            ,savePredictions = "final"
                            ,allowParallel = TRUE
  )
  # set up modelling parameters and run ----
  fit <- train(
    yield ~ .
    ,data = train_feat
    ,method = train_method
    ,metric = train_metric
    ,trControl = control
    ,family = train_family
    ,preProc = c("center","scale")
    ,tuneLength = train_tuneLength
  )
  # Make predictions on train
  train_pred <- predict(fit, train_feat)
  
  # Make predictions on test
  test_pred <- predict(fit, test_feat)

  fit$rmse_test <- rmse(test_pred, test$yield)    # hydroGOF::rmse(sim, obs)
  fit$rmse_train <- rmse(train_pred, train$yield)
  fit$SI <- fit$rmse_test/mean(yield_by_region$yield)  # scatter index (RL: changed to use rmse_test)
  fit$rmse_ratio <- fit$rmse_train/fit$rmse_test       # values well below 1 suggest overfitting

  # diagnostic plots (base-R glm diagnostics, or coefficient paths for glmnet)
  plot(fit$finalModel, sub = model_name)

  return(fit)
}
########################################

Model 1

######################
# MODEL # 1
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state,lu_mean_temp_nov,lu_mean_rain_annual,lu_mean_solar_wet,lu_clay,lu_phosphorus,lu_nitrogen,lu_silt,total_water_used, yield) %>% 
  colnames()

train_feat_01 <- setTrainFeatures(feat_columns)

test_feat_01 <- setTestFeatures(feat_columns)

fit_01 <- runFit(
  model_name = 'Model #1'
  ,train_feat = train_feat_01
  ,test_feat = test_feat_01
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glm'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# collect model summary statistics
model_summary <- data.frame(
  name = 'Model #1'
  ,rmse = fit_01[["results"]][["RMSE"]]
  ,rsquared = fit_01[["results"]][["Rsquared"]]
  ,rmse_test = fit_01[["rmse_test"]]
  ,rmse_train = fit_01[["rmse_train"]]
  ,SI = fit_01[["SI"]]
  ,rmse_ratio = fit_01[["rmse_ratio"]]
)

Model 2

######################
# MODEL # 2
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state:lu_water_capacity, total_water_used:main_total_white, yield) %>% 
  colnames()

train_feat_02 <- setTrainFeatures(feat_columns)

test_feat_02 <- setTestFeatures(feat_columns)

fit_02 <- runFit(
  model_name = 'Model #2'
  ,train_feat = train_feat_02
  ,test_feat = test_feat_02
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glmnet'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# append model summary statistics
# (glmnet returns one result row per tuning-grid point, hence the mean across the grid)
model_summary <- rbind(model_summary,  data.frame(
  name = 'Model #2'
  ,rmse = mean(fit_02[["results"]][["RMSE"]])
  ,rsquared = mean(fit_02[["results"]][["Rsquared"]])
  ,rmse_test = fit_02[["rmse_test"]]
  ,rmse_train = fit_02[["rmse_train"]]
  ,SI = fit_02[["SI"]]
  ,rmse_ratio = fit_02[["rmse_ratio"]]
)
)

Model 3

######################
# MODEL # 3
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state,grape_area_region,lu_mean_rain_jan,lu_ph,lu_water_capacity,total_water_used,petit_verdot_ratio,bernet_franc_ratio,chardonnay_ratio,colombard_ratio,muscat_blanc_ratio,main_chardonnay, yield) %>% 
  colnames()

train_feat_03 <- setTrainFeatures(feat_columns)

test_feat_03 <- setTestFeatures(feat_columns)

fit_03 <- runFit(
  model_name = 'Model #3'
  ,train_feat = train_feat_03
  ,test_feat = test_feat_03
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glm'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# append model summary statistics
model_summary <- rbind(model_summary, data.frame(
  name = 'Model #3'
  ,rmse = fit_03[["results"]][["RMSE"]]
  ,rsquared = fit_03[["results"]][["Rsquared"]]
  ,rmse_test = fit_03[["rmse_test"]]
  ,rmse_train = fit_03[["rmse_train"]]
  ,SI = fit_03[["SI"]]
  ,rmse_ratio = fit_03[["rmse_ratio"]]
)
)

Model 4

######################
# MODEL # 4
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state,lon,lat,lu_mean_temp_summer,lu_mean_temp_jan,lu_mean_temp_feb,lu_mean_temp_sep,lu_mean_temp_oct,lu_mean_temp_nov,lu_mean_temp_dec,lu_mean_solar_wet,lu_ph,lu_water_capacity,yield) %>% 
  colnames()

train_feat_04 <- setTrainFeatures(feat_columns)

test_feat_04 <- setTestFeatures(feat_columns)

fit_04 <- runFit(
  model_name = 'Model #4'
  ,train_feat = train_feat_04
  ,test_feat = test_feat_04
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glm'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# append model summary statistics
model_summary <- rbind(model_summary, data.frame(
  name = 'Model #4'
  ,rmse = fit_04[["results"]][["RMSE"]]
  ,rsquared = fit_04[["results"]][["Rsquared"]]
  ,rmse_test = fit_04[["rmse_test"]]
  ,rmse_train = fit_04[["rmse_train"]]
  ,SI = fit_04[["SI"]]
  ,rmse_ratio = fit_04[["rmse_ratio"]]
)
)

Model Summary

# model comparison scatter plots: pairs plot of the numeric summary columns
# (the character name column is dropped, as plot() cannot handle it)
plot(model_summary[, -1])
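
The pairs plot gives a quick overview. For a more targeted comparison, a simple ggplot2 sketch using the model_summary data frame built above:

# test RMSE per model as a bar chart
model_summary %>%
  ggplot(aes(x = name, y = rmse_test)) +
  geom_col() +
  labs(title = "Test RMSE by model", x = NULL, y = "RMSE (test)")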