When working on our group project, each team member needed to build their own GLM models from the same input dataset. For the final submission of our report, we chose the best two models to draw conclusions for the project. Apart from reading the same dataset as input, each of us needed to code more or less the same diagnostic plots and to produce RMSE and R² values for comparison. This repetition of common code led me to write the R script below, which facilitates the comparison of multiple GLM models in a consistent manner.

How to use this script

To use this script, you need to:

  1. supply the common dataset on which the models should be built;
  2. perform any filtering as necessary;
  3. specify the percentage split between the training and test sets (e.g. 70% / 30%);
  4. for each model (a sketch of this per-model block follows the list):
       1. set feat_columns to a list of predictor variables using your selection mechanism (e.g. manual selection, or ridge and lasso selection);
       2. create the training dataset by calling setTrainFeatures();
       3. create the test dataset by calling setTestFeatures();
       4. create the GLM model object by calling runFit();
       5. insert the model summary statistics into the common data frame model_summary;
  5. compare the different models by analyzing and plotting from the model_summary object.
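
In outline, each per-model block looks like the sketch below (predictor_1 and predictor_2 are placeholder column names; the full working example appears later in this post):

feat_columns <- c("predictor_1", "predictor_2", "yield")  # selected predictors plus the response
train_feat <- setTrainFeatures(feat_columns)              # training rows, selected columns only
test_feat <- setTestFeatures(feat_columns)                # test rows, selected columns only
fit <- runFit(
  model_name = "My model"
  ,train_feat = train_feat
  ,test_feat = test_feat
  ,control_method = "cv"
  ,folds = 5
  ,train_method = "glm"
  ,train_metric = "RMSE"
  ,train_family = "gaussian"
  ,train_tuneLength = 100
)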

The runFit() function

This function:

  1. sets up the training control object;
  2. calls the train() function with the model and tuning parameters supplied (a minimal sketch of these two steps follows the list);
  3. creates and stores predicted values;
  4. stores the diagnostic statistics in the return object;
  5. produces diagnostic plots.
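
At their core, steps 1 and 2 reduce to caret's trainControl() and train(). A minimal sketch mirroring the full definition in the example below (train_feat here is assumed to be a data frame containing yield plus the selected predictors):

control <- trainControl(method = "cv", number = 5,
                        savePredictions = "final", allowParallel = TRUE)
fit <- train(yield ~ ., data = train_feat,
             method = "glm", metric = "RMSE", trControl = control,
             family = "gaussian", preProc = c("center", "scale"))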

Future enhancements

It may be possible to use R's functional programming features to encapsulate modelling methods other than GLM. Please comment and suggest ways of improving this script to make it more general.
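
As a starting point, here is a sketch of one such direction (model_specs and the feat_columns_01 / feat_columns_02 vectors are hypothetical names): hold each model's specification in a named list and map runFit() over it with purrr, so that adding a model becomes a one-entry change.

model_specs <- list(
  `Model #1` = list(features = feat_columns_01, method = "glm")
  ,`Model #2` = list(features = feat_columns_02, method = "glmnet")
)
# imap() passes each spec together with its name to the fitting function
fits <- purrr::imap(model_specs, function(spec, name) {
  runFit(
    model_name = name
    ,train_feat = setTrainFeatures(spec$features)
    ,test_feat = setTestFeatures(spec$features)
    ,control_method = "cv"
    ,folds = 5
    ,train_method = spec$method
    ,train_metric = "RMSE"
    ,train_family = "gaussian"
    ,train_tuneLength = 100
  )
})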

Example - comparing 4 models

knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)

# load packages, clear environment ----
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2     v purrr   0.3.4
## v tibble  3.0.3     v dplyr   1.0.2
## v tidyr   1.1.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at N:/Data_Work/20200808 36103 Statistical Thinking for DS/AT2/AT2C/STDS-AT2C
library(janitor)
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(dplyr)    # already attached by tidyverse; kept for explicitness
library(ggplot2)  # already attached by tidyverse; kept for explicitness
library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
library(glmnet)
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.0-2
library(hydroGOF)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
# remove objects from current environment
remove(list = ls())

# read data ----
yield_by_region <- readRDS(here("project/src/output/yield_by_region.rds"))

# remove one outlier and one influential observation
# (drop row 222 before row 180 so the second index is unaffected by the first removal)
yield_by_region <- yield_by_region[-222,]
yield_by_region <- yield_by_region[-180,]

# set train and test ----
train_size <- floor(0.7 * nrow(yield_by_region))

set.seed(10)  # for a reproducible train/test split
train_n <- sample(seq_len(nrow(yield_by_region)), size = train_size)

train <- yield_by_region[train_n, ]
test <- yield_by_region[-train_n, ]

########################################
# Common functions
# subset the training / test sets to the selected feature columns;
# all_of() makes selection with an external character vector explicit
setTrainFeatures <- function(x) {train %>% dplyr::select(all_of(x))}

setTestFeatures <- function(x) {test %>% dplyr::select(all_of(x))}

runFit <- function(model_name = '', train_feat, test_feat, control_method, folds,
                   train_method, train_metric, train_family, train_tuneLength) {
  # set up trainControl object
  control <- trainControl(  method = control_method  # e.g. "cv" for cross-validation
                            ,number = folds          # number of resampling folds
                            ,savePredictions = "final"
                            ,allowParallel = TRUE
  )
  # set up modelling parameters and run ----
  fit <- train(
    yield ~ .
    ,data = train_feat
    ,method = train_method
    ,metric = train_metric
    ,trControl = control
    ,family = train_family
    ,preProc = c("center","scale")
    ,tuneLength = train_tuneLength
  )
  # Make predictions on train
  train_pred <- predict(fit, train_feat)
  
  # Make predictions on test
  test_pred <- predict(fit, test_feat)

  fit$rmse_test <- rmse(test_pred, test$yield)    # hydroGOF::rmse(sim, obs)
  fit$rmse_train <- rmse(train_pred, train$yield)
  fit$SI <- fit$rmse_test/mean(yield_by_region$yield)  # scatter index (RL: changed to use rmse_test)
  fit$rmse_ratio <- fit$rmse_train/fit$rmse_test       # values well below 1 suggest overfitting

  # diagnostic plots (base-R glm diagnostics, or coefficient paths for glmnet)
  plot(fit$finalModel, sub = model_name)

  return(fit)
}
########################################

Model 1

######################
# MODEL # 1
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state,lu_mean_temp_nov,lu_mean_rain_annual,lu_mean_solar_wet,lu_clay,lu_phosphorus,lu_nitrogen,lu_silt,total_water_used, yield) %>% 
  colnames()

train_feat_01 <- setTrainFeatures(feat_columns)

test_feat_01 <- setTestFeatures(feat_columns)

fit_01 <- runFit(
  model_name = 'Model #1'
  ,train_feat = train_feat_01
  ,test_feat = test_feat_01
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glm'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# collect model summary statistics
model_summary <- data.frame(
  name = 'Model #1'
  ,rmse = fit_01[["results"]][["RMSE"]]
  ,rsquared = fit_01[["results"]][["Rsquared"]]
  ,rmse_test = fit_01[["rmse_test"]]
  ,rmse_train = fit_01[["rmse_train"]]
  ,SI = fit_01[["SI"]]
  ,rmse_ratio = fit_01[["rmse_ratio"]]
)

Model 2

######################
# MODEL # 2
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state:lu_water_capacity, total_water_used:main_total_white, yield) %>% 
  colnames()

train_feat_02 <- setTrainFeatures(feat_columns)

test_feat_02 <- setTestFeatures(feat_columns)

fit_02 <- runFit(
  model_name = 'Model #2'
  ,train_feat = train_feat_02
  ,test_feat = test_feat_02
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glmnet'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# append model summary statistics
# (glmnet returns one result row per tuning-grid point, hence the mean across the grid)
model_summary <- rbind(model_summary,  data.frame(
  name = 'Model #2'
  ,rmse = mean(fit_02[["results"]][["RMSE"]])
  ,rsquared = mean(fit_02[["results"]][["Rsquared"]])
  ,rmse_test = fit_02[["rmse_test"]]
  ,rmse_train = fit_02[["rmse_train"]]
  ,SI = fit_02[["SI"]]
  ,rmse_ratio = fit_02[["rmse_ratio"]]
)
)

Model 3

######################
# MODEL # 3
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state,grape_area_region,lu_mean_rain_jan,lu_ph,lu_water_capacity,total_water_used,petit_verdot_ratio,bernet_franc_ratio,chardonnay_ratio,colombard_ratio,muscat_blanc_ratio,main_chardonnay, yield) %>% 
  colnames()

train_feat_03 <- setTrainFeatures(feat_columns)

test_feat_03 <- setTestFeatures(feat_columns)

fit_03 <- runFit(
  model_name = 'Model #3'
  ,train_feat = train_feat_03
  ,test_feat = test_feat_03
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glm'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# append model summary statistics
model_summary <- rbind(model_summary, data.frame(
  name = 'Model #3'
  ,rmse = fit_03[["results"]][["RMSE"]]
  ,rsquared = fit_03[["results"]][["Rsquared"]]
  ,rmse_test = fit_03[["rmse_test"]]
  ,rmse_train = fit_03[["rmse_train"]]
  ,SI = fit_03[["SI"]]
  ,rmse_ratio = fit_03[["rmse_ratio"]]
)
)

Model 4

######################
# MODEL # 4
# select features ----
feat_columns <- yield_by_region %>% 
  dplyr::select(state,lon,lat,lu_mean_temp_summer,lu_mean_temp_jan,lu_mean_temp_feb,lu_mean_temp_sep,lu_mean_temp_oct,lu_mean_temp_nov,lu_mean_temp_dec,lu_mean_solar_wet,lu_ph,lu_water_capacity,yield) %>% 
  colnames()

train_feat_04 <- setTrainFeatures(feat_columns)

test_feat_04 <- setTestFeatures(feat_columns)

fit_04 <- runFit(
  model_name = 'Model #4'
  ,train_feat = train_feat_04
  ,test_feat = test_feat_04
  ,control_method = 'cv'
  ,folds = 5
  ,train_method = 'glm'
  ,train_metric = 'RMSE'
  ,train_family = 'gaussian'
  ,train_tuneLength = 100
)

# append model summary statistics
model_summary <- rbind(model_summary, data.frame(
  name = 'Model #4'
  ,rmse = fit_04[["results"]][["RMSE"]]
  ,rsquared = fit_04[["results"]][["Rsquared"]]
  ,rmse_test = fit_04[["rmse_test"]]
  ,rmse_train = fit_04[["rmse_train"]]
  ,SI = fit_04[["SI"]]
  ,rmse_ratio = fit_04[["rmse_ratio"]]
)
)

Model Summary

# model comparison scatter plots: pairs plot of the numeric summary columns
# (the character name column is dropped, as plot() cannot handle it)
plot(model_summary[, -1])
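
The pairs plot gives a quick overview. For a more targeted comparison, a simple ggplot2 sketch using the model_summary data frame built above:

# test RMSE per model as a bar chart
model_summary %>%
  ggplot(aes(x = name, y = rmse_test)) +
  geom_col() +
  labs(title = "Test RMSE by model", x = NULL, y = "RMSE (test)")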