When working on our group project, our team members needed to build our own GLM models using the same input datasets. For the final submission of our report, we chose the best 2 models in order to draw conclusion for the project. Apart from reading the same dataset as input, every one of us needed to code more or less the same diagnostic plots, and to provide RMSEs and R2 for comparison. The repetition of these common coding led me to write up this R script to facilitate the comparison of multiple GLM models in a consistent manner.
To use this script, you need to:
feat_columns
to a list of predictor variables in using your selection mechanism (eg manual selection or ridge and lasso selection);setTrainFeatures()
;setTestFeatures()
;runFit()
;model_summary
model_summary
objectrunFit()
functionThis function:
train()
function with the model and tuning parameters supplied;It maybe possible to make use of functional programming feature of R to encapsulate other modelling methods than GLM. Please comment and suggest ways of improving this script to make it more general.
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
# load packages, clear environment ----
library(tidyverse)
## -- Attaching packages ------------------------------------------------------------------------------------------------ tidyverse 1.3.0 --
## v ggplot2 3.3.2 v purrr 0.3.4
## v tibble 3.0.3 v dplyr 1.0.2
## v tidyr 1.1.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts --------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(here)
## here() starts at N:/Data_Work/20200808 36103 Statistical Thinking for DS/AT2/AT2C/STDS-AT2C
library(janitor)
##
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
##
## chisq.test, fisher.test
library(dplyr)
library(ggplot2)
library(caret)
## Loading required package: lattice
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(glmnet)
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
## Loaded glmnet 4.0-2
library(hydroGOF)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
# remove objects from current environment
remove(list = ls())
# read data ----
yield_by_region <- readRDS(here("project/src/output/yield_by_region.rds"))
# remove one outlier and one significant observation
yield_by_region <- yield_by_region[-222,]
yield_by_region <- yield_by_region[-180,]
# set train and test ----
train_size <- floor(0.7 * nrow(yield_by_region))
set.seed(10)
train_n <- sample(seq_len(nrow(yield_by_region)), size = train_size)
train <- yield_by_region[train_n, ]
test <- yield_by_region[-train_n, ]
########################################
# Common functions
setTrainFeatures <- function(x) {train %>% dplyr::select(x)}
setTestFeatures <- function(x) {test %>% dplyr::select(x)}
runFit <- function(model_name ='',train_feat,test_feat,control_method,folds,train_method,train_metric,train_family,train_tuneLength) {
# set up trainControl object
control <- trainControl( method = control_method # cross-validation
,number = folds # 5 folds
,savePredictions = "final"
,allowParallel = TRUE
)
# set up modelling parameters and run ----
fit <- train(
yield ~ .
,data = train_feat
,method = train_method
,metric = train_metric
,trControl = control
,family = train_family
,preProc = c("center","scale")
,tuneLength = train_tuneLength
)
# Make predictions on train
train_pred <- predict(fit, train_feat)
# Make predictions on test
test_pred <- predict(fit, test_feat)
fit$rmse_test <- rmse(test_pred, test$yield)
fit$rmse_train <- rmse(train_pred, train$yield)
fit$SI <- fit$rmse_test/mean(yield_by_region$yield) # (RL) changed to use rmse_test
fit$rmse_ratio <- fit$rmse_train/fit$rmse_test
# Diagnostic Plots
plot(fit$finalModel, sub = model_name)
return(fit)
}
########################################
######################
# MODEL # 1
# select features ----
feat_columns <- yield_by_region %>%
dplyr::select(state,lu_mean_temp_nov,lu_mean_rain_annual,lu_mean_solar_wet,lu_clay,lu_phosphorus,lu_nitrogen,lu_silt,total_water_used, yield) %>%
colnames()
train_feat_01 <- setTrainFeatures(feat_columns)
test_feat_01 <- setTestFeatures(feat_columns)
fit_01 <- runFit(
model_name = 'Model #1'
,train_feat_01
,test_feat_01
,control_method = 'cv'
,folds = 5
,train_method = 'glm'
,train_metric = 'RMSE'
,train_family = 'gaussian'
,train_tuneLength = 100
)
# print model coefficients
model_summary <- data.frame(
name = 'Model #1'
,rmse = fit_01[["results"]][["RMSE"]]
,rsquared = fit_01[["results"]][["Rsquared"]]
,rmse_test = fit_01[["rmse_test"]]
,rmse_train = fit_01[["rmse_train"]]
,SI = fit_01[["SI"]]
,rmse_ratio = fit_01[["rmse_ratio"]]
)
######################
# MODEL # 2
# select features ----
feat_columns <- yield_by_region %>%
dplyr::select(state:lu_water_capacity, total_water_used:main_total_white, yield) %>%
colnames()
train_feat_02 <- setTrainFeatures(feat_columns)
test_feat_02 <- setTestFeatures(feat_columns)
fit_02 <- runFit(
model_name = 'Model #2'
,train_feat_02
,test_feat_02
,control_method = 'cv'
,folds = 5
,train_method = 'glmnet'
,train_metric = 'RMSE'
,train_family = 'gaussian'
,train_tuneLength = 100
)
# print model coefficients
model_summary <- rbind(model_summary, data.frame(
name = 'Model #2'
,rmse = mean(fit_02[["results"]][["RMSE"]])
,rsquared = mean(fit_02[["results"]][["Rsquared"]])
,rmse_test = fit_02[["rmse_test"]]
,rmse_train = fit_02[["rmse_train"]]
,SI = fit_02[["SI"]]
,rmse_ratio = fit_02[["rmse_ratio"]]
)
)
######################
# MODEL # 3
# select features ----
feat_columns <- yield_by_region %>%
dplyr::select(state,grape_area_region,lu_mean_rain_jan,lu_ph,lu_water_capacity,total_water_used,petit_verdot_ratio,bernet_franc_ratio,chardonnay_ratio,colombard_ratio,muscat_blanc_ratio,main_chardonnay, yield) %>%
colnames()
train_feat_03 <- setTrainFeatures(feat_columns)
test_feat_03 <- setTestFeatures(feat_columns)
fit_03 <- runFit(
model_name = 'Model #3'
,train_feat_03
,test_feat_03
,control_method = 'cv'
,folds = 5
,train_method = 'glm'
,train_metric = 'RMSE'
,train_family = 'gaussian'
,train_tuneLength = 100
)
# print model coefficients
model_summary <- rbind(model_summary, data.frame(
name = 'Model #3'
,rmse = fit_03[["results"]][["RMSE"]]
,rsquared = fit_03[["results"]][["Rsquared"]]
,rmse_test = fit_03[["rmse_test"]]
,rmse_train = fit_03[["rmse_train"]]
,SI = fit_03[["SI"]]
,rmse_ratio = fit_03[["rmse_ratio"]]
)
)
######################
# MODEL # 4
# select features ----
feat_columns <- yield_by_region %>%
dplyr::select(state,lon,lat,lu_mean_temp_summer,lu_mean_temp_jan,lu_mean_temp_feb,lu_mean_temp_sep,lu_mean_temp_oct,lu_mean_temp_nov,lu_mean_temp_dec,lu_mean_solar_wet,lu_ph,lu_water_capacity,yield) %>%
colnames()
train_feat_04 <- setTrainFeatures(feat_columns)
test_feat_04 <- setTestFeatures(feat_columns)
fit_04 <- runFit(
model_name = 'Model #4'
,train_feat_04
,test_feat_04
,control_method = 'cv'
,folds = 5
,train_method = 'glm'
,train_metric = 'RMSE'
,train_family = 'gaussian'
,train_tuneLength = 100
)
# print model coefficients
model_summary <- rbind(model_summary, data.frame(
name = 'Model #4'
,rmse = fit_04[["results"]][["RMSE"]]
,rsquared = fit_04[["results"]][["Rsquared"]]
,rmse_test = fit_04[["rmse_test"]]
,rmse_train = fit_04[["rmse_train"]]
,SI = fit_04[["SI"]]
,rmse_ratio = fit_04[["rmse_ratio"]]
)
)
######################
# MODEL # 4
# select features ----
feat_columns <- yield_by_region %>%
dplyr::select(state,lon,lat,lu_mean_temp_summer,lu_mean_temp_jan,lu_mean_temp_feb,lu_mean_temp_sep,lu_mean_temp_oct,lu_mean_temp_nov,lu_mean_temp_dec,lu_mean_solar_wet,lu_ph,lu_water_capacity,yield) %>%
colnames()
train_feat_04 <- setTrainFeatures(feat_columns)
test_feat_04 <- setTestFeatures(feat_columns)
fit_04 <- runFit(
model_name = 'Model #4'
,train_feat_04
,test_feat_04
,control_method = 'cv'
,folds = 5
,train_method = 'glm'
,train_metric = 'RMSE'
,train_family = 'gaussian'
,train_tuneLength = 100
)
# print model coefficients
model_summary <- rbind(model_summary, data.frame(
name = 'Model #4'
,rmse = fit_04[["results"]][["RMSE"]]
,rsquared = fit_04[["results"]][["Rsquared"]]
,rmse_test = fit_04[["rmse_test"]]
,rmse_train = fit_04[["rmse_train"]]
,SI = fit_04[["SI"]]
,rmse_ratio = fit_04[["rmse_ratio"]]
)
)
# Models Scatter Plots
plot(model_summary)