1 Objectives

The objective of this document is to develop statistical models that predict the heat capacity, thermal conductivity and density of milk as functions of temperature, water content and fat content. Polynomial models of increasing complexity were fitted to a training set using best subset regression and assessed with \(R^2\), adjusted \(R^2\) and leave-one-out cross-validation (LOOCV) statistics. The final models selected were then evaluated on a hold-out test set.
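For reference, the adjusted \(R^2\) and the cross-validated root-mean-square error used for model assessment are defined as

\[
R^2_{\mathrm{adj}} = 1 - (1 - R^2)\,\frac{n-1}{n-p-1}, \qquad \mathrm{RMSE}_{\mathrm{LOOCV}} = \sqrt{\frac{1}{n}\sum_{i=1}^{n}\bigl(y_i - \hat{y}_{(-i)}\bigr)^2},
\]

where \(n\) is the number of training observations, \(p\) the number of model terms (excluding the intercept), \(y_i\) the observed value and \(\hat{y}_{(-i)}\) the prediction for observation \(i\) from the model refitted without that observation.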

The data used for this project was obtained from the research paper Influence of Temperature and Water and Fat Contents on the Thermophysical Properties of Milk, Minim et al., Journal of Chemical & Engineering Data 2002 47 (6), 1488-1491.

# Install and load packages
if (!require(pacman)) install.packages("pacman")
pacman::p_load(caret, tidyverse, ggfortify, readr, readxl, parallel, doParallel, gridExtra, plyr, GGally, broom, knitr, kableExtra, tictoc, glmulti, modelr, plot3D, install = TRUE)

2 Import Data

Import dataset from CSV file:

project_ID <- "milk_properties"
project_name <- "Milk Thermophysical Properties"

# Import data
data <- read.csv(file = "Data_Milk_Thermophysical_Properties.csv", header = TRUE, row.names = NULL)

# Convert fat and water to %
data$water <- data$water*100
data$fat <- data$fat*100

# Convert temperature from Kelvin to Celsius
data$temperature <- data$temperature - 273.15

write.csv(x = data, file = "Data_Milk_Thermophysical_Properties_Transformed.csv")
# Tabulate dataset
data %>% 
  select(`Water, %` = water, `Fat, %` = fat, `Temperature, °C` = temperature, `Heat Capacity, J/(g.K)` = heat_capacity, `Thermal Conductivity, W/(m.K)` = thermal_conductivity, `Density, kg/m3` = density) %>% 
  kable(align = "c", caption = "Values for the Thermophysical Properties of Milk for Different Temperatures, Water and Fat Content.") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE) %>%
  # scroll_box(height = "20%", fixed_thead = list(enabled = T)) %>% 
  kableExtra::footnote(general = "Adapted with permission from Influence of Temperature and Water and Fat Contents on the Thermophysical Properties of Milk, Minim et al., Journal of Chemical & Engineering Data 2002 47 (6), 1488-1491. Copyright (2002) American Chemical Society.", general_title = "")
Values for the Thermophysical Properties of Milk for Different Temperatures, Water and Fat Content.
Water, % Fat, % Temperature, °C Heat Capacity, J/(g.K) Thermal Conductivity, W/(m.K) Density, kg/m3
72 0.44 2 3.527 0.477 1049.5
72 0.44 8 3.476 0.477 1048.5
72 0.44 15 3.499 0.488 1045.1
72 0.44 20 3.518 0.495 1044.4
72 0.44 30 3.572 0.508 1040.0
72 0.44 41 3.622 0.524 1036.1
72 0.44 50 3.684 0.529 1032.0
72 0.44 60 3.709 0.539 1028.7
72 0.44 71 3.690 0.550 1024.9
72 4.81 2 3.472 0.475 1048.4
72 4.81 8 3.422 0.475 1045.3
72 4.81 15 3.448 0.486 1042.8
72 4.81 20 3.530 0.496 1040.1
72 4.81 30 3.555 0.505 1036.7
72 4.81 41 3.537 0.518 1032.9
72 4.81 50 3.623 0.531 1031.1
72 4.81 60 3.652 0.541 1028.7
72 4.81 71 3.694 0.554 1023.9
72 7.71 2 3.369 0.462 1046.3
72 7.71 8 3.391 0.471 1044.2
72 7.71 15 3.434 0.485 1041.9
72 7.71 20 3.464 0.493 1039.1
72 7.71 30 3.457 0.511 1035.7
72 7.71 41 3.495 0.523 1033.1
72 7.71 50 3.544 0.524 1031.1
72 7.71 60 3.590 0.541 1026.7
72 7.71 71 3.658 0.548 1022.9
84 0.44 2 3.840 0.514 1044.4
84 0.44 8 3.886 0.532 1041.3
84 0.44 15 3.891 0.545 1038.8
84 0.44 20 3.842 0.546 1036.1
84 0.44 30 3.923 0.572 1032.8
84 0.44 41 3.869 0.577 1029.0
84 0.44 50 3.889 0.588 1027.1
84 0.44 60 3.932 0.600 1024.8
84 0.44 71 3.968 0.609 1020.1
84 4.81 2 3.750 0.507 1041.1
84 4.81 8 3.821 0.529 1038.0
84 4.81 15 3.762 0.533 1035.7
84 4.81 20 3.775 0.543 1034.0
84 4.81 30 3.816 0.563 1031.8
84 4.81 41 3.809 0.574 1029.1
84 4.81 50 3.829 0.585 1025.0
84 4.81 60 3.870 0.597 1022.7
84 4.81 71 3.906 0.606 1018.0
84 7.71 2 3.776 0.511 1040.2
84 7.71 8 3.780 0.523 1039.2
84 7.71 15 3.736 0.530 1035.8
84 7.71 20 3.805 0.548 1034.1
84 7.71 30 3.820 0.564 1029.8
84 7.71 41 3.836 0.579 1026.0
84 7.71 50 3.825 0.585 1022.9
84 7.71 60 3.863 0.596 1020.7
84 7.71 71 3.852 0.598 1018.0
92 0.44 2 4.098 0.544 1039.2
92 0.44 8 4.042 0.552 1038.2
92 0.44 15 4.114 0.578 1034.8
92 0.44 20 4.123 0.590 1033.1
92 0.44 30 4.124 0.608 1030.8
92 0.44 41 4.073 0.616 1026.1
92 0.44 50 4.069 0.637 1023.0
92 0.44 60 4.082 0.641 1020.7
92 0.44 71 4.117 0.641 1016.0
92 4.81 2 3.962 0.532 1037.0
92 4.81 8 3.971 0.548 1036.1
92 4.81 15 4.002 0.568 1032.7
92 4.81 20 3.978 0.575 1032.0
92 4.81 30 3.991 0.595 1027.7
92 4.81 41 4.025 0.616 1025.0
92 4.81 50 4.050 0.629 1020.9
92 4.81 60 4.095 0.644 1018.6
92 4.81 71 4.097 0.648 1013.9
92 7.71 2 3.923 0.531 1036.1
92 7.71 8 3.932 0.547 1035.1
92 7.71 15 3.962 0.559 1031.7
92 7.71 20 3.982 0.571 1031.1
92 7.71 30 4.027 0.593 1026.7
92 7.71 41 4.029 0.606 1022.9
92 7.71 50 3.978 0.617 1018.9
92 7.71 60 4.051 0.630 1015.5
92 7.71 71 3.984 0.638 1011.8
Adapted with permission from Influence of Temperature and Water and Fat Contents on the Thermophysical Properties of Milk, Minim et al., Journal of Chemical & Engineering Data 2002 47 (6), 1488-1491. Copyright (2002) American Chemical Society.

3 Exploratory Data Analysis

Perform exploratory data analysis of the dataset.

3.1 Distribution of Target Variables

The distribution of the values of each thermophysical property is plotted on the histograms below:

# Plot histograms of milk properties
g1 <- data %>% 
  ggplot(mapping = aes(x = heat_capacity)) +
    geom_histogram(bins = 10, fill = "#045a8d", alpha = 0.8) +
    labs(title = "Heat Capacity", 
         # subtitle = "Source: ",
         y = "Count",
         x = "Heat Capacity, J/(g.K)") +
    theme_bw() +
    theme(aspect.ratio = 0.5, legend.position = "none")

g2 <- data %>% 
  ggplot(mapping = aes(x = thermal_conductivity)) +
    geom_histogram(bins = 10, fill = "#045a8d", alpha = 0.8) +
    labs(title = "Thermal Conductivity", 
         # subtitle = "Source: ",
         y = "Count",
         x = "Thermal Conductivity, W/(m.K)") +
    theme_bw() +
    theme(aspect.ratio = 0.5, legend.position = "none")

g3 <- data %>% 
  ggplot(mapping = aes(x = density)) +
    geom_histogram(bins = 10, fill = "#045a8d", alpha = 0.8) +
    labs(title = "Density", 
         # subtitle = "Source: ",
         y = "Count",
         x = "Density, kg/m3") +
    theme_bw() +
    theme(aspect.ratio = 0.5, legend.position = "none", axis.title = element_text(size = NULL))

gridExtra::grid.arrange(g1,g2,g3,nrow = 3)

3.2 Scatter Plot Matrix

Plot a scatter-plot matrix to analyse the relationships between the variables:

EDA_data <- data
colnames(EDA_data) <- c("Water", "Fat", "Temperature", "Heat Capacity", "Thermal Conductivity", "Density")

# install.packages("GGally", verbose = FALSE, quiet = TRUE)
library(GGally, quietly = TRUE, verbose = FALSE)

# Function to add linear regression line and points to the lower half of the scatterplot matrix
regression_lines <- function(data, mapping){
  p <- ggplot(data = data, mapping = mapping) + 
    geom_point(alpha = 0.5, 
               size = 1.5, 
               color = "#08519c") + # Add points
    geom_smooth(method=lm, 
                fill="#08306b", 
                color="#08306b") # Add linear regression
  p
}

ggpairs(data = EDA_data, 
        lower = list(continuous = regression_lines)
        ) + 
  theme_bw() + 
  theme(
        panel.background = element_blank(),
        panel.grid.major = element_blank(), 
        panel.grid.minor = element_blank(),
        axis.line = element_line(colour = "black"), axis.text = element_text(size = 6), strip.text = element_text(size = 7)
        )

3.3 Correlation Heat Map

Plot correlation heat map of dataset:

# Correlation Heat Map
ggcorr(EDA_data, label = TRUE, palette = "RdBu", name = "Correlation", hjust = 0.75, label_size = 3, label_round = 2)


4 Modeling

# Function to plot observed vs predicted values
predicted_observed_plot <- function(predicted_val, observed_val, residual_val, model_name = "", R_squared, ...) {
  
  plot <- ggplot(mapping = aes(x = predicted_val, y = observed_val, col = abs(residual_val))) +
  geom_point(alpha = 0.9, size = 2) +
  geom_abline(intercept = 0, slope = 1) +
    # facet_wrap(~) +
    labs(title = paste0(model_name, "\nPredicted vs Observed: Test Set"),
         subtitle = paste0("R-squared: ", signif(R_squared, 4)),
         x = "Predicted",
         y = "Observed",
         col = "Absolute Deviation") +
  theme_bw() +
  theme(aspect.ratio = 0.9, panel.grid.minor.x = element_blank(), legend.title = element_text(size = 10, face="bold"), legend.text = element_text(size = 9), plot.title = element_text(size=12, face="bold"), axis.title=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 0), legend.position = "none") +
  # scale_x_continuous(expand = c(0,0)) +
  # scale_y_continuous(expand = c(0,0)) + 
  coord_equal() + scale_color_viridis_c(direction = -1)

  return (plot)
}

# Function to plot residuals
residuals_plot <- function(predicted_val, residual_val, model_name = "", MAE, RMSE, ...) {

  plot <- ggplot(mapping = aes(x = predicted_val, y = residual_val, col = abs(residual_val))) +
  geom_point(alpha = 0.9, size = 2) +
  geom_abline(intercept = 0, slope = 0) +
    # facet_wrap(~) +
    labs(
       title = paste0(model_name, "\nResiduals: Test Set"),
       subtitle = paste0("RMSE: ", signif(RMSE, 4), ", MAE: ", signif(MAE, 4)),
       x = "Predicted",
       y = "Residual",
       col = "Absolute Deviation"
       ) +
  theme_bw() +
  theme(aspect.ratio = 0.9, panel.grid.minor.x = element_blank(), legend.title = element_text(size = 10, face="bold"), legend.text = element_text(size = 9), plot.title = element_text(size=12, face="bold"), axis.title=element_text(size=10, face="bold"), axis.text.x = element_text(angle = 0), legend.position = "none") +
  # scale_x_continuous(expand = c(0,0)) +
  # scale_y_continuous(expand = c(0,0)) +
  coord_equal() + scale_color_viridis_c(direction = -1)

  return (plot)
}

4.1 Heat Capacity

4.1.1 Creating a training and test set

Create training and test sets by randomly selecting 85% of the samples for the training set and using the remaining 15% for the test set.

predictor_ID <- c("water", "fat", "temperature")
predictor_name <- c("Water", "Fat", "Temperature")

outcome_ID <- "heat_capacity"
outcome_name <- "Heat Capacity"
units <- "J/(g.K)"
variable_vector <- data$heat_capacity

# Remove any observations with missing values
data <- data[complete.cases(data), ]

# Average the values of replicate experiments (if present)
data <- ddply(.data = data, 
                       .variables = .(water, fat, temperature), 
                       .fun = function(x) c(heat_capacity = mean(x$heat_capacity),
                                            thermal_conductivity = mean(x$thermal_conductivity),
                                            density = mean(x$density)))

# Create training and test set using stratified partitioning of the outcome variable
training_fraction <- 0.85
set.seed(1); training_index <- createDataPartition(y = data$heat_capacity, p = training_fraction)[[1]]

training_set <- data[training_index, ]
test_set <- data[-training_index, ]

# Check distribution of heat capacity in the training set
par(mfrow = c(1, 2))
hist(training_set$heat_capacity, main = "Training Set", xlab = paste0(outcome_name, " ", units, ""), freq = FALSE)

# Check distribution of heat capacity in the test set
hist(test_set$heat_capacity, main = "Test Set", xlab = paste0(outcome_name, " ", units, ""), freq = FALSE)

rbind(summary(training_set$heat_capacity),summary(test_set$heat_capacity)) %>% data.frame() %>% add_column(Set = c("Training Set", "Test Set")) %>% select(Set, everything()) %>% kable(digits = 4, align = "c", caption = "Training and Test Set Statistics.", col.names = c("Set", "Minimum", "1st Quartile", "Median", "Mean", "3rd quartile", "Maximum")) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, position = "center")
Training and Test Set Statistics.
Set Minimum 1st Quartile Median Mean 3rd quartile Maximum
Training Set 3.391 3.6220 3.8250 3.8002 3.9780 4.124
Test Set 3.369 3.7183 3.8495 3.8307 3.9932 4.097

The distributions of Heat Capacity values in the training and test sets are similar.
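As an optional, more formal check, a two-sample Kolmogorov-Smirnov test (base R; a quick sketch is shown below) can compare the two empirical distributions, where a small p-value would flag a mismatch between the sets.

# Optional check: compare the empirical distributions of heat capacity in the training and test sets
ks.test(training_set$heat_capacity, test_set$heat_capacity)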

4.1.2 Modeling

Create formula objects for multiple linear regression with and without 2-factor interaction terms and with and without quadratic terms.

MLR_formula <- as.formula("heat_capacity ~ water + fat + temperature")
MLR_formula_Interactions <- as.formula("heat_capacity ~ water*fat*temperature")
MLR_formula_Quadratic <- as.formula("heat_capacity ~ water + fat + temperature + I(water^2) + I(fat^2) + I(temperature^2)")

4.1.2.1 Best Subset Regression

Perform best subset regression (main effects only, with 2-factor interactions, and with quadratic and interaction terms) and select the best models.

set.seed(1)

# Initial Formula for Best Subset Regression
BSS_formula <- as.formula(heat_capacity ~ water + fat + temperature)

# BSS with Main effects ---------------------
# Perform best subset regression without interaction terms; set seed for reproducibility
set.seed(1)
Models_BSS_NI <- glmulti::glmulti(BSS_formula, data = training_set,
                                      level = 1, # Main effects only
                                      method = "h", # Use exhaustive screening
                                      crit = "aic", # use AIC as criterion
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )

# # Plot relative importance of the various model terms 
# plot(Models_BSS_NI, type="s", cex.axis=0.8, cex.names=0.8, sub = paste0(outcome_name, ": Best subset regression with main effects only"), mgp=c(2,0.6,-0.4))

# BSS with 2-fi ---------------------
# Perform best subset regression with interaction terms; set seed for reproducibility
set.seed(1)
Models_BSS_WI <- glmulti::glmulti(BSS_formula, data = training_set,
                                      level = 2, # 2-fi considered
                                      method = "h", # Use exhaustive screening
                                      crit = "aic", # use AIC as criterion
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )

# # Plot relative importance of the various model terms 
# plot(Models_BSS_WI, type="s", cex.axis=0.8, cex.names=0.55, sub = paste0(outcome_name, ": Best subset regression with interaction terms"), mgp=c(2,0.6,-0.4))

BSS_formula_QI <- as.formula(heat_capacity ~ water + fat + temperature + I(water^2) + I(fat^2) + I(temperature^2))

# BSS with quadratic terms and 2-fi ---------------------
# Perform best subset regression with quadratic and interaction terms; set seed for reproducibility
set.seed(1)
Models_BSS_QI <- glmulti::glmulti(BSS_formula_QI, data = training_set,
                                      level = 2, # 2-fi considered
                                      method = "g", # Use genetic algorithm
                                      crit = "aic", # use AIC as criterion
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )
TASK: Genetic algorithm in the candidate set.
Initialization...
Algorithm started...
Improvements in best and average IC have bebingo en below the specified goals.
Algorithm is declared to have converged.
Completed.
# # Plot relative importance of the various model terms 
# plot(Models_BSS_QI, type="s", cex.axis=0.8, cex.names=0.55, sub = paste0(outcome_name, ": Best subset regression with quadratic and interaction terms"), mgp=c(2,0.6,-0.4))
# Save best BSS models

# main effects 
Model_BSS_NI1 <- Models_BSS_NI@objects[[1]]
Model_BSS_NI2 <- Models_BSS_NI@objects[[2]]
Model_BSS_NI3 <- Models_BSS_NI@objects[[3]]
Model_BSS_NI4 <- Models_BSS_NI@objects[[4]]
Model_BSS_NI5 <- Models_BSS_NI@objects[[5]]

# 2-fi 
Model_BSS_WI1 <- Models_BSS_WI@objects[[1]]
Model_BSS_WI2 <- Models_BSS_WI@objects[[2]]
Model_BSS_WI3 <- Models_BSS_WI@objects[[3]]
Model_BSS_WI4 <- Models_BSS_WI@objects[[4]]
Model_BSS_WI5 <- Models_BSS_WI@objects[[5]]

# quadratic with 2-fi 
Model_BSS_QI1 <- Models_BSS_QI@objects[[1]]
Model_BSS_QI2 <- Models_BSS_QI@objects[[2]]
Model_BSS_QI3 <- Models_BSS_QI@objects[[3]]
Model_BSS_QI4 <- Models_BSS_QI@objects[[4]]
Model_BSS_QI5 <- Models_BSS_QI@objects[[5]]

4.1.2.2 Multiple Linear Regression

Model the data with standard multiple linear regression (main effects only, with all 2-factor interaction terms, and with quadratic and interaction terms).

# Formula for Multiple Linear Regression - Without Interaction Terms 
formula_Linear_NI <- as.formula(heat_capacity ~ water + fat + temperature)

# Formula for Multiple Linear Regression - With all 2-factor Interaction Terms
formula_Linear_WI <- as.formula(heat_capacity ~ water*fat + water*temperature + fat*temperature)

# Formula for Multiple Linear Regression - With all quadratic and 2-factor Interaction Terms
formula_Linear_QI <- as.formula(heat_capacity ~ water*fat + water*temperature + fat*temperature + I(water^2) + I(fat^2) + I(temperature^2))

Output of multiple linear regression with main factors only:

Model_Linear_NI <- lm(formula = formula_Linear_NI, data = training_set)

glance(Model_Linear_NI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.966 0.964 0.041 616.575 0 4 125.311 -240.621 -229.451 0.107 65

Output of multiple linear regression with all main factors and 2-factor interactions (2-fi):

Model_Linear_WI <- lm(formula = formula_Linear_WI, data = training_set)

glance(Model_Linear_WI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.982 0.98 0.03 559.313 0 7 146.932 -277.864 -259.991 0.057 62

Output of multiple linear regression with all main factors, 2-factor interactions (2-fi) and quadratic terms:

Model_Linear_QI <- lm(formula = formula_Linear_QI, data = training_set)

glance(Model_Linear_QI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.982 0.98 0.031 365.637 0 10 147.948 -273.896 -249.321 0.055 59

4.1.3 Model Evaluation

Evaluate and select models with leave-one-out cross-validation (LOOCV).

# Leave-one-out cross-validation (LOOCV)

# Linear without interactions
Model_Linear_NI_LOOCV <- train(formula_Linear_NI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Linear with Interactions
Model_Linear_WI_LOOCV <- train(formula_Linear_WI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Linear with Interactions
Model_Linear_QI_LOOCV <- train(formula_Linear_QI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Best Subset Regression - No Interaction Terms
Model_BSS_NI1_LOOCV <- if (length(Model_BSS_NI1$coefficients)>1) train(formula(Model_BSS_NI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI2_LOOCV <- if (length(Model_BSS_NI2$coefficients)>1) train(formula(Model_BSS_NI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI3_LOOCV <- if (length(Model_BSS_NI3$coefficients)>1) train(formula(Model_BSS_NI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI4_LOOCV <- if (length(Model_BSS_NI4$coefficients)>1) train(formula(Model_BSS_NI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI5_LOOCV <- if (length(Model_BSS_NI5$coefficients)>1) train(formula(Model_BSS_NI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

# Best Subset Regression - With Interaction Terms
Model_BSS_WI1_LOOCV <- if (length(Model_BSS_WI1$coefficients)>1) train(formula(Model_BSS_WI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI2_LOOCV <- if (length(Model_BSS_WI2$coefficients)>1) train(formula(Model_BSS_WI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI3_LOOCV <- if (length(Model_BSS_WI3$coefficients)>1) train(formula(Model_BSS_WI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI4_LOOCV <- if (length(Model_BSS_WI4$coefficients)>1) train(formula(Model_BSS_WI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI5_LOOCV <- if (length(Model_BSS_WI5$coefficients)>1) train(formula(Model_BSS_WI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

# Best Subset Regression - With Interaction Terms and Quadratic Terms
Model_BSS_QI1_LOOCV <- if (length(Model_BSS_QI1$coefficients)>1) train(formula(Model_BSS_QI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI2_LOOCV <- if (length(Model_BSS_QI2$coefficients)>1) train(formula(Model_BSS_QI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI3_LOOCV <- if (length(Model_BSS_QI3$coefficients)>1) train(formula(Model_BSS_QI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI4_LOOCV <- if (length(Model_BSS_QI4$coefficients)>1) train(formula(Model_BSS_QI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI5_LOOCV <- if (length(Model_BSS_QI5$coefficients)>1) train(formula(Model_BSS_QI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
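For reference, the fifteen near-identical train() calls above could also be generated in one pass with a small helper; the sketch below assumes the Model_BSS_* objects and Model_Linear_NI_LOOCV defined earlier and reproduces the same fallback for intercept-only candidates.

# Sketch: fit LOOCV models for a list of best-subset candidates in one pass
loocv_fit <- function(model, data, fallback) {
  if (length(coef(model)) > 1) {
    caret::train(formula(model), data = data,
                 trControl = caret::trainControl(method = "LOOCV"),
                 method = "lm", na.action = "na.omit")
  } else {
    fallback # intercept-only candidate: reuse the main-effects LOOCV fit
  }
}

BSS_candidates <- list(Model_BSS_NI1, Model_BSS_NI2, Model_BSS_NI3, Model_BSS_NI4, Model_BSS_NI5,
                       Model_BSS_WI1, Model_BSS_WI2, Model_BSS_WI3, Model_BSS_WI4, Model_BSS_WI5,
                       Model_BSS_QI1, Model_BSS_QI2, Model_BSS_QI3, Model_BSS_QI4, Model_BSS_QI5)
BSS_LOOCV_fits <- lapply(BSS_candidates, loocv_fit, data = training_set, fallback = Model_Linear_NI_LOOCV)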

Summary statistics for all models evaluated:

# Summary statistics for all models evaluated
options("scipen"=100, "digits"=4)

Model_Summary <- data.frame(rbind(
  # Linear without interactions
  cbind(Model = "Model_Linear_NI", Type = "MLR Main Factors",
    glance(Model_Linear_NI)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_Linear_NI_LOOCV$results[[2]], `R-Squared LOOCV` = Model_Linear_NI_LOOCV$results[[3]]),
  
  # Linear with Interactions
  cbind(Model = "Model_Linear_WI", Type = "MLR 2-fi",
    glance(Model_Linear_WI)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_Linear_WI_LOOCV$results[[2]], `R-Squared LOOCV` = Model_Linear_WI_LOOCV$results[[3]]),

  # Best Subset Regression - No Interaction Terms
    # Model_BSS_NI1_LOOCV
  cbind(Model = "Model_BSS_NI1", Type = "BSS Main Factors",
    glance(Model_BSS_NI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI1_LOOCV$results[[3]]),
    # Model_BSS_NI2_LOOCV
  cbind(Model = "Model_BSS_NI2", Type = "BSS Main Factors",
    glance(Model_BSS_NI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI2_LOOCV$results[[3]]),
    # Model_BSS_NI3_LOOCV
  cbind(Model = "Model_BSS_NI3", Type = "BSS Main Factors",
    glance(Model_BSS_NI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI3_LOOCV$results[[3]]),
    # Model_BSS_NI4_LOOCV
  cbind(Model = "Model_BSS_NI4", Type = "BSS Main Factors",
    glance(Model_BSS_NI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI4_LOOCV$results[[3]]),
    # Model_BSS_NI5_LOOCV
  cbind(Model = "Model_BSS_NI5", Type = "BSS Main Factors",
    glance(Model_BSS_NI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI5_LOOCV$results[[3]]),

  # Best Subset Regression - With Interaction Terms
    # Model_BSS_WI1_LOOCV
  cbind(Model = "Model_BSS_WI1", Type = "BSS 2-fi",
    glance(Model_BSS_WI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI1_LOOCV$results[[3]]),
    # Model_BSS_WI2_LOOCV
  cbind(Model = "Model_BSS_WI2", Type = "BSS 2-fi",
    glance(Model_BSS_WI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI2_LOOCV$results[[3]]),
    # Model_BSS_WI3_LOOCV
  cbind(Model = "Model_BSS_WI3", Type = "BSS 2-fi",
    glance(Model_BSS_WI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI3_LOOCV$results[[3]]),
    # Model_BSS_WI4_LOOCV
  cbind(Model = "Model_BSS_WI4", Type = "BSS 2-fi",
    glance(Model_BSS_WI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI4_LOOCV$results[[3]]),
    # Model_BSS_WI5_LOOCV
  cbind(Model = "Model_BSS_WI5", Type = "BSS 2-fi",
    glance(Model_BSS_WI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI5_LOOCV$results[[3]]),
  
  # Best Subset Regression - With Interaction Terms
    # Model_BSS_QI1_LOOCV
  cbind(Model = "Model_BSS_QI1", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI1_LOOCV$results[[3]]),
    # Model_BSS_QI2_LOOCV
  cbind(Model = "Model_BSS_QI2", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI2_LOOCV$results[[3]]),
    # Model_BSS_QI3_LOOCV
  cbind(Model = "Model_BSS_QI3", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI3_LOOCV$results[[3]]),
    # Model_BSS_QI4_LOOCV
  cbind(Model = "Model_BSS_QI4", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI4_LOOCV$results[[3]]),
    # Model_BSS_QI5_LOOCV
  cbind(Model = "Model_BSS_QI5", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI5_LOOCV$results[[3]])
  )
)

Model_Summary %>% 
  kable(align = "c", digits = 4, col.names = c("Model", "Type", "R-squared", "Adjusted R-squared", "AIC", "BIC", "p-value", "RMSE (LOOCV)", "R-squared (LOOCV)")) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE) %>% footnote(general = "LOOCV: Leave-one-out cross-validation", general_title = "")
Model Type R-squared Adjusted R-squared AIC BIC p-value RMSE (LOOCV) R-squared (LOOCV)
Model_Linear_NI MLR Main Factors 0.9661 0.9645 -240.6 -229.45 0 0.0422 0.9610
Model_Linear_WI MLR 2-fi 0.9819 0.9801 -277.9 -259.99 0 0.0320 0.9776
Model_BSS_NI1 BSS Main Factors 0.9661 0.9645 -240.6 -229.45 0 0.0422 0.9610
Model_BSS_NI2 BSS Main Factors 0.9284 0.9262 -191.1 -182.18 0 0.0600 0.9210
Model_BSS_NI3 BSS Main Factors 0.9215 0.9191 -184.8 -175.86 0 0.0628 0.9135
Model_BSS_NI4 BSS Main Factors 0.8879 0.8862 -162.2 -155.47 0 0.0740 0.8801
Model_BSS_NI5 BSS Main Factors 0.0000 0.0000 -13.2 -8.73 NA 0.0422 0.9610
Model_BSS_WI1 BSS 2-fi 0.9814 0.9803 -280.2 -266.79 0 0.0314 0.9784
Model_BSS_WI2 BSS 2-fi 0.9813 0.9802 -279.9 -266.45 0 0.0315 0.9782
Model_BSS_WI3 BSS 2-fi 0.9817 0.9803 -279.4 -263.77 0 0.0316 0.9782
Model_BSS_WI4 BSS 2-fi 0.9817 0.9803 -279.4 -263.75 0 0.0316 0.9781
Model_BSS_WI5 BSS 2-fi 0.9815 0.9800 -278.4 -262.76 0 0.0319 0.9777
Model_BSS_QI1 BSS 2-fi and Q 0.9859 0.9838 -289.3 -264.71 0 0.0298 0.9805
Model_BSS_QI2 BSS 2-fi and Q 0.9855 0.9835 -289.2 -266.87 0 0.0295 0.9809
Model_BSS_QI3 BSS 2-fi and Q 0.9863 0.9839 -289.2 -262.38 0 0.0300 0.9803
Model_BSS_QI4 BSS 2-fi and Q 0.9862 0.9838 -288.6 -261.80 0 0.0300 0.9802
Model_BSS_QI5 BSS 2-fi and Q 0.9866 0.9840 -288.5 -259.48 0 0.0302 0.9801
LOOCV: Leave-one-out cross-validation
pallete <- c("#000000", "#5e5656", 
             "#67000d", "#a50f15", "#cb181d", "#ef3b2c", "#fb6a4a", # reds
             "#08306b", "#08519c", "#2171b5", "#4292c6", "#6baed6", # blues
             "#00441b", "#006d2c", "#238b45", "#41ab5d", "#74c476") # greens

caption <- "NI: No Interaction terms\nWI: With Interaction terms\nQI: With Quadratic and Interaction terms"

# Vector for point labels
Descriptors <- c("NI", "WI", substr(Model_Summary$Model[3:length(Model_Summary$Model)], 
                                    start = nchar(x = as.character(Model_Summary$Model[3:length(Model_Summary$Model)])), 
                                    stop = nchar(as.character(Model_Summary$Model[3:length(Model_Summary$Model)]))))

# Adjusted R-squared and AIC
g1 <- Model_Summary %>%
  ggplot(aes(x = AIC, y = adj.r.squared, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Adjusted R-squared and AIC."),
         caption = caption,
         x = "AIC",
         y = "Adjusted R-squared") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

# Adjusted R-squared and BIC
g2 <- Model_Summary %>%
  ggplot(aes(x = BIC, y = adj.r.squared, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Adjusted R-squared and BIC values."),
         caption = caption,
         x = "BIC",
         y = "Adjusted R-squared") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

# LOOCV R-squared and LOOCV RMSE
g3 <- Model_Summary %>%
  ggplot(aes(x =RMSE.LOOCV, y = R.Squared.LOOCV, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Cross-validated RMSE and R-squared between \nobserved and predicted values"),
         caption = caption,
         x = "RMSE (LOOCV)",
         y = "R-squared (LOOCV)") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

gridExtra::grid.arrange(g1, g2, g3, ncol = 1)


4.1.4 Model Selection

Based on both the \(R^2\) and the root-mean-square error from leave-one-out cross-validation (the R-squared (LOOCV) and RMSE (LOOCV) columns above), Model_BSS_QI2 appears to be the best model. We therefore select Model_BSS_QI2 to make our predictions for Heat Capacity.

# Best Model
# Get descriptor best model
Best_Model_ID <- as.character(Model_Summary$Model[which.max(Model_Summary$R.Squared.LOOCV)][[1]])
# Get type of best model
Best_Model_Type <- if_else(condition = stringr::str_detect(Best_Model_ID, pattern = "QI"), 
                           true = "BSS 2-fi and Q", 
                           false = if_else(
                             condition = str_detect(Best_Model_ID, pattern = "WI"), 
                             true = "BSS 2-fi",
                             false = "BSS Main Factors")
                           )
# Get index of best model
Best_Model_Index <- as.numeric(str_extract(Best_Model_ID, "[[:digit:]]+"))

# Automatically select best model and assign it to Model_Selected_auto - choice based on R.Squared.LOOCV
Model_Selected <- if_else(
  condition = Best_Model_Type == "BSS 2-fi and Q", 
  true = Models_BSS_QI@objects[Best_Model_Index], 
  false = if_else(
    condition = Best_Model_Type == "BSS 2-fi", 
    true = Models_BSS_WI@objects[Best_Model_Index], 
    false = Models_BSS_NI@objects[Best_Model_Index])
  )[[1]]

# Create string of model selected
Model_Selected_string <- Best_Model_ID # as.character(Model_Summary$Model[which.max(Model_Summary$R.Squared.LOOCV)][[1]]) 

# Generate Description for model selected
Model_Selected_Description <- paste(if_else(
                                    condition = grepl(pattern = "BSS", x = Model_Selected_string),
                                            true = "Best Subset Regression Model", 
                                            false = "Multiple Linear Regression Model"),
                                    if_else(condition = grepl(pattern = "QI", x = Model_Selected_string), 
                                            true = "with Quadratic and", 
                                            false = ""),
                                    if_else(condition = grepl(pattern = "NI", x = Model_Selected_string), 
                                            true = "without Interaction Terms", 
                                            false = "with Interaction Terms"),
                                    sep = " ") 

# Save model as RDS file
saveRDS(object = Model_Selected, file = paste("Models/",project_ID, "_", outcome_ID, "_model_selected", ".rds", sep = ""))

Summary Statistics of selected model:

# Print Summary Tables for selected model
options("scipen"=10000, "digits"=5)

# Print model fit statistics
glance(Model_Selected) %>% kable(digits = 3, align = "c", col.names = c("R-squared", "Adjusted R-squared", "Sigma", "Statistic", "p-value", "df", "logLik", "AIC", "BIC", "Deviance", "df Residual")) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
R-squared Adjusted R-squared Sigma Statistic p-value df logLik AIC BIC Deviance df Residual
0.985 0.984 0.028 508.91 0 9 154.6 -289.21 -266.87 0.046 60
# save model fit statistics
Model_Selected_Statistics <- glance(Model_Selected)

summary_table_Model_Selected <- summary(Model_Selected)
summary_table_Model_Selected <- cbind(rownames(summary_table_Model_Selected$coefficients), as.data.frame(summary_table_Model_Selected$coefficients))
rownames(summary_table_Model_Selected) <- NULL
colnames(summary_table_Model_Selected) <- c("Parameter", "Estimate", "Std Error", "Statistic", "p-value")
summary_table_Model_Selected %>% kable(digits = 6, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
Parameter Estimate Std Error Statistic p-value
(Intercept) 1.425898 0.061730 23.0989 0.000000
water 0.029487 0.000836 35.2807 0.000000
I(temperature^2) 0.000236 0.000024 9.6964 0.000000
water:fat -0.001720 0.000490 -3.5109 0.000855
water:I(fat^2) 0.000725 0.000243 2.9854 0.004095
fat:I(fat^2) -0.002847 0.000951 -2.9924 0.004013
I(fat^2):I(water^2) -0.000003 0.000001 -2.8151 0.006587
water:I(temperature^2) -0.000002 0.000000 -8.4684 0.000000
I(temperature^2):temperature -0.000001 0.000000 -3.2262 0.002033
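Written out with the rounded estimates above (with water \(W\) and fat \(F\) in %, and temperature \(T\) in °C), the selected heat capacity model corresponds approximately to

\[
\hat{C}_p \approx 1.426 + 0.0295\,W + 2.36\times10^{-4}\,T^{2} - 1.72\times10^{-3}\,WF + 7.25\times10^{-4}\,WF^{2} - 2.85\times10^{-3}\,F^{3} - 3\times10^{-6}\,F^{2}W^{2} - 2\times10^{-6}\,WT^{2} - 1\times10^{-6}\,T^{3},
\]

with \(\hat{C}_p\) in J/(g.K).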
# # Plot model coefficients
# ggcoef(x = Model_Selected, exponentiate = F, exclude_intercept = T, vline_color = "red", vline_linetype =  "dotted", errorbar_color = "grey10", errorbar_height = .15, mapping = aes(x = estimate, y = term, size = p.value)) + scale_size_continuous(trans = "reverse") + labs(x = "Estimate", y = NULL, size = "p-value", title = paste0(outcome_name, ": Plot of Model Coefficients")) + theme_bw()

# Plot Diagnostics for selected model
ggfortify:::autoplot.lm(Model_Selected,
         ncol = 2, alpha = 0.8,
         label.size = 3) +
  theme_bw() +
  theme(aspect.ratio = 0.8, legend.title = element_text(size = 10), legend.text = element_text(size = 8), axis.title = element_text(size = 9))

# Save relevant model statistics
r.squared <- round(glance(Model_Selected)[[1]], 3)
adj.r.squared <- round(glance(Model_Selected)[[2]], 3)
r.squared.LOOCV <- round(train(as.formula(Model_Selected), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")$results[[3]], 3)

Summary Statistics of selected model on test set:

# Make predictions on test set
test_set$heat_capacity_predicted <- predict(Model_Selected, test_set)
# Calculate Residuals on test set
test_set$heat_capacity_residuals <- test_set$heat_capacity - test_set$heat_capacity_predicted

# Calculate test set statistics: RMSE, MAE and the correlation between predicted and observed values (reported as R-squared below)
R_squared <- round(cor(test_set$heat_capacity_predicted, test_set$heat_capacity), 4)
RMSE <- signif(RMSE(pred = test_set$heat_capacity_predicted, obs = test_set$heat_capacity, na.rm = T), 4)
MAE <- signif(MAE(pred = test_set$heat_capacity_predicted, obs = test_set$heat_capacity), 4)

Test_Set_Statistics <- c(RMSE, MAE, R_squared)
names(Test_Set_Statistics) <- c("RMSE", "MAE", "R-squared")

# Print table
Test_Set_Statistics %>% t() %>% 
  kable(align = "c", caption = paste0("Test set statistics of predicted ", outcome_name, ".")) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "center")
Test set statistics of predicted Heat Capacity.
RMSE MAE R-squared
0.0453 0.0366 0.9805
# Plot predicted vs observed values and residuals
pred_obs_plot <- predicted_observed_plot(predicted_val = test_set$heat_capacity_predicted, observed_val = test_set$heat_capacity, residual_val = test_set$heat_capacity_residuals, R_squared = R_squared, model_name = outcome_name)

residual_plot <- residuals_plot(predicted_val = test_set$heat_capacity_predicted, observed_val = test_set$heat_capacity, residual_val = test_set$heat_capacity_residuals, MAE = MAE, RMSE = RMSE, model_name = outcome_name)

gridExtra::grid.arrange(pred_obs_plot, residual_plot, ncol = 2)


4.1.5 Surface Plots

# Grid Settings
z_expansion_factor <- 0.05 # Expansion factor for z-axis
theta <- c(320) # Perspective angle
theta_index <- 1
colour_range <- plot3D::ramp.col(c("#2166ac", "#b2182b")) # Blue-to-red colour ramp

# Create vectors for variables
# Number of gridlines for the x- and y-axes
grid.lines <- 21

# Create values for surface plots
water_range <- seq_range(x = data$water, n = grid.lines) # x-axis
temperature_range <- seq_range(data$temperature, n = grid.lines) # y-axis
fat_range <- unique(x = data$fat) # Column

# Create grid of variables
prediction_grid <- expand.grid(
                              temperature = temperature_range,
                              water = water_range,
                              fat = fat_range
                              )
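# Note: expand.grid() varies its first argument (temperature) fastest, so for each fat
# level the predictions fill a grid.lines x grid.lines matrix column by column: rows
# correspond to temperature_range (the surface x-axis) and columns to water_range
# (the surface y-axis), matching the matrix() reshaping applied below.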
# Create data frames of values for each combination of process parameters to be plotted
# 1x3 structure
# Plot 1: Row 1, Column 1
Grid1_Row1_Column1 <- prediction_grid %>% filter(fat == fat_range[1])
# Plot 2: Row 1, Column 2
Grid1_Row1_Column2 <- prediction_grid %>% filter(fat == fat_range[2])
# Plot 3: Row 1, Column 3
Grid1_Row1_Column3 <- prediction_grid %>% filter(fat == fat_range[3])

Description_Grid1_Row1_Column1 <- paste0(outcome_name, " of Milk\nwith ", fat_range[1], "% fat")
Description_Grid1_Row1_Column2 <- paste0(outcome_name, " of Milk\nwith ", fat_range[2], "% fat")
Description_Grid1_Row1_Column3 <- paste0(outcome_name, " of Milk\nwith ", fat_range[3], "% fat")

# Use selected model to make Predictions for each grid created
# Plot 1: Row 1, Column 1
Predictions_Grid1_Row1_Column1 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column1), nrow = grid.lines, ncol = grid.lines)
# Plot 2: Row 1, Column 2
Predictions_Grid1_Row1_Column2 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column2), nrow = grid.lines, ncol = grid.lines)
# Plot 3: Row 1, Column 3
Predictions_Grid1_Row1_Column3 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column3), nrow = grid.lines, ncol = grid.lines)
# Multiple Plots
par(mfrow = c(1, 3)) # 1x3 grid
# Chart (1,1)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column1,
                      facets = NA),
          main = Description_Grid1_Row1_Column1)

# Chart (1,2)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column2,
                      facets = NA),
          main = Description_Grid1_Row1_Column2, 
          sub = paste0("R-squared = ", r.squared, "\n",
                       "Adjusted R-squared = ", adj.r.squared, "\n",
                       "Cross-validated R-squared = ", r.squared.LOOCV))

# Chart (1,3)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column3,
                      facets = NA),
          main = Description_Grid1_Row1_Column3)

# Delete obsolete models
rm(Model_BSS_NI1_LOOCV, Model_BSS_NI2_LOOCV, Model_BSS_NI3_LOOCV, Model_BSS_NI4_LOOCV, Model_BSS_NI5_LOOCV, Model_BSS_WI1_LOOCV, Model_BSS_WI2_LOOCV, Model_BSS_WI3_LOOCV, Model_BSS_WI4_LOOCV, Model_BSS_WI5_LOOCV, Model_BSS_QI1_LOOCV, Model_BSS_QI2_LOOCV, Model_BSS_QI3_LOOCV, Model_BSS_QI4_LOOCV, Model_BSS_QI5_LOOCV, Model_BSS_WI1, Model_BSS_WI2, Model_BSS_WI3, Model_BSS_WI4, Model_BSS_WI5, Model_BSS_QI1, Model_BSS_QI2, Model_BSS_QI3, Model_BSS_QI4, Model_BSS_QI5, Model_BSS_NI1, Model_BSS_NI2, Model_BSS_NI3, Model_BSS_NI4, Model_BSS_NI5, Model_Linear_NI, Model_Linear_NI_LOOCV, Model_Linear_WI, Model_Linear_WI_LOOCV, Model_Linear_QI, Model_Linear_QI_LOOCV, Models_BSS_NI, Models_BSS_WI, Model_Selected, BSS_formula, BSS_formula_QI,  Model_Selected_Description, Model_Selected_string, Model_Summary, formula_BSS, formula_Linear_NI, formula_Linear_WI, Model_Selected_auto_index, Model_Selected_auto)

# Delete obsolete objects
rm(g1, g2, g3, g, Best_Model_ID, Best_Model_Index, Best_Model_Type, colour_range, units, height, width, res, theta, theta_index, pointsize, i, r.squared, adj.r.squared, r.squared.LOOCV, z_expansion_factor, variable_name, variable_ID, Predictions_Grid1_Row1_Column1, Predictions_Grid1_Row1_Column2, Predictions_Grid1_Row1_Column3, outcome_ID, outcome_name, pred_obs_plot, residual_plot)

4.2 Thermal Conductivity

4.2.1 Creating a training and test set

Create training and test sets by randomly selecting 85% of the samples for the training set and using the remaining 15% for the test set.

predictor_ID <- c("water", "fat", "temperature")
predictor_name <- c("Water", "Fat", "Temperature")

outcome_ID <- "thermal_conductivity"
outcome_name <- "Thermal Conductivity"
units <- "W/(m.K)"
variable_vector <- data$thermal_conductivity

# Remove any observations with missing values
data <- data[complete.cases(data), ]

# Average the values of replicate experiments (if present)
data <- ddply(.data = data, 
                       .variables = .(water, fat, temperature), 
                       .fun = function(x) c(heat_capacity = mean(x$heat_capacity),
                                            thermal_conductivity = mean(x$thermal_conductivity),
                                            density = mean(x$density)))

# Create training and test set using stratified partitioning of the outcome variable
training_fraction <- 0.85
set.seed(1); training_index <- createDataPartition(y = data$thermal_conductivity, p = training_fraction)[[1]]

training_set <- data[training_index, ]
test_set <- data[-training_index, ]

# Check distribution of thermal conductivity in the training set
par(mfrow = c(1, 2))
hist(training_set$thermal_conductivity, main = "Training Set", xlab = paste0(outcome_name, " ", units, ""), freq = FALSE)

# Check distribution of thermal conductivity in the test set
hist(test_set$thermal_conductivity, main = "Test Set", xlab = paste0(outcome_name, " ", units, ""), freq = FALSE)

rbind(summary(training_set$thermal_conductivity),summary(test_set$thermal_conductivity)) %>% data.frame() %>% add_column(Set = c("Training Set", "Test Set")) %>% select(Set, everything()) %>% kable(digits = 4, align = "c", caption = "Training and Test Set Statistics.", col.names = c("Set", "Minimum", "1st Quartile", "Median", "Mean", "3rd quartile", "Maximum")) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE, position = "center")
Training and Test Set Statistics.
Set Minimum 1st Quartile Median Mean 3rd quartile Maximum
Training Set 0.475 0.5232 0.548 0.5557 0.5922 0.644
Test Set 0.462 0.5090 0.543 0.5484 0.5900 0.648

The distributions of Thermal Conductivity values in the training and test sets are similar.

4.2.2 Modeling

Create formula objects for multiple linear regression with and without 2-factor interaction terms and with and without quadratic terms.

MLR_formula <- as.formula("thermal_conductivity ~ water + fat + temperature")
MLR_formula_Interactions <- as.formula("thermal_conductivity ~ water*fat*temperature")
MLR_formula_Quadratic <- as.formula("thermal_conductivity ~ water + fat + temperature + I(water^2) + I(fat^2) + I(temperature^2)")

4.2.2.1 Best Subset Regression

Perform best subset regression (main effects only, with 2-factor interactions, and with quadratic and interaction terms) and select the best models.

set.seed(1)

# Initial Formula for Best Subset Regression
BSS_formula <- as.formula(thermal_conductivity ~ water + fat + temperature)

# BSS with Main effects ---------------------
# Perform best subset regression without interaction terms; set seed for reproducibility
set.seed(1)
Models_BSS_NI <- glmulti::glmulti(BSS_formula, data = training_set,
                                      level = 1, # Main effects only
                                      method = "h", # Use exhaustive screening
                                      crit = "aic", # use AIC as criterion
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )

# # Plot relative importance of the various model terms 
# plot(Models_BSS_NI, type="s", cex.axis=0.8, cex.names=0.8, sub = paste0(outcome_name, ": Best subset regression with main effects only"), mgp=c(2,0.6,-0.4))

# BSS with 2-fi ---------------------
# Perform best subset regression with interaction terms; set seed for reproducibility
set.seed(1)
Models_BSS_WI <- glmulti::glmulti(BSS_formula, data = training_set,
                                      level = 2, # 2-fi considered
                                      method = "h", # Use exhaustive screening
                                      crit = "aic", # use AIC as criterion
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )

# # Plot relative importance of the various model terms 
# plot(Models_BSS_WI, type="s", cex.axis=0.8, cex.names=0.55, sub = paste0(outcome_name, ": Best subset regression with interaction terms"), mgp=c(2,0.6,-0.4))

BSS_formula_QI <- as.formula(thermal_conductivity ~ water + fat + temperature + I(water^2) + I(fat^2) + I(temperature^2))

# BSS with quadratic terms and 2-fi ---------------------
# Perform best subset regression with quadratic and interaction terms; set seed for reproducibility
set.seed(1)
Models_BSS_QI <- glmulti::glmulti(BSS_formula_QI, data = training_set,
                                      level = 2, # 2-fi considered
                                      method = "g", # Use genetic algorithm
                                      crit = "aic", # use AIC as criterion
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )
TASK: Genetic algorithm in the candidate set.
Initialization...
Algorithm started...
Improvements in best and average IC have bebingo en below the specified goals.
Algorithm is declared to have converged.
Completed.
# # Plot relative importance of the various model terms 
# plot(Models_BSS_QI, type="s", cex.axis=0.8, cex.names=0.55, sub = paste0(outcome_name, ": Best subset regression with quadratic and interaction terms"), mgp=c(2,0.6,-0.4))
# Save best BSS models

# main effects models
Model_BSS_NI1 <- Models_BSS_NI@objects[[1]]
Model_BSS_NI2 <- Models_BSS_NI@objects[[2]]
Model_BSS_NI3 <- Models_BSS_NI@objects[[3]]
Model_BSS_NI4 <- Models_BSS_NI@objects[[4]]
Model_BSS_NI5 <- Models_BSS_NI@objects[[5]]

# 2-fi models
Model_BSS_WI1 <- Models_BSS_WI@objects[[1]]
Model_BSS_WI2 <- Models_BSS_WI@objects[[2]]
Model_BSS_WI3 <- Models_BSS_WI@objects[[3]]
Model_BSS_WI4 <- Models_BSS_WI@objects[[4]]
Model_BSS_WI5 <- Models_BSS_WI@objects[[5]]

# quadratic with 2-fi models
Model_BSS_QI1 <- Models_BSS_QI@objects[[1]]
Model_BSS_QI2 <- Models_BSS_QI@objects[[2]]
Model_BSS_QI3 <- Models_BSS_QI@objects[[3]]
Model_BSS_QI4 <- Models_BSS_QI@objects[[4]]
Model_BSS_QI5 <- Models_BSS_QI@objects[[5]]

4.2.2.2 Multiple Linear Regression

Model the data with standard multiple linear regression (main effects only, with all 2-factor interaction terms, and with quadratic and interaction terms).

# Formula for Multiple Linear Regression - Without Interaction Terms 
formula_Linear_NI <- as.formula(thermal_conductivity ~ water + fat + temperature)

# Formula for Multiple Linear Regression - With all 2-factor Interaction Terms
formula_Linear_WI <- as.formula(thermal_conductivity ~ water*fat + water*temperature + fat*temperature)

# Formula for Multiple Linear Regression - With all quadratic and 2-factor Interaction Terms
formula_Linear_QI <- as.formula(thermal_conductivity ~ water*fat + water*temperature + fat*temperature + I(water^2) + I(fat^2) + I(temperature^2))

Output of multiple linear regression with main factors only:

Model_Linear_NI <- lm(formula = formula_Linear_NI, data = training_set)

glance(Model_Linear_NI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.972 0.971 0.008 775.5 0 4 241.9 -473.9 -462.6 0.004 66

Output of multiple linear regression with all main factors and 2-factor interactions (2-fi):

Model_Linear_WI <- lm(formula = formula_Linear_WI, data = training_set)

glance(Model_Linear_WI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.982 0.98 0.007 569.8 0 7 256.7 -497.4 -479.4 0.003 63

Output of multiple linear regression with all main factors, 2-factor interactions (2-fi) and quadratic terms:

Model_Linear_QI <- lm(formula = formula_Linear_QI, data = training_set)

glance(Model_Linear_QI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = FALSE)
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.992 0.99 0.005 789.5 0 10 283.7 -545.3 -520.6 0.001 60

4.2.3 Model Evaluation

Evaluate and select models with leave-one-out cross-validation (LOOCV).

# Leave-one-out cross-validation (LOOCV)

# Linear without interactions
Model_Linear_NI_LOOCV <- train(formula_Linear_NI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Linear with Interactions
Model_Linear_WI_LOOCV <- train(formula_Linear_WI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Linear with Interactions
Model_Linear_QI_LOOCV <- train(formula_Linear_QI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Best Subset Regression - No Interaction Terms
Model_BSS_NI1_LOOCV <- if (length(Model_BSS_NI1$coefficients)>1) train(formula(Model_BSS_NI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI2_LOOCV <- if (length(Model_BSS_NI2$coefficients)>1) train(formula(Model_BSS_NI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI3_LOOCV <- if (length(Model_BSS_NI3$coefficients)>1) train(formula(Model_BSS_NI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI4_LOOCV <- if (length(Model_BSS_NI4$coefficients)>1) train(formula(Model_BSS_NI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI5_LOOCV <- if (length(Model_BSS_NI5$coefficients)>1) train(formula(Model_BSS_NI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

# Best Subset Regression - With Interaction Terms
Model_BSS_WI1_LOOCV <- if (length(Model_BSS_WI1$coefficients)>1) train(formula(Model_BSS_WI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI2_LOOCV <- if (length(Model_BSS_WI2$coefficients)>1) train(formula(Model_BSS_WI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI3_LOOCV <- if (length(Model_BSS_WI3$coefficients)>1) train(formula(Model_BSS_WI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI4_LOOCV <- if (length(Model_BSS_WI4$coefficients)>1) train(formula(Model_BSS_WI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI5_LOOCV <- if (length(Model_BSS_WI5$coefficients)>1) train(formula(Model_BSS_WI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

# Best Subset Regression - With Interaction Terms and Quadratic Terms
Model_BSS_QI1_LOOCV <- if (length(Model_BSS_QI1$coefficients)>1) train(formula(Model_BSS_QI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI2_LOOCV <- if (length(Model_BSS_QI2$coefficients)>1) train(formula(Model_BSS_QI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI3_LOOCV <- if (length(Model_BSS_QI3$coefficients)>1) train(formula(Model_BSS_QI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI4_LOOCV <- if (length(Model_BSS_QI4$coefficients)>1) train(formula(Model_BSS_QI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI5_LOOCV <- if (length(Model_BSS_QI5$coefficients)>1) train(formula(Model_BSS_QI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

Summary statistics for all models evaluated:

# Summary statistics for all models evaluated
options("scipen"=100, "digits"=4)

Model_Summary <- data.frame(rbind(
  # Linear without interactions
  cbind(Model = "Model_Linear_NI", Type = "MLR Main Factors",
    glance(Model_Linear_NI)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_Linear_NI_LOOCV$results[[2]], `R-Squared LOOCV` = Model_Linear_NI_LOOCV$results[[3]]),
  
  # Linear with Interactions
  cbind(Model = "Model_Linear_WI", Type = "MLR 2-fi",
    glance(Model_Linear_WI)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_Linear_WI_LOOCV$results[[2]], `R-Squared LOOCV` = Model_Linear_WI_LOOCV$results[[3]]),

  # Best Subset Regression - No Interaction Terms
    # Model_BSS_NI1_LOOCV
  cbind(Model = "Model_BSS_NI1", Type = "BSS Main Factors",
    glance(Model_BSS_NI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI1_LOOCV$results[[3]]),
    # Model_BSS_NI2_LOOCV
  cbind(Model = "Model_BSS_NI2", Type = "BSS Main Factors",
    glance(Model_BSS_NI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI2_LOOCV$results[[3]]),
    # Model_BSS_NI3_LOOCV
  cbind(Model = "Model_BSS_NI3", Type = "BSS Main Factors",
    glance(Model_BSS_NI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI3_LOOCV$results[[3]]),
    # Model_BSS_NI4_LOOCV
  cbind(Model = "Model_BSS_NI4", Type = "BSS Main Factors",
    glance(Model_BSS_NI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI4_LOOCV$results[[3]]),
    # Model_BSS_NI5_LOOCV
  cbind(Model = "Model_BSS_NI5", Type = "BSS Main Factors",
    glance(Model_BSS_NI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI5_LOOCV$results[[3]]),

  # Best Subset Regression - With Interaction Terms
    # Model_BSS_WI1_LOOCV
  cbind(Model = "Model_BSS_WI1", Type = "BSS 2-fi",
    glance(Model_BSS_WI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI1_LOOCV$results[[3]]),
    # Model_BSS_WI2_LOOCV
  cbind(Model = "Model_BSS_WI2", Type = "BSS 2-fi",
    glance(Model_BSS_WI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI2_LOOCV$results[[3]]),
    # Model_BSS_WI3_LOOCV
  cbind(Model = "Model_BSS_WI3", Type = "BSS 2-fi",
    glance(Model_BSS_WI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI3_LOOCV$results[[3]]),
    # Model_BSS_WI4_LOOCV
  cbind(Model = "Model_BSS_WI4", Type = "BSS 2-fi",
    glance(Model_BSS_WI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI4_LOOCV$results[[3]]),
    # Model_BSS_WI5_LOOCV
  cbind(Model = "Model_BSS_WI5", Type = "BSS 2-fi",
    glance(Model_BSS_WI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI5_LOOCV$results[[3]]),
  
  # Best Subset Regression - With Quadratic and Interaction Terms
    # Model_BSS_QI1_LOOCV
  cbind(Model = "Model_BSS_QI1", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI1_LOOCV$results[[3]]),
    # Model_BSS_QI2_LOOCV
  cbind(Model = "Model_BSS_QI2", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI2_LOOCV$results[[3]]),
    # Model_BSS_QI3_LOOCV
  cbind(Model = "Model_BSS_QI3", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI3_LOOCV$results[[3]]),
    # Model_BSS_QI4_LOOCV
  cbind(Model = "Model_BSS_QI4", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI4_LOOCV$results[[3]]),
    # Model_BSS_QI5_LOOCV
  cbind(Model = "Model_BSS_QI5", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI5_LOOCV$results[[3]])
  )
)

Model_Summary %>% 
  kable(align = "c", digits = 4, col.names = c("Model", "Type", "R-squared", "Adjusted R-squared", "AIC", "BIC", "p-value", "RMSE (LOOCV)", "R-squared (LOOCV)")) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F)) %>% footnote(general = "LOOCV: Leave-one-out cross-validation", general_title = "")
Model Type R-squared Adjusted R-squared AIC BIC p-value RMSE (LOOCV) R-squared (LOOCV)
Model_Linear_NI MLR Main Factors 0.9724 0.9712 -473.9 -462.6 0 0.0082 0.9686
Model_Linear_WI MLR 2-fi 0.9819 0.9802 -497.4 -479.4 0 0.0071 0.9761
Model_BSS_NI1 BSS Main Factors 0.9724 0.9712 -473.9 -462.6 0 0.0082 0.9686
Model_BSS_NI2 BSS Main Factors 0.9674 0.9664 -464.1 -455.1 0 0.0087 0.9640
Model_BSS_NI3 BSS Main Factors 0.5176 0.5105 -277.6 -270.8 0 0.0328 0.4899
Model_BSS_NI4 BSS Main Factors 0.5211 0.5068 -276.1 -267.1 0 0.0332 0.4794
Model_BSS_NI5 BSS Main Factors 0.3897 0.3807 -261.1 -254.4 0 0.0369 0.3557
Model_BSS_WI1 BSS 2-fi 0.9813 0.9801 -499.0 -485.5 0 0.0069 0.9778
Model_BSS_WI2 BSS 2-fi 0.9818 0.9803 -498.8 -483.1 0 0.0069 0.9774
Model_BSS_WI3 BSS 2-fi 0.9814 0.9800 -497.5 -481.8 0 0.0070 0.9766
Model_BSS_WI4 BSS 2-fi 0.9819 0.9802 -497.4 -479.4 0 0.0071 0.9761
Model_BSS_WI5 BSS 2-fi 0.9798 0.9788 -495.5 -484.3 0 0.0070 0.9767
Model_BSS_QI1 BSS 2-fi and Q 0.9960 0.9952 -592.5 -563.2 0 0.0036 0.9939
Model_BSS_QI2 BSS 2-fi and Q 0.9960 0.9952 -592.5 -563.2 0 0.0036 0.9939
Model_BSS_QI3 BSS 2-fi and Q 0.9960 0.9951 -590.9 -559.4 0 0.0037 0.9937
Model_BSS_QI4 BSS 2-fi and Q 0.9960 0.9951 -590.9 -559.4 0 0.0037 0.9937
Model_BSS_QI5 BSS 2-fi and Q 0.9960 0.9951 -590.9 -559.4 0 0.0037 0.9937
LOOCV: Leave-one-out cross-validation
pallete <- c("#000000", "#5e5656", 
             "#67000d", "#a50f15", "#cb181d", "#ef3b2c", "#fb6a4a", # reds
             "#08306b", "#08519c", "#2171b5", "#4292c6", "#6baed6", # blues
             "#00441b", "#006d2c", "#238b45", "#41ab5d", "#74c476") # greens

caption <- "NI: No Interaction terms\nWI: With Interaction terms\nQI: With Quadratic and Interaction terms"

# Vector for point labels
Descriptors <- c("NI", "WI", substr(Model_Summary$Model[3:length(Model_Summary$Model)], 
                                    start = nchar(x = as.character(Model_Summary$Model[3:length(Model_Summary$Model)])), 
                                    stop = nchar(as.character(Model_Summary$Model[3:length(Model_Summary$Model)]))))

# Adjusted R-squared and AIC
g1 <- Model_Summary %>%
  ggplot(aes(x = AIC, y = adj.r.squared, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Adjusted R-squared and AIC."),
         caption = caption,
         x = "AIC",
         y = "Adjusted R-squared") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

# Adjusted R-squared and BIC
g2 <- Model_Summary %>%
  ggplot(aes(x = BIC, y = adj.r.squared, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Adjusted R-squared and BIC values."),
         caption = caption,
         x = "BIC",
         y = "Adjusted R-squared") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

# LOOCV R-squared and LOOCV RMSE
g3 <- Model_Summary %>%
  ggplot(aes(x =RMSE.LOOCV, y = R.Squared.LOOCV, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Cross-validated RMSE and R-squared between \nobserved and predicted values"),
         caption = caption,
         x = "RMSE (LOOCV)",
         y = "R-squared (LOOCV)") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

gridExtra::grid.arrange(g1, g2, g3, ncol = 1)


4.2.4 Model Selection

Based on both the \(R^2\) (R.Squared.LOOCV) and the root-mean-square error (RMSE.LOOCV) from leave-one-out cross-validation, Model_BSS_QI1 appears to be the best model.

We will select Model_BSS_QI1 to make our predictions for Thermal Conductivity.

# Best Model
# Get descriptor best model
Best_Model_ID <- as.character(Model_Summary$Model[which.max(Model_Summary$R.Squared.LOOCV)][[1]])
# Get type of best model
Best_Model_Type <- if_else(condition = stringr::str_detect(Best_Model_ID, pattern = "QI"), 
                           true = "BSS 2-fi and Q", 
                           false = if_else(
                             condition = str_detect(Best_Model_ID, pattern = "WI"), 
                             true = "BSS 2-fi",
                             false = "BSS Main Factors")
                           )
# Get index of best model
Best_Model_Index <- as.numeric(str_extract(Best_Model_ID, "[[:digit:]]+"))

# Automatically select the best model and assign it to Model_Selected - choice based on R.Squared.LOOCV
Model_Selected <- if_else(
  condition = Best_Model_Type == "BSS 2-fi and Q", 
  true = Models_BSS_QI@objects[Best_Model_Index], 
  false = if_else(
    condition = Best_Model_Type == "BSS 2-fi", 
    true = Models_BSS_WI@objects[Best_Model_Index], 
    false = Models_BSS_NI@objects[Best_Model_Index])
  )[[1]]

# Create string of model selected
Model_Selected_string <- Best_Model_ID # as.character(Model_Summary$Model[which.max(Model_Summary$R.Squared.LOOCV)][[1]]) 

# Generate Description for model selected
Model_Selected_Description <- paste(if_else(
                                    condition = grepl(pattern = "BSS", x = Model_Selected_string),
                                            true = "Best Subset Regression Model", 
                                            false = "Multiple Linear Regression Model"),
                                    if_else(condition = grepl(pattern = "QI", x = Model_Selected_string), 
                                            true = "with Quadratic and", 
                                            false = ""),
                                    if_else(condition = grepl(pattern = "NI", x = Model_Selected_string), 
                                            true = "without Interaction Terms", 
                                            false = "with Interaction Terms"),
                                    sep = " ") 

# Save model as RDS file
saveRDS(object = Model_Selected, file = paste("Models/",project_ID, "_", outcome_ID, "_model_selected", ".rds", sep = ""))

Summary Statistics of selected model:

# Print Summary Tables for selected model
options("scipen"=10000, "digits"=5)

# Print model fit statistics
glance(Model_Selected) %>% kable(digits = 3, align = "c", col.names = c("R-squared", "Adjusted R-squared", "Sigma", "Statistic", "p-value", "df", "logLik", "AIC", "BIC","Deviance", "df Residual")) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
R-squared Adjusted R-squared Sigma Statistic p-value df logLik AIC BIC Deviance df Residual
0.996 0.995 0.003 1301.9 0 12 309.23 -592.45 -563.22 0.001 58
# save model fit statistics
Model_Selected_Statistics <- glance(Model_Selected)

summmary_table_Model_Selected <- summary(Model_Selected) 
summmary_table_Model_Selected <- cbind(rownames(summmary_table_Model_Selected$coefficients), as.data.frame(summmary_table_Model_Selected$coefficients))
rownames(summmary_table_Model_Selected) <- NULL
colnames(summmary_table_Model_Selected) <- c("Parameter", "Estimate", "Std Error", "Statistic", "p-value")
summmary_table_Model_Selected %>% kable(digits = 6, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
Parameter Estimate Std Error Statistic p-value
(Intercept) 0.225811 0.010917 20.6834 0.000000
water 0.003423 0.000132 25.9989 0.000000
fat -0.030585 0.011462 -2.6685 0.009863
I(fat^2) 0.000202 0.000093 2.1756 0.033670
I(temperature^2) 0.000049 0.000009 5.7315 0.000000
water:fat 0.000781 0.000283 2.7657 0.007605
water:temperature -0.000025 0.000008 -3.0838 0.003128
fat:I(water^2) -0.000005 0.000002 -3.0924 0.003051
temperature:I(water^2) 0.000001 0.000000 6.2695 0.000000
water:I(temperature^2) -0.000001 0.000000 -7.1257 0.000000
fat:I(temperature^2) 0.000001 0.000000 3.1579 0.002522
I(fat^2):I(temperature^2) 0.000000 0.000000 -2.8978 0.005295
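Written out with the rounded coefficients from the table above (a restatement for readability, with \(W\) = water %, \(F\) = fat %, \(T\) = temperature in °C and \(k\) the thermal conductivity in W/(m.K)), the selected model is approximately

\[
k \approx 0.2258 + 0.00342\,W - 0.0306\,F + 0.000202\,F^{2} + 0.000049\,T^{2} + 0.00078\,WF - 0.000025\,WT,
\]

plus the remaining interaction terms (\(FW^{2}\), \(TW^{2}\), \(WT^{2}\), \(FT^{2}\), \(F^{2}T^{2}\)), whose coefficients are below \(10^{-5}\) in magnitude at the precision shown.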
# # Plot model coefficients
# ggcoef(x = Model_Selected, exponentiate = F, exclude_intercept = T, vline_color = "red", vline_linetype =  "dotted", errorbar_color = "grey10", errorbar_height = .15, mapping = aes(x = estimate, y = term, size = p.value)) + scale_size_continuous(trans = "reverse") + labs(x = "Estimate", y = NULL, size = "p-value", title = paste0(outcome_name, ": Plot of Model Coefficients")) + theme_bw()

# Save model statistics into a csv
write_csv(x = summmary_table_Model_Selected, path = paste0("Models/",  project_ID, "_", outcome_ID, "_Model_Selected_Coefficients",".csv"), na = "NA")
write_csv(x = Model_Selected_Statistics, path = paste0("Models/",  project_ID, "_", outcome_ID, "_Model_Selected_Statistics",".csv"), na = "NA")

# Plot Diagnostics for selected model
ggfortify:::autoplot.lm(Model_Selected,
         ncol = 2, alpha = 0.8,
         label.size = 3) +
  theme_bw() +
  theme(aspect.ratio = 0.8, legend.title = element_text(size = 10), legend.text = element_text(size = 8), axis.title = element_text(size = 9))

# Save relevant model statistics
r.squared <- round(glance(Model_Selected)[[1]], 3)
adj.r.squared <- round(glance(Model_Selected)[[2]], 3)
r.squared.LOOCV <- round(train(as.formula(Model_Selected), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")$results[[3]], 3)

Summary Statistics of selected model on test set:
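For reference, the test-set error metrics computed below follow the usual definitions (standard formulas, added for clarity), with \(y_i\) and \(\hat{y}_i\) the observed and predicted test-set values:

\[
\mathrm{RMSE} = \sqrt{\frac{1}{n_{\mathrm{test}}}\sum_{i}\left(y_i - \hat{y}_i\right)^{2}}, \qquad
\mathrm{MAE} = \frac{1}{n_{\mathrm{test}}}\sum_{i}\left\lvert y_i - \hat{y}_i \right\rvert .
\]

Note that the quantity reported as R-squared in the table below is, as computed in the code, the Pearson correlation between the predicted and observed values.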

# Make predictions on test set
test_set$thermal_conductivity_predicted <- predict(Model_Selected, test_set)
# Calculate Residuals on test set
test_set$thermal_conductivity_residuals <- test_set$thermal_conductivity - test_set$thermal_conductivity_predicted

# Calculate test set R-squared, RMSE, MAE
R_squared <- round(cor(test_set$thermal_conductivity_predicted, test_set$thermal_conductivity), 4)
RMSE <- signif(RMSE(pred = test_set$thermal_conductivity_predicted, obs = test_set$thermal_conductivity, na.rm = T), 4)
MAE <- signif(MAE(pred = test_set$thermal_conductivity_predicted, obs = test_set$thermal_conductivity), 4)

Test_Set_Statistics <- c(RMSE, MAE, R_squared)
names(Test_Set_Statistics) <- c("RMSE", "MAE", "R-squared")

# Print table
Test_Set_Statistics %>% t() %>% 
  kable(align = "c", caption = paste0("Test set statistics of predicted ", outcome_name, ".")) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "center")
Test set statistics of predicted Thermal Conductivity.
RMSE MAE R-squared
0.0059 0.0052 0.9972
# Plot predicted vs observed values and residuals
pred_obs_plot <- predicted_observed_plot(predicted_val = test_set$thermal_conductivity_predicted, observed_val = test_set$thermal_conductivity, residual_val = test_set$thermal_conductivity_residuals, R_squared = R_squared, model_name = outcome_name)

residual_plot <- residuals_plot(predicted_val = test_set$thermal_conductivity_predicted, observed_val = test_set$thermal_conductivity, residual_val = test_set$thermal_conductivity_residuals, MAE = MAE, RMSE = RMSE, model_name = outcome_name)

gridExtra::grid.arrange(pred_obs_plot, residual_plot, ncol = 2)


4.2.5 Surface Plots

# Grid Settings
z_expansion_factor <- 0.05 # Expansion factor for z-axis
theta <- c(320) # Perspective angle
theta_index <- 1
colour_range <- plot3D::ramp.col(c("#2166ac", "#b2182b")) # Blue-to-red colour ramp

# Create vectors for variables
# Number of gridlines for x- and y-axis
grid.lines <- 21

# Create values for surface plots
water_range <- seq_range(x = data$water, n = grid.lines) # x-axis
temperature_range <- seq_range(data$temperature, n = grid.lines) # y-axis
fat_range <- unique(x = data$fat) # Column

# Create grid of variables
prediction_grid <- expand.grid(
                              temperature = temperature_range,
                              water = water_range,
                              fat = fat_range
                              ) 
# Create data frames of values for each combination of process parameters to be plotted
# 1x3 structure
# Plot 1: Row 1, Column 1
Grid1_Row1_Column1 <- prediction_grid %>% filter(fat == fat_range[1])
# Plot 2: Row 1, Column 2
Grid1_Row1_Column2 <- prediction_grid %>% filter(fat == fat_range[2])
# Plot 3: Row 1, Column 3
Grid1_Row1_Column3 <- prediction_grid %>% filter(fat == fat_range[3])

Description_Grid1_Row1_Column1 <- paste0(outcome_name, " of Milk\nwith ", fat_range[1], "% fat")
Description_Grid1_Row1_Column2 <- paste0(outcome_name, " of Milk\nwith ", fat_range[2], "% fat")
Description_Grid1_Row1_Column3 <- paste0(outcome_name, " of Milk\nwith ", fat_range[3], "% fat")

# Use selected model to make Predictions for each grid created
# Plot 1: Row 1, Column 1
Predictions_Grid1_Row1_Column1 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column1), nrow = grid.lines, ncol = grid.lines)
# Plot 2: Row 1, Column 2
Predictions_Grid1_Row1_Column2 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column2), nrow = grid.lines, ncol = grid.lines)
# Plot 3: Row 1, Column 3
Predictions_Grid1_Row1_Column3 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column3), nrow = grid.lines, ncol = grid.lines)
# Multiple Plots
par(mfrow = c(1, 3)) # 1x3 grid
# Chart (1,1)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column1,
                      facets = NA),
          main = Description_Grid1_Row1_Column1)

# Chart (2,1)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column2,
                      facets = NA),
          main = Description_Grid1_Row1_Column2, 
          sub = paste0("R-squared = ", r.squared, "\n",
                       "Adjusted R-squared = ", adj.r.squared, "\n",
                       "Cross-validated R-squared = ", r.squared.LOOCV))

# Chart (3,1)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column3,
                      facets = NA),
          main = Description_Grid1_Row1_Column3)

# Delete obsolete models
rm(Model_BSS_NI1_LOOCV, Model_BSS_NI2_LOOCV, Model_BSS_NI3_LOOCV, Model_BSS_NI4_LOOCV, Model_BSS_NI5_LOOCV, Model_BSS_WI1_LOOCV, Model_BSS_WI2_LOOCV, Model_BSS_WI3_LOOCV, Model_BSS_WI4_LOOCV, Model_BSS_WI5_LOOCV, Model_BSS_QI1_LOOCV, Model_BSS_QI2_LOOCV, Model_BSS_QI3_LOOCV, Model_BSS_QI4_LOOCV, Model_BSS_QI5_LOOCV, Model_BSS_WI1, Model_BSS_WI2, Model_BSS_WI3, Model_BSS_WI4, Model_BSS_WI5, Model_BSS_QI1, Model_BSS_QI2, Model_BSS_QI3, Model_BSS_QI4, Model_BSS_QI5, Model_BSS_NI1, Model_BSS_NI2, Model_BSS_NI3, Model_BSS_NI4, Model_BSS_NI5, Model_Linear_NI, Model_Linear_NI_LOOCV, Model_Linear_WI, Model_Linear_WI_LOOCV, Model_Linear_QI, Model_Linear_QI_LOOCV, Models_BSS_NI, Models_BSS_WI, Model_Selected, BSS_formula, BSS_formula_QI,  Model_Selected_Description, Model_Selected_string, Model_Summary, formula_BSS, formula_Linear_NI, formula_Linear_WI, Model_Selected_auto_index, Model_Selected_auto)

# Delete obsolete objects
rm(g1, g2, g3, g, Best_Model_ID, Best_Model_Index, Best_Model_Type, colour_range, units, height, width, res, theta, theta_index, pointsize, i, r.squared, adj.r.squared, r.squared.LOOCV, z_expansion_factor, variable_name, variable_ID, Predictions_Grid1_Row1_Column1, Predictions_Grid1_Row1_Column2, Predictions_Grid1_Row1_Column3, outcome_ID, outcome_name, pred_obs_plot, residual_plot)

4.3 Density

4.3.1 Creating a training and test set

Create a training and test set by randomly selecting 85% of the samples for the training set and using the remaining 15% for the test set.

library(caret, quietly = T, verbose = F)
library(tidyverse, quietly = T, verbose = F)
library(plyr, quietly = T, verbose = F)
library(tictoc, quietly = T, verbose = F)

predictor_ID <- c("water", "fat", "temperature")
predictor_name <- c("Water", "Fat", "Temperature")

outcome_ID <- "density"
outcome_name <- "Density"
units <- "kg/m3"
variable_vector <- data$density

# Remove any observations with missing values
data <- data[complete.cases(data), ]

# Average the values of replicate experiments (if present)
data <- ddply(.data = data, 
              .variables = .(water, fat, temperature), 
              .fun = function(x) c(density = mean(x$density)))

# Create training and test set using stratified partitioning of outcome variable
training_fraction <- 0.85
set.seed(1); training_index <- createDataPartition(y = data$density, p = training_fraction)[[1]]

training_set <- data[training_index, ]
test_set <- data[-training_index, ]
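# (Added, illustrative check) The realised split should be close to the requested
# 85/15 proportions; because createDataPartition() stratifies on the outcome,
# the achieved fractions can differ slightly from training_fraction.
round(nrow(training_set) / nrow(data), 2)
round(nrow(test_set) / nrow(data), 2)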

# Check distribution of density on training set
par(mfrow = c(1, 2))
hist(training_set$density, main = "Training Set", xlab = paste0(outcome_name, " ", units, ""), freq = FALSE)

# Check distribution of density on test set
hist(test_set$density, main = "Test Set", xlab = paste0(outcome_name, " ", units, ""), freq = FALSE)

rbind(summary(training_set$density),summary(test_set$density)) %>% data.frame() %>% add_column(Set = c("Training Set", "Test Set")) %>% select(Set, everything()) %>% kable(digits = 4, align = "c", caption = "Training and Test Set Statistics.", col.names = c("Set", "Minimum", "1st Quartile", "Median", "Mean", "3rd quartile", "Maximum")) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F), position = "center")
Training and Test Set Statistics.
Set Minimum 1st Quartile Median Mean 3rd quartile Maximum
Training Set 1012 1025 1032 1032 1038 1050
Test Set 1016 1024 1033 1031 1038 1043

The distributions of Density values in the training and test sets are similar.

4.3.2 Modelling

Create formula objects for multiple linear regression with and without 2-factor interaction terms and with and without quadratic terms.

MLR_formula <- as.formula("density ~ water + fat + temperature")
MLR_formula_Interactions <- as.formula("density ~ water*fat*temperature")
MLR_formula_Quadratic <- as.formula("density ~ water + fat + temperature + I(water^2) + I(fat^2) + I(temperature^2)")

4.3.2.1 Best Subset Regression

Perform best subset regression (with and without interaction terms) and select the best models.
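As a brief aside on the size of the search space (an illustrative note, not part of the original analysis): with only three candidate main effects there are \(2^3 = 8\) possible subsets, so confsetsize = 8 retains essentially every main-effects candidate; the searches that add interaction and quadratic terms cover a much larger space, which is presumably why the quadratic case below switches to the genetic algorithm (method = "g").

# Illustrative count of candidate main-effects subsets (not part of the fit)
p_main <- 3   # candidate main effects: water, fat, temperature
2^p_main      # 8 possible subsets, including the intercept-only model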

set.seed(1)

# Initial Formula for Best Subset Regression
BSS_formula <- as.formula(density ~ water + fat + temperature)

# BSS with Main effects ---------------------
# Perform best subset regression (without interaction terms); set seed for reproducibility
set.seed(1)
Models_BSS_NI <- glmulti::glmulti(BSS_formula, data = training_set,
                                      level = 1, # Main effects only
                                      method = "h", # Use exhaustive screening
                                      crit = "aic", # use AIC as criterium
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )

# # Plot relative importance of the various model terms 
# plot(Models_BSS_NI, type="s", cex.axis=0.8, cex.names=0.8, sub = paste0(outcome_name, ": Best subset regression with main effects only"), mgp=c(2,0.6,-0.4))

# BSS with 2-fi ---------------------
# Perform best subset regression (with interaction terms); set seed for reproducibility
set.seed(1)
Models_BSS_WI <- glmulti::glmulti(BSS_formula, data = training_set,
                                      level = 2, # 2-fi considered
                                      method = "h", # Use exhaustive screening
                                      crit = "aic", # use AIC as criterium
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )

# # Plot relative importance of the various model terms 
# plot(Models_BSS_WI, type="s", cex.axis=0.8, cex.names=0.55, sub = paste0(outcome_name, ": Best subset regression with interaction terms"), mgp=c(2,0.6,-0.4))

BSS_formula_QI <- as.formula(density ~ water + fat + temperature + I(water^2) + I(fat^2) + I(temperature^2))

# BSS with quadratic terms and 2-fi ---------------------
# Perform best subset regression (with quadratic and interaction terms); set seed for reproducibility
set.seed(1)
Models_BSS_QI <- glmulti::glmulti(BSS_formula_QI, data = training_set,
                                      level = 2, # 2-fi considered
                                      method = "g", # Use genetic algorithm
                                      crit = "aic", # use AIC as criterium
                                      confsetsize = 8, # Keep n best models
                                      fitfunction = "lm", plotty = F, report = F  # No plots or interim reports
                                      )
TASK: Genetic algorithm in the candidate set.
Initialization...
Algorithm started...
Improvements in best and average IC have been below the specified goals.
Algorithm is declared to have converged.
Completed.
# # Plot relative importance of the various model terms 
# plot(Models_BSS_QI, type="s", cex.axis=0.8, cex.names=0.55, sub = paste0(outcome_name, ": Best subset regression with quadratic and interaction terms"), mgp=c(2,0.6,-0.4))
# Save best BSS models

# main effects models
Model_BSS_NI1 <- Models_BSS_NI@objects[[1]]
Model_BSS_NI2 <- Models_BSS_NI@objects[[2]]
Model_BSS_NI3 <- Models_BSS_NI@objects[[3]]
Model_BSS_NI4 <- Models_BSS_NI@objects[[4]]
Model_BSS_NI5 <- Models_BSS_NI@objects[[5]]

# 2-fi models
Model_BSS_WI1 <- Models_BSS_WI@objects[[1]]
Model_BSS_WI2 <- Models_BSS_WI@objects[[2]]
Model_BSS_WI3 <- Models_BSS_WI@objects[[3]]
Model_BSS_WI4 <- Models_BSS_WI@objects[[4]]
Model_BSS_WI5 <- Models_BSS_WI@objects[[5]]

# Quadratic and 2-fi models
Model_BSS_QI1 <- Models_BSS_QI@objects[[1]]
Model_BSS_QI2 <- Models_BSS_QI@objects[[2]]
Model_BSS_QI3 <- Models_BSS_QI@objects[[3]]
Model_BSS_QI4 <- Models_BSS_QI@objects[[4]]
Model_BSS_QI5 <- Models_BSS_QI@objects[[5]]

4.3.2.2 Multiple Linear Regression

Model data with standard multiple linear regression (with and without interaction terms).

# Formula for Multiple Linear Regression - Without Interaction Terms 
formula_Linear_NI <- as.formula(density ~ water + fat + temperature)

# Formula for Multiple Linear Regression - With all 2-factor Interaction Terms
formula_Linear_WI <- as.formula(density ~ water*fat + water*temperature + fat*temperature)

# Formula for Multiple Linear Regression - With all quadratic and 2-factor Interaction Terms
formula_Linear_QI <- as.formula(density ~ water*fat + water*temperature + fat*temperature + I(water^2) + I(fat^2) + I(temperature^2))

Output of multiple linear regression with main factors only:

Model_Linear_NI <- lm(formula = formula_Linear_NI, data = training_set)

glance(Model_Linear_NI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.993 0.993 0.767 3056 0 4 -78.7 167.4 178.6 38.83 66

Output of multiple linear regression with all main factors and 2-factor interactions (2-fi):

Model_Linear_WI <- lm(formula = formula_Linear_WI, data = training_set)

glance(Model_Linear_WI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.993 0.992 0.778 1486 0 7 -78.06 172.1 190.1 38.13 63

Output of multiple linear regression with all main factors, 2-factor interactions (2-fi) and quadratic terms:

Model_Linear_QI <- lm(formula = formula_Linear_QI, data = training_set)

glance(Model_Linear_QI) %>% kable(digits = 3, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
r.squared adj.r.squared sigma statistic p.value df logLik AIC BIC deviance df.residual
0.993 0.992 0.774 1001 0 10 -76 174 198.7 35.94 60

4.3.3 Model Evaluation

Evaluate and select models with leave-one-out cross-validation (LOOCV).

# Leave-one-out cross-validation (LOOCV)

# Linear without interactions
Model_Linear_NI_LOOCV <- train(formula_Linear_NI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Linear with Interactions
Model_Linear_WI_LOOCV <- train(formula_Linear_WI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Linear with Quadratic and Interaction Terms
Model_Linear_QI_LOOCV <- train(formula_Linear_QI, data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")

# Best Subset Regression - No Interaction Terms
Model_BSS_NI1_LOOCV <- if (length(Model_BSS_NI1$coefficients)>1) train(formula(Model_BSS_NI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI2_LOOCV <- if (length(Model_BSS_NI2$coefficients)>1) train(formula(Model_BSS_NI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI3_LOOCV <- if (length(Model_BSS_NI3$coefficients)>1) train(formula(Model_BSS_NI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI4_LOOCV <- if (length(Model_BSS_NI4$coefficients)>1) train(formula(Model_BSS_NI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_NI5_LOOCV <- if (length(Model_BSS_NI5$coefficients)>1) train(formula(Model_BSS_NI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

# Best Subset Regression - With Interaction Terms
Model_BSS_WI1_LOOCV <- if (length(Model_BSS_WI1$coefficients)>1) train(formula(Model_BSS_WI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI2_LOOCV <- if (length(Model_BSS_WI2$coefficients)>1) train(formula(Model_BSS_WI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI3_LOOCV <- if (length(Model_BSS_WI3$coefficients)>1) train(formula(Model_BSS_WI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI4_LOOCV <- if (length(Model_BSS_WI4$coefficients)>1) train(formula(Model_BSS_WI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_WI5_LOOCV <- if (length(Model_BSS_WI5$coefficients)>1) train(formula(Model_BSS_WI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

# Best Subset Regression - With Interaction Terms and Quadratic Terms
Model_BSS_QI1_LOOCV <- if (length(Model_BSS_QI1$coefficients)>1) train(formula(Model_BSS_QI1), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI2_LOOCV <- if (length(Model_BSS_QI2$coefficients)>1) train(formula(Model_BSS_QI2), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI3_LOOCV <- if (length(Model_BSS_QI3$coefficients)>1) train(formula(Model_BSS_QI3), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI4_LOOCV <- if (length(Model_BSS_QI4$coefficients)>1) train(formula(Model_BSS_QI4), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV
Model_BSS_QI5_LOOCV <- if (length(Model_BSS_QI5$coefficients)>1) train(formula(Model_BSS_QI5), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit") else Model_Linear_NI_LOOCV

Summary statistics for all models evaluated:

# Summary statistics for all models evaluated
options("scipen"=100, "digits"=4)

Model_Summary <- data.frame(rbind(
  # Linear without interactions
  cbind(Model = "Model_Linear_NI", Type = "MLR Main Factors",
    glance(Model_Linear_NI)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_Linear_NI_LOOCV$results[[2]], `R-Squared LOOCV` = Model_Linear_NI_LOOCV$results[[3]]),
  
  # Linear with Interactions
  cbind(Model = "Model_Linear_WI", Type = "MLR 2-fi",
    glance(Model_Linear_WI)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_Linear_WI_LOOCV$results[[2]], `R-Squared LOOCV` = Model_Linear_WI_LOOCV$results[[3]]),

  # Best Subset Regression - No Interaction Terms
    # Model_BSS_NI1_LOOCV
  cbind(Model = "Model_BSS_NI1", Type = "BSS Main Factors",
    glance(Model_BSS_NI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI1_LOOCV$results[[3]]),
    # Model_BSS_NI2_LOOCV
  cbind(Model = "Model_BSS_NI2", Type = "BSS Main Factors",
    glance(Model_BSS_NI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI2_LOOCV$results[[3]]),
    # Model_BSS_NI3_LOOCV
  cbind(Model = "Model_BSS_NI3", Type = "BSS Main Factors",
    glance(Model_BSS_NI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI3_LOOCV$results[[3]]),
    # Model_BSS_NI4_LOOCV
  cbind(Model = "Model_BSS_NI4", Type = "BSS Main Factors",
    glance(Model_BSS_NI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI4_LOOCV$results[[3]]),
    # Model_BSS_NI5_LOOCV
  cbind(Model = "Model_BSS_NI5", Type = "BSS Main Factors",
    glance(Model_BSS_NI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_NI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_NI5_LOOCV$results[[3]]),

  # Best Subset Regression - With Interaction Terms
    # Model_BSS_WI1_LOOCV
  cbind(Model = "Model_BSS_WI1", Type = "BSS 2-fi",
    glance(Model_BSS_WI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI1_LOOCV$results[[3]]),
    # Model_BSS_WI2_LOOCV
  cbind(Model = "Model_BSS_WI2", Type = "BSS 2-fi",
    glance(Model_BSS_WI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI2_LOOCV$results[[3]]),
    # Model_BSS_WI3_LOOCV
  cbind(Model = "Model_BSS_WI3", Type = "BSS 2-fi",
    glance(Model_BSS_WI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI3_LOOCV$results[[3]]),
    # Model_BSS_WI4_LOOCV
  cbind(Model = "Model_BSS_WI4", Type = "BSS 2-fi",
    glance(Model_BSS_WI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI4_LOOCV$results[[3]]),
    # Model_BSS_WI5_LOOCV
  cbind(Model = "Model_BSS_WI5", Type = "BSS 2-fi",
    glance(Model_BSS_WI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_WI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_WI5_LOOCV$results[[3]]),
  
  # Best Subset Regression - With Quadratic and Interaction Terms
    # Model_BSS_QI1_LOOCV
  cbind(Model = "Model_BSS_QI1", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI1)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI1_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI1_LOOCV$results[[3]]),
    # Model_BSS_QI2_LOOCV
  cbind(Model = "Model_BSS_QI2", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI2)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI2_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI2_LOOCV$results[[3]]),
    # Model_BSS_QI3_LOOCV
  cbind(Model = "Model_BSS_QI3", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI3)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI3_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI3_LOOCV$results[[3]]),
    # Model_BSS_QI4_LOOCV
  cbind(Model = "Model_BSS_QI4", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI4)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI4_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI4_LOOCV$results[[3]]),
    # Model_BSS_QI5_LOOCV
  cbind(Model = "Model_BSS_QI5", Type = "BSS 2-fi and Q",
    glance(Model_BSS_QI5)[,c(1, 2, 8, 9, 5)], `RMSE LOOCV` = Model_BSS_QI5_LOOCV$results[[2]], `R-Squared LOOCV` = Model_BSS_QI5_LOOCV$results[[3]])
  )
)

Model_Summary %>% 
  kable(align = "c", digits = 4, col.names = c("Model", "Type", "R-squared", "Adjusted R-squared", "AIC", "BIC", "p-value", "RMSE (LOOCV)", "R-squared (LOOCV)")) %>% 
  kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F)) %>% footnote(general = "LOOCV: Leave-one-out cross-validation", general_title = "")
Model Type R-squared Adjusted R-squared AIC BIC p-value RMSE (LOOCV) R-squared (LOOCV)
Model_Linear_NI MLR Main Factors 0.9929 0.9925 167.4 178.6 0.0000 0.7866 0.9920
Model_Linear_WI MLR 2-fi 0.9930 0.9923 172.1 190.1 0.0000 0.8120 0.9915
Model_BSS_NI1 BSS Main Factors 0.9929 0.9925 167.4 178.6 0.0000 0.7866 0.9920
Model_BSS_NI2 BSS Main Factors 0.9724 0.9716 260.0 269.0 0.0000 1.5272 0.9699
Model_BSS_NI3 BSS Main Factors 0.7844 0.7779 403.9 412.9 0.0000 4.2811 0.7640
Model_BSS_NI4 BSS Main Factors 0.7588 0.7553 409.7 416.5 0.0000 4.4568 0.7441
Model_BSS_NI5 BSS Main Factors 0.1879 0.1760 494.7 501.4 0.0002 8.1774 0.1417
Model_BSS_WI1 BSS 2-fi 0.9929 0.9925 167.4 178.6 0.0000 0.7866 0.9920
Model_BSS_WI2 BSS 2-fi 0.9929 0.9925 168.5 182.0 0.0000 0.7930 0.9919
Model_BSS_WI3 BSS 2-fi 0.9927 0.9924 168.9 180.1 0.0000 0.7943 0.9919
Model_BSS_WI4 BSS 2-fi 0.9929 0.9925 169.0 182.5 0.0000 0.7940 0.9919
Model_BSS_WI5 BSS 2-fi 0.9929 0.9924 169.4 182.9 0.0000 0.7964 0.9918
Model_BSS_QI1 BSS 2-fi and Q 0.9949 0.9938 161.5 193.0 0.0000 0.7755 0.9923
Model_BSS_QI2 BSS 2-fi and Q 0.9949 0.9938 161.5 193.0 0.0000 0.7755 0.9923
Model_BSS_QI3 BSS 2-fi and Q 0.9947 0.9937 162.3 191.6 0.0000 0.7792 0.9922
Model_BSS_QI4 BSS 2-fi and Q 0.9949 0.9938 163.2 196.9 0.0000 0.7881 0.9920
Model_BSS_QI5 BSS 2-fi and Q 0.9949 0.9938 163.3 197.1 0.0000 0.7922 0.9919
LOOCV: Leave-one-out cross-validation
pallete <- c("#000000", "#5e5656", 
             "#67000d", "#a50f15", "#cb181d", "#ef3b2c", "#fb6a4a", # reds
             "#08306b", "#08519c", "#2171b5", "#4292c6", "#6baed6", # blues
             "#00441b", "#006d2c", "#238b45", "#41ab5d", "#74c476") # greens

caption <- "NI: No Interaction terms\nWI: With Interaction terms\nQI: With Quadratic and Interaction terms"

# Vector for point labels
Descriptors <- c("NI", "WI", substr(Model_Summary$Model[3:length(Model_Summary$Model)], 
                                    start = nchar(x = as.character(Model_Summary$Model[3:length(Model_Summary$Model)])), 
                                    stop = nchar(as.character(Model_Summary$Model[3:length(Model_Summary$Model)]))))

# Adjusted R-squared and AIC
g1 <- Model_Summary %>%
  ggplot(aes(x = AIC, y = adj.r.squared, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Adjusted R-squared and AIC."),
         caption = caption,
         x = "AIC",
         y = "Adjusted R-squared") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

# Adjusted R-squared and BIC
g2 <- Model_Summary %>%
  ggplot(aes(x = BIC, y = adj.r.squared, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Adjusted R-squared and BIC values."),
         caption = caption,
         x = "BIC",
         y = "Adjusted R-squared") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

# LOOCV R-squared and LOOCV RMSE
g3 <- Model_Summary %>%
  ggplot(aes(x =RMSE.LOOCV, y = R.Squared.LOOCV, col = Model)) +
    geom_point(mapping = aes(shape = Type), shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 6, alpha = 0.6, position = position_jitter(width = 0.01, height = 0.01)) +
    # geom_text(mapping = aes(label = Descriptors), col = "#000000", size = 3) + # add labels to points
    scale_y_continuous(limits = c(0,1), breaks = seq(0,1,0.10), minor_breaks = NULL) +
    labs(title = paste0(outcome_name, ": Cross-validated RMSE and R-squared between \nobserved and predicted values"),
         caption = caption,
         x = "RMSE (LOOCV)",
         y = "R-squared (LOOCV)") +
    theme_bw() +
    scale_colour_manual(values = pallete) +
    guides(col=guide_legend(ncol=2, override.aes = list(shape = c(18,18,rep(15,5), rep(16,5), rep(17,5)), size = 4))) +
    theme(aspect.ratio = 0.8)

gridExtra::grid.arrange(g1, g2, g3, ncol = 1)


4.3.4 Model Selection

Based on both the \(R^2\) (R.Squared.LOOCV) and the root-mean-square error (RMSE.LOOCV) from leave-one-out cross-validation, Model_BSS_QI1 appears to be the best model.

We will select Model_BSS_QI1 to make our predictions for Density.

# Best Model
# Get descriptor best model
Best_Model_ID <- as.character(Model_Summary$Model[which.max(Model_Summary$R.Squared.LOOCV)][[1]])
# Get type of best model
Best_Model_Type <- if_else(condition = stringr::str_detect(Best_Model_ID, pattern = "QI"), 
                           true = "BSS 2-fi and Q", 
                           false = if_else(
                             condition = str_detect(Best_Model_ID, pattern = "WI"), 
                             true = "BSS 2-fi",
                             false = "BSS Main Factors")
                           )
# Get index of best model
Best_Model_Index <- as.numeric(str_extract(Best_Model_ID, "[[:digit:]]+"))

# Automatically select the best model and assign it to Model_Selected - choice based on R.Squared.LOOCV
Model_Selected <- if_else(
  condition = Best_Model_Type == "BSS 2-fi and Q", 
  true = Models_BSS_QI@objects[Best_Model_Index], 
  false = if_else(
    condition = Best_Model_Type == "BSS 2-fi", 
    true = Models_BSS_WI@objects[Best_Model_Index], 
    false = Models_BSS_NI@objects[Best_Model_Index])
  )[[1]]

# Create string of model selected
Model_Selected_string <- Best_Model_ID # as.character(Model_Summary$Model[which.max(Model_Summary$R.Squared.LOOCV)][[1]]) 

# Generate Description for model selected
Model_Selected_Description <- paste(if_else(
                                    condition = grepl(pattern = "BSS", x = Model_Selected_string),
                                            true = "Best Subset Regression Model", 
                                            false = "Multiple Linear Regression Model"),
                                    if_else(condition = grepl(pattern = "QI", x = Model_Selected_string), 
                                            true = "with Quadratic and", 
                                            false = ""),
                                    if_else(condition = grepl(pattern = "NI", x = Model_Selected_string), 
                                            true = "without Interaction Terms", 
                                            false = "with Interaction Terms"),
                                    sep = " ") 

# Save model as RDS file
saveRDS(object = Model_Selected, file = paste("Models/",project_ID, "_", outcome_ID, "_model_selected", ".rds", sep = ""))

Summary Statistics of selected model:

# Print Summary Tables for selected model
options("scipen"=10000, "digits"=5)

# Print model fit statistics
glance(Model_Selected) %>% kable(digits = 3, align = "c", col.names = c("R-squared", "Adjusted R-squared", "Sigma", "Statistic", "p-value", "df", "logLik", "AIC", "BIC","Deviance", "df Residual")) %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
R-squared Adjusted R-squared Sigma Statistic p-value df logLik AIC BIC Deviance df Residual
0.995 0.994 0.696 930.08 0 13 -66.756 161.51 192.99 27.603 57
# save model fit statistics
Model_Selected_Statistics <- glance(Model_Selected)

summmary_table_Model_Selected <- summary(Model_Selected) 
summmary_table_Model_Selected <- cbind(rownames(summmary_table_Model_Selected$coefficients), as.data.frame(summmary_table_Model_Selected$coefficients))
rownames(summmary_table_Model_Selected) <- NULL
colnames(summmary_table_Model_Selected) <- c("Parameter", "Estimate", "Std Error", "Statistic", "p-value")
summmary_table_Model_Selected %>% kable(digits = 6, align = "c") %>% kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive", full_width = F))
Parameter Estimate Std Error Statistic p-value
(Intercept) 1067.794318 1.077754 990.7591 0.000000
fat -0.728135 0.123698 -5.8864 0.000000
temperature 4.064718 1.165728 3.4868 0.000948
I(water^2) -0.003188 0.000150 -21.2111 0.000000
I(temperature^2) -0.071937 0.019859 -3.6223 0.000622
temperature:water -0.113653 0.028740 -3.9546 0.000214
fat:temperature 0.010557 0.004890 2.1590 0.035070
temperature:I(water^2) 0.000711 0.000176 4.0380 0.000163
fat:I(fat^2) 0.004430 0.001873 2.3650 0.021455
temperature:I(fat^2) -0.001193 0.000599 -1.9909 0.051294
I(temperature^2):water 0.001863 0.000489 3.8132 0.000339
temperature:I(temperature^2) -0.000016 0.000010 -1.5340 0.130559
I(water^2):I(temperature^2) -0.000012 0.000003 -3.8784 0.000275
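Written out with the rounded coefficients from the table above (a restatement for readability, with \(W\) = water %, \(F\) = fat %, \(T\) = temperature in °C and \(\rho\) the density in kg/m3; note that the fat:I(fat^2) and temperature:I(temperature^2) terms correspond to \(F^{3}\) and \(T^{3}\)), the selected model is approximately

\[
\begin{aligned}
\rho \approx\ & 1067.79 - 0.728\,F + 4.065\,T - 0.00319\,W^{2} - 0.0719\,T^{2} - 0.1137\,WT + 0.0106\,FT \\
& + 0.00071\,TW^{2} + 0.00443\,F^{3} - 0.00119\,TF^{2} + 0.00186\,WT^{2} - 0.000016\,T^{3} - 0.000012\,W^{2}T^{2} .
\end{aligned}
\]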
# # Plot model coefficients
# ggcoef(x = Model_Selected, exponentiate = F, exclude_intercept = T, vline_color = "red", vline_linetype =  "dotted", errorbar_color = "grey10", errorbar_height = .15, mapping = aes(x = estimate, y = term, size = p.value)) + scale_size_continuous(trans = "reverse") + labs(x = "Estimate", y = NULL, size = "p-value", title = paste0(outcome_name, ": Plot of Model Coefficients")) + theme_bw()

# Save model statistics into a csv
write_csv(x = summmary_table_Model_Selected, path = paste0("Models/",  project_ID, "_", outcome_ID, "_Model_Selected_Coefficients",".csv"), na = "NA")
write_csv(x = Model_Selected_Statistics, path = paste0("Models/",  project_ID, "_", outcome_ID, "_Model_Selected_Statistics",".csv"), na = "NA")

# Plot Diagnostics for selected model
ggfortify:::autoplot.lm(Model_Selected,
         ncol = 2, alpha = 0.8,
         label.size = 3) +
  theme_bw() +
  theme(aspect.ratio = 0.8, legend.title = element_text(size = 10), legend.text = element_text(size = 8), axis.title = element_text(size = 9))

# Save relevant model statistics
r.squared <- round(glance(Model_Selected)[[1]], 3)
adj.r.squared <- round(glance(Model_Selected)[[2]], 3)
r.squared.LOOCV <- round(train(as.formula(Model_Selected), data = training_set, trControl=trainControl(method="LOOCV"), method="lm", na.action = "na.omit")$results[[3]], 3)

Summary Statistics of selected model on test set:

# Make predictions on test set
test_set$density_predicted <- predict(Model_Selected, test_set)
# Calculate Residuals on test set
test_set$density_residuals <- test_set$density - test_set$density_predicted

# Calculate test set R-squared, RMSE, MAE
R_squared <- round(cor(test_set$density_predicted, test_set$density), 4)
RMSE <- signif(RMSE(pred = test_set$density_predicted, obs = test_set$density, na.rm = T), 4)
MAE <- signif(MAE(pred = test_set$density_predicted, obs = test_set$density), 4)

Test_Set_Statistics <- c(RMSE, MAE, R_squared)
names(Test_Set_Statistics) <- c("RMSE", "MAE", "R-squared")

# Print table
Test_Set_Statistics %>% t() %>% 
  kable(align = "c", caption = paste0("Test set statistics of predicted ", outcome_name, ".")) %>% 
  kableExtra::kable_styling(bootstrap_options = c("striped", "hover", "condensed", "responsive"), full_width = T, position = "center")
Test set statistics of predicted Density.
RMSE MAE R-squared
0.8087 0.7013 0.9969
# Plot predicted vs observed values and residuals
pred_obs_plot <- predicted_observed_plot(predicted_val = test_set$density_predicted, observed_val = test_set$density, residual_val = test_set$density_residuals, R_squared = R_squared, model_name = outcome_name)

residual_plot <- residuals_plot(predicted_val = test_set$density_predicted, observed_val = test_set$density, residual_val = test_set$density_residuals, MAE = MAE, RMSE = RMSE, model_name = outcome_name)

gridExtra::grid.arrange(pred_obs_plot, residual_plot, ncol = 2)


4.3.5 Surface Plots

# Grid Settings
z_expansion_factor <- 0.05 # Expansion factor for z-axis
theta <- c(320) # Perspective angle
theta_index <- 1
colour_range <- plot3D::ramp.col(c("#2166ac", "#b2182b")) # Blue-to-red colour ramp

# Create vectors for variables
# Number of gridlines for x- and y-axis
grid.lines <- 21

# Create values for surface plots
water_range <- seq_range(x = data$water, n = grid.lines) # x-axis
temperature_range <- seq_range(data$temperature, n = grid.lines) # y-axis
fat_range <- unique(x = data$fat) # Column

# Create grid of variables
prediction_grid <- expand.grid(
                              temperature = temperature_range,
                              water = water_range,
                              fat = fat_range
                              ) 
# Create data frames of values for each combination of process parameters to be plotted
# 1x3 structure
# Plot 1: Row 1, Column 1
Grid1_Row1_Column1 <- prediction_grid %>% filter(fat == fat_range[1])
# Plot 2: Row 1, Column 2
Grid1_Row1_Column2 <- prediction_grid %>% filter(fat == fat_range[2])
# Plot 3: Row 1, Column 3
Grid1_Row1_Column3 <- prediction_grid %>% filter(fat == fat_range[3])

Description_Grid1_Row1_Column1 <- paste0(outcome_name, " of Milk\nwith ", fat_range[1], "% fat")
Description_Grid1_Row1_Column2 <- paste0(outcome_name, " of Milk\nwith ", fat_range[2], "% fat")
Description_Grid1_Row1_Column3 <- paste0(outcome_name, " of Milk\nwith ", fat_range[3], "% fat")

# Use selected model to make Predictions for each grid created
# Plot 1: Row 1, Column 1
Predictions_Grid1_Row1_Column1 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column1), nrow = grid.lines, ncol = grid.lines)
# Plot 2: Row 1, Column 2
Predictions_Grid1_Row1_Column2 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column2), nrow = grid.lines, ncol = grid.lines)
# Plot 3: Row 1, Column 3
Predictions_Grid1_Row1_Column3 <- matrix(predict(object = Model_Selected, newdata = Grid1_Row1_Column3), nrow = grid.lines, ncol = grid.lines)
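# Added sanity check (sketch, not part of the original workflow): expand.grid()
# varies its first argument fastest, so each block of grid.lines rows sweeps the
# full temperature range at a single water value. The matrix() reshapes above
# therefore place temperature along the rows (matching surf's x = temperature_range)
# and water along the columns (y = water_range).
stopifnot(
  all(prediction_grid$temperature[1:grid.lines] == temperature_range),
  length(unique(prediction_grid$water[1:grid.lines])) == 1
)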
# Multiple Plots
par(mfrow = c(1, 3)) # 1x3 grid
# Chart (1,1)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column1,
                      facets = NA),
          main = Description_Grid1_Row1_Column1)

# Chart (1,2)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column2,
                      facets = NA),
          main = Description_Grid1_Row1_Column2, 
          sub = paste0("R-squared = ", r.squared, "\n",
                       "Adjusted R-squared = ", adj.r.squared, "\n",
                       "Cross-validated R-squared = ", r.squared.LOOCV))

# Chart (1,3)
scatter3D(x = data$temperature,
          y = data$water,
          z = variable_vector,
          pch = 19, cex = 1.8, bty = "b2", phi = 20,
          theta = theta[[theta_index]],
          col = colour_range,
          ticktype = "detailed", colkey = FALSE, 
          xlab = "Temperature (°C)",
          ylab = "Water (%)",
          zlab = paste0(outcome_name, " ", units),
          xlim = c(min(data$temperature, na.rm = TRUE), max(data$temperature, na.rm = TRUE)),
          ylim = c(min(data$water, na.rm = TRUE), max(data$water, na.rm = TRUE)),
          zlim = range(min(variable_vector, na.rm = TRUE)*(1-z_expansion_factor), max(variable_vector, na.rm = TRUE)*(1+z_expansion_factor)),
          surf = list(x = temperature_range,
                      y = water_range,
                      z = Predictions_Grid1_Row1_Column3,
                      facets = NA),
          main = Description_Grid1_Row1_Column3)

# Delete obsolete models
rm(Model_BSS_NI1_LOOCV, Model_BSS_NI2_LOOCV, Model_BSS_NI3_LOOCV, Model_BSS_NI4_LOOCV, Model_BSS_NI5_LOOCV, Model_BSS_WI1_LOOCV, Model_BSS_WI2_LOOCV, Model_BSS_WI3_LOOCV, Model_BSS_WI4_LOOCV, Model_BSS_WI5_LOOCV, Model_BSS_QI1_LOOCV, Model_BSS_QI2_LOOCV, Model_BSS_QI3_LOOCV, Model_BSS_QI4_LOOCV, Model_BSS_QI5_LOOCV, Model_BSS_WI1, Model_BSS_WI2, Model_BSS_WI3, Model_BSS_WI4, Model_BSS_WI5, Model_BSS_QI1, Model_BSS_QI2, Model_BSS_QI3, Model_BSS_QI4, Model_BSS_QI5, Model_BSS_NI1, Model_BSS_NI2, Model_BSS_NI3, Model_BSS_NI4, Model_BSS_NI5, Model_Linear_NI, Model_Linear_NI_LOOCV, Model_Linear_WI, Model_Linear_WI_LOOCV, Model_Linear_QI, Model_Linear_QI_LOOCV, Models_BSS_NI, Models_BSS_WI, Model_Selected, BSS_formula, BSS_formula_QI,  Model_Selected_Description, Model_Selected_string, Model_Summary, formula_BSS, formula_Linear_NI, formula_Linear_WI, Model_Selected_auto_index, Model_Selected_auto)

# Delete obsolete objects
rm(g1, g2, g3, g, Best_Model_ID, Best_Model_Index, Best_Model_Type, colour_range, units, height, width, res, theta, theta_index, pointsize, i, r.squared, adj.r.squared, r.squared.LOOCV, z_expansion_factor, variable_name, variable_ID, Predictions_Grid1_Row1_Column1, Predictions_Grid1_Row1_Column2, Predictions_Grid1_Row1_Column3, outcome_ID, outcome_name, pred_obs_plot, residual_plot)

5 Summary

The fitted models show that the heat capacity, thermal conductivity and density of milk can be predicted reliably from its temperature, water content and fat content.

Temperature and water content appear to be the most influential predictors of heat capacity and thermal conductivity, while density is driven mainly by temperature.
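As a rough supplementary check of these statements (a minimal sketch, not part of the model-selection workflow above; it assumes the transformed CSV written in Section 2 is available in the working directory), the R-squared of single-predictor linear fits gives a quick indication of how much each variable explains on its own:

# Univariate R-squared screen: one simple linear model per predictor/outcome pair
data_screen <- read.csv("Data_Milk_Thermophysical_Properties_Transformed.csv")

outcomes   <- c("heat_capacity", "thermal_conductivity", "density")
predictors <- c("temperature", "water", "fat")

r2_screen <- sapply(outcomes, function(y) {
  sapply(predictors, function(x) {
    summary(lm(reformulate(x, response = y), data = data_screen))$r.squared
  })
})

round(r2_screen, 3)  # rows = predictors, columns = outcomes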


6 References

Minim, L. A.; Coimbra, J. S. R.; Minim, V. P. R.; Telis-Romero, J. Influence of Temperature and Water and Fat Contents on the Thermophysical Properties of Milk. Journal of Chemical & Engineering Data 2002, 47 (6), 1488-1491. DOI: 10.1021/je025546a