Packages:

library(tidyverse)
library(httr)
library(readxl)
library(DataExplorer)
library(psych)
library(knitr)
library(snakecase)
library(RColorBrewer)
library(VIM)
library(ggcorrplot)
library(caret)
library(randomForest)
library(cowplot)
library(car)
library(MASS)
select <- dplyr::select #MASS masks dplyr::select, so restore the dplyr version
library(earth)
library(rminer)
library(writexl)
cur_theme <- theme_set(theme_classic())
palette <- brewer.pal(n = 12, name = "Paired")
greys <- brewer.pal(n = 9, name = "Greys")

Introduction:

New regulations require ABC Beverage to understand our manufacturing process and the factors that predict PH. We need to report a predictive model of PH to leadership.

We load the historical dataset provided, as well as the evaluation dataset we will later make predictions on.

my_url1 <- "https://github.com/geedoubledee/data624_project2/raw/main/StudentData.xlsx"
temp <- tempfile(fileext = ".xlsx")
req <- GET(my_url1, authenticate(Sys.getenv("GITHUB_PAT"), ""),
           write_disk(path = temp))
main_df <- readxl::read_excel(temp)
colnames(main_df) <- to_screaming_snake_case(colnames(main_df))
my_url2 <- "https://github.com/geedoubledee/data624_project2/raw/main/StudentEvaluation.xlsx"
temp <- tempfile(fileext = ".xlsx")
req <- GET(my_url2, authenticate(Sys.getenv("GITHUB_PAT"), ""),
           write_disk(path = temp))
eval_df <- readxl::read_excel(temp)
colnames(eval_df) <- to_screaming_snake_case(colnames(eval_df))

Exploratory Data Analysis:

We take a look at the distribution of the response variable, PH, along with its summary statistics.

annotations <- data.frame(x = c(min(main_df$PH, na.rm = TRUE),
                                round(median(main_df$PH, na.rm = TRUE), 2),
                                max(main_df$PH, na.rm = TRUE)),
                          y = c(20, 340, 20),
                          label = c("Min:", "Median:", "Max:"))
p0 <- main_df |>
    ggplot(aes(x = PH)) +
    geom_histogram(binwidth = 0.05, color = palette[2], fill = palette[1]) +
    geom_text(data = annotations,
              aes(x = x, y = y, label = paste(label, x)),
              size = 4, fontface = "bold") +
    labs(y = "Frequency") + 
    scale_x_continuous(limits = c(7.5, 9.5), breaks = seq(7.5, 9.5, 0.5)) +
    scale_y_continuous(limits = c(0, 350), breaks = seq(0, 350, 25))
p0

summary(main_df$PH)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   7.880   8.440   8.540   8.546   8.680   9.360       4

The median PH value is 8.54, and PH ranges from 7.88 to 9.36; 50 percent of observations fall between 8.44 and 8.68, though. There are 4 observations with missing PH values. This is a small enough percentage of our total observations to justify simple list-wise deletion: we lose little by removing these observations, and we would gain little by imputing them.

main_df <- main_df |>
    filter(!is.na(PH))

We take a look at histograms of the numeric predictor variables, as well as scatterplots of each numeric predictor against the response. Because there are so many predictors, we work through them in batches.

non_numeric <- c("BRAND_CODE")
all_numeric <- colnames(main_df |> select(-all_of(c("PH", non_numeric))))
n <- 8 #batch size for the faceted plots
all_numeric_chunks <- split(all_numeric, ceiling(seq_along(all_numeric)/n))
remove <- c("PH", non_numeric)
pivot_df <- main_df |>
    select(-all_of(remove)) |>
    pivot_longer(cols = all_of(all_numeric), names_to = "PREDICTOR",
                 values_to = "VALUE")
p1a <- pivot_df |>
    filter(PREDICTOR %in% all_numeric_chunks[[1]]) |>
    ggplot(aes(x = VALUE)) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "CARB_PRESSURE"),
                   binwidth = 2) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "CARB_TEMP"),
                   binwidth = 2) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "CARB_VOLUME"),
                   binwidth = 0.04) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "FILL_OUNCES"),
                   binwidth = 0.04) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "PC_VOLUME"),
                   binwidth = 0.02) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "PSC"),
                   binwidth = 0.02) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "PSC_CO_2"),
                   binwidth = 0.02) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "PSC_FILL"),
                   binwidth = 0.04) +
    facet_wrap(vars(PREDICTOR), ncol = 4, scales = "free_x") +
    labs(y = "COUNT",
         title = "Batch 1 Predictor Distributions") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p1a

In the first batch of numeric predictors, we see that PSC, PSC_CO_2, and PSC_FILL are all right-skewed, and the distribution for CARB_VOLUME is multimodal. The distributions for the rest of the variables are nearly normal.

sel <- c("PH", all_numeric_chunks[[1]])
p1b <- main_df |>
    select(all_of(sel)) |>
    pivot_longer(cols = all_of(all_numeric_chunks[[1]]), names_to = "PREDICTOR",
                 values_to = "VALUE") |>
    ggplot(aes(x = VALUE, y = PH)) +
    geom_point(color = palette[2], fill = palette[1], alpha = 0.25) +
    geom_smooth(method = "lm", color = "black", linewidth = 0.5, se = FALSE) +
    facet_wrap(~PREDICTOR, ncol = 4, scales = "free_x") +
    labs(title = "Batch 1 Predictor vs. PH Scatterplots") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p1b

There are no linear relationships discernible in these scatterplots. There may be two clusters in CARB_VOLUME.

p2a <- pivot_df |>
    filter(PREDICTOR %in% all_numeric_chunks[[2]]) |>
    ggplot(aes(x = VALUE)) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "CARB_PRESSURE_1"),
                   binwidth = 2) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "FILL_PRESSURE"),
                   binwidth = 2) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "FILLER_LEVEL"),
                   binwidth = 6) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "HYD_PRESSURE_1"),
                   binwidth = 4) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "HYD_PRESSURE_2"),
                   binwidth = 4) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "HYD_PRESSURE_3"),
                   binwidth = 4) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "HYD_PRESSURE_4"),
                   binwidth = 6) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "MNF_FLOW"),
                   binwidth = 20) +
    facet_wrap(vars(PREDICTOR), ncol = 4, scales = "free_x") +
    labs(y = "COUNT",
         title = "Batch 2 Predictor Distributions") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p2a

In the second batch of numeric predictors, we see that HYD_PRESSURE_1, HYD_PRESSURE_2, and HYD_PRESSURE_3 are dominated by zero-value observations, which skews their distributions. Most observations for MNF_FLOW sit around -100, and its distribution might be degenerate; we will check this variable, and any others, for degeneracy shortly. FILL_PRESSURE and FILLER_LEVEL are multimodal. HYD_PRESSURE_4 is right-skewed. CARB_PRESSURE_1 has the only nearly normal distribution here.

sel <- c("PH", all_numeric_chunks[[2]])
p2b <- main_df |>
    select(all_of(sel)) |>
    pivot_longer(cols = all_of(all_numeric_chunks[[2]]), names_to = "PREDICTOR",
                 values_to = "VALUE") |>
    ggplot(aes(x = VALUE, y = PH)) +
    geom_point(color = palette[2], fill = palette[1], alpha = 0.25) +
    geom_smooth(method = "lm", color = "black", linewidth = 0.5, se = FALSE) +
    facet_wrap(~PREDICTOR, ncol = 4, scales = "free_x") +
    labs(title = "Batch 2 Predictor vs. PH Scatterplots") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p2b

Again, linear relationships are hard to discern from these scatterplots, but there may be a negative relationship between FILL_PRESSURE and PH and a positive relationship between FILLER_LEVEL and PH. There’s clustering in both variables.

p3a <- pivot_df |>
    filter(PREDICTOR %in% all_numeric_chunks[[3]]) |>
    ggplot(aes(x = VALUE)) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "BALLING"),
                   binwidth = 0.25) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "CARB_FLOW"),
                   binwidth = 400) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "DENSITY"),
                   binwidth = 0.1) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "FILLER_SPEED"),
                   binwidth = 200) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "MFR"),
                   binwidth = 50) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "PRESSURE_VACUUM"),
                   binwidth = 0.25) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "TEMPERATURE"),
                   binwidth = 1) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "USAGE_CONT"),
                   binwidth = 1) +
    facet_wrap(vars(PREDICTOR), ncol = 4, scales = "free_x") +
    labs(y = "COUNT",
         title = "Batch 3 Predictor Distributions") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p3a

In the third batch of numeric predictors, we see multimodal distributions for BALLING, CARB_FLOW, and DENSITY. FILLER_SPEED, MFR, and USAGE_CONT are left-skewed, and TEMPERATURE and PRESSURE_VACUUM are right-skewed.

sel <- c("PH", all_numeric_chunks[[3]])
p3b <- main_df |>
    select(all_of(sel)) |>
    pivot_longer(cols = all_of(all_numeric_chunks[[3]]), names_to = "PREDICTOR",
                 values_to = "VALUE") |>
    ggplot(aes(x = VALUE, y = PH)) +
    geom_point(color = palette[2], fill = palette[1], alpha = 0.25) +
    geom_smooth(method = "lm", color = "black", linewidth = 0.5, se = FALSE) +
    facet_wrap(~PREDICTOR, ncol = 4, scales = "free_x") +
    labs(title = "Batch 3 Predictor vs. PH Scatterplots") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p3b

We see clustering in BALLING, CARB_FLOW, and DENSITY. There may be a somewhat positive relationship between PRESSURE_VACUUM and PH, as well as a somewhat negative relationship between TEMPERATURE and PH.

p4a <- pivot_df |>
    filter(PREDICTOR %in% all_numeric_chunks[[4]]) |>
    ggplot(aes(x = VALUE)) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "AIR_PRESSURER"),
                   binwidth = 0.5) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "ALCH_REL"),
                   binwidth = 0.25) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "BALLING_LVL"),
                   binwidth = 0.25) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "BOWL_SETPOINT"),
                   binwidth = 10) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "CARB_REL"),
                   binwidth = 0.1) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "OXYGEN_FILLER"),
                   binwidth = 0.02) +
    geom_histogram(color = palette[2], fill = palette[1],
                   data = subset(pivot_df, PREDICTOR == "PRESSURE_SETPOINT"),
                   bins = 8) +
    facet_wrap(vars(PREDICTOR), ncol = 4, scales = "free_x") +
    labs(y = "COUNT",
         title = "Batch 4 Predictor Distributions") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p4a

In the last batch of numeric predictors, we see that AIR_PRESSURER and OXYGEN_FILLER are right-skewed. The distributions for ALCH_REL, BALLING_LVL, and PRESSURE_SETPOINT are multimodal. BOWL_SETPOINT is left-skewed. CARB_REL is the only variable for which the distribution is nearly normal, and that’s debatable.

sel <- c("PH", all_numeric_chunks[[4]])
p4b <- main_df |>
    select(all_of(sel)) |>
    pivot_longer(cols = all_of(all_numeric_chunks[[4]]), names_to = "PREDICTOR",
                 values_to = "VALUE") |>
    ggplot(aes(x = VALUE, y = PH)) +
    geom_point(color = palette[2], fill = palette[1], alpha = 0.25) +
    geom_smooth(method = "lm", color = "black", linewidth = 0.5, se = FALSE) +
    facet_wrap(~PREDICTOR, ncol = 4, scales = "free_x") +
    labs(title = "Batch 4 Predictor vs. PH Scatterplots") +
    theme(panel.spacing.x = unit(4, "mm"),
          axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
          plot.title.position = "plot")
p4b

We see clustering in AIR_PRESSURER, ALCH_REL, and BALLING_LVL.

Summary statistics for all numeric predictors are below.

remove <- c("n", "vars", "trimmed", "mad", "range", "se", "kurtosis")
describe <- main_df |>
    select(all_of(all_numeric)) |>
    describe() |>
    select(-all_of(remove))
knitr::kable(describe, format = "simple")
Predictor mean sd median min max skew
CARB_VOLUME 5.3703337 0.1063981 5.3466667 5.0400000 5.700 0.3904059
FILL_OUNCES 23.9749176 0.0874663 23.9733333 23.6333333 24.320 -0.0215410
PC_VOLUME 0.2772392 0.0605992 0.2713333 0.0793333 0.478 0.3468176
CARB_PRESSURE 68.1902677 3.5386086 68.2000000 57.0000000 79.400 0.1811752
CARB_TEMP 141.0922393 4.0340631 140.8000000 128.6000000 154.000 0.2427101
PSC 0.0846433 0.0492487 0.0760000 0.0020000 0.270 0.8504528
PSC_FILL 0.1952987 0.1177889 0.1800000 0.0000000 0.620 0.9352821
PSC_CO_2 0.0564399 0.0430641 0.0400000 0.0000000 0.240 1.7270393
MNF_FLOW 24.6269575 119.5013986 70.2000000 -100.2000000 229.400 0.0031327
CARB_PRESSURE_1 122.5704142 4.7272264 123.2000000 105.6000000 140.200 0.0429942
FILL_PRESSURE 47.9221656 3.1775457 46.4000000 34.6000000 60.400 0.5471107
HYD_PRESSURE_1 12.4571987 12.4330687 11.4000000 -0.8000000 58.000 0.7779346
HYD_PRESSURE_2 20.9935737 16.3784943 28.6000000 0.0000000 59.400 -0.3056277
HYD_PRESSURE_3 20.4778997 15.9714047 27.6000000 -1.2000000 50.000 -0.3210114
HYD_PRESSURE_4 96.3087830 13.0976498 96.0000000 62.0000000 142.000 0.5602427
FILLER_LEVEL 109.2523716 15.6984241 118.4000000 55.8000000 161.200 -0.8482847
FILLER_SPEED 3688.1066454 769.6282261 3982.0000000 998.0000000 4030.000 -2.8777117
TEMPERATURE 65.9648532 1.3790586 65.6000000 63.6000000 76.200 2.3920389
USAGE_CONT 20.9942155 2.9761958 21.7900000 12.0800000 25.900 -0.5351830
CARB_FLOW 2472.0530214 1070.4281545 3030.0000000 26.0000000 5104.000 -0.9916636
DENSITY 1.1744527 0.3769684 0.9800000 0.2400000 1.920 0.5311125
MFR 704.0492582 73.8983094 724.0000000 31.4000000 868.600 -5.0917729
BALLING 2.1998418 0.9295470 1.6480000 0.1600000 4.012 0.6004592
PRESSURE_VACUUM -5.2162057 0.5703665 -5.4000000 -6.6000000 -3.600 0.5258505
OXYGEN_FILLER 0.0464281 0.0450729 0.0334000 0.0024000 0.400 2.4147972
BOWL_SETPOINT 109.3450292 15.2891482 120.0000000 70.0000000 140.000 -0.9749161
PRESSURE_SETPOINT 47.6132290 2.0387546 46.0000000 44.0000000 52.000 0.2051072
AIR_PRESSURER 142.8339696 1.2127148 142.6000000 140.8000000 148.200 2.2512354
ALCH_REL 6.8978125 0.5052561 6.5600000 5.2800000 8.620 0.8830750
CARB_REL 5.4367956 0.1287629 5.4000000 4.9600000 6.060 0.5028431
BALLING_LVL 2.0516212 0.8688888 1.4800000 0.0000000 3.660 0.5943424

Now we check for degenerate distributions.

nzv_predictors <- nearZeroVar(main_df, names = TRUE, saveMetrics = FALSE)
nzv_predictors
## [1] "HYD_PRESSURE_1"

The only near-zero-variance predictor identified is HYD_PRESSURE_1. We remove this predictor from the historical and evaluation datasets.
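
By default, caret flags a predictor as near-zero-variance when the frequency ratio of its most common value to its second most common value is high (above 95/5) and its percentage of unique values is low (below 10). As a quick sketch (nzv_metrics is our name), those underlying metrics can be inspected directly:

nzv_metrics <- nearZeroVar(main_df, saveMetrics = TRUE) #freqRatio, percentUnique
nzv_metrics[nzv_metrics$nzv, ] #rows flagged as near-zero-variance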

main_df <- main_df |>
    select(-all_of(nzv_predictors))
eval_df <- eval_df |>
    select(-all_of(nzv_predictors))

Next we examine the dataset’s completeness.

remove <- c("discrete_columns", "continuous_columns", "total_observations",
            "memory_usage")
introduce <- main_df |>
    introduce() |>
    select(-all_of(remove))
knitr::kable(t(introduce), format = "simple")
rows 2567
columns 32
all_missing_columns 0
total_missing_values 801
complete_rows 2038

Only 2,038 out of 2,567 rows are complete, which is about 79 percent of observations. There are 801 missing values. None of our variables are completely NA.

We take a closer look at where the missing values are.

p5 <- plot_missing(main_df) #assumed: DataExplorer's missingness profile plot
p5 <- p5 +
    scale_fill_brewer(palette = "Paired") +
    theme(plot.title.position = "plot")
p5

MFR, BRAND_CODE, and FILLER_SPEED are the predictors with the most missing values, but many other predictors have missing values as well. We coerce BRAND_CODE to a factor and add a level for NA values to handle missingness in this categorical predictor. Then we look at the distribution of PH by BRAND_CODE level to determine whether there are differences in variation between groups, as well as outliers within groups.

main_df <- main_df |>
    mutate(BRAND_CODE = factor(BRAND_CODE, exclude = NULL))
palette <- brewer.pal(n = 12, name = "Paired")
col <- palette[c(2, 4, 6, 8, 10)]
fil <- palette[c(1, 3, 5, 7, 9)]
p6 <- main_df |>
    ggplot(aes(x = BRAND_CODE, y = PH, color = BRAND_CODE, fill = BRAND_CODE)) +
    geom_violin(trim = FALSE) +
    geom_boxplot(width = 0.1, fill = "white") +
    geom_text(aes(label = BRAND_CODE, color = BRAND_CODE), y = 7.5,
              vjust = -0.75, size = 4, fontface = "bold") +
    scale_y_continuous(limits = c(7.5, 9.5), breaks = seq(7.5, 9.5, 0.5)) +
    scale_color_manual(values = col) +
    scale_fill_manual(values = fil) +
    theme(legend.position = "none",
          axis.ticks = element_blank(),
          axis.text.x = element_blank(),
          axis.line = element_blank(),
          panel.grid.major.y = element_line(color = greys[3], linewidth = 0.25,
                                            linetype = 1))
p6

Level “D” has the highest median PH, level “C” has the only outlier on the high end, and all levels except the level representing NA values have outliers on the low end. Level “A” has the narrowest IQR, whereas the level representing NA values has the widest IQR.

We will perform KNN imputation for the numeric predictors with missing data. We will also create a secondary version of the data where we perform list-wise deletion instead. Before we handle this remaining missing data, we first look at correlations between our predictors and the response variable. Because we have so many variables, it would be difficult to visualize all correlations at once without binning them, so we bin absolute-value correlations into four groups: 1) 0.00 to 0.25, 2) 0.26 to 0.50, 3) 0.51 to 0.75, and 4) 0.76 to 1.00. We won't visualize any correlations below 0.26, but note that this doesn't imply those correlations are insignificant. Limiting what we examine most closely simply helps us home in on a) the predictor variables we should expect any good model we develop to include and b) the predictor variables that are so highly correlated with one another that they could inhibit certain models' performance.
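
To make the binning concrete, a small sketch (r_all and r_upper are our names) tabulates how many unique variable pairs fall into each absolute-value group before we plot them:

r_all <- model.matrix(~0+., data = main_df) |>
    cor(use = "pairwise.complete.obs")
r_upper <- r_all[upper.tri(r_all)] #each pair counted once
table(cut(abs(r_upper), breaks = c(0, 0.25, 0.5, 0.75, 1),
          labels = c("0.00-0.25", "0.26-0.50", "0.51-0.75", "0.76-1.00")))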

incl <- c("PH", sort(colnames(main_df |> select(-PH))))
palette <- brewer.pal(n = 7, name = "RdBu")[c(1, 4, 7)]
r <- model.matrix(~0+., data = main_df |> select(all_of(incl))) |>
    cor(use = "pairwise.complete.obs")
is.na(r) <- abs(r) > 0.5
is.na(r) <- abs(r) < 0.26
p7 <- r |>
    ggcorrplot(show.diag = FALSE, type = "lower", lab = TRUE, lab_size = 2,
               tl.cex = 8, tl.srt = 90,
               colors = palette, outline.color = "white") +
    labs(title = "Correlations Between 0.26 and 0.50 (Absolute Value)") +
    theme(plot.title.position = "plot")
p7

Here, we see the predictors that are most positively correlated with PH are BOWL_SETPOINT and FILLER_LEVEL, and the predictors that are most negatively correlated with PH are BRAND_CODE level “C”, FILL_PRESSURE, HYD_PRESSURE_3, MNF_FLOW, PRESSURE_SETPOINT, and USAGE_CONT. While some of the predictors in this plot are moderately correlated with each other, we will focus on higher/more worrisome predictor-predictor correlation levels in the following two plots.

r <- model.matrix(~0+., data = main_df |> select(all_of(incl))) |>
    cor(use = "pairwise.complete.obs")
is.na(r) <- abs(r) > 0.75
is.na(r) <- abs(r) < 0.51
p8 <- r |>
    ggcorrplot(show.diag = FALSE, type = "lower", lab = TRUE, lab_size = 2,
               tl.cex = 8, tl.srt = 90,
               colors = palette, outline.color = "white") +
    labs(title = "Correlations Between 0.51 and 0.75 (Absolute Value)") +
    theme(plot.title.position = "plot")
p8

Here, we notice immediately that PH is missing from the plot and is therefore not correlated with any predictors at a level between 0.51 and 0.75 in absolute value. Although we could comment on all these correlation levels, we see high (> 0.6) positive predictor-predictor correlations between:

We also see high (< -0.6) negative predictor-predictor correlations between:

r <- model.matrix(~0+., data = main_df |> select(all_of(incl))) |>
    cor(use = "pairwise.complete.obs")
is.na(r) <- abs(r) < 0.76
p9 <- r |>
    ggcorrplot(show.diag = FALSE, type = "lower", lab = TRUE, lab_size = 2,
               tl.cex = 8, tl.srt = 90,
               colors = palette, outline.color = "white") +
    labs(title = "Correlations Between 0.76 and 1.00 (Absolute Value)") +
    theme(plot.title.position = "plot")
p9

PH is again absent from the plot, so it is not correlated with any predictors at a level between 0.76 and 1.00 in absolute value. Although we could again comment on all of these correlations, we see extremely high (> 0.9) positive predictor-predictor correlations between

There are no extremely high (< -0.9) negative predictor-predictor correlations.

When creating models that are not robust to collinearity later, we will definitely need to exclude one or more variables in each extremely correlated group, and we may have to do the same for less (but still highly) correlated groups as well.

Data Preparation:

We will create three versions of the data: one in which missing numeric values have been imputed; one in which observations with missing numeric values have been deleted; and one in which missing numeric values have been imputed and skewed predictors have been transformed. First, we set a seed and split the data into train and test sets.

set.seed(417)
rows <- sample(nrow(main_df))
main_df <- main_df[rows, ]
sample <- sample(c(TRUE, FALSE), nrow(main_df), replace=TRUE,
                 prob=c(0.7,0.3))
train_df <- main_df[sample, ]
test_df <- main_df[!sample, ]

Next we perform KNN imputation on the missing numeric values for the primary train and test sets separately. To determine the variables this imputation method will use to calculate distance, and the weight given to each distance variable, we fit a random forest model to the training data, identify the 25 most important variables, and extract their variable importance scores. However, because including highly correlated variables in a random forest model depresses all of their variable importance scores, we first fit a full multiple linear regression model, check the variance inflation factors to identify any sources of multicollinearity, and eliminate problematic predictors from consideration.

mlr_model1 <- lm(PH ~ ., data = train_df)
mlr_model1_vif <- as.data.frame(vif(mlr_model1)) |>
    rownames_to_column()
cols <- c("PREDICTOR", "GVIF", "DF", "GVIF_ADJ_BY_DF")
colnames(mlr_model1_vif) <- cols
palette <- brewer.pal(n = 12, name = "Paired")
p10 <- mlr_model1_vif |>
    ggplot() +
    geom_col(aes(x = reorder(PREDICTOR, GVIF_ADJ_BY_DF), y = GVIF_ADJ_BY_DF),
             color = palette[2], fill = palette[1]) +
    geom_abline(intercept = 5, slope = 0, linewidth = 1, linetype = 2,
                color = palette[6]) +
    labs(x = "PREDICTOR",
         y = "Variance Inflation Factor") +
    scale_y_continuous(limits = c(0, 15), breaks = seq(0, 15, 2.5)) +
    coord_flip()
p10

The variables with variance inflation factors greater than five are BALLING_LVL, BALLING, ALCH_REL, CARB_PRESSURE, and CARB_TEMP. We remove ALCH_REL and BALLING from variable importance consideration for imputation because the information these variables provide is largely covered by BALLING_LVL, and we remove CARB_TEMP in favor of CARB_PRESSURE for the same reason.

rf_model1 <- randomForest(PH ~ . - ALCH_REL - BALLING - CARB_TEMP, data = train_df,
                          importance = TRUE,
                          ntree = 1000,
                          na.action = na.omit)
rf_imp1 <- varImp(rf_model1, scale = TRUE)
cols <- c("Predictor", "Importance")
rf_imp1 <- rf_imp1 |>
    rownames_to_column()
colnames(rf_imp1) <- cols
rf_imp1 <- rf_imp1 |>
    arrange(desc(Importance)) |>
    top_n(25)
knitr::kable(rf_imp1, format = "simple")
Predictor Importance
BRAND_CODE 62.546944
USAGE_CONT 45.431027
MNF_FLOW 44.120070
PRESSURE_VACUUM 40.833090
OXYGEN_FILLER 38.732888
CARB_REL 36.848281
BALLING_LVL 36.079182
TEMPERATURE 33.419371
DENSITY 31.057464
AIR_PRESSURER 30.507710
FILLER_SPEED 28.337795
FILLER_LEVEL 27.848629
CARB_FLOW 27.345460
BOWL_SETPOINT 23.776895
CARB_VOLUME 22.893358
HYD_PRESSURE_3 22.242977
CARB_PRESSURE_1 21.929859
HYD_PRESSURE_2 21.497959
HYD_PRESSURE_4 20.569592
FILL_PRESSURE 19.534525
MFR 17.209387
PC_VOLUME 16.985108
PRESSURE_SETPOINT 14.316484
CARB_PRESSURE 5.407045
FILL_OUNCES 4.167075

The above variables become the distance variables, and their variable importance scores become the weights in our KNN imputation. We perform said imputation on the primary train and test sets.

dist_vars <- rf_imp1$Predictor
wts <- rf_imp1$Importance
find_cols_na <- function(df){
    col_sums_na <- colSums(is.na(df))
    cols <- names(col_sums_na[col_sums_na > 0])
    cols #returns column names vector
}
missing_val_cols <- find_cols_na(train_df)
primary_train_df <- train_df |>
    VIM::kNN(variable = missing_val_cols, k = 15, dist_var = dist_vars,
             weights = wts, numFun = median, imp_var = FALSE)
missing_val_cols <- find_cols_na(test_df)
primary_test_df <- test_df |>
    VIM::kNN(variable = missing_val_cols, k = 15, dist_var = dist_vars,
             weights = wts, numFun = median, imp_var = FALSE)

Then we create secondary train and test sets where observations with missing numeric values have been deleted.

remove_rows_na <- function(df){
    row_has_na <- rowSums(is.na(df)) > 0
    df[!row_has_na, ] #returns copy without NA rows
}
secondary_train_df <- remove_rows_na(train_df)
secondary_test_df <- remove_rows_na(test_df)

Finally, we create tertiary train and test sets in which missing numeric values have been imputed and skewed predictors (excluding PRESSURE_VACUUM, since it takes negative values) have been transformed. Below is a breakdown of the ideal lambdas proposed by Box-Cox for these skewed predictors, along with the reasonable, more commonly understood transformations we will make instead:

tertiary_train_df <- primary_train_df
tertiary_test_df <- primary_test_df
skewed <- c("PSC", "PSC_CO_2", "PSC_FILL", "HYD_PRESSURE_4", "FILLER_SPEED",
            "MFR", "TEMPERATURE", "USAGE_CONT", "AIR_PRESSURER",
            "BOWL_SETPOINT", "OXYGEN_FILLER")
for (i in 1:(length(skewed))){
    #Add a small constant to columns with any 0 values
    if (sum(tertiary_train_df[[skewed[i]]] == 0) > 0){
        tertiary_train_df[[skewed[i]]] <-
            tertiary_train_df[[skewed[i]]] + 0.001
    }
}
lambdas <- c()
for (i in 1:(length(skewed))){
    bc <- boxcox(lm(tertiary_train_df[[skewed[i]]] ~ 1),
                 lambda = seq(-2, 2, length.out = 81),
                 plotit = FALSE)
    lambda <- bc$x[which.max(bc$y)]
    lambdas <- append(lambdas, lambda)
}
lambdas <- as.data.frame(cbind(skewed, lambdas))
adj <- c("square root", "square root", "square root", "none", "square",
         "square", "inverse square", "square", "inverse square", "square",
         "log")
lambdas <- cbind(lambdas, adj)
cols <- c("Variable", "Ideal Lambda Proposed by Box-Cox", "Reasonable Alternative Transformation")
colnames(lambdas) <- cols
kable(lambdas, format = "simple")
Variable Ideal Lambda Proposed by Box-Cox Reasonable Alternative Transformation
PSC 0.45 square root
PSC_CO_2 0.4 square root
PSC_FILL 0.45 square root
HYD_PRESSURE_4 -0.3 none
FILLER_SPEED 2 square
MFR 2 square
TEMPERATURE -2 inverse square
USAGE_CONT 2 square
AIR_PRESSURER -2 inverse square
BOWL_SETPOINT 2 square
OXYGEN_FILLER 0.25 log
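
For reference, a Box-Cox lambda corresponds to the power transformation (x^lambda - 1) / lambda, with lambda = 0 defined as log(x); that is why lambdas near 0.5, 2, and -2 map to square-root, square, and inverse-square transformations. A minimal sketch of that mapping (box_cox is our helper name):

box_cox <- function(x, lambda){
    #lambda = 0 is defined as the natural log
    if (lambda == 0) log(x) else (x^lambda - 1) / lambda
}
#e.g., lambda = 0.5 behaves like a shifted, scaled square root
head(box_cox(tertiary_train_df$PSC, 0.5))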

We make the transformations in the tertiary train and test sets, leaving in the lower order terms for anything we squared, but removing the original terms otherwise.

remove <- c("PSC", "PSC_CO_2", "PSC_FILL", "TEMPERATURE", "AIR_PRESSURER",
            "OXYGEN_FILLER")
tertiary_train_df <- tertiary_train_df |>
    mutate(sqrt_PSC = PSC^0.5,
           sqrt_PSC_CO_2 = PSC_CO_2^0.5,
           sqrt_PSC_FILL = PSC_FILL^0.5,
           FILLER_SPEED_sq = FILLER_SPEED^2,
           MFR_sq = MFR^2,
           inv_sq_TEMPERATURE = TEMPERATURE^-2,
           USAGE_CONT_sq = USAGE_CONT^2,
           inv_sq_AIR_PRESSURER = AIR_PRESSURER^-2,
           BOWL_SETPOINT_sq = BOWL_SETPOINT^2,
           log_OXYGEN_FILLER = log(OXYGEN_FILLER)) |>
    select(-all_of(remove))
for (i in 1:(length(skewed))){
    #Add a small constant to columns with any 0 values
    if (sum(tertiary_test_df[[skewed[i]]] == 0) > 0){
        tertiary_test_df[[skewed[i]]] <-
            tertiary_test_df[[skewed[i]]] + 0.001
    }
}
tertiary_test_df <- tertiary_test_df |>
    mutate(sqrt_PSC = PSC^0.5,
           sqrt_PSC_CO_2 = PSC_CO_2^0.5,
           sqrt_PSC_FILL = PSC_FILL^0.5,
           FILLER_SPEED_sq = FILLER_SPEED^2,
           MFR_sq = MFR^2,
           inv_sq_TEMPERATURE = TEMPERATURE^-2,
           USAGE_CONT_sq = USAGE_CONT^2,
           inv_sq_AIR_PRESSURER = AIR_PRESSURER^-2,
           BOWL_SETPOINT_sq = BOWL_SETPOINT^2,
           log_OXYGEN_FILLER = log(OXYGEN_FILLER)) |>
    select(-all_of(remove))

Model Building:

We build a few models in each of three regression categories: linear, nonlinear, and tree-based.

Linear Regression Models:

We start with Model LM:1, a linear model that includes all predictors and uses the primary training set (in which missing numeric values have been imputed). The Adjusted R-Squared for the full model is:

lm1 <- lm(PH ~ ., data=primary_train_df)
summary(lm1)$adj.r.squared
## [1] 0.4121808

We reduce Model LM:1 via step-wise AIC (Akaike Information Criterion) selection in both directions. A summary of the reduced model is below.

lm1 <- stepAIC(lm1, direction = "both", trace=FALSE)
summary(lm1)
## 
## Call:
## lm(formula = PH ~ BRAND_CODE + CARB_VOLUME + PSC + MNF_FLOW + 
##     CARB_PRESSURE_1 + HYD_PRESSURE_2 + HYD_PRESSURE_3 + FILLER_LEVEL + 
##     TEMPERATURE + USAGE_CONT + CARB_FLOW + DENSITY + BALLING + 
##     PRESSURE_VACUUM + OXYGEN_FILLER + BOWL_SETPOINT + PRESSURE_SETPOINT + 
##     ALCH_REL + BALLING_LVL, data = primary_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.52101 -0.07777  0.00976  0.08602  0.42617 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        8.988e+00  3.906e-01  23.014  < 2e-16 ***
## BRAND_CODEB        1.049e-01  2.653e-02   3.953 8.01e-05 ***
## BRAND_CODEC       -4.464e-02  2.618e-02  -1.706 0.088266 .  
## BRAND_CODED        6.708e-02  1.836e-02   3.654 0.000266 ***
## BRAND_CODENA       2.964e-02  2.969e-02   0.998 0.318189    
## CARB_VOLUME       -1.336e-01  5.287e-02  -2.527 0.011580 *  
## PSC               -1.340e-01  6.440e-02  -2.080 0.037626 *  
## MNF_FLOW          -6.764e-04  5.409e-05 -12.506  < 2e-16 ***
## CARB_PRESSURE_1    6.283e-03  8.182e-04   7.678 2.66e-14 ***
## HYD_PRESSURE_2    -1.094e-03  5.441e-04  -2.011 0.044480 *  
## HYD_PRESSURE_3     3.403e-03  6.640e-04   5.125 3.31e-07 ***
## FILLER_LEVEL      -1.036e-03  6.440e-04  -1.608 0.107909    
## TEMPERATURE       -1.430e-02  2.671e-03  -5.353 9.78e-08 ***
## USAGE_CONT        -5.385e-03  1.317e-03  -4.088 4.54e-05 ***
## CARB_FLOW          1.422e-05  3.823e-06   3.718 0.000207 ***
## DENSITY           -1.056e-01  3.282e-02  -3.219 0.001311 ** 
## BALLING           -1.209e-01  2.654e-02  -4.555 5.60e-06 ***
## PRESSURE_VACUUM   -2.694e-02  8.373e-03  -3.217 0.001317 ** 
## OXYGEN_FILLER     -2.901e-01  8.995e-02  -3.225 0.001285 ** 
## BOWL_SETPOINT      3.316e-03  6.662e-04   4.977 7.09e-07 ***
## PRESSURE_SETPOINT -7.734e-03  1.904e-03  -4.062 5.09e-05 ***
## ALCH_REL           6.825e-02  2.705e-02   2.523 0.011710 *  
## BALLING_LVL        1.735e-01  2.752e-02   6.305 3.64e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1313 on 1754 degrees of freedom
## Multiple R-squared:   0.42,  Adjusted R-squared:  0.4128 
## F-statistic: 57.74 on 22 and 1754 DF,  p-value: < 2.2e-16

The Adjusted R-Squared is essentially unchanged (0.4128 versus 0.4122 for the full model).
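
Although the fit statistics barely moved, the reduced model achieves that fit with fewer terms, which is what the AIC criterion rewards. A quick sketch of the comparison (lm1_full is our name for a refit of the pre-reduction model):

lm1_full <- lm(PH ~ ., data = primary_train_df) #refit the full model
AIC(lm1_full, lm1) #lower AIC indicates the better fit-versus-size trade-off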

Next we build Model LM:2, a linear model that includes all predictors and uses the secondary training set (in which observations with missing numeric values have been deleted). The Adjusted R-Squared for the full model is:

lm2 <- lm(PH ~ ., data=secondary_train_df)
summary(lm2)$adj.r.squared
## [1] 0.4314624

We reduce Model LM:2 using step-wise AIC selection in both directions again. A summary of the reduced model is below.

lm2 <- stepAIC(lm2, direction = "both", trace=FALSE)
summary(lm2)
## 
## Call:
## lm(formula = PH ~ BRAND_CODE + CARB_VOLUME + FILL_OUNCES + CARB_PRESSURE + 
##     CARB_TEMP + MNF_FLOW + CARB_PRESSURE_1 + HYD_PRESSURE_2 + 
##     HYD_PRESSURE_3 + HYD_PRESSURE_4 + TEMPERATURE + USAGE_CONT + 
##     CARB_FLOW + DENSITY + BALLING + PRESSURE_VACUUM + OXYGEN_FILLER + 
##     BOWL_SETPOINT + PRESSURE_SETPOINT + CARB_REL + BALLING_LVL, 
##     data = secondary_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.54066 -0.07215  0.00653  0.08647  0.43003 
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.305e+01  1.437e+00   9.085  < 2e-16 ***
## BRAND_CODEB        1.792e-01  3.740e-02   4.791 1.83e-06 ***
## BRAND_CODEC        2.057e-02  3.601e-02   0.571 0.568022    
## BRAND_CODED        1.079e-01  1.678e-02   6.429 1.74e-10 ***
## BRAND_CODENA       1.160e-01  4.023e-02   2.883 0.004001 ** 
## CARB_VOLUME       -3.947e-01  1.363e-01  -2.895 0.003844 ** 
## FILL_OUNCES       -9.681e-02  4.272e-02  -2.266 0.023594 *  
## CARB_PRESSURE      1.226e-02  6.420e-03   1.909 0.056421 .  
## CARB_TEMP         -9.288e-03  5.022e-03  -1.849 0.064594 .  
## MNF_FLOW          -6.262e-04  5.923e-05 -10.571  < 2e-16 ***
## CARB_PRESSURE_1    5.918e-03  9.018e-04   6.562 7.36e-11 ***
## HYD_PRESSURE_2    -1.463e-03  5.835e-04  -2.507 0.012284 *  
## HYD_PRESSURE_3     3.509e-03  7.331e-04   4.787 1.87e-06 ***
## HYD_PRESSURE_4     7.270e-04  4.817e-04   1.509 0.131453    
## TEMPERATURE       -1.874e-02  3.763e-03  -4.981 7.09e-07 ***
## USAGE_CONT        -6.878e-03  1.480e-03  -4.647 3.68e-06 ***
## CARB_FLOW          2.013e-05  5.337e-06   3.772 0.000168 ***
## DENSITY           -1.149e-01  3.567e-02  -3.222 0.001303 ** 
## BALLING           -1.885e-01  4.217e-02  -4.469 8.45e-06 ***
## PRESSURE_VACUUM   -4.973e-02  1.111e-02  -4.476 8.22e-06 ***
## OXYGEN_FILLER     -3.267e-01  1.062e-01  -3.076 0.002139 ** 
## BOWL_SETPOINT      2.323e-03  3.361e-04   6.911 7.18e-12 ***
## PRESSURE_SETPOINT -7.642e-03  2.080e-03  -3.673 0.000248 ***
## CARB_REL           1.090e-01  7.484e-02   1.456 0.145535    
## BALLING_LVL        3.049e-01  4.850e-02   6.288 4.26e-10 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.1283 on 1448 degrees of freedom
## Multiple R-squared:  0.4424, Adjusted R-squared:  0.4331 
## F-statistic: 47.86 on 24 and 1448 DF,  p-value: < 2.2e-16

The Adjusted R-Squared increased very slightly to 0.4331.

Next we build Model LM:3, a linear model that uses the tertiary training set (in which observations with missing numeric values have been imputed and skewed predictors have been transformed). The Adjusted R-Squared for the full model is:

lm3 <- lm(PH ~ ., data=tertiary_train_df)
summary(lm3)$adj.r.squared
## [1] 0.4158026

We reduce Model LM:3 using step-wise AIC selection in both directions again. A summary of the reduced model is below. (Note that BOWL_SETPOINT, which we squared during transformation, had to be manually added back to the model. If we had done the transformations within the model itself, the model would have kept all lower order terms by default during reduction, but since we transformed the data outside the model, we have to be more careful.)
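
As a hedged illustration of that in-model alternative, poly() bundles the linear and quadratic components into a single model term, so step-wise selection keeps or drops both orders together (lm3_alt is our illustrative name, not a model we carry forward):

lm3_alt <- lm(PH ~ poly(BOWL_SETPOINT, 2, raw = TRUE), data = tertiary_train_df)
summary(lm3_alt)$coefficients #both orders enter as one bundled term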

lm3 <- stepAIC(lm3, direction = "both", trace=FALSE)
lm3 <- update(lm3, . ~ . + BOWL_SETPOINT)
summary(lm3)
## 
## Call:
## lm(formula = PH ~ BRAND_CODE + CARB_VOLUME + PC_VOLUME + MNF_FLOW + 
##     CARB_PRESSURE_1 + HYD_PRESSURE_2 + HYD_PRESSURE_3 + USAGE_CONT + 
##     CARB_FLOW + DENSITY + MFR + BALLING + PRESSURE_VACUUM + PRESSURE_SETPOINT + 
##     ALCH_REL + BALLING_LVL + sqrt_PSC + MFR_sq + inv_sq_TEMPERATURE + 
##     USAGE_CONT_sq + BOWL_SETPOINT_sq + BOWL_SETPOINT, data = tertiary_train_df)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.51590 -0.07899  0.00931  0.08659  0.42401 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         7.061e+00  4.535e-01  15.568  < 2e-16 ***
## BRAND_CODEB         1.192e-01  2.670e-02   4.464 8.55e-06 ***
## BRAND_CODEC        -3.018e-02  2.643e-02  -1.142 0.253802    
## BRAND_CODED         6.711e-02  1.840e-02   3.647 0.000273 ***
## BRAND_CODENA        4.450e-02  2.981e-02   1.493 0.135657    
## CARB_VOLUME        -1.109e-01  5.298e-02  -2.094 0.036384 *  
## PC_VOLUME          -1.055e-01  6.258e-02  -1.686 0.091908 .  
## MNF_FLOW           -5.758e-04  4.930e-05 -11.680  < 2e-16 ***
## CARB_PRESSURE_1     5.962e-03  8.188e-04   7.281 4.97e-13 ***
## HYD_PRESSURE_2     -1.063e-03  5.489e-04  -1.936 0.053051 .  
## HYD_PRESSURE_3      3.423e-03  6.615e-04   5.174 2.55e-07 ***
## USAGE_CONT          6.737e-02  1.825e-02   3.691 0.000230 ***
## CARB_FLOW           1.063e-05  4.029e-06   2.639 0.008385 ** 
## DENSITY            -1.110e-01  3.312e-02  -3.352 0.000820 ***
## MFR                -4.212e-04  2.421e-04  -1.740 0.082122 .  
## BALLING            -1.100e-01  2.767e-02  -3.976 7.30e-05 ***
## PRESSURE_VACUUM    -2.741e-02  8.782e-03  -3.121 0.001829 ** 
## PRESSURE_SETPOINT  -6.978e-03  1.912e-03  -3.650 0.000270 ***
## ALCH_REL            6.038e-02  2.736e-02   2.207 0.027460 *  
## BALLING_LVL         1.729e-01  2.794e-02   6.186 7.66e-10 ***
## sqrt_PSC           -7.006e-02  3.817e-02  -1.835 0.066638 .  
## MFR_sq              3.939e-07  2.289e-07   1.721 0.085403 .  
## inv_sq_TEMPERATURE  2.249e+03  4.083e+02   5.509 4.15e-08 ***
## USAGE_CONT_sq      -1.848e-03  4.599e-04  -4.019 6.09e-05 ***
## BOWL_SETPOINT_sq    2.009e-05  1.481e-05   1.357 0.175084    
## BOWL_SETPOINT      -1.722e-03  3.015e-03  -0.571 0.567982    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.131 on 1751 degrees of freedom
## Multiple R-squared:  0.4242, Adjusted R-squared:  0.416 
## F-statistic:  51.6 on 25 and 1751 DF,  p-value: < 2.2e-16

The Adjusted R-Squared increased very slightly to 0.416, though BOWL_SETPOINT_sq is no longer significant. We leave it in anyway.

Looking at the significant predictors in the three linear models, there are some differences to note. FILL_OUNCES was deemed significant only in Model LM:2, PSC was deemed significant only in Model LM:1 (its square-root transform was merely marginal in Model LM:3), and ALCH_REL was retained in Models LM:1 and LM:3 but not in Model LM:2.

We examine diagnostic plots for the three linear models.

par(mfrow = c(1, 3))
plot(lm1, 1, main = "Model LM:1")
plot(lm2, 1, main = "Model LM:2")
plot(lm3, 1, main = "Model LM:3")

There’s a bit of a sigmoid pattern in the red line in the Residuals vs. Fitted Values plots for all three linear models, suggesting they share a fit issue.

par(mfrow = c(1, 3))
plot(lm1, 2, main = "Model LM:1")
plot(lm2, 2, main = "Model LM:2")
plot(lm3, 2, main = "Model LM:3")

The Q-Q plots diverge from the normal line at the low end for all three linear models.

par(mfrow = c(1, 3))
plot(lm1, 5, main = "Model LM:1")
plot(lm2, 5, main = "Model LM:2")
plot(lm3, 5, main = "Model LM:3")

No points fall outside the Cook’s distance contours in any of the linear models, so there are no highly influential observations to investigate.

We check for multicollinearity in the three linear models. (Only variance inflation factors greater than 4 are displayed for readability.)

palette <- brewer.pal(n = 12, name = "Paired")
lm1_vif <- as.data.frame(vif(lm1)) |>
    rownames_to_column()
lm2_vif <- as.data.frame(vif(lm2)) |>
    rownames_to_column()
lm3_vif <- as.data.frame(vif(lm3)) |>
    rownames_to_column()
cols <- c("PREDICTOR", "GVIF", "DF", "GVIF_ADJ_BY_DF")
colnames(lm1_vif) <- cols
colnames(lm2_vif) <- cols
colnames(lm3_vif) <- cols
p11a <- lm1_vif |>
    filter(GVIF_ADJ_BY_DF > 4) |>
    ggplot() +
    geom_col(aes(x = reorder(PREDICTOR, GVIF_ADJ_BY_DF), y = GVIF_ADJ_BY_DF),
             color = palette[2], fill = palette[1]) +
    geom_abline(intercept = 5, slope = 0, linewidth = 1, linetype = 2,
                color = palette[6]) +
    labs(x = "PREDICTOR",
         y = "Variance Inflation Factor",
         title = "Model LM:1") +
    scale_y_continuous(limits = c(0, 20), breaks = seq(0, 20, 2.5)) +
    coord_flip()
p11b <- lm2_vif |>
    filter(GVIF_ADJ_BY_DF > 4) |>
    ggplot() +
    geom_col(aes(x = reorder(PREDICTOR, GVIF_ADJ_BY_DF), y = GVIF_ADJ_BY_DF),
             color = palette[2], fill = palette[1]) +
    geom_abline(intercept = 5, slope = 0, linewidth = 1, linetype = 2,
                color = palette[6]) +
    labs(x = "PREDICTOR",
         y = "Variance Inflation Factor",
         title = "Model LM:2") +
    scale_y_continuous(limits = c(0, 20), breaks = seq(0, 20, 2.5)) +
    coord_flip()
p11c <- lm3_vif |>
    filter(GVIF_ADJ_BY_DF > 4) |>
    ggplot() +
    geom_col(aes(x = reorder(PREDICTOR, GVIF_ADJ_BY_DF), y = GVIF_ADJ_BY_DF),
             color = palette[2], fill = palette[1]) +
    geom_abline(intercept = 5, slope = 0, linewidth = 1, linetype = 2,
                color = palette[6]) +
    labs(x = "PREDICTOR",
         y = "Variance Inflation Factor",
         title = "Model LM:3") +
    scale_y_continuous(limits = c(0, 20), breaks = seq(0, 20, 2.5)) +
    coord_flip()
p11 <- plot_grid(p11a, p11b, p11c, ncol = 1, align = "v", axis = "l")
p11

There are different multicollinearity issues in each model. In Model LM:1, we remove BALLING and ALCH_REL because their information is largely covered by BALLING_LVL, as discussed previously. The same reasoning applies to removing BALLING in favor of BALLING_LVL, and CARB_TEMP in favor of CARB_PRESSURE, in Model LM:2. In Model LM:3, we expect inflated factors from including both lower- and higher-order terms of the same variables; that is intentional, so those can be ignored, and we only remove BALLING and ALCH_REL. After these adjustments, the final Adjusted R-Squared values for the three linear models are below:

lm1 <- update(lm1, . ~ . - BALLING - ALCH_REL)
lm2 <- update(lm2, . ~ . - BALLING - CARB_TEMP)
lm3 <- update(lm3, . ~ . - BALLING - ALCH_REL)
lms <- c("Model LM:1", "Model LM:2", "Model LM:3")
rsq <- c(summary(lm1)$adj.r.squared, summary(lm2)$adj.r.squared,
         summary(lm3)$adj.r.squared)
summ <- as.data.frame(cbind(lms, round(rsq, 4)))
cols <- c("Model", "Adjusted R-Squared")
colnames(summ) <- cols
knitr::kable(summ, format = "simple")
Model Adjusted R-Squared
Model LM:1 0.4054
Model LM:2 0.4245
Model LM:3 0.4107

Nonlinear Regression Models:

Next we build three versions of three types of nonlinear regression models, training each version on one of the three different training sets.

First, we tune three Multivariate Adaptive Regression Spline (MARS) models.

marsGrid <- expand.grid(.degree = 1:2, .nprune = 2:20)
mars1 <- train(primary_train_df |> select(-PH), primary_train_df$PH,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
mars2 <- train(secondary_train_df |> select(-PH), secondary_train_df$PH,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))
mars3 <- train(tertiary_train_df |> select(-PH), tertiary_train_df$PH,
                   method = "earth",
                   tuneGrid = marsGrid,
                   trControl = trainControl(method = "cv"))

A summary of the ideal tuning parameters and Generalized R-Squared values for the three MARS models is below:

mars_mods <- c("Model MARS:1", "Model MARS:2", "Model MARS:3")
nprune <- c(mars1$bestTune$nprune, mars2$bestTune$nprune, mars3$bestTune$nprune)
degree <- c(mars1$bestTune$degree, mars2$bestTune$degree, mars3$bestTune$degree)
grsq <- c(mars1$finalModel$grsq, mars2$finalModel$grsq, mars3$finalModel$grsq)
summ <- as.data.frame(cbind(mars_mods, nprune, degree, round(grsq, 4)))
cols <- c("Model", "nprune", "degree", "Generalized R-Squared")
colnames(summ) <- cols
knitr::kable(summ, format = "simple")
Model nprune degree Generalized R-Squared
Model MARS:1 16 2 0.4853
Model MARS:2 20 2 0.5296
Model MARS:3 14 2 0.4804
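
Here, nprune caps how many terms survive pruning, and degree = 2 permits products of two hinge functions. To see the retained hinge terms of the form h(x - c) and their coefficients, one can inspect a final earth model directly, e.g.:

summary(mars1$finalModel) #lists the selected hinge terms and coefficients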

And here are summaries of the estimated feature importance for the ten most important features in each of these MARS models:

mars1_feature_importance <- varImp(mars1, method = "gcv")
mars1_feature_importance <- mars1_feature_importance$importance |>
    arrange(desc(Overall)) |>
    rownames_to_column() |>
    top_n(10)
cols <- c("Predictor", "Model MARS:1 Importance")
colnames(mars1_feature_importance) <- cols
knitr::kable(mars1_feature_importance, format = "simple")
Predictor Model MARS:1 Importance
MNF_FLOW 100.000000
BRAND_CODEC 66.716165
HYD_PRESSURE_3 52.879172
AIR_PRESSURER 52.879172
BALLING 48.230435
ALCH_REL 43.999619
BOWL_SETPOINT 26.673886
TEMPERATURE 22.344526
USAGE_CONT 11.440172
CARB_PRESSURE_1 6.685723
mars2_feature_importance <- varImp(mars2, method = "gcv")
mars2_feature_importance <- mars2_feature_importance$importance |>
    arrange(desc(Overall)) |>
    rownames_to_column() |>
    top_n(10)
cols <- c("Predictor", "Model MARS:2 Importance")
colnames(mars2_feature_importance) <- cols
knitr::kable(mars2_feature_importance, format = "simple")
Predictor Model MARS:2 Importance
MNF_FLOW 100.00000
BRAND_CODEC 74.45417
HYD_PRESSURE_3 74.45417
USAGE_CONT 63.63574
FILLER_LEVEL 59.72071
ALCH_REL 54.45987
CARB_PRESSURE_1 50.84616
PRESSURE_VACUUM 45.07385
BALLING 41.47829
TEMPERATURE 36.53317
mars3_feature_importance <- varImp(mars3, method = "gcv")
mars3_feature_importance <- mars3_feature_importance$importance |>
    arrange(desc(Overall)) |>
    rownames_to_column() |>
    top_n(10)
cols <- c("Predictor", "Model MARS:3 Importance")
colnames(mars3_feature_importance) <- cols
knitr::kable(mars3_feature_importance, format = "simple")
Predictor Model MARS:3 Importance
MNF_FLOW 100.00000
inv_sq_AIR_PRESSURER 100.00000
BRAND_CODEC 58.96943
BALLING 44.67704
inv_sq_TEMPERATURE 44.67704
FILLER_SPEED_sq 41.81319
USAGE_CONT 36.39819
CARB_PRESSURE_1 30.52184
PRESSURE_VACUUM 30.52184
BRAND_CODED 23.80526
BOWL_SETPOINT_sq 23.80526

Next, we tune three K Nearest Neighbors (KNN) models.

ctrl <- trainControl(method = "repeatedcv", repeats = 3)
knn1 <- train(PH ~ ., data = primary_train_df, method = "knn",
              trControl = ctrl, preProcess = c("center", "scale"),
              tuneLength = 20)
knn2 <- train(PH ~ ., data = secondary_train_df, method = "knn",
              trControl = ctrl, preProcess = c("center", "scale"),
              tuneLength = 20)
knn3 <- train(PH ~ ., data = tertiary_train_df, method = "knn",
              trControl = ctrl, preProcess = c("center", "scale"),
              tuneLength = 20)

A summary of the ideal k and R-Squared values for the three KNN models is below:

knn_mods <- c("Model KNN:1", "Model KNN:2", "Model KNN:3")
k <- c(knn1$bestTune$k, knn2$bestTune$k, knn3$bestTune$k)
rsq <- c(knn1$results |> filter(k == knn1$bestTune$k) |>
             select(Rsquared) |> as.numeric(),
         knn2$results |> filter(k == knn2$bestTune$k) |>
             select(Rsquared) |> as.numeric(),
         knn3$results |> filter(k == knn3$bestTune$k) |>
             select(Rsquared) |> as.numeric())
summ <- as.data.frame(cbind(knn_mods, k, round(rsq, 4)))
cols <- c("Model", "k", "R-Squared")
colnames(summ) <- cols
knitr::kable(summ, format = "simple")
Model k R-Squared
Model KNN:1 7 0.5137
Model KNN:2 7 0.5543
Model KNN:3 7 0.49

The ideal k was 7 for all three models, so the different training sets did not change the optimal neighborhood size. Below are summaries of the estimated feature importance for the ten most important features in each of these KNN models:
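
These importance estimates come from permutation: we shuffle one predictor at a time in the test set and record how much the test R-Squared drops relative to predictions on the unshuffled data. The three blocks below inline that routine; as a compact, reusable sketch (perm_importance is our name):

perm_importance <- function(model, test_df, response = "PH"){
    test_x <- test_df |> select(-all_of(response))
    base_rsq <- as.numeric(R2(predict(model, test_x), test_df[[response]],
                              form = "traditional"))
    sapply(colnames(test_x), function(f){
        shuffled <- test_x
        shuffled[[f]] <- sample(shuffled[[f]]) #break this predictor's signal
        base_rsq - as.numeric(R2(predict(model, shuffled), test_df[[response]],
                                 form = "traditional"))
    })
}
#e.g., perm_importance(knn1, primary_test_df) reproduces the first table
#below, up to shuffle randomness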

orig_test_pred <- predict(knn1, primary_test_df |> select(-PH))
orig_pred_rsq <- as.numeric(R2(orig_test_pred, primary_test_df |> select(PH),
                               form = "traditional"))
features <- colnames(primary_test_df |> select(-PH))
feature_importance <- rep(0, length(features))
names(feature_importance) <- features
for (f in 1:length(features)){
    test_x_shuffled <- primary_test_df |> select(-PH)
    rows <- sample(nrow(test_x_shuffled))
    test_x_shuffled[, f] <- test_x_shuffled[rows, f]
    new_test_pred <- predict(knn1, test_x_shuffled)
    new_pred_rsq <- as.numeric(R2(new_test_pred, primary_test_df |> select(PH),
                                  form = "traditional"))
    feature_importance[f] <- orig_pred_rsq - new_pred_rsq
}
feature_importance <- sort(feature_importance, decreasing = TRUE) |>
    as.data.frame() |>
    rownames_to_column() |>
    top_n(10)
cols <- c("Predictor", "Model KNN:1 Importance")
colnames(feature_importance) <-  cols
knitr::kable(feature_importance, format = "simple")
Predictor Model KNN:1 Importance
BRAND_CODE 0.2173181
AIR_PRESSURER 0.0647417
USAGE_CONT 0.0526153
OXYGEN_FILLER 0.0510969
BOWL_SETPOINT 0.0480915
MNF_FLOW 0.0378647
TEMPERATURE 0.0352188
CARB_FLOW 0.0339325
PRESSURE_VACUUM 0.0333629
FILLER_LEVEL 0.0298235
orig_test_pred <- predict(knn2, secondary_test_df |> select(-PH))
orig_pred_rsq <- as.numeric(R2(orig_test_pred, secondary_test_df |> select(PH),
                               form = "traditional"))
features <- colnames(secondary_test_df |> select(-PH))
feature_importance <- rep(0, length(features))
names(feature_importance) <- features
for (f in 1:length(features)){
    test_x_shuffled <- secondary_test_df |> select(-PH)
    rows <- sample(nrow(test_x_shuffled))
    test_x_shuffled[, f] <- test_x_shuffled[rows, f]
    new_test_pred <- predict(knn2, test_x_shuffled)
    new_pred_rsq <- as.numeric(R2(new_test_pred,
                                  secondary_test_df |> select(PH),
                                  form = "traditional"))
    feature_importance[f] <- orig_pred_rsq - new_pred_rsq
}
feature_importance <- sort(feature_importance, decreasing = TRUE) |>
    as.data.frame() |>
    rownames_to_column() |>
    top_n(10)
cols <- c("Predictor", "Model KNN:2 Importance")
colnames(feature_importance) <-  cols
knitr::kable(feature_importance, format = "simple")
Predictor Model KNN:2 Importance
BRAND_CODE 0.1998780
OXYGEN_FILLER 0.0563566
PRESSURE_VACUUM 0.0505829
CARB_FLOW 0.0454219
FILLER_LEVEL 0.0439370
TEMPERATURE 0.0407374
AIR_PRESSURER 0.0380072
BOWL_SETPOINT 0.0352910
CARB_PRESSURE_1 0.0322369
USAGE_CONT 0.0303735
orig_test_pred <- predict(knn3, tertiary_test_df |> select(-PH))
orig_pred_rsq <- as.numeric(R2(orig_test_pred, tertiary_test_df |> select(PH),
                               form = "traditional"))
features <- colnames(tertiary_test_df |> select(-PH))
feature_importance <- rep(0, length(features))
names(feature_importance) <- features
for (f in 1:length(features)){
    test_x_shuffled <- tertiary_test_df |> select(-PH)
    rows <- sample(nrow(test_x_shuffled))
    test_x_shuffled[, f] <- test_x_shuffled[rows, f]
    new_test_pred <- predict(knn3, test_x_shuffled)
    new_pred_rsq <- as.numeric(R2(new_test_pred,
                                  tertiary_test_df |> select(PH),
                                  form = "traditional"))
    feature_importance[f] <- orig_pred_rsq - new_pred_rsq
}
feature_importance <- sort(feature_importance, decreasing = TRUE) |>
    as.data.frame() |>
    rownames_to_column() |>
    top_n(10)
cols <- c("Predictor", "Model KNN:3 Importance")
colnames(feature_importance) <-  cols
knitr::kable(feature_importance, format = "simple")
Predictor Model KNN:3 Importance
BRAND_CODE 0.2014817
inv_sq_AIR_PRESSURER 0.0609561
PRESSURE_VACUUM 0.0347035
log_OXYGEN_FILLER 0.0324806
MNF_FLOW 0.0310678
CARB_FLOW 0.0305449
CARB_PRESSURE_1 0.0285600
BOWL_SETPOINT 0.0279335
USAGE_CONT 0.0229364
inv_sq_TEMPERATURE 0.0188264

Next we train three Support Vector Machine: Radial Basis (SVM:RB) models. (Note that factors have to be one-hot encoded into a numeric design matrix before fitting these models.)

mm1 <- model.matrix(~0+., data = primary_train_df |> select(-PH))
svmRB1Tuned <- train(mm1, primary_train_df$PH,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
mm2 <- model.matrix(~0+., data = secondary_train_df |> select(-PH))
svmRB2Tuned <- train(mm2, secondary_train_df$PH,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))
mm3 <- model.matrix(~0+., data = tertiary_train_df |> select(-PH))
svmRB3Tuned <- train(mm3, tertiary_train_df$PH,
                   method = "svmRadial",
                   preProc = c("center", "scale"),
                   tuneLength = 14,
                   trControl = trainControl(method = "cv"))

A summary of the ideal tuning parameters and R-Squared values for the three SVM:RB models is below:

svm_mods <- c("Model SVM:RB:1", "Model SVM:RB:2", "Model SVM:RB:3")
sigmas <- c(svmRB1Tuned$bestTune$sigma, svmRB2Tuned$bestTune$sigma, 
            svmRB3Tuned$bestTune$sigma)
cs <- c(svmRB1Tuned$bestTune$C, svmRB2Tuned$bestTune$C, 
            svmRB3Tuned$bestTune$C)
rsq <- c(svmRB1Tuned$results |> filter(C == svmRB1Tuned$bestTune$C) |>
             select(Rsquared) |> as.numeric(),
         svmRB2Tuned$results |> filter(C == svmRB2Tuned$bestTune$C) |>
             select(Rsquared) |> as.numeric(),
         svmRB3Tuned$results |> filter(C == svmRB3Tuned$bestTune$C) |>
             select(Rsquared) |> as.numeric())
summ <- as.data.frame(cbind(svm_mods, round(sigmas, 4), cs, round(rsq, 4)))
cols <- c("Model", "sigma", "C", "R-Squared")
colnames(summ) <- cols
knitr::kable(summ, format = "simple")
Model sigma C R-Squared
Model SVM:RB:1 0.02 8 0.532
Model SVM:RB:2 0.0203 8 0.5757
Model SVM:RB:3 0.0167 8 0.5393

The ideal C was 8 for all three SVM models. Below are summaries of the estimated feature importance, derived from rminer’s sensitivity analysis, for the most influential features (sensitivity of at least 0.045) in each of these SVM:RB models:

Model SVM:RB:1 Importance

y <- primary_train_df$PH
dat <- cbind(primary_train_df |> select(-PH), y)
# Refit the SVM with the tuned parameters via rminer so we can run its
# 1-D sensitivity analysis for feature importance
svmRB1Fit <- fit(y ~ ., data = dat, model = "svm",
                 kpar = list(sigma = 0.02), C = 8)
svmRB1.imp <- Importance(svmRB1Fit, data = dat)
L <- list(runs = 1, sen = t(svmRB1.imp$imp),
          sresponses = svmRB1.imp$sresponses)
# Keep only the features with a sensitivity of at least 0.045
sen_vec <- as.numeric(L[["sen"]])
copy <- L
delete <- which(sen_vec < 0.045)
if (length(delete) > 0){
    copy[["sen"]] <- t(as.matrix(copy[["sen"]][, -delete]))
    copy[["sresponses"]] <- copy[["sresponses"]][-delete]
}
leg_names <- sapply(copy[["sresponses"]], function(s) s[["n"]])
mgraph(copy, graph = "IMP", leg = leg_names, col = "gray",
       PDF = "")

Model SVM:RB:2 Importance

y <- secondary_train_df$PH
dat <- cbind(secondary_train_df |> select(-PH), y)
# Refit the SVM with the tuned parameters via rminer so we can run its
# 1-D sensitivity analysis for feature importance
svmRB2Fit <- fit(y ~ ., data = dat, model = "svm",
                 kpar = list(sigma = 0.0203), C = 8)
svmRB2.imp <- Importance(svmRB2Fit, data = dat)
L <- list(runs = 1, sen = t(svmRB2.imp$imp),
          sresponses = svmRB2.imp$sresponses)
# Keep only the features with a sensitivity of at least 0.045
sen_vec <- as.numeric(L[["sen"]])
copy <- L
delete <- which(sen_vec < 0.045)
if (length(delete) > 0){
    copy[["sen"]] <- t(as.matrix(copy[["sen"]][, -delete]))
    copy[["sresponses"]] <- copy[["sresponses"]][-delete]
}
leg_names <- sapply(copy[["sresponses"]], function(s) s[["n"]])
mgraph(copy, graph = "IMP", leg = leg_names, col = "gray",
       PDF = "")

Model SVM:RB:3 Importance

y <- tertiary_train_df$PH
dat <- cbind(tertiary_train_df |> select(-PH), y)
# Refit the SVM with the tuned parameters via rminer so we can run its
# 1-D sensitivity analysis for feature importance
svmRB3Fit <- fit(y ~ ., data = dat, model = "svm",
                 kpar = list(sigma = 0.0167), C = 8)
svmRB3.imp <- Importance(svmRB3Fit, data = dat)
L <- list(runs = 1, sen = t(svmRB3.imp$imp),
          sresponses = svmRB3.imp$sresponses)
# Keep only the features with a sensitivity of at least 0.045
sen_vec <- as.numeric(L[["sen"]])
copy <- L
delete <- which(sen_vec < 0.045)
if (length(delete) > 0){
    copy[["sen"]] <- t(as.matrix(copy[["sen"]][, -delete]))
    copy[["sresponses"]] <- copy[["sresponses"]][-delete]
}
leg_names <- sapply(copy[["sresponses"]], function(s) s[["n"]])
mgraph(copy, graph = "IMP", leg = leg_names, col = "gray",
       PDF = "")

Tree Models:

Finally, we train three single regression tree models using caret's rpart2 method, which tunes the maximum depth of the tree; a sketch of fitting a depth-limited tree directly follows.
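
For intuition on the maxdepth parameter that rpart2 tunes, a depth-limited tree can be fit directly with rpart (which caret wraps for this method). This is a sketch for illustration only, not part of our model selection:

library(rpart)
# Cap the tree at 5 levels of splits; rpart2 searches over such caps
tree_sketch <- rpart(PH ~ ., data = primary_train_df,
                     control = rpart.control(maxdepth = 5))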

rpart1Tune <- train(primary_train_df |> select(-PH), primary_train_df$PH,
                   method = "rpart2",
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))
rpart2Tune <- train(secondary_train_df |> select(-PH), secondary_train_df$PH,
                   method = "rpart2",
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))
rpart3Tune <- train(tertiary_train_df |> select(-PH), tertiary_train_df$PH,
                   method = "rpart2",
                   tuneLength = 10,
                   trControl = trainControl(method = "cv"))

A summary of the best maxdepth and cross-validated R-Squared for each of these tree models is below:

tree_mods <- c("Model Tree:1", "Model Tree:2", "Model Tree:3")
maxdepth <- c(rpart1Tune$bestTune$maxdepth, rpart2Tune$bestTune$maxdepth,
              rpart3Tune$bestTune$maxdepth)
rsq <- c(rpart1Tune$results |>
             filter(maxdepth == rpart1Tune$bestTune$maxdepth) |>
             select(Rsquared) |> as.numeric(),
         rpart2Tune$results |>
             filter(maxdepth == rpart2Tune$bestTune$maxdepth) |>
             select(Rsquared) |> as.numeric(),
         rpart3Tune$results |>
             filter(maxdepth == rpart3Tune$bestTune$maxdepth) |>
             select(Rsquared) |> as.numeric())
summ <- as.data.frame(cbind(tree_mods, maxdepth, round(rsq, 4)))
cols <- c("Model", "maxdepth", "R-Squared")
colnames(summ) <- cols
knitr::kable(summ, format = "simple")
Model          maxdepth   R-Squared
-------------  ---------  ----------
Model Tree:1   11         0.4738
Model Tree:2   11         0.4115
Model Tree:3   11         0.4659

All three tree models had a best maxdepth of 11. Below are summaries of the estimated feature importance for the ten most important features in each of these tree models:
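
These scores come from caret's varImp, which for rpart fits is based on the reduction in the loss function attributable to each variable across the tree's splits, rescaled so the top predictor is 100. The fitted tree also stores rpart's own unscaled measure, which can be inspected directly, e.g.:

# rpart's unscaled variable importance for the first tree model
rpart1Tune$finalModel$variable.importance |> head(10)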

feature_importance <- varImp(rpart1Tune)$importance |>
    rownames_to_column() |>
    arrange(desc(Overall)) |>
    slice_head(n = 10)
cols <- c("Predictor", "Model Tree:1 Importance")
colnames(feature_importance) <- cols
knitr::kable(feature_importance, format = "simple")
Predictor          Model Tree:1 Importance
----------------  ------------------------
BRAND_CODE                       100.00000
MNF_FLOW                          79.72742
HYD_PRESSURE_3                    76.87580
FILLER_SPEED                      76.26406
CARB_PRESSURE_1                   74.44439
USAGE_CONT                        71.98577
PC_VOLUME                         69.37967
OXYGEN_FILLER                     65.27942
CARB_REL                          57.61015
FILLER_LEVEL                      49.09769
feature_importance <- varImp(rpart2Tune)$importance |>
    rownames_to_column() |>
    arrange(desc(Overall)) |>
    slice_head(n = 10)
cols <- c("Predictor", "Model Tree:2 Importance")
colnames(feature_importance) <- cols
knitr::kable(feature_importance, format = "simple")
Predictor          Model Tree:2 Importance
----------------  ------------------------
PC_VOLUME                        100.00000
MNF_FLOW                          81.18041
CARB_REL                          81.13728
BRAND_CODE                        80.01114
BOWL_SETPOINT                     76.62495
FILLER_LEVEL                      76.55623
TEMPERATURE                       67.52719
CARB_PRESSURE_1                   66.97522
ALCH_REL                          63.45126
OXYGEN_FILLER                     51.02602
feature_importance <- varImp(rpart3Tune)$importance |>
    rownames_to_column() |>
    arrange(desc(Overall)) |>
    slice_head(n = 10)
cols <- c("Predictor", "Model Tree:3 Importance")
colnames(feature_importance) <- cols
knitr::kable(feature_importance, format = "simple")
Predictor            Model Tree:3 Importance
------------------  ------------------------
BRAND_CODE                         100.00000
FILLER_SPEED                        82.86299
FILLER_SPEED_sq                     82.86299
MNF_FLOW                            81.29959
USAGE_CONT                          78.21451
USAGE_CONT_sq                       78.21451
HYD_PRESSURE_3                      77.57268
PC_VOLUME                           75.38292
CARB_PRESSURE_1                     73.52001
log_OXYGEN_FILLER                   70.92788

Final Model Selection:

We will select the final model by looking at the models’ Predictive R-Squared and Root Mean Squared Error (RMSE) measures using the test data we held out from the primary, secondary, and tertiary datasets. For each test dataset, we will select the model with the lowest RMSE (and hopefully the highest Predictive R-Squared). From that reduced selection of models, we will select the most appropriate model for the evaluation data, a determination that requires consideration of both the model itself and what changes were made to its underlying training data.
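
Both metrics follow the standard definitions: RMSE = sqrt(mean((obs - pred)^2)), and the traditional Predictive R-Squared is 1 - SSE/SST, which can go negative when a model predicts worse than the test-set mean. Each per-model block below repeats the same pattern, which could be wrapped in a helper like this sketch (the function name is ours; the SVM models would pass their model.matrix as newdata):

test_metrics <- function(model, test_df, newdata = test_df |> select(-PH)){
    pred <- predict(model, newdata)
    c(rsq = as.numeric(R2(pred, test_df$PH, form = "traditional")),
      rmse = as.numeric(RMSE(pred, test_df$PH)))
}
# e.g.: test_metrics(knn1, primary_test_df)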

Below is a test data performance summary for all models operating on the primary dataset:

test_pred1 <- predict(lm1, primary_test_df |> select(-PH))
test_rsq1 <- as.numeric(R2(test_pred1, primary_test_df$PH, form = "traditional"))
test_rmse1 <- as.numeric(RMSE(test_pred1, primary_test_df$PH))
row1 <- cbind("Model LM:1",
              as.character(round(test_rsq1, 4)),
              as.character(round(test_rmse1, 4)))
test_pred2 <- predict(mars1, primary_test_df |> select(-PH))
test_rsq2 <- as.numeric(R2(test_pred2, primary_test_df$PH, form = "traditional"))
test_rmse2 <- as.numeric(RMSE(test_pred2, primary_test_df$PH))
row2 <- cbind("Model MARS:1",
              as.character(round(test_rsq2, 4)),
              as.character(round(test_rmse2, 4)))
test_pred3 <- predict(knn1, primary_test_df |> select(-PH))
test_rsq3 <- as.numeric(R2(test_pred3, primary_test_df$PH, form = "traditional"))
test_rmse3 <- as.numeric(RMSE(test_pred3, primary_test_df$PH))
row3 <- cbind("Model KNN:1",
              as.character(round(test_rsq3, 4)),
              as.character(round(test_rmse3, 4)))
mm_test <- model.matrix(~0+., data = primary_test_df |> select(-PH))
test_pred4 <- predict(svmRB1Tuned, mm_test)
test_rsq4 <- as.numeric(R2(test_pred4, primary_test_df$PH, form = "traditional"))
test_rmse4 <- as.numeric(RMSE(test_pred4, primary_test_df$PH))
row4 <- cbind("Model SVM:RB:1",
              as.character(round(test_rsq4, 4)),
              as.character(round(test_rmse4, 4)))
test_pred5 <- predict(rpart1Tune, primary_test_df |> select(-PH))
test_rsq5 <- as.numeric(R2(test_pred5, primary_test_df$PH, form = "traditional"))
test_rmse5 <- as.numeric(RMSE(test_pred5, primary_test_df$PH))
row5 <- cbind("Model Tree:1",
              as.character(round(test_rsq5, 4)),
              as.character(round(test_rmse5, 4)))
tbl <- as.data.frame(rbind(row1, row2, row3, row4, row5))
cols <- c("Model", "Predictive R-Squared", "RMSE")
colnames(tbl) <- cols
knitr::kable(tbl, format = "simple")
Model            Predictive R-Squared   RMSE
---------------  ---------------------  -------
Model LM:1       0.3984                 0.1357
Model MARS:1     0.4532                 0.1294
Model KNN:1      0.4651                 0.128
Model SVM:RB:1   0.567                  0.1151
Model Tree:1     0.4432                 0.1306

Model SVM:RB:1 has the lowest RMSE and highest Predictive R-Squared on the primary dataset.

Below is a test data performance summary for all models operating on the secondary dataset:

test_pred1 <- predict(lm2, secondary_test_df |> select(-PH))
test_rsq1 <- as.numeric(R2(test_pred1, secondary_test_df$PH, form = "traditional"))
test_rmse1 <- as.numeric(RMSE(test_pred1, secondary_test_df$PH))
row1 <- cbind("Model LM:2",
              as.character(round(test_rsq1, 4)),
              as.character(round(test_rmse1, 4)))
test_pred2 <- predict(mars2, secondary_test_df |> select(-PH))
test_rsq2 <- as.numeric(R2(test_pred2, secondary_test_df$PH, form = "traditional"))
test_rmse2 <- as.numeric(RMSE(test_pred2, secondary_test_df$PH))
row2 <- cbind("Model MARS:2",
              as.character(round(test_rsq2, 4)),
              as.character(round(test_rmse2, 4)))
test_pred3 <- predict(knn2, secondary_test_df |> select(-PH))
test_rsq3 <- as.numeric(R2(test_pred3, secondary_test_df$PH, form = "traditional"))
test_rmse3 <- as.numeric(RMSE(test_pred3, secondary_test_df$PH))
row3 <- cbind("Model KNN:2",
              as.character(round(test_rsq3, 4)),
              as.character(round(test_rmse3, 4)))
mm_test <- model.matrix(~0+., data = secondary_test_df |> select(-PH))
test_pred4 <- predict(svmRB2Tuned, mm_test)
test_rsq4 <- as.numeric(R2(test_pred4, secondary_test_df$PH, form = "traditional"))
test_rmse4 <- as.numeric(RMSE(test_pred4, secondary_test_df$PH))
row4 <- cbind("Model SVM:RB:2",
              as.character(round(test_rsq4, 4)),
              as.character(round(test_rmse4, 4)))
test_pred5 <- predict(rpart2Tune, secondary_test_df |> select(-PH))
test_rsq5 <- as.numeric(R2(test_pred5, secondary_test_df$PH, form = "traditional"))
test_rmse5 <- as.numeric(RMSE(test_pred5, secondary_test_df$PH))
row5 <- cbind("Model Tree:2",
              as.character(round(test_rsq5, 4)),
              as.character(round(test_rmse5, 4)))
tbl <- as.data.frame(rbind(row1, row2, row3, row4, row5))
cols <- c("Model", "Predictive R-Squared", "RMSE")
colnames(tbl) <- cols
knitr::kable(tbl, format = "simple")
Model            Predictive R-Squared   RMSE
---------------  ---------------------  -------
Model LM:2       0.4345                 0.13
Model MARS:2     -2.7649                0.3355
Model KNN:2      0.5322                 0.1183
Model SVM:RB:2   0.6224                 0.1063
Model Tree:2     0.4217                 0.1315

Model SVM:RB:2 has the lowest RMSE and highest Predictive R-Squared on the secondary dataset. Model MARS:2 performs particularly poorly here: its negative Predictive R-Squared means its test predictions are worse than simply predicting the mean PH for every observation.

Below is a test data performance summary for all models operating on the tertiary dataset:

test_pred1 <- predict(lm3, tertiary_test_df |> select(-PH))
test_rsq1 <- as.numeric(R2(test_pred1, tertiary_test_df$PH, form = "traditional"))
test_rmse1 <- as.numeric(RMSE(test_pred1, tertiary_test_df$PH))
row1 <- cbind("Model LM:3",
              as.character(round(test_rsq1, 4)),
              as.character(round(test_rmse1, 4)))
test_pred2 <- predict(mars3, tertiary_test_df |> select(-PH))
test_rsq2 <- as.numeric(R2(test_pred2, tertiary_test_df$PH, form = "traditional"))
test_rmse2 <- as.numeric(RMSE(test_pred2, tertiary_test_df$PH))
row2 <- cbind("Model MARS:3",
              as.character(round(test_rsq2, 4)),
              as.character(round(test_rmse2, 4)))
test_pred3 <- predict(knn3, tertiary_test_df |> select(-PH))
test_rsq3 <- as.numeric(R2(test_pred3, tertiary_test_df$PH, form = "traditional"))
test_rmse3 <- as.numeric(RMSE(test_pred3, tertiary_test_df$PH))
row3 <- cbind("Model KNN:3",
              as.character(round(test_rsq3, 4)),
              as.character(round(test_rmse3, 4)))
mm_test <- model.matrix(~0+., data = tertiary_test_df |> select(-PH))
test_pred4 <- predict(svmRB3Tuned, mm_test)
test_rsq4 <- as.numeric(R2(test_pred4, tertiary_test_df$PH, form = "traditional"))
test_rmse4 <- as.numeric(RMSE(test_pred4, tertiary_test_df$PH))
row4 <- cbind("Model SVM:RB:3",
              as.character(round(test_rsq4, 4)),
              as.character(round(test_rmse4, 4)))
test_pred5 <- predict(rpart3Tune, tertiary_test_df |> select(-PH))
test_rsq5 <- as.numeric(R2(test_pred5, tertiary_test_df$PH, form = "traditional"))
test_rmse5 <- as.numeric(RMSE(test_pred5, tertiary_test_df$PH))
row5 <- cbind("Model Tree:3",
              as.character(round(test_rsq5, 4)),
              as.character(round(test_rmse5, 4)))
tbl <- as.data.frame(rbind(row1, row2, row3, row4, row5))
cols <- c("Model", "Predictive R-Squared", "RMSE")
colnames(tbl) <- cols
knitr::kable(tbl, format = "simple")
Model            Predictive R-Squared   RMSE
---------------  ---------------------  -------
Model LM:3       0.3993                 0.1356
Model MARS:3     0.4853                 0.1255
Model KNN:3      0.4599                 0.1286
Model SVM:RB:3   0.5571                 0.1164
Model Tree:3     0.4432                 0.1306

Model SVM:RB:3 has the lowest RMSE and highest Predictive R-Squared on the tertiary dataset.

While Model SVM:RB:2 has the lowest RMSE (and the highest Predictive R-Squared) of all models across the three datasets, its underlying training data was restricted to complete cases. Since the evaluation data contains NA values, and SVM models can't handle missing predictor data, we choose the second-best performer: Model SVM:RB:1. It was trained on imputed data, and we can impute the missing values in the evaluation data the same way so that PH can be predicted for all observations.

Final Model Evaluation:

We make our PH predictions using Model SVM:RB:1 and save them to an Excel file: “Student_Evaluation_w_Predictions.xlsx”.

# Impute the missing predictor values in the evaluation data with the same
# kNN imputation settings used on the training data
missing_val_cols <- find_cols_na(eval_df |> select(-PH))
eval_df <- eval_df |>
    mutate(BRAND_CODE = factor(BRAND_CODE, exclude = NULL)) |>
    VIM::kNN(variable = missing_val_cols, k = 15, dist_var = dist_vars,
             weights = wts, numFun = median, imp_var = FALSE)
# One-hot encode the predictors to match the model's training design
# matrix, then predict PH for every evaluation observation
mm_pred <- model.matrix(~0+., data = eval_df |> select(-PH))
eval_df <- eval_df |>
    mutate(PH = predict(svmRB1Tuned, mm_pred))
write_xlsx(eval_df, "Student_Evaluation_w_Predictions.xlsx")