Introduction

This document outlines the analysis and modeling of the wine dataset using R. The goal is to predict the number of wine cases sold based on chemical and marketing features.

Data Exploration

Load Libraries and Data

# Load required libraries
library(ggplot2)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)
library(pscl)
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Load data
train_data <- read.csv("wine-training-data.csv")
eval_data <- read.csv("wine-evaluation-data.csv")

cat("\nStructure of the training data\n")
## 
## Structure of the training data
str(train_data)
## 'data.frame':    12795 obs. of  16 variables:
##  $ INDEX             : int  1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET            : int  3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity      : num  3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
##  $ VolatileAcidity   : num  1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
##  $ CitricAcid        : num  -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
##  $ ResidualSugar     : num  54.2 26.1 14.8 18.8 9.4 ...
##  $ Chlorides         : num  -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
##  $ FreeSulfurDioxide : num  NA 15 214 22 -167 -37 287 523 -213 62 ...
##  $ TotalSulfurDioxide: num  268 -327 142 115 108 15 156 551 NA 180 ...
##  $ Density           : num  0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                : num  3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Sulphates         : num  -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
##  $ Alcohol           : num  9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal       : int  0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex         : int  8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS             : int  2 3 3 1 2 NA NA 3 NA 4 ...
cat("\nStructure of the evaluation data\n")
## 
## Structure of the evaluation data
str(eval_data)
## 'data.frame':    3335 obs. of  16 variables:
##  $ INDEX             : int  3 9 10 18 21 30 31 37 39 47 ...
##  $ TARGET            : logi  NA NA NA NA NA NA ...
##  $ FixedAcidity      : num  5.4 12.4 7.2 6.2 11.4 17.6 15.5 15.9 11.6 3.8 ...
##  $ VolatileAcidity   : num  -0.86 0.385 1.75 0.1 0.21 0.04 0.53 1.19 0.32 0.22 ...
##  $ CitricAcid        : num  0.27 -0.76 0.17 1.8 0.28 -1.15 -0.53 1.14 0.55 0.31 ...
##  $ ResidualSugar     : num  -10.7 -19.7 -33 1 1.2 1.4 4.6 31.9 -50.9 -7.7 ...
##  $ Chlorides         : num  0.092 1.169 0.065 -0.179 0.038 ...
##  $ FreeSulfurDioxide : num  23 -37 9 104 70 -250 10 115 35 40 ...
##  $ TotalSulfurDioxide: num  398 68 76 89 53 140 17 381 83 129 ...
##  $ Density           : num  0.985 0.99 1.046 0.989 1.029 ...
##  $ pH                : num  5.02 3.37 4.61 3.2 2.54 3.06 3.07 2.99 3.32 4.72 ...
##  $ Sulphates         : num  0.64 1.09 0.68 2.11 -0.07 -0.02 0.75 0.31 2.18 -0.64 ...
##  $ Alcohol           : num  12.3 16 8.55 12.3 4.8 11.4 8.5 11.4 -0.5 10.9 ...
##  $ LabelAppeal       : int  -1 0 0 -1 0 1 0 1 0 0 ...
##  $ AcidIndex         : int  6 6 8 8 10 8 12 7 12 7 ...
##  $ STARS             : int  NA 2 1 1 NA 4 3 NA NA NA ...

Summary Statistics and Missing Data

# Load necessary libraries
library(kableExtra)
## 
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
## 
##     group_rows
library(dplyr)
library(tidyr)
library(moments) # For skewness and kurtosis

# Create vectors of numeric and categorical variables
numeric_vars <- names(train_data)[sapply(train_data, is.numeric)]
categorical_vars <- names(train_data)[sapply(train_data, is.factor)]

# Select the numerical variables using the predefined numeric_vars
numerical_vars <- train_data %>%
  dplyr::select(all_of(numeric_vars))

# Compute statistical summary including skewness, kurtosis, missing value counts, and percentages
statistical_summary <- numerical_vars %>%
  summarise(across(
    everything(),
    list(
      Min = ~round(min(., na.rm = TRUE), 2),
      Q1 = ~round(quantile(., 0.25, na.rm = TRUE), 2),
      Mean = ~round(mean(., na.rm = TRUE), 2),
      Median = ~round(median(., na.rm = TRUE), 2),
      Q3 = ~round(quantile(., 0.75, na.rm = TRUE), 2),
      Max = ~round(max(., na.rm = TRUE), 2),
      SD = ~round(sd(., na.rm = TRUE), 2),
      Variance = ~round(var(., na.rm = TRUE), 2),
      CV = ~round(sd(., na.rm = TRUE) / mean(., na.rm = TRUE), 2),
      Skewness = ~round(skewness(., na.rm = TRUE), 2),
      Kurtosis = ~round(kurtosis(., na.rm = TRUE), 2),
      ZeroCount = ~sum(. == 0, na.rm = TRUE),
      ZeroProportion = ~round(mean(. == 0, na.rm = TRUE) * 100, 2),
      Missing = ~sum(is.na(.)),
      PercentMissing = ~round(mean(is.na(.)) * 100, 2),
      Overdispersion = ~round(var(., na.rm = TRUE) / mean(., na.rm = TRUE), 2),
      Outliers = ~sum(. > (quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)) | 
                       . < (quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE)), na.rm = TRUE)
    ),
    .names = "{.col}_{.fn}"
  )) %>%
  pivot_longer(
    cols = everything(),
    names_to = c("Variable", ".value"),
    names_pattern = "^(.*)_(.*)$"
  )

# Display the resulting summary table
statistical_summary %>%
  kable(caption = "Summary of Numerical Variables (Including Skewness, Kurtosis, Missing Counts and Percentages)") %>%
  kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Summary of Numerical Variables (Including Skewness, Kurtosis, Missing Counts and Percentages)
Variable Min Q1 Mean Median Q3 Max SD Variance CV Skewness Kurtosis ZeroCount ZeroProportion Missing PercentMissing Overdispersion Outliers
INDEX 1.00 4037.50 8069.98 8110.00 12106.50 16129.00 4656.91 21686765.18 0.58 0.00 1.80 0 0.00 0 0.00 2687.34 0
TARGET 0.00 2.00 3.03 3.00 4.00 8.00 1.93 3.71 0.64 -0.33 2.12 2734 21.37 0 0.00 1.23 17
FixedAcidity -18.10 5.20 7.08 6.90 9.50 34.40 6.32 39.91 0.89 -0.02 4.68 39 0.30 0 0.00 5.64 2455
VolatileAcidity -2.79 0.13 0.32 0.28 0.64 3.68 0.78 0.61 2.42 0.02 4.83 18 0.14 0 0.00 1.90 2599
CitricAcid -3.24 0.03 0.31 0.31 0.58 3.86 0.86 0.74 2.80 -0.05 4.84 115 0.90 0 0.00 2.41 2688
ResidualSugar -127.80 -2.00 5.42 3.90 15.90 141.15 33.75 1139.02 6.23 -0.05 4.89 6 0.05 616 4.81 210.20 3298
Chlorides -1.17 -0.03 0.05 0.05 0.15 1.35 0.32 0.10 5.81 0.03 4.79 5 0.04 638 4.99 1.85 3021
FreeSulfurDioxide -555.00 0.00 30.85 30.00 70.00 623.00 148.71 22116.02 4.82 0.01 4.84 11 0.09 647 5.06 716.99 3712
TotalSulfurDioxide -823.00 27.00 120.71 123.00 208.00 1057.00 231.91 53783.74 1.92 -0.01 4.68 7 0.06 682 5.33 445.55 1590
Density 0.89 0.99 0.99 0.99 1.00 1.10 0.03 0.00 0.03 -0.02 4.90 0 0.00 0 0.00 0.00 3823
pH 0.48 2.96 3.21 3.20 3.47 6.13 0.68 0.46 0.21 0.04 4.65 0 0.00 395 3.09 0.14 1864
Sulphates -3.13 0.28 0.53 0.50 0.86 4.24 0.93 0.87 1.77 0.01 4.75 22 0.19 1210 9.46 1.65 2606
Alcohol -4.70 9.00 10.49 10.40 12.40 26.50 3.73 13.90 0.36 -0.03 4.54 2 0.02 653 5.10 1.32 928
LabelAppeal -2.00 -1.00 -0.01 0.00 1.00 2.00 0.89 0.79 -98.29 0.01 2.74 5617 43.90 0 0.00 -87.58 0
AcidIndex 4.00 7.00 7.77 8.00 8.00 17.00 1.32 1.75 0.17 1.65 8.19 0 0.00 0 0.00 0.23 1151
STARS 1.00 1.00 2.04 2.00 3.00 4.00 0.90 0.81 0.44 0.45 2.31 0 0.00 3359 26.25 0.40 0

The summary statistics highlight key metrics for the numerical variables in the dataset, including central tendencies (mean, median), variability (standard deviation, variance), distribution shape (skewness, kurtosis), and data-integrity measures (missing values, outliers).

Analysis of the Summary Statistics:

  1. Target Variable (TARGET):
    • The mean (3.03) and median (3.00) indicate a relatively symmetrical distribution with low skewness (-0.33).
    • However, the kurtosis (2.12) is below the value of roughly 3 expected for a normal distribution, suggesting a slightly lighter-tailed (platykurtic) distribution.
    • Approximately 21.37% of the TARGET values are zeros, which supports the appropriateness of Zero-Inflated Poisson (ZIP) or Negative Binomial models for this data.
  2. Variables with Missing Data:
    • ResidualSugar, Chlorides, FreeSulfurDioxide, TotalSulfurDioxide, Sulphates, and Alcohol have notable missing percentages (ranging from 4.81% to 9.46%).
    • Addressing missing values or incorporating missing value flags will be essential for modeling.
  3. Overdispersion:
    • Predictors such as FreeSulfurDioxide (variance/mean ratio: 716.99) and ResidualSugar (210.20) have variances far exceeding their means. For model selection, however, the ratio that matters is the one for the response: TARGET's variance/mean ratio of 1.23 indicates mild overdispersion, one argument for considering Negative Binomial regression alongside Poisson (a quick numerical check is sketched after this list).
  4. Variables with Zeros:
    • TARGET has a substantial proportion of zeros (21.37%), which reinforces the potential need for zero-inflated models for the response. LabelAppeal also has many zeros (43.90%), but as a predictor centered at zero this simply reflects its most common level rather than zero inflation that the model structure needs to address.
  5. Outliers:
    • Variables such as ResidualSugar (3298 outliers) and Chlorides (3021 outliers) exhibit a high number of outliers, which could influence regression models. Techniques like transformations or robust regressions may help mitigate their impact.
  6. Highly Skewed Variables:
    • AcidIndex (skewness: 1.65) shows a noticeable deviation from symmetry, whereas most other variables, such as LabelAppeal (skewness: 0.01), are nearly symmetric.
    • Highly skewed variables may benefit from transformations (e.g., Box-Cox or log transformations).
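
As a quick numerical check of the overdispersion and excess-zero points above (a sketch added for illustration, using the train_data loaded earlier), we can compare the variance of TARGET to its mean and compare the observed share of zeros with the share a Poisson distribution with the same mean would produce:

# Dispersion check for the response
mean_target <- mean(train_data$TARGET, na.rm = TRUE)
var_target <- var(train_data$TARGET, na.rm = TRUE)
cat("Variance/mean ratio for TARGET:", round(var_target / mean_target, 2), "\n")

# Observed zeros vs. zeros expected under a Poisson with the same mean
obs_zero <- mean(train_data$TARGET == 0, na.rm = TRUE)
exp_zero <- dpois(0, lambda = mean_target)
cat("Observed zero proportion:", round(obs_zero, 3),
    "| Poisson-expected:", round(exp_zero, 3), "\n")

A large gap between the observed and Poisson-expected zero proportions is the basic symptom that motivates the zero-inflated models discussed below.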

Suitability of Models Based on Summary Statistics:

Given the observed distribution of the TARGET variable and the characteristics of predictor variables:

  1. Poisson Regression: Suitable for count data, but check for overdispersion in residuals.
  2. Negative Binomial Regression: Preferable if the TARGET counts remain overdispersed after accounting for the predictors, since it relaxes the Poisson variance assumption.
  3. Zero-Inflated Models: Particularly useful for handling the excess zeros in the TARGET variable.
  4. Multiple Linear Regression: Can be explored for a broader perspective on relationships but may require transformations to address non-normality and outliers.

This summary emphasizes the appropriate models and preprocessing steps based on the data’s statistical characteristics.

Visualizations Based on Summary Statistics

Key Variables for Visualization Analysis:

Based on the summary statistics and modeling context, here are the key variables and visuals. These visualizations will provide critical insights into the data distribution, relationships, and model suitability.

1. Target Variable (TARGET)

The histogram and bar plot below show the distribution of the TARGET variable; the interpretation follows the plots.

library(ggplot2)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
# Remove NAs from ResidualSugar
numerical_vars_clean <- numerical_vars %>%
  dplyr::filter(!is.na(ResidualSugar))

# Create the histogram of TARGET
histogram_plot <- ggplot(numerical_vars_clean, aes(x = TARGET)) +
  geom_histogram(binwidth = 1, fill = "black", color = "black", alpha = 0.6) +
  labs(title = "Histogram of TARGET", x = "TARGET", y = "Frequency") +
  theme_minimal()

# Create the bar plot of TARGET value counts
bar_plot <- ggplot(numerical_vars_clean, aes(x = factor(TARGET))) +
  geom_bar(fill = "black", color = "black", alpha = 0.6) +
  labs(title = "Bar Plot of TARGET Value Counts", x = "TARGET", y = "Count") +
  theme_minimal()

# Use gridExtra to arrange the plots
gridExtra::grid.arrange(histogram_plot, bar_plot, ncol = 1)

The histogram and bar plot reveal a multimodal distribution of the TARGET variable, with prominent peaks around 0, 3, and 4, and fewer occurrences at higher values. This suggests that TARGET is not evenly distributed, making it suitable for count-based regression models such as Poisson, Negative Binomial, and Zero-Inflated models. The observed non-uniformity highlights the importance of selecting appropriate statistical models to account for overdispersion and the excess zero counts. Multiple regression models may also need to address the variable’s skewed distribution for accurate prediction.

2. Highly Skewed Variables (e.g., ResidualSugar, FreeSulfurDioxide)

The ResidualSugar variable has a heavy-tailed distribution with numerous outliers, as seen in the density and box plots (its skewness is close to zero, but its kurtosis and outlier count are high). These extreme values suggest transformations (e.g., log or Box-Cox) to stabilize variance. They can also exert undue leverage in Poisson, Negative Binomial, and multiple regression models, so preprocessing this variable will likely improve model performance and interpretability.

R Code:

library(ggplot2)
library(gridExtra)

# Remove NAs from ResidualSugar
numerical_vars_clean <- numerical_vars %>%
  dplyr::filter(!is.na(ResidualSugar))

# Density plot of ResidualSugar
density_plot <- ggplot(numerical_vars_clean, aes(x = ResidualSugar)) +
  geom_density(fill = "black", alpha = 0.6) +
  labs(title = "Density Plot of ResidualSugar", x = "ResidualSugar", y = "Density") +
  theme_minimal()

# Box plot of ResidualSugar
box_plot <- ggplot(numerical_vars_clean, aes(y = ResidualSugar)) +
  geom_boxplot(fill = "black", color = "black", alpha = 0.6) +
  labs(title = "Box Plot of ResidualSugar", y = "ResidualSugar") +
  theme_minimal()

# Use gridExtra to arrange the plots
gridExtra::grid.arrange(density_plot, box_plot, ncol = 2)
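
The outlier count reported for ResidualSugar in the summary table can be reproduced with the same 1.5 × IQR rule (a small check, assuming the numerical_vars_clean data frame created above):

# Count IQR-rule outliers in ResidualSugar (same rule as the summary table)
rs <- numerical_vars_clean$ResidualSugar
bounds <- quantile(rs, c(0.25, 0.75), na.rm = TRUE) + c(-1.5, 1.5) * IQR(rs, na.rm = TRUE)
sum(rs < bounds[1] | rs > bounds[2], na.rm = TRUE)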

3. LabelAppeal and STARS

The visualizations highlight the relationships between the key predictors (STARS and LabelAppeal) and TARGET. The box plot demonstrates that TARGET increases with higher values of STARS, consistent with the theoretical expectation that better wine ratings lead to higher sales. Similarly, the bar plot shows a clear positive trend between LabelAppeal and the mean TARGET, aligning with the notion that visually appealing labels drive consumer purchases. These predictors are expected to be among the strongest in the models and should be emphasized for their predictive power in explaining TARGET.

library(ggplot2)
library(gridExtra)

# Box plot of TARGET by STARS
box_plot <- ggplot(numerical_vars, aes(x = factor(STARS), y = TARGET)) +
  geom_boxplot(fill = "black", color = "black", alpha = 0.6) +
  labs(title = "Box Plot of TARGET by STARS", x = "STARS", y = "TARGET") +
  scale_x_discrete(labels = function(x) sprintf("%.4f", as.numeric(x))) +
  theme_minimal()

# Bar plot of TARGET by LabelAppeal
bar_plot <- ggplot(numerical_vars, aes(x = factor(LabelAppeal), y = TARGET)) +
  geom_bar(stat = "summary", fun = "mean", fill = "black", color = "black", alpha = 0.6) +
  labs(title = "Bar Plot of TARGET by LabelAppeal", x = "LabelAppeal", y = "Mean TARGET") +
  scale_x_discrete(labels = function(x) sprintf("%.4f", as.numeric(x))) +
  theme_minimal()

# Use gridExtra to arrange the plots
gridExtra::grid.arrange(box_plot, bar_plot, ncol = 1)
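
A quick numeric companion to these plots (a sketch, using the same numerical_vars data frame) is the mean of TARGET within each STARS level; the NA group corresponds to wines with no rating:

# Mean TARGET by STARS level (NA = missing rating)
numerical_vars %>%
  dplyr::group_by(STARS) %>%
  dplyr::summarise(mean_TARGET = mean(TARGET, na.rm = TRUE), n = dplyr::n())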

4. Correlations Between Numeric Predictors and TARGET

The scatter plot reveals a positive but weak linear relationship between Alcohol and TARGET, suggesting Alcohol may have a small predictive effect on the number of cases purchased. The correlation heatmap highlights stronger positive correlations for variables such as STARS and LabelAppeal with TARGET, aligning with their theoretical importance in the models. These findings guide variable selection and emphasize the need for both simple and complex model types (e.g., Poisson, Negative Binomial, Zero-Inflated, and Multiple Regression) to capture various relationships effectively.

# Install the ggcorrplot package if not already installed
if (!requireNamespace("ggcorrplot", quietly = TRUE)) {
  install.packages("ggcorrplot")
}

# Load necessary libraries
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2
library(ggcorrplot)
library(gridExtra)

# Scatter plot: TARGET vs. Alcohol
scatter_plot <- ggplot(numerical_vars, aes(x = Alcohol, y = TARGET)) +
  geom_point(alpha = 0.6, color = 'blue') +
  geom_smooth(method = "lm", color = 'red', se = FALSE) +
  labs(title = "Scatter Plot of TARGET vs Alcohol", x = "Alcohol", y = "TARGET") +
  theme_minimal()

# Correlation heatmap for numeric variables
# (named numeric_df so the numeric_vars name vector defined earlier is not overwritten)
numeric_df <- dplyr::select(numerical_vars, where(is.numeric))
cor_matrix <- cor(numeric_df, use = "complete.obs")
heatmap <- ggcorrplot(cor_matrix, lab = TRUE, digits = 1, title = "Correlation Heatmap")

# Display scatter plot and heatmap
scatter_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 653 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 653 rows containing missing values or values outside the scale range
## (`geom_point()`).

heatmap
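
To make the heatmap easier to read, the correlations of each numeric variable with TARGET can also be listed in descending order (a sketch that reuses the cor_matrix computed above):

# Rank numeric variables by their correlation with TARGET
target_cor <- cor_matrix[, "TARGET"]
sort(target_cor[setdiff(names(target_cor), "TARGET")], decreasing = TRUE)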

5. Zero-Inflation and Overdispersion

The bar plot illustrates the proportion of zeros across the numeric variables, with the TARGET variable exhibiting a notable degree of zero inflation. This supports employing Zero-Inflated Poisson or Zero-Inflated Negative Binomial regression to account for the excess zeros in the response. LabelAppeal actually has the largest share of zeros, but because it is a predictor centered at zero, those zeros simply mark its most common level and do not require zero-inflation handling.

# Bar plot of zero proportions
zero_counts <- numerical_vars %>%
 summarise(across(where(is.numeric), ~ mean(. == 0, na.rm = TRUE))) %>%
 pivot_longer(everything(), names_to = "Variable", values_to = "ZeroProportion")

ggplot(zero_counts, aes(x = reorder(Variable, -ZeroProportion), y = ZeroProportion)) +
 geom_bar(stat = "identity", fill = "black", color = "black", alpha = 0.6) +
 coord_flip() +
 labs(title = "Proportion of Zeros in Numeric Variables", x = "Variable", y = "Proportion of Zeros") +
 theme_minimal()
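
For a concrete picture of what a zero-inflated fit looks like, a minimal sketch with pscl::zeroinfl is shown below. This is illustration only: it runs on the raw training data (rows with missing STARS are dropped by the default na.action), and the two predictors are chosen arbitrarily; the candidate models for the assignment are built after the data preparation steps that follow.

# Minimal zero-inflated Poisson sketch: count model on LabelAppeal + STARS,
# zero-inflation model on STARS (predictor choice is illustrative only)
zip_sketch <- pscl::zeroinfl(TARGET ~ LabelAppeal + STARS | STARS, data = train_data)
summary(zip_sketch)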

Data Preparation

Impute Missing Values

# Load necessary libraries
library(dplyr)
library(rpart)

# Combine training and evaluation datasets for consistent imputation
combined_data <- bind_rows(
  train_data %>% mutate(dataset = "train"),
  eval_data %>% mutate(dataset = "eval")
)

# Step 1: Create missingness flags before any imputation so they record the
# original pattern of missing values
combined_data_imputed <- combined_data %>%
  mutate(
    ResidualSugar_missing = is.na(ResidualSugar),
    Chlorides_missing = is.na(Chlorides),
    FreeSulfurDioxide_missing = is.na(FreeSulfurDioxide),
    TotalSulfurDioxide_missing = is.na(TotalSulfurDioxide),
    pH_missing = is.na(pH),
    Sulphates_missing = is.na(Sulphates),
    Alcohol_missing = is.na(Alcohol),
    STARS_missing = is.na(STARS)
  )

# Step 2: Impute missing values with the median for variables with < 20% missingness
combined_data_imputed <- combined_data_imputed %>%
  group_by(dataset) %>%
  mutate(
    ResidualSugar = ifelse(is.na(ResidualSugar), median(ResidualSugar, na.rm = TRUE), ResidualSugar),
    Chlorides = ifelse(is.na(Chlorides), median(Chlorides, na.rm = TRUE), Chlorides),
    FreeSulfurDioxide = ifelse(is.na(FreeSulfurDioxide), median(FreeSulfurDioxide, na.rm = TRUE), FreeSulfurDioxide),
    TotalSulfurDioxide = ifelse(is.na(TotalSulfurDioxide), median(TotalSulfurDioxide, na.rm = TRUE), TotalSulfurDioxide),
    pH = ifelse(is.na(pH), median(pH, na.rm = TRUE), pH),
    Sulphates = ifelse(is.na(Sulphates), median(Sulphates, na.rm = TRUE), Sulphates),
    Alcohol = ifelse(is.na(Alcohol), median(Alcohol, na.rm = TRUE), Alcohol)
  ) %>%
  ungroup()

# Step 3: Predictive imputation for STARS (about 26% missing) using rpart
# Separate the combined data into rows with and without missing STARS
complete_stars <- combined_data_imputed %>% filter(!is.na(STARS))
missing_stars <- combined_data_imputed %>% filter(is.na(STARS))

# Train an rpart regression tree for STARS, excluding the dataset label, the row
# identifier, and the missingness flags from the predictors
rpart_model <- rpart(
  STARS ~ .,
  data = complete_stars %>% dplyr::select(-dataset, -INDEX, -ends_with("_missing")),
  method = "anova"
)

# Predict the missing STARS values and write them back in row order
predicted_stars <- predict(rpart_model, missing_stars)
combined_data_imputed$STARS[is.na(combined_data_imputed$STARS)] <- predicted_stars

# Step 4: Split back into train_data and eval_data
train_data_imputed <- combined_data_imputed %>% filter(dataset == "train") %>% dplyr::select(-dataset)
eval_data_imputed <- combined_data_imputed %>% filter(dataset == "eval") %>% dplyr::select(-dataset)

# Display the structure of the imputed datasets
cat("\nImputed Training Data\n\n")
## 
## Imputed Training Data
str(train_data_imputed)
## tibble [12,795 × 24] (S3: tbl_df/tbl/data.frame)
##  $ INDEX                     : int [1:12795] 1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET                    : int [1:12795] 3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity              : num [1:12795] 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
##  $ VolatileAcidity           : num [1:12795] 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
##  $ CitricAcid                : num [1:12795] -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
##  $ ResidualSugar             : num [1:12795] 54.2 26.1 14.8 18.8 9.4 ...
##  $ Chlorides                 : num [1:12795] -0.567 -0.425 0.037 -0.425 0.046 0.556 0.06 0.04 -0.007 -0.277 ...
##  $ FreeSulfurDioxide         : num [1:12795] 30 15 214 22 -167 -37 287 523 -213 62 ...
##  $ TotalSulfurDioxide        : num [1:12795] 268 -327 142 115 108 15 156 551 123 180 ...
##  $ Density                   : num [1:12795] 0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                        : num [1:12795] 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Sulphates                 : num [1:12795] -0.59 0.7 0.48 1.83 1.77 1.29 1.21 0.5 0.26 0.75 ...
##  $ Alcohol                   : num [1:12795] 9.9 10.4 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal               : int [1:12795] 0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex                 : int [1:12795] 8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS                     : num [1:12795] 2 3 3 1 2 ...
##  $ ResidualSugar_missing     : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Chlorides_missing         : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ FreeSulfurDioxide_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ TotalSulfurDioxide_missing: logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pH_missing                : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sulphates_missing         : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Alcohol_missing           : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ STARS_missing             : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
cat("\nImputed Evaluation Data\n\n")
## 
## Imputed Evaluation Data
str(eval_data_imputed)
## tibble [3,335 × 24] (S3: tbl_df/tbl/data.frame)
##  $ INDEX                     : int [1:3335] 3 9 10 18 21 30 31 37 39 47 ...
##  $ TARGET                    : int [1:3335] NA NA NA NA NA NA NA NA NA NA ...
##  $ FixedAcidity              : num [1:3335] 5.4 12.4 7.2 6.2 11.4 17.6 15.5 15.9 11.6 3.8 ...
##  $ VolatileAcidity           : num [1:3335] -0.86 0.385 1.75 0.1 0.21 0.04 0.53 1.19 0.32 0.22 ...
##  $ CitricAcid                : num [1:3335] 0.27 -0.76 0.17 1.8 0.28 -1.15 -0.53 1.14 0.55 0.31 ...
##  $ ResidualSugar             : num [1:3335] -10.7 -19.7 -33 1 1.2 1.4 4.6 31.9 -50.9 -7.7 ...
##  $ Chlorides                 : num [1:3335] 0.092 1.169 0.065 -0.179 0.038 ...
##  $ FreeSulfurDioxide         : num [1:3335] 23 -37 9 104 70 -250 10 115 35 40 ...
##  $ TotalSulfurDioxide        : num [1:3335] 398 68 76 89 53 140 17 381 83 129 ...
##  $ Density                   : num [1:3335] 0.985 0.99 1.046 0.989 1.029 ...
##  $ pH                        : num [1:3335] 5.02 3.37 4.61 3.2 2.54 3.06 3.07 2.99 3.32 4.72 ...
##  $ Sulphates                 : num [1:3335] 0.64 1.09 0.68 2.11 -0.07 -0.02 0.75 0.31 2.18 -0.64 ...
##  $ Alcohol                   : num [1:3335] 12.3 16 8.55 12.3 4.8 11.4 8.5 11.4 -0.5 10.9 ...
##  $ LabelAppeal               : int [1:3335] -1 0 0 -1 0 1 0 1 0 0 ...
##  $ AcidIndex                 : int [1:3335] 6 6 8 8 10 8 12 7 12 7 ...
##  $ STARS                     : num [1:3335] 1.29 2 1 1 1.29 ...
##  $ ResidualSugar_missing     : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Chlorides_missing         : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ FreeSulfurDioxide_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ TotalSulfurDioxide_missing: logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pH_missing                : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sulphates_missing         : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Alcohol_missing           : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ STARS_missing             : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
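
As a quick confirmation that the imputation left no missing values in the predictors (a small check, not part of the original output; TARGET in the evaluation set is intentionally NA):

# Remaining missing values per column after imputation
colSums(is.na(train_data_imputed))
colSums(is.na(dplyr::select(eval_data_imputed, -TARGET)))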

Transform Data with Box-Cox and Yeo-Johnson

# Combine train and evaluation datasets for consistent transformation
combined_data <- bind_rows(
  train_data_imputed %>% mutate(dataset = "train"),
  eval_data_imputed %>% mutate(dataset = "eval")
)

# Exclude columns from transformations
exclude_cols <- c("INDEX", "TARGET", "dataset")

# Identify numeric columns for transformation
numeric_cols <- setdiff(names(combined_data)[sapply(combined_data, is.numeric)], exclude_cols)

# Perform Box-Cox transformation
boxcox_data_combined <- combined_data

for (col in numeric_cols) {
  if (col %in% c("INDEX", "TARGET") && all(is.na(combined_data[[col]][combined_data$dataset == "eval"]))) {
    next # Skip transformation for INDEX and TARGET in eval_data if all values are NA
  }
  
  tryCatch({
    # Extract the column as a vector
    column_data <- combined_data[[col]]
    
    # Check for non-positive values and adjust
    adjustment <- 0
    if (min(column_data, na.rm = TRUE) <= 0) {
      adjustment <- abs(min(column_data, na.rm = TRUE)) + 0.001
      column_data <- column_data + adjustment
    }
    
    # Fit a simple linear model on training data only
    model <- lm(column_data[combined_data$dataset == "train"] ~ 1)
    
    # Perform Box-Cox transformation
    bc <- boxcox(model, lambda = seq(-2, 2, by = 0.1), plotit = FALSE)
    optimal_lambda <- bc$x[which.max(bc$y)]
    
    # Apply the transformation
    if (!is.na(optimal_lambda)) {
      if (optimal_lambda == 0) {
        transformed <- log(column_data)
      } else {
        transformed <- ((column_data)^optimal_lambda - 1) / optimal_lambda
      }
      if (adjustment > 0) {
        transformed <- transformed - adjustment
      }
      boxcox_data_combined[[col]] <- transformed
    }
  }, error = function(e) {
    cat(paste("Error processing column", col, ":", e$message, "\n"))
  })
}

# Perform Yeo-Johnson transformation
yeojohnson_data_combined <- combined_data

for (col in numeric_cols) {
  if (col %in% c("INDEX", "TARGET") && all(is.na(combined_data[[col]][combined_data$dataset == "eval"]))) {
    next # Skip transformation for INDEX and TARGET in eval_data if all values are NA
  }
  
  tryCatch({
    # Extract the column as a vector
    column_data <- combined_data[[col]]
    
    # Compute optimal lambda using training data
    lambda <- car::powerTransform(column_data[combined_data$dataset == "train"], family = "yjPower")$lambda
    
    # Apply the Yeo-Johnson transformation
    transformed <- car::yjPower(column_data, lambda)
    
    # Save the transformed data
    yeojohnson_data_combined[[col]] <- transformed
  }, error = function(e) {
    cat(paste("Error processing column", col, ":", e$message, "\n"))
  })
}


# Split transformed dataset back into train and evaluation datasets
boxcox_train_data <- boxcox_data_combined %>% 
  filter(dataset == "train") %>% 
  dplyr::select(-c(dataset))

boxcox_eval_data <- boxcox_data_combined %>% 
  filter(dataset == "eval") %>% 
  dplyr::select(-c(dataset))

yeojohnson_train_data <- yeojohnson_data_combined %>% 
  filter(dataset == "train") %>% 
  dplyr::select(-c(dataset))

yeojohnson_eval_data <- yeojohnson_data_combined %>% 
  filter(dataset == "eval") %>% 
  dplyr::select(-c(dataset))

# Display structures of transformed datasets
cat("\nBox-Cox Transformed Training Dataset:\n\n")
## 
## Box-Cox Transformed Training Dataset:
str(boxcox_train_data)
## tibble [12,795 × 24] (S3: tbl_df/tbl/data.frame)
##  $ INDEX                     : int [1:12795] 1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET                    : int [1:12795] 3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity              : num [1:12795] 7.32 9.09 12.66 10.73 13.91 ...
##  $ VolatileAcidity           : num [1:12795] 0.427 -0.706 2.155 -0.454 -0.516 ...
##  $ CitricAcid                : num [1:12795] -1.92 -1.735 -1.811 -0.791 -2.222 ...
##  $ ResidualSugar             : num [1:12795] 150 103.1 84.5 91.1 75.6 ...
##  $ Chlorides                 : num [1:12795] -1.558 -1.422 -0.961 -1.422 -0.952 ...
##  $ FreeSulfurDioxide         : num [1:12795] 457 428.6 810.4 441.8 90.8 ...
##  $ TotalSulfurDioxide        : num [1:12795] 1172.3 14.9 920.3 866.7 852.8 ...
##  $ Density                   : num [1:12795] -0.00719 0.02804 -0.00482 -0.0036 -0.00543 ...
##  $ pH                        : num [1:12795] 2.51 2.56 2.27 1.3 2.27 ...
##  $ Sulphates                 : num [1:12795] -1.5043 -0.0567 -0.3076 1.2533 1.1829 ...
##  $ Alcohol                   : num [1:12795] 11.75 12.4 28.1 6.97 16.77 ...
##  $ LabelAppeal               : num [1:12795] -1.07 -2 -2 -2 -1.07 ...
##  $ AcidIndex                 : num [1:12795] 0.765 0.753 0.765 0.736 0.774 ...
##  $ STARS                     : num [1:12795] 0.647 0.986 0.986 0 0.647 ...
##  $ ResidualSugar_missing     : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Chlorides_missing         : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ FreeSulfurDioxide_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ TotalSulfurDioxide_missing: logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pH_missing                : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sulphates_missing         : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Alcohol_missing           : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ STARS_missing             : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
cat("\nBox-Cox Transformed Evaluation Dataset:\n\n")
## 
## Box-Cox Transformed Evaluation Dataset:
str(boxcox_eval_data)
## tibble [3,335 × 24] (S3: tbl_df/tbl/data.frame)
##  $ INDEX                     : int [1:3335] 3 9 10 18 21 30 31 37 39 47 ...
##  $ TARGET                    : int [1:3335] NA NA NA NA NA NA NA NA NA NA ...
##  $ FixedAcidity              : num [1:3335] 10.3 20.1 12.8 11.4 18.7 ...
##  $ VolatileAcidity           : num [1:3335] -1.822 -0.454 1.109 -0.773 -0.65 ...
##  $ CitricAcid                : num [1:3335] -0.531 -1.68 -0.644 1.237 -0.52 ...
##  $ ResidualSugar             : num [1:3335] 43 28.56 7.44 61.94 62.26 ...
##  $ Chlorides                 : num [1:3335] -0.905 0.236 -0.932 -1.179 -0.96 ...
##  $ FreeSulfurDioxide         : num [1:3335] 444 331 417 598 533 ...
##  $ TotalSulfurDioxide        : num [1:3335] 1435 774 789 815 744 ...
##  $ Density                   : num [1:3335] -0.0147 -0.00951 0.04673 -0.01121 0.02912 ...
##  $ pH                        : num [1:3335] 4.45 2.55 3.97 2.36 1.63 ...
##  $ Sulphates                 : num [1:3335] -0.1253 0.3915 -0.0796 1.5828 -0.928 ...
##  $ Alcohol                   : num [1:3335] 14.91 19.87 9.99 14.91 5.21 ...
##  $ LabelAppeal               : num [1:3335] -2 -1.07 -1.07 -2 -1.07 ...
##  $ AcidIndex                 : num [1:3335] 0.736 0.736 0.765 0.765 0.781 ...
##  $ STARS                     : num [1:3335] 0.246 0.647 0 0 0.246 ...
##  $ ResidualSugar_missing     : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Chlorides_missing         : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ FreeSulfurDioxide_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ TotalSulfurDioxide_missing: logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pH_missing                : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sulphates_missing         : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Alcohol_missing           : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ STARS_missing             : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...

Statement on Skewness Before and After Transformations:

  1. Effectiveness of Transformations:
    • Both Box-Cox and Yeo-Johnson reduced skewness for the variables that needed it most, notably the right-skewed AcidIndex and STARS.
  2. Comparison:
    • Box-Cox: More effective for the more strongly skewed variables such as AcidIndex.
    • Yeo-Johnson: Effective but slightly less impactful for extreme skewness; better suited for variables with zero or negative values.
  3. Minimal Changes:
    • Variables with low skewness, such as Density and Chlorides, showed little change as they were already near symmetric.

Conclusion:

Both transformations improve symmetry, with Box-Cox being more effective overall, while Yeo-Johnson offers versatility for handling zero or negative values. For our modeling homework, we will use the Box-Cox transformed data.

# Required Libraries
library(ggplot2)
library(dplyr)
library(tidyr)

# Compute skewness for original, Box-Cox, and Yeo-Johnson transformed datasets
compute_skewness <- function(data, numeric_cols) {
  sapply(data %>% dplyr::select(all_of(numeric_cols)), function(x) {
    moments::skewness(x, na.rm = TRUE)
  })
}

# Calculate skewness for all datasets
original_skewness <- compute_skewness(train_data_imputed, numeric_cols)
boxcox_skewness <- compute_skewness(boxcox_train_data, numeric_cols)
yeojohnson_skewness <- compute_skewness(yeojohnson_train_data, numeric_cols)

# Combine skewness data into a single dataframe for plotting
skewness_df <- data.frame(
  Variable = rep(numeric_cols, 3),
  Transformation = rep(c("Original", "Box-Cox", "Yeo-Johnson"), each = length(numeric_cols)),
  Skewness = c(original_skewness, boxcox_skewness, yeojohnson_skewness)
)

# Create a bar chart to compare skewness
ggplot(skewness_df, aes(x = Variable, y = Skewness, fill = Transformation)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(
    title = "Skewness Comparison Before and After Transformations",
    x = "Variable",
    y = "Skewness"
  ) +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  scale_fill_brewer(palette = "Set2")

Identify Highly Correlated Variables and Plot Correlations

# Load necessary libraries
library(corrplot)
## corrplot 0.95 loaded
# Compute the correlation matrix (only for numerical predictors)
cor_matrix <- cor(boxcox_train_data %>% select_if(is.numeric), use = "pairwise.complete.obs")

# Identify highly correlated variable pairs (e.g., correlation > 0.9 or < -0.9)
high_corr_pairs <- which(abs(cor_matrix) > 0.9 & abs(cor_matrix) < 1, arr.ind = TRUE)
high_corr_df <- data.frame(
  Variable1 = rownames(cor_matrix)[high_corr_pairs[, 1]],
  Variable2 = colnames(cor_matrix)[high_corr_pairs[, 2]],
  Correlation = cor_matrix[high_corr_pairs]
)

# Print highly correlated variable pairs
print(high_corr_df)
## [1] Variable1   Variable2   Correlation
## <0 rows> (or 0-length row.names)
# Plot the correlation matrix using corrplot
corrplot(cor_matrix, method = "color", type = "upper", tl.col = "black", tl.cex = 0.8)

Observations from the Correlation Analysis:

  1. No Highly Correlated Variable Pairs (Threshold > 0.9):
    • The high_corr_df data frame is empty, indicating that no variable pairs have a correlation above the threshold of 0.9 (or below -0.9).
    • This suggests that the dataset does not have multicollinearity issues due to highly correlated variables.
  2. Correlation Matrix Plot:
    • The corrplot visualization shows the pairwise correlations between variables.
    • Most correlations are weak to moderate, with no strong patterns of redundancy among variables.
    • The diagonal of the matrix (each variable's correlation with itself) is 1, as expected.

Confirmation of No Multicollinearity with VIF Before Model Fitting

# Load necessary libraries
library(car)
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
library(carData)


# Fit an auxiliary model with FixedAcidity as a stand-in response so that VIFs among
# the predictors can be computed (VIF does not depend on the choice of response)
initial_model <- lm(
  FixedAcidity ~ VolatileAcidity + CitricAcid + ResidualSugar +
    Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + pH +
    Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS,
  data = boxcox_train_data
)

# Check VIF for all predictors
vif_values <- vif(initial_model)

# Convert VIF values to a data frame
vif_df <- data.frame(
  Variable = names(vif_values),
  VIF = vif_values
)

# Print the VIF data frame
print(vif_df)
##                              Variable      VIF
## VolatileAcidity       VolatileAcidity 1.004032
## CitricAcid                 CitricAcid 1.005589
## ResidualSugar           ResidualSugar 1.001635
## Chlorides                   Chlorides 1.002502
## FreeSulfurDioxide   FreeSulfurDioxide 1.002859
## TotalSulfurDioxide TotalSulfurDioxide 1.003562
## Density                       Density 1.003313
## pH                                 pH 1.005586
## Sulphates                   Sulphates 1.001448
## Alcohol                       Alcohol 1.006642
## LabelAppeal               LabelAppeal 1.080330
## AcidIndex                   AcidIndex 1.027973
## STARS                           STARS 1.092914
# Identify variables with high multicollinearity (VIF > 10)
high_vif <- vif_values[vif_values > 10]

# Create a data frame for high VIF values
high_vif_df <- data.frame(
  Variable = names(high_vif),
  VIF = high_vif
)

# Print the high VIF data frame
print(high_vif_df)
## [1] Variable VIF     
## <0 rows> (or 0-length row.names)

The output indicates that multicollinearity is not a concern among the predictors in the dataset. Here are the key points:

  1. Variance Inflation Factor (VIF):
    • All VIF values are close to 1, with the highest being about 1.09 (for STARS), far below the common threshold of 10.
    • This suggests that the predictors are largely independent of each other.
  2. No High Multicollinearity Detected:
    • The high_vif object is empty (named numeric(0)), confirming that no variables exhibit problematic multicollinearity.
  3. Conclusion:
    • You can safely proceed with model fitting using all predictors without any adjustments for multicollinearity.

To address the task comprehensively, drawing on the count-regression concepts from Chapter 5 of the course text, the following sections build and compare models across Poisson, Negative Binomial, and Linear Regression approaches, incorporating different techniques for variable selection, transformation, and model diagnostics.

Scale the data

# Scale the numeric columns, excluding TARGET and INDEX
boxcox_train_data <- boxcox_train_data %>%
  mutate(across(where(is.numeric) & !starts_with(c("TARGET", "INDEX")), scale))

boxcox_eval_data <- boxcox_eval_data %>%
  mutate(across(where(is.numeric) & !starts_with(c("TARGET", "INDEX")), scale))

str(boxcox_train_data)
## tibble [12,795 × 24] (S3: tbl_df/tbl/data.frame)
##  $ INDEX                     : int [1:12795] 1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET                    : int [1:12795] 3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity              : num [1:12795, 1] -0.62514 -0.42106 -0.00944 -0.23164 0.13406 ...
##   ..- attr(*, "scaled:center")= num 12.7
##   ..- attr(*, "scaled:scale")= num 8.68
##  $ VolatileAcidity           : num [1:12795, 1] 1.07138 -0.22295 3.04599 0.06495 -0.00562 ...
##   ..- attr(*, "scaled:center")= num -0.511
##   ..- attr(*, "scaled:scale")= num 0.875
##  $ CitricAcid                : num [1:12795, 1] -1.485 -1.295 -1.373 -0.325 -1.795 ...
##   ..- attr(*, "scaled:center")= num -0.475
##   ..- attr(*, "scaled:scale")= num 0.973
##  $ ResidualSugar             : num [1:12795, 1] 1.504 0.626 0.277 0.4 0.111 ...
##   ..- attr(*, "scaled:center")= num 69.7
##   ..- attr(*, "scaled:scale")= num 53.4
##  $ Chlorides                 : num [1:12795, 1] -1.9641 -1.531 -0.0697 -1.531 -0.0406 ...
##   ..- attr(*, "scaled:center")= num -0.939
##   ..- attr(*, "scaled:scale")= num 0.315
##  $ FreeSulfurDioxide         : num [1:12795, 1] -0.0185 -0.1224 1.2757 -0.074 -1.3593 ...
##   ..- attr(*, "scaled:center")= num 462
##   ..- attr(*, "scaled:scale")= num 273
##  $ TotalSulfurDioxide        : num [1:12795, 1] 0.6476 -1.9508 0.0817 -0.0386 -0.0698 ...
##   ..- attr(*, "scaled:center")= num 884
##   ..- attr(*, "scaled:scale")= num 445
##  $ Density                   : num [1:12795, 1] -0.05686 1.27316 0.03283 0.07883 0.00984 ...
##   ..- attr(*, "scaled:center")= num -0.00569
##   ..- attr(*, "scaled:scale")= num 0.0265
##  $ pH                        : num [1:12795, 1] 0.173 0.249 -0.142 -1.436 -0.142 ...
##   ..- attr(*, "scaled:center")= num 2.38
##   ..- attr(*, "scaled:scale")= num 0.75
##  $ Sulphates                 : num [1:12795, 1] -1.2547 0.1864 -0.0634 1.4905 1.4204 ...
##   ..- attr(*, "scaled:center")= num -0.244
##   ..- attr(*, "scaled:scale")= num 1
##  $ Alcohol                   : num [1:12795, 1] -0.1742 -0.0361 3.2745 -1.1801 0.8861 ...
##   ..- attr(*, "scaled:center")= num 12.6
##   ..- attr(*, "scaled:scale")= num 4.74
##  $ LabelAppeal               : num [1:12795, 1] 0.0643 -1.0769 -1.0769 -1.0769 0.0643 ...
##   ..- attr(*, "scaled:center")= num -1.13
##   ..- attr(*, "scaled:scale")= num 0.812
##  $ AcidIndex                 : num [1:12795, 1] 0.363 -0.545 0.363 -1.792 1.052 ...
##   ..- attr(*, "scaled:center")= num 0.76
##   ..- attr(*, "scaled:scale")= num 0.0132
##  $ STARS                     : num [1:12795, 1] 0.349 1.241 1.241 -1.354 0.349 ...
##   ..- attr(*, "scaled:center")= num 0.515
##   ..- attr(*, "scaled:scale")= num 0.38
##  $ ResidualSugar_missing     : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Chlorides_missing         : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ FreeSulfurDioxide_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ TotalSulfurDioxide_missing: logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ pH_missing                : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Sulphates_missing         : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ Alcohol_missing           : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
##  $ STARS_missing             : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
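
A short sanity check (a sketch, not part of the original output) confirms that the scaled predictors now have mean approximately 0 and standard deviation approximately 1:

# Verify centering and scaling of the numeric predictors
num_scaled <- boxcox_train_data %>%
  dplyr::select(where(is.numeric)) %>%
  dplyr::select(-TARGET, -INDEX)
round(sapply(num_scaled, function(x) c(mean = mean(x), sd = sd(x))), 3)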

Build Models

# Load necessary libraries
library(MASS)
library(pscl)
library(car)
library(dplyr)
library(caret)
## Loading required package: lattice
library(lattice)

# Remove unnecessary columns (e.g., "_missing" flags)
boxcox_train_data <- boxcox_train_data %>%
  dplyr::select(-ends_with("_missing"), -INDEX)

boxcox_eval_data <- boxcox_eval_data %>%
  dplyr::select(-ends_with("_missing"))

# Split data into training and testing sets
set.seed(123)  # For reproducibility
train_index <- createDataPartition(boxcox_train_data$TARGET, p = 0.8, list = FALSE)
train_data <- boxcox_train_data[train_index, ]
test_data <- boxcox_train_data[-train_index, ]

Poisson Regression Models

Model 1: Basic Poisson regression using all predictors.

# Load necessary libraries
library(MASS)
library(pscl)
library(car)
library(dplyr)
library(caret)
library(ggplot2)

# Poisson Regression Model 1: All Predictors
# Fit a Poisson regression model on training data
poisson_model_all <- glm(
  TARGET ~ ., data = train_data, 
  family = poisson()
)

# Summarize the model
summary(poisson_model_all)
## 
## Call:
## glm(formula = TARGET ~ ., family = poisson(), data = train_data)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.044806   0.006033 173.194  < 2e-16 ***
## FixedAcidity       -0.006167   0.005849  -1.054  0.29169    
## VolatileAcidity    -0.040178   0.005704  -7.044 1.87e-12 ***
## CitricAcid          0.008565   0.005673   1.510  0.13111    
## ResidualSugar       0.008053   0.005644   1.427  0.15362    
## Chlorides          -0.015407   0.005763  -2.674  0.00750 ** 
## FreeSulfurDioxide   0.022239   0.005672   3.921 8.83e-05 ***
## TotalSulfurDioxide  0.024427   0.005701   4.285 1.83e-05 ***
## Density            -0.010871   0.005656  -1.922  0.05461 .  
## pH                 -0.010682   0.005707  -1.872  0.06123 .  
## Sulphates          -0.011761   0.005696  -2.065  0.03895 *  
## Alcohol             0.017374   0.005701   3.048  0.00231 ** 
## LabelAppeal         0.168867   0.006181  27.318  < 2e-16 ***
## AcidIndex          -0.114966   0.005809 -19.791  < 2e-16 ***
## STARS               0.235666   0.006045  38.983  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18252  on 10237  degrees of freedom
## Residual deviance: 14249  on 10223  degrees of freedom
## AIC: 39847
## 
## Number of Fisher Scoring iterations: 5
# Predict on training and testing sets
train_predictions <- predict(poisson_model_all, train_data, type = "response")
test_predictions <- predict(poisson_model_all, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Print metrics
cat("AIC:", AIC(poisson_model_all), "\n")
## AIC: 39846.53
cat("BIC:", BIC(poisson_model_all), "\n")
## BIC: 39955.03
cat("Train RMSE:", round(train_rmse, 4), "\n")
## Train RMSE: 1.5723
cat("Test RMSE:", round(test_rmse, 4), "\n")
## Test RMSE: 1.5783
cat("Train R²:", round(train_r2, 4), "\n")
## Train R²: 0.3319
cat("Test R²:", round(test_r2, 4), "\n")
## Test R²: 0.3359
# Residual plot for diagnostics
plot(residuals(poisson_model_all), 
     main = "Residuals of Poisson Regression Model (All Predictors)",
     xlab = "Observation", ylab = "Residuals", 
     col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
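
Because the choice between Poisson and Negative Binomial hinges on overdispersion, a simple check (a sketch added here, not part of the original output) divides the residual deviance and the Pearson chi-square statistic by the residual degrees of freedom; from the printed summary this ratio is roughly 14249 / 10223 ≈ 1.39, i.e., mild overdispersion:

# Dispersion diagnostics for the Poisson fit (values well above 1 indicate overdispersion)
deviance(poisson_model_all) / df.residual(poisson_model_all)
sum(residuals(poisson_model_all, type = "pearson")^2) / df.residual(poisson_model_all)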

Model 2: Poisson regression after stepwise variable selection (AIC-based).

# Load necessary libraries
library(dplyr)
library(MASS)
library(caret)
library(ggplot2)

# Perform stepwise (AIC-based, both-direction) Poisson regression on the training data
poisson_model_stepwise <- step(
  glm(TARGET ~ ., data = train_data, family = poisson()),
  direction = "both",
  trace = FALSE  # Suppress output of intermediate steps
)

# Summarize the final stepwise-selected model
summary(poisson_model_stepwise)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, 
##     family = poisson(), data = train_data)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.044778   0.006033 173.191  < 2e-16 ***
## VolatileAcidity    -0.040161   0.005703  -7.042 1.90e-12 ***
## CitricAcid          0.008529   0.005674   1.503  0.13279    
## ResidualSugar       0.008190   0.005642   1.452  0.14662    
## Chlorides          -0.015352   0.005762  -2.664  0.00771 ** 
## FreeSulfurDioxide   0.022202   0.005672   3.914 9.07e-05 ***
## TotalSulfurDioxide  0.024475   0.005700   4.294 1.75e-05 ***
## Density            -0.010849   0.005657  -1.918  0.05511 .  
## pH                 -0.010778   0.005705  -1.889  0.05888 .  
## Sulphates          -0.011912   0.005694  -2.092  0.03642 *  
## Alcohol             0.017375   0.005701   3.048  0.00230 ** 
## LabelAppeal         0.168873   0.006182  27.318  < 2e-16 ***
## AcidIndex          -0.115890   0.005741 -20.186  < 2e-16 ***
## STARS               0.235688   0.006045  38.988  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18252  on 10237  degrees of freedom
## Residual deviance: 14250  on 10224  degrees of freedom
## AIC: 39846
## 
## Number of Fisher Scoring iterations: 5
# Predict on training and testing sets
train_predictions <- predict(poisson_model_stepwise, train_data, type = "response")
test_predictions <- predict(poisson_model_stepwise, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Print metrics
cat("AIC:", AIC(poisson_model_stepwise), "\n")
## AIC: 39845.64
cat("BIC:", BIC(poisson_model_stepwise), "\n")
## BIC: 39946.91
cat("Train RMSE:", round(train_rmse, 4), "\n")
## Train RMSE: 1.5724
cat("Test RMSE:", round(test_rmse, 4), "\n")
## Test RMSE: 1.5785
cat("Train R²:", round(train_r2, 4), "\n")
## Train R²: 0.3318
cat("Test R²:", round(test_r2, 4), "\n")
## Test R²: 0.3358
# Residual plot for diagnostics
plot(residuals(poisson_model_stepwise), 
     main = "Residuals of Stepwise Poisson Regression",
     xlab = "Observation", ylab = "Residuals", 
     col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
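
Since the stepwise model drops only FixedAcidity, the two Poisson fits are nested and can be compared directly with a likelihood-ratio test (a sketch, not part of the original output):

# Likelihood-ratio comparison of the stepwise and full Poisson models
anova(poisson_model_stepwise, poisson_model_all, test = "Chisq")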

Negative Binomial Regression Models

Model 1: Standard Negative Binomial model with all predictors.

# Load necessary libraries
library(dplyr)
library(MASS)
library(caret)  # For data splitting and evaluation metrics
library(ggplot2)

# Fit a Negative Binomial model on training data
nb_model_all <- glm.nb(TARGET ~ ., data = train_data)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Summarize the model
summary(nb_model_all)
## 
## Call:
## glm.nb(formula = TARGET ~ ., data = train_data, init.theta = 43393.59694, 
##     link = log)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.044806   0.006033 173.189  < 2e-16 ***
## FixedAcidity       -0.006167   0.005849  -1.054  0.29171    
## VolatileAcidity    -0.040178   0.005704  -7.044 1.87e-12 ***
## CitricAcid          0.008565   0.005674   1.510  0.13113    
## ResidualSugar       0.008053   0.005644   1.427  0.15362    
## Chlorides          -0.015407   0.005763  -2.674  0.00750 ** 
## FreeSulfurDioxide   0.022240   0.005673   3.921 8.83e-05 ***
## TotalSulfurDioxide  0.024428   0.005701   4.285 1.83e-05 ***
## Density            -0.010872   0.005657  -1.922  0.05461 .  
## pH                 -0.010682   0.005707  -1.872  0.06123 .  
## Sulphates          -0.011761   0.005696  -2.065  0.03895 *  
## Alcohol             0.017374   0.005701   3.048  0.00231 ** 
## LabelAppeal         0.168867   0.006182  27.317  < 2e-16 ***
## AcidIndex          -0.114969   0.005809 -19.791  < 2e-16 ***
## STARS               0.235665   0.006046  38.982  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(43393.6) family taken to be 1)
## 
##     Null deviance: 18251  on 10237  degrees of freedom
## Residual deviance: 14248  on 10223  degrees of freedom
## AIC: 39849
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  43394 
##           Std. Err.:  65170 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -39816.66
# Predict on training and testing sets
train_predictions <- predict(nb_model_all, train_data, type = "response")
test_predictions <- predict(nb_model_all, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Print metrics
cat("AIC:", AIC(nb_model_all), "\n")
## AIC: 39848.66
cat("BIC:", BIC(nb_model_all), "\n")
## BIC: 39964.4
cat("Train RMSE:", train_rmse, "\n")
## Train RMSE: 1.572312
cat("Test RMSE:", test_rmse, "\n")
## Test RMSE: 1.578315
cat("Train R²:", train_r2, "\n")
## Train R²: 0.3319323
cat("Test R²:", test_r2, "\n")
## Test R²: 0.3359193
# Residual plot for diagnostics
plot(residuals(nb_model_all))
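
The estimated theta is extremely large (about 43,394, with a standard error of similar magnitude), which means this Negative Binomial fit is essentially collapsing to the Poisson model. A formal likelihood-ratio test of overdispersion is available in pscl (a sketch, not part of the original output):

# Test the Negative Binomial fit against its Poisson restriction
pscl::odTest(nb_model_all)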

Negative Binomial Model with Weighted Residuals

# Load necessary libraries
library(dplyr)
library(MASS)
library(caret)  # For data splitting and evaluation metrics
library(ggplot2)

# Initial Poisson model to compute residuals for weights
poisson_model <- glm(TARGET ~ ., data = train_data, family = poisson)
poisson_residuals <- residuals(poisson_model, type = "response")

# Compute weights based on residuals
nb_weights <- 1 / (abs(poisson_residuals) + 1)

# Fit a weighted Negative Binomial model
nb_model_weighted <- glm.nb(
  TARGET ~ ., 
  data = train_data, 
  weights = nb_weights,  # Apply weights
  control = glm.control(maxit = 100)
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Summarize the weighted model
summary(nb_model_weighted)
## 
## Call:
## glm.nb(formula = TARGET ~ ., data = train_data, weights = nb_weights, 
##     control = glm.control(maxit = 100), init.theta = 527063126000, 
##     link = log)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.088930   0.008393 129.741  < 2e-16 ***
## FixedAcidity       -0.005856   0.007849  -0.746  0.45564    
## VolatileAcidity    -0.038304   0.007734  -4.952 7.33e-07 ***
## CitricAcid          0.007981   0.007590   1.052  0.29303    
## ResidualSugar       0.006804   0.007541   0.902  0.36688    
## Chlorides          -0.014391   0.007717  -1.865  0.06221 .  
## FreeSulfurDioxide   0.019892   0.007559   2.632  0.00850 ** 
## TotalSulfurDioxide  0.021324   0.007612   2.801  0.00509 ** 
## Density            -0.010114   0.007627  -1.326  0.18479    
## pH                 -0.009419   0.007678  -1.227  0.21996    
## Sulphates          -0.009539   0.007679  -1.242  0.21416    
## Alcohol             0.018980   0.007620   2.491  0.01275 *  
## LabelAppeal         0.180900   0.008405  21.524  < 2e-16 ***
## AcidIndex          -0.110908   0.008169 -13.577  < 2e-16 ***
## STARS               0.212858   0.008428  25.255  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(527063125967) family taken to be 1)
## 
##     Null deviance: 6492.6  on 10237  degrees of freedom
## Residual deviance: 4386.9  on 10223  degrees of freedom
## AIC: 19016
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  527063125967 
##           Std. Err.:  292534145483 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -18984.15
# Predict on training and testing sets
train_predictions <- predict(nb_model_weighted, train_data, type = "response")
test_predictions <- predict(nb_model_weighted, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Format metrics for display
formatted_metrics <- sprintf(
  "AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
  AIC(nb_model_weighted),
  BIC(nb_model_weighted),
  train_rmse,
  test_rmse,
  train_r2,
  test_r2
)

# Print metrics
cat(formatted_metrics)
## AIC: 19016.15
## BIC: 19131.89
## Train RMSE: 1.5784
## Test RMSE: 1.5875
## Train R²: 0.3267
## Test R²: 0.3281
# Residual plot for diagnostics
plot(residuals(nb_model_weighted), main = "Residuals of Weighted Negative Binomial Model",
     xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)

### Model 2: Negative Binomial model with stepwise variable selection.

library(MASS)
library(dplyr)
library(car)
library(carData)

# Stepwise selection with updated parameters
nb_model_stepwise <- step(
  glm.nb(TARGET ~ ., 
         data = train_data, 
         link = log),
  direction = "both",
  trace = FALSE  # Suppress step-by-step output
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(nb_model_stepwise)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, 
##     data = train_data, init.theta = 43390.17136, link = log)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.044777   0.006033 173.185  < 2e-16 ***
## VolatileAcidity    -0.040162   0.005704  -7.041 1.90e-12 ***
## CitricAcid          0.008529   0.005674   1.503  0.13281    
## ResidualSugar       0.008191   0.005643   1.452  0.14661    
## Chlorides          -0.015353   0.005762  -2.664  0.00771 ** 
## FreeSulfurDioxide   0.022203   0.005672   3.914 9.07e-05 ***
## TotalSulfurDioxide  0.024476   0.005700   4.294 1.75e-05 ***
## Density            -0.010850   0.005657  -1.918  0.05511 .  
## pH                 -0.010779   0.005706  -1.889  0.05888 .  
## Sulphates          -0.011913   0.005694  -2.092  0.03643 *  
## Alcohol             0.017375   0.005701   3.048  0.00230 ** 
## LabelAppeal         0.168873   0.006182  27.317  < 2e-16 ***
## AcidIndex          -0.115893   0.005741 -20.186  < 2e-16 ***
## STARS               0.235686   0.006045  38.986  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(43390.17) family taken to be 1)
## 
##     Null deviance: 18251  on 10237  degrees of freedom
## Residual deviance: 14249  on 10224  degrees of freedom
## AIC: 39848
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  43390 
##           Std. Err.:  65185 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -39817.77
# Predict on training and testing sets
train_predictions <- predict(nb_model_stepwise, train_data, type = "response")
test_predictions <- predict(nb_model_stepwise, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Format metrics for display
formatted_metrics <- sprintf(
  "AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
  AIC(nb_model_stepwise),
  BIC(nb_model_stepwise),
  train_rmse,
  test_rmse,
  train_r2,
  test_r2
)

# Print metrics
cat(formatted_metrics)
## AIC: 39847.77
## BIC: 39956.28
## Train RMSE: 1.5724
## Test RMSE: 1.5785
## Train R²: 0.3318
## Test R²: 0.3358
# Residual plot for diagnostics
plot(residuals(nb_model_stepwise), main = "Residuals of Stepwise Negative Binomial Model",
     xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)

Negative Binomial model with manually selected variables

library(MASS)   # For Negative Binomial model
library(dplyr)  # For data manipulation

# Subset the data to include selected variables
selected_variables <- c(
  "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar",
  "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide", "pH",
  "Sulphates", "Alcohol", "LabelAppeal", "AcidIndex", "STARS"
)

nb_data_selected <- train_data %>% dplyr::select(TARGET, all_of(selected_variables))

# Fit the Negative Binomial model
nb_model_selected <- glm.nb(
  TARGET ~ .,
  data = nb_data_selected,
  control = glm.control(maxit = 100)  # Allow more iterations
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Display model summary
summary(nb_model_selected)
## 
## Call:
## glm.nb(formula = TARGET ~ ., data = nb_data_selected, control = glm.control(maxit = 100), 
##     init.theta = 47170243380, link = log)
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.044923   0.006032 173.235  < 2e-16 ***
## FixedAcidity       -0.006126   0.005850  -1.047  0.29501    
## VolatileAcidity    -0.040224   0.005705  -7.050 1.78e-12 ***
## CitricAcid          0.008697   0.005673   1.533  0.12526    
## ResidualSugar       0.008106   0.005644   1.436  0.15095    
## Chlorides          -0.015661   0.005760  -2.719  0.00655 ** 
## FreeSulfurDioxide   0.022134   0.005671   3.903 9.51e-05 ***
## TotalSulfurDioxide  0.024156   0.005698   4.239 2.24e-05 ***
## pH                 -0.010814   0.005706  -1.895  0.05808 .  
## Sulphates          -0.011639   0.005695  -2.044  0.04100 *  
## Alcohol             0.017509   0.005700   3.072  0.00213 ** 
## LabelAppeal         0.168874   0.006181  27.323  < 2e-16 ***
## AcidIndex          -0.115476   0.005803 -19.898  < 2e-16 ***
## STARS               0.235758   0.006045  38.999  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(47170243384) family taken to be 1)
## 
##     Null deviance: 18252  on 10237  degrees of freedom
## Residual deviance: 14252  on 10224  degrees of freedom
## AIC: 39848
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  47170243384 
##           Std. Err.:  91936991916 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -39817.99
# Predict on training and testing sets
train_predictions <- predict(nb_model_selected, train_data, type = "response")
test_predictions <- predict(nb_model_selected, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Format metrics for display
formatted_metrics <- sprintf(
  "AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
  AIC(nb_model_selected),
  BIC(nb_model_selected),
  train_rmse,
  test_rmse,
  train_r2,
  test_r2
)

# Print metrics
cat(formatted_metrics)
## AIC: 39847.99
## BIC: 39956.49
## Train RMSE: 1.5725
## Test RMSE: 1.5790
## Train R²: 0.3318
## Test R²: 0.3354
# Residual plot for diagnostics
plot(residuals(nb_model_selected), main = "Residuals of Negative Binomial Model (Selected Variables)",
     xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)

Zero-Inflated Models

ZIP: Zero-Inflated Poisson regression

# Zero-Inflated Poisson (ZIP) Model
library(pscl)

# Fit ZIP model with a constant (intercept-only) zero-inflation component
zip_model <- zeroinfl(
  TARGET ~ . | 1,
  data = train_data,
  dist = "poisson"
)

# Display the summary
summary(zip_model)
## 
## Call:
## zeroinfl(formula = TARGET ~ . | 1, data = train_data, dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5645 -0.2555  0.2290  0.5110  2.3408 
## 
## Count model coefficients (poisson with log link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.2555554  0.0066738 188.132  < 2e-16 ***
## FixedAcidity       -0.0008205  0.0061253  -0.134  0.89344    
## VolatileAcidity    -0.0161057  0.0060322  -2.670  0.00759 ** 
## CitricAcid          0.0010249  0.0059159   0.173  0.86247    
## ResidualSugar       0.0004010  0.0059177   0.068  0.94598    
## Chlorides          -0.0086612  0.0060573  -1.430  0.15275    
## FreeSulfurDioxide   0.0067480  0.0058980   1.144  0.25258    
## TotalSulfurDioxide  0.0014842  0.0059824   0.248  0.80406    
## Density            -0.0076060  0.0059620  -1.276  0.20204    
## pH                  0.0025329  0.0059861   0.423  0.67220    
## Sulphates          -0.0025319  0.0059910  -0.423  0.67257    
## Alcohol             0.0263208  0.0059276   4.440 8.98e-06 ***
## LabelAppeal         0.2208770  0.0066038  33.447  < 2e-16 ***
## AcidIndex          -0.0428218  0.0066664  -6.424 1.33e-10 ***
## STARS               0.1045219  0.0063575  16.441  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.52762    0.02942  -51.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 22 
## Log-likelihood: -1.851e+04 on 16 Df
# Predict on training and testing sets
train_predictions <- predict(zip_model, train_data, type = "response")
test_predictions <- predict(zip_model, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Format metrics for display
formatted_metrics <- sprintf(
  "AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
  AIC(zip_model),
  BIC(zip_model),
  train_rmse,
  test_rmse,
  train_r2,
  test_r2
)

# Print metrics
cat(formatted_metrics)
## AIC: 37052.37
## BIC: 37168.11
## Train RMSE: 1.6483
## Test RMSE: 1.6585
## Train R²: 0.2658
## Test R²: 0.2667
# Residual plot for diagnostics
plot(residuals(zip_model), main = "Residuals of Zero-Inflated Poisson Model",
     xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)

ZINB: Zero-Inflated Negative Binomial regression.

# Zero-Inflated Negative Binomial (ZINB) Model
library(pscl)

# Fit ZINB model with a constant (intercept-only) zero-inflation component
zinb_model <- zeroinfl(TARGET ~ . | 1, 
                       data = train_data, 
                       dist = "negbin")
summary(zinb_model)
## 
## Call:
## zeroinfl(formula = TARGET ~ . | 1, data = train_data, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.5645 -0.2555  0.2289  0.5110  2.3414 
## 
## Count model coefficients (negbin with log link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.2555581  0.0066738 188.132  < 2e-16 ***
## FixedAcidity       -0.0008398  0.0061253  -0.137  0.89095    
## VolatileAcidity    -0.0160788  0.0060322  -2.666  0.00769 ** 
## CitricAcid          0.0010199  0.0059159   0.172  0.86313    
## ResidualSugar       0.0004023  0.0059177   0.068  0.94580    
## Chlorides          -0.0086504  0.0060573  -1.428  0.15326    
## FreeSulfurDioxide   0.0067196  0.0058980   1.139  0.25458    
## TotalSulfurDioxide  0.0014702  0.0059824   0.246  0.80588    
## Density            -0.0076168  0.0059619  -1.278  0.20140    
## pH                  0.0025502  0.0059861   0.426  0.67010    
## Sulphates          -0.0025297  0.0059910  -0.422  0.67285    
## Alcohol             0.0263471  0.0059276   4.445 8.80e-06 ***
## LabelAppeal         0.2208875  0.0066038  33.449  < 2e-16 ***
## AcidIndex          -0.0428380  0.0066663  -6.426 1.31e-10 ***
## STARS               0.1045255  0.0063575  16.441  < 2e-16 ***
## Log(theta)         14.1204074  8.4954034   1.662  0.09649 .  
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.52766    0.02942  -51.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 1356485.0954 
## Number of iterations in BFGS optimization: 37 
## Log-likelihood: -1.851e+04 on 17 Df
# Predict on training and testing sets
train_predictions <- predict(zinb_model, train_data, type = "response")
test_predictions <- predict(zinb_model, test_data, type = "response")

# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))

# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                 sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                sum((test_data$TARGET - mean(test_data$TARGET))^2))

# Format metrics for display
formatted_metrics <- sprintf(
  "AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
  AIC(zinb_model),
  BIC(zinb_model),
  train_rmse,
  test_rmse,
  train_r2,
  test_r2
)

# Print metrics
cat(formatted_metrics)
## AIC: 37054.39
## BIC: 37177.36
## Train RMSE: 1.6483
## Test RMSE: 1.6585
## Train R²: 0.2658
## Test R²: 0.2667
# Residual plot for diagnostics
plot(residuals(zinb_model), main = "Residuals of Zero-Inflated Negative Binomial Model",
     xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)

Multiple Linear Regression Models

Linear regression with Box-Cox-transformed variables (Full, Forward, Backward, Stepwise, Recursive, Polynomial, Interaction)

# Load necessary libraries
library(MASS)
library(dplyr)
library(car)
library(caret)
library(ggplot2)

# Function to calculate RMSE and R²
evaluate_model <- function(model, train_data, test_data) {
  # Predictions
  train_predictions <- predict(model, newdata = train_data)
  test_predictions <- predict(model, newdata = test_data)
  
  # RMSE
  train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
  test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
  
  # R²
  train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                   sum((train_data$TARGET - mean(train_data$TARGET))^2))
  test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                  sum((test_data$TARGET - mean(test_data$TARGET))^2))
  
  return(list(train_rmse = train_rmse, test_rmse = test_rmse, 
              train_r2 = train_r2, test_r2 = test_r2))
}

# 1. Base Linear Model with all predictors
base_model <- lm(TARGET ~ ., data = train_data)

# 2. Forward Selection
# Note: with scope = list(lower = ~1, upper = ~.), the "." is not expanded to the
# full set of predictors (it is resolved against the intercept-only model), so no
# terms are ever added and the model remains intercept-only. This explains the
# Forward model's zero R-squared in the comparison below.
forward_model <- step(
  lm(TARGET ~ 1, data = train_data), 
  scope = list(lower = ~1, upper = ~.), 
  direction = "forward"
)
## Start:  AIC=13398.02
## TARGET ~ 1
# 3. Backward Selection
backward_model <- step(
  lm(TARGET ~ ., data = train_data), 
  direction = "backward"
)
## Start:  AIC=9479.13
## TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS
## 
##                      Df Sum of Sq   RSS     AIC
## - FixedAcidity        1       2.4 25768  9478.1
## <none>                            25766  9479.1
## - CitricAcid          1       6.2 25772  9479.6
## - ResidualSugar       1       6.6 25773  9479.8
## - pH                  1       7.9 25774  9480.3
## - Density             1      10.0 25776  9481.1
## - Sulphates           1      11.3 25777  9481.6
## - Chlorides           1      26.5 25792  9487.7
## - Alcohol             1      40.1 25806  9493.1
## - FreeSulfurDioxide   1      42.9 25809  9494.2
## - TotalSulfurDioxide  1      53.8 25820  9498.5
## - VolatileAcidity     1     154.4 25920  9538.3
## - AcidIndex           1    1207.3 26973  9945.9
## - LabelAppeal         1    2334.3 28100 10365.0
## - STARS               1    4836.4 30602 11238.3
## 
## Step:  AIC=9478.09
## TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Density + pH + Sulphates + 
##     Alcohol + LabelAppeal + AcidIndex + STARS
## 
##                      Df Sum of Sq   RSS     AIC
## <none>                            25768  9478.1
## - CitricAcid          1       6.2 25775  9478.5
## - ResidualSugar       1       6.8 25775  9478.8
## - pH                  1       8.0 25776  9479.3
## - Density             1       9.9 25778  9480.0
## - Sulphates           1      11.6 25780  9480.7
## - Chlorides           1      26.4 25795  9486.6
## - Alcohol             1      40.2 25809  9492.1
## - FreeSulfurDioxide   1      42.7 25811  9493.0
## - TotalSulfurDioxide  1      54.1 25822  9497.6
## - VolatileAcidity     1     154.3 25923  9537.2
## - AcidIndex           1    1259.3 27028  9964.6
## - LabelAppeal         1    2333.9 28102 10363.7
## - STARS               1    4837.4 30606 11237.4
# 4. Stepwise Selection
stepwise_model <- step(
  lm(TARGET ~ ., data = train_data), 
  direction = "both"
)
## Start:  AIC=9479.13
## TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS
## 
##                      Df Sum of Sq   RSS     AIC
## - FixedAcidity        1       2.4 25768  9478.1
## <none>                            25766  9479.1
## - CitricAcid          1       6.2 25772  9479.6
## - ResidualSugar       1       6.6 25773  9479.8
## - pH                  1       7.9 25774  9480.3
## - Density             1      10.0 25776  9481.1
## - Sulphates           1      11.3 25777  9481.6
## - Chlorides           1      26.5 25792  9487.7
## - Alcohol             1      40.1 25806  9493.1
## - FreeSulfurDioxide   1      42.9 25809  9494.2
## - TotalSulfurDioxide  1      53.8 25820  9498.5
## - VolatileAcidity     1     154.4 25920  9538.3
## - AcidIndex           1    1207.3 26973  9945.9
## - LabelAppeal         1    2334.3 28100 10365.0
## - STARS               1    4836.4 30602 11238.3
## 
## Step:  AIC=9478.09
## TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Density + pH + Sulphates + 
##     Alcohol + LabelAppeal + AcidIndex + STARS
## 
##                      Df Sum of Sq   RSS     AIC
## <none>                            25768  9478.1
## - CitricAcid          1       6.2 25775  9478.5
## - ResidualSugar       1       6.8 25775  9478.8
## + FixedAcidity        1       2.4 25766  9479.1
## - pH                  1       8.0 25776  9479.3
## - Density             1       9.9 25778  9480.0
## - Sulphates           1      11.6 25780  9480.7
## - Chlorides           1      26.4 25795  9486.6
## - Alcohol             1      40.2 25809  9492.1
## - FreeSulfurDioxide   1      42.7 25811  9493.0
## - TotalSulfurDioxide  1      54.1 25822  9497.6
## - VolatileAcidity     1     154.3 25923  9537.2
## - AcidIndex           1    1259.3 27028  9964.6
## - LabelAppeal         1    2333.9 28102 10363.7
## - STARS               1    4837.4 30606 11237.4
# 5. Recursive Feature Elimination (RFE)
set.seed(123)
control <- rfeControl(functions = lmFuncs, method = "cv", number = 5)
rfe_model <- rfe(
  x = train_data %>% dplyr::select(-TARGET), 
  y = train_data$TARGET,
  sizes = c(1:10),
  rfeControl = control
)
selected_vars <- rfe_model$optVariables

# Fit a new model with selected variables
rfe_model_final <- lm(
  TARGET ~ ., 
  data = train_data %>% dplyr::select(all_of(c(selected_vars, "TARGET")))
)

# 6. Polynomial Model
# Note: referencing poly_terms$... directly in the formula ties the fit to the
# stored training-length bases, so predict() on a differently sized newdata falls
# back on those training values (hence the "'newdata' had 2557 rows" warnings
# below and the poor test R-squared). Writing poly(FixedAcidity, 2), etc., inside
# the formula would avoid this.
poly_terms <- list(
  FixedAcidity = poly(train_data$FixedAcidity, 2),
  VolatileAcidity = poly(train_data$VolatileAcidity, 2),
  ResidualSugar = poly(train_data$ResidualSugar, 2),
  pH = poly(train_data$pH, 2),
  Alcohol = poly(train_data$Alcohol, 2)
)

# Fit the polynomial model using saved transformations
polynomial_model <- lm(
  TARGET ~ poly_terms$FixedAcidity + poly_terms$VolatileAcidity +
    poly_terms$ResidualSugar + poly_terms$pH + poly_terms$Alcohol,
  data = train_data
)

# 7. Interaction Model
interaction_model <- lm(
  TARGET ~ FixedAcidity * VolatileAcidity + ResidualSugar * pH + Alcohol * LabelAppeal, 
  data = train_data
)

# Evaluate Models
model_list <- list(
  Base = base_model,
  Forward = forward_model,
  Backward = backward_model,
  Stepwise = stepwise_model,
  RFE = rfe_model_final,
  Polynomial = polynomial_model,
  Interaction = interaction_model
)

# Initialize storage for metrics
metrics <- data.frame(
  Model = character(),
  AIC = numeric(),
  Train_RMSE = numeric(),
  Test_RMSE = numeric(),
  Train_R2 = numeric(),
  Test_R2 = numeric()
)

# Collect metrics for each model
for (model_name in names(model_list)) {
  model <- model_list[[model_name]]
  model_metrics <- evaluate_model(model, train_data, test_data)
  metrics <- rbind(metrics, data.frame(
    Model = model_name,
    AIC = AIC(model),
    Train_RMSE = round(model_metrics$train_rmse, 4),
    Test_RMSE = round(model_metrics$test_rmse, 4),
    Train_R2 = round(model_metrics$train_r2, 4),
    Test_R2 = round(model_metrics$test_r2, 4)
  ))
}
## Warning: 'newdata' had 2557 rows but variables found have 10238 rows
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
# Print metrics
print(metrics)
##         Model      AIC Train_RMSE Test_RMSE Train_R2 Test_R2
## 1        Base 38535.31     1.5864    1.5950   0.3199  0.3218
## 2     Forward 42454.21     1.9237    1.9368   0.0000  0.0000
## 3    Backward 38534.28     1.5865    1.5952   0.3198  0.3216
## 4    Stepwise 38534.28     1.5865    1.5952   0.3198  0.3216
## 5         RFE 38535.31     1.5864    1.5950   0.3199  0.3218
## 6  Polynomial 42312.79     1.9086    1.9507   0.0156 -3.0614
## 7 Interaction 40932.92     1.7844    1.7998   0.1396  0.1364
# Plot AIC Comparison
ggplot(metrics, aes(x = reorder(Model, AIC), y = AIC)) +
  geom_bar(stat = "identity", fill = "steelblue") +
  coord_flip() +
  labs(title = "AIC Comparison of Models", x = "Model", y = "AIC")

# Select and Summarize the Best Model
best_model_name <- metrics$Model[which.min(metrics$AIC)]
cat("\nBest Model: ", best_model_name, "\n")
## 
## Best Model:  Backward
summary(model_list[[best_model_name]])
## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, 
##     data = train_data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.1613 -0.8565  0.3150  1.1062  4.9772 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.03343    0.01569 193.321  < 2e-16 ***
## VolatileAcidity    -0.12264    0.01567  -7.825 5.58e-15 ***
## CitricAcid          0.02459    0.01571   1.565  0.11763    
## ResidualSugar       0.02569    0.01562   1.645  0.10002    
## Chlorides          -0.05111    0.01579  -3.238  0.00121 ** 
## FreeSulfurDioxide   0.06479    0.01575   4.114 3.92e-05 ***
## TotalSulfurDioxide  0.07285    0.01573   4.632 3.66e-06 ***
## Density            -0.03091    0.01561  -1.980  0.04768 *  
## pH                 -0.02789    0.01569  -1.778  0.07551 .  
## Sulphates          -0.03362    0.01569  -2.143  0.03212 *  
## Alcohol             0.06282    0.01572   3.995 6.52e-05 ***
## LabelAppeal         0.49598    0.01630  30.430  < 2e-16 ***
## AcidIndex          -0.35467    0.01587 -22.353  < 2e-16 ***
## STARS               0.71786    0.01639  43.810  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.588 on 10224 degrees of freedom
## Multiple R-squared:  0.3198, Adjusted R-squared:  0.319 
## F-statistic: 369.8 on 13 and 10224 DF,  p-value: < 2.2e-16
# Residual Plot for the Best Model
best_model <- model_list[[best_model_name]]
plot(residuals(best_model),
     main = paste("Residuals of", best_model_name, "Model"),
     xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)

Robust regression using rlm with Huber loss

# Load necessary libraries
library(MASS)
library(dplyr)
library(Metrics)  # For RMSE computation
## 
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
## 
##     precision, recall
# Fit robust regression using rlm, requesting Huber loss
# Note: with method = "MM", rlm runs an initial S-estimate followed by M-estimation
# with Tukey's biweight, so the psi.huber/k settings are effectively ignored; this
# is why the "Huber" and "default" fits below produce nearly identical metrics.
huber_model <- rlm(TARGET ~ ., data = train_data, psi = psi.huber, k = 1.5, method = "MM")

# Fit robust regression using rlm with default settings
default_model <- rlm(TARGET ~ ., data = train_data, method = "MM")

# Compute predictions for both models (train and test)
huber_train_predictions <- predict(huber_model, newdata = train_data)
huber_test_predictions <- predict(huber_model, newdata = test_data)

default_train_predictions <- predict(default_model, newdata = train_data)
default_test_predictions <- predict(default_model, newdata = test_data)

# Compute residuals
huber_train_residuals <- train_data$TARGET - huber_train_predictions
huber_test_residuals <- test_data$TARGET - huber_test_predictions

default_train_residuals <- train_data$TARGET - default_train_predictions
default_test_residuals <- test_data$TARGET - default_test_predictions

# Compute RMSE for both models (train and test)
huber_train_rmse <- rmse(train_data$TARGET, huber_train_predictions)
huber_test_rmse <- rmse(test_data$TARGET, huber_test_predictions)

default_train_rmse <- rmse(train_data$TARGET, default_train_predictions)
default_test_rmse <- rmse(test_data$TARGET, default_test_predictions)

# Compute AIC and BIC for both models on the training data
# (Gaussian log-likelihood approximation, up to an additive constant; values are
#  comparable between these two fits but not with the glm-based AIC values above)
compute_aic_bic <- function(model, residuals, n) {
  rss <- sum(residuals^2)   # Residual sum of squares
  k <- length(coef(model))  # Number of estimated coefficients
  aic <- n * log(rss / n) + 2 * k       # AIC under Gaussian errors
  bic <- n * log(rss / n) + k * log(n)  # BIC under Gaussian errors
  return(list(AIC = aic, BIC = bic))
}

huber_metrics <- compute_aic_bic(huber_model, huber_train_residuals, nrow(train_data))
default_metrics <- compute_aic_bic(default_model, default_train_residuals, nrow(train_data))

# Output metrics
cat("\nHuber Model Metrics (Training):\n")
## 
## Huber Model Metrics (Training):
cat("Train RMSE:", huber_train_rmse, "\n")
## Train RMSE: 1.629752
cat("Test RMSE:", huber_test_rmse, "\n")
## Test RMSE: 1.644409
cat("AIC:", huber_metrics$AIC, "\n")
## AIC: 10031.05
cat("BIC:", huber_metrics$BIC, "\n")
## BIC: 10139.56
cat("\nDefault Model Metrics (Training):\n")
## 
## Default Model Metrics (Training):
cat("Train RMSE:", default_train_rmse, "\n")
## Train RMSE: 1.629752
cat("Test RMSE:", default_test_rmse, "\n")
## Test RMSE: 1.644408
cat("AIC:", default_metrics$AIC, "\n")
## AIC: 10031.04
cat("BIC:", default_metrics$BIC, "\n")
## BIC: 10139.55

The AIC comparison plot shows the AIC values for the linear regression models fitted above. A lower AIC indicates a better-fitting model. In this case:

  1. Base Model:
    • The Base model (AIC = 38535.31) is essentially tied with the Backward and Stepwise models. Because it retains all predictors, it is simple to explain and directly interpretable.
    • The full model (Base) is used when interpretability is prioritized, as it offers straightforward insight into the impact of each predictor on the target variable.
  2. Variable Selection Models:
    • The Backward and Stepwise selections converge on the same reduced model (AIC = 38534.28), dropping only FixedAcidity, so variable selection yields only a marginal improvement here.
    • The Forward selection (AIC = 42454.21) never adds a term because of the scope issue noted in the code above, so its results should be disregarded.
  3. Interaction and Polynomial Models:
    • The Interaction (AIC = 40932.92) and Polynomial (AIC = 42312.79) models have clearly higher AIC values, so the added interaction terms and polynomial transformations do not improve the fit.
    • These models are also more complex and less interpretable, making them less desirable unless such terms are critical for addressing specific research questions.

Conclusion:

The choice between these models depends on the balance between fit and interpretability:

  • The full Base model is preferred for interpretability, as it allows clear and direct analysis of the coefficients.
  • The Backward or Stepwise model offers a marginally lower AIC with one fewer predictor and is a viable alternative.
  • Interaction or polynomial models should be considered only if theoretical justification for their use exists.

Variable Selection

  • Use both manual selection and stepwise approaches based on AIC/BIC.
  • Consider techniques like LASSO or random forests to rank variables by importance.
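
As a hedged illustration of the second bullet, the sketch below ranks the predictors with a cross-validated LASSO and with random forest importance. It assumes the glmnet and randomForest packages are installed and, for simplicity, uses complete cases from the (already transformed) train_data.

# Illustrative sketch (assumes glmnet and randomForest are installed):
# rank predictors by LASSO coefficients and by random forest importance.
library(glmnet)
library(randomForest)

# Both methods require complete numeric inputs, so use complete cases here.
cc <- train_data[complete.cases(train_data), ]
x <- as.matrix(cc[, setdiff(names(cc), "TARGET")])
y <- cc$TARGET

# LASSO with cross-validated penalty; non-zero coefficients mark retained predictors.
set.seed(123)
lasso_cv <- cv.glmnet(x, y, alpha = 1)
print(coef(lasso_cv, s = "lambda.1se"))

# Random forest importance as an alternative ranking of the predictors.
set.seed(123)
rf_fit <- randomForest(x, y, importance = TRUE, ntree = 200)
print(round(importance(rf_fit), 2))

# BIC-flavoured stepwise selection can be obtained by setting k = log(n):
# step(lm(TARGET ~ ., data = train_data), direction = "both", k = log(nrow(train_data)))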

4. Model Diagnostics

  • For all models:
    • Assess goodness of fit using residual deviance and overdispersion (dispersion parameter).
    • Plot residuals against fitted values to look for systematic patterns.
    • Use diagnostic plots such as Q-Q plots for the residuals (a short sketch follows the dispersion check below).
  • For Poisson and Negative Binomial:
    • Compare AIC values for model selection.
    • Evaluate the dispersion statistic to detect overdispersion.
dispersion_poisson <- sum(residuals(poisson_model_all, type = "pearson")^2) / poisson_model_all$df.residual
print(paste("Dispersion for Poisson Model:", dispersion_poisson))
## [1] "Dispersion for Poisson Model: 0.944921552951479"
dispersion_nb <- sum(residuals(nb_model_all, type = "pearson")^2) / nb_model_all$df.residual
print(paste("Dispersion for Negative Binomial Model:", dispersion_nb))
## [1] "Dispersion for Negative Binomial Model: 0.94486483473297"

Model Comparisons

  • Compare coefficients across models (e.g., impact and magnitude of STARS and LabelAppeal).
  • Evaluate whether predictors have consistent effects across models (positive, negative, or negligible).
  • Use likelihood ratio tests or Wald tests to compare nested models.
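
As a brief, hedged illustration, the sketch below runs a likelihood-ratio test between the stepwise and full Negative Binomial fits (the stepwise model is nested in the full one) and lines up the STARS and LabelAppeal coefficients from two of the fitted models.

# Likelihood-ratio test for nested Negative Binomial models (stepwise vs. full)
anova(nb_model_stepwise, nb_model_all)

# Side-by-side look at two key coefficients across model families
rbind(
  NegBin_Stepwise = coef(nb_model_stepwise)[c("STARS", "LabelAppeal")],
  Linear_Backward = coef(backward_model)[c("STARS", "LabelAppeal")]
)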

Based on the AIC values in the consolidated comparison table below:

Best Model Based on AIC:

  • Zero-Inflated Poisson Regression (ZIP) has the lowest AIC (37052.37), indicating the best fit among the models compared; a Vuong test sketch comparing it against the standard Poisson model follows.
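
Because ZIP and the standard Poisson model are not nested, a likelihood-ratio test does not apply; the sketch below instead uses pscl's vuong() test, assuming the full Poisson fit from the earlier Poisson section (poisson_model_all) is still in the workspace.

# Vuong non-nested test: zero-inflated Poisson vs. standard Poisson
library(pscl)
vuong(zip_model, poisson_model_all)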

Best Model Based on Interpretability:

  • While ZIP has the lowest AIC, interpretability may depend on simplicity and clarity of the coefficients:
    • Multiple Regression (the backward-selected linear model, AIC = 38534.28 in the consolidated table below) provides easily interpretable coefficients for individual predictors such as STARS and LabelAppeal, which is useful for explaining the relationships between the predictors and the target variable.

Recommendation:

  • If model performance is the primary goal, ZIP is the best choice as it minimizes AIC, indicating a better trade-off between model complexity and goodness-of-fit.
  • If interpretability is more important (e.g., explaining to non-technical stakeholders), Multiple Regression is preferred, as it avoids the additional complexity of modeling zero-inflation and allows clear insights into the predictors’ effects.

Both models should be considered in light of the specific goals of the analysis:

  • For deeper insight into the predictors' individual contributions: choose Multiple Regression.
  • For a focus on prediction accuracy: choose Zero-Inflated Poisson Regression.

We selected the following models based on a balance of predictive performance (Test RMSE, from the consolidated table below) and interpretability:

  1. Poisson (Stepwise): Test RMSE of 1.5785, essentially matching the full Poisson model (1.5783), while retaining fewer predictors for easier interpretation.
  2. Negative Binomial (Stepwise): Test RMSE of 1.5785, effectively matching the full Negative Binomial model (1.5783) but improving interpretability by focusing on significant predictors.
  3. Zero-Inflated Poisson: Test RMSE of 1.6585, the same as ZINB, but preferred for its relative simplicity and ease of explanation.
  4. Linear Regression (Backward): Test RMSE of 1.5952, essentially matching the full linear model (1.5950) while using a reduced predictor set for better interpretability.

These models strike an effective balance between predictive accuracy and simplicity, enhancing their explanatory utility.

# Initialize storage for metrics across all models
final_metrics <- data.frame(
  Model_Type = character(),
  Model = character(),
  AIC = numeric(),
  Train_RMSE = numeric(),
  Test_RMSE = numeric(),
  Train_R2 = numeric(),
  Test_R2 = numeric()
)

# Define a function to calculate evaluation metrics
evaluate_and_store <- function(model, model_name, model_type, train_data, test_data) {
  train_predictions <- predict(model, newdata = train_data, type = "response")
  test_predictions <- predict(model, newdata = test_data, type = "response")
  
  train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
  test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
  
  train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
                   sum((train_data$TARGET - mean(train_data$TARGET))^2))
  test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
                  sum((test_data$TARGET - mean(test_data$TARGET))^2))
  
  final_metrics <<- rbind(
    final_metrics, 
    data.frame(
      Model_Type = model_type,
      Model = model_name,
      AIC = round(AIC(model), 2),
      Train_RMSE = round(train_rmse, 4),
      Test_RMSE = round(test_rmse, 4),
      Train_R2 = round(train_r2, 4),
      Test_R2 = round(test_r2, 4)
    )
  )
}

# Register all fitted models for the consolidated comparison
model_list <- list(
  list(name = "Poisson (All Predictors)", type = "Poisson Regression", model = poisson_model_all),
  list(name = "Poisson (Stepwise)", type = "Poisson Regression", model = poisson_model_stepwise),
  list(name = "Negative Binomial (All Predictors)", type = "Negative Binomial Regression", model = nb_model_all),
  list(name = "Negative Binomial (Stepwise)", type = "Negative Binomial Regression", model = nb_model_stepwise),
  list(name = "Zero-Inflated Poisson", type = "Zero-Inflated Models", model = zip_model),
  list(name = "Zero-Inflated Negative Binomial", type = "Zero-Inflated Models", model = zinb_model),
  list(name = "Linear Regression (Base)", type = "Multiple Linear Regression", model = base_model),
  list(name = "Linear Regression (Forward)", type = "Multiple Linear Regression", model = forward_model),
  list(name = "Linear Regression (Backward)", type = "Multiple Linear Regression", model = backward_model),
  list(name = "Linear Regression (Stepwise)", type = "Multiple Linear Regression", model = stepwise_model),
  list(name = "Linear Regression (Polynomial)", type = "Multiple Linear Regression", model = polynomial_model),
  list(name = "Linear Regression (Interaction)", type = "Multiple Linear Regression", model = interaction_model)
)

# Loop through each model and calculate/store metrics
for (model_entry in model_list) {
  evaluate_and_store(
    model = model_entry$model,
    model_name = model_entry$name,
    model_type = model_entry$type,
    train_data = train_data,
    test_data = test_data
  )
}
## Warning: 'newdata' had 2557 rows but variables found have 10238 rows
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
# Sort the consolidated metrics by Test RMSE in ascending order
final_metrics_sorted <- final_metrics %>%
  arrange(Test_RMSE)

# Display the final consolidated metrics table
print(final_metrics_sorted)
##                      Model_Type                              Model      AIC
## 1            Poisson Regression           Poisson (All Predictors) 39846.53
## 2  Negative Binomial Regression Negative Binomial (All Predictors) 39848.66
## 3            Poisson Regression                 Poisson (Stepwise) 39845.64
## 4  Negative Binomial Regression       Negative Binomial (Stepwise) 39847.77
## 5    Multiple Linear Regression           Linear Regression (Base) 38535.31
## 6    Multiple Linear Regression       Linear Regression (Backward) 38534.28
## 7    Multiple Linear Regression       Linear Regression (Stepwise) 38534.28
## 8          Zero-Inflated Models              Zero-Inflated Poisson 37052.37
## 9          Zero-Inflated Models    Zero-Inflated Negative Binomial 37054.39
## 10   Multiple Linear Regression    Linear Regression (Interaction) 40932.92
## 11   Multiple Linear Regression        Linear Regression (Forward) 42454.21
## 12   Multiple Linear Regression     Linear Regression (Polynomial) 42312.79
##    Train_RMSE Test_RMSE Train_R2 Test_R2
## 1      1.5723    1.5783   0.3319  0.3359
## 2      1.5723    1.5783   0.3319  0.3359
## 3      1.5724    1.5785   0.3318  0.3358
## 4      1.5724    1.5785   0.3318  0.3358
## 5      1.5864    1.5950   0.3199  0.3218
## 6      1.5865    1.5952   0.3198  0.3216
## 7      1.5865    1.5952   0.3198  0.3216
## 8      1.6483    1.6585   0.2658  0.2667
## 9      1.6483    1.6585   0.2658  0.2667
## 10     1.7844    1.7998   0.1396  0.1364
## 11     1.9237    1.9368   0.0000  0.0000
## 12     1.9086    1.9507   0.0156 -3.0614
# Optionally, visualize the AIC Comparison
library(ggplot2)
ggplot(final_metrics, aes(x = reorder(Model, AIC), y = AIC, fill = Model_Type)) +
  geom_bar(stat = "identity") +
  coord_flip() +
  labs(title = "Model AIC Comparison", x = "Model", y = "AIC") +
  theme_minimal()

Predict and save the predicted values

Because the target variable (TARGET) in the evaluation dataset is entirely NA and is not used for prediction, it is removed before predictions are generated. In addition, some predictors in the evaluation dataset may be stored with different data types than in the training dataset (e.g., factors instead of numeric), so the data types must be aligned before calling predict().

The preparation therefore involves three steps:

  1. Remove the TARGET column from the evaluation dataset, since it is not used for prediction.
  2. Check and align the predictor data types between the evaluation dataset and the training dataset (a hedged sketch follows this list).
  3. Generate and save predictions for each of the selected models.
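
A minimal sketch of step 2 is shown below, assuming boxcox_eval_data has already received the same transformations as the training data; the helper name align_types is hypothetical and simply coerces evaluation columns to the classes observed in training.

# Hypothetical helper (illustrative): coerce evaluation columns to the classes
# observed in the training data so predict() does not fail on type mismatches.
align_types <- function(eval_df, train_df) {
  common_cols <- intersect(names(eval_df), names(train_df))
  for (col in common_cols) {
    if (is.factor(train_df[[col]]) && !is.factor(eval_df[[col]])) {
      eval_df[[col]] <- factor(eval_df[[col]], levels = levels(train_df[[col]]))
    } else if (is.numeric(train_df[[col]]) && !is.numeric(eval_df[[col]])) {
      eval_df[[col]] <- as.numeric(as.character(eval_df[[col]]))
    }
  }
  eval_df
}

# Example usage (object names as used in the chunks below):
# boxcox_eval_data <- align_types(boxcox_eval_data, train_data)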

Updated Code Blocks

Common Data Preparation for Evaluation Dataset

1. Poisson (Stepwise)

# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
  dplyr::select(-TARGET, -INDEX) %>%  # Drop TARGET and INDEX columns
  dplyr::mutate(ID = boxcox_eval_data$INDEX)  # Retain INDEX for prediction alignment

# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity       <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity    <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid         <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar      <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides          <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide  <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density            <dbl[,1]> <matrix[26 x 1]>
## $ pH                 <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates          <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol            <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal        <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex          <dbl[,1]> <matrix[26 x 1]>
## $ STARS              <dbl[,1]> <matrix[26 x 1]>
## $ ID                 <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using the Poisson (Stepwise) model
poisson_stepwise_predictions <- predict(poisson_model_stepwise, newdata = eval_data_prepared, type = "response")

# Create a DataFrame with predictions
poisson_predictions_df <- data.frame(
  ID = eval_data_prepared$ID,  # Add back the INDEX as ID
  Predictions = poisson_stepwise_predictions
)

# Save predictions to CSV
write.csv(poisson_predictions_df, "poisson_stepwise_predictions.csv", row.names = FALSE)
cat("Poisson (Stepwise) predictions saved to 'poisson_stepwise_predictions.csv'\n")
## Poisson (Stepwise) predictions saved to 'poisson_stepwise_predictions.csv'

2. Negative Binomial (Stepwise)

# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
  dplyr::select(-TARGET, -INDEX) %>%  # Drop TARGET and INDEX columns
  dplyr::mutate(ID = boxcox_eval_data$INDEX)  # Retain INDEX for prediction alignment

# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity       <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity    <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid         <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar      <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides          <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide  <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density            <dbl[,1]> <matrix[26 x 1]>
## $ pH                 <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates          <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol            <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal        <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex          <dbl[,1]> <matrix[26 x 1]>
## $ STARS              <dbl[,1]> <matrix[26 x 1]>
## $ ID                 <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using the Negative Binomial (Stepwise) model
nb_stepwise_predictions <- predict(nb_model_stepwise, newdata = eval_data_prepared, type = "response")

# Create a DataFrame with predictions
nb_predictions_df <- data.frame(
  ID = eval_data_prepared$ID,  # Add back the INDEX as ID
  Predictions = nb_stepwise_predictions
)

# Save predictions to CSV
write.csv(nb_predictions_df, "nb_stepwise_predictions.csv", row.names = FALSE)
cat("Negative Binomial (Stepwise) predictions saved to 'nb_stepwise_predictions.csv'\n")
## Negative Binomial (Stepwise) predictions saved to 'nb_stepwise_predictions.csv'

3. Zero-Inflated Poisson

# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
  dplyr::select(-TARGET, -INDEX) %>%  # Drop TARGET and INDEX columns
  dplyr::mutate(ID = boxcox_eval_data$INDEX)  # Retain INDEX for prediction alignment

# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity       <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity    <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid         <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar      <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides          <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide  <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density            <dbl[,1]> <matrix[26 x 1]>
## $ pH                 <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates          <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol            <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal        <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex          <dbl[,1]> <matrix[26 x 1]>
## $ STARS              <dbl[,1]> <matrix[26 x 1]>
## $ ID                 <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using the Zero-Inflated Poisson Model
zip_predictions <- predict(zip_model, newdata = eval_data_prepared, type = "response")

# Create a DataFrame with predictions
zip_predictions_df <- data.frame(
  ID = eval_data_prepared$ID,  # Add back the INDEX as ID
  Predictions = zip_predictions
)

# Save predictions to CSV
write.csv(zip_predictions_df, "zip_predictions.csv", row.names = FALSE)
cat("Zero-Inflated Poisson predictions saved to 'zip_predictions.csv'\n")
## Zero-Inflated Poisson predictions saved to 'zip_predictions.csv'

4. Linear Regression (Backward)

# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
  dplyr::select(-TARGET, -INDEX) %>%  # Drop TARGET and INDEX columns
  dplyr::mutate(ID = boxcox_eval_data$INDEX)  # Retain INDEX for prediction alignment

# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity       <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity    <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid         <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar      <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides          <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide  <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density            <dbl[,1]> <matrix[26 x 1]>
## $ pH                 <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates          <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol            <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal        <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex          <dbl[,1]> <matrix[26 x 1]>
## $ STARS              <dbl[,1]> <matrix[26 x 1]>
## $ ID                 <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using Linear Regression (Backward) Model
lm_backward_predictions <- predict(backward_model, newdata = eval_data_prepared)

# Create a DataFrame with predictions
lm_predictions_df <- data.frame(
  ID = eval_data_prepared$ID,  # Add back the INDEX as ID
  Predictions = lm_backward_predictions
)

# Save predictions to CSV
write.csv(lm_predictions_df, "lm_backward_predictions.csv", row.names = FALSE)
cat("Linear Regression (Backward) predictions saved to 'lm_backward_predictions.csv'\n")
## Linear Regression (Backward) predictions saved to 'lm_backward_predictions.csv'

Comparison and Evaluation of Model Predictions Across Selected Models

The pairwise scatter plot illustrates the predictions from the four selected models: Poisson (Stepwise), Negative Binomial (Stepwise), Zero-Inflated Poisson, and Linear Regression (Backward). Here are the key takeaways:

  1. Prediction Alignment Across Count-Based Models:
    • The Poisson (Stepwise), Negative Binomial (Stepwise), and Zero-Inflated Poisson models show strong agreement in their predictions, as evident from the tight linear patterns in their pairwise plots. This indicates that these models, tailored for count data, produce consistent results despite slight methodological differences.
  2. Deviation in Linear Regression Predictions:
    • The predictions from the Linear Regression (Backward) model deviate somewhat from the count-based models. This is expected since Linear Regression assumes continuous outcomes, which can lead to predictions that do not strictly adhere to the discrete nature of the response variable.
  3. Insights from Range and Spread:
    • While the prediction ranges are comparable across models, some spread in the Linear Regression predictions reflects its distinct assumption set and behavior under the same data conditions.
  4. Overall Agreement and Variability:
    • The models predominantly align well, but the differences observed in the Linear Regression predictions underscore its reduced suitability for count data and highlight the robustness of the count-based models in this context.

In summary, this comparison underlines the consistency of the count-based models for predictive tasks, while Linear Regression, though interpretable, shows less agreement with models designed specifically for discrete data.

# Combine predictions into a single data frame
predictions_df <- data.frame(
  Poisson = poisson_predictions_df$Predictions,
  NegativeBinomial = nb_predictions_df$Predictions,
  ZeroInflatedPoisson = zip_predictions_df$Predictions,
  LinearRegression = lm_predictions_df$Predictions
)

# Display summary of predictions
summary(predictions_df)
##     Poisson      NegativeBinomial ZeroInflatedPoisson LinearRegression 
##  Min.   :1.071   Min.   :1.071    Min.   :1.253       Min.   :0.09155  
##  1st Qu.:2.179   1st Qu.:2.179    1st Qu.:2.415       1st Qu.:2.23129  
##  Median :2.790   Median :2.790    Median :2.909       Median :2.97524  
##  Mean   :3.030   Mean   :3.030    Mean   :2.993       Mean   :3.03343  
##  3rd Qu.:3.674   3rd Qu.:3.674    3rd Qu.:3.444       3rd Qu.:3.80676  
##  Max.   :9.496   Max.   :9.496    Max.   :6.560       Max.   :6.68342
# Generate pairwise scatter plots
pairs(
  predictions_df, 
  main = "Pairwise Scatter Plot of Model Predictions",
  pch = 21, 
  bg = c("blue", "green", "red", "purple")
)
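
To put a number on the agreement described above, the short sketch below computes the correlation matrix of the four prediction vectors in predictions_df; values close to 1 correspond to the tight linear patterns in the pairwise plot.

# Correlation matrix of the model predictions (quantifies pairwise agreement)
round(cor(predictions_df), 3)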