This document outlines the analysis and modeling of the wine dataset using R. The goal is to predict the number of wine cases sold based on chemical and marketing features.
# Load required libraries
library(ggplot2)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyr)
library(pscl)
## Classes and Methods for R originally developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University (2002-2015),
## by and under the direction of Simon Jackman.
## hurdle and zeroinfl functions by Achim Zeileis.
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# Load data
train_data <- read.csv("wine-training-data.csv")
eval_data <- read.csv("wine-evaluation-data.csv")
cat("\nStructure of the training data\n")
##
## Structure of the training data
str(train_data)
## 'data.frame': 12795 obs. of 16 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET : int 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
## $ VolatileAcidity : num 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
## $ CitricAcid : num -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
## $ ResidualSugar : num 54.2 26.1 14.8 18.8 9.4 ...
## $ Chlorides : num -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
## $ FreeSulfurDioxide : num NA 15 214 22 -167 -37 287 523 -213 62 ...
## $ TotalSulfurDioxide: num 268 -327 142 115 108 15 156 551 NA 180 ...
## $ Density : num 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Sulphates : num -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
## $ Alcohol : num 9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : int 2 3 3 1 2 NA NA 3 NA 4 ...
cat("\nStructure of the evaluation data\n")
##
## Structure of the evaluation data
str(eval_data)
## 'data.frame': 3335 obs. of 16 variables:
## $ INDEX : int 3 9 10 18 21 30 31 37 39 47 ...
## $ TARGET : logi NA NA NA NA NA NA ...
## $ FixedAcidity : num 5.4 12.4 7.2 6.2 11.4 17.6 15.5 15.9 11.6 3.8 ...
## $ VolatileAcidity : num -0.86 0.385 1.75 0.1 0.21 0.04 0.53 1.19 0.32 0.22 ...
## $ CitricAcid : num 0.27 -0.76 0.17 1.8 0.28 -1.15 -0.53 1.14 0.55 0.31 ...
## $ ResidualSugar : num -10.7 -19.7 -33 1 1.2 1.4 4.6 31.9 -50.9 -7.7 ...
## $ Chlorides : num 0.092 1.169 0.065 -0.179 0.038 ...
## $ FreeSulfurDioxide : num 23 -37 9 104 70 -250 10 115 35 40 ...
## $ TotalSulfurDioxide: num 398 68 76 89 53 140 17 381 83 129 ...
## $ Density : num 0.985 0.99 1.046 0.989 1.029 ...
## $ pH : num 5.02 3.37 4.61 3.2 2.54 3.06 3.07 2.99 3.32 4.72 ...
## $ Sulphates : num 0.64 1.09 0.68 2.11 -0.07 -0.02 0.75 0.31 2.18 -0.64 ...
## $ Alcohol : num 12.3 16 8.55 12.3 4.8 11.4 8.5 11.4 -0.5 10.9 ...
## $ LabelAppeal : int -1 0 0 -1 0 1 0 1 0 0 ...
## $ AcidIndex : int 6 6 8 8 10 8 12 7 12 7 ...
## $ STARS : int NA 2 1 1 NA 4 3 NA NA NA ...
# Load necessary libraries
library(kableExtra)
##
## Attaching package: 'kableExtra'
## The following object is masked from 'package:dplyr':
##
## group_rows
library(dplyr)
library(tidyr)
library(moments) # For skewness and kurtosis
# Create vectors of numeric and categorical variables
numeric_vars <- names(train_data)[sapply(train_data, is.numeric)]
categorical_vars <- names(train_data)[sapply(train_data, is.factor)]
# Correctly select numerical variables using the predefined numeric_vars
numerical_vars <- train_data %>%
dplyr::select(all_of(numeric_vars))
# Compute statistical summary including skewness, kurtosis, missing value counts, and percentages
statistical_summary <- numerical_vars %>%
summarise(across(
everything(),
list(
Min = ~round(min(., na.rm = TRUE), 2),
Q1 = ~round(quantile(., 0.25, na.rm = TRUE), 2),
Mean = ~round(mean(., na.rm = TRUE), 2),
Median = ~round(median(., na.rm = TRUE), 2),
Q3 = ~round(quantile(., 0.75, na.rm = TRUE), 2),
Max = ~round(max(., na.rm = TRUE), 2),
SD = ~round(sd(., na.rm = TRUE), 2),
Variance = ~round(var(., na.rm = TRUE), 2),
CV = ~round(sd(., na.rm = TRUE) / mean(., na.rm = TRUE), 2),
Skewness = ~round(skewness(., na.rm = TRUE), 2),
Kurtosis = ~round(kurtosis(., na.rm = TRUE), 2),
ZeroCount = ~sum(. == 0, na.rm = TRUE),
ZeroProportion = ~round(mean(. == 0, na.rm = TRUE) * 100, 2),
Missing = ~sum(is.na(.)),
PercentMissing = ~round(mean(is.na(.)) * 100, 2),
Overdispersion = ~round(var(., na.rm = TRUE) / mean(., na.rm = TRUE), 2),
Outliers = ~sum(. > (quantile(., 0.75, na.rm = TRUE) + 1.5 * IQR(., na.rm = TRUE)) |
. < (quantile(., 0.25, na.rm = TRUE) - 1.5 * IQR(., na.rm = TRUE)), na.rm = TRUE)
),
.names = "{.col}_{.fn}"
)) %>%
pivot_longer(
cols = everything(),
names_to = c("Variable", ".value"),
names_pattern = "^(.*)_(.*)$"
)
# Display the resulting summary table
statistical_summary %>%
kable(caption = "Summary of Numerical Variables (Including Skewness, Kurtosis, Missing Counts and Percentages)") %>%
kable_styling(bootstrap_options = c("striped", "hover", "condensed"), full_width = FALSE)
Variable | Min | Q1 | Mean | Median | Q3 | Max | SD | Variance | CV | Skewness | Kurtosis | ZeroCount | ZeroProportion | Missing | PercentMissing | Overdispersion | Outliers |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
INDEX | 1.00 | 4037.50 | 8069.98 | 8110.00 | 12106.50 | 16129.00 | 4656.91 | 21686765.18 | 0.58 | 0.00 | 1.80 | 0 | 0.00 | 0 | 0.00 | 2687.34 | 0 |
TARGET | 0.00 | 2.00 | 3.03 | 3.00 | 4.00 | 8.00 | 1.93 | 3.71 | 0.64 | -0.33 | 2.12 | 2734 | 21.37 | 0 | 0.00 | 1.23 | 17 |
FixedAcidity | -18.10 | 5.20 | 7.08 | 6.90 | 9.50 | 34.40 | 6.32 | 39.91 | 0.89 | -0.02 | 4.68 | 39 | 0.30 | 0 | 0.00 | 5.64 | 2455 |
VolatileAcidity | -2.79 | 0.13 | 0.32 | 0.28 | 0.64 | 3.68 | 0.78 | 0.61 | 2.42 | 0.02 | 4.83 | 18 | 0.14 | 0 | 0.00 | 1.90 | 2599 |
CitricAcid | -3.24 | 0.03 | 0.31 | 0.31 | 0.58 | 3.86 | 0.86 | 0.74 | 2.80 | -0.05 | 4.84 | 115 | 0.90 | 0 | 0.00 | 2.41 | 2688 |
ResidualSugar | -127.80 | -2.00 | 5.42 | 3.90 | 15.90 | 141.15 | 33.75 | 1139.02 | 6.23 | -0.05 | 4.89 | 6 | 0.05 | 616 | 4.81 | 210.20 | 3298 |
Chlorides | -1.17 | -0.03 | 0.05 | 0.05 | 0.15 | 1.35 | 0.32 | 0.10 | 5.81 | 0.03 | 4.79 | 5 | 0.04 | 638 | 4.99 | 1.85 | 3021 |
FreeSulfurDioxide | -555.00 | 0.00 | 30.85 | 30.00 | 70.00 | 623.00 | 148.71 | 22116.02 | 4.82 | 0.01 | 4.84 | 11 | 0.09 | 647 | 5.06 | 716.99 | 3712 |
TotalSulfurDioxide | -823.00 | 27.00 | 120.71 | 123.00 | 208.00 | 1057.00 | 231.91 | 53783.74 | 1.92 | -0.01 | 4.68 | 7 | 0.06 | 682 | 5.33 | 445.55 | 1590 |
Density | 0.89 | 0.99 | 0.99 | 0.99 | 1.00 | 1.10 | 0.03 | 0.00 | 0.03 | -0.02 | 4.90 | 0 | 0.00 | 0 | 0.00 | 0.00 | 3823 |
pH | 0.48 | 2.96 | 3.21 | 3.20 | 3.47 | 6.13 | 0.68 | 0.46 | 0.21 | 0.04 | 4.65 | 0 | 0.00 | 395 | 3.09 | 0.14 | 1864 |
Sulphates | -3.13 | 0.28 | 0.53 | 0.50 | 0.86 | 4.24 | 0.93 | 0.87 | 1.77 | 0.01 | 4.75 | 22 | 0.19 | 1210 | 9.46 | 1.65 | 2606 |
Alcohol | -4.70 | 9.00 | 10.49 | 10.40 | 12.40 | 26.50 | 3.73 | 13.90 | 0.36 | -0.03 | 4.54 | 2 | 0.02 | 653 | 5.10 | 1.32 | 928 |
LabelAppeal | -2.00 | -1.00 | -0.01 | 0.00 | 1.00 | 2.00 | 0.89 | 0.79 | -98.29 | 0.01 | 2.74 | 5617 | 43.90 | 0 | 0.00 | -87.58 | 0 |
AcidIndex | 4.00 | 7.00 | 7.77 | 8.00 | 8.00 | 17.00 | 1.32 | 1.75 | 0.17 | 1.65 | 8.19 | 0 | 0.00 | 0 | 0.00 | 0.23 | 1151 |
STARS | 1.00 | 1.00 | 2.04 | 2.00 | 3.00 | 4.00 | 0.90 | 0.81 | 0.44 | 0.45 | 2.31 | 0 | 0.00 | 3359 | 26.25 | 0.40 | 0 |
The summary statistics highlights key metrics for numerical variables in the dataset, including central tendencies (mean, median), variability (standard deviation, variance), distribution shape (skewness, kurtosis), and data integrity measures (missing values, outliers). Here’s a statement tailored for this modeling context:
Given the observed distribution of the TARGET variable and the characteristics of predictor variables:
This summary emphasizes the appropriate models and preprocessing steps based on the data’s statistical characteristics.
Based on the summary statistics and modeling context, here are the key variables and visuals. These visualizations will provide critical insights into the data distribution, relationships, and model suitability.
The histogram and bar plot reveal a multimodal distribution of the
TARGET
variable, with prominent peaks at 0, 3, and 4, and
very few occurrences at higher values. The excess of zeros in the data
suggests potential zero-inflation, which makes the TARGET
variable suitable for models like Zero-Inflated Poisson and
Zero-Inflated Negative Binomial, in addition to standard Poisson and
Negative Binomial models. The non-uniformity and zero-inflation also
indicate that Multiple Regression models will need to account for the
skewed and count-based nature of the TARGET
variable to
ensure accurate predictions. These visuals underscore the importance of
considering overdispersion and zero-inflation when selecting statistical
models for analysis.
library(ggplot2)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
# Remove NAs from ResidualSugar
numerical_vars_clean <- numerical_vars %>%
dplyr::filter(!is.na(ResidualSugar))
# Create the histogram of TARGET
histogram_plot <- ggplot(numerical_vars_clean, aes(x = TARGET)) +
geom_histogram(binwidth = 1, fill = "black", color = "black", alpha = 0.6) +
labs(title = "Histogram of TARGET", x = "TARGET", y = "Frequency") +
theme_minimal()
# Create the bar plot of TARGET value counts
bar_plot <- ggplot(numerical_vars_clean, aes(x = factor(TARGET))) +
geom_bar(fill = "black", color = "black", alpha = 0.6) +
labs(title = "Bar Plot of TARGET Value Counts", x = "TARGET", y = "Count") +
theme_minimal()
# Use gridExtra to arrange the plots
gridExtra::grid.arrange(histogram_plot, bar_plot, ncol = 1)
The histogram and bar plot reveal a multimodal distribution of the
TARGET
variable, with prominent peaks around 0, 3, and 4,
and fewer occurrences at higher values. This suggests that
TARGET
is not evenly distributed, making it suitable for
count-based regression models such as Poisson, Negative Binomial, and
Zero-Inflated models. The observed non-uniformity highlights the
importance of selecting appropriate statistical models to account for
overdispersion and the excess zero counts. Multiple regression models
may also need to address the variable’s skewed distribution for accurate
prediction.
The ResidualSugar
variable shows heavy skewness and
numerous outliers, as seen in the density and box plots. These issues
suggest the need for transformations (e.g., log or Box-Cox) to stabilize
variance and normalize the distribution. For Poisson
and Negative Binomial models, overdispersion and
extreme values may require robust handling, such as Zero-Inflated
models. For multiple regression, preprocessing the
variable will likely improve model performance and interpretability.
R Code:
library(ggplot2)
library(gridExtra)
# Remove NAs from ResidualSugar
numerical_vars_clean <- numerical_vars %>%
dplyr::filter(!is.na(ResidualSugar))
# Density plot of ResidualSugar
density_plot <- ggplot(numerical_vars_clean, aes(x = ResidualSugar)) +
geom_density(fill = "black", alpha = 0.6) +
labs(title = "Density Plot of ResidualSugar", x = "ResidualSugar", y = "Density") +
theme_minimal()
# Box plot of ResidualSugar
box_plot <- ggplot(numerical_vars_clean, aes(y = ResidualSugar)) +
geom_boxplot(fill = "black", color = "black", alpha = 0.6) +
labs(title = "Box Plot of ResidualSugar", y = "ResidualSugar") +
theme_minimal()
# Use gridExtra to arrange the plots
gridExtra::grid.arrange(density_plot, box_plot, ncol = 2)
The visualizations highlight the relationships between the key
predictors (STARS
and LabelAppeal
) and
TARGET
. The box plot demonstrates that TARGET
increases with higher values of STARS
, consistent with the
theoretical expectation that better wine ratings lead to higher sales.
Similarly, the bar plot shows a clear positive trend between
LabelAppeal
and the mean TARGET
, aligning with
the notion that visually appealing labels drive consumer purchases.
These predictors are significant in the models and should be emphasized
for their strong predictive power in explaining TARGET
.
library(ggplot2)
library(gridExtra)
# Box plot of TARGET by STARS
box_plot <- ggplot(numerical_vars, aes(x = factor(STARS), y = TARGET)) +
geom_boxplot(fill = "black", color = "black", alpha = 0.6) +
labs(title = "Box Plot of TARGET by STARS", x = "STARS", y = "TARGET") +
scale_x_discrete(labels = function(x) sprintf("%.4f", as.numeric(x))) +
theme_minimal()
# Bar plot of TARGET by LabelAppeal
bar_plot <- ggplot(numerical_vars, aes(x = factor(LabelAppeal), y = TARGET)) +
geom_bar(stat = "summary", fun = "mean", fill = "black", color = "black", alpha = 0.6) +
labs(title = "Bar Plot of TARGET by LabelAppeal", x = "LabelAppeal", y = "Mean TARGET") +
scale_x_discrete(labels = function(x) sprintf("%.4f", as.numeric(x))) +
theme_minimal()
# Use gridExtra to arrange the plots
gridExtra::grid.arrange(box_plot, bar_plot, ncol = 1)
The scatter plot reveals a positive but weak linear relationship
between Alcohol and TARGET, suggesting Alcohol may have a small
predictive effect on the number of cases purchased. The correlation
heatmap highlights stronger positive correlations for variables such as
STARS
and LabelAppeal
with TARGET, aligning
with their theoretical importance in the models. These findings guide
variable selection and emphasize the need for both simple and complex
model types (e.g., Poisson, Negative Binomial, Zero-Inflated, and
Multiple Regression) to capture various relationships effectively.
# Install the ggcorrplot package if not already installed
if (!requireNamespace("ggcorrplot", quietly = TRUE)) {
install.packages("ggcorrplot")
}
# Load necessary libraries
library(ggplot2)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
library(ggcorrplot)
library(gridExtra)
# Scatter plot: TARGET vs. Alcohol
scatter_plot <- ggplot(numerical_vars, aes(x = Alcohol, y = TARGET)) +
geom_point(alpha = 0.6, color = 'blue') +
geom_smooth(method = "lm", color = 'red', se = FALSE) +
labs(title = "Scatter Plot of TARGET vs Alcohol", x = "Alcohol", y = "TARGET") +
theme_minimal()
# Correlation heatmap for numeric variables
numeric_vars <- dplyr::select(numerical_vars, where(is.numeric))
cor_matrix <- cor(numeric_vars, use = "complete.obs")
heatmap <- ggcorrplot(cor_matrix, lab = TRUE, digits = 1, title = "Correlation Heatmap")
# Display scatter plot and heatmap
scatter_plot
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 653 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 653 rows containing missing values or values outside the scale range
## (`geom_point()`).
heatmap
The bar plot illustrates the proportion of zeros across numeric
variables, with the TARGET
variable exhibiting a notable
degree of zero inflation. This suggests the appropriateness of employing
Zero-Inflated Poisson or Negative Binomial regression to account for the
excess zeros effectively. In contrast, variables such as
LabelAppeal
show a higher presence of non-zero values,
indicating their suitability for traditional regression models without
zero-inflation considerations.
# Bar plot of zero proportions
zero_counts <- numerical_vars %>%
summarise(across(where(is.numeric), ~ mean(. == 0, na.rm = TRUE))) %>%
pivot_longer(everything(), names_to = "Variable", values_to = "ZeroProportion")
ggplot(zero_counts, aes(x = reorder(Variable, -ZeroProportion), y = ZeroProportion)) +
geom_bar(stat = "identity", fill = "black", color = "black", alpha = 0.6) +
coord_flip() +
labs(title = "Proportion of Zeros in Numeric Variables", x = "Variable", y = "Proportion of Zeros") +
theme_minimal()
# Load necessary libraries
library(dplyr)
library(rpart)
# Combine training and evaluation datasets for consistent imputation
combined_data <- bind_rows(
train_data %>% mutate(dataset = "train"),
eval_data %>% mutate(dataset = "eval")
)
# Step 1: Impute missing values with median for variables with < 20% missingness
combined_data_imputed <- combined_data %>%
group_by(dataset) %>%
mutate(
ResidualSugar = ifelse(is.na(ResidualSugar), median(ResidualSugar, na.rm = TRUE), ResidualSugar),
Chlorides = ifelse(is.na(Chlorides), median(Chlorides, na.rm = TRUE), Chlorides),
FreeSulfurDioxide = ifelse(is.na(FreeSulfurDioxide), median(FreeSulfurDioxide, na.rm = TRUE), FreeSulfurDioxide),
TotalSulfurDioxide = ifelse(is.na(TotalSulfurDioxide), median(TotalSulfurDioxide, na.rm = TRUE), TotalSulfurDioxide),
pH = ifelse(is.na(pH), median(pH, na.rm = TRUE), pH),
Sulphates = ifelse(is.na(Sulphates), median(Sulphates, na.rm = TRUE), Sulphates),
Alcohol = ifelse(is.na(Alcohol), median(Alcohol, na.rm = TRUE), Alcohol)
) %>%
ungroup()
# Step 2: Predictive imputation for STARS using rpart
# Separate combined data into rows with and without missing STARS
complete_stars <- combined_data_imputed %>% filter(!is.na(STARS))
missing_stars <- combined_data_imputed %>% filter(is.na(STARS))
# Check column names and exclude STARS from predictors
predictor_columns <- setdiff(names(combined_data_imputed), c("STARS", "dataset"))
# Train an rpart model to predict STARS
rpart_model <- rpart(
STARS ~ .,
data = complete_stars,
method = "anova"
)
# Predict missing STARS values
predicted_stars <- predict(rpart_model, missing_stars)
# Impute the predicted values back into the combined dataset
combined_data_imputed <- combined_data_imputed %>%
mutate(
STARS = ifelse(is.na(STARS), predicted_stars, STARS)
)
# Step 3: Create missingness flags for all variables with missing data
combined_data_imputed <- combined_data_imputed %>%
mutate(
ResidualSugar_missing = is.na(ResidualSugar),
Chlorides_missing = is.na(Chlorides),
FreeSulfurDioxide_missing = is.na(FreeSulfurDioxide),
TotalSulfurDioxide_missing = is.na(TotalSulfurDioxide),
pH_missing = is.na(pH),
Sulphates_missing = is.na(Sulphates),
Alcohol_missing = is.na(Alcohol),
STARS_missing = is.na(STARS)
)
# Step 4: Split back into train_data and eval_data
train_data_imputed <- combined_data_imputed %>% filter(dataset == "train") %>% dplyr::select(-dataset)
eval_data_imputed <- combined_data_imputed %>% filter(dataset == "eval") %>% dplyr::select(-dataset)
# Display the structure of the imputed datasets
cat("\nImputed Training Data\n\n")
##
## Imputed Training Data
str(train_data_imputed)
## tibble [12,795 × 24] (S3: tbl_df/tbl/data.frame)
## $ INDEX : int [1:12795] 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET : int [1:12795] 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num [1:12795] 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
## $ VolatileAcidity : num [1:12795] 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
## $ CitricAcid : num [1:12795] -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
## $ ResidualSugar : num [1:12795] 54.2 26.1 14.8 18.8 9.4 ...
## $ Chlorides : num [1:12795] -0.567 -0.425 0.037 -0.425 0.046 0.556 0.06 0.04 -0.007 -0.277 ...
## $ FreeSulfurDioxide : num [1:12795] 30 15 214 22 -167 -37 287 523 -213 62 ...
## $ TotalSulfurDioxide : num [1:12795] 268 -327 142 115 108 15 156 551 123 180 ...
## $ Density : num [1:12795] 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num [1:12795] 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Sulphates : num [1:12795] -0.59 0.7 0.48 1.83 1.77 1.29 1.21 0.5 0.26 0.75 ...
## $ Alcohol : num [1:12795] 9.9 10.4 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int [1:12795] 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int [1:12795] 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : num [1:12795] 2 3 3 1 2 ...
## $ ResidualSugar_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Chlorides_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ FreeSulfurDioxide_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TotalSulfurDioxide_missing: logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pH_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sulphates_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Alcohol_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ STARS_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
cat("\nImputed Evaluation Data\n\n")
##
## Imputed Evaluation Data
str(eval_data_imputed)
## tibble [3,335 × 24] (S3: tbl_df/tbl/data.frame)
## $ INDEX : int [1:3335] 3 9 10 18 21 30 31 37 39 47 ...
## $ TARGET : int [1:3335] NA NA NA NA NA NA NA NA NA NA ...
## $ FixedAcidity : num [1:3335] 5.4 12.4 7.2 6.2 11.4 17.6 15.5 15.9 11.6 3.8 ...
## $ VolatileAcidity : num [1:3335] -0.86 0.385 1.75 0.1 0.21 0.04 0.53 1.19 0.32 0.22 ...
## $ CitricAcid : num [1:3335] 0.27 -0.76 0.17 1.8 0.28 -1.15 -0.53 1.14 0.55 0.31 ...
## $ ResidualSugar : num [1:3335] -10.7 -19.7 -33 1 1.2 1.4 4.6 31.9 -50.9 -7.7 ...
## $ Chlorides : num [1:3335] 0.092 1.169 0.065 -0.179 0.038 ...
## $ FreeSulfurDioxide : num [1:3335] 23 -37 9 104 70 -250 10 115 35 40 ...
## $ TotalSulfurDioxide : num [1:3335] 398 68 76 89 53 140 17 381 83 129 ...
## $ Density : num [1:3335] 0.985 0.99 1.046 0.989 1.029 ...
## $ pH : num [1:3335] 5.02 3.37 4.61 3.2 2.54 3.06 3.07 2.99 3.32 4.72 ...
## $ Sulphates : num [1:3335] 0.64 1.09 0.68 2.11 -0.07 -0.02 0.75 0.31 2.18 -0.64 ...
## $ Alcohol : num [1:3335] 12.3 16 8.55 12.3 4.8 11.4 8.5 11.4 -0.5 10.9 ...
## $ LabelAppeal : int [1:3335] -1 0 0 -1 0 1 0 1 0 0 ...
## $ AcidIndex : int [1:3335] 6 6 8 8 10 8 12 7 12 7 ...
## $ STARS : num [1:3335] 1.29 2 1 1 1.29 ...
## $ ResidualSugar_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Chlorides_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ FreeSulfurDioxide_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TotalSulfurDioxide_missing: logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pH_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sulphates_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Alcohol_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ STARS_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
# Combine train and evaluation datasets for consistent transformation
combined_data <- bind_rows(
train_data_imputed %>% mutate(dataset = "train"),
eval_data_imputed %>% mutate(dataset = "eval")
)
# Exclude columns from transformations
exclude_cols <- c("INDEX", "TARGET", "dataset")
# Identify numeric columns for transformation
numeric_cols <- setdiff(names(combined_data)[sapply(combined_data, is.numeric)], exclude_cols)
# Perform Box-Cox transformation
boxcox_data_combined <- combined_data
for (col in numeric_cols) {
if (col %in% c("INDEX", "TARGET") && all(is.na(combined_data[[col]][combined_data$dataset == "eval"]))) {
next # Skip transformation for INDEX and TARGET in eval_data if all values are NA
}
tryCatch({
# Extract the column as a vector
column_data <- combined_data[[col]]
# Check for non-positive values and adjust
adjustment <- 0
if (min(column_data, na.rm = TRUE) <= 0) {
adjustment <- abs(min(column_data, na.rm = TRUE)) + 0.001
column_data <- column_data + adjustment
}
# Fit a simple linear model on training data only
model <- lm(column_data[combined_data$dataset == "train"] ~ 1)
# Perform Box-Cox transformation
bc <- boxcox(model, lambda = seq(-2, 2, by = 0.1), plotit = FALSE)
optimal_lambda <- bc$x[which.max(bc$y)]
# Apply the transformation
if (!is.na(optimal_lambda)) {
if (optimal_lambda == 0) {
transformed <- log(column_data)
} else {
transformed <- ((column_data)^optimal_lambda - 1) / optimal_lambda
}
if (adjustment > 0) {
transformed <- transformed - adjustment
}
boxcox_data_combined[[col]] <- transformed
}
}, error = function(e) {
cat(paste("Error processing column", col, ":", e$message, "\n"))
})
}
# Perform Yeo-Johnson transformation
yeojohnson_data_combined <- combined_data
for (col in numeric_cols) {
if (col %in% c("INDEX", "TARGET") && all(is.na(combined_data[[col]][combined_data$dataset == "eval"]))) {
next # Skip transformation for INDEX and TARGET in eval_data if all values are NA
}
tryCatch({
# Extract the column as a vector
column_data <- combined_data[[col]]
# Compute optimal lambda using training data
lambda <- car::powerTransform(column_data[combined_data$dataset == "train"], family = "yjPower")$lambda
# Apply the Yeo-Johnson transformation
transformed <- car::yjPower(column_data, lambda)
# Save the transformed data
yeojohnson_data_combined[[col]] <- transformed
}, error = function(e) {
cat(paste("Error processing column", col, ":", e$message, "\n"))
})
}
# Split transformed dataset back into train and evaluation datasets
boxcox_train_data <- boxcox_data_combined %>%
filter(dataset == "train") %>%
dplyr::select(-c(dataset))
boxcox_eval_data <- boxcox_data_combined %>%
filter(dataset == "eval") %>%
dplyr::select(-c(dataset))
yeojohnson_train_data <- yeojohnson_data_combined %>%
filter(dataset == "train") %>%
dplyr::select(-c(dataset))
yeojohnson_eval_data <- yeojohnson_data_combined %>%
filter(dataset == "eval") %>%
dplyr::select(-c(dataset))
# Display structures of transformed datasets
cat("\nBox-Cox Transformed Training Dataset:\n\n")
##
## Box-Cox Transformed Training Dataset:
str(boxcox_train_data)
## tibble [12,795 × 24] (S3: tbl_df/tbl/data.frame)
## $ INDEX : int [1:12795] 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET : int [1:12795] 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num [1:12795] 7.32 9.09 12.66 10.73 13.91 ...
## $ VolatileAcidity : num [1:12795] 0.427 -0.706 2.155 -0.454 -0.516 ...
## $ CitricAcid : num [1:12795] -1.92 -1.735 -1.811 -0.791 -2.222 ...
## $ ResidualSugar : num [1:12795] 150 103.1 84.5 91.1 75.6 ...
## $ Chlorides : num [1:12795] -1.558 -1.422 -0.961 -1.422 -0.952 ...
## $ FreeSulfurDioxide : num [1:12795] 457 428.6 810.4 441.8 90.8 ...
## $ TotalSulfurDioxide : num [1:12795] 1172.3 14.9 920.3 866.7 852.8 ...
## $ Density : num [1:12795] -0.00719 0.02804 -0.00482 -0.0036 -0.00543 ...
## $ pH : num [1:12795] 2.51 2.56 2.27 1.3 2.27 ...
## $ Sulphates : num [1:12795] -1.5043 -0.0567 -0.3076 1.2533 1.1829 ...
## $ Alcohol : num [1:12795] 11.75 12.4 28.1 6.97 16.77 ...
## $ LabelAppeal : num [1:12795] -1.07 -2 -2 -2 -1.07 ...
## $ AcidIndex : num [1:12795] 0.765 0.753 0.765 0.736 0.774 ...
## $ STARS : num [1:12795] 0.647 0.986 0.986 0 0.647 ...
## $ ResidualSugar_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Chlorides_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ FreeSulfurDioxide_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TotalSulfurDioxide_missing: logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pH_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sulphates_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Alcohol_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ STARS_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
cat("\nBox-Cox Transformed Evaluation Dataset:\n\n")
##
## Box-Cox Transformed Evaluation Dataset:
str(boxcox_eval_data)
## tibble [3,335 × 24] (S3: tbl_df/tbl/data.frame)
## $ INDEX : int [1:3335] 3 9 10 18 21 30 31 37 39 47 ...
## $ TARGET : int [1:3335] NA NA NA NA NA NA NA NA NA NA ...
## $ FixedAcidity : num [1:3335] 10.3 20.1 12.8 11.4 18.7 ...
## $ VolatileAcidity : num [1:3335] -1.822 -0.454 1.109 -0.773 -0.65 ...
## $ CitricAcid : num [1:3335] -0.531 -1.68 -0.644 1.237 -0.52 ...
## $ ResidualSugar : num [1:3335] 43 28.56 7.44 61.94 62.26 ...
## $ Chlorides : num [1:3335] -0.905 0.236 -0.932 -1.179 -0.96 ...
## $ FreeSulfurDioxide : num [1:3335] 444 331 417 598 533 ...
## $ TotalSulfurDioxide : num [1:3335] 1435 774 789 815 744 ...
## $ Density : num [1:3335] -0.0147 -0.00951 0.04673 -0.01121 0.02912 ...
## $ pH : num [1:3335] 4.45 2.55 3.97 2.36 1.63 ...
## $ Sulphates : num [1:3335] -0.1253 0.3915 -0.0796 1.5828 -0.928 ...
## $ Alcohol : num [1:3335] 14.91 19.87 9.99 14.91 5.21 ...
## $ LabelAppeal : num [1:3335] -2 -1.07 -1.07 -2 -1.07 ...
## $ AcidIndex : num [1:3335] 0.736 0.736 0.765 0.765 0.781 ...
## $ STARS : num [1:3335] 0.246 0.647 0 0 0.246 ...
## $ ResidualSugar_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Chlorides_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ FreeSulfurDioxide_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TotalSulfurDioxide_missing: logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pH_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sulphates_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Alcohol_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ STARS_missing : logi [1:3335] FALSE FALSE FALSE FALSE FALSE FALSE ...
AcidIndex
, Sulphates
, and
STARS
.AcidIndex
and Sulphates
.Density
and
Chlorides
, showed little change as they were already near
symmetric.Both transformations improve symmetry, with Box-Cox being more effective overall, while Yeo-Johnson offers versatility for handling zero or negative values. For our modeling homework, we will use the Box-Cox transformed data
# Required Libraries
library(ggplot2)
library(dplyr)
library(tidyr)
# Compute skewness for original, Box-Cox, and Yeo-Johnson transformed datasets
compute_skewness <- function(data, numeric_cols) {
sapply(data %>% dplyr::select(all_of(numeric_cols)), function(x) {
moments::skewness(x, na.rm = TRUE)
})
}
# Calculate skewness for all datasets
original_skewness <- compute_skewness(train_data_imputed, numeric_cols)
boxcox_skewness <- compute_skewness(boxcox_train_data, numeric_cols)
yeojohnson_skewness <- compute_skewness(yeojohnson_train_data, numeric_cols)
# Combine skewness data into a single dataframe for plotting
skewness_df <- data.frame(
Variable = rep(numeric_cols, 3),
Transformation = rep(c("Original", "Box-Cox", "Yeo-Johnson"), each = length(numeric_cols)),
Skewness = c(original_skewness, boxcox_skewness, yeojohnson_skewness)
)
# Create a bar chart to compare skewness
ggplot(skewness_df, aes(x = Variable, y = Skewness, fill = Transformation)) +
geom_bar(stat = "identity", position = "dodge") +
labs(
title = "Skewness Comparison Before and After Transformations",
x = "Variable",
y = "Skewness"
) +
theme_minimal() +
theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
scale_fill_brewer(palette = "Set2")
# Scale only numeric columns excluding TARGET
boxcox_train_data <- boxcox_train_data %>%
mutate(across(where(is.numeric) & !starts_with(c("TARGET", "INDEX")), scale))
boxcox_eval_data <- boxcox_eval_data %>%
mutate(across(where(is.numeric) & !starts_with(c("TARGET", "INDEX")), scale))
str(boxcox_train_data)
## tibble [12,795 × 24] (S3: tbl_df/tbl/data.frame)
## $ INDEX : int [1:12795] 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET : int [1:12795] 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num [1:12795, 1] -0.62514 -0.42106 -0.00944 -0.23164 0.13406 ...
## ..- attr(*, "scaled:center")= num 12.7
## ..- attr(*, "scaled:scale")= num 8.68
## $ VolatileAcidity : num [1:12795, 1] 1.07138 -0.22295 3.04599 0.06495 -0.00562 ...
## ..- attr(*, "scaled:center")= num -0.511
## ..- attr(*, "scaled:scale")= num 0.875
## $ CitricAcid : num [1:12795, 1] -1.485 -1.295 -1.373 -0.325 -1.795 ...
## ..- attr(*, "scaled:center")= num -0.475
## ..- attr(*, "scaled:scale")= num 0.973
## $ ResidualSugar : num [1:12795, 1] 1.504 0.626 0.277 0.4 0.111 ...
## ..- attr(*, "scaled:center")= num 69.7
## ..- attr(*, "scaled:scale")= num 53.4
## $ Chlorides : num [1:12795, 1] -1.9641 -1.531 -0.0697 -1.531 -0.0406 ...
## ..- attr(*, "scaled:center")= num -0.939
## ..- attr(*, "scaled:scale")= num 0.315
## $ FreeSulfurDioxide : num [1:12795, 1] -0.0185 -0.1224 1.2757 -0.074 -1.3593 ...
## ..- attr(*, "scaled:center")= num 462
## ..- attr(*, "scaled:scale")= num 273
## $ TotalSulfurDioxide : num [1:12795, 1] 0.6476 -1.9508 0.0817 -0.0386 -0.0698 ...
## ..- attr(*, "scaled:center")= num 884
## ..- attr(*, "scaled:scale")= num 445
## $ Density : num [1:12795, 1] -0.05686 1.27316 0.03283 0.07883 0.00984 ...
## ..- attr(*, "scaled:center")= num -0.00569
## ..- attr(*, "scaled:scale")= num 0.0265
## $ pH : num [1:12795, 1] 0.173 0.249 -0.142 -1.436 -0.142 ...
## ..- attr(*, "scaled:center")= num 2.38
## ..- attr(*, "scaled:scale")= num 0.75
## $ Sulphates : num [1:12795, 1] -1.2547 0.1864 -0.0634 1.4905 1.4204 ...
## ..- attr(*, "scaled:center")= num -0.244
## ..- attr(*, "scaled:scale")= num 1
## $ Alcohol : num [1:12795, 1] -0.1742 -0.0361 3.2745 -1.1801 0.8861 ...
## ..- attr(*, "scaled:center")= num 12.6
## ..- attr(*, "scaled:scale")= num 4.74
## $ LabelAppeal : num [1:12795, 1] 0.0643 -1.0769 -1.0769 -1.0769 0.0643 ...
## ..- attr(*, "scaled:center")= num -1.13
## ..- attr(*, "scaled:scale")= num 0.812
## $ AcidIndex : num [1:12795, 1] 0.363 -0.545 0.363 -1.792 1.052 ...
## ..- attr(*, "scaled:center")= num 0.76
## ..- attr(*, "scaled:scale")= num 0.0132
## $ STARS : num [1:12795, 1] 0.349 1.241 1.241 -1.354 0.349 ...
## ..- attr(*, "scaled:center")= num 0.515
## ..- attr(*, "scaled:scale")= num 0.38
## $ ResidualSugar_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Chlorides_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ FreeSulfurDioxide_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ TotalSulfurDioxide_missing: logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ pH_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Sulphates_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ Alcohol_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
## $ STARS_missing : logi [1:12795] FALSE FALSE FALSE FALSE FALSE FALSE ...
# Load necessary libraries
library(MASS)
library(pscl)
library(car)
library(dplyr)
library(caret)
## Loading required package: lattice
library(lattice)
# Remove unnecessary columns (e.g., "_missing" flags)
boxcox_train_data <- boxcox_train_data %>%
dplyr::select(-ends_with("_missing"), -INDEX)
boxcox_eval_data <- boxcox_eval_data %>%
dplyr::select(-ends_with("_missing"))
# Split data into training and testing sets
set.seed(123) # For reproducibility
train_index <- createDataPartition(boxcox_train_data$TARGET, p = 0.8, list = FALSE)
train_data <- boxcox_train_data[train_index, ]
test_data <- boxcox_train_data[-train_index, ]
# Load necessary libraries
library(MASS)
library(pscl)
library(car)
library(dplyr)
library(caret)
library(ggplot2)
# Poisson Regression Model 1: All Predictors
# Fit a Poisson regression model on training data
poisson_model_all <- glm(
TARGET ~ ., data = train_data,
family = poisson()
)
# Summarize the model
summary(poisson_model_all)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson(), data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.044806 0.006033 173.194 < 2e-16 ***
## FixedAcidity -0.006167 0.005849 -1.054 0.29169
## VolatileAcidity -0.040178 0.005704 -7.044 1.87e-12 ***
## CitricAcid 0.008565 0.005673 1.510 0.13111
## ResidualSugar 0.008053 0.005644 1.427 0.15362
## Chlorides -0.015407 0.005763 -2.674 0.00750 **
## FreeSulfurDioxide 0.022239 0.005672 3.921 8.83e-05 ***
## TotalSulfurDioxide 0.024427 0.005701 4.285 1.83e-05 ***
## Density -0.010871 0.005656 -1.922 0.05461 .
## pH -0.010682 0.005707 -1.872 0.06123 .
## Sulphates -0.011761 0.005696 -2.065 0.03895 *
## Alcohol 0.017374 0.005701 3.048 0.00231 **
## LabelAppeal 0.168867 0.006181 27.318 < 2e-16 ***
## AcidIndex -0.114966 0.005809 -19.791 < 2e-16 ***
## STARS 0.235666 0.006045 38.983 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18252 on 10237 degrees of freedom
## Residual deviance: 14249 on 10223 degrees of freedom
## AIC: 39847
##
## Number of Fisher Scoring iterations: 5
# Predict on training and testing sets
train_predictions <- predict(poisson_model_all, train_data, type = "response")
test_predictions <- predict(poisson_model_all, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Print metrics
cat("AIC:", AIC(poisson_model_all), "\n")
## AIC: 39846.53
cat("BIC:", BIC(poisson_model_all), "\n")
## BIC: 39955.03
cat("Train RMSE:", round(train_rmse, 4), "\n")
## Train RMSE: 1.5723
cat("Test RMSE:", round(test_rmse, 4), "\n")
## Test RMSE: 1.5783
cat("Train R²:", round(train_r2, 4), "\n")
## Train R²: 0.3319
cat("Test R²:", round(test_r2, 4), "\n")
## Test R²: 0.3359
# Residual plot for diagnostics
plot(residuals(poisson_model_all),
main = "Residuals of Poisson Regression Model (All Predictors)",
xlab = "Observation", ylab = "Residuals",
col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
# Load necessary libraries
library(dplyr)
library(MASS)
library(caret)
library(ggplot2)
# Use full dataset for stepwise regression
# Perform stepwise Poisson regression on training data
poisson_model_stepwise <- step(
glm(TARGET ~ ., data = train_data, family = poisson()),
direction = "both",
trace = FALSE # Suppress output of intermediate steps
)
# Summarize the final stepwise-selected model
summary(poisson_model_stepwise)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar +
## Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density +
## pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS,
## family = poisson(), data = train_data)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.044778 0.006033 173.191 < 2e-16 ***
## VolatileAcidity -0.040161 0.005703 -7.042 1.90e-12 ***
## CitricAcid 0.008529 0.005674 1.503 0.13279
## ResidualSugar 0.008190 0.005642 1.452 0.14662
## Chlorides -0.015352 0.005762 -2.664 0.00771 **
## FreeSulfurDioxide 0.022202 0.005672 3.914 9.07e-05 ***
## TotalSulfurDioxide 0.024475 0.005700 4.294 1.75e-05 ***
## Density -0.010849 0.005657 -1.918 0.05511 .
## pH -0.010778 0.005705 -1.889 0.05888 .
## Sulphates -0.011912 0.005694 -2.092 0.03642 *
## Alcohol 0.017375 0.005701 3.048 0.00230 **
## LabelAppeal 0.168873 0.006182 27.318 < 2e-16 ***
## AcidIndex -0.115890 0.005741 -20.186 < 2e-16 ***
## STARS 0.235688 0.006045 38.988 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18252 on 10237 degrees of freedom
## Residual deviance: 14250 on 10224 degrees of freedom
## AIC: 39846
##
## Number of Fisher Scoring iterations: 5
# Predict on training and testing sets
train_predictions <- predict(poisson_model_stepwise, train_data, type = "response")
test_predictions <- predict(poisson_model_stepwise, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Print metrics
cat("AIC:", AIC(poisson_model_stepwise), "\n")
## AIC: 39845.64
cat("BIC:", BIC(poisson_model_stepwise), "\n")
## BIC: 39946.91
cat("Train RMSE:", round(train_rmse, 4), "\n")
## Train RMSE: 1.5724
cat("Test RMSE:", round(test_rmse, 4), "\n")
## Test RMSE: 1.5785
cat("Train R²:", round(train_r2, 4), "\n")
## Train R²: 0.3318
cat("Test R²:", round(test_r2, 4), "\n")
## Test R²: 0.3358
# Residual plot for diagnostics
plot(residuals(poisson_model_stepwise),
main = "Residuals of Stepwise Poisson Regression",
xlab = "Observation", ylab = "Residuals",
col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
# Load necessary libraries
library(dplyr)
library(MASS)
library(caret) # For data splitting and evaluation metrics
library(ggplot2)
# Fit a Negative Binomial model on training data
nb_model_all <- glm.nb(TARGET ~ ., data = train_data)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Summarize the model
summary(nb_model_all)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = train_data, init.theta = 43393.59694,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.044806 0.006033 173.189 < 2e-16 ***
## FixedAcidity -0.006167 0.005849 -1.054 0.29171
## VolatileAcidity -0.040178 0.005704 -7.044 1.87e-12 ***
## CitricAcid 0.008565 0.005674 1.510 0.13113
## ResidualSugar 0.008053 0.005644 1.427 0.15362
## Chlorides -0.015407 0.005763 -2.674 0.00750 **
## FreeSulfurDioxide 0.022240 0.005673 3.921 8.83e-05 ***
## TotalSulfurDioxide 0.024428 0.005701 4.285 1.83e-05 ***
## Density -0.010872 0.005657 -1.922 0.05461 .
## pH -0.010682 0.005707 -1.872 0.06123 .
## Sulphates -0.011761 0.005696 -2.065 0.03895 *
## Alcohol 0.017374 0.005701 3.048 0.00231 **
## LabelAppeal 0.168867 0.006182 27.317 < 2e-16 ***
## AcidIndex -0.114969 0.005809 -19.791 < 2e-16 ***
## STARS 0.235665 0.006046 38.982 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(43393.6) family taken to be 1)
##
## Null deviance: 18251 on 10237 degrees of freedom
## Residual deviance: 14248 on 10223 degrees of freedom
## AIC: 39849
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 43394
## Std. Err.: 65170
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -39816.66
# Predict on training and testing sets
train_predictions <- predict(nb_model_all, train_data, type = "response")
test_predictions <- predict(nb_model_all, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Print metrics
cat("AIC:", AIC(nb_model_all), "\n")
## AIC: 39848.66
cat("BIC:", BIC(nb_model_all), "\n")
## BIC: 39964.4
cat("Train RMSE:", train_rmse, "\n")
## Train RMSE: 1.572312
cat("Test RMSE:", test_rmse, "\n")
## Test RMSE: 1.578315
cat("Train R²:", train_r2, "\n")
## Train R²: 0.3319323
cat("Test R²:", test_r2, "\n")
## Test R²: 0.3359193
# Residual plot for diagnostics
plot(residuals(nb_model_all))
# Load necessary libraries
library(dplyr)
library(MASS)
library(caret) # For data splitting and evaluation metrics
library(ggplot2)
# Initial Poisson model to compute residuals for weights
poisson_model <- glm(TARGET ~ ., data = train_data, family = poisson)
poisson_residuals <- residuals(poisson_model, type = "response")
# Compute weights based on residuals
nb_weights <- 1 / (abs(poisson_residuals) + 1)
# Fit a weighted Negative Binomial model
nb_model_weighted <- glm.nb(
TARGET ~ .,
data = train_data,
weights = nb_weights, # Apply weights
control = glm.control(maxit = 100)
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Summarize the weighted model
summary(nb_model_weighted)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = train_data, weights = nb_weights,
## control = glm.control(maxit = 100), init.theta = 527063126000,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.088930 0.008393 129.741 < 2e-16 ***
## FixedAcidity -0.005856 0.007849 -0.746 0.45564
## VolatileAcidity -0.038304 0.007734 -4.952 7.33e-07 ***
## CitricAcid 0.007981 0.007590 1.052 0.29303
## ResidualSugar 0.006804 0.007541 0.902 0.36688
## Chlorides -0.014391 0.007717 -1.865 0.06221 .
## FreeSulfurDioxide 0.019892 0.007559 2.632 0.00850 **
## TotalSulfurDioxide 0.021324 0.007612 2.801 0.00509 **
## Density -0.010114 0.007627 -1.326 0.18479
## pH -0.009419 0.007678 -1.227 0.21996
## Sulphates -0.009539 0.007679 -1.242 0.21416
## Alcohol 0.018980 0.007620 2.491 0.01275 *
## LabelAppeal 0.180900 0.008405 21.524 < 2e-16 ***
## AcidIndex -0.110908 0.008169 -13.577 < 2e-16 ***
## STARS 0.212858 0.008428 25.255 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(527063125967) family taken to be 1)
##
## Null deviance: 6492.6 on 10237 degrees of freedom
## Residual deviance: 4386.9 on 10223 degrees of freedom
## AIC: 19016
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 527063125967
## Std. Err.: 292534145483
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18984.15
# Predict on training and testing sets
train_predictions <- predict(nb_model_weighted, train_data, type = "response")
test_predictions <- predict(nb_model_weighted, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Format metrics for display
formatted_metrics <- sprintf(
"AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
AIC(nb_model_weighted),
BIC(nb_model_weighted),
train_rmse,
test_rmse,
train_r2,
test_r2
)
# Print metrics
cat(formatted_metrics)
## AIC: 19016.15
## BIC: 19131.89
## Train RMSE: 1.5784
## Test RMSE: 1.5875
## Train R²: 0.3267
## Test R²: 0.3281
# Residual plot for diagnostics
plot(residuals(nb_model_weighted), main = "Residuals of Weighted Negative Binomial Model",
xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
### Model 2: Negative Binomial model with stepwise variable
selection.
library(MASS)
library(dplyr)
library(car)
library(carData)
# Stepwise selection with updated parameters
nb_model_stepwise <- step(
glm.nb(TARGET ~ .,
data = train_data,
link = log),
direction = "both",
trace = FALSE # Enable trace for debugging
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(nb_model_stepwise)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar +
## Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density +
## pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS,
## data = train_data, init.theta = 43390.17136, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.044777 0.006033 173.185 < 2e-16 ***
## VolatileAcidity -0.040162 0.005704 -7.041 1.90e-12 ***
## CitricAcid 0.008529 0.005674 1.503 0.13281
## ResidualSugar 0.008191 0.005643 1.452 0.14661
## Chlorides -0.015353 0.005762 -2.664 0.00771 **
## FreeSulfurDioxide 0.022203 0.005672 3.914 9.07e-05 ***
## TotalSulfurDioxide 0.024476 0.005700 4.294 1.75e-05 ***
## Density -0.010850 0.005657 -1.918 0.05511 .
## pH -0.010779 0.005706 -1.889 0.05888 .
## Sulphates -0.011913 0.005694 -2.092 0.03643 *
## Alcohol 0.017375 0.005701 3.048 0.00230 **
## LabelAppeal 0.168873 0.006182 27.317 < 2e-16 ***
## AcidIndex -0.115893 0.005741 -20.186 < 2e-16 ***
## STARS 0.235686 0.006045 38.986 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(43390.17) family taken to be 1)
##
## Null deviance: 18251 on 10237 degrees of freedom
## Residual deviance: 14249 on 10224 degrees of freedom
## AIC: 39848
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 43390
## Std. Err.: 65185
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -39817.77
# Predict on training and testing sets
train_predictions <- predict(nb_model_stepwise, train_data, type = "response")
test_predictions <- predict(nb_model_stepwise, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Format metrics for display
formatted_metrics <- sprintf(
"AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
AIC(nb_model_stepwise),
BIC(nb_model_stepwise),
train_rmse,
test_rmse,
train_r2,
test_r2
)
# Print metrics
cat(formatted_metrics)
## AIC: 39847.77
## BIC: 39956.28
## Train RMSE: 1.5724
## Test RMSE: 1.5785
## Train R²: 0.3318
## Test R²: 0.3358
# Residual plot for diagnostics
plot(residuals(nb_model_stepwise), main = "Residuals of Weighted Negative Binomial Model",
xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
library(MASS) # For Negative Binomial model
library(dplyr) # For data manipulation
# Subset the data to include selected variables
selected_variables <- c(
"FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar",
"Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide", "pH",
"Sulphates", "Alcohol", "LabelAppeal", "AcidIndex", "STARS"
)
nb_data_selected <- train_data %>% dplyr::select(TARGET, all_of(selected_variables))
# Fit the Negative Binomial model
nb_model_selected <- glm.nb(
TARGET ~ .,
data = nb_data_selected,
control = glm.control(maxit = 100) # Allow more iterations
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Display model summary
summary(nb_model_selected)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = nb_data_selected, control = glm.control(maxit = 100),
## init.theta = 47170243380, link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.044923 0.006032 173.235 < 2e-16 ***
## FixedAcidity -0.006126 0.005850 -1.047 0.29501
## VolatileAcidity -0.040224 0.005705 -7.050 1.78e-12 ***
## CitricAcid 0.008697 0.005673 1.533 0.12526
## ResidualSugar 0.008106 0.005644 1.436 0.15095
## Chlorides -0.015661 0.005760 -2.719 0.00655 **
## FreeSulfurDioxide 0.022134 0.005671 3.903 9.51e-05 ***
## TotalSulfurDioxide 0.024156 0.005698 4.239 2.24e-05 ***
## pH -0.010814 0.005706 -1.895 0.05808 .
## Sulphates -0.011639 0.005695 -2.044 0.04100 *
## Alcohol 0.017509 0.005700 3.072 0.00213 **
## LabelAppeal 0.168874 0.006181 27.323 < 2e-16 ***
## AcidIndex -0.115476 0.005803 -19.898 < 2e-16 ***
## STARS 0.235758 0.006045 38.999 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(47170243384) family taken to be 1)
##
## Null deviance: 18252 on 10237 degrees of freedom
## Residual deviance: 14252 on 10224 degrees of freedom
## AIC: 39848
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 47170243384
## Std. Err.: 91936991916
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -39817.99
# Predict on training and testing sets
train_predictions <- predict(nb_model_selected, train_data, type = "response")
test_predictions <- predict(nb_model_selected, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Format metrics for display
formatted_metrics <- sprintf(
"AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
AIC(nb_model_selected),
BIC(nb_model_selected),
train_rmse,
test_rmse,
train_r2,
test_r2
)
# Print metrics
cat(formatted_metrics)
## AIC: 39847.99
## BIC: 39956.49
## Train RMSE: 1.5725
## Test RMSE: 1.5790
## Train R²: 0.3318
## Test R²: 0.3354
# Residual plot for diagnostics
plot(residuals(nb_model_selected), main = "Residuals of Weighted Negative Binomial Model",
xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
# Zero-Inflated Poisson (ZIP) Model
library(pscl)
# Fit ZIP Model with corrected starting values
zip_model <- zeroinfl(
TARGET ~ . | 1,
data = train_data,
dist = "poisson"
)
# Display the summary
summary(zip_model)
##
## Call:
## zeroinfl(formula = TARGET ~ . | 1, data = train_data, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5645 -0.2555 0.2290 0.5110 2.3408
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2555554 0.0066738 188.132 < 2e-16 ***
## FixedAcidity -0.0008205 0.0061253 -0.134 0.89344
## VolatileAcidity -0.0161057 0.0060322 -2.670 0.00759 **
## CitricAcid 0.0010249 0.0059159 0.173 0.86247
## ResidualSugar 0.0004010 0.0059177 0.068 0.94598
## Chlorides -0.0086612 0.0060573 -1.430 0.15275
## FreeSulfurDioxide 0.0067480 0.0058980 1.144 0.25258
## TotalSulfurDioxide 0.0014842 0.0059824 0.248 0.80406
## Density -0.0076060 0.0059620 -1.276 0.20204
## pH 0.0025329 0.0059861 0.423 0.67220
## Sulphates -0.0025319 0.0059910 -0.423 0.67257
## Alcohol 0.0263208 0.0059276 4.440 8.98e-06 ***
## LabelAppeal 0.2208770 0.0066038 33.447 < 2e-16 ***
## AcidIndex -0.0428218 0.0066664 -6.424 1.33e-10 ***
## STARS 0.1045219 0.0063575 16.441 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.52762 0.02942 -51.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 22
## Log-likelihood: -1.851e+04 on 16 Df
# Predict on training and testing sets
train_predictions <- predict(zip_model, train_data, type = "response")
test_predictions <- predict(zip_model, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Format metrics for display
formatted_metrics <- sprintf(
"AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
AIC(zip_model),
BIC(zip_model),
train_rmse,
test_rmse,
train_r2,
test_r2
)
# Print metrics
cat(formatted_metrics)
## AIC: 37052.37
## BIC: 37168.11
## Train RMSE: 1.6483
## Test RMSE: 1.6585
## Train R²: 0.2658
## Test R²: 0.2667
# Residual plot for diagnostics
plot(residuals(zip_model), main = "Residuals of Weighted Negative Binomial Model",
xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
# Zero-Inflated Negative Binomial (ZINB) Model
library(pscl)
# Fit ZIP Model with corrected starting values
zinb_model <- zeroinfl(TARGET ~ . | 1,
data = train_data,
dist = "negbin",
)
summary(zinb_model)
##
## Call:
## zeroinfl(formula = TARGET ~ . | 1, data = train_data, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -1.5645 -0.2555 0.2289 0.5110 2.3414
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2555581 0.0066738 188.132 < 2e-16 ***
## FixedAcidity -0.0008398 0.0061253 -0.137 0.89095
## VolatileAcidity -0.0160788 0.0060322 -2.666 0.00769 **
## CitricAcid 0.0010199 0.0059159 0.172 0.86313
## ResidualSugar 0.0004023 0.0059177 0.068 0.94580
## Chlorides -0.0086504 0.0060573 -1.428 0.15326
## FreeSulfurDioxide 0.0067196 0.0058980 1.139 0.25458
## TotalSulfurDioxide 0.0014702 0.0059824 0.246 0.80588
## Density -0.0076168 0.0059619 -1.278 0.20140
## pH 0.0025502 0.0059861 0.426 0.67010
## Sulphates -0.0025297 0.0059910 -0.422 0.67285
## Alcohol 0.0263471 0.0059276 4.445 8.80e-06 ***
## LabelAppeal 0.2208875 0.0066038 33.449 < 2e-16 ***
## AcidIndex -0.0428380 0.0066663 -6.426 1.31e-10 ***
## STARS 0.1045255 0.0063575 16.441 < 2e-16 ***
## Log(theta) 14.1204074 8.4954034 1.662 0.09649 .
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.52766 0.02942 -51.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 1356485.0954
## Number of iterations in BFGS optimization: 37
## Log-likelihood: -1.851e+04 on 17 Df
# Predict on training and testing sets
train_predictions <- predict(zinb_model, train_data, type = "response")
test_predictions <- predict(zinb_model, test_data, type = "response")
# Calculate RMSE for training and testing sets
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# Calculate R² for training and testing sets
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
# Format metrics for display
formatted_metrics <- sprintf(
"AIC: %.2f\nBIC: %.2f\nTrain RMSE: %.4f\nTest RMSE: %.4f\nTrain R²: %.4f\nTest R²: %.4f\n",
AIC(zinb_model),
BIC(zinb_model),
train_rmse,
test_rmse,
train_r2,
test_r2
)
# Print metrics
cat(formatted_metrics)
## AIC: 37054.39
## BIC: 37177.36
## Train RMSE: 1.6483
## Test RMSE: 1.6585
## Train R²: 0.2658
## Test R²: 0.2667
# Residual plot for diagnostics
plot(residuals(zinb_model), main = "Residuals of Weighted Negative Binomial Model",
xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
# Load necessary libraries
library(MASS)
library(dplyr)
library(car)
library(caret)
library(ggplot2)
# Function to calculate RMSE and R²
evaluate_model <- function(model, train_data, test_data) {
# Predictions
train_predictions <- predict(model, newdata = train_data)
test_predictions <- predict(model, newdata = test_data)
# RMSE
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
# R²
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
return(list(train_rmse = train_rmse, test_rmse = test_rmse,
train_r2 = train_r2, test_r2 = test_r2))
}
# 1. Base Linear Model with all predictors
base_model <- lm(TARGET ~ ., data = train_data)
# 2. Forward Selection
forward_model <- step(
lm(TARGET ~ 1, data = train_data),
scope = list(lower = ~1, upper = ~.),
direction = "forward"
)
## Start: AIC=13398.02
## TARGET ~ 1
# 3. Backward Selection
backward_model <- step(
lm(TARGET ~ ., data = train_data),
direction = "backward"
)
## Start: AIC=9479.13
## TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar +
## Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density +
## pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS
##
## Df Sum of Sq RSS AIC
## - FixedAcidity 1 2.4 25768 9478.1
## <none> 25766 9479.1
## - CitricAcid 1 6.2 25772 9479.6
## - ResidualSugar 1 6.6 25773 9479.8
## - pH 1 7.9 25774 9480.3
## - Density 1 10.0 25776 9481.1
## - Sulphates 1 11.3 25777 9481.6
## - Chlorides 1 26.5 25792 9487.7
## - Alcohol 1 40.1 25806 9493.1
## - FreeSulfurDioxide 1 42.9 25809 9494.2
## - TotalSulfurDioxide 1 53.8 25820 9498.5
## - VolatileAcidity 1 154.4 25920 9538.3
## - AcidIndex 1 1207.3 26973 9945.9
## - LabelAppeal 1 2334.3 28100 10365.0
## - STARS 1 4836.4 30602 11238.3
##
## Step: AIC=9478.09
## TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Density + pH + Sulphates +
## Alcohol + LabelAppeal + AcidIndex + STARS
##
## Df Sum of Sq RSS AIC
## <none> 25768 9478.1
## - CitricAcid 1 6.2 25775 9478.5
## - ResidualSugar 1 6.8 25775 9478.8
## - pH 1 8.0 25776 9479.3
## - Density 1 9.9 25778 9480.0
## - Sulphates 1 11.6 25780 9480.7
## - Chlorides 1 26.4 25795 9486.6
## - Alcohol 1 40.2 25809 9492.1
## - FreeSulfurDioxide 1 42.7 25811 9493.0
## - TotalSulfurDioxide 1 54.1 25822 9497.6
## - VolatileAcidity 1 154.3 25923 9537.2
## - AcidIndex 1 1259.3 27028 9964.6
## - LabelAppeal 1 2333.9 28102 10363.7
## - STARS 1 4837.4 30606 11237.4
# 4. Stepwise Selection
stepwise_model <- step(
lm(TARGET ~ ., data = train_data),
direction = "both"
)
## Start: AIC=9479.13
## TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar +
## Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density +
## pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS
##
## Df Sum of Sq RSS AIC
## - FixedAcidity 1 2.4 25768 9478.1
## <none> 25766 9479.1
## - CitricAcid 1 6.2 25772 9479.6
## - ResidualSugar 1 6.6 25773 9479.8
## - pH 1 7.9 25774 9480.3
## - Density 1 10.0 25776 9481.1
## - Sulphates 1 11.3 25777 9481.6
## - Chlorides 1 26.5 25792 9487.7
## - Alcohol 1 40.1 25806 9493.1
## - FreeSulfurDioxide 1 42.9 25809 9494.2
## - TotalSulfurDioxide 1 53.8 25820 9498.5
## - VolatileAcidity 1 154.4 25920 9538.3
## - AcidIndex 1 1207.3 26973 9945.9
## - LabelAppeal 1 2334.3 28100 10365.0
## - STARS 1 4836.4 30602 11238.3
##
## Step: AIC=9478.09
## TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Density + pH + Sulphates +
## Alcohol + LabelAppeal + AcidIndex + STARS
##
## Df Sum of Sq RSS AIC
## <none> 25768 9478.1
## - CitricAcid 1 6.2 25775 9478.5
## - ResidualSugar 1 6.8 25775 9478.8
## + FixedAcidity 1 2.4 25766 9479.1
## - pH 1 8.0 25776 9479.3
## - Density 1 9.9 25778 9480.0
## - Sulphates 1 11.6 25780 9480.7
## - Chlorides 1 26.4 25795 9486.6
## - Alcohol 1 40.2 25809 9492.1
## - FreeSulfurDioxide 1 42.7 25811 9493.0
## - TotalSulfurDioxide 1 54.1 25822 9497.6
## - VolatileAcidity 1 154.3 25923 9537.2
## - AcidIndex 1 1259.3 27028 9964.6
## - LabelAppeal 1 2333.9 28102 10363.7
## - STARS 1 4837.4 30606 11237.4
# 5. Recursive Feature Elimination (RFE)
set.seed(123)
control <- rfeControl(functions = lmFuncs, method = "cv", number = 5)
rfe_model <- rfe(
x = train_data %>% dplyr::select(-TARGET),
y = train_data$TARGET,
sizes = c(1:10),
rfeControl = control
)
selected_vars <- rfe_model$optVariables
# Fit a new model with selected variables
rfe_model_final <- lm(
TARGET ~ .,
data = train_data %>% dplyr::select(all_of(c(selected_vars, "TARGET")))
)
# 6. Polynomial Model
# Save polynomial transformations during training
poly_terms <- list(
FixedAcidity = poly(train_data$FixedAcidity, 2),
VolatileAcidity = poly(train_data$VolatileAcidity, 2),
ResidualSugar = poly(train_data$ResidualSugar, 2),
pH = poly(train_data$pH, 2),
Alcohol = poly(train_data$Alcohol, 2)
)
# Fit the polynomial model using saved transformations
polynomial_model <- lm(
TARGET ~ poly_terms$FixedAcidity + poly_terms$VolatileAcidity +
poly_terms$ResidualSugar + poly_terms$pH + poly_terms$Alcohol,
data = train_data
)
# 7. Interaction Model
interaction_model <- lm(
TARGET ~ FixedAcidity * VolatileAcidity + ResidualSugar * pH + Alcohol * LabelAppeal,
data = train_data
)
# Evaluate Models
model_list <- list(
Base = base_model,
Forward = forward_model,
Backward = backward_model,
Stepwise = stepwise_model,
RFE = rfe_model_final,
Polynomial = polynomial_model,
Interaction = interaction_model
)
# Initialize storage for metrics
metrics <- data.frame(
Model = character(),
AIC = numeric(),
Train_RMSE = numeric(),
Test_RMSE = numeric(),
Train_R2 = numeric(),
Test_R2 = numeric()
)
# Collect metrics for each model
for (model_name in names(model_list)) {
model <- model_list[[model_name]]
model_metrics <- evaluate_model(model, train_data, test_data)
metrics <- rbind(metrics, data.frame(
Model = model_name,
AIC = AIC(model),
Train_RMSE = round(model_metrics$train_rmse, 4),
Test_RMSE = round(model_metrics$test_rmse, 4),
Train_R2 = round(model_metrics$train_r2, 4),
Test_R2 = round(model_metrics$test_r2, 4)
))
}
## Warning: 'newdata' had 2557 rows but variables found have 10238 rows
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
# Print metrics
print(metrics)
## Model AIC Train_RMSE Test_RMSE Train_R2 Test_R2
## 1 Base 38535.31 1.5864 1.5950 0.3199 0.3218
## 2 Forward 42454.21 1.9237 1.9368 0.0000 0.0000
## 3 Backward 38534.28 1.5865 1.5952 0.3198 0.3216
## 4 Stepwise 38534.28 1.5865 1.5952 0.3198 0.3216
## 5 RFE 38535.31 1.5864 1.5950 0.3199 0.3218
## 6 Polynomial 42312.79 1.9086 1.9507 0.0156 -3.0614
## 7 Interaction 40932.92 1.7844 1.7998 0.1396 0.1364
# Plot AIC Comparison
ggplot(metrics, aes(x = reorder(Model, AIC), y = AIC)) +
geom_bar(stat = "identity", fill = "steelblue") +
coord_flip() +
labs(title = "AIC Comparison of Models", x = "Model", y = "AIC")
# Select and Summarize the Best Model
best_model_name <- metrics$Model[which.min(metrics$AIC)]
cat("\nBest Model: ", best_model_name, "\n")
##
## Best Model: Backward
summary(model_list[[best_model_name]])
##
## Call:
## lm(formula = TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar +
## Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density +
## pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS,
## data = train_data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1613 -0.8565 0.3150 1.1062 4.9772
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.03343 0.01569 193.321 < 2e-16 ***
## VolatileAcidity -0.12264 0.01567 -7.825 5.58e-15 ***
## CitricAcid 0.02459 0.01571 1.565 0.11763
## ResidualSugar 0.02569 0.01562 1.645 0.10002
## Chlorides -0.05111 0.01579 -3.238 0.00121 **
## FreeSulfurDioxide 0.06479 0.01575 4.114 3.92e-05 ***
## TotalSulfurDioxide 0.07285 0.01573 4.632 3.66e-06 ***
## Density -0.03091 0.01561 -1.980 0.04768 *
## pH -0.02789 0.01569 -1.778 0.07551 .
## Sulphates -0.03362 0.01569 -2.143 0.03212 *
## Alcohol 0.06282 0.01572 3.995 6.52e-05 ***
## LabelAppeal 0.49598 0.01630 30.430 < 2e-16 ***
## AcidIndex -0.35467 0.01587 -22.353 < 2e-16 ***
## STARS 0.71786 0.01639 43.810 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.588 on 10224 degrees of freedom
## Multiple R-squared: 0.3198, Adjusted R-squared: 0.319
## F-statistic: 369.8 on 13 and 10224 DF, p-value: < 2.2e-16
# Residual Plot for the Best Model
best_model <- model_list[[best_model_name]]
plot(residuals(best_model),
main = paste("Residuals of", best_model_name, "Model"),
xlab = "Observation", ylab = "Residuals", col = "blue", pch = 20)
abline(h = 0, col = "red", lty = 2)
# Load necessary libraries
library(MASS)
library(dplyr)
library(Metrics) # For RMSE computation
##
## Attaching package: 'Metrics'
## The following objects are masked from 'package:caret':
##
## precision, recall
# Fit robust regression using rlm with Huber loss
huber_model <- rlm(TARGET ~ ., data = train_data, psi = psi.huber, k = 1.5, method = "MM")
# Fit robust regression using rlm with default settings
default_model <- rlm(TARGET ~ ., data = train_data, method = "MM")
# Compute predictions for both models (train and test)
huber_train_predictions <- predict(huber_model, newdata = train_data)
huber_test_predictions <- predict(huber_model, newdata = test_data)
default_train_predictions <- predict(default_model, newdata = train_data)
default_test_predictions <- predict(default_model, newdata = test_data)
# Compute residuals
huber_train_residuals <- train_data$TARGET - huber_train_predictions
huber_test_residuals <- test_data$TARGET - huber_test_predictions
default_train_residuals <- train_data$TARGET - default_train_predictions
default_test_residuals <- test_data$TARGET - default_test_predictions
# Compute RMSE for both models (train and test)
huber_train_rmse <- rmse(train_data$TARGET, huber_train_predictions)
huber_test_rmse <- rmse(test_data$TARGET, huber_test_predictions)
default_train_rmse <- rmse(train_data$TARGET, default_train_predictions)
default_test_rmse <- rmse(test_data$TARGET, default_test_predictions)
# Compute AIC and BIC for both models on training data
compute_aic_bic <- function(model, residuals, n) {
rss <- sum(residuals^2) # Residual sum of squares
k <- length(coef(model)) # Number of parameters
aic <- n * log(rss / n) + 2 * k # AIC formula
bic <- n * log(rss / n) + k * log(n) # BIC formula
return(list(AIC = aic, BIC = bic))
}
huber_metrics <- compute_aic_bic(huber_model, huber_train_residuals, nrow(train_data))
default_metrics <- compute_aic_bic(default_model, default_train_residuals, nrow(train_data))
# Output metrics
cat("\nHuber Model Metrics (Training):\n")
##
## Huber Model Metrics (Training):
cat("Train RMSE:", huber_train_rmse, "\n")
## Train RMSE: 1.629752
cat("Test RMSE:", huber_test_rmse, "\n")
## Test RMSE: 1.644409
cat("AIC:", huber_metrics$AIC, "\n")
## AIC: 10031.05
cat("BIC:", huber_metrics$BIC, "\n")
## BIC: 10139.56
cat("\nDefault Model Metrics (Training):\n")
##
## Default Model Metrics (Training):
cat("Train RMSE:", default_train_rmse, "\n")
## Train RMSE: 1.629752
cat("Test RMSE:", default_test_rmse, "\n")
## Test RMSE: 1.644408
cat("AIC:", default_metrics$AIC, "\n")
## AIC: 10031.04
cat("BIC:", default_metrics$BIC, "\n")
## BIC: 10139.55
The AIC comparison plot shows the AIC values for various regression models. A lower AIC indicates a better-fitting model. In this case:
The choice between these models depends on the balance between fit and interpretability: - The full Base model will be preferred for interpretability as it allows clear and direct analysis of coefficients. - For slightly improved fit, Stepwise or Backward selection models are viable, with minimal complexity added and similar AIC values. - Interaction or polynomial models should be considered only if theoretical justification for their use exists.
dispersion_poisson <- sum(residuals(poisson_model_all, type = "pearson")^2) / poisson_model_all$df.residual
print(paste("Dispersion for Poisson Model:", dispersion_poisson))
## [1] "Dispersion for Poisson Model: 0.944921552951479"
dispersion_nb <- sum(residuals(nb_model_all, type = "pearson")^2) / nb_model_all$df.residual
print(paste("Dispersion for Negative Binomial Model:", dispersion_nb))
## [1] "Dispersion for Negative Binomial Model: 0.94486483473297"
STARS
and LabelAppeal
).Based on the AIC in the table:
STARS
and LabelAppeal
. This is beneficial for
explaining relationships between predictors and the target
variable.Both models should be considered based on the specific goals of the analysis: - For deeper insights into the predictors’ individual contributions: Choose Multiple Regression. - For a focus on prediction accuracy: Choose Zero-Inflated Poisson Regression. We selected the following models based on a balance of predictive performance (Test RMSE) and interpretability:
These models strike an effective balance between predictive accuracy and simplicity, enhancing their explanatory utility.
# Initialize storage for metrics across all models
final_metrics <- data.frame(
Model_Type = character(),
Model = character(),
AIC = numeric(),
Train_RMSE = numeric(),
Test_RMSE = numeric(),
Train_R2 = numeric(),
Test_R2 = numeric()
)
# Define a function to calculate evaluation metrics
evaluate_and_store <- function(model, model_name, model_type, train_data, test_data) {
train_predictions <- predict(model, newdata = train_data, type = "response")
test_predictions <- predict(model, newdata = test_data, type = "response")
train_rmse <- sqrt(mean((train_data$TARGET - train_predictions)^2))
test_rmse <- sqrt(mean((test_data$TARGET - test_predictions)^2))
train_r2 <- 1 - (sum((train_data$TARGET - train_predictions)^2) /
sum((train_data$TARGET - mean(train_data$TARGET))^2))
test_r2 <- 1 - (sum((test_data$TARGET - test_predictions)^2) /
sum((test_data$TARGET - mean(test_data$TARGET))^2))
final_metrics <<- rbind(
final_metrics,
data.frame(
Model_Type = model_type,
Model = model_name,
AIC = round(AIC(model), 2),
Train_RMSE = round(train_rmse, 4),
Test_RMSE = round(test_rmse, 4),
Train_R2 = round(train_r2, 4),
Test_R2 = round(test_r2, 4)
)
)
}
# Add all your models here
model_list <- list(
list(name = "Poisson (All Predictors)", type = "Poisson Regression", model = poisson_model_all),
list(name = "Poisson (Stepwise)", type = "Poisson Regression", model = poisson_model_stepwise),
list(name = "Negative Binomial (All Predictors)", type = "Negative Binomial Regression", model = nb_model_all),
list(name = "Negative Binomial (Stepwise)", type = "Negative Binomial Regression", model = nb_model_stepwise),
list(name = "Zero-Inflated Poisson", type = "Zero-Inflated Models", model = zip_model),
list(name = "Zero-Inflated Negative Binomial", type = "Zero-Inflated Models", model = zinb_model),
list(name = "Linear Regression (Base)", type = "Multiple Linear Regression", model = base_model),
list(name = "Linear Regression (Forward)", type = "Multiple Linear Regression", model = forward_model),
list(name = "Linear Regression (Backward)", type = "Multiple Linear Regression", model = backward_model),
list(name = "Linear Regression (Stepwise)", type = "Multiple Linear Regression", model = stepwise_model),
list(name = "Linear Regression (Polynomial)", type = "Multiple Linear Regression", model = polynomial_model),
list(name = "Linear Regression (Interaction)", type = "Multiple Linear Regression", model = interaction_model)
)
# Loop through each model and calculate/store metrics
for (model_entry in model_list) {
evaluate_and_store(
model = model_entry$model,
model_name = model_entry$name,
model_type = model_entry$type,
train_data = train_data,
test_data = test_data
)
}
## Warning: 'newdata' had 2557 rows but variables found have 10238 rows
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
## Warning in test_data$TARGET - test_predictions: longer object length is not a
## multiple of shorter object length
# Sort the data frame by AIC in ascending order
final_metrics_sorted <- final_metrics %>%
arrange(Test_RMSE)
# Display the final consolidated metrics table
print(final_metrics_sorted)
## Model_Type Model AIC
## 1 Poisson Regression Poisson (All Predictors) 39846.53
## 2 Negative Binomial Regression Negative Binomial (All Predictors) 39848.66
## 3 Poisson Regression Poisson (Stepwise) 39845.64
## 4 Negative Binomial Regression Negative Binomial (Stepwise) 39847.77
## 5 Multiple Linear Regression Linear Regression (Base) 38535.31
## 6 Multiple Linear Regression Linear Regression (Backward) 38534.28
## 7 Multiple Linear Regression Linear Regression (Stepwise) 38534.28
## 8 Zero-Inflated Models Zero-Inflated Poisson 37052.37
## 9 Zero-Inflated Models Zero-Inflated Negative Binomial 37054.39
## 10 Multiple Linear Regression Linear Regression (Interaction) 40932.92
## 11 Multiple Linear Regression Linear Regression (Forward) 42454.21
## 12 Multiple Linear Regression Linear Regression (Polynomial) 42312.79
## Train_RMSE Test_RMSE Train_R2 Test_R2
## 1 1.5723 1.5783 0.3319 0.3359
## 2 1.5723 1.5783 0.3319 0.3359
## 3 1.5724 1.5785 0.3318 0.3358
## 4 1.5724 1.5785 0.3318 0.3358
## 5 1.5864 1.5950 0.3199 0.3218
## 6 1.5865 1.5952 0.3198 0.3216
## 7 1.5865 1.5952 0.3198 0.3216
## 8 1.6483 1.6585 0.2658 0.2667
## 9 1.6483 1.6585 0.2658 0.2667
## 10 1.7844 1.7998 0.1396 0.1364
## 11 1.9237 1.9368 0.0000 0.0000
## 12 1.9086 1.9507 0.0156 -3.0614
# Optionally, visualize the AIC Comparison
library(ggplot2)
ggplot(final_metrics, aes(x = reorder(Model, AIC), y = AIC, fill = Model_Type)) +
geom_bar(stat = "identity") +
coord_flip() +
labs(title = "Model AIC Comparison", x = "Model", y = "AIC") +
theme_minimal()
When the target variable (TARGET
) in the evaluation
dataset is entirely NA
and is not used for prediction, you
should remove it from the evaluation dataset before making predictions.
Additionally, the error indicates that some predictors in the evaluation
dataset might have different data types (e.g., factors instead of
numeric). You’ll need to ensure the data types in the evaluation dataset
match those in the training dataset.
Here’s how you can handle this issue and update the prediction code:
TARGET
column from the evaluation
dataset since it is not used for prediction.# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
dplyr::select(-TARGET, -INDEX) %>% # Drop TARGET and INDEX columns
dplyr::mutate(ID = boxcox_eval_data$INDEX) # Retain INDEX for prediction alignment
# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density <dbl[,1]> <matrix[26 x 1]>
## $ pH <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex <dbl[,1]> <matrix[26 x 1]>
## $ STARS <dbl[,1]> <matrix[26 x 1]>
## $ ID <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using the Poisson (Stepwise) model
poisson_stepwise_predictions <- predict(poisson_model_stepwise, newdata = eval_data_prepared, type = "response")
# Create a DataFrame with predictions
poisson_predictions_df <- data.frame(
ID = eval_data_prepared$ID, # Add back the INDEX as ID
Predictions = poisson_stepwise_predictions
)
# Save predictions to CSV
write.csv(poisson_predictions_df, "poisson_stepwise_predictions.csv", row.names = FALSE)
cat("Poisson (Stepwise) predictions saved to 'poisson_stepwise_predictions.csv'\n")
## Poisson (Stepwise) predictions saved to 'poisson_stepwise_predictions.csv'
# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
dplyr::select(-TARGET, -INDEX) %>% # Drop TARGET and INDEX columns
dplyr::mutate(ID = boxcox_eval_data$INDEX) # Retain INDEX for prediction alignment
# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density <dbl[,1]> <matrix[26 x 1]>
## $ pH <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex <dbl[,1]> <matrix[26 x 1]>
## $ STARS <dbl[,1]> <matrix[26 x 1]>
## $ ID <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using the Negative Binomial (Stepwise) model
nb_stepwise_predictions <- predict(nb_model_stepwise, newdata = eval_data_prepared, type = "response")
# Create a DataFrame with predictions
nb_predictions_df <- data.frame(
ID = eval_data_prepared$ID, # Add back the INDEX as ID
Predictions = nb_stepwise_predictions
)
# Save predictions to CSV
write.csv(nb_predictions_df, "nb_stepwise_predictions.csv", row.names = FALSE)
cat("Negative Binomial (Stepwise) predictions saved to 'nb_stepwise_predictions.csv'\n")
## Negative Binomial (Stepwise) predictions saved to 'nb_stepwise_predictions.csv'
# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
dplyr::select(-TARGET, -INDEX) %>% # Drop TARGET and INDEX columns
dplyr::mutate(ID = boxcox_eval_data$INDEX) # Retain INDEX for prediction alignment
# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density <dbl[,1]> <matrix[26 x 1]>
## $ pH <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex <dbl[,1]> <matrix[26 x 1]>
## $ STARS <dbl[,1]> <matrix[26 x 1]>
## $ ID <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using the Zero-Inflated Poisson Model
zip_predictions <- predict(zip_model, newdata = eval_data_prepared, type = "response")
# Create a DataFrame with predictions
zip_predictions_df <- data.frame(
ID = eval_data_prepared$ID, # Add back the INDEX as ID
Predictions = zip_predictions
)
# Save predictions to CSV
write.csv(zip_predictions_df, "zip_predictions.csv", row.names = FALSE)
cat("Zero-Inflated Poisson predictions saved to 'zip_predictions.csv'\n")
## Zero-Inflated Poisson predictions saved to 'zip_predictions.csv'
# Prepare the evaluation dataset and ensure consistent row order
eval_data_prepared <- boxcox_eval_data %>%
dplyr::select(-TARGET, -INDEX) %>% # Drop TARGET and INDEX columns
dplyr::mutate(ID = boxcox_eval_data$INDEX) # Retain INDEX for prediction alignment
# Glimpse the prepared dataset
glimpse(eval_data_prepared)
## Rows: 3,335
## Columns: 15
## $ FixedAcidity <dbl[,1]> <matrix[26 x 1]>
## $ VolatileAcidity <dbl[,1]> <matrix[26 x 1]>
## $ CitricAcid <dbl[,1]> <matrix[26 x 1]>
## $ ResidualSugar <dbl[,1]> <matrix[26 x 1]>
## $ Chlorides <dbl[,1]> <matrix[26 x 1]>
## $ FreeSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ TotalSulfurDioxide <dbl[,1]> <matrix[26 x 1]>
## $ Density <dbl[,1]> <matrix[26 x 1]>
## $ pH <dbl[,1]> <matrix[26 x 1]>
## $ Sulphates <dbl[,1]> <matrix[26 x 1]>
## $ Alcohol <dbl[,1]> <matrix[26 x 1]>
## $ LabelAppeal <dbl[,1]> <matrix[26 x 1]>
## $ AcidIndex <dbl[,1]> <matrix[26 x 1]>
## $ STARS <dbl[,1]> <matrix[26 x 1]>
## $ ID <int> 3, 9, 10, 18, 21, 30, 31, 37, 39, 47, 60, 62, 6…
# Predict using Linear Regression (Backward) Model
lm_backward_predictions <- predict(backward_model, newdata = eval_data_prepared)
# Create a DataFrame with predictions
lm_predictions_df <- data.frame(
ID = eval_data_prepared$ID, # Add back the INDEX as ID
Predictions = lm_backward_predictions
)
# Save predictions to CSV
write.csv(lm_predictions_df, "lm_backward_predictions.csv", row.names = FALSE)
cat("Linear Regression (Backward) predictions saved to 'lm_backward_predictions.csv'\n")
## Linear Regression (Backward) predictions saved to 'lm_backward_predictions.csv'
The pairwise scatter plot illustrates the predictions from the four selected models: Poisson (Stepwise), Negative Binomial (Stepwise), Zero-Inflated Poisson, and Linear Regression (Backward). Here are the key takeaways:
In summary, this comparison underlines the consistency of the count-based models for predictive tasks, while Linear Regression, though interpretable, shows less agreement with models designed specifically for discrete data.
# Combine predictions into a single data frame
predictions_df <- data.frame(
Poisson = poisson_predictions_df$Predictions,
NegativeBinomial = nb_predictions_df$Predictions,
ZeroInflatedPoisson = zip_predictions_df$Predictions,
LinearRegression = lm_predictions_df$Predictions
)
# Display summary of predictions
summary(predictions_df)
## Poisson NegativeBinomial ZeroInflatedPoisson LinearRegression
## Min. :1.071 Min. :1.071 Min. :1.253 Min. :0.09155
## 1st Qu.:2.179 1st Qu.:2.179 1st Qu.:2.415 1st Qu.:2.23129
## Median :2.790 Median :2.790 Median :2.909 Median :2.97524
## Mean :3.030 Mean :3.030 Mean :2.993 Mean :3.03343
## 3rd Qu.:3.674 3rd Qu.:3.674 3rd Qu.:3.444 3rd Qu.:3.80676
## Max. :9.496 Max. :9.496 Max. :6.560 Max. :6.68342
# Generate pairwise scatter plots
pairs(
predictions_df,
main = "Pairwise Scatter Plot of Model Predictions",
pch = 21,
bg = c("blue", "green", "red", "purple")
)