# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 534753 28.6 1193162 63.8 NA 669405 35.8
## Vcells 992663 7.6 8388608 64.0 16384 1851691 14.2
cat("\f") # Clear the console
# Set the seed for reproducibility
set.seed(123)
# Prepare needed libraries
packages <- c("psych",
"tidyverse",
"conflicted",
"MASS",
"caret",
"skimr",
"car",
"visdat",
"stargazer",
"ggcorrplot",
"glmmTMB",
"performance",
"corrplot"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: lattice
## Warning: package 'car' was built under R version 4.3.3
## Loading required package: carData
##
## Please cite as:
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Warning: package 'glmmTMB' was built under R version 4.3.3
## Warning in checkMatrixPackageVersion(getOption("TMB.check.Matrix", TRUE)): Package version inconsistency detected.
## TMB was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package
## Warning: package 'performance' was built under R version 4.3.3
## corrplot 0.92 loaded
rm(packages)
conflict_prefer(name = "describe", winner = "psych")
## [conflicted] Will prefer psych::describe over any other package.
conflict_prefer(name = "select" , winner = "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
# Set working directory and path to data
setwd("~/Desktop")
# Load your data
df <- read.csv("wine-data-1.csv")
stargazer(df, type = "text", summary = TRUE)
##
## ================================================================
## Statistic N Mean St. Dev. Min Max
## ----------------------------------------------------------------
## INDEX 12,795 8,069.980 4,656.905 1 16,129
## TARGET 12,795 3.029 1.926 0 8
## FixedAcidity 12,795 7.076 6.318 -18.100 34.400
## VolatileAcidity 12,795 0.324 0.784 -2.790 3.680
## CitricAcid 12,795 0.308 0.862 -3.240 3.860
## ResidualSugar 12,179 5.419 33.749 -127.800 141.150
## Chlorides 12,157 0.055 0.318 -1.171 1.351
## FreeSulfurDioxide 12,148 30.846 148.715 -555.000 623.000
## TotalSulfurDioxide 12,113 120.714 231.913 -823.000 1,057.000
## Density 12,795 0.994 0.027 0.888 1.099
## pH 12,400 3.208 0.680 0.480 6.130
## Sulphates 11,585 0.527 0.932 -3.130 4.240
## Alcohol 12,142 10.489 3.728 -4.700 26.500
## LabelAppeal 12,795 -0.009 0.891 -2 2
## AcidIndex 12,795 7.773 1.324 4 17
## STARS 9,436 2.042 0.903 1 4
## ----------------------------------------------------------------
# Drop the INDEX column
df <- df[, !(colnames(df) %in% c("INDEX"))]
vis_miss(df)
# List of columns to drop
columns_to_drop <- c("FixedAcidity", "VolatileAcidity",
"ResidualSugar", "Chlorides", "FreeSulfurDioxide", "Sulphates")
# Drop the columns from the dataset
df <- df[, !(names(df) %in% columns_to_drop)]
# List of variables to take the absolute values
columns_to_abs <- c("CitricAcid", "TotalSulfurDioxide", "Alcohol")
# Apply absolute value transformation
df[columns_to_abs] <- lapply(df[columns_to_abs], abs)
# Impute missing values in the pH column with the median
df$pH[is.na(df$pH)] <- median(df$pH, na.rm = TRUE)
# Impute missing values for the Alcohol variable
df$Alcohol[is.na(df$Alcohol)] <- median(df$Alcohol, na.rm = TRUE)
# Impute missing values for other variables (using median)
columns_to_impute <- c("CitricAcid", "TotalSulfurDioxide", "pH")
for (col in columns_to_impute) {
df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm = TRUE)
}
#Create a missing flag for STARS
df$STARS_Flag <- ifelse(is.na(df$STARS), 1, 0)
#Impute missing values in STARS with 0
df$STARS[is.na(df$STARS)] <- 0
df$log_AcidIndex <- log(df$AcidIndex+1)
df$log_TotalSulfurDioxide <- log(df$TotalSulfurDioxide+1)
str(df)
## 'data.frame': 12795 obs. of 12 variables:
## $ TARGET : int 3 3 5 3 4 0 0 4 3 6 ...
## $ CitricAcid : num 0.98 0.81 0.88 0.04 1.26 0.59 0.4 0.34 1.05 0.39 ...
## $ TotalSulfurDioxide : num 268 327 142 115 108 15 156 551 154 180 ...
## $ Density : num 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Alcohol : num 9.9 10.4 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : num 2 3 3 1 2 0 0 3 0 4 ...
## $ STARS_Flag : num 0 0 0 0 0 1 1 0 1 0 ...
## $ log_AcidIndex : num 2.2 2.08 2.2 1.95 2.3 ...
## $ log_TotalSulfurDioxide: num 5.59 5.79 4.96 4.75 4.69 ...
No more NA values.
# Set the seed for reproducibility
set.seed(123)
# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df),
size = round(0.8 * nrow(df)
)
)
# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval <- df[-train_index, ]
str(df_train)
## 'data.frame': 10236 obs. of 12 variables:
## $ TARGET : int 0 3 3 2 6 3 3 4 4 0 ...
## $ CitricAcid : num 0.78 0.25 0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
## $ TotalSulfurDioxide : num 326 138 154 369 154 161 122 97 115 83 ...
## $ Density : num 0.946 1 0.998 0.964 0.956 ...
## $ pH : num 3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
## $ Alcohol : num 9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 10.4 ...
## $ LabelAppeal : int 0 0 -2 -1 1 -1 -1 0 0 -1 ...
## $ AcidIndex : int 9 7 9 7 6 9 6 8 6 7 ...
## $ STARS : num 2 2 2 0 2 2 2 2 3 0 ...
## $ STARS_Flag : num 0 0 0 1 0 0 0 0 0 1 ...
## $ log_AcidIndex : num 2.3 2.08 2.3 2.08 1.95 ...
## $ log_TotalSulfurDioxide: num 5.79 4.93 5.04 5.91 5.04 ...
nrow(df_train)
## [1] 10236
str(df_eval)
## 'data.frame': 2559 obs. of 12 variables:
## $ TARGET : int 3 4 5 2 6 4 5 4 0 4 ...
## $ CitricAcid : num 0.04 0.34 0.27 0.24 0.27 1.1 0.09 2.75 2.32 0.26 ...
## $ TotalSulfurDioxide : num 115 551 229 60 154 129 154 126 234 220 ...
## $ Density : num 0.996 1.032 0.999 1.009 0.994 ...
## $ pH : num 2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 3.2 ...
## $ Alcohol : num 6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
## $ LabelAppeal : int -1 1 1 -1 1 0 0 0 -1 0 ...
## $ AcidIndex : int 6 7 11 7 7 9 8 8 7 7 ...
## $ STARS : num 1 3 2 1 4 2 2 1 1 2 ...
## $ STARS_Flag : num 0 0 0 0 0 0 0 0 0 0 ...
## $ log_AcidIndex : num 1.95 2.08 2.48 2.08 2.08 ...
## $ log_TotalSulfurDioxide: num 4.75 6.31 5.44 4.11 5.04 ...
nrow(df_eval)
## [1] 2559
# Calculate the correlation matrix for numeric variables
cor_matrix <- cor(df_train, use = "complete.obs")
# Calculate p-values for the significance of correlations
p.mat <- cor.mtest(df_train, conf.level = 0.95)$p
# Create a smaller, less crowded correlation plot with significance
corrplot(cor_matrix,
type = "lower",
method = "circle",
tl.cex = 0.5,
diag = FALSE,
number.cex = 0.6,
mar = c(0, 0, 1, 0),
tl.srt = 45,
tl.col = "black",
p.mat = p.mat,
sig.level = 0.05,
insig = "pch",
pch = 4,
cl.pos = "r")
CitricAcid: Based on the checkmarks in the matrix, CitricAcid does not appear to be strongly correlated with the other variables shown.
Density: Density shows negative correlations with pH and Alcohol, based on the pattern of checkmarks.
pH: The pH variable exhibits slight negative correlations with TotalSulfurDioxide, Density, and Alcohol.
The log-transformed variables (log_AcidIndex and log_TotalSulfurDioxide) appear to have positive correlations with the original variables as expected.
The STARS and STARS_Flag variables seem to be positively correlated with each other, as one would expect.
STARS seem to be fairly positively correlated with the Target.
I have transformed certain values after checking the distribution to reduce the skewness.
df_train_long <- df_train %>%
gather(key = "variable", value = "value")
# Plot the distributions of all columns
ggplot(df_train_long, aes(x = value)) +
geom_histogram(bins = 30, fill = "lightblue", color = "black", alpha = 0.7) +
facet_wrap(~ variable, scales = "free_x") +
theme_minimal() +
theme(strip.text = element_text(size = 8)) +
labs(title = "Distributions of All Columns", x = "Value", y = "Frequency")
# Calculate mean and variance of the target variable
mean_target <- mean(df_train$TARGET, na.rm = TRUE)
variance_target <- var(df_train$TARGET, na.rm = TRUE)
cat("Mean of TARGET:", mean_target, "\n")
## Mean of TARGET: 3.02628
cat("Variance of TARGET:", variance_target, "\n")
## Variance of TARGET: 3.714111
# Check for overdispersion
if (variance_target > mean_target) {
cat("There is overdispersion in the data.\n")
} else {
cat("No overdispersion detected.\n")
}
## There is overdispersion in the data.
# Quasi-Poisson Model 1 (Original variables)
model_quasi1 <- glm(TARGET ~ .,
family = quasipoisson(link = "log"),
data = df_train)
# Summary of the model
summary(model_quasi1)
##
## Call:
## glm(formula = TARGET ~ ., family = quasipoisson(link = "log"),
## data = df_train)
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.543e+00 5.132e-01 -3.006 0.00265 **
## CitricAcid 9.511e-03 8.740e-03 1.088 0.27653
## TotalSulfurDioxide -1.985e-04 6.517e-05 -3.046 0.00232 **
## Density -3.058e-01 2.011e-01 -1.521 0.12833
## pH -1.217e-02 8.092e-03 -1.503 0.13274
## Alcohol 2.519e-03 1.522e-03 1.655 0.09793 .
## LabelAppeal 1.592e-01 6.395e-03 24.902 < 2e-16 ***
## AcidIndex -3.363e-01 4.009e-02 -8.388 < 2e-16 ***
## STARS 1.905e-01 6.381e-03 29.863 < 2e-16 ***
## STARS_Flag -6.412e-01 2.248e-02 -28.525 < 2e-16 ***
## log_AcidIndex 2.337e+00 3.609e-01 6.476 9.85e-11 ***
## log_TotalSulfurDioxide 6.169e-02 1.252e-02 4.929 8.40e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 0.8816674)
##
## Null deviance: 18306 on 10235 degrees of freedom
## Residual deviance: 10955 on 10224 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 6
Key Findings:
LabelAppeal (p < 2e-16) and
STARS (p < 2e-16) are highly significant predictors,
with LabelAppeal having a positive impact (with a large
positive coefficient), suggesting that higher LabelAppeal
leads to a higher expected count of the target variable.
AcidIndex (p < 2e-16) and log_AcidIndex (p < 2e-16) show negative relationships, meaning as these values increase, the expected count of the target decreases.
TotalSulfurDioxide (p = 0.00232) and log_TotalSulfurDioxide (p = 8.40e-07) are also statistically significant, with positive relationships, indicating an increase in these values is associated with an increase in the expected count of the target.
It is has lower than 1 value for dispersion which might mean that it is facing underdispersion. So I’ll switch to Poisson model.
# Poisson Model 1
model_p1 <- glm(TARGET ~ . -AcidIndex -TotalSulfurDioxide,
family = poisson(link = "log"),
data = df_train)
# Summary of the model
summary(model_p1)
##
## Call:
## glm(formula = TARGET ~ . - AcidIndex - TotalSulfurDioxide, family = poisson(link = "log"),
## data = df_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.493624 0.237149 10.515 < 2e-16 ***
## CitricAcid 0.009679 0.009310 1.040 0.298
## Density -0.296588 0.214043 -1.386 0.166
## pH -0.013944 0.008619 -1.618 0.106
## Alcohol 0.002167 0.001620 1.338 0.181
## LabelAppeal 0.158276 0.006810 23.242 < 2e-16 ***
## STARS 0.192519 0.006788 28.364 < 2e-16 ***
## STARS_Flag -0.649214 0.023927 -27.133 < 2e-16 ***
## log_AcidIndex -0.690624 0.044828 -15.406 < 2e-16 ***
## log_TotalSulfurDioxide 0.032655 0.006885 4.743 2.11e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18306 on 10235 degrees of freedom
## Residual deviance: 11032 on 10226 degrees of freedom
## AIC: 36592
##
## Number of Fisher Scoring iterations: 6
vif(model_p1)
## CitricAcid Density pH
## 1.002323 1.002079 1.006182
## Alcohol LabelAppeal STARS
## 1.010625 1.130031 1.797289
## STARS_Flag log_AcidIndex log_TotalSulfurDioxide
## 1.626384 1.035999 1.007606
Key Findings:
LabelAppeal (0.158): For each one-unit increase in LabelAppeal, the expected count of the target variable increases by 1.17 times. This suggests that higher label appeal is strongly associated with an increase in the outcome. This result is highly significant (p < 2e-16), indicating its robust impact.
STARS (0.193): Each one-unit increase in STARS leads to an increase in the expected count of the target by 1.21 times. This implies that more stars are positively related to the outcome, with a 21% increase in the expected count for every additional star. This is also highly significant (p < 2e-16).
STARS_Flag (-0.649): The presence of a STARS_Flag is associated with a 48% decrease in the expected count of the target variable, with a statistical significance (p < 2e-16). This suggests that the STARS_Flag may act as a significant negative factor.
LabelAppeal and STARS are the strongest positive predictors, with STARS_Flag being a significant negative predictor.
# Poisson Model 2
model_p2 <- glm(TARGET ~ LabelAppeal + log_AcidIndex + STARS + log_TotalSulfurDioxide + STARS_Flag,
family = poisson(link = "log"),
data = df_train)
# Summary of the model
summary(model_p2)
##
## Call:
## glm(formula = TARGET ~ LabelAppeal + log_AcidIndex + STARS +
## log_TotalSulfurDioxide + STARS_Flag, family = poisson(link = "log"),
## data = df_train)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.180830 0.105506 20.670 < 2e-16 ***
## LabelAppeal 0.158238 0.006807 23.247 < 2e-16 ***
## log_AcidIndex -0.689366 0.044569 -15.467 < 2e-16 ***
## STARS 0.193249 0.006770 28.546 < 2e-16 ***
## log_TotalSulfurDioxide 0.032406 0.006877 4.712 2.45e-06 ***
## STARS_Flag -0.648910 0.023916 -27.133 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18306 on 10235 degrees of freedom
## Residual deviance: 11039 on 10230 degrees of freedom
## AIC: 36591
##
## Number of Fisher Scoring iterations: 6
vif(model_p2)
## LabelAppeal log_AcidIndex STARS
## 1.128980 1.023758 1.787895
## log_TotalSulfurDioxide STARS_Flag
## 1.005952 1.624839
Magnitude:
Significance: All variables are highly significant, with p-values well below the typical threshold of 0.05.
Direction:
LabelAppeal, STARS, and log_TotalSulfurDioxide have a positive direction, meaning that increases in these variables are associated with higher odds of the target outcome.
log_AcidIndex and STARS_Flag have a negative direction, meaning that increases in these variables are associated with a decrease in the odds of the target outcome. Specifically, log_AcidIndex shows a decrease in the outcome with higher acidity, and STARS_Flag decreases the odds of the outcome when flagged.
model_nb1 <- glmmTMB(TARGET ~ CitricAcid + Density + pH + Alcohol + LabelAppeal + log_AcidIndex
+ log_TotalSulfurDioxide + STARS_Flag,
data = df_train, family = nbinom2)
summary(model_nb1)
## Family: nbinom2 ( log )
## Formula: TARGET ~ CitricAcid + Density + pH + Alcohol + LabelAppeal +
## log_AcidIndex + log_TotalSulfurDioxide + STARS_Flag
## Data: df_train
##
## AIC BIC logLik deviance df.resid
## 37387.4 37459.7 -18683.7 37367.4 10226
##
##
## Dispersion parameter for nbinom2 family (): 1.91e+07
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 3.088174 0.235726 13.10 < 2e-16 ***
## CitricAcid 0.009795 0.009318 1.05 0.293161
## Density -0.314104 0.213720 -1.47 0.141644
## pH -0.014840 0.008621 -1.72 0.085199 .
## Alcohol 0.005382 0.001614 3.33 0.000857 ***
## LabelAppeal 0.219472 0.006433 34.11 < 2e-16 ***
## log_AcidIndex -0.785658 0.044473 -17.67 < 2e-16 ***
## log_TotalSulfurDioxide 0.033023 0.006852 4.82 1.44e-06 ***
## STARS_Flag -1.050012 0.018949 -55.41 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Key Findings:
LabelAppeal and log_TotalSulfurDioxide are positively related to the outcome, with LabelAppeal having the largest effect.
STARS_Flag and log_AcidIndex have significant negative effects, reducing the expected count of the target.
Alcohol also shows a moderate positive relationship, though less impactful than the others.
model_nb2 <- glmmTMB(TARGET ~ LabelAppeal + log_AcidIndex
+ log_TotalSulfurDioxide + STARS_Flag,
data = df_train, family = nbinom2)
summary(model_nb2)
## Family: nbinom2 ( log )
## Formula:
## TARGET ~ LabelAppeal + log_AcidIndex + log_TotalSulfurDioxide + STARS_Flag
## Data: df_train
##
## AIC BIC logLik deviance df.resid
## 37396.9 37440.3 -18692.4 37384.9 10230
##
##
## Dispersion parameter for nbinom2 family (): 2.1e+07
##
## Conditional model:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.804523 0.102531 27.35 < 2e-16 ***
## LabelAppeal 0.219699 0.006431 34.16 < 2e-16 ***
## log_AcidIndex -0.789490 0.044212 -17.86 < 2e-16 ***
## log_TotalSulfurDioxide 0.032229 0.006841 4.71 2.46e-06 ***
## STARS_Flag -1.051858 0.018942 -55.53 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Key Findings:
LabelAppeal remains a significant positive predictor of the outcome, with the largest effect.
log_AcidIndex and STARS_Flag are significant negative predictors.
log_TotalSulfurDioxide also shows a positive relationship, albeit less impactful than LabelAppeal.
# Multiple Linear Regression Model 1 (Original variables)
mlr_model1 <- lm(TARGET ~ .,
data = df_train)
# Summary of the model
summary(mlr_model1)
##
## Call:
## lm(formula = TARGET ~ ., data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6673 -0.8485 0.0341 0.8492 5.6573
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7092568 1.0585680 0.670 0.502863
## CitricAcid 0.0373772 0.0212971 1.755 0.079283 .
## TotalSulfurDioxide -0.0005147 0.0001515 -3.396 0.000686 ***
## Density -0.8687174 0.4856191 -1.789 0.073663 .
## pH -0.0328724 0.0195134 -1.685 0.092095 .
## Alcohol 0.0089284 0.0036836 2.424 0.015375 *
## LabelAppeal 0.4677835 0.0151916 30.792 < 2e-16 ***
## AcidIndex -0.4315300 0.0731687 -5.898 3.80e-09 ***
## STARS 0.7876107 0.0174415 45.157 < 2e-16 ***
## STARS_Flag -0.6725265 0.0456000 -14.748 < 2e-16 ***
## log_AcidIndex 2.2247775 0.6925852 3.212 0.001321 **
## log_TotalSulfurDioxide 0.1623784 0.0277456 5.852 4.99e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.308 on 10224 degrees of freedom
## Multiple R-squared: 0.54, Adjusted R-squared: 0.5395
## F-statistic: 1091 on 11 and 10224 DF, p-value: < 2.2e-16
vif(mlr_model1)
## CitricAcid TotalSulfurDioxide Density
## 1.002505 3.479969 1.002117
## pH Alcohol LabelAppeal
## 1.006447 1.008055 1.107214
## AcidIndex STARS STARS_Flag
## 57.433712 2.573974 2.410021
## log_AcidIndex log_TotalSulfurDioxide
## 57.277803 3.520372
The model shows that LabelAppeal, log_AcidIndex, log_TotalSulfurDioxide, and STARS are strong and significant predictors of the target. There are both positive and negative relationships with the outcome, with LabelAppeal and STARS having the most significant positive impacts, while AcidIndex and STARS_Flag exert negative influences.
# Multiple Linear Regression Model 2 (Log-transformed variables)
mlr_model2 <- lm(TARGET ~ . -AcidIndex -TotalSulfurDioxide,
data = df_train)
# Summary of the model
summary(mlr_model2)
##
## Call:
## lm(formula = TARGET ~ . - AcidIndex - TotalSulfurDioxide, data = df_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6330 -0.8448 0.0263 0.8496 5.7450
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.431748 0.534462 12.034 < 2e-16 ***
## CitricAcid 0.036240 0.021343 1.698 0.0895 .
## Density -0.860836 0.486638 -1.769 0.0769 .
## pH -0.037003 0.019545 -1.893 0.0584 .
## Alcohol 0.008444 0.003690 2.288 0.0221 *
## LabelAppeal 0.464291 0.015216 30.514 < 2e-16 ***
## STARS 0.790748 0.017473 45.257 < 2e-16 ***
## STARS_Flag -0.681068 0.045682 -14.909 < 2e-16 ***
## log_AcidIndex -1.844852 0.094099 -19.605 < 2e-16 ***
## log_TotalSulfurDioxide 0.087169 0.014927 5.840 5.39e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.311 on 10226 degrees of freedom
## Multiple R-squared: 0.5379, Adjusted R-squared: 0.5375
## F-statistic: 1323 on 9 and 10226 DF, p-value: < 2.2e-16
vif(mlr_model2)
## CitricAcid Density pH
## 1.002434 1.001955 1.005269
## Alcohol LabelAppeal STARS
## 1.007084 1.105924 2.571920
## STARS_Flag log_AcidIndex log_TotalSulfurDioxide
## 2.408152 1.052729 1.014531
Key Findings:
LabelAppeal (0.464): LabelAppeal remains a highly significant predictor, with a strong positive relationship to the target (p < 2e-16). An increase in LabelAppeal leads to a substantial increase in the target.
STARS (0.791): STARS also shows a very strong positive impact (p < 2e-16). Higher STARS values are strongly associated with higher values of the target.
log_AcidIndex (-1.845): The relationship between log_AcidIndex and the target is negative and highly significant (p < 2e-16). A higher log_AcidIndex reduces the target.
log_TotalSulfurDioxide (0.087): log_TotalSulfurDioxide has a positive effect (p < 5.39e-09), meaning higher sulfur dioxide levels are linked to an increase in the target.
Alcohol (0.008): Alcohol also shows a significant positive effect (p = 0.0221), indicating that increased alcohol content is associated with an increase in the target.
CitricAcid (0.036), Density
(-0.861), and pH (-0.037): These variables
have weaker effects with marginal significance.
CitricAcid and Density show a tendency
to influence the target (p = 0.0895 and p = 0.0769, respectively), while
pH shows a trend toward significance (p =
0.0584).
Poisson Models:
Full Model:
Residual Deviance: 11,032
AIC: 36,592
Significant Predictors: LabelAppeal, STARS, STARS_Flag, log_AcidIndex, log_TotalSulfurDioxide.
Reduced Model:
Residual Deviance: 11,039 (slightly worse)
AIC: 36,591 (slightly better)
Significant Predictors: Same as the full model.
The reduced model has fewer predictors with almost the same fit, making it more parsimonious.
Negative Binomial Models:
Full Model:
AIC: 37,387
Significant Predictors: Similar to Poisson but includes Alcohol.
Reduced Model:
AIC: 37,396 (slightly worse)
Significant Predictors: Similar to Poisson.
Both models have higher AIC than Poisson. The inclusion of Alcohol in the full model adds complexity without substantial improvement.
Reduced Linear Model:
Adjusted R-squared: 0.5375
Residual Standard Error: 1.311
Significant Predictors: LabelAppeal, STARS, STARS_Flag, log_AcidIndex, log_TotalSulfurDioxide.
The linear model explains ~53.8% of the variance. However, it may not capture count-like behavior in TARGET.
The reduced Poisson model is the best:
Lowest AIC (36,591) - could be better.
Parsimonious with significant predictors only.
round()
function on the
Poisson/negative binomial regression forecasts, you will obtain
integer-like predictions that are more suitable for count data. These
rounded predictions should be more aligned with your expectations of
integer values.# Generate predictions for the evaluation dataset using the reduced Poisson model
forecasts_reduced_poisson <-
predict.glm(object = model_p2,
newdata = df_eval,
type = "response")
# Summarize the forecasts
summary(forecasts_reduced_poisson)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.6677 1.5594 3.0457 3.0293 3.9689 8.6394
# Round the predictions to the nearest whole number
rounded_forecasts_reduced_poisson <- round(forecasts_reduced_poisson)
# View the rounded predictions
head(rounded_forecasts_reduced_poisson)
## 4 8 19 22 28 29
## 3 5 3 2 6 3
The predictions, on average, seem to be around 3 with a slight skew towards higher values, given the range (Min: 0.67, Max: 8.64).
The most common rounded prediction is 8, which appears 5 times, and the distribution is fairly spread out.
table(df_eval$TARGET)
##
## 0 1 2 3 4 5 6 7 8
## 545 44 215 532 618 423 154 25 3
table(rounded_forecasts_reduced_poisson )
## rounded_forecasts_reduced_poisson
## 1 2 3 4 5 6 7 8 9
## 610 236 769 581 235 98 25 4 1
table(df_eval$TARGET, rounded_forecasts_reduced_poisson)
## rounded_forecasts_reduced_poisson
## 1 2 3 4 5 6 7 8 9
## 0 361 81 91 11 1 0 0 0 0
## 1 30 10 4 0 0 0 0 0 0
## 2 60 60 84 10 1 0 0 0 0
## 3 92 52 267 110 11 0 0 0 0
## 4 46 15 252 229 68 7 1 0 0
## 5 17 10 63 176 106 41 9 0 1
## 6 3 8 8 44 44 36 11 0 0
## 7 0 0 0 1 4 12 4 4 0
## 8 1 0 0 0 0 2 0 0 0
# Calculate the accuracy
correct_predictions <- sum(rounded_forecasts_reduced_poisson == df_eval$TARGET)
total_predictions <- length(df_eval$TARGET)
accuracy <- correct_predictions / total_predictions
print(paste("Accuracy: ", round(accuracy, 4)))
## [1] "Accuracy: 0.286"
An accuracy of 0.286 (or 28.6%) indicates that the model correctly predicted the target values about 28.6% of the time. This suggests that the model might not be performing as well as expected.