1 Setup

1.0.1 Setup - empty variables and functions in the environment tab/window

# Clear the workspace

rm(list = ls()) # Clear environment
gc()            # Clear unused memory

##          used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 534753 28.6    1193162 63.8         NA   669405 35.8
## Vcells 992663  7.6    8388608 64.0      16384  1851691 14.2

cat("\f")       # Clear the console

1.0.2 Setup - load packages

# Set the seed for reproducibility
set.seed(123)

# Prepare needed libraries
packages <- c("psych",
              "tidyverse", 
              "conflicted",
              "MASS",
              "caret",
              "skimr", 
              "car",
              "visdat",
              "stargazer", 
              "ggcorrplot",
              "glmmTMB",
              "performance", 
              "corrplot"
              )

for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
                     )
  }
  library(packages[i], character.only = TRUE)
}

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: lattice

## Warning: package 'car' was built under R version 4.3.3

## Loading required package: carData
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

## Warning: package 'glmmTMB' was built under R version 4.3.3

## Warning in checkMatrixPackageVersion(getOption("TMB.check.Matrix", TRUE)): Package version inconsistency detected.
## TMB was built with Matrix ABI version 1
## Current Matrix ABI version is 0
## Please re-install 'TMB' from source using install.packages('TMB', type = 'source') or ask CRAN for a binary version of 'TMB' matching CRAN's 'Matrix' package

## Warning: package 'performance' was built under R version 4.3.3

## corrplot 0.92 loaded

rm(packages)

conflict_prefer(name = "describe", winner = "psych")

## [conflicted] Will prefer psych::describe over any other package.

conflict_prefer(name = "select"  , winner = "dplyr")

## [conflicted] Will prefer dplyr::select over any other package.

1.0.3 Setup - load raw data

# Set working directory and path to data
setwd("~/Desktop")
  
# Load your data
df    <- read.csv("wine-data-1.csv")

stargazer(df, type = "text", summary = TRUE)

## 
## ================================================================
## Statistic            N      Mean    St. Dev.    Min       Max   
## ----------------------------------------------------------------
## INDEX              12,795 8,069.980 4,656.905    1      16,129  
## TARGET             12,795   3.029     1.926      0         8    
## FixedAcidity       12,795   7.076     6.318   -18.100   34.400  
## VolatileAcidity    12,795   0.324     0.784    -2.790    3.680  
## CitricAcid         12,795   0.308     0.862    -3.240    3.860  
## ResidualSugar      12,179   5.419    33.749   -127.800  141.150 
## Chlorides          12,157   0.055     0.318    -1.171    1.351  
## FreeSulfurDioxide  12,148  30.846    148.715  -555.000  623.000 
## TotalSulfurDioxide 12,113  120.714   231.913  -823.000 1,057.000
## Density            12,795   0.994     0.027    0.888     1.099  
## pH                 12,400   3.208     0.680    0.480     6.130  
## Sulphates          11,585   0.527     0.932    -3.130    4.240  
## Alcohol            12,142  10.489     3.728    -4.700   26.500  
## LabelAppeal        12,795  -0.009     0.891      -2        2    
## AcidIndex          12,795   7.773     1.324      4        17    
## STARS              9,436    2.042     0.903      1         4    
## ----------------------------------------------------------------

# Drop the INDEX column
df <- df[, !(colnames(df) %in% c("INDEX"))]

vis_miss(df)

2 Clean

# List of columns to drop
columns_to_drop <- c("FixedAcidity", "VolatileAcidity", 
                     "ResidualSugar", "Chlorides", "FreeSulfurDioxide", "Sulphates")

# Drop the columns from the dataset
df <- df[, !(names(df) %in% columns_to_drop)]

# List of variables to take the absolute values
columns_to_abs <- c("CitricAcid", "TotalSulfurDioxide", "Alcohol")

# Apply absolute value transformation
df[columns_to_abs] <- lapply(df[columns_to_abs], abs)

# Impute missing values in the pH column with the median
df$pH[is.na(df$pH)] <- median(df$pH, na.rm = TRUE)

# Impute missing values for the Alcohol variable 
df$Alcohol[is.na(df$Alcohol)] <- median(df$Alcohol, na.rm = TRUE) 

# Impute missing values for other variables (using median)
columns_to_impute <- c("CitricAcid", "TotalSulfurDioxide", "pH")

for (col in columns_to_impute) {
  df[[col]][is.na(df[[col]])] <- median(df[[col]], na.rm = TRUE)
}

#Create a missing flag for STARS
df$STARS_Flag <- ifelse(is.na(df$STARS), 1, 0)

#Impute missing values in STARS with 0
df$STARS[is.na(df$STARS)] <- 0

df$log_AcidIndex <- log(df$AcidIndex+1)
df$log_TotalSulfurDioxide <- log(df$TotalSulfurDioxide+1)

str(df)

## 'data.frame':    12795 obs. of  12 variables:
##  $ TARGET                : int  3 3 5 3 4 0 0 4 3 6 ...
##  $ CitricAcid            : num  0.98 0.81 0.88 0.04 1.26 0.59 0.4 0.34 1.05 0.39 ...
##  $ TotalSulfurDioxide    : num  268 327 142 115 108 15 156 551 154 180 ...
##  $ Density               : num  0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                    : num  3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Alcohol               : num  9.9 10.4 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal           : int  0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex             : int  8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS                 : num  2 3 3 1 2 0 0 3 0 4 ...
##  $ STARS_Flag            : num  0 0 0 0 0 1 1 0 1 0 ...
##  $ log_AcidIndex         : num  2.2 2.08 2.2 1.95 2.3 ...
##  $ log_TotalSulfurDioxide: num  5.59 5.79 4.96 4.75 4.69 ...

No more NA values.

2.1 Setup - split raw data into testing and training data

# Set the seed for reproducibility
set.seed(123)

# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df), 
                   size = round(0.8 * nrow(df)
                                   )
                      )

# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval  <- df[-train_index, ]

2.1.1 Training Data

str(df_train)

## 'data.frame':    10236 obs. of  12 variables:
##  $ TARGET                : int  0 3 3 2 6 3 3 4 4 0 ...
##  $ CitricAcid            : num  0.78 0.25 0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
##  $ TotalSulfurDioxide    : num  326 138 154 369 154 161 122 97 115 83 ...
##  $ Density               : num  0.946 1 0.998 0.964 0.956 ...
##  $ pH                    : num  3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
##  $ Alcohol               : num  9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 10.4 ...
##  $ LabelAppeal           : int  0 0 -2 -1 1 -1 -1 0 0 -1 ...
##  $ AcidIndex             : int  9 7 9 7 6 9 6 8 6 7 ...
##  $ STARS                 : num  2 2 2 0 2 2 2 2 3 0 ...
##  $ STARS_Flag            : num  0 0 0 1 0 0 0 0 0 1 ...
##  $ log_AcidIndex         : num  2.3 2.08 2.3 2.08 1.95 ...
##  $ log_TotalSulfurDioxide: num  5.79 4.93 5.04 5.91 5.04 ...

nrow(df_train)

## [1] 10236

2.1.2 Testing Data

str(df_eval)

## 'data.frame':    2559 obs. of  12 variables:
##  $ TARGET                : int  3 4 5 2 6 4 5 4 0 4 ...
##  $ CitricAcid            : num  0.04 0.34 0.27 0.24 0.27 1.1 0.09 2.75 2.32 0.26 ...
##  $ TotalSulfurDioxide    : num  115 551 229 60 154 129 154 126 234 220 ...
##  $ Density               : num  0.996 1.032 0.999 1.009 0.994 ...
##  $ pH                    : num  2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 3.2 ...
##  $ Alcohol               : num  6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
##  $ LabelAppeal           : int  -1 1 1 -1 1 0 0 0 -1 0 ...
##  $ AcidIndex             : int  6 7 11 7 7 9 8 8 7 7 ...
##  $ STARS                 : num  1 3 2 1 4 2 2 1 1 2 ...
##  $ STARS_Flag            : num  0 0 0 0 0 0 0 0 0 0 ...
##  $ log_AcidIndex         : num  1.95 2.08 2.48 2.08 2.08 ...
##  $ log_TotalSulfurDioxide: num  4.75 6.31 5.44 4.11 5.04 ...

nrow(df_eval)

## [1] 2559

3 Visualization

# Calculate the correlation matrix for numeric variables
cor_matrix <- cor(df_train, use = "complete.obs")

# Calculate p-values for the significance of correlations
p.mat <- cor.mtest(df_train, conf.level = 0.95)$p

# Create a smaller, less crowded correlation plot with significance
corrplot(cor_matrix, 
         type = "lower",                
         method = "circle",            
         tl.cex = 0.5,                
         diag = FALSE,                  
         number.cex = 0.6,             
         mar = c(0, 0, 1, 0),           
         tl.srt = 45,                  
         tl.col = "black",            
         p.mat = p.mat,               
         sig.level = 0.05,             
         insig = "pch",                
         pch = 4,                    
         cl.pos = "r")

CitricAcid: Based on the checkmarks in the matrix, CitricAcid does not appear to be strongly correlated with the other variables shown.
Density: Density shows negative correlations with pH and Alcohol, based on the pattern of checkmarks.
pH: The pH variable exhibits slight negative correlations with TotalSulfurDioxide, Density, and Alcohol.
The log-transformed variables (log_AcidIndex and log_TotalSulfurDioxide) appear to have positive correlations with the original variables as expected.
The STARS and STARS_Flag variables seem to be positively correlated with each other, as one would expect.
STARS seem to be fairly positively correlated with the Target.

I have transformed certain values after checking the distribution to reduce the skewness.

df_train_long <- df_train %>%
  gather(key = "variable", value = "value")

# Plot the distributions of all columns
ggplot(df_train_long, aes(x = value)) +
  geom_histogram(bins = 30, fill = "lightblue", color = "black", alpha = 0.7) +
  facet_wrap(~ variable, scales = "free_x") +
  theme_minimal() +
  theme(strip.text = element_text(size = 8)) +
  labs(title = "Distributions of All Columns", x = "Value", y = "Frequency")

4 Model

4.1 Checking for overdispersion

# Calculate mean and variance of the target variable
mean_target <- mean(df_train$TARGET, na.rm = TRUE)
variance_target <- var(df_train$TARGET, na.rm = TRUE)

cat("Mean of TARGET:", mean_target, "\n")

## Mean of TARGET: 3.02628

cat("Variance of TARGET:", variance_target, "\n")

## Variance of TARGET: 3.714111

# Check for overdispersion
if (variance_target > mean_target) {
  cat("There is overdispersion in the data.\n")
} else {
  cat("No overdispersion detected.\n")
}

## There is overdispersion in the data.

4.2 Quasi-poisson model 1

# Quasi-Poisson Model 1 (Original variables)
model_quasi1 <- glm(TARGET ~ ., 
                    family = quasipoisson(link = "log"), 
                    data = df_train)

# Summary of the model
summary(model_quasi1)

## 
## Call:
## glm(formula = TARGET ~ ., family = quasipoisson(link = "log"), 
##     data = df_train)
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            -1.543e+00  5.132e-01  -3.006  0.00265 ** 
## CitricAcid              9.511e-03  8.740e-03   1.088  0.27653    
## TotalSulfurDioxide     -1.985e-04  6.517e-05  -3.046  0.00232 ** 
## Density                -3.058e-01  2.011e-01  -1.521  0.12833    
## pH                     -1.217e-02  8.092e-03  -1.503  0.13274    
## Alcohol                 2.519e-03  1.522e-03   1.655  0.09793 .  
## LabelAppeal             1.592e-01  6.395e-03  24.902  < 2e-16 ***
## AcidIndex              -3.363e-01  4.009e-02  -8.388  < 2e-16 ***
## STARS                   1.905e-01  6.381e-03  29.863  < 2e-16 ***
## STARS_Flag             -6.412e-01  2.248e-02 -28.525  < 2e-16 ***
## log_AcidIndex           2.337e+00  3.609e-01   6.476 9.85e-11 ***
## log_TotalSulfurDioxide  6.169e-02  1.252e-02   4.929 8.40e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.8816674)
## 
##     Null deviance: 18306  on 10235  degrees of freedom
## Residual deviance: 10955  on 10224  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

Key Findings:

LabelAppeal (p < 2e-16) and STARS (p < 2e-16) are highly significant predictors, with LabelAppeal having a positive impact (with a large positive coefficient), suggesting that higher LabelAppeal leads to a higher expected count of the target variable.
AcidIndex (p < 2e-16) and log_AcidIndex (p < 2e-16) show negative relationships, meaning as these values increase, the expected count of the target decreases.
TotalSulfurDioxide (p = 0.00232) and log_TotalSulfurDioxide (p = 8.40e-07) are also statistically significant, with positive relationships, indicating an increase in these values is associated with an increase in the expected count of the target.

It is has lower than 1 value for dispersion which might mean that it is facing underdispersion. So I’ll switch to Poisson model.

4.3 Poisson model 2

# Poisson Model 1
model_p1 <- glm(TARGET ~ . -AcidIndex -TotalSulfurDioxide, 
                    family = poisson(link = "log"), 
                    data = df_train)

# Summary of the model
summary(model_p1)

## 
## Call:
## glm(formula = TARGET ~ . - AcidIndex - TotalSulfurDioxide, family = poisson(link = "log"), 
##     data = df_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             2.493624   0.237149  10.515  < 2e-16 ***
## CitricAcid              0.009679   0.009310   1.040    0.298    
## Density                -0.296588   0.214043  -1.386    0.166    
## pH                     -0.013944   0.008619  -1.618    0.106    
## Alcohol                 0.002167   0.001620   1.338    0.181    
## LabelAppeal             0.158276   0.006810  23.242  < 2e-16 ***
## STARS                   0.192519   0.006788  28.364  < 2e-16 ***
## STARS_Flag             -0.649214   0.023927 -27.133  < 2e-16 ***
## log_AcidIndex          -0.690624   0.044828 -15.406  < 2e-16 ***
## log_TotalSulfurDioxide  0.032655   0.006885   4.743 2.11e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18306  on 10235  degrees of freedom
## Residual deviance: 11032  on 10226  degrees of freedom
## AIC: 36592
## 
## Number of Fisher Scoring iterations: 6

vif(model_p1)

##             CitricAcid                Density                     pH 
##               1.002323               1.002079               1.006182 
##                Alcohol            LabelAppeal                  STARS 
##               1.010625               1.130031               1.797289 
##             STARS_Flag          log_AcidIndex log_TotalSulfurDioxide 
##               1.626384               1.035999               1.007606

Key Findings:

LabelAppeal (0.158): For each one-unit increase in LabelAppeal, the expected count of the target variable increases by 1.17 times. This suggests that higher label appeal is strongly associated with an increase in the outcome. This result is highly significant (p < 2e-16), indicating its robust impact.
STARS (0.193): Each one-unit increase in STARS leads to an increase in the expected count of the target by 1.21 times. This implies that more stars are positively related to the outcome, with a 21% increase in the expected count for every additional star. This is also highly significant (p < 2e-16).
STARS_Flag (-0.649): The presence of a STARS_Flag is associated with a 48% decrease in the expected count of the target variable, with a statistical significance (p < 2e-16). This suggests that the STARS_Flag may act as a significant negative factor.

LabelAppeal and STARS are the strongest positive predictors, with STARS_Flag being a significant negative predictor.

4.4 Poisson model 3 - Main model

# Poisson Model 2 
model_p2 <- glm(TARGET ~ LabelAppeal + log_AcidIndex + STARS + log_TotalSulfurDioxide + STARS_Flag, 
                    family = poisson(link = "log"),  
                    data = df_train)

# Summary of the model
summary(model_p2)

## 
## Call:
## glm(formula = TARGET ~ LabelAppeal + log_AcidIndex + STARS + 
##     log_TotalSulfurDioxide + STARS_Flag, family = poisson(link = "log"), 
##     data = df_train)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             2.180830   0.105506  20.670  < 2e-16 ***
## LabelAppeal             0.158238   0.006807  23.247  < 2e-16 ***
## log_AcidIndex          -0.689366   0.044569 -15.467  < 2e-16 ***
## STARS                   0.193249   0.006770  28.546  < 2e-16 ***
## log_TotalSulfurDioxide  0.032406   0.006877   4.712 2.45e-06 ***
## STARS_Flag             -0.648910   0.023916 -27.133  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18306  on 10235  degrees of freedom
## Residual deviance: 11039  on 10230  degrees of freedom
## AIC: 36591
## 
## Number of Fisher Scoring iterations: 6

vif(model_p2)

##            LabelAppeal          log_AcidIndex                  STARS 
##               1.128980               1.023758               1.787895 
## log_TotalSulfurDioxide             STARS_Flag 
##               1.005952               1.624839

4.4.1 Key Findings:

Magnitude:

Intercept: The intercept is 2.1808, which is the baseline value for the model when all predictors are zero.
LabelAppeal: The coefficient for LabelAppeal is 0.1582. Exponentiating this, we get an odds ratio of approximately 1.171 (exp(0.1582)), which means that for each unit increase in LabelAppeal, the odds of the target outcome increase by 17.1%, holding all other variables constant. This is a substantial positive effect on the outcome.
log_AcidIndex: The coefficient for log_AcidIndex is -0.6894, which is negative. Exponentiating this gives an odds ratio of approximately 0.502 (exp(-0.6894)), suggesting that for each unit increase in log_AcidIndex, the odds of the target outcome decrease by about 50.2%. This suggests that as the AcidIndex increases, the likelihood of the target event happening decreases.
STARS: The coefficient for STARS is 0.1932, which gives an odds ratio of 1.213 (exp(0.1932)), indicating that for each unit increase in STARS, the odds of the target outcome increase by approximately 21.3%, assuming all other factors remain constant. This suggests that a higher STARS value increases the likelihood of the outcome.
log_TotalSulfurDioxide: The coefficient for log_TotalSulfurDioxide is 0.0324, which gives an odds ratio of 1.033 (exp(0.0324)), suggesting that for each unit increase in log_TotalSulfurDioxide, the odds of the target outcome increase by 3.3%. This is a relatively small but positive effect on the outcome.
STARS_Flag: The coefficient for STARS_Flag is -0.6489, leading to an odds ratio of 0.523 (exp(-0.6489)). This indicates that when the STARS_Flag is active, the odds of the target outcome decrease by about 47.7% compared to when the STARS_Flag is inactive, assuming all other variables are held constant. This shows a negative relationship with the outcome.

Significance: All variables are highly significant, with p-values well below the typical threshold of 0.05.

Direction:

LabelAppeal, STARS, and log_TotalSulfurDioxide have a positive direction, meaning that increases in these variables are associated with higher odds of the target outcome.
log_AcidIndex and STARS_Flag have a negative direction, meaning that increases in these variables are associated with a decrease in the odds of the target outcome. Specifically, log_AcidIndex shows a decrease in the outcome with higher acidity, and STARS_Flag decreases the odds of the outcome when flagged.

4.5 Negative binomial model 1

model_nb1 <- glmmTMB(TARGET ~  CitricAcid + Density + pH + Alcohol + LabelAppeal + log_AcidIndex 
                     + log_TotalSulfurDioxide + STARS_Flag, 
                     data = df_train, family = nbinom2)

summary(model_nb1)

##  Family: nbinom2  ( log )
## Formula:          TARGET ~ CitricAcid + Density + pH + Alcohol + LabelAppeal +  
##     log_AcidIndex + log_TotalSulfurDioxide + STARS_Flag
## Data: df_train
## 
##      AIC      BIC   logLik deviance df.resid 
##  37387.4  37459.7 -18683.7  37367.4    10226 
## 
## 
## Dispersion parameter for nbinom2 family (): 1.91e+07 
## 
## Conditional model:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             3.088174   0.235726   13.10  < 2e-16 ***
## CitricAcid              0.009795   0.009318    1.05 0.293161    
## Density                -0.314104   0.213720   -1.47 0.141644    
## pH                     -0.014840   0.008621   -1.72 0.085199 .  
## Alcohol                 0.005382   0.001614    3.33 0.000857 ***
## LabelAppeal             0.219472   0.006433   34.11  < 2e-16 ***
## log_AcidIndex          -0.785658   0.044473  -17.67  < 2e-16 ***
## log_TotalSulfurDioxide  0.033023   0.006852    4.82 1.44e-06 ***
## STARS_Flag             -1.050012   0.018949  -55.41  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Key Findings:

LabelAppeal and log_TotalSulfurDioxide are positively related to the outcome, with LabelAppeal having the largest effect.
STARS_Flag and log_AcidIndex have significant negative effects, reducing the expected count of the target.
Alcohol also shows a moderate positive relationship, though less impactful than the others.

4.6 Negative binomial model 2

model_nb2 <- glmmTMB(TARGET ~  LabelAppeal + log_AcidIndex 
                     + log_TotalSulfurDioxide + STARS_Flag, 
                     data = df_train, family = nbinom2)

summary(model_nb2)

##  Family: nbinom2  ( log )
## Formula:          
## TARGET ~ LabelAppeal + log_AcidIndex + log_TotalSulfurDioxide +      STARS_Flag
## Data: df_train
## 
##      AIC      BIC   logLik deviance df.resid 
##  37396.9  37440.3 -18692.4  37384.9    10230 
## 
## 
## Dispersion parameter for nbinom2 family (): 2.1e+07 
## 
## Conditional model:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             2.804523   0.102531   27.35  < 2e-16 ***
## LabelAppeal             0.219699   0.006431   34.16  < 2e-16 ***
## log_AcidIndex          -0.789490   0.044212  -17.86  < 2e-16 ***
## log_TotalSulfurDioxide  0.032229   0.006841    4.71 2.46e-06 ***
## STARS_Flag             -1.051858   0.018942  -55.53  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Key Findings:

LabelAppeal remains a significant positive predictor of the outcome, with the largest effect.
log_AcidIndex and STARS_Flag are significant negative predictors.
log_TotalSulfurDioxide also shows a positive relationship, albeit less impactful than LabelAppeal.

4.7 Multiple Linear Regression Model 1

# Multiple Linear Regression Model 1 (Original variables)
mlr_model1 <- lm(TARGET ~ ., 
                 data = df_train)

# Summary of the model
summary(mlr_model1)

## 
## Call:
## lm(formula = TARGET ~ ., data = df_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6673 -0.8485  0.0341  0.8492  5.6573 
## 
## Coefficients:
##                          Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             0.7092568  1.0585680   0.670 0.502863    
## CitricAcid              0.0373772  0.0212971   1.755 0.079283 .  
## TotalSulfurDioxide     -0.0005147  0.0001515  -3.396 0.000686 ***
## Density                -0.8687174  0.4856191  -1.789 0.073663 .  
## pH                     -0.0328724  0.0195134  -1.685 0.092095 .  
## Alcohol                 0.0089284  0.0036836   2.424 0.015375 *  
## LabelAppeal             0.4677835  0.0151916  30.792  < 2e-16 ***
## AcidIndex              -0.4315300  0.0731687  -5.898 3.80e-09 ***
## STARS                   0.7876107  0.0174415  45.157  < 2e-16 ***
## STARS_Flag             -0.6725265  0.0456000 -14.748  < 2e-16 ***
## log_AcidIndex           2.2247775  0.6925852   3.212 0.001321 ** 
## log_TotalSulfurDioxide  0.1623784  0.0277456   5.852 4.99e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.308 on 10224 degrees of freedom
## Multiple R-squared:   0.54,  Adjusted R-squared:  0.5395 
## F-statistic:  1091 on 11 and 10224 DF,  p-value: < 2.2e-16

vif(mlr_model1)

##             CitricAcid     TotalSulfurDioxide                Density 
##               1.002505               3.479969               1.002117 
##                     pH                Alcohol            LabelAppeal 
##               1.006447               1.008055               1.107214 
##              AcidIndex                  STARS             STARS_Flag 
##              57.433712               2.573974               2.410021 
##          log_AcidIndex log_TotalSulfurDioxide 
##              57.277803               3.520372

The model shows that LabelAppeal, log_AcidIndex, log_TotalSulfurDioxide, and STARS are strong and significant predictors of the target. There are both positive and negative relationships with the outcome, with LabelAppeal and STARS having the most significant positive impacts, while AcidIndex and STARS_Flag exert negative influences.

4.8 Multiple Linear Regression Model 2

# Multiple Linear Regression Model 2 (Log-transformed variables)
mlr_model2 <- lm(TARGET ~ . -AcidIndex -TotalSulfurDioxide, 
                 data = df_train)

# Summary of the model
summary(mlr_model2)

## 
## Call:
## lm(formula = TARGET ~ . - AcidIndex - TotalSulfurDioxide, data = df_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6330 -0.8448  0.0263  0.8496  5.7450 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)             6.431748   0.534462  12.034  < 2e-16 ***
## CitricAcid              0.036240   0.021343   1.698   0.0895 .  
## Density                -0.860836   0.486638  -1.769   0.0769 .  
## pH                     -0.037003   0.019545  -1.893   0.0584 .  
## Alcohol                 0.008444   0.003690   2.288   0.0221 *  
## LabelAppeal             0.464291   0.015216  30.514  < 2e-16 ***
## STARS                   0.790748   0.017473  45.257  < 2e-16 ***
## STARS_Flag             -0.681068   0.045682 -14.909  < 2e-16 ***
## log_AcidIndex          -1.844852   0.094099 -19.605  < 2e-16 ***
## log_TotalSulfurDioxide  0.087169   0.014927   5.840 5.39e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.311 on 10226 degrees of freedom
## Multiple R-squared:  0.5379, Adjusted R-squared:  0.5375 
## F-statistic:  1323 on 9 and 10226 DF,  p-value: < 2.2e-16

vif(mlr_model2)

##             CitricAcid                Density                     pH 
##               1.002434               1.001955               1.005269 
##                Alcohol            LabelAppeal                  STARS 
##               1.007084               1.105924               2.571920 
##             STARS_Flag          log_AcidIndex log_TotalSulfurDioxide 
##               2.408152               1.052729               1.014531

Key Findings:

LabelAppeal (0.464): LabelAppeal remains a highly significant predictor, with a strong positive relationship to the target (p < 2e-16). An increase in LabelAppeal leads to a substantial increase in the target.
STARS (0.791): STARS also shows a very strong positive impact (p < 2e-16). Higher STARS values are strongly associated with higher values of the target.
log_AcidIndex (-1.845): The relationship between log_AcidIndex and the target is negative and highly significant (p < 2e-16). A higher log_AcidIndex reduces the target.
log_TotalSulfurDioxide (0.087): log_TotalSulfurDioxide has a positive effect (p < 5.39e-09), meaning higher sulfur dioxide levels are linked to an increase in the target.
Alcohol (0.008): Alcohol also shows a significant positive effect (p = 0.0221), indicating that increased alcohol content is associated with an increase in the target.
CitricAcid (0.036), Density (-0.861), and pH (-0.037): These variables have weaker effects with marginal significance. CitricAcid and Density show a tendency to influence the target (p = 0.0895 and p = 0.0769, respectively), while pH shows a trend toward significance (p = 0.0584).

5 Model Comparison

Poisson Models:

Full Model:

Residual Deviance: 11,032
AIC: 36,592
Significant Predictors: LabelAppeal, STARS, STARS_Flag, log_AcidIndex, log_TotalSulfurDioxide.

Reduced Model:

Residual Deviance: 11,039 (slightly worse)
AIC: 36,591 (slightly better)
Significant Predictors: Same as the full model.

The reduced model has fewer predictors with almost the same fit, making it more parsimonious.

Negative Binomial Models:

Full Model:

AIC: 37,387
Significant Predictors: Similar to Poisson but includes Alcohol.

Reduced Model:

AIC: 37,396 (slightly worse)
Significant Predictors: Similar to Poisson.

Both models have higher AIC than Poisson. The inclusion of Alcohol in the full model adds complexity without substantial improvement.

Reduced Linear Model:

Adjusted R-squared: 0.5375
Residual Standard Error: 1.311
Significant Predictors: LabelAppeal, STARS, STARS_Flag, log_AcidIndex, log_TotalSulfurDioxide.

The linear model explains ~53.8% of the variance. However, it may not capture count-like behavior in TARGET.

5.0.1 Chosen Model:

The reduced Poisson model is the best:

Lowest AIC (36,591) - could be better.
Parsimonious with significant predictors only.

6 Predict on Evaluation data

By using the round() function on the Poisson/negative binomial regression forecasts, you will obtain integer-like predictions that are more suitable for count data. These rounded predictions should be more aligned with your expectations of integer values.

# Generate predictions for the evaluation dataset using the reduced Poisson model
forecasts_reduced_poisson <- 
  predict.glm(object = model_p2, 
              newdata = df_eval,
              type = "response")

# Summarize the forecasts
summary(forecasts_reduced_poisson)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.6677  1.5594  3.0457  3.0293  3.9689  8.6394

# Round the predictions to the nearest whole number
rounded_forecasts_reduced_poisson <- round(forecasts_reduced_poisson)

# View the rounded predictions
head(rounded_forecasts_reduced_poisson)

##  4  8 19 22 28 29 
##  3  5  3  2  6  3

The predictions, on average, seem to be around 3 with a slight skew towards higher values, given the range (Min: 0.67, Max: 8.64).
The most common rounded prediction is 8, which appears 5 times, and the distribution is fairly spread out.

7 Confusion Matrix

table(df_eval$TARGET)

## 
##   0   1   2   3   4   5   6   7   8 
## 545  44 215 532 618 423 154  25   3

table(rounded_forecasts_reduced_poisson )

## rounded_forecasts_reduced_poisson
##   1   2   3   4   5   6   7   8   9 
## 610 236 769 581 235  98  25   4   1

table(df_eval$TARGET, rounded_forecasts_reduced_poisson)

##    rounded_forecasts_reduced_poisson
##       1   2   3   4   5   6   7   8   9
##   0 361  81  91  11   1   0   0   0   0
##   1  30  10   4   0   0   0   0   0   0
##   2  60  60  84  10   1   0   0   0   0
##   3  92  52 267 110  11   0   0   0   0
##   4  46  15 252 229  68   7   1   0   0
##   5  17  10  63 176 106  41   9   0   1
##   6   3   8   8  44  44  36  11   0   0
##   7   0   0   0   1   4  12   4   4   0
##   8   1   0   0   0   0   2   0   0   0

# Calculate the accuracy
correct_predictions <- sum(rounded_forecasts_reduced_poisson == df_eval$TARGET)
total_predictions <- length(df_eval$TARGET)

accuracy <- correct_predictions / total_predictions
print(paste("Accuracy: ", round(accuracy, 4)))

## [1] "Accuracy:  0.286"

An accuracy of 0.286 (or 28.6%) indicates that the model correctly predicted the target values about 28.6% of the time. This suggests that the model might not be performing as well as expected.

Homework 3

Aritra Ray

2024-12-12

1 Setup

1.0.1 Setup - empty variables and functions in the environment tab/window

1.0.2 Setup - load packages

1.0.3 Setup - load raw data

2 Clean

2.1 Setup - split raw data into testing and training data

2.1.1 Training Data

2.1.2 Testing Data

3 Visualization

4 Model

4.1 Checking for overdispersion

4.2 Quasi-poisson model 1

4.3 Poisson model 2

4.4 Poisson model 3 - Main model

4.4.1 Key Findings:

4.5 Negative binomial model 1

4.6 Negative binomial model 2

4.7 Multiple Linear Regression Model 1

4.8 Multiple Linear Regression Model 2

5 Model Comparison

5.0.1 Chosen Model:

6 Predict on Evaluation data

7 Confusion Matrix