1 Setup

1.0.1 Setup - empty variables and functions in the environment tab/window

# Clear the workspace

rm(list = ls()) # Clear environment
gc()            # Clear unused memory
##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  553566 29.6    1231488 65.8         NA   700257 37.4
## Vcells 1026324  7.9    8388608 64.0      32768  1963430 15.0
cat("\f")       # Clear the console

1.0.2 Setup - load packages

# Set the seed for reproducibility
set.seed(123)

# Prepare needed libraries
packages <- c("psych",
              "tidyverse", 
              "conflicted",
              "MASS",
              "caret",
              "skimr",
              "htmltools",
              "stargazer",
              "modelsummary"
              )


for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
                     )
  }
  library(packages[i], character.only = TRUE)
}
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: lattice
## 
## 
## Please cite as: 
## 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## 
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
## 
## 
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)
rm(packages)

conflict_prefer(name = "describe", winner = "psych")
## [conflicted] Will prefer psych::describe over any other package.
conflict_prefer(name = "select"  , winner = "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.

1.0.3 Setup - load raw data

# Set working directory and path to data
setwd("~/Library/CloudStorage/Dropbox/WCAS/Econometrics/")
  
# Load your data
df    <- read.csv("wine-data.csv")

skim(df)
Data summary
Name df
Number of rows 12795
Number of columns 16
_______________________
Column type frequency:
numeric 16
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
INDEX 0 1.00 8069.98 4656.91 1.00 4037.50 8110.00 12106.50 16129.00 ▇▇▇▇▇
TARGET 0 1.00 3.03 1.93 0.00 2.00 3.00 4.00 8.00 ▆▇▇▆▁
FixedAcidity 0 1.00 7.08 6.32 -18.10 5.20 6.90 9.50 34.40 ▁▂▇▂▁
VolatileAcidity 0 1.00 0.32 0.78 -2.79 0.13 0.28 0.64 3.68 ▁▂▇▂▁
CitricAcid 0 1.00 0.31 0.86 -3.24 0.03 0.31 0.58 3.86 ▁▂▇▂▁
ResidualSugar 616 0.95 5.42 33.75 -127.80 -2.00 3.90 15.90 141.15 ▁▂▇▂▁
Chlorides 638 0.95 0.05 0.32 -1.17 -0.03 0.05 0.15 1.35 ▁▂▇▂▁
FreeSulfurDioxide 647 0.95 30.85 148.71 -555.00 0.00 30.00 70.00 623.00 ▁▂▇▂▁
TotalSulfurDioxide 682 0.95 120.71 231.91 -823.00 27.00 123.00 208.00 1057.00 ▁▂▇▂▁
Density 0 1.00 0.99 0.03 0.89 0.99 0.99 1.00 1.10 ▁▂▇▂▁
pH 395 0.97 3.21 0.68 0.48 2.96 3.20 3.47 6.13 ▁▂▇▂▁
Sulphates 1210 0.91 0.53 0.93 -3.13 0.28 0.50 0.86 4.24 ▁▂▇▂▁
Alcohol 653 0.95 10.49 3.73 -4.70 9.00 10.40 12.40 26.50 ▁▂▇▂▁
LabelAppeal 0 1.00 -0.01 0.89 -2.00 -1.00 0.00 1.00 2.00 ▁▅▇▅▁
AcidIndex 0 1.00 7.77 1.32 4.00 7.00 8.00 8.00 17.00 ▁▇▁▁▁
STARS 3359 0.74 2.04 0.90 1.00 1.00 2.00 3.00 4.00 ▇▇▁▅▂

1.1 Setup - split raw data into testing and training data

# Set the seed for reproducibility
set.seed(123)

# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df), 
                   size = round(0.8 * nrow(df)
                                   )
                      )

# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval  <- df[-train_index, ]

1.1.1 Testing Data

str(df_train)
## 'data.frame':    10236 obs. of  16 variables:
##  $ INDEX             : int  3117 3172 13132 10988 15741 3772 2319 11765 4253 14680 ...
##  $ TARGET            : int  0 3 3 2 6 3 3 4 4 0 ...
##  $ FixedAcidity      : num  9.5 -2.4 9.9 6.7 11.9 9.1 5.7 5.5 5.6 6.5 ...
##  $ VolatileAcidity   : num  -0.71 1.08 0.13 0.34 0.2 0.98 0.21 0.3 0.485 0.24 ...
##  $ CitricAcid        : num  0.78 0.25 -0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
##  $ ResidualSugar     : num  -42.2 -10 2.5 6.4 18.7 ...
##  $ Chlorides         : num  -0.022 -0.051 -0.403 -0.386 -0.169 0.036 0.03 0.029 0.051 -0.844 ...
##  $ FreeSulfurDioxide : num  270 321 244 105 -58 44 277 28 193 26 ...
##  $ TotalSulfurDioxide: num  -326 138 NA 369 NA 161 122 97 115 83 ...
##  $ Density           : num  0.946 1 0.998 0.964 0.956 ...
##  $ pH                : num  3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
##  $ Sulphates         : num  NA -1.05 1.3 0.52 1.24 0.41 0.52 0.44 0.38 0.38 ...
##  $ Alcohol           : num  9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 NA ...
##  $ LabelAppeal       : int  0 0 -2 -1 1 -1 -1 0 0 -1 ...
##  $ AcidIndex         : int  9 7 9 7 6 9 6 8 6 7 ...
##  $ STARS             : int  2 2 2 NA 2 2 2 2 3 NA ...
nrow(df_train)
## [1] 10236

1.1.2 Training Data

str(df_eval)
## 'data.frame':    2559 obs. of  16 variables:
##  $ INDEX             : int  5 11 24 27 35 36 45 50 58 74 ...
##  $ TARGET            : int  3 4 5 2 6 4 5 4 0 4 ...
##  $ FixedAcidity      : num  5.7 6.5 10 6.5 8.3 8 14.8 3.1 6.6 6.2 ...
##  $ VolatileAcidity   : num  0.385 -1.22 0.23 0.24 -2.46 -0.01 1.17 1.65 0.24 0.16 ...
##  $ CitricAcid        : num  0.04 0.34 0.27 0.24 0.27 -1.1 -0.09 -2.75 2.32 0.26 ...
##  $ ResidualSugar     : num  18.8 1.4 14.1 -18.2 13.2 ...
##  $ Chlorides         : num  -0.425 0.04 0.033 -0.182 0.036 -0.35 0.027 -0.002 -0.356 -0.204 ...
##  $ FreeSulfurDioxide : num  22 523 -188 15 31 17 -166 54 242 208 ...
##  $ TotalSulfurDioxide: num  115 551 229 60 NA 129 NA 126 234 220 ...
##  $ Density           : num  0.996 1.032 0.999 1.009 0.994 ...
##  $ pH                : num  2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 NA ...
##  $ Sulphates         : num  1.83 NA 0.88 -1.01 0.02 NA 0.33 NA 0.39 0.58 ...
##  $ Alcohol           : num  6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
##  $ LabelAppeal       : int  -1 1 1 -1 1 0 0 0 -1 0 ...
##  $ AcidIndex         : int  6 7 11 7 7 9 8 8 7 7 ...
##  $ STARS             : int  1 3 2 1 4 2 2 1 1 2 ...
nrow(df_eval)
## [1] 2559

2 Clean and Model on Testing data

2.1 Clean

df_train_clean <- df_train 
df_train_clean$pH                     <- replace(x = df_train$pH, list = -3, values = 0)
df_train_clean$pH[is.na(df_train$pH)] <- mean(df_train$pH, na.rm = TRUE)

df_train_clean$AcidIndex              <- replace(df_train$AcidIndex, list = -3, values =  0)
df_train_clean$AcidIndex[is.na(df_train$AcidIndex)] <- mean(df_train$AcidIndex, na.rm = TRUE)

# select variables
df_train_clean <- dplyr::select(.data = df_train_clean, 
                         TARGET, pH, AcidIndex, STARS, Chlorides, ResidualSugar)

3 Model

  1. Ultimately, the choice between using a negative binomial or Poisson model depends on the characteristics of your data and the goals of your analysis. If overdispersion is a concern in your data, and you want to accurately model count data, the negative binomial model is usually a better choice, even if the predictions are not strictly integers. So have a look at your y variable.
?glm # glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution

?stats::family

poisson <- glm(data    = df_train_clean , 
              formula  = TARGET ~  . , 
              family   = poisson(link = "log")
             )

summary(poisson)
## 
## Call:
## glm(formula = TARGET ~ ., family = poisson(link = "log"), data = df_train_clean)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    7.671e-01  1.651e-02  46.452   <2e-16 ***
## pH            -5.104e-04  1.139e-02  -0.045   0.9643    
## AcidIndex     -2.043e-02  6.423e-02  -0.318   0.7504    
## STARS          2.504e-01  6.733e-03  37.184   <2e-16 ***
## Chlorides     -3.936e-02  1.994e-02  -1.973   0.0484 *  
## ResidualSugar -8.924e-06  1.869e-04  -0.048   0.9619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6200.5  on 6815  degrees of freedom
## Residual deviance: 4840.6  on 6810  degrees of freedom
##   (3420 observations deleted due to missingness)
## AIC: 25131
## 
## Number of Fisher Scoring iterations: 5
negative_binomial <- MASS::glm.nb(data    = df_train_clean , 
                                 formula = TARGET ~  . 
                                  )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(negative_binomial)
## 
## Call:
## MASS::glm.nb(formula = TARGET ~ ., data = df_train_clean, init.theta = 117808.6724, 
##     link = log)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    7.671e-01  1.651e-02  46.451   <2e-16 ***
## pH            -5.105e-04  1.139e-02  -0.045   0.9643    
## AcidIndex     -2.043e-02  6.423e-02  -0.318   0.7504    
## STARS          2.504e-01  6.733e-03  37.183   <2e-16 ***
## Chlorides     -3.936e-02  1.994e-02  -1.973   0.0484 *  
## ResidualSugar -8.923e-06  1.869e-04  -0.048   0.9619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(117808.7) family taken to be 1)
## 
##     Null deviance: 6200.4  on 6815  degrees of freedom
## Residual deviance: 4840.5  on 6810  degrees of freedom
##   (3420 observations deleted due to missingness)
## AIC: 25133
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  117809 
##           Std. Err.:  188525 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -25118.88
# stargazer(poisson, negative_binomial, 
#           type = "text"
#          )
 
negative_binomial2 <- MASS::glm.nb(data    = df_train_clean[,c(1,3:6)] , 
                                  formula = TARGET ~  . 
                                  ) 
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

Fancier Models

Extensions

The stargazer package expects certain model classes. While negative_binomial2 inherits from glm and lm, the additional negbin class can confuse stargazer. Temporarily coercing the object to glm does not resolve the conflict, so I chnage the package.

class(poisson)
## [1] "glm" "lm"
class(negative_binomial)
## [1] "negbin" "glm"    "lm"
??modelsummary

modelsummary(list("Poisson" = poisson, "Negative Binomial" = negative_binomial2), 
             output = "html")
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
Poisson Negative Binomial
(Intercept) 0.767 0.767
(0.017) (0.016)
pH −0.001
(0.011)
AcidIndex −0.020 −0.021
(0.064) (0.064)
STARS 0.250 0.250
(0.007) (0.007)
Chlorides −0.039 −0.039
(0.020) (0.020)
ResidualSugar 0.000 0.000
(0.000) (0.000)
Num.Obs. 6816 6816
AIC 25130.8 25130.9
BIC 25171.7 25171.8
Log.Lik. −12559.385 −12559.444
F 277.774 347.206
RMSE 1.30 1.30
stargazer(summary(poisson)$coefficients,
          header = FALSE, 
          font.size = "tiny",
          summary = FALSE, 
          title = "Poisson regression model 1",
          type = "text"
          )
## 
## Poisson regression model 1
## =====================================================
##               Estimate Std. Error z value Pr(> | z| )
## -----------------------------------------------------
## (Intercept)    0.767     0.017    46.452       0     
## pH             -0.001    0.011    -0.045     0.964   
## AcidIndex      -0.020    0.064    -0.318     0.750   
## STARS          0.250     0.007    37.184       0     
## Chlorides      -0.039    0.020    -1.973     0.048   
## ResidualSugar -0.00001   0.0002   -0.048     0.962   
## -----------------------------------------------------

4 Predict on Evaluation data

  1. By using the round() function on the Poisson/negative binomial regression forecasts, you will obtain integer-like predictions that are more suitable for count data. These rounded predictions should be more aligned with your expectations of integer values.
?predict.glm

# Make predictions using the fitted model

forecasts_poisson <- 
predict.glm(object  = poisson, 
            newdata = df_eval,
            type    = "response"
            )

forecasts_negative_binomial <- 
predict.glm(object  = negative_binomial, 
            newdata = df_eval,
            type    = "response"
            )
forecasts_negative_binomial2 <- 
predict.glm(object  = negative_binomial2, 
            newdata = df_eval,
            type    = "response"
            )

summary(forecasts_poisson)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.075   2.391   3.017   3.130   3.813   5.380     880
summary(forecasts_negative_binomial)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.075   2.391   3.017   3.130   3.813   5.380     880
summary(forecasts_negative_binomial2)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.075   2.392   3.024   3.135   3.825   5.390     830
# Round the predictions to the nearest whole number
?round
## Help on topic 'round' was found in the following packages:
## 
##   Package               Library
##   timeDate              /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...
rounded_forecasts_poisson            <- round(forecasts_poisson)
rounded_forecasts_negative_binomial  <- round(forecasts_negative_binomial)
rounded_forecasts_negative_binomial2 <- round(forecasts_negative_binomial2)

5 Create a Confusion Matrix

5.1 Manual

table(df_eval$TARGET)
## 
##   0   1   2   3   4   5   6   7   8 
## 545  44 215 532 618 423 154  25   3
table(rounded_forecasts_poisson )
## rounded_forecasts_poisson
##   2   3   4   5 
## 541 655 383 100
table(rounded_forecasts_negative_binomial)
## rounded_forecasts_negative_binomial
##   2   3   4   5 
## 541 655 383 100
table(rounded_forecasts_negative_binomial2)
## rounded_forecasts_negative_binomial2
##   2   3   4   5 
## 554 677 393 105
table(df_eval$TARGET, rounded_forecasts_poisson)
##    rounded_forecasts_poisson
##       2   3   4   5
##   0 108  15   0   0
##   1   7   5   0   0
##   2  88  50   4   0
##   3 169 172  58   0
##   4 128 230 134  11
##   5  37 140 127  45
##   6   4  42  52  29
##   7   0   1   7  14
##   8   0   0   1   1

5.2 Caret Package

Please figure out the code.

?caret
## No documentation for 'caret' in specified packages and libraries:
## you could try '??caret'
confusionMatrix(data      = as.factor(rounded_forecasts_poisson), 
                reference = as.factor(df_eval$TARGET), 
                positive = "1"
                ) 
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(data = as.factor(rounded_forecasts_poisson),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8
##          0   0   0   0   0   0   0   0   0   0
##          1   0   0   0   0   0   0   0   0   0
##          2 108   7  88 169 128  37   4   0   0
##          3  15   5  50 172 230 140  42   1   0
##          4   0   0   4  58 134 127  52   7   1
##          5   0   0   0   0  11  45  29  14   1
##          6   0   0   0   0   0   0   0   0   0
##          7   0   0   0   0   0   0   0   0   0
##          8   0   0   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2615          
##                  95% CI : (0.2406, 0.2832)
##     No Information Rate : 0.2996          
##     P-Value [Acc > NIR] : 0.9997          
##                                           
##                   Kappa : 0.076           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.00000 0.000000  0.61972   0.4311  0.26640  0.12894
## Specificity           1.00000 1.000000  0.70527   0.6227  0.78827  0.95865
## Pos Pred Value            NaN      NaN  0.16266   0.2626  0.34987  0.45000
## Neg Pred Value        0.92674 0.992853  0.95255   0.7783  0.71528  0.80747
## Prevalence            0.07326 0.007147  0.08457   0.2376  0.29958  0.20786
## Detection Rate        0.00000 0.000000  0.05241   0.1024  0.07981  0.02680
## Detection Prevalence  0.00000 0.000000  0.32222   0.3901  0.22811  0.05956
## Balanced Accuracy     0.50000 0.500000  0.66249   0.5269  0.52733  0.54379
##                      Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000   0.0000 0.000000
## Specificity           1.00000   1.0000 1.000000
## Pos Pred Value            NaN      NaN      NaN
## Neg Pred Value        0.92436   0.9869 0.998809
## Prevalence            0.07564   0.0131 0.001191
## Detection Rate        0.00000   0.0000 0.000000
## Detection Prevalence  0.00000   0.0000 0.000000
## Balanced Accuracy     0.50000   0.5000 0.500000