1 Setup

1.0.1 Setup - empty variables and functions in the environment tab/window

# Clear the workspace

rm(list = ls()) # Clear environment
gc()            # Clear unused memory

##           used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells  553566 29.6    1231488 65.8         NA   700257 37.4
## Vcells 1026324  7.9    8388608 64.0      32768  1963430 15.0

cat("\f")       # Clear the console

1.0.2 Setup - load packages

# Set the seed for reproducibility
set.seed(123)

# Prepare needed libraries
packages <- c("psych",
              "tidyverse", 
              "conflicted",
              "MASS",
              "caret",
              "skimr",
              "htmltools",
              "stargazer",
              "modelsummary"
              )


for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
                     )
  }
  library(packages[i], character.only = TRUE)
}

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: lattice
## 
## 
## Please cite as: 
## 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## 
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer 
## 
## 
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
##   backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
## 
## Revert to `kableExtra` for one session:
## 
##   options(modelsummary_factory_default = 'kableExtra')
##   options(modelsummary_factory_latex = 'kableExtra')
##   options(modelsummary_factory_html = 'kableExtra')
## 
## Silence this message forever:
## 
##   config_modelsummary(startup_message = FALSE)

rm(packages)

conflict_prefer(name = "describe", winner = "psych")

## [conflicted] Will prefer psych::describe over any other package.

conflict_prefer(name = "select"  , winner = "dplyr")

## [conflicted] Will prefer dplyr::select over any other package.

1.0.3 Setup - load raw data

# Set working directory and path to data
setwd("~/Library/CloudStorage/Dropbox/WCAS/Econometrics/")
  
# Load your data
df    <- read.csv("wine-data.csv")

skim(df)

Data summary
Name	df
Number of rows	12795
Number of columns	16
_______________________
Column type frequency:
numeric	16
________________________
Group variables	None

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
INDEX	0	1.00	8069.98	4656.91	1.00	4037.50	8110.00	12106.50	16129.00	▇▇▇▇▇
TARGET	0	1.00	3.03	1.93	0.00	2.00	3.00	4.00	8.00	▆▇▇▆▁
FixedAcidity	0	1.00	7.08	6.32	-18.10	5.20	6.90	9.50	34.40	▁▂▇▂▁
VolatileAcidity	0	1.00	0.32	0.78	-2.79	0.13	0.28	0.64	3.68	▁▂▇▂▁
CitricAcid	0	1.00	0.31	0.86	-3.24	0.03	0.31	0.58	3.86	▁▂▇▂▁
ResidualSugar	616	0.95	5.42	33.75	-127.80	-2.00	3.90	15.90	141.15	▁▂▇▂▁
Chlorides	638	0.95	0.05	0.32	-1.17	-0.03	0.05	0.15	1.35	▁▂▇▂▁
FreeSulfurDioxide	647	0.95	30.85	148.71	-555.00	0.00	30.00	70.00	623.00	▁▂▇▂▁
TotalSulfurDioxide	682	0.95	120.71	231.91	-823.00	27.00	123.00	208.00	1057.00	▁▂▇▂▁
Density	0	1.00	0.99	0.03	0.89	0.99	0.99	1.00	1.10	▁▂▇▂▁
pH	395	0.97	3.21	0.68	0.48	2.96	3.20	3.47	6.13	▁▂▇▂▁
Sulphates	1210	0.91	0.53	0.93	-3.13	0.28	0.50	0.86	4.24	▁▂▇▂▁
Alcohol	653	0.95	10.49	3.73	-4.70	9.00	10.40	12.40	26.50	▁▂▇▂▁
LabelAppeal	0	1.00	-0.01	0.89	-2.00	-1.00	0.00	1.00	2.00	▁▅▇▅▁
AcidIndex	0	1.00	7.77	1.32	4.00	7.00	8.00	8.00	17.00	▁▇▁▁▁
STARS	3359	0.74	2.04	0.90	1.00	1.00	2.00	3.00	4.00	▇▇▁▅▂

1.1 Setup - split raw data into testing and training data

# Set the seed for reproducibility
set.seed(123)

# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df), 
                   size = round(0.8 * nrow(df)
                                   )
                      )

# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval  <- df[-train_index, ]

1.1.1 Testing Data

str(df_train)

## 'data.frame':    10236 obs. of  16 variables:
##  $ INDEX             : int  3117 3172 13132 10988 15741 3772 2319 11765 4253 14680 ...
##  $ TARGET            : int  0 3 3 2 6 3 3 4 4 0 ...
##  $ FixedAcidity      : num  9.5 -2.4 9.9 6.7 11.9 9.1 5.7 5.5 5.6 6.5 ...
##  $ VolatileAcidity   : num  -0.71 1.08 0.13 0.34 0.2 0.98 0.21 0.3 0.485 0.24 ...
##  $ CitricAcid        : num  0.78 0.25 -0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
##  $ ResidualSugar     : num  -42.2 -10 2.5 6.4 18.7 ...
##  $ Chlorides         : num  -0.022 -0.051 -0.403 -0.386 -0.169 0.036 0.03 0.029 0.051 -0.844 ...
##  $ FreeSulfurDioxide : num  270 321 244 105 -58 44 277 28 193 26 ...
##  $ TotalSulfurDioxide: num  -326 138 NA 369 NA 161 122 97 115 83 ...
##  $ Density           : num  0.946 1 0.998 0.964 0.956 ...
##  $ pH                : num  3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
##  $ Sulphates         : num  NA -1.05 1.3 0.52 1.24 0.41 0.52 0.44 0.38 0.38 ...
##  $ Alcohol           : num  9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 NA ...
##  $ LabelAppeal       : int  0 0 -2 -1 1 -1 -1 0 0 -1 ...
##  $ AcidIndex         : int  9 7 9 7 6 9 6 8 6 7 ...
##  $ STARS             : int  2 2 2 NA 2 2 2 2 3 NA ...

nrow(df_train)

## [1] 10236

1.1.2 Training Data

str(df_eval)

## 'data.frame':    2559 obs. of  16 variables:
##  $ INDEX             : int  5 11 24 27 35 36 45 50 58 74 ...
##  $ TARGET            : int  3 4 5 2 6 4 5 4 0 4 ...
##  $ FixedAcidity      : num  5.7 6.5 10 6.5 8.3 8 14.8 3.1 6.6 6.2 ...
##  $ VolatileAcidity   : num  0.385 -1.22 0.23 0.24 -2.46 -0.01 1.17 1.65 0.24 0.16 ...
##  $ CitricAcid        : num  0.04 0.34 0.27 0.24 0.27 -1.1 -0.09 -2.75 2.32 0.26 ...
##  $ ResidualSugar     : num  18.8 1.4 14.1 -18.2 13.2 ...
##  $ Chlorides         : num  -0.425 0.04 0.033 -0.182 0.036 -0.35 0.027 -0.002 -0.356 -0.204 ...
##  $ FreeSulfurDioxide : num  22 523 -188 15 31 17 -166 54 242 208 ...
##  $ TotalSulfurDioxide: num  115 551 229 60 NA 129 NA 126 234 220 ...
##  $ Density           : num  0.996 1.032 0.999 1.009 0.994 ...
##  $ pH                : num  2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 NA ...
##  $ Sulphates         : num  1.83 NA 0.88 -1.01 0.02 NA 0.33 NA 0.39 0.58 ...
##  $ Alcohol           : num  6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
##  $ LabelAppeal       : int  -1 1 1 -1 1 0 0 0 -1 0 ...
##  $ AcidIndex         : int  6 7 11 7 7 9 8 8 7 7 ...
##  $ STARS             : int  1 3 2 1 4 2 2 1 1 2 ...

nrow(df_eval)

## [1] 2559

2 Clean and Model on Testing data

2.1 Clean

df_train_clean <- df_train 
df_train_clean$pH                     <- replace(x = df_train$pH, list = -3, values = 0)
df_train_clean$pH[is.na(df_train$pH)] <- mean(df_train$pH, na.rm = TRUE)

df_train_clean$AcidIndex              <- replace(df_train$AcidIndex, list = -3, values =  0)
df_train_clean$AcidIndex[is.na(df_train$AcidIndex)] <- mean(df_train$AcidIndex, na.rm = TRUE)

# select variables
df_train_clean <- dplyr::select(.data = df_train_clean, 
                         TARGET, pH, AcidIndex, STARS, Chlorides, ResidualSugar)

spineMiss package might be worth a look at https://rdrr.io/cran/VIM/

3 Model

Ultimately, the choice between using a negative binomial or Poisson model depends on the characteristics of your data and the goals of your analysis. If overdispersion is a concern in your data, and you want to accurately model count data, the negative binomial model is usually a better choice, even if the predictions are not strictly integers. So have a look at your y variable.

?glm # glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution

?stats::family

poisson <- glm(data    = df_train_clean , 
              formula  = TARGET ~  . , 
              family   = poisson(link = "log")
             )

summary(poisson)

## 
## Call:
## glm(formula = TARGET ~ ., family = poisson(link = "log"), data = df_train_clean)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    7.671e-01  1.651e-02  46.452   <2e-16 ***
## pH            -5.104e-04  1.139e-02  -0.045   0.9643    
## AcidIndex     -2.043e-02  6.423e-02  -0.318   0.7504    
## STARS          2.504e-01  6.733e-03  37.184   <2e-16 ***
## Chlorides     -3.936e-02  1.994e-02  -1.973   0.0484 *  
## ResidualSugar -8.924e-06  1.869e-04  -0.048   0.9619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6200.5  on 6815  degrees of freedom
## Residual deviance: 4840.6  on 6810  degrees of freedom
##   (3420 observations deleted due to missingness)
## AIC: 25131
## 
## Number of Fisher Scoring iterations: 5

negative_binomial <- MASS::glm.nb(data    = df_train_clean , 
                                 formula = TARGET ~  . 
                                  )

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(negative_binomial)

## 
## Call:
## MASS::glm.nb(formula = TARGET ~ ., data = df_train_clean, init.theta = 117808.6724, 
##     link = log)
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    7.671e-01  1.651e-02  46.451   <2e-16 ***
## pH            -5.105e-04  1.139e-02  -0.045   0.9643    
## AcidIndex     -2.043e-02  6.423e-02  -0.318   0.7504    
## STARS          2.504e-01  6.733e-03  37.183   <2e-16 ***
## Chlorides     -3.936e-02  1.994e-02  -1.973   0.0484 *  
## ResidualSugar -8.923e-06  1.869e-04  -0.048   0.9619    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(117808.7) family taken to be 1)
## 
##     Null deviance: 6200.4  on 6815  degrees of freedom
## Residual deviance: 4840.5  on 6810  degrees of freedom
##   (3420 observations deleted due to missingness)
## AIC: 25133
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  117809 
##           Std. Err.:  188525 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -25118.88

# stargazer(poisson, negative_binomial, 
#           type = "text"
#          )
 
negative_binomial2 <- MASS::glm.nb(data    = df_train_clean[,c(1,3:6)] , 
                                  formula = TARGET ~  . 
                                  )

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

Fancier Models

Extensions

The stargazer package expects certain model classes. While negative_binomial2 inherits from glm and lm, the additional negbin class can confuse stargazer. Temporarily coercing the object to glm does not resolve the conflict, so I chnage the package.

class(poisson)

## [1] "glm" "lm"

class(negative_binomial)

## [1] "negbin" "glm"    "lm"

??modelsummary

modelsummary(list("Poisson" = poisson, "Negative Binomial" = negative_binomial2), 
             output = "html")

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

	Poisson	Negative Binomial
(Intercept)	0.767	0.767
	(0.017)	(0.016)
pH	−0.001
	(0.011)
AcidIndex	−0.020	−0.021
	(0.064)	(0.064)
STARS	0.250	0.250
	(0.007)	(0.007)
Chlorides	−0.039	−0.039
	(0.020)	(0.020)
ResidualSugar	0.000	0.000
	(0.000)	(0.000)
Num.Obs.	6816	6816
AIC	25130.8	25130.9
BIC	25171.7	25171.8
Log.Lik.	−12559.385	−12559.444
F	277.774	347.206
RMSE	1.30	1.30

stargazer(summary(poisson)$coefficients,
          header = FALSE, 
          font.size = "tiny",
          summary = FALSE, 
          title = "Poisson regression model 1",
          type = "text"
          )

## 
## Poisson regression model 1
## =====================================================
##               Estimate Std. Error z value Pr(> | z| )
## -----------------------------------------------------
## (Intercept)    0.767     0.017    46.452       0     
## pH             -0.001    0.011    -0.045     0.964   
## AcidIndex      -0.020    0.064    -0.318     0.750   
## STARS          0.250     0.007    37.184       0     
## Chlorides      -0.039    0.020    -1.973     0.048   
## ResidualSugar -0.00001   0.0002   -0.048     0.962   
## -----------------------------------------------------

4 Predict on Evaluation data

By using the round() function on the Poisson/negative binomial regression forecasts, you will obtain integer-like predictions that are more suitable for count data. These rounded predictions should be more aligned with your expectations of integer values.

?predict.glm

# Make predictions using the fitted model

forecasts_poisson <- 
predict.glm(object  = poisson, 
            newdata = df_eval,
            type    = "response"
            )

forecasts_negative_binomial <- 
predict.glm(object  = negative_binomial, 
            newdata = df_eval,
            type    = "response"
            )
forecasts_negative_binomial2 <- 
predict.glm(object  = negative_binomial2, 
            newdata = df_eval,
            type    = "response"
            )

summary(forecasts_poisson)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.075   2.391   3.017   3.130   3.813   5.380     880

summary(forecasts_negative_binomial)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.075   2.391   3.017   3.130   3.813   5.380     880

summary(forecasts_negative_binomial2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   2.075   2.392   3.024   3.135   3.825   5.390     830

# Round the predictions to the nearest whole number
?round

## Help on topic 'round' was found in the following packages:
## 
##   Package               Library
##   timeDate              /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
##   base                  /Library/Frameworks/R.framework/Resources/library
## 
## 
## Using the first match ...

rounded_forecasts_poisson            <- round(forecasts_poisson)
rounded_forecasts_negative_binomial  <- round(forecasts_negative_binomial)
rounded_forecasts_negative_binomial2 <- round(forecasts_negative_binomial2)

5 Create a Confusion Matrix

5.1 Manual

table(df_eval$TARGET)

## 
##   0   1   2   3   4   5   6   7   8 
## 545  44 215 532 618 423 154  25   3

table(rounded_forecasts_poisson )

## rounded_forecasts_poisson
##   2   3   4   5 
## 541 655 383 100

table(rounded_forecasts_negative_binomial)

## rounded_forecasts_negative_binomial
##   2   3   4   5 
## 541 655 383 100

table(rounded_forecasts_negative_binomial2)

## rounded_forecasts_negative_binomial2
##   2   3   4   5 
## 554 677 393 105

table(df_eval$TARGET, rounded_forecasts_poisson)

##    rounded_forecasts_poisson
##       2   3   4   5
##   0 108  15   0   0
##   1   7   5   0   0
##   2  88  50   4   0
##   3 169 172  58   0
##   4 128 230 134  11
##   5  37 140 127  45
##   6   4  42  52  29
##   7   0   1   7  14
##   8   0   0   1   1

5.2 Caret Package

Please figure out the code.

?caret

## No documentation for 'caret' in specified packages and libraries:
## you could try '??caret'

confusionMatrix(data      = as.factor(rounded_forecasts_poisson), 
                reference = as.factor(df_eval$TARGET), 
                positive = "1"
                )

## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length

## Warning in confusionMatrix.default(data = as.factor(rounded_forecasts_poisson),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1   2   3   4   5   6   7   8
##          0   0   0   0   0   0   0   0   0   0
##          1   0   0   0   0   0   0   0   0   0
##          2 108   7  88 169 128  37   4   0   0
##          3  15   5  50 172 230 140  42   1   0
##          4   0   0   4  58 134 127  52   7   1
##          5   0   0   0   0  11  45  29  14   1
##          6   0   0   0   0   0   0   0   0   0
##          7   0   0   0   0   0   0   0   0   0
##          8   0   0   0   0   0   0   0   0   0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2615          
##                  95% CI : (0.2406, 0.2832)
##     No Information Rate : 0.2996          
##     P-Value [Acc > NIR] : 0.9997          
##                                           
##                   Kappa : 0.076           
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity           0.00000 0.000000  0.61972   0.4311  0.26640  0.12894
## Specificity           1.00000 1.000000  0.70527   0.6227  0.78827  0.95865
## Pos Pred Value            NaN      NaN  0.16266   0.2626  0.34987  0.45000
## Neg Pred Value        0.92674 0.992853  0.95255   0.7783  0.71528  0.80747
## Prevalence            0.07326 0.007147  0.08457   0.2376  0.29958  0.20786
## Detection Rate        0.00000 0.000000  0.05241   0.1024  0.07981  0.02680
## Detection Prevalence  0.00000 0.000000  0.32222   0.3901  0.22811  0.05956
## Balanced Accuracy     0.50000 0.500000  0.66249   0.5269  0.52733  0.54379
##                      Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000   0.0000 0.000000
## Specificity           1.00000   1.0000 1.000000
## Pos Pred Value            NaN      NaN      NaN
## Neg Pred Value        0.92436   0.9869 0.998809
## Prevalence            0.07564   0.0131 0.001191
## Detection Rate        0.00000   0.0000 0.000000
## Detection Prevalence  0.00000   0.0000 0.000000
## Balanced Accuracy     0.50000   0.5000 0.500000

Homework 3

Arvind Sharma

2024-12-20