# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 553566 29.6 1231488 65.8 NA 700257 37.4
## Vcells 1026324 7.9 8388608 64.0 32768 1963430 15.0
cat("\f") # Clear the console
# Set the seed for reproducibility
set.seed(123)
# Prepare needed libraries
packages <- c("psych",
"tidyverse",
"conflicted",
"MASS",
"caret",
"skimr",
"htmltools",
"stargazer",
"modelsummary"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: lattice
##
##
## Please cite as:
##
##
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
##
##
## `modelsummary` 2.0.0 now uses `tinytable` as its default table-drawing
## backend. Learn more at: https://vincentarelbundock.github.io/tinytable/
##
## Revert to `kableExtra` for one session:
##
## options(modelsummary_factory_default = 'kableExtra')
## options(modelsummary_factory_latex = 'kableExtra')
## options(modelsummary_factory_html = 'kableExtra')
##
## Silence this message forever:
##
## config_modelsummary(startup_message = FALSE)
rm(packages)
conflict_prefer(name = "describe", winner = "psych")
## [conflicted] Will prefer psych::describe over any other package.
conflict_prefer(name = "select" , winner = "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
# Set working directory and path to data
setwd("~/Library/CloudStorage/Dropbox/WCAS/Econometrics/")
# Load your data
df <- read.csv("wine-data.csv")
skim(df)
Name | df |
Number of rows | 12795 |
Number of columns | 16 |
_______________________ | |
Column type frequency: | |
numeric | 16 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
INDEX | 0 | 1.00 | 8069.98 | 4656.91 | 1.00 | 4037.50 | 8110.00 | 12106.50 | 16129.00 | ▇▇▇▇▇ |
TARGET | 0 | 1.00 | 3.03 | 1.93 | 0.00 | 2.00 | 3.00 | 4.00 | 8.00 | ▆▇▇▆▁ |
FixedAcidity | 0 | 1.00 | 7.08 | 6.32 | -18.10 | 5.20 | 6.90 | 9.50 | 34.40 | ▁▂▇▂▁ |
VolatileAcidity | 0 | 1.00 | 0.32 | 0.78 | -2.79 | 0.13 | 0.28 | 0.64 | 3.68 | ▁▂▇▂▁ |
CitricAcid | 0 | 1.00 | 0.31 | 0.86 | -3.24 | 0.03 | 0.31 | 0.58 | 3.86 | ▁▂▇▂▁ |
ResidualSugar | 616 | 0.95 | 5.42 | 33.75 | -127.80 | -2.00 | 3.90 | 15.90 | 141.15 | ▁▂▇▂▁ |
Chlorides | 638 | 0.95 | 0.05 | 0.32 | -1.17 | -0.03 | 0.05 | 0.15 | 1.35 | ▁▂▇▂▁ |
FreeSulfurDioxide | 647 | 0.95 | 30.85 | 148.71 | -555.00 | 0.00 | 30.00 | 70.00 | 623.00 | ▁▂▇▂▁ |
TotalSulfurDioxide | 682 | 0.95 | 120.71 | 231.91 | -823.00 | 27.00 | 123.00 | 208.00 | 1057.00 | ▁▂▇▂▁ |
Density | 0 | 1.00 | 0.99 | 0.03 | 0.89 | 0.99 | 0.99 | 1.00 | 1.10 | ▁▂▇▂▁ |
pH | 395 | 0.97 | 3.21 | 0.68 | 0.48 | 2.96 | 3.20 | 3.47 | 6.13 | ▁▂▇▂▁ |
Sulphates | 1210 | 0.91 | 0.53 | 0.93 | -3.13 | 0.28 | 0.50 | 0.86 | 4.24 | ▁▂▇▂▁ |
Alcohol | 653 | 0.95 | 10.49 | 3.73 | -4.70 | 9.00 | 10.40 | 12.40 | 26.50 | ▁▂▇▂▁ |
LabelAppeal | 0 | 1.00 | -0.01 | 0.89 | -2.00 | -1.00 | 0.00 | 1.00 | 2.00 | ▁▅▇▅▁ |
AcidIndex | 0 | 1.00 | 7.77 | 1.32 | 4.00 | 7.00 | 8.00 | 8.00 | 17.00 | ▁▇▁▁▁ |
STARS | 3359 | 0.74 | 2.04 | 0.90 | 1.00 | 1.00 | 2.00 | 3.00 | 4.00 | ▇▇▁▅▂ |
# Set the seed for reproducibility
set.seed(123)
# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df),
size = round(0.8 * nrow(df)
)
)
# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval <- df[-train_index, ]
str(df_train)
## 'data.frame': 10236 obs. of 16 variables:
## $ INDEX : int 3117 3172 13132 10988 15741 3772 2319 11765 4253 14680 ...
## $ TARGET : int 0 3 3 2 6 3 3 4 4 0 ...
## $ FixedAcidity : num 9.5 -2.4 9.9 6.7 11.9 9.1 5.7 5.5 5.6 6.5 ...
## $ VolatileAcidity : num -0.71 1.08 0.13 0.34 0.2 0.98 0.21 0.3 0.485 0.24 ...
## $ CitricAcid : num 0.78 0.25 -0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
## $ ResidualSugar : num -42.2 -10 2.5 6.4 18.7 ...
## $ Chlorides : num -0.022 -0.051 -0.403 -0.386 -0.169 0.036 0.03 0.029 0.051 -0.844 ...
## $ FreeSulfurDioxide : num 270 321 244 105 -58 44 277 28 193 26 ...
## $ TotalSulfurDioxide: num -326 138 NA 369 NA 161 122 97 115 83 ...
## $ Density : num 0.946 1 0.998 0.964 0.956 ...
## $ pH : num 3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
## $ Sulphates : num NA -1.05 1.3 0.52 1.24 0.41 0.52 0.44 0.38 0.38 ...
## $ Alcohol : num 9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 NA ...
## $ LabelAppeal : int 0 0 -2 -1 1 -1 -1 0 0 -1 ...
## $ AcidIndex : int 9 7 9 7 6 9 6 8 6 7 ...
## $ STARS : int 2 2 2 NA 2 2 2 2 3 NA ...
nrow(df_train)
## [1] 10236
str(df_eval)
## 'data.frame': 2559 obs. of 16 variables:
## $ INDEX : int 5 11 24 27 35 36 45 50 58 74 ...
## $ TARGET : int 3 4 5 2 6 4 5 4 0 4 ...
## $ FixedAcidity : num 5.7 6.5 10 6.5 8.3 8 14.8 3.1 6.6 6.2 ...
## $ VolatileAcidity : num 0.385 -1.22 0.23 0.24 -2.46 -0.01 1.17 1.65 0.24 0.16 ...
## $ CitricAcid : num 0.04 0.34 0.27 0.24 0.27 -1.1 -0.09 -2.75 2.32 0.26 ...
## $ ResidualSugar : num 18.8 1.4 14.1 -18.2 13.2 ...
## $ Chlorides : num -0.425 0.04 0.033 -0.182 0.036 -0.35 0.027 -0.002 -0.356 -0.204 ...
## $ FreeSulfurDioxide : num 22 523 -188 15 31 17 -166 54 242 208 ...
## $ TotalSulfurDioxide: num 115 551 229 60 NA 129 NA 126 234 220 ...
## $ Density : num 0.996 1.032 0.999 1.009 0.994 ...
## $ pH : num 2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 NA ...
## $ Sulphates : num 1.83 NA 0.88 -1.01 0.02 NA 0.33 NA 0.39 0.58 ...
## $ Alcohol : num 6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
## $ LabelAppeal : int -1 1 1 -1 1 0 0 0 -1 0 ...
## $ AcidIndex : int 6 7 11 7 7 9 8 8 7 7 ...
## $ STARS : int 1 3 2 1 4 2 2 1 1 2 ...
nrow(df_eval)
## [1] 2559
df_train_clean <- df_train
df_train_clean$pH <- replace(x = df_train$pH, list = -3, values = 0)
df_train_clean$pH[is.na(df_train$pH)] <- mean(df_train$pH, na.rm = TRUE)
df_train_clean$AcidIndex <- replace(df_train$AcidIndex, list = -3, values = 0)
df_train_clean$AcidIndex[is.na(df_train$AcidIndex)] <- mean(df_train$AcidIndex, na.rm = TRUE)
# select variables
df_train_clean <- dplyr::select(.data = df_train_clean,
TARGET, pH, AcidIndex, STARS, Chlorides, ResidualSugar)
spineMiss
package might be
worth a look at https://rdrr.io/cran/VIM/?glm # glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution
?stats::family
poisson <- glm(data = df_train_clean ,
formula = TARGET ~ . ,
family = poisson(link = "log")
)
summary(poisson)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson(link = "log"), data = df_train_clean)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.671e-01 1.651e-02 46.452 <2e-16 ***
## pH -5.104e-04 1.139e-02 -0.045 0.9643
## AcidIndex -2.043e-02 6.423e-02 -0.318 0.7504
## STARS 2.504e-01 6.733e-03 37.184 <2e-16 ***
## Chlorides -3.936e-02 1.994e-02 -1.973 0.0484 *
## ResidualSugar -8.924e-06 1.869e-04 -0.048 0.9619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6200.5 on 6815 degrees of freedom
## Residual deviance: 4840.6 on 6810 degrees of freedom
## (3420 observations deleted due to missingness)
## AIC: 25131
##
## Number of Fisher Scoring iterations: 5
negative_binomial <- MASS::glm.nb(data = df_train_clean ,
formula = TARGET ~ .
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(negative_binomial)
##
## Call:
## MASS::glm.nb(formula = TARGET ~ ., data = df_train_clean, init.theta = 117808.6724,
## link = log)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.671e-01 1.651e-02 46.451 <2e-16 ***
## pH -5.105e-04 1.139e-02 -0.045 0.9643
## AcidIndex -2.043e-02 6.423e-02 -0.318 0.7504
## STARS 2.504e-01 6.733e-03 37.183 <2e-16 ***
## Chlorides -3.936e-02 1.994e-02 -1.973 0.0484 *
## ResidualSugar -8.923e-06 1.869e-04 -0.048 0.9619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(117808.7) family taken to be 1)
##
## Null deviance: 6200.4 on 6815 degrees of freedom
## Residual deviance: 4840.5 on 6810 degrees of freedom
## (3420 observations deleted due to missingness)
## AIC: 25133
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 117809
## Std. Err.: 188525
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -25118.88
# stargazer(poisson, negative_binomial,
# type = "text"
# )
negative_binomial2 <- MASS::glm.nb(data = df_train_clean[,c(1,3:6)] ,
formula = TARGET ~ .
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
Fancier Models
The stargazer package expects certain model classes. While
negative_binomial2
inherits from glm
and
lm
, the additional negbin
class can confuse
stargazer
. Temporarily coercing the object to
glm
does not resolve the conflict, so I chnage the
package.
class(poisson)
## [1] "glm" "lm"
class(negative_binomial)
## [1] "negbin" "glm" "lm"
??modelsummary
modelsummary(list("Poisson" = poisson, "Negative Binomial" = negative_binomial2),
output = "html")
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
Poisson | Negative Binomial | |
---|---|---|
(Intercept) | 0.767 | 0.767 |
(0.017) | (0.016) | |
pH | −0.001 | |
(0.011) | ||
AcidIndex | −0.020 | −0.021 |
(0.064) | (0.064) | |
STARS | 0.250 | 0.250 |
(0.007) | (0.007) | |
Chlorides | −0.039 | −0.039 |
(0.020) | (0.020) | |
ResidualSugar | 0.000 | 0.000 |
(0.000) | (0.000) | |
Num.Obs. | 6816 | 6816 |
AIC | 25130.8 | 25130.9 |
BIC | 25171.7 | 25171.8 |
Log.Lik. | −12559.385 | −12559.444 |
F | 277.774 | 347.206 |
RMSE | 1.30 | 1.30 |
stargazer(summary(poisson)$coefficients,
header = FALSE,
font.size = "tiny",
summary = FALSE,
title = "Poisson regression model 1",
type = "text"
)
##
## Poisson regression model 1
## =====================================================
## Estimate Std. Error z value Pr(> | z| )
## -----------------------------------------------------
## (Intercept) 0.767 0.017 46.452 0
## pH -0.001 0.011 -0.045 0.964
## AcidIndex -0.020 0.064 -0.318 0.750
## STARS 0.250 0.007 37.184 0
## Chlorides -0.039 0.020 -1.973 0.048
## ResidualSugar -0.00001 0.0002 -0.048 0.962
## -----------------------------------------------------
round()
function on the
Poisson/negative binomial regression forecasts, you will obtain
integer-like predictions that are more suitable for count data. These
rounded predictions should be more aligned with your expectations of
integer values.?predict.glm
# Make predictions using the fitted model
forecasts_poisson <-
predict.glm(object = poisson,
newdata = df_eval,
type = "response"
)
forecasts_negative_binomial <-
predict.glm(object = negative_binomial,
newdata = df_eval,
type = "response"
)
forecasts_negative_binomial2 <-
predict.glm(object = negative_binomial2,
newdata = df_eval,
type = "response"
)
summary(forecasts_poisson)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.391 3.017 3.130 3.813 5.380 880
summary(forecasts_negative_binomial)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.391 3.017 3.130 3.813 5.380 880
summary(forecasts_negative_binomial2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.392 3.024 3.135 3.825 5.390 830
# Round the predictions to the nearest whole number
?round
## Help on topic 'round' was found in the following packages:
##
## Package Library
## timeDate /Library/Frameworks/R.framework/Versions/4.4-arm64/Resources/library
## base /Library/Frameworks/R.framework/Resources/library
##
##
## Using the first match ...
rounded_forecasts_poisson <- round(forecasts_poisson)
rounded_forecasts_negative_binomial <- round(forecasts_negative_binomial)
rounded_forecasts_negative_binomial2 <- round(forecasts_negative_binomial2)
table(df_eval$TARGET)
##
## 0 1 2 3 4 5 6 7 8
## 545 44 215 532 618 423 154 25 3
table(rounded_forecasts_poisson )
## rounded_forecasts_poisson
## 2 3 4 5
## 541 655 383 100
table(rounded_forecasts_negative_binomial)
## rounded_forecasts_negative_binomial
## 2 3 4 5
## 541 655 383 100
table(rounded_forecasts_negative_binomial2)
## rounded_forecasts_negative_binomial2
## 2 3 4 5
## 554 677 393 105
table(df_eval$TARGET, rounded_forecasts_poisson)
## rounded_forecasts_poisson
## 2 3 4 5
## 0 108 15 0 0
## 1 7 5 0 0
## 2 88 50 4 0
## 3 169 172 58 0
## 4 128 230 134 11
## 5 37 140 127 45
## 6 4 42 52 29
## 7 0 1 7 14
## 8 0 0 1 1
Please figure out the code.
?caret
## No documentation for 'caret' in specified packages and libraries:
## you could try '??caret'
confusionMatrix(data = as.factor(rounded_forecasts_poisson),
reference = as.factor(df_eval$TARGET),
positive = "1"
)
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(data = as.factor(rounded_forecasts_poisson),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8
## 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0
## 2 108 7 88 169 128 37 4 0 0
## 3 15 5 50 172 230 140 42 1 0
## 4 0 0 4 58 134 127 52 7 1
## 5 0 0 0 0 11 45 29 14 1
## 6 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.2615
## 95% CI : (0.2406, 0.2832)
## No Information Rate : 0.2996
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.076
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.00000 0.000000 0.61972 0.4311 0.26640 0.12894
## Specificity 1.00000 1.000000 0.70527 0.6227 0.78827 0.95865
## Pos Pred Value NaN NaN 0.16266 0.2626 0.34987 0.45000
## Neg Pred Value 0.92674 0.992853 0.95255 0.7783 0.71528 0.80747
## Prevalence 0.07326 0.007147 0.08457 0.2376 0.29958 0.20786
## Detection Rate 0.00000 0.000000 0.05241 0.1024 0.07981 0.02680
## Detection Prevalence 0.00000 0.000000 0.32222 0.3901 0.22811 0.05956
## Balanced Accuracy 0.50000 0.500000 0.66249 0.5269 0.52733 0.54379
## Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.0000 0.000000
## Specificity 1.00000 1.0000 1.000000
## Pos Pred Value NaN NaN NaN
## Neg Pred Value 0.92436 0.9869 0.998809
## Prevalence 0.07564 0.0131 0.001191
## Detection Rate 0.00000 0.0000 0.000000
## Detection Prevalence 0.00000 0.0000 0.000000
## Balanced Accuracy 0.50000 0.5000 0.500000