# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 531196 28.4 1183070 63.2 NA 669265 35.8
## Vcells 974736 7.5 8388608 64.0 16384 1840444 14.1
cat("\f") # Clear the console
# Set the seed for reproducibility
set.seed(123)
# Prepare needed libraries
packages <- c("psych",
"tidyverse",
"conflicted",
"MASS",
"caret"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## Warning: package 'psych' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'MASS' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
rm(packages)
conflict_prefer(name = "describe", winner = "psych")
## [conflicted] Will prefer psych::describe over any other package.
conflict_prefer(name = "select" , winner = "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
# Set working directory and path to data
setwd("/Users/arvindsharma/Dropbox/WCAS/Econometrics/")
# Load your data
df <- read.csv("wine-data.csv")
# Set the seed for reproducibility
set.seed(123)
# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df),
                      size = round(0.8 * nrow(df)
                                   )
                      )
# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval <- df[-train_index, ]
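# A quick sanity check (a sketch, not part of the original workflow): the two
# sets should partition the rows of df with no overlap.
stopifnot(nrow(df_train) + nrow(df_eval) == nrow(df))
stopifnot(length(intersect(rownames(df_train), rownames(df_eval))) == 0)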
str(df_train)
## 'data.frame': 10236 obs. of 16 variables:
## $ INDEX : int 3117 3172 13132 10988 15741 3772 2319 11765 4253 14680 ...
## $ TARGET : int 0 3 3 2 6 3 3 4 4 0 ...
## $ FixedAcidity : num 9.5 -2.4 9.9 6.7 11.9 9.1 5.7 5.5 5.6 6.5 ...
## $ VolatileAcidity : num -0.71 1.08 0.13 0.34 0.2 0.98 0.21 0.3 0.485 0.24 ...
## $ CitricAcid : num 0.78 0.25 -0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
## $ ResidualSugar : num -42.2 -10 2.5 6.4 18.7 ...
## $ Chlorides : num -0.022 -0.051 -0.403 -0.386 -0.169 0.036 0.03 0.029 0.051 -0.844 ...
## $ FreeSulfurDioxide : num 270 321 244 105 -58 44 277 28 193 26 ...
## $ TotalSulfurDioxide: num -326 138 NA 369 NA 161 122 97 115 83 ...
## $ Density : num 0.946 1 0.998 0.964 0.956 ...
## $ pH : num 3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
## $ Sulphates : num NA -1.05 1.3 0.52 1.24 0.41 0.52 0.44 0.38 0.38 ...
## $ Alcohol : num 9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 NA ...
## $ LabelAppeal : int 0 0 -2 -1 1 -1 -1 0 0 -1 ...
## $ AcidIndex : int 9 7 9 7 6 9 6 8 6 7 ...
## $ STARS : int 2 2 2 NA 2 2 2 2 3 NA ...
nrow(df_train)
## [1] 10236
str(df_eval)
## 'data.frame': 2559 obs. of 16 variables:
## $ INDEX : int 5 11 24 27 35 36 45 50 58 74 ...
## $ TARGET : int 3 4 5 2 6 4 5 4 0 4 ...
## $ FixedAcidity : num 5.7 6.5 10 6.5 8.3 8 14.8 3.1 6.6 6.2 ...
## $ VolatileAcidity : num 0.385 -1.22 0.23 0.24 -2.46 -0.01 1.17 1.65 0.24 0.16 ...
## $ CitricAcid : num 0.04 0.34 0.27 0.24 0.27 -1.1 -0.09 -2.75 2.32 0.26 ...
## $ ResidualSugar : num 18.8 1.4 14.1 -18.2 13.2 ...
## $ Chlorides : num -0.425 0.04 0.033 -0.182 0.036 -0.35 0.027 -0.002 -0.356 -0.204 ...
## $ FreeSulfurDioxide : num 22 523 -188 15 31 17 -166 54 242 208 ...
## $ TotalSulfurDioxide: num 115 551 229 60 NA 129 NA 126 234 220 ...
## $ Density : num 0.996 1.032 0.999 1.009 0.994 ...
## $ pH : num 2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 NA ...
## $ Sulphates : num 1.83 NA 0.88 -1.01 0.02 NA 0.33 NA 0.39 0.58 ...
## $ Alcohol : num 6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
## $ LabelAppeal : int -1 1 1 -1 1 0 0 0 -1 0 ...
## $ AcidIndex : int 6 7 11 7 7 9 8 8 7 7 ...
## $ STARS : int 1 3 2 1 4 2 2 1 1 2 ...
nrow(df_eval)
## [1] 2559
# Copy the training set before cleaning
df_train_clean <- df_train
# Note: replace() treats `list` as positional indices, so `list = -3` addresses
# every element except the third, not values equal to -3.
df_train_clean$pH <- replace(x = df_train$pH, list = -3, values = 0)
# Mean-impute the pH values that were missing in the raw training data
df_train_clean$pH[is.na(df_train$pH)] <- mean(df_train$pH, na.rm = TRUE)
df_train_clean$AcidIndex <- replace(df_train$AcidIndex, list = -3, values = 0)
# Mean-impute missing AcidIndex the same way
df_train_clean$AcidIndex[is.na(df_train$AcidIndex)] <- mean(df_train$AcidIndex, na.rm = TRUE)
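# An aside (sketch, not applied to df_train_clean above): if the intent were to
# recode a specific sentinel value (say -3, assumed here) rather than positions,
# a value-based replacement would look like this.
ph_recoded <- replace(df_train$pH, list = which(df_train$pH == -3), values = 0)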
# select variables
df_train_clean <- dplyr::select(.data = df_train_clean,
                                TARGET, pH, AcidIndex, STARS, Chlorides, ResidualSugar)
?glm # glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution
?stats::family
poisson <- glm(data = df_train_clean ,
               formula = TARGET ~ . ,
               family = poisson(link = "log")
               )
summary(poisson)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson(link = "log"), data = df_train_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6898 -0.3163 0.1429 0.2787 1.7299
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.671e-01 1.651e-02 46.452 <2e-16 ***
## pH -5.104e-04 1.139e-02 -0.045 0.9643
## AcidIndex -2.043e-02 6.423e-02 -0.318 0.7504
## STARS 2.504e-01 6.733e-03 37.184 <2e-16 ***
## Chlorides -3.936e-02 1.994e-02 -1.973 0.0484 *
## ResidualSugar -8.924e-06 1.869e-04 -0.048 0.9619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6200.5 on 6815 degrees of freedom
## Residual deviance: 4840.6 on 6810 degrees of freedom
## (3420 observations deleted due to missingness)
## AIC: 25131
##
## Number of Fisher Scoring iterations: 5
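# A rough overdispersion check before moving to the negative binomial (a
# sketch): residual deviance relative to its degrees of freedom; values well
# above 1 would hint at overdispersion.
deviance(poisson) / df.residual(poisson) # roughly 4840.6 / 6810 from the summary above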
negative_binomial <- MASS::glm.nb(data = df_train_clean ,
                                  formula = TARGET ~ .
                                  )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(negative_binomial)
##
## Call:
## MASS::glm.nb(formula = TARGET ~ ., data = df_train_clean, init.theta = 117808.7193,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6897 -0.3163 0.1429 0.2787 1.7299
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.671e-01 1.651e-02 46.451 <2e-16 ***
## pH -5.105e-04 1.139e-02 -0.045 0.9643
## AcidIndex -2.043e-02 6.423e-02 -0.318 0.7504
## STARS 2.504e-01 6.733e-03 37.183 <2e-16 ***
## Chlorides -3.936e-02 1.994e-02 -1.973 0.0484 *
## ResidualSugar -8.923e-06 1.869e-04 -0.048 0.9619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(117808.7) family taken to be 1)
##
## Null deviance: 6200.4 on 6815 degrees of freedom
## Residual deviance: 4840.5 on 6810 degrees of freedom
## (3420 observations deleted due to missingness)
## AIC: 25133
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 117809
## Std. Err.: 188524
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -25118.88
# stargazer(poisson, negative_binomial,
# type = "text"
# )
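# Since theta is huge (about 117809), the negative binomial is essentially
# collapsing toward the Poisson, i.e. there is little overdispersion; a quick
# AIC comparison (sketch) makes the same point.
AIC(poisson, negative_binomial)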
# Refit the negative binomial without pH (column 2 of df_train_clean)
negative_binomial2 <- MASS::glm.nb(data = df_train_clean[,c(1,3:6)] ,
                                   formula = TARGET ~ .
                                   )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
Fancier Models

If you apply the round() function to the Poisson/negative binomial regression
forecasts, you will obtain integer-valued predictions that are more suitable
for count data. These rounded predictions should be better aligned with your
expectation of integer counts.
?predict.glm
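Because these models use a log link, a prediction on the link scale has to be
exponentiated to land on the count scale; type = "response" does that for you.
A small sketch using the poisson fit above:
exp(head(predict(poisson, newdata = df_eval, type = "link")))
head(predict(poisson, newdata = df_eval, type = "response"))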
# Make predictions using the fitted model
forecasts_poisson <-
  predict.glm(object = poisson,
              newdata = df_eval,
              type = "response"
              )
forecasts_negative_binomial <-
  predict.glm(object = negative_binomial,
              newdata = df_eval,
              type = "response"
              )
forecasts_negative_binomial2 <-
  predict.glm(object = negative_binomial2,
              newdata = df_eval,
              type = "response"
              )
summary(forecasts_poisson)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.391 3.017 3.130 3.813 5.380 880
summary(forecasts_negative_binomial)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.391 3.017 3.130 3.813 5.380 880
summary(forecasts_negative_binomial2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.392 3.024 3.135 3.825 5.390 830
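# The NA's in the forecasts come from evaluation rows with missing values in
# the predictors (df_eval was not imputed, so predict() passes those rows
# through as NA); a quick count for the five predictors used (sketch), which
# should line up with the NA counts reported above:
sum(!complete.cases(df_eval[, c("pH", "AcidIndex", "STARS", "Chlorides", "ResidualSugar")]))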
# Round the predictions to the nearest whole number
?round
## Help on topic 'round' was found in the following packages:
##
## Package Library
## base /Library/Frameworks/R.framework/Resources/library
## timeDate /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##
##
## Using the first match ...
rounded_forecasts_poisson <- round(forecasts_poisson)
rounded_forecasts_negative_binomial <- round(forecasts_negative_binomial)
rounded_forecasts_negative_binomial2 <- round(forecasts_negative_binomial2)
table(df_eval$TARGET)
##
## 0 1 2 3 4 5 6 7 8
## 545 44 215 532 618 423 154 25 3
table(rounded_forecasts_poisson )
## rounded_forecasts_poisson
## 2 3 4 5
## 541 655 383 100
table(rounded_forecasts_negative_binomial)
## rounded_forecasts_negative_binomial
## 2 3 4 5
## 541 655 383 100
table(rounded_forecasts_negative_binomial2)
## rounded_forecasts_negative_binomial2
## 2 3 4 5
## 554 677 393 105
table(df_eval$TARGET, rounded_forecasts_poisson)
## rounded_forecasts_poisson
## 2 3 4 5
## 0 108 15 0 0
## 1 7 5 0 0
## 2 88 50 4 0
## 3 169 172 58 0
## 4 128 230 134 11
## 5 37 140 127 45
## 6 4 42 52 29
## 7 0 1 7 14
## 8 0 0 1 1
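# Share of exact matches among rows with a non-missing forecast (a sketch;
# a cruder summary than the confusion matrix below):
mean(df_eval$TARGET == rounded_forecasts_poisson, na.rm = TRUE)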
Please work through the code below and figure out what it is doing.
?caret
## No documentation for 'caret' in specified packages and libraries:
## you could try '??caret'
confusionMatrix(data = as.factor(rounded_forecasts_poisson),
                reference = as.factor(df_eval$TARGET),
                positive = "1"
                )
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(data = as.factor(rounded_forecasts_poisson),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8
## 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0
## 2 108 7 88 169 128 37 4 0 0
## 3 15 5 50 172 230 140 42 1 0
## 4 0 0 4 58 134 127 52 7 1
## 5 0 0 0 0 11 45 29 14 1
## 6 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.2615
## 95% CI : (0.2406, 0.2832)
## No Information Rate : 0.2996
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.076
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.00000 0.000000 0.61972 0.4311 0.26640 0.12894
## Specificity 1.00000 1.000000 0.70527 0.6227 0.78827 0.95865
## Pos Pred Value NaN NaN 0.16266 0.2626 0.34987 0.45000
## Neg Pred Value 0.92674 0.992853 0.95255 0.7783 0.71528 0.80747
## Prevalence 0.07326 0.007147 0.08457 0.2376 0.29958 0.20786
## Detection Rate 0.00000 0.000000 0.05241 0.1024 0.07981 0.02680
## Detection Prevalence 0.00000 0.000000 0.32222 0.3901 0.22811 0.05956
## Balanced Accuracy 0.50000 0.500000 0.66249 0.5269 0.52733 0.54379
## Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.0000 0.000000
## Specificity 1.00000 1.0000 1.000000
## Pos Pred Value NaN NaN NaN
## Neg Pred Value 0.92436 0.9869 0.998809
## Prevalence 0.07564 0.0131 0.001191
## Detection Rate 0.00000 0.0000 0.000000
## Detection Prevalence 0.00000 0.0000 0.000000
## Balanced Accuracy 0.50000 0.5000 0.500000
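The warnings above appear because the rounded forecasts only take the values 2
through 5 while TARGET spans 0 through 8, so caret has to refactor the data to
match the reference. A sketch that aligns the factor levels up front (and so
avoids the refactoring step) might look like this:
common_levels <- as.character(0:8)
confusionMatrix(data = factor(rounded_forecasts_poisson, levels = common_levels),
                reference = factor(df_eval$TARGET, levels = common_levels)
                )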