# Clear the workspace
rm(list = ls()) # Clear environment
gc() # Clear unused memory
## used (Mb) gc trigger (Mb) limit (Mb) max used (Mb)
## Ncells 531196 28.4 1183070 63.2 NA 669265 35.8
## Vcells 974736 7.5 8388608 64.0 16384 1840444 14.1
cat("\f") # Clear the console
# Set the seed for reproducibility
set.seed(123)
# Prepare needed libraries
packages <- c("psych",
"tidyverse",
"conflicted",
"MASS",
"caret"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## Warning: package 'psych' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tidyr' was built under R version 4.2.3
## Warning: package 'readr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'MASS' was built under R version 4.2.3
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.2.3
rm(packages)
conflict_prefer(name = "describe", winner = "psych")
## [conflicted] Will prefer psych::describe over any other package.
conflict_prefer(name = "select" , winner = "dplyr")
## [conflicted] Will prefer dplyr::select over any other package.
# Set working directory and path to data
setwd("/Users/arvindsharma/Dropbox/WCAS/Econometrics/")
# Load your data
df <- read.csv("wine-data.csv")
# Set the seed for reproducibility
set.seed(123)
# Generate a vector of indices for the training set
train_index <- sample(x = nrow(df),
                      size = round(0.8 * nrow(df)
                                   )
                      )
# Create the training and testing sets
df_train <- df[ train_index, ]
df_eval <- df[-train_index, ]
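# A quick sanity check (a sketch, not part of the original workflow): the two
# sets should partition the rows of df with no overlap.
stopifnot(nrow(df_train) + nrow(df_eval) == nrow(df))
stopifnot(length(intersect(rownames(df_train), rownames(df_eval))) == 0)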
str(df_train)
## 'data.frame': 10236 obs. of 16 variables:
## $ INDEX : int 3117 3172 13132 10988 15741 3772 2319 11765 4253 14680 ...
## $ TARGET : int 0 3 3 2 6 3 3 4 4 0 ...
## $ FixedAcidity : num 9.5 -2.4 9.9 6.7 11.9 9.1 5.7 5.5 5.6 6.5 ...
## $ VolatileAcidity : num -0.71 1.08 0.13 0.34 0.2 0.98 0.21 0.3 0.485 0.24 ...
## $ CitricAcid : num 0.78 0.25 -0.4 0.3 0.28 0.58 1.32 1.2 0.16 0.28 ...
## $ ResidualSugar : num -42.2 -10 2.5 6.4 18.7 ...
## $ Chlorides : num -0.022 -0.051 -0.403 -0.386 -0.169 0.036 0.03 0.029 0.051 -0.844 ...
## $ FreeSulfurDioxide : num 270 321 244 105 -58 44 277 28 193 26 ...
## $ TotalSulfurDioxide: num -326 138 NA 369 NA 161 122 97 115 83 ...
## $ Density : num 0.946 1 0.998 0.964 0.956 ...
## $ pH : num 3.2 4.47 2.4 3.29 3.31 3.06 3.94 3.23 4.15 3.25 ...
## $ Sulphates : num NA -1.05 1.3 0.52 1.24 0.41 0.52 0.44 0.38 0.38 ...
## $ Alcohol : num 9.5 4.8 10.6 15.6 16.9 8.9 18.3 21.7 12 NA ...
## $ LabelAppeal : int 0 0 -2 -1 1 -1 -1 0 0 -1 ...
## $ AcidIndex : int 9 7 9 7 6 9 6 8 6 7 ...
## $ STARS : int 2 2 2 NA 2 2 2 2 3 NA ...
nrow(df_train)
## [1] 10236
str(df_eval)
## 'data.frame': 2559 obs. of 16 variables:
## $ INDEX : int 5 11 24 27 35 36 45 50 58 74 ...
## $ TARGET : int 3 4 5 2 6 4 5 4 0 4 ...
## $ FixedAcidity : num 5.7 6.5 10 6.5 8.3 8 14.8 3.1 6.6 6.2 ...
## $ VolatileAcidity : num 0.385 -1.22 0.23 0.24 -2.46 -0.01 1.17 1.65 0.24 0.16 ...
## $ CitricAcid : num 0.04 0.34 0.27 0.24 0.27 -1.1 -0.09 -2.75 2.32 0.26 ...
## $ ResidualSugar : num 18.8 1.4 14.1 -18.2 13.2 ...
## $ Chlorides : num -0.425 0.04 0.033 -0.182 0.036 -0.35 0.027 -0.002 -0.356 -0.204 ...
## $ FreeSulfurDioxide : num 22 523 -188 15 31 17 -166 54 242 208 ...
## $ TotalSulfurDioxide: num 115 551 229 60 NA 129 NA 126 234 220 ...
## $ Density : num 0.996 1.032 0.999 1.009 0.994 ...
## $ pH : num 2.24 3.2 3.14 3.19 1.95 3.2 3.3 3.04 3.11 NA ...
## $ Sulphates : num 1.83 NA 0.88 -1.01 0.02 NA 0.33 NA 0.39 0.58 ...
## $ Alcohol : num 6.2 11.6 11 9.8 13 16.6 12.5 9.7 9.9 10.6 ...
## $ LabelAppeal : int -1 1 1 -1 1 0 0 0 -1 0 ...
## $ AcidIndex : int 6 7 11 7 7 9 8 8 7 7 ...
## $ STARS : int 1 3 2 1 4 2 2 1 1 2 ...
nrow(df_eval)
## [1] 2559
# Copy the training set before cleaning
df_train_clean <- df_train
# Note: replace() treats `list` as positional indices, so `list = -3` addresses
# every element except the third, not values equal to -3.
df_train_clean$pH <- replace(x = df_train$pH, list = -3, values = 0)
# Mean-impute the pH values that were missing in the raw training data
df_train_clean$pH[is.na(df_train$pH)] <- mean(df_train$pH, na.rm = TRUE)
df_train_clean$AcidIndex <- replace(df_train$AcidIndex, list = -3, values = 0)
# Mean-impute missing AcidIndex the same way
df_train_clean$AcidIndex[is.na(df_train$AcidIndex)] <- mean(df_train$AcidIndex, na.rm = TRUE)
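# An aside (sketch, not applied to df_train_clean above): if the intent were to
# recode a specific sentinel value (say -3, assumed here) rather than positions,
# a value-based replacement would look like this.
ph_recoded <- replace(df_train$pH, list = which(df_train$pH == -3), values = 0)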
# select variables
df_train_clean <- dplyr::select(.data = df_train_clean,
                                TARGET, pH, AcidIndex, STARS, Chlorides, ResidualSugar)
?glm # glm is used to fit generalized linear models, specified by giving a symbolic description of the linear predictor and a description of the error distribution
?stats::family
poisson <- glm(data = df_train_clean ,
               formula = TARGET ~ . ,
               family = poisson(link = "log")
               )
summary(poisson)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson(link = "log"), data = df_train_clean)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6898 -0.3163 0.1429 0.2787 1.7299
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.671e-01 1.651e-02 46.452 <2e-16 ***
## pH -5.104e-04 1.139e-02 -0.045 0.9643
## AcidIndex -2.043e-02 6.423e-02 -0.318 0.7504
## STARS 2.504e-01 6.733e-03 37.184 <2e-16 ***
## Chlorides -3.936e-02 1.994e-02 -1.973 0.0484 *
## ResidualSugar -8.924e-06 1.869e-04 -0.048 0.9619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6200.5 on 6815 degrees of freedom
## Residual deviance: 4840.6 on 6810 degrees of freedom
## (3420 observations deleted due to missingness)
## AIC: 25131
##
## Number of Fisher Scoring iterations: 5
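# A rough overdispersion check before moving to the negative binomial (a
# sketch): residual deviance relative to its degrees of freedom; values well
# above 1 would hint at overdispersion.
deviance(poisson) / df.residual(poisson) # roughly 4840.6 / 6810 from the summary above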
negative_binomial <- MASS::glm.nb(data = df_train_clean ,
                                  formula = TARGET ~ .
                                  )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(negative_binomial)
##
## Call:
## MASS::glm.nb(formula = TARGET ~ ., data = df_train_clean, init.theta = 117808.7193,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6897 -0.3163 0.1429 0.2787 1.7299
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 7.671e-01 1.651e-02 46.451 <2e-16 ***
## pH -5.105e-04 1.139e-02 -0.045 0.9643
## AcidIndex -2.043e-02 6.423e-02 -0.318 0.7504
## STARS 2.504e-01 6.733e-03 37.183 <2e-16 ***
## Chlorides -3.936e-02 1.994e-02 -1.973 0.0484 *
## ResidualSugar -8.923e-06 1.869e-04 -0.048 0.9619
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(117808.7) family taken to be 1)
##
## Null deviance: 6200.4 on 6815 degrees of freedom
## Residual deviance: 4840.5 on 6810 degrees of freedom
## (3420 observations deleted due to missingness)
## AIC: 25133
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 117809
## Std. Err.: 188524
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -25118.88
# stargazer(poisson, negative_binomial,
# type = "text"
# )
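# Since theta is huge (about 117809), the negative binomial is essentially
# collapsing toward the Poisson, i.e. there is little overdispersion; a quick
# AIC comparison (sketch) makes the same point.
AIC(poisson, negative_binomial)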
# Refit the negative binomial without pH (column 2 of df_train_clean)
negative_binomial2 <- MASS::glm.nb(data = df_train_clean[,c(1,3:6)] ,
                                   formula = TARGET ~ .
                                   )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
Fancier Models

If you apply the round() function to the Poisson/negative binomial regression
forecasts, you will obtain integer-valued predictions that are more suitable
for count data. These rounded predictions should be better aligned with your
expectation of integer counts.
?predict.glm
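Because these models use a log link, a prediction on the link scale has to be
exponentiated to land on the count scale; type = "response" does that for you.
A small sketch using the poisson fit above:
exp(head(predict(poisson, newdata = df_eval, type = "link")))
head(predict(poisson, newdata = df_eval, type = "response"))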
# Make predictions using the fitted model
forecasts_poisson <-
  predict.glm(object = poisson,
              newdata = df_eval,
              type = "response"
              )
forecasts_negative_binomial <-
  predict.glm(object = negative_binomial,
              newdata = df_eval,
              type = "response"
              )
forecasts_negative_binomial2 <-
  predict.glm(object = negative_binomial2,
              newdata = df_eval,
              type = "response"
              )
summary(forecasts_poisson)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.391 3.017 3.130 3.813 5.380 880
summary(forecasts_negative_binomial)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.391 3.017 3.130 3.813 5.380 880
summary(forecasts_negative_binomial2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 2.075 2.392 3.024 3.135 3.825 5.390 830
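# The NA's in the forecasts come from evaluation rows with missing values in
# the predictors (df_eval was not imputed, so predict() passes those rows
# through as NA); a quick count for the five predictors used (sketch), which
# should line up with the NA counts reported above:
sum(!complete.cases(df_eval[, c("pH", "AcidIndex", "STARS", "Chlorides", "ResidualSugar")]))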
# Round the predictions to the nearest whole number
?round
## Help on topic 'round' was found in the following packages:
##
## Package Library
## base /Library/Frameworks/R.framework/Resources/library
## timeDate /Library/Frameworks/R.framework/Versions/4.2-arm64/Resources/library
##
##
## Using the first match ...
rounded_forecasts_poisson <- round(forecasts_poisson)
rounded_forecasts_negative_binomial <- round(forecasts_negative_binomial)
rounded_forecasts_negative_binomial2 <- round(forecasts_negative_binomial2)
table(df_eval$TARGET)
##
## 0 1 2 3 4 5 6 7 8
## 545 44 215 532 618 423 154 25 3
table(rounded_forecasts_poisson )
## rounded_forecasts_poisson
## 2 3 4 5
## 541 655 383 100
table(rounded_forecasts_negative_binomial)
## rounded_forecasts_negative_binomial
## 2 3 4 5
## 541 655 383 100
table(rounded_forecasts_negative_binomial2)
## rounded_forecasts_negative_binomial2
## 2 3 4 5
## 554 677 393 105
table(df_eval$TARGET, rounded_forecasts_poisson)
## rounded_forecasts_poisson
## 2 3 4 5
## 0 108 15 0 0
## 1 7 5 0 0
## 2 88 50 4 0
## 3 169 172 58 0
## 4 128 230 134 11
## 5 37 140 127 45
## 6 4 42 52 29
## 7 0 1 7 14
## 8 0 0 1 1
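# Share of exact matches among rows with a non-missing forecast (a sketch;
# a cruder summary than the confusion matrix below):
mean(df_eval$TARGET == rounded_forecasts_poisson, na.rm = TRUE)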
Please work through the code below and figure out what it is doing.
?caret
## No documentation for 'caret' in specified packages and libraries:
## you could try '??caret'
confusionMatrix(data = as.factor(rounded_forecasts_poisson),
                reference = as.factor(df_eval$TARGET),
                positive = "1"
                )
## Warning in levels(reference) != levels(data): longer object length is not a
## multiple of shorter object length
## Warning in confusionMatrix.default(data = as.factor(rounded_forecasts_poisson),
## : Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1 2 3 4 5 6 7 8
## 0 0 0 0 0 0 0 0 0 0
## 1 0 0 0 0 0 0 0 0 0
## 2 108 7 88 169 128 37 4 0 0
## 3 15 5 50 172 230 140 42 1 0
## 4 0 0 4 58 134 127 52 7 1
## 5 0 0 0 0 11 45 29 14 1
## 6 0 0 0 0 0 0 0 0 0
## 7 0 0 0 0 0 0 0 0 0
## 8 0 0 0 0 0 0 0 0 0
##
## Overall Statistics
##
## Accuracy : 0.2615
## 95% CI : (0.2406, 0.2832)
## No Information Rate : 0.2996
## P-Value [Acc > NIR] : 0.9997
##
## Kappa : 0.076
##
## Mcnemar's Test P-Value : NA
##
## Statistics by Class:
##
## Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity 0.00000 0.000000 0.61972 0.4311 0.26640 0.12894
## Specificity 1.00000 1.000000 0.70527 0.6227 0.78827 0.95865
## Pos Pred Value NaN NaN 0.16266 0.2626 0.34987 0.45000
## Neg Pred Value 0.92674 0.992853 0.95255 0.7783 0.71528 0.80747
## Prevalence 0.07326 0.007147 0.08457 0.2376 0.29958 0.20786
## Detection Rate 0.00000 0.000000 0.05241 0.1024 0.07981 0.02680
## Detection Prevalence 0.00000 0.000000 0.32222 0.3901 0.22811 0.05956
## Balanced Accuracy 0.50000 0.500000 0.66249 0.5269 0.52733 0.54379
## Class: 6 Class: 7 Class: 8
## Sensitivity 0.00000 0.0000 0.000000
## Specificity 1.00000 1.0000 1.000000
## Pos Pred Value NaN NaN NaN
## Neg Pred Value 0.92436 0.9869 0.998809
## Prevalence 0.07564 0.0131 0.001191
## Detection Rate 0.00000 0.0000 0.000000
## Detection Prevalence 0.00000 0.0000 0.000000
## Balanced Accuracy 0.50000 0.5000 0.500000
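The warnings above appear because the rounded forecasts only take the values 2
through 5 while TARGET spans 0 through 8, so caret has to refactor the data to
match the reference. A sketch that aligns the factor levels up front (and so
avoids the refactoring step) might look like this:
common_levels <- as.character(0:8)
confusionMatrix(data = factor(rounded_forecasts_poisson, levels = common_levels),
                reference = factor(df_eval$TARGET, levels = common_levels)
                )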