Data Exploration

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(psych)
## 
## Attaching package: 'psych'
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
library(ggplot2)
library(dplyr)
library(stargazer)
## 
## Please cite as: 
## 
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
# Load the raw wine data; empty strings in the CSV are treated as missing.
wine_orig <- read.csv(
  file = "~/Desktop/wine-training-data.csv",
  na.strings = "",
  stringsAsFactors = FALSE
)

# Fix the RNG seed so the random 80/20 split below is reproducible.
set.seed(123)

# Row indices of the training set: 80% of the rows, drawn without replacement.
train_index <- sample(
  x = nrow(wine_orig),
  size = round(0.8 * nrow(wine_orig))
)

# Training rows are the sampled indices; test rows are everything else.
df_train <- wine_orig[train_index, ]
df_test <- wine_orig[-train_index, ]

At this point, I have generated a vector of indices for the training set and created the training and testing sets randomly. The training set will be the only one used until the end where I will use the test set to see how accurate my regression model is.

Remove unnecessary variable

# Drop the first column (the index variable) from both sets.
df_train_wine <- df_train[-1]
df_test_wine <- df_test[-1]

Removed the index variable from both the training and testing sets. This variable does not mean anything to us.

Types of Variables

# Report the class of every column as a named character vector.
# vapply() is used instead of sapply() so the result type is guaranteed:
# it errors loudly if any column ever reports more than one class.
vapply(X = df_train_wine, FUN = class, FUN.VALUE = character(1))
##             TARGET       FixedAcidity    VolatileAcidity         CitricAcid 
##          "integer"          "numeric"          "numeric"          "numeric" 
##      ResidualSugar          Chlorides  FreeSulfurDioxide TotalSulfurDioxide 
##          "numeric"          "numeric"          "numeric"          "numeric" 
##            Density                 pH          Sulphates            Alcohol 
##          "numeric"          "numeric"          "numeric"          "numeric" 
##        LabelAppeal          AcidIndex              STARS 
##          "integer"          "integer"          "integer"

Correlation

# Correlation of every predictor with TARGET, computed on complete rows only.
predictor_cols <- setdiff(colnames(df_train_wine), "TARGET")
data_cor <- cor(
  x = df_train_wine[, predictor_cols],
  y = df_train_wine$TARGET,
  use = "na.or.complete"
)
data_cor
##                            [,1]
## FixedAcidity       -0.018427213
## VolatileAcidity    -0.087566066
## CitricAcid          0.018376980
## ResidualSugar       0.002984054
## Chlorides          -0.038807560
## FreeSulfurDioxide   0.028934689
## TotalSulfurDioxide  0.016623204
## Density            -0.041108959
## pH                 -0.011333421
## Sulphates          -0.013567716
## Alcohol             0.066215151
## LabelAppeal         0.491119407
## AcidIndex          -0.155828930
## STARS               0.554951128
library(ggcorrplot)

# Full correlation matrix (TARGET included), using only rows that have no
# missing value in any column. The data frame is passed directly: the original
# `[, 1:ncol(df)]` index selected every column anyway.
mycorr <- cor(x = df_train_wine, use = "na.or.complete")

# Matrix of correlation-test p-values for the same columns; it is passed to
# ggcorrplot() to mark insignificant cells.
# NOTE(review): cor_pmat() appears to test each pair on that pair's available
# observations, whereas mycorr is based on complete cases only -- confirm that
# using different row subsets for the two matrices is intended.
p.mat <- ggcorrplot::cor_pmat(x = df_train_wine)
head(p.mat)
##                       TARGET FixedAcidity VolatileAcidity  CitricAcid
## TARGET          0.000000e+00 4.719477e-07    7.412342e-20 0.268360126
## FixedAcidity    4.719477e-07 0.000000e+00    4.698445e-01 0.052750515
## VolatileAcidity 7.412342e-20 4.698445e-01    0.000000e+00 0.005553452
## CitricAcid      2.683601e-01 5.275051e-02    5.553452e-03 0.000000000
## ResidualSugar   1.142410e-01 4.291178e-02    4.917090e-01 0.843012633
## Chlorides       7.260016e-06 8.913471e-01    9.816855e-01 0.477941768
##                 ResidualSugar    Chlorides FreeSulfurDioxide TotalSulfurDioxide
## TARGET             0.11424101 7.260016e-06      6.163034e-07       5.213524e-07
## FixedAcidity       0.04291178 8.913471e-01      3.956000e-01       1.286502e-02
## VolatileAcidity    0.49170898 9.816855e-01      8.729870e-01       7.845186e-02
## CitricAcid         0.84301263 4.779418e-01      4.210998e-01       7.353016e-01
## ResidualSugar      0.00000000 3.761255e-01      3.432263e-01       1.551479e-02
## Chlorides          0.37612552 0.000000e+00      1.052409e-01       2.365754e-01
##                     Density         pH    Sulphates      Alcohol   LabelAppeal
## TARGET          0.001760954 0.43130464 0.0001601662 2.565261e-07 7.069523e-302
## FixedAcidity    0.353750495 0.21838066 0.0003159342 2.301024e-01  8.190745e-01
## VolatileAcidity 0.041988247 0.17996373 0.8083181069 5.743378e-01  3.581330e-02
## CitricAcid      0.398336622 0.27909665 0.2913930964 4.326633e-02  8.612480e-02
## ResidualSugar   0.871668644 0.13027390 0.1210038256 9.223324e-03  6.127689e-01
## Chlorides       0.036969238 0.04953973 0.7726741611 7.667504e-02  8.583342e-02
##                     AcidIndex        STARS
## TARGET          5.030609e-138 0.000000e+00
## FixedAcidity     1.909060e-79 5.047857e-01
## VolatileAcidity  1.959376e-04 8.156817e-05
## CitricAcid       5.528456e-13 4.016205e-01
## ResidualSugar    2.063247e-01 1.722924e-01
## Chlorides        1.208368e-02 6.380890e-01
  # Lower-triangle correlation heat map with coefficients printed in each cell.
  # Cells whose p-value in p.mat is not significant are crossed out (pch = 4),
  # and variables are reordered by hierarchical clustering (hc.order = TRUE).
  # The stray interactive help lookup (`?ggcorrplot`) was removed: it only
  # opens the help page and has no place in a rendered report.
  myplot <- ggcorrplot(
    corr     = mycorr,
    method   = "square",
    type     = "lower",
    title    = "Correlation Plot",
    colors   = c("red", "white", "green"),
    lab      = TRUE,
    lab_size = 2,
    p.mat    = p.mat,
    insig    = "pch",
    pch      = 4,
    hc.order = TRUE,
    tl.cex   = 8,
    tl.col   = "black",
    digits   = 2
  )

  myplot

Just from looking at the correlation plot, we can see that Target and STARS are strongly correlated with each other. This was to be expected, because a higher number of stars suggests higher sales. We also see that Target and Label Appeal are strongly correlated with each other. This was also to be expected, because many consumers purchase wine because of the visual appeal of the wine’s label, so higher label appeal scores suggest higher sales. These two correlations are the strongest among all of the variables.

Distribution Plots of Each Variable

# Reshape to long format (one row per variable/value pair) so that every
# variable can be drawn in its own facet. pivot_longer() replaces the
# superseded gather(); the resulting variable/value pairs are the same.
gather_df_train <- df_train_wine %>%
  pivot_longer(
    cols = everything(),
    names_to = "variable",
    values_to = "value"
  )

# One 30-bin histogram per variable; axes are free because the variables are
# on very different scales.
ggplot(gather_df_train) +
  geom_histogram(aes(x = value), bins = 30) +
  facet_wrap(~ variable, scales = "free", ncol = 4)
## Warning: Removed 6578 rows containing non-finite values (`stat_bin()`).

The most common star rating was 2 stars, and the least common was 4 stars. The most common label appeal rating was 0; the least common label appeal ratings were -2 and 2.

Data Preparation

Show where the NAs are

# Count the missing values in each column and print one line per column.
count_missing <- function(column) sum(is.na(column))

df_train_wine %>%
  summarise(across(everything(), count_missing)) %>%
  glimpse()
## Rows: 1
## Columns: 15
## $ TARGET             <int> 0
## $ FixedAcidity       <int> 0
## $ VolatileAcidity    <int> 0
## $ CitricAcid         <int> 0
## $ ResidualSugar      <int> 494
## $ Chlorides          <int> 525
## $ FreeSulfurDioxide  <int> 539
## $ TotalSulfurDioxide <int> 532
## $ Density            <int> 0
## $ pH                 <int> 321
## $ Sulphates          <int> 960
## $ Alcohol            <int> 519
## $ LabelAppeal        <int> 0
## $ AcidIndex          <int> 0
## $ STARS              <int> 2688
library(naniar)
# Visualise the pattern of missing values, first in the training set ...
naniar::vis_miss(x = df_train_wine)

# ... and then in the test set.
naniar::vis_miss(x = df_test_wine)

Only 4.2% of the overall data is missing, and STARS has the largest share of missing values. Among the variables with missing values, pH has the fewest, with only about 3% missing. Because only 4.2% of the overall data is missing, I am not going to remove these rows. Instead, I am going to impute the missing values with the median. However, I will not impute the missing values of STARS with the median: a missing value in STARS probably means that the wine has not been rated yet, so I am going to change those missing values to 0, as this might be predictive of the Target.

# Recode missing STARS as 0 in both sets, so that a missing rating becomes its
# own category instead of being imputed.
df_train_wine$STARS <- replace(
  df_train_wine$STARS,
  is.na(df_train_wine$STARS),
  0
)
df_test_wine$STARS <- replace(
  df_test_wine$STARS,
  is.na(df_test_wine$STARS),
  0
)

Impute rest of missing values with median

# Impute the remaining missing values with the column median.
# The medians are estimated once, on the training set only, and reused for the
# test set. Imputing the test set with its own medians (as was done before)
# lets held-out data influence preprocessing and applies a different
# transformation to the two sets.
is_numeric_col <- vapply(df_train_wine, is.numeric, logical(1))
numeric_cols <- names(df_train_wine)[is_numeric_col]
train_medians <- vapply(
  df_train_wine[numeric_cols],
  median,
  numeric(1),
  na.rm = TRUE
)

# Replace each NA in a numeric column with that column's training-set median.
impute_with_train_median <- function(wine_df) {
  wine_df %>%
    mutate(across(
      all_of(numeric_cols),
      ~ replace_na(.x, train_medians[[cur_column()]])
    ))
}

df_train_wine <- impute_with_train_median(df_train_wine)
df_test_wine <- impute_with_train_median(df_test_wine)

Create Dummy Variables for STARS rating

# Add one 0/1 indicator column per star rating (STARS_1 ... STARS_4).
# A STARS value of 0 (previously missing) is the reference category: all four
# indicators are 0 for those rows.
add_star_dummies <- function(wine_df) {
  for (star_level in 1:4) {
    dummy_name <- paste0("STARS_", star_level)
    wine_df[[dummy_name]] <- as.numeric(wine_df$STARS == star_level)
  }
  wine_df
}

df_train_wine <- add_star_dummies(df_train_wine)
df_test_wine <- add_star_dummies(df_test_wine)

# The original STARS column is now redundant. select() is namespace-qualified
# because MASS, loaded later in this file, masks dplyr::select().
df_train_wine <- dplyr::select(df_train_wine, -STARS)
df_test_wine <- dplyr::select(df_test_wine, -STARS)

I converted the star ratings into dummy variables. This will be helpful when running the regressions…

Check Difference in Correlation

# Recompute the correlation of every predictor with TARGET on the current
# state of the training data.
predictor_cols_2 <- setdiff(colnames(df_train_wine), "TARGET")
data_cor_2 <- cor(
  x = df_train_wine[, predictor_cols_2],
  y = df_train_wine$TARGET,
  use = "na.or.complete"
)
data_cor_2
##                           [,1]
## FixedAcidity       -0.04976493
## VolatileAcidity    -0.08998494
## CitricAcid          0.01094114
## ResidualSugar       0.01541047
## Chlorides          -0.04428094
## FreeSulfurDioxide   0.04921729
## TotalSulfurDioxide  0.04958612
## Density            -0.03091212
## pH                 -0.00772925
## Sulphates          -0.03714369
## Alcohol             0.05088321
## LabelAppeal         0.35508159
## AcidIndex          -0.24347911
## STARS_1            -0.13175784
## STARS_2             0.24635609
## STARS_3             0.36370219
## STARS_4             0.27881141

Stars 3 has the strongest positive correlation with the number of cases purchased, with label appeal being a close second. Stars 1 has a negative correlation with the number of cases purchased. If a wine is only given one star, it means that it has been rated poorly, so the wine distribution companies are probably less likely to buy more cases of it. Stars 4 has a positive, but lower, correlation with the number of cases purchased. I wonder if this is because a 4-star wine will be more expensive, so wine distribution companies may be more inclined to buy a wine that is given 3 stars.

Summary stats after cleaning data

# Plain-text table of summary statistics for the training set,
# rounded to two decimals.
stargazer(
  df_train_wine,
  type = "text",
  digits = 2
)
## 
## ==========================================================
## Statistic            N     Mean  St. Dev.   Min     Max   
## ----------------------------------------------------------
## TARGET             10,236  3.03    1.93      0       8    
## FixedAcidity       10,236  7.09    6.33   -18.00   34.40  
## VolatileAcidity    10,236  0.33    0.78    -2.79    3.68  
## CitricAcid         10,236  0.31    0.86    -3.24    3.86  
## ResidualSugar      10,236  5.21   33.11   -127.80  141.15 
## Chlorides          10,236  0.05    0.31    -1.17    1.35  
## FreeSulfurDioxide  10,236 30.85   143.38  -546.00  622.00 
## TotalSulfurDioxide 10,236 121.94  224.95  -823.00 1,057.00
## Density            10,236  0.99    0.03    0.89     1.10  
## pH                 10,236  3.21    0.66    0.48     6.05  
## Sulphates          10,236  0.53    0.89    -3.13    4.24  
## Alcohol            10,236 10.49    3.62    -4.70   26.10  
## LabelAppeal        10,236 -0.01    0.90     -2       2    
## AcidIndex          10,236  7.78    1.34      4       17   
## STARS_1            10,236  0.24    0.43      0       1    
## STARS_2            10,236  0.28    0.45      0       1    
## STARS_3            10,236  0.17    0.38      0       1    
## STARS_4            10,236  0.05    0.21      0       1    
## ----------------------------------------------------------

The distribution of the Target variable (number of cases of wine purchased) skews slightly to the left, with a mean of 3.03 cases purchased, a minimum of 0, and a maximum of 8. The mean of the label appeal (marketing score indicating the appeal of the design) is negative and very close to zero, which I find interesting.

Build Models

Below are six regression models: the first is a Poisson model, the second a quasi-Poisson model, the third and fourth are negative binomial models, and the fifth and sixth are multiple linear regressions. Wherever I have manually removed variables, I will discuss why. At the end, I will discuss the coefficients using the stargazer package.

Poisson 1

library(MASS)
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
# Poisson regression of TARGET on all predictors except pH, Sulphates and
# Alcohol, used as the starting point for stepwise selection.
pm1_start <- glm(
  formula = TARGET ~ . - pH - Sulphates - Alcohol,
  family = "poisson",
  data = df_train_wine
)

# Stepwise selection by AIC, starting from the model above.
pm1 <- stepAIC(object = pm1_start)
## Start:  AIC=36466.32
## TARGET ~ (FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS_1 + 
##     STARS_2 + STARS_3 + STARS_4) - pH - Sulphates - Alcohol
## 
##                      Df Deviance   AIC
## - FixedAcidity        1    10897 36464
## - ResidualSugar       1    10897 36464
## - CitricAcid          1    10898 36466
## - Density             1    10898 36466
## <none>                     10897 36466
## - FreeSulfurDioxide   1    10903 36471
## - Chlorides           1    10904 36472
## - TotalSulfurDioxide  1    10905 36473
## - VolatileAcidity     1    10913 36480
## - AcidIndex           1    11157 36725
## - LabelAppeal         1    11436 37004
## - STARS_1             1    12210 37778
## - STARS_4             1    13126 38694
## - STARS_2             1    14263 39831
## - STARS_3             1    14522 40090
## 
## Step:  AIC=36464.34
## TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Density + LabelAppeal + 
##     AcidIndex + STARS_1 + STARS_2 + STARS_3 + STARS_4
## 
##                      Df Deviance   AIC
## - ResidualSugar       1    10897 36462
## - CitricAcid          1    10898 36464
## - Density             1    10898 36464
## <none>                     10897 36464
## - FreeSulfurDioxide   1    10903 36469
## - Chlorides           1    10904 36470
## - TotalSulfurDioxide  1    10905 36471
## - VolatileAcidity     1    10913 36478
## - AcidIndex           1    11163 36728
## - LabelAppeal         1    11436 37002
## - STARS_1             1    12210 37776
## - STARS_4             1    13126 38692
## - STARS_2             1    14264 39829
## - STARS_3             1    14522 40088
## 
## Step:  AIC=36462.49
## TARGET ~ VolatileAcidity + CitricAcid + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Density + LabelAppeal + AcidIndex + 
##     STARS_1 + STARS_2 + STARS_3 + STARS_4
## 
##                      Df Deviance   AIC
## - CitricAcid          1    10898 36462
## - Density             1    10898 36462
## <none>                     10897 36462
## - FreeSulfurDioxide   1    10903 36467
## - Chlorides           1    10905 36468
## - TotalSulfurDioxide  1    10905 36469
## - VolatileAcidity     1    10913 36477
## - AcidIndex           1    11163 36727
## - LabelAppeal         1    11436 37000
## - STARS_1             1    12210 37774
## - STARS_4             1    13127 38691
## - STARS_2             1    14264 39827
## - STARS_3             1    14523 40086
## 
## Step:  AIC=36461.92
## TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + LabelAppeal + AcidIndex + STARS_1 + STARS_2 + STARS_3 + 
##     STARS_4
## 
##                      Df Deviance   AIC
## - Density             1    10900 36462
## <none>                     10898 36462
## - FreeSulfurDioxide   1    10905 36466
## - Chlorides           1    10906 36468
## - TotalSulfurDioxide  1    10907 36468
## - VolatileAcidity     1    10915 36476
## - AcidIndex           1    11163 36725
## - LabelAppeal         1    11438 37000
## - STARS_1             1    12212 37773
## - STARS_4             1    13130 38692
## - STARS_2             1    14268 39830
## - STARS_3             1    14525 40087
## 
## Step:  AIC=36461.58
## TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     LabelAppeal + AcidIndex + STARS_1 + STARS_2 + STARS_3 + STARS_4
## 
##                      Df Deviance   AIC
## <none>                     10900 36462
## - FreeSulfurDioxide   1    10906 36466
## - Chlorides           1    10908 36468
## - TotalSulfurDioxide  1    10908 36468
## - VolatileAcidity     1    10916 36476
## - AcidIndex           1    11167 36726
## - LabelAppeal         1    11440 37000
## - STARS_1             1    12214 37774
## - STARS_4             1    13132 38692
## - STARS_2             1    14271 39831
## - STARS_3             1    14529 40089
# Coefficient table and fit statistics for the stepwise-selected Poisson model.
summary(pm1)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS_1 + 
##     STARS_2 + STARS_3 + STARS_4, family = "poisson", data = df_train_wine)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3239  -0.6695  -0.0046   0.4511   3.5092  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         8.256e-01  4.375e-02  18.872  < 2e-16 ***
## VolatileAcidity    -2.972e-02  7.312e-03  -4.065 4.81e-05 ***
## Chlorides          -5.201e-02  1.838e-02  -2.830  0.00465 ** 
## FreeSulfurDioxide   9.959e-05  3.955e-05   2.518  0.01181 *  
## TotalSulfurDioxide  7.293e-05  2.553e-05   2.857  0.00428 ** 
## LabelAppeal         1.583e-01  6.810e-03  23.243  < 2e-16 ***
## AcidIndex          -7.946e-02  4.975e-03 -15.973  < 2e-16 ***
## STARS_1             7.701e-01  2.190e-02  35.161  < 2e-16 ***
## STARS_2             1.094e+00  2.042e-02  53.607  < 2e-16 ***
## STARS_3             1.219e+00  2.146e-02  56.794  < 2e-16 ***
## STARS_4             1.336e+00  2.706e-02  49.363  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18306  on 10235  degrees of freedom
## Residual deviance: 10900  on 10225  degrees of freedom
## AIC: 36462
## 
## Number of Fisher Scoring iterations: 6

Poisson 2

After seeing that my poisson and negative binomial models were similar, I wanted to try a quasipoisson…

library(MASS)

# Quasi-Poisson regression of TARGET on every predictor. The quasipoisson
# family estimates the dispersion parameter instead of fixing it at 1.
# The stray interactive help lookup (`?stepAIC`) was removed: it only opens
# the help page and has no place in a rendered report.
pm2 <- glm(
  formula = TARGET ~ .,
  family = quasipoisson,
  data = df_train_wine
)

summary(pm2)
## 
## Call:
## glm(formula = TARGET ~ ., family = quasipoisson, data = df_train_wine)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2814  -0.6671  -0.0019   0.4499   3.4867  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.122e+00  2.051e-01   5.472 4.54e-08 ***
## FixedAcidity        1.765e-04  8.612e-04   0.205  0.83761    
## VolatileAcidity    -2.924e-02  6.862e-03  -4.262 2.05e-05 ***
## CitricAcid          7.570e-03  6.184e-03   1.224  0.22095    
## ResidualSugar       7.585e-05  1.615e-04   0.470  0.63868    
## Chlorides          -5.153e-02  1.726e-02  -2.986  0.00283 ** 
## FreeSulfurDioxide   1.007e-04  3.713e-05   2.711  0.00671 ** 
## TotalSulfurDioxide  7.462e-05  2.398e-05   3.112  0.00187 ** 
## Density            -2.743e-01  2.008e-01  -1.366  0.17189    
## pH                 -1.340e-02  8.093e-03  -1.656  0.09777 .  
## Sulphates          -1.030e-02  5.996e-03  -1.718  0.08577 .  
## Alcohol             2.302e-03  1.481e-03   1.554  0.12017    
## LabelAppeal         1.585e-01  6.392e-03  24.797  < 2e-16 ***
## AcidIndex          -7.982e-02  4.752e-03 -16.797  < 2e-16 ***
## STARS_1             7.691e-01  2.055e-02  37.425  < 2e-16 ***
## STARS_2             1.093e+00  1.916e-02  57.038  < 2e-16 ***
## STARS_3             1.215e+00  2.016e-02  60.294  < 2e-16 ***
## STARS_4             1.332e+00  2.543e-02  52.396  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.8802601)
## 
##     Null deviance: 18306  on 10235  degrees of freedom
## Residual deviance: 10889  on 10218  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6

Negative Binomial 1

Based on the poisson models, I noticed that Fixed Acidity, Residual Sugar and Citric Acid were not significant to the number of wine cases sold. Also looking back at the correlation, none of these variables had a significant correlation to the target variable.

# Negative binomial regression of TARGET on all predictors except
# FixedAcidity, ResidualSugar and CitricAcid.
# NOTE(review): the rendered output shows theta.ml() hitting its iteration
# limit with a very large theta estimate; a diverging theta usually means the
# data show no overdispersion relative to Poisson -- confirm before relying
# on this model.
nb1 <- glm.nb(
  formula = TARGET ~ . - FixedAcidity - ResidualSugar - CitricAcid,
  data = df_train_wine
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Coefficient table, theta estimate and fit statistics for the first
# negative binomial model.
summary(nb1)
## 
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - ResidualSugar - 
##     CitricAcid, data = df_train_wine, init.theta = 40903.10423, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2830  -0.6677  -0.0001   0.4498   3.4979  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.123e+00  2.185e-01   5.140 2.74e-07 ***
## VolatileAcidity    -2.953e-02  7.311e-03  -4.039 5.37e-05 ***
## Chlorides          -5.176e-02  1.839e-02  -2.814  0.00489 ** 
## FreeSulfurDioxide   1.013e-04  3.958e-05   2.560  0.01045 *  
## TotalSulfurDioxide  7.473e-05  2.555e-05   2.925  0.00345 ** 
## Density            -2.761e-01  2.140e-01  -1.290  0.19689    
## pH                 -1.330e-02  8.625e-03  -1.542  0.12319    
## Sulphates          -1.036e-02  6.387e-03  -1.622  0.10487    
## Alcohol             2.335e-03  1.578e-03   1.480  0.13883    
## LabelAppeal         1.586e-01  6.812e-03  23.284  < 2e-16 ***
## AcidIndex          -7.931e-02  4.999e-03 -15.864  < 2e-16 ***
## STARS_1             7.692e-01  2.191e-02  35.113  < 2e-16 ***
## STARS_2             1.093e+00  2.042e-02  53.535  < 2e-16 ***
## STARS_3             1.216e+00  2.148e-02  56.578  < 2e-16 ***
## STARS_4             1.333e+00  2.710e-02  49.179  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(40903.1) family taken to be 1)
## 
##     Null deviance: 18305  on 10235  degrees of freedom
## Residual deviance: 10891  on 10221  degrees of freedom
## AIC: 36463
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  40903 
##           Std. Err.:  38482 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -36430.97

Negative Binomial 2

Based on the above model, I decided to use stepAIC and also to take out more variables that were not significant, so that the only remaining variables were those that were significant.

# Negative binomial regression of TARGET on all predictors except Alcohol,
# Sulphates and pH, used as the starting point for stepwise selection.
nb2_start <- glm.nb(
  formula = TARGET ~ . - Alcohol - Sulphates - pH,
  data = df_train_wine
)

# Stepwise selection by AIC, starting from the model above.
nb2 <- stepAIC(object = nb2_start)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Start:  AIC=36466.65
## TARGET ~ (FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS_1 + 
##     STARS_2 + STARS_3 + STARS_4) - Alcohol - Sulphates - pH
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##                      Df   AIC
## - FixedAcidity        1 36465
## - ResidualSugar       1 36465
## - CitricAcid          1 36466
## - Density             1 36466
## <none>                  36467
## - FreeSulfurDioxide   1 36471
## - Chlorides           1 36472
## - TotalSulfurDioxide  1 36473
## - VolatileAcidity     1 36481
## - AcidIndex           1 36725
## - LabelAppeal         1 37004
## - STARS_1             1 37778
## - STARS_4             1 38694
## - STARS_2             1 39831
## - STARS_3             1 40090
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=36464.67
## TARGET ~ VolatileAcidity + CitricAcid + ResidualSugar + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Density + LabelAppeal + 
##     AcidIndex + STARS_1 + STARS_2 + STARS_3 + STARS_4
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##                      Df   AIC
## - ResidualSugar       1 36463
## - CitricAcid          1 36464
## - Density             1 36464
## <none>                  36465
## - FreeSulfurDioxide   1 36469
## - Chlorides           1 36470
## - TotalSulfurDioxide  1 36471
## - VolatileAcidity     1 36479
## - AcidIndex           1 36729
## - LabelAppeal         1 37002
## - STARS_1             1 37776
## - STARS_4             1 38692
## - STARS_2             1 39829
## - STARS_3             1 40088
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=36462.82
## TARGET ~ VolatileAcidity + CitricAcid + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Density + LabelAppeal + AcidIndex + 
##     STARS_1 + STARS_2 + STARS_3 + STARS_4
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##                      Df   AIC
## - CitricAcid          1 36462
## - Density             1 36462
## <none>                  36463
## - FreeSulfurDioxide   1 36467
## - Chlorides           1 36469
## - TotalSulfurDioxide  1 36469
## - VolatileAcidity     1 36477
## - AcidIndex           1 36727
## - LabelAppeal         1 37000
## - STARS_1             1 37774
## - STARS_4             1 38691
## - STARS_2             1 39828
## - STARS_3             1 40086
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=36462.26
## TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + LabelAppeal + AcidIndex + STARS_1 + STARS_2 + STARS_3 + 
##     STARS_4
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##                      Df   AIC
## - Density             1 36462
## <none>                  36462
## - FreeSulfurDioxide   1 36467
## - Chlorides           1 36468
## - TotalSulfurDioxide  1 36469
## - VolatileAcidity     1 36477
## - AcidIndex           1 36725
## - LabelAppeal         1 37000
## - STARS_1             1 37774
## - STARS_4             1 38692
## - STARS_2             1 39830
## - STARS_3             1 40087
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=36461.91
## TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     LabelAppeal + AcidIndex + STARS_1 + STARS_2 + STARS_3 + STARS_4
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##                      Df   AIC
## <none>                  36462
## - FreeSulfurDioxide   1 36466
## - Chlorides           1 36468
## - TotalSulfurDioxide  1 36468
## - VolatileAcidity     1 36476
## - AcidIndex           1 36727
## - LabelAppeal         1 37000
## - STARS_1             1 37774
## - STARS_4             1 38692
## - STARS_2             1 39831
## - STARS_3             1 40089
# Coefficient table and fit statistics for the stepwise-selected
# negative binomial model.
summary(object = nb2)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS_1 + 
##     STARS_2 + STARS_3 + STARS_4, data = df_train_wine, init.theta = 40834.49538, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.3238  -0.6695  -0.0046   0.4511   3.5090  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         8.256e-01  4.375e-02  18.872  < 2e-16 ***
## VolatileAcidity    -2.972e-02  7.312e-03  -4.065 4.81e-05 ***
## Chlorides          -5.202e-02  1.838e-02  -2.830  0.00465 ** 
## FreeSulfurDioxide   9.959e-05  3.955e-05   2.518  0.01181 *  
## TotalSulfurDioxide  7.293e-05  2.553e-05   2.857  0.00428 ** 
## LabelAppeal         1.583e-01  6.811e-03  23.242  < 2e-16 ***
## AcidIndex          -7.947e-02  4.975e-03 -15.973  < 2e-16 ***
## STARS_1             7.701e-01  2.190e-02  35.161  < 2e-16 ***
## STARS_2             1.094e+00  2.042e-02  53.605  < 2e-16 ***
## STARS_3             1.219e+00  2.146e-02  56.792  < 2e-16 ***
## STARS_4             1.336e+00  2.706e-02  49.361  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(40834.5) family taken to be 1)
## 
##     Null deviance: 18305  on 10235  degrees of freedom
## Residual deviance: 10900  on 10225  degrees of freedom
## AIC: 36464
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  40834 
##           Std. Err.:  38408 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -36439.91

Multiple Linear Model 1

I wanted to see the relationship between the number of cases of wine sold and only those variables that have a supposed theoretical effect on it.

# OLS model restricted to label appeal and the four star-rating indicators.
lm1 <- lm(
  formula = TARGET ~ LabelAppeal + STARS_1 + STARS_2 + STARS_3 + STARS_4,
  data = df_train_wine
)
summary(object = lm1)
## 
## Call:
## lm(formula = TARGET ~ LabelAppeal + STARS_1 + STARS_2 + STARS_3 + 
##     STARS_4, data = df_train_wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.6638 -0.8091  0.1598  0.7774  5.8673 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.25029    0.02588   48.32   <2e-16 ***
## LabelAppeal  0.44122    0.01545   28.55   <2e-16 ***
## STARS_1      1.44909    0.03742   38.73   <2e-16 ***
## STARS_2      2.53104    0.03604   70.23   <2e-16 ***
## STARS_3      3.14872    0.04157   75.74   <2e-16 ***
## STARS_4      3.84918    0.06659   57.80   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.335 on 10230 degrees of freedom
## Multiple R-squared:  0.5203, Adjusted R-squared:  0.5201 
## F-statistic:  2220 on 5 and 10230 DF,  p-value: < 2.2e-16

Multiple Linear Model 2

# Backward AIC selection for OLS, starting from every predictor except
# CitricAcid, Density, pH and Sulphates.
lm2 <- stepAIC(
  object = lm(
    formula = TARGET ~ . - CitricAcid - Density - pH - Sulphates,
    data = df_train_wine
  ),
  direction = "backward"
)
## Start:  AIC=5437.79
## TARGET ~ (FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS_1 + 
##     STARS_2 + STARS_3 + STARS_4) - CitricAcid - Density - pH - 
##     Sulphates
## 
##                      Df Sum of Sq   RSS    AIC
## - FixedAcidity        1       0.4 17365 5436.0
## - ResidualSugar       1       0.7 17365 5436.2
## <none>                            17364 5437.8
## - Alcohol             1      10.2 17375 5441.8
## - FreeSulfurDioxide   1      18.6 17383 5446.7
## - TotalSulfurDioxide  1      22.7 17387 5449.1
## - Chlorides           1      24.8 17389 5450.4
## - VolatileAcidity     1      55.9 17420 5468.7
## - AcidIndex           1     674.0 18038 5825.6
## - LabelAppeal         1    1608.2 18972 6342.4
## - STARS_1             1    2343.8 19708 6731.8
## - STARS_4             1    5319.5 22684 8171.2
## - STARS_2             1    7674.9 25039 9182.4
## - STARS_3             1    8889.7 26254 9667.4
## 
## Step:  AIC=5436.03
## TARGET ~ VolatileAcidity + ResidualSugar + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Alcohol + LabelAppeal + AcidIndex + 
##     STARS_1 + STARS_2 + STARS_3 + STARS_4
## 
##                      Df Sum of Sq   RSS    AIC
## - ResidualSugar       1       0.7 17365 5434.4
## <none>                            17365 5436.0
## - Alcohol             1      10.2 17375 5440.1
## - FreeSulfurDioxide   1      18.7 17383 5445.0
## - TotalSulfurDioxide  1      22.6 17387 5447.3
## - Chlorides           1      24.8 17390 5448.7
## - VolatileAcidity     1      55.9 17421 5466.9
## - AcidIndex           1     690.5 18055 5833.2
## - LabelAppeal         1    1607.8 18972 6340.4
## - STARS_1             1    2343.4 19708 6729.8
## - STARS_4             1    5319.3 22684 8169.3
## - STARS_2             1    7675.6 25040 9180.9
## - STARS_3             1    8890.0 26255 9665.6
## 
## Step:  AIC=5434.42
## TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Alcohol + LabelAppeal + AcidIndex + STARS_1 + STARS_2 + STARS_3 + 
##     STARS_4
## 
##                      Df Sum of Sq   RSS    AIC
## <none>                            17365 5434.4
## - Alcohol             1      10.1 17376 5438.4
## - FreeSulfurDioxide   1      18.7 17384 5443.4
## - TotalSulfurDioxide  1      22.8 17388 5445.8
## - Chlorides           1      24.9 17390 5447.1
## - VolatileAcidity     1      56.0 17421 5465.4
## - AcidIndex           1     691.0 18056 5831.8
## - LabelAppeal         1    1607.9 18973 6338.9
## - STARS_1             1    2343.3 19709 6728.1
## - STARS_4             1    5321.2 22686 8168.4
## - STARS_2             1    7675.8 25041 9179.2
## - STARS_3             1    8891.8 26257 9664.6
# Coefficient table and fit statistics for the stepwise-selected OLS model.
summary(object = lm2)
## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Alcohol + LabelAppeal + AcidIndex + 
##     STARS_1 + STARS_2 + STARS_3 + STARS_4, data = df_train_wine)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9229 -0.8642  0.0179  0.8415  5.7387 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.801e+00  9.355e-02  29.945  < 2e-16 ***
## VolatileAcidity    -9.463e-02  1.648e-02  -5.741 9.66e-09 ***
## Chlorides          -1.589e-01  4.150e-02  -3.830 0.000129 ***
## FreeSulfurDioxide   2.988e-04  9.003e-05   3.319 0.000907 ***
## TotalSulfurDioxide  2.101e-04  5.738e-05   3.661 0.000253 ***
## Alcohol             8.699e-03  3.569e-03   2.437 0.014806 *  
## LabelAppeal         4.656e-01  1.513e-02  30.768  < 2e-16 ***
## AcidIndex          -1.983e-01  9.832e-03 -20.170  < 2e-16 ***
## STARS_1             1.364e+00  3.673e-02  37.143  < 2e-16 ***
## STARS_2             2.398e+00  3.567e-02  67.225  < 2e-16 ***
## STARS_3             2.984e+00  4.125e-02  72.354  < 2e-16 ***
## STARS_4             3.669e+00  6.556e-02  55.972  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.303 on 10224 degrees of freedom
## Multiple R-squared:  0.5432, Adjusted R-squared:  0.5427 
## F-statistic:  1105 on 11 and 10224 DF,  p-value: < 2.2e-16

Coefficient Interpretation

# Side-by-side text table of all six fitted models; columns (1)-(6) follow
# the argument order pm1, pm2, nb1, nb2, lm1, lm2.
stargazer(
  pm1, pm2, nb1, nb2, lm1, lm2,
  type = "text"
)
## 
## ============================================================================================================================================================
##                                                                               Dependent variable:                                                           
##                     ----------------------------------------------------------------------------------------------------------------------------------------
##                                                                                      TARGET                                                                 
##                       Poisson   glm: quasipoisson                    negative                                                OLS                            
##                                    link = log                        binomial                                                                               
##                         (1)            (2)                  (3)                     (4)                       (5)                           (6)             
## ------------------------------------------------------------------------------------------------------------------------------------------------------------
## FixedAcidity                         0.0002                                                                                                                 
##                                      (0.001)                                                                                                                
##                                                                                                                                                             
## VolatileAcidity      -0.030***      -0.029***            -0.030***               -0.030***                                               -0.095***          
##                       (0.007)        (0.007)              (0.007)                 (0.007)                                                 (0.016)           
##                                                                                                                                                             
## CitricAcid                            0.008                                                                                                                 
##                                      (0.006)                                                                                                                
##                                                                                                                                                             
## ResidualSugar                        0.0001                                                                                                                 
##                                     (0.0002)                                                                                                                
##                                                                                                                                                             
## Chlorides            -0.052***      -0.052***            -0.052***               -0.052***                                               -0.159***          
##                       (0.018)        (0.017)              (0.018)                 (0.018)                                                 (0.041)           
##                                                                                                                                                             
## FreeSulfurDioxide    0.0001**       0.0001***            0.0001**                0.0001**                                                0.0003***          
##                      (0.00004)      (0.00004)            (0.00004)               (0.00004)                                               (0.0001)           
##                                                                                                                                                             
## TotalSulfurDioxide   0.0001***      0.0001***            0.0001***               0.0001***                                               0.0002***          
##                      (0.00003)      (0.00002)            (0.00003)               (0.00003)                                               (0.0001)           
##                                                                                                                                                             
## Density                              -0.274               -0.276                                                                                            
##                                      (0.201)              (0.214)                                                                                           
##                                                                                                                                                             
## pH                                   -0.013*              -0.013                                                                                            
##                                      (0.008)              (0.009)                                                                                           
##                                                                                                                                                             
## Sulphates                            -0.010*              -0.010                                                                                            
##                                      (0.006)              (0.006)                                                                                           
##                                                                                                                                                             
## Alcohol                               0.002                0.002                                                                          0.009**           
##                                      (0.001)              (0.002)                                                                         (0.004)           
##                                                                                                                                                             
## LabelAppeal          0.158***       0.158***             0.159***                0.158***                   0.441***                     0.466***           
##                       (0.007)        (0.006)              (0.007)                 (0.007)                   (0.015)                       (0.015)           
##                                                                                                                                                             
## AcidIndex            -0.079***      -0.080***            -0.079***               -0.079***                                               -0.198***          
##                       (0.005)        (0.005)              (0.005)                 (0.005)                                                 (0.010)           
##                                                                                                                                                             
## STARS_1              0.770***       0.769***             0.769***                0.770***                   1.449***                     1.364***           
##                       (0.022)        (0.021)              (0.022)                 (0.022)                   (0.037)                       (0.037)           
##                                                                                                                                                             
## STARS_2              1.094***       1.093***             1.093***                1.094***                   2.531***                     2.398***           
##                       (0.020)        (0.019)              (0.020)                 (0.020)                   (0.036)                       (0.036)           
##                                                                                                                                                             
## STARS_3              1.219***       1.215***             1.216***                1.219***                   3.149***                     2.984***           
##                       (0.021)        (0.020)              (0.021)                 (0.021)                   (0.042)                       (0.041)           
##                                                                                                                                                             
## STARS_4              1.336***       1.332***             1.333***                1.336***                   3.849***                     3.669***           
##                       (0.027)        (0.025)              (0.027)                 (0.027)                   (0.067)                       (0.066)           
##                                                                                                                                                             
## Constant             0.826***       1.122***             1.123***                0.826***                   1.250***                     2.801***           
##                       (0.044)        (0.205)              (0.219)                 (0.044)                   (0.026)                       (0.094)           
##                                                                                                                                                             
## ------------------------------------------------------------------------------------------------------------------------------------------------------------
## Observations          10,236         10,236               10,236                  10,236                     10,236                       10,236            
## R2                                                                                                           0.520                         0.543            
## Adjusted R2                                                                                                  0.520                         0.543            
## Log Likelihood      -18,219.790                         -18,216.480             -18,220.960                                                                 
## theta                                             40,903.100 (38,481.510) 40,834.500 (38,408.060)                                                           
## Akaike Inf. Crit.   36,461.570                          36,462.970              36,463.910                                                                  
## Residual Std. Error                                                                                    1.335 (df = 10230)           1.303 (df = 10224)      
## F Statistic                                                                                       2,219.609*** (df = 5; 10230) 1,105.181*** (df = 11; 10224)
## ============================================================================================================================================================
## Note:                                                                                                                            *p<0.1; **p<0.05; ***p<0.01
# Back-transform the log-link STARS_1 coefficient (0.769, table columns 2-3)
# into a multiplicative effect on the expected count.
exp(x = 0.769)
## [1] 2.157608
# Back-transform the log-link STARS_2 coefficient (1.093, table columns 2-3)
# into a multiplicative effect on the expected count.
exp(x = 1.093)
## [1] 2.98321
# Back-transform the log-link STARS_3 coefficient (1.219, table columns 1
# and 4) into a multiplicative effect on the expected count.
exp(x = 1.219)
## [1] 3.383802
# Back-transform the log-link STARS_4 coefficient (1.333, table column 3)
# into a multiplicative effect on the expected count.
exp(x = 1.333)
## [1] 3.792404

Although I tried to use different variables (while still trying to make each model the best fit), there is no denying that the Poisson, quasipoisson, and both negative binomial models are very similar. In addition, for the negative binomial models, I got the warning message that the iteration limit had been reached while estimating theta. This could be because the data is equidispersed, where the variance and mean are equal to each other, or because it is underdispersed, where the variance is less than the mean. The very large theta estimates (about 40,800, with standard errors of similar size) are consistent with this: as theta grows, the negative binomial distribution approaches the Poisson, so the negative binomial models are effectively converging to a Poisson model. The negative binomial models do not seem to be the best fit for this data.

Poisson 1: For a one-unit increase in the label appeal rating, we expect the number of cases of wine sold to be multiplied by a factor of 1.1719 (an increase of about 17%), given that the other predictor variables in the model are held constant. Because the model uses a log link, an exponentiated coefficient is a multiplicative effect on the expected count rather than an additive one. This makes sense because, theoretically, many consumers will buy a wine because its label design is good.

Quasipoisson: For the 1-star indicator (going from 0 to 1, i.e. from not being rated to being rated one star), we expect the number of cases of wine sold to be multiplied by a factor of 2.1576, given that the other predictor variables in the model are held constant. This makes sense, especially going from not being rated to being rated one star. A buyer is not going to buy a case of wine if it has not been rated, and they will be more likely to buy a wine that has been rated than one that has not.

Negative Binomial 1: For the 2-star indicator (a wine rated 2 stars compared with the reference group of unrated wines, since each STARS indicator is a separate dummy variable), we expect the number of cases of wine sold to be multiplied by a factor of 2.9832, given that the other predictor variables in the model are held constant. This factor is larger than the one for 1 star, which makes sense because as the ratings increase, the number of sales should also increase.

Negative Binomial 2: For the 3-star indicator (a wine rated 3 stars compared with the reference group of unrated wines), we expect the number of cases of wine sold to be multiplied by a factor of 3.3838, given that the other predictor variables in the model are held constant. This factor is larger than the one for 2 stars, which makes sense because as the ratings increase, the number of sales should also increase.

Poisson 1: For the 4-star indicator (a wine rated 4 stars compared with the reference group of unrated wines), we expect the number of cases of wine sold to be multiplied by a factor of 3.7924, given that the other predictor variables in the model are held constant. This makes sense because as the ratings increase, the number of sales should also increase. This is also the highest rating, so the sales should be greater than they were for 1-star, 2-star, and 3-star rated bottles.

Select Models

Because the target variable is a count variable, I will not be selecting either of the OLS models.

Log likelihood is a measure of goodness of fit for the model. The larger the log likelihood, the better the model. Between the poisson 1, negative binomial 1 and 2, the log likelihoods are very similar. Log likelihood is also not a good determinant of the best model. In terms of AIC, the smaller the AIC the better the model fit. The AIC values for poisson 1, negative binomial 1 and 2 are very similar, but poisson 1 has a smaller log likelihood.

Because of the errors I got with the negative binomial models (which I talked about previously), and what I mentioned above in terms of AIC, I am going to select the poisson 1 model to continue with. I also used stepAIC in this model, as well as taking out variables that I saw did not have a significant effect on my target variable.

Predictions

# Score the held-out test set with the selected model (pm1) and keep the
# predictions next to the observed TARGET for a side-by-side comparison.
#
# predict() on a glm defaults to type = "link". For a log-link count model that
# is the log of the expected count, which is not comparable with TARGET.
# type = "response" returns the expected number of cases on TARGET's scale.
#
# NOTE: the console output printed after this chunk was produced on the link
# scale; re-knit the document to refresh it.
wine_predictions <- df_test_wine %>%
  mutate(
    predictions = predict(pm1, newdata = df_test_wine, type = "response")
  )
head(wine_predictions)
##    TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 4       3          5.7           0.385       0.04          18.8    -0.425
## 8       4          6.5          -1.220       0.34           1.4     0.040
## 19      5         10.0           0.230       0.27          14.1     0.033
## 22      2          6.5           0.240       0.24         -18.2    -0.182
## 28      6          8.3          -2.460       0.27          13.2     0.036
## 29      4          8.0          -0.010      -1.10          44.8    -0.350
##    FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 4                 22                115 0.99640 2.24      1.83     6.2
## 8                523                551 1.03236 3.20      0.50    11.6
## 19              -188                229 0.99880 3.14      0.88    11.0
## 22                15                 60 1.00916 3.19     -1.01     9.8
## 28                31                122 0.99408 1.95      0.02    13.0
## 29                17                129 1.03564 3.20      0.50    16.6
##    LabelAppeal AcidIndex STARS_1 STARS_2 STARS_3 STARS_4 predictions
## 4           -1         6       1       0       0       0   0.9817985
## 8            1         7       0       0       1       0   1.7726566
## 19           1        11       0       1       0       0   1.1936065
## 22          -1         7       1       0       0       0   0.8892970
## 28           1         7       0       0       0       1   1.8465231
## 29           0         9       0       1       0       0   1.2344147

This model did not do a great job predicting the number of wine cases purchased. However, it did follow a similar trend. Looking at the first row, where the target variable is 3, the model predicted that .9817 cases of wine would be purchased. Compare this to the second row, where the target variable is 4. The model predicted that 1.772 cases of wine would be purchased. Looking at the fourth row, where the target variable is 2, the model predicted that .889 cases of wine would be purchased. And the fifth row is the only one that has a 4 star rating and the model predicted that 1.846 cases would be purchased. Although it does not align with the exact numbers of the target variable, the model did pick up on the differences.

Confusion Matrix

library(caret)
## Loading required package: lattice
## 
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
## 
##     lift
# Expected number of cases for every training row, on the response scale.
#
# predict.glm() takes the scoring data through `newdata`. An argument called
# `data` is not part of its signature: it is absorbed by `...` and ignored, so
# the call only appeared to score df_train_wine. Naming the argument correctly
# makes the scored data explicit.
# The prediction is computed once and reused for both objects.
predicted <- predict(pm1, newdata = df_train_wine, type = "response")

df_train_wine$predicted <- predicted

# Compare the classes present in the observed and the rounded predicted counts.
levels(factor(df_train_wine$TARGET))
## [1] "0" "1" "2" "3" "4" "5" "6" "7" "8"
levels(factor(round(df_train_wine$predicted)))
## [1] "1" "2" "3" "4" "5" "6" "7" "8"

# Turn the expected counts into whole-case classes by rounding.
# The earlier recode `ifelse(predicted == 0, 1, 0)` replaced every prediction
# with 0 (an expected count is never exactly 0), so the matrix could only ever
# show a single predicted class. The predictions are left intact here.
predicted_cases <- round(df_train_wine$predicted)

# Both factors must share one level set so the table is square and the rows
# and columns line up; take the union of observed and predicted values.
case_levels <- sort(unique(c(df_train_wine$TARGET, predicted_cases)))

# `positive` only applies to two-class problems, so it is omitted for this
# multi-class comparison.
# NOTE: the console output printed after this chunk predates this change;
# re-knit the document to refresh it.
confusionMatrix(
  data      = factor(predicted_cases, levels = case_levels),
  reference = factor(df_train_wine$TARGET, levels = case_levels)
)
## Warning in confusionMatrix.default(reference = factor(df_train_wine$TARGET), :
## Levels are not in the same order for reference and data. Refactoring data to
## match.
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1    2    3    4    5    6    7    8
##          0 2189  200  876 2079 2559 1591  611  117   14
##          1    0    0    0    0    0    0    0    0    0
##          2    0    0    0    0    0    0    0    0    0
##          3    0    0    0    0    0    0    0    0    0
##          4    0    0    0    0    0    0    0    0    0
##          5    0    0    0    0    0    0    0    0    0
##          6    0    0    0    0    0    0    0    0    0
##          7    0    0    0    0    0    0    0    0    0
##          8    0    0    0    0    0    0    0    0    0
## 
## Overall Statistics
##                                           
##                Accuracy : 0.2139          
##                  95% CI : (0.2059, 0.2219)
##     No Information Rate : 0.25            
##     P-Value [Acc > NIR] : 1               
##                                           
##                   Kappa : 0               
##                                           
##  Mcnemar's Test P-Value : NA              
## 
## Statistics by Class:
## 
##                      Class: 0 Class: 1 Class: 2 Class: 3 Class: 4 Class: 5
## Sensitivity            1.0000  0.00000  0.00000   0.0000     0.00   0.0000
## Specificity            0.0000  1.00000  1.00000   1.0000     1.00   1.0000
## Pos Pred Value         0.2139      NaN      NaN      NaN      NaN      NaN
## Neg Pred Value            NaN  0.98046  0.91442   0.7969     0.75   0.8446
## Prevalence             0.2139  0.01954  0.08558   0.2031     0.25   0.1554
## Detection Rate         0.2139  0.00000  0.00000   0.0000     0.00   0.0000
## Detection Prevalence   1.0000  0.00000  0.00000   0.0000     0.00   0.0000
## Balanced Accuracy      0.5000  0.50000  0.50000   0.5000     0.50   0.5000
##                      Class: 6 Class: 7 Class: 8
## Sensitivity           0.00000  0.00000 0.000000
## Specificity           1.00000  1.00000 1.000000
## Pos Pred Value            NaN      NaN      NaN
## Neg Pred Value        0.94031  0.98857 0.998632
## Prevalence            0.05969  0.01143 0.001368
## Detection Rate        0.00000  0.00000 0.000000
## Detection Prevalence  0.00000  0.00000 0.000000
## Balanced Accuracy     0.50000  0.50000 0.500000

This model accurately predicted 21.39% of the data. Therefore its error rate is quite high (around 79%). You want sensitivity to be 1; the worst is 0. The best sensitivity was in class 0 (where the target values were zero), because every observation was assigned to class 0 in the matrix above; the rest of the classes all had a sensitivity of zero, indicating poor sensitivity. I am not very impressed with this model and I wonder if a negative binomial would have been the better model for this data set (if it didn’t give all of those errors).