Set Up

rm(list = ls())

Load Package

packages <- c("psych", 
              "stargazer", 
              "tidyverse", 
              "corrplot", 
              "ggplot2", 
              "data.table", 
              "car",
              "MASS",
              "Metrics",
              "vars",
              "gptstudio",
              "visdat",
              "reshape2",
              "corrplot",
              "pscl"
              
              )
 for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i]
                       , repos = "http://cran.rstudio.com/"
                       , dependencies = TRUE
                       )
    }
    library(packages[i], character.only = TRUE)
  }
## Warning: 程辑包'psych'是用R版本4.3.1 来建造的
## 
## Please cite as:
##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## corrplot 0.92 loaded
## 
## 
## 载入程辑包:'data.table'
## 
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## 
## 载入需要的程辑包:carData
## 
## 
## 载入程辑包:'car'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## 
## 
## 载入程辑包:'MASS'
## 
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## 
## 载入需要的程辑包:strucchange
## 
## 载入需要的程辑包:zoo
## 
## 
## 载入程辑包:'zoo'
## 
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## 载入需要的程辑包:sandwich
## 
## 
## 载入程辑包:'strucchange'
## 
## 
## The following object is masked from 'package:stringr':
## 
##     boundary
## 
## 
## 载入需要的程辑包:urca
## 
## 载入需要的程辑包:lmtest
## 
## 
## 载入程辑包:'reshape2'
## 
## 
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths
## 
## 
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Import and Split Data

Originaldf <- read.csv("D:/Study/Econometric/Assignment/Wine/wine-training-data.csv")

set.seed(123)

# Generate a vector of indices for the training set

train_index <- sample(x = nrow(Originaldf),  size = round(0.8 * nrow(Originaldf) ) )

# Create the training and testing sets

train <- Originaldf[train_index, ]

test  <- Originaldf[-train_index, ]  

Creat Label for my data

mylabel <- c( 'INDEX' = 'Index',
              'TARGET' = '#Purchased',
              'AcidIndex' = 'Method',
              'Alcohol' = 'Alcohol',
              'Chlorides' = 'Chloride',
              'CitricAcid' = 'CitricAcid',
              'Density' = 'Density',
              'FixedAcidity' = 'FixAcid',
              'FreeSulfurDioxide' = 'SulfurDioxide',
              'LabelAppeal' = 'MarketScore',
              'ResidualSugar' = 'Sugar',
              'STARS' = 'Rating',
              'Sulphates' = 'Sulfate',
              'TotalSulfurDioxide' = 'SumSulDioxide',
              'VolatileAcidity' = 'VolatileAcid',
              'pH' = 'pHValue'
  
)

Data Exploring

vis_dat(x = train) #visualize the data

vis_miss(x = train)

in the graph we can see in the dataset there are only two types of data and there is 4% of missing values.

#make the dataset become long for plotting.
trainlong <- melt(train) 
## No id variables; using all as measure variables
#plot the histogram of each variable
ggplot(trainlong, aes(x=value)) +
  geom_histogram(binwidth=1, fill="blue", color="black") +
  facet_wrap(~variable, scales="free", labeller = as_labeller(mylabel)) +
  theme(plot.title = element_text(hjust = 0.5))
## Warning: Removed 6578 rows containing non-finite values (`stat_bin()`).

#count the number of missing values in each variable
dplyr::summarise(.data = train, 
                 across(.cols = everything(),
                        .fns  = ~ sum(is.na(.x))
                       )
                ) %>%
    glimpse() 
## Rows: 1
## Columns: 16
## $ INDEX              <int> 0
## $ TARGET             <int> 0
## $ FixedAcidity       <int> 0
## $ VolatileAcidity    <int> 0
## $ CitricAcid         <int> 0
## $ ResidualSugar      <int> 494
## $ Chlorides          <int> 525
## $ FreeSulfurDioxide  <int> 539
## $ TotalSulfurDioxide <int> 532
## $ Density            <int> 0
## $ pH                 <int> 321
## $ Sulphates          <int> 960
## $ Alcohol            <int> 519
## $ LabelAppeal        <int> 0
## $ AcidIndex          <int> 0
## $ STARS              <int> 2688
#calculate the percentage of missing values in each variable
round(x = sapply(X = train,
                 FUN = function(df){100*sum(is.na(df==TRUE)/length(df))}),
      digits = 2)
##              INDEX             TARGET       FixedAcidity    VolatileAcidity 
##               0.00               0.00               0.00               0.00 
##         CitricAcid      ResidualSugar          Chlorides  FreeSulfurDioxide 
##               0.00               4.83               5.13               5.27 
## TotalSulfurDioxide            Density                 pH          Sulphates 
##               5.20               0.00               3.14               9.38 
##            Alcohol        LabelAppeal          AcidIndex              STARS 
##               5.07               0.00               0.00              26.26

In the histogram plot, we can see the average number of purchase is around 4, the fix acidity average is about 5. The average Score of the wine is around 0. The most common rating of the wine is 2 and 1. The NAs value are in pH, Sulphates, Alcohol, ResidualSugar, Chlorides, FreeSulfurDioxide, TotalSulfurDioxide and Stars.

corrplot(cor(train, use = "na.or.complete"), 
         type = "lower", 
         method = "square"
         )

In the correlation plots we can see Label Appeal and wine stars has a strong relationship with number of Purchased, Stars has strong relationship with marketing score

Data Preparation

#remove all NA values from the train dataset
trainC <- na.omit(train)

#print the structure of the train dataset
str(trainC)
## 'data.frame':    5147 obs. of  16 variables:
##  $ INDEX             : int  3172 3772 2319 11765 4253 8527 12393 3491 6449 11520 ...
##  $ TARGET            : int  3 3 3 4 4 4 0 0 6 6 ...
##  $ FixedAcidity      : num  -2.4 9.1 5.7 5.5 5.6 8.1 16.5 6.5 10.2 -3.3 ...
##  $ VolatileAcidity   : num  1.08 0.98 0.21 0.3 0.485 0.27 -1.72 0.19 -0.5 0.25 ...
##  $ CitricAcid        : num  0.25 0.58 1.32 1.2 0.16 0.33 1.79 0.49 -0.34 0.27 ...
##  $ ResidualSugar     : num  -10 33.6 1.6 67.6 12.6 ...
##  $ Chlorides         : num  -0.051 0.036 0.03 0.029 0.051 0.32 0.028 0.04 0.238 0.039 ...
##  $ FreeSulfurDioxide : num  321 44 277 28 193 171 65 31 23 37 ...
##  $ TotalSulfurDioxide: num  138 161 122 97 115 100 158 466 -526 482 ...
##  $ Density           : num  1 1.03 0.99 1.02 1.02 ...
##  $ pH                : num  4.47 3.06 3.94 3.23 4.15 4.04 3.26 4.28 3.2 3.27 ...
##  $ Sulphates         : num  -1.05 0.41 0.52 0.44 0.38 0.44 -0.47 0.68 0.39 -0.83 ...
##  $ Alcohol           : num  4.8 8.9 18.3 21.7 12 15.9 10.7 11.9 7.34 16.2 ...
##  $ LabelAppeal       : int  0 -1 -1 0 0 0 -1 0 1 0 ...
##  $ AcidIndex         : int  7 9 6 8 6 9 7 7 7 7 ...
##  $ STARS             : int  2 2 2 2 3 3 1 1 4 4 ...
##  - attr(*, "na.action")= 'omit' Named int [1:5089] 1 3 4 5 10 11 17 18 20 21 ...
##   ..- attr(*, "names")= chr [1:5089] "2463" "10419" "8718" "12483" ...

As the list above, to clean the missing value I have to drop almost 50% of my data, but I think it is worthy that if I impute those missing data with mean or median I will lose the accuracy of this study.

Model Building

Poisson Regression Model

Poisson1 <- glm(data = trainC, family = poisson, trainC$TARGET ~. -trainC$INDEX )

summary(Poisson1)
## 
## Call:
## glm(formula = trainC$TARGET ~ . - trainC$INDEX, family = poisson, 
##     data = trainC)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.594e+00  2.799e-01   5.695 1.24e-08 ***
## INDEX               1.680e-06  1.572e-06   1.069  0.28520    
## FixedAcidity       -1.914e-04  1.177e-03  -0.163  0.87083    
## VolatileAcidity    -2.623e-02  9.339e-03  -2.808  0.00498 ** 
## CitricAcid          2.977e-03  8.458e-03   0.352  0.72486    
## ResidualSugar      -9.270e-05  2.169e-04  -0.427  0.66905    
## Chlorides          -3.885e-02  2.294e-02  -1.694  0.09034 .  
## FreeSulfurDioxide   8.268e-05  4.959e-05   1.667  0.09543 .  
## TotalSulfurDioxide  1.440e-05  3.225e-05   0.446  0.65525    
## Density            -3.850e-01  2.751e-01  -1.399  0.16171    
## pH                 -8.063e-03  1.084e-02  -0.744  0.45691    
## Sulphates          -1.846e-03  7.848e-03  -0.235  0.81401    
## Alcohol             2.938e-03  1.975e-03   1.487  0.13697    
## LabelAppeal         1.725e-01  8.806e-03  19.595  < 2e-16 ***
## AcidIndex          -4.600e-02  6.535e-03  -7.039 1.93e-12 ***
## STARS               1.889e-01  8.337e-03  22.655  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4634.3  on 5146  degrees of freedom
## Residual deviance: 3186.6  on 5131  degrees of freedom
## AIC: 18539
## 
## Number of Fisher Scoring iterations: 5

The coefficient represent the change in the log count for each unit increase in the predictor, assuming the other predictors are held constant. For example, the coefficient for Alcohol is 2.938e-03. This means that for each unit increase in Alcohol, the log count of the TARGET variable is expected to increase by 2.938e-03, while all other variables are held constant. From this output, it appears that LabelAppeal, AcidIndex, and STARS are highly significant predictors. VolatileAcidity is also significant with a p-value less than 0.01. Other variables such as Alcohol, Density, FreeSulfurDioxide, and Chlorides are not statistically significant at common thresholds (like 0.05), suggesting they might not have a substantial effect on the TARGET variable in this model.

Poisson2 <- glm(data = trainC, family = poisson, trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + trainC$Alcohol -trainC$INDEX )

summary(Poisson2)
## 
## Call:
## glm(formula = trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + 
##     trainC$Alcohol - trainC$INDEX, family = poisson, data = trainC)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.819404   0.027444  29.857   <2e-16 ***
## trainC$STARS       0.195858   0.008262  23.706   <2e-16 ***
## trainC$LabelAppeal 0.170212   0.008770  19.409   <2e-16 ***
## trainC$Alcohol     0.003734   0.001967   1.898   0.0577 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4634.3  on 5146  degrees of freedom
## Residual deviance: 3258.9  on 5143  degrees of freedom
## AIC: 18587
## 
## Number of Fisher Scoring iterations: 4

In summary, STARS, LabelAppeal, and Alcohol are all significant predictors, with STARS and LabelAppeal being highly significant.

Negative binomial model

binomial1 <- glm.nb(data = trainC, trainC$TARGET ~. -trainC$INDEX)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(binomial1)
## 
## Call:
## glm.nb(formula = trainC$TARGET ~ . - trainC$INDEX, data = trainC, 
##     init.theta = 140865.1965, link = log)
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.594e+00  2.799e-01   5.694 1.24e-08 ***
## INDEX               1.680e-06  1.572e-06   1.069  0.28520    
## FixedAcidity       -1.914e-04  1.177e-03  -0.163  0.87084    
## VolatileAcidity    -2.623e-02  9.339e-03  -2.808  0.00498 ** 
## CitricAcid          2.977e-03  8.458e-03   0.352  0.72486    
## ResidualSugar      -9.270e-05  2.169e-04  -0.427  0.66905    
## Chlorides          -3.885e-02  2.294e-02  -1.694  0.09034 .  
## FreeSulfurDioxide   8.268e-05  4.959e-05   1.667  0.09544 .  
## TotalSulfurDioxide  1.440e-05  3.225e-05   0.446  0.65525    
## Density            -3.850e-01  2.751e-01  -1.399  0.16171    
## pH                 -8.063e-03  1.084e-02  -0.744  0.45692    
## Sulphates          -1.846e-03  7.849e-03  -0.235  0.81401    
## Alcohol             2.938e-03  1.975e-03   1.487  0.13697    
## LabelAppeal         1.725e-01  8.806e-03  19.595  < 2e-16 ***
## AcidIndex          -4.600e-02  6.535e-03  -7.039 1.93e-12 ***
## STARS               1.889e-01  8.338e-03  22.654  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(140865.2) family taken to be 1)
## 
##     Null deviance: 4634.2  on 5146  degrees of freedom
## Residual deviance: 3186.5  on 5131  degrees of freedom
## AIC: 18541
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  140865 
##           Std. Err.:  264201 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -18506.62

The variables LabelAppeal, AcidIndex, and STARS are highly statistically significant. The large value of theta suggests that there might be overdispersion in the data, which means that there is more variation in the data than would be expected based on the Negative Binomial model.

binomial2 <- glm.nb(data = trainC, trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + trainC$Alcohol -trainC$INDEX )
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(binomial2)
## 
## Call:
## glm.nb(formula = trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + 
##     trainC$Alcohol - trainC$INDEX, data = trainC, init.theta = 141467.724, 
##     link = log)
## 
## Coefficients:
##                    Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        0.819403   0.027445  29.856   <2e-16 ***
## trainC$STARS       0.195858   0.008262  23.705   <2e-16 ***
## trainC$LabelAppeal 0.170212   0.008770  19.409   <2e-16 ***
## trainC$Alcohol     0.003734   0.001967   1.898   0.0577 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(141467.7) family taken to be 1)
## 
##     Null deviance: 4634.2  on 5146  degrees of freedom
## Residual deviance: 3258.8  on 5143  degrees of freedom
## AIC: 18589
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  141468 
##           Std. Err.:  267865 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -18578.91

Both STARS and LabelAppeal have very low p-values, indicating that these predictors are significantly associated with the target variable. The Alcohol variable has a p-value close to the usual 0.05 significance level. This suggests that Alcohol may have a marginal effect on the target variable.

Multilinear Regression Model

Linear1 <- lm(data = trainC, trainC$TARGET ~. -trainC$INDEX)

summary(Linear1)
## 
## Call:
## lm(formula = trainC$TARGET ~ . - trainC$INDEX, data = trainC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0127 -0.5129  0.1165  0.7209  3.1918 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         4.528e+00  6.169e-01   7.341 2.46e-13 ***
## INDEX               5.388e-06  3.481e-06   1.548  0.12174    
## FixedAcidity       -6.378e-06  2.606e-03  -0.002  0.99805    
## VolatileAcidity    -9.849e-02  2.061e-02  -4.779 1.81e-06 ***
## CitricAcid          9.579e-03  1.875e-02   0.511  0.60942    
## ResidualSugar      -3.450e-04  4.794e-04  -0.720  0.47180    
## Chlorides          -1.393e-01  5.087e-02  -2.738  0.00620 ** 
## FreeSulfurDioxide   2.844e-04  1.096e-04   2.594  0.00952 ** 
## TotalSulfurDioxide  5.342e-05  7.104e-05   0.752  0.45209    
## Density            -1.274e+00  6.063e-01  -2.101  0.03568 *  
## pH                 -2.261e-02  2.396e-02  -0.943  0.34554    
## Sulphates          -5.686e-03  1.733e-02  -0.328  0.74286    
## Alcohol             1.298e-02  4.354e-03   2.980  0.00290 ** 
## LabelAppeal         6.281e-01  1.933e-02  32.486  < 2e-16 ***
## AcidIndex          -1.561e-01  1.370e-02 -11.398  < 2e-16 ***
## STARS               7.325e-01  1.904e-02  38.472  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.154 on 5131 degrees of freedom
## Multiple R-squared:  0.4416, Adjusted R-squared:  0.4399 
## F-statistic: 270.5 on 15 and 5131 DF,  p-value: < 2.2e-16

Based on this output, it appears that LabelAppeal, AcidIndex, and STARS are the strongest predictors of TARGET. Variables like VolatileAcidity, Chlorides, FreeSulfurDioxide, Density, and Alcohol are also statistically significant at less stringent levels, but their effects are smaller.

Linear2 <- lm(data = trainC, trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + trainC$Alcohol -trainC$INDEX )

summary(Linear2)
## 
## Call:
## lm(formula = trainC$TARGET ~ trainC$STARS + trainC$LabelAppeal + 
##     trainC$Alcohol - trainC$INDEX, data = trainC)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0040 -0.5211  0.1321  0.7502  3.1332 
## 
## Coefficients:
##                    Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        1.937985   0.060686  31.934  < 2e-16 ***
## trainC$STARS       0.759083   0.019223  39.488  < 2e-16 ***
## trainC$LabelAppeal 0.620253   0.019606  31.635  < 2e-16 ***
## trainC$Alcohol     0.015291   0.004411   3.467 0.000531 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.173 on 5143 degrees of freedom
## Multiple R-squared:  0.4213, Adjusted R-squared:  0.421 
## F-statistic:  1248 on 3 and 5143 DF,  p-value: < 2.2e-16

In conclusion, this model suggests that STARS, LabelAppeal, and Alcohol are significant predictors of TARGET. Among these, STARS has the highest impact, followed by LabelAppeal and then Alcohol.

stargazer(binomial1, binomial2, Poisson1,Poisson2,Linear1, Linear2, type = "text", title = "Model Summary")
## 
## Model Summary
## ====================================================================================================================================================
##                                                                           Dependent variable:                                                       
##                     --------------------------------------------------------------------------------------------------------------------------------
##                                                                                  TARGET                                                             
##                                          negative                              Poisson                                 OLS                          
##                                          binomial                                                                                                   
##                                (1)                       (2)               (3)        (4)                (5)                         (6)            
## ----------------------------------------------------------------------------------------------------------------------------------------------------
## INDEX                        0.00000                                     0.00000                       0.00001                                      
##                             (0.00000)                                   (0.00000)                     (0.00000)                                     
##                                                                                                                                                     
## FixedAcidity                 -0.0002                                     -0.0002                       -0.00001                                     
##                              (0.001)                                     (0.001)                       (0.003)                                      
##                                                                                                                                                     
## VolatileAcidity             -0.026***                                   -0.026***                     -0.098***                                     
##                              (0.009)                                     (0.009)                       (0.021)                                      
##                                                                                                                                                     
## CitricAcid                    0.003                                       0.003                         0.010                                       
##                              (0.008)                                     (0.008)                       (0.019)                                      
##                                                                                                                                                     
## ResidualSugar                -0.0001                                     -0.0001                       -0.0003                                      
##                             (0.0002)                                     (0.0002)                      (0.0005)                                     
##                                                                                                                                                     
## Chlorides                    -0.039*                                     -0.039*                      -0.139***                                     
##                              (0.023)                                     (0.023)                       (0.051)                                      
##                                                                                                                                                     
## FreeSulfurDioxide            0.0001*                                     0.0001*                      0.0003***                                     
##                             (0.00005)                                   (0.00005)                      (0.0001)                                     
##                                                                                                                                                     
## TotalSulfurDioxide           0.00001                                     0.00001                        0.0001                                      
##                             (0.00003)                                   (0.00003)                      (0.0001)                                     
##                                                                                                                                                     
## Density                      -0.385                                       -0.385                       -1.274**                                     
##                              (0.275)                                     (0.275)                       (0.606)                                      
##                                                                                                                                                     
## pH                           -0.008                                       -0.008                        -0.023                                      
##                              (0.011)                                     (0.011)                       (0.024)                                      
##                                                                                                                                                     
## Sulphates                    -0.002                                       -0.002                        -0.006                                      
##                              (0.008)                                     (0.008)                       (0.017)                                      
##                                                                                                                                                     
## Alcohol                       0.003                                       0.003                        0.013***                                     
##                              (0.002)                                     (0.002)                       (0.004)                                      
##                                                                                                                                                     
## LabelAppeal                 0.173***                                     0.173***                      0.628***                                     
##                              (0.009)                                     (0.009)                       (0.019)                                      
##                                                                                                                                                     
## AcidIndex                   -0.046***                                   -0.046***                     -0.156***                                     
##                              (0.007)                                     (0.007)                       (0.014)                                      
##                                                                                                                                                     
## STARS                       0.189***                                     0.189***                      0.733***                                     
##                              (0.008)                                     (0.008)                       (0.019)                                      
##                                                                                                                                                     
## STARS                                                 0.196***                      0.196***                                      0.759***          
##                                                        (0.008)                      (0.008)                                        (0.019)          
##                                                                                                                                                     
## LabelAppeal                                           0.170***                      0.170***                                      0.620***          
##                                                        (0.009)                      (0.009)                                        (0.020)          
##                                                                                                                                                     
## Alcohol                                                0.004*                        0.004*                                       0.015***          
##                                                        (0.002)                      (0.002)                                        (0.004)          
##                                                                                                                                                     
## Constant                    1.594***                  0.819***           1.594***   0.819***           4.528***                   1.938***          
##                              (0.280)                   (0.027)           (0.280)    (0.027)            (0.617)                     (0.061)          
##                                                                                                                                                     
## ----------------------------------------------------------------------------------------------------------------------------------------------------
## Observations                  5,147                     5,147             5,147      5,147              5,147                       5,147           
## R2                                                                                                      0.442                       0.421           
## Adjusted R2                                                                                             0.440                       0.421           
## Log Likelihood             -9,254.311                -9,290.457         -9,253.269 -9,289.415                                                       
## theta               140,865.200 (264,201.100) 141,467.700 (267,865.100)                                                                             
## Akaike Inf. Crit.          18,540.620                18,588.910         18,538.540 18,586.830                                                       
## Residual Std. Error                                                                               1.154 (df = 5131)           1.173 (df = 5143)     
## F Statistic                                                                                   270.488*** (df = 15; 5131) 1,248.199*** (df = 3; 5143)
## ====================================================================================================================================================
## Note:                                                                                                                    *p<0.1; **p<0.05; ***p<0.01

In all the model, Stars and number of purchased are significance predictor which make sense that people tend to buy wine that has higher rating and if they like that wine, they will give it a good rate which form a good circulation. Also, I have the same result for my Poisson and binomial model which I guess is because the Poisson regression model assumes that the mean and variance of the distribution are equaland the negative binomial regression model is a generalization of the Poisson regression model that allows for overdispersion, where the variance is greater than the mean. So this could means that the data is not overdispersed and thus fits well to the Poisson distribution’s assumption of equidispersion.

Model Selecting

# Calculate the pR2 for the binomial1 model
b1R2 <- pR2(binomial1)
## fitting null model for pseudo-r2
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
b1R2
##           llh       llhNull            G2      McFadden          r2ML 
## -9.253311e+03 -9.977172e+03  1.447722e+03  7.255172e-02  2.451792e-01 
##          r2CU 
##  2.503656e-01
# Calculate the pR2 for the binomial2 model
b2R2 <- pR2(binomial2)
## fitting null model for pseudo-r2
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
b2R2
##           llh       llhNull            G2      McFadden          r2ML 
## -9289.4567713 -9977.1722408  1375.4309390     0.0689289     0.2345027 
##          r2CU 
##     0.2394633
# Calculate the pR2 for the Poisson1 model
P1R2 <- pR2(Poisson1)
## fitting null model for pseudo-r2
P1R2
##           llh       llhNull            G2      McFadden          r2ML 
## -9.253269e+03 -9.977139e+03  1.447739e+03  7.255282e-02  2.451817e-01 
##          r2CU 
##  2.503682e-01
# Calculate the pR2 for the Poisson2 model
P2R2 <- pR2(Poisson2)
## fitting null model for pseudo-r2
P2R2
##           llh       llhNull            G2      McFadden          r2ML 
## -9289.4154484 -9977.1386049  1375.4463130     0.0689299     0.2345050 
##          r2CU 
##     0.2394657

Based on the outputs, the best model appears to be the multiple linear regression model 2 because it has fewer predictos which makes it easier to interpret and less likely to overfit the data. Also this model has an adjusted R-squared of 0.421, which suggests that it explains about 42.1% of the variability in TARGET. Although this isn’t dramatically higher than some of the other models, it is quite good considering the model’s simplicity.

Predicting

predictedbuys <-predict(object = Linear2, 
                newdata = test, 
                interval = "prediction")
## Warning: 'newdata'必需有2559行 但变量里有5147行
summary(predictedbuys)
##       fit             lwr               upr       
##  Min.   :1.458   Min.   :-0.8451   Min.   :3.761  
##  1st Qu.:2.891   1st Qu.: 0.5901   1st Qu.:5.192  
##  Median :3.612   Median : 1.3119   Median :5.912  
##  Mean   :3.674   Mean   : 1.3729   Mean   :5.975  
##  3rd Qu.:4.358   3rd Qu.: 2.0576   3rd Qu.:6.659  
##  Max.   :6.577   Max.   : 4.2727   Max.   :8.882