Data Exploration

train <- read.csv('/Users/jordanglendrange/Documents/Data 621/Homework4/wine-training-data.csv')
test <- read.csv('/Users/jordanglendrange/Documents/Data 621/Homework4/wine-evaluation-data.csv')

head(train)
##   INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1     1      3          3.2           1.160      -0.98          54.2    -0.567
## 2     2      3          4.5           0.160      -0.81          26.1    -0.425
## 3     4      5          7.1           2.640      -0.88          14.8     0.037
## 4     5      3          5.7           0.385       0.04          18.8    -0.425
## 5     6      4          8.0           0.330      -1.26           9.4        NA
## 6     7      0         11.3           0.320       0.59           2.2     0.556
##   FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                NA                268 0.99280 3.33     -0.59     9.9
## 2                15               -327 1.02792 3.38      0.70      NA
## 3               214                142 0.99518 3.12      0.48    22.0
## 4                22                115 0.99640 2.24      1.83     6.2
## 5              -167                108 0.99457 3.12      1.77    13.7
## 6               -37                 15 0.99940 3.20      1.29    15.4
##   LabelAppeal AcidIndex STARS
## 1           0         8     2
## 2          -1         7     3
## 3          -1         8     3
## 4          -1         6     1
## 5           0         9     2
## 6           0        11    NA

Dimensions of the training set are 12,795 rows and 16 columns.

dim(train)
## [1] 12795    16

The median and mean of cases purchased are around 3. Glad to see there aren’t a lot of 0 values. Some of the columns may need to be normalized for our models.

summary(train)
##      INDEX           TARGET       FixedAcidity     VolatileAcidity  
##  Min.   :    1   Min.   :0.000   Min.   :-18.100   Min.   :-2.7900  
##  1st Qu.: 4038   1st Qu.:2.000   1st Qu.:  5.200   1st Qu.: 0.1300  
##  Median : 8110   Median :3.000   Median :  6.900   Median : 0.2800  
##  Mean   : 8070   Mean   :3.029   Mean   :  7.076   Mean   : 0.3241  
##  3rd Qu.:12106   3rd Qu.:4.000   3rd Qu.:  9.500   3rd Qu.: 0.6400  
##  Max.   :16129   Max.   :8.000   Max.   : 34.400   Max.   : 3.6800  
##                                                                     
##    CitricAcid      ResidualSugar        Chlorides       FreeSulfurDioxide
##  Min.   :-3.2400   Min.   :-127.800   Min.   :-1.1710   Min.   :-555.00  
##  1st Qu.: 0.0300   1st Qu.:  -2.000   1st Qu.:-0.0310   1st Qu.:   0.00  
##  Median : 0.3100   Median :   3.900   Median : 0.0460   Median :  30.00  
##  Mean   : 0.3084   Mean   :   5.419   Mean   : 0.0548   Mean   :  30.85  
##  3rd Qu.: 0.5800   3rd Qu.:  15.900   3rd Qu.: 0.1530   3rd Qu.:  70.00  
##  Max.   : 3.8600   Max.   : 141.150   Max.   : 1.3510   Max.   : 623.00  
##                    NA's   :616        NA's   :638       NA's   :647      
##  TotalSulfurDioxide    Density             pH          Sulphates      
##  Min.   :-823.0     Min.   :0.8881   Min.   :0.480   Min.   :-3.1300  
##  1st Qu.:  27.0     1st Qu.:0.9877   1st Qu.:2.960   1st Qu.: 0.2800  
##  Median : 123.0     Median :0.9945   Median :3.200   Median : 0.5000  
##  Mean   : 120.7     Mean   :0.9942   Mean   :3.208   Mean   : 0.5271  
##  3rd Qu.: 208.0     3rd Qu.:1.0005   3rd Qu.:3.470   3rd Qu.: 0.8600  
##  Max.   :1057.0     Max.   :1.0992   Max.   :6.130   Max.   : 4.2400  
##  NA's   :682                         NA's   :395     NA's   :1210     
##     Alcohol       LabelAppeal          AcidIndex          STARS      
##  Min.   :-4.70   Min.   :-2.000000   Min.   : 4.000   Min.   :1.000  
##  1st Qu.: 9.00   1st Qu.:-1.000000   1st Qu.: 7.000   1st Qu.:1.000  
##  Median :10.40   Median : 0.000000   Median : 8.000   Median :2.000  
##  Mean   :10.49   Mean   :-0.009066   Mean   : 7.773   Mean   :2.042  
##  3rd Qu.:12.40   3rd Qu.: 1.000000   3rd Qu.: 8.000   3rd Qu.:3.000  
##  Max.   :26.50   Max.   : 2.000000   Max.   :17.000   Max.   :4.000  
##  NA's   :653                                          NA's   :3359

Interesting. The target variable is almost a normal distribution if we ignore how many people ordered 0 cases.

hist(train$TARGET)

hist(train$STARS)

Looks like we will have to clean up some of the null values. 8 columns contains nulls.

sapply(train, function(x) sum(is.na(x)))
##              INDEX             TARGET       FixedAcidity    VolatileAcidity 
##                  0                  0                  0                  0 
##         CitricAcid      ResidualSugar          Chlorides  FreeSulfurDioxide 
##                  0                616                638                647 
## TotalSulfurDioxide            Density                 pH          Sulphates 
##                682                  0                395               1210 
##            Alcohol        LabelAppeal          AcidIndex              STARS 
##                653                  0                  0               3359

Data Preparation

For data prep the only thing I will be doing is filling in my missing values. Since all the columns are numeric there is no need to create dummy variables.

train$ResidualSugar[is.na(train$ResidualSugar)] <- median(train$ResidualSugar, na.rm=T)
train$Chlorides[is.na(train$Chlorides)] <- median(train$Chlorides, na.rm=T)
train$FreeSulfurDioxide[is.na(train$FreeSulfurDioxide)] <- median(train$FreeSulfurDioxide, na.rm=T)
train$TotalSulfurDioxide[is.na(train$TotalSulfurDioxide)] <- median(train$TotalSulfurDioxide, na.rm=T)
train$pH[is.na(train$pH)] <- median(train$pH, na.rm=T)
train$Sulphates[is.na(train$Sulphates)] <- median(train$Sulphates, na.rm=T)
train$Alcohol[is.na(train$Alcohol)] <- median(train$Alcohol, na.rm=T)
train$STARS[is.na(train$STARS)] <- median(train$STARS, na.rm=T)

Now there are no null values.

sapply(train, function(x) sum(is.na(x)))
##              INDEX             TARGET       FixedAcidity    VolatileAcidity 
##                  0                  0                  0                  0 
##         CitricAcid      ResidualSugar          Chlorides  FreeSulfurDioxide 
##                  0                  0                  0                  0 
## TotalSulfurDioxide            Density                 pH          Sulphates 
##                  0                  0                  0                  0 
##            Alcohol        LabelAppeal          AcidIndex              STARS 
##                  0                  0                  0                  0

Need to make the same transformations on our test data

test$ResidualSugar[is.na(test$ResidualSugar)] <- median(test$ResidualSugar, na.rm=T)
test$Chlorides[is.na(test$Chlorides)] <- median(test$Chlorides, na.rm=T)
test$FreeSulfurDioxide[is.na(test$FreeSulfurDioxide)] <- median(test$FreeSulfurDioxide, na.rm=T)
test$TotalSulfurDioxide[is.na(test$TotalSulfurDioxide)] <- median(test$TotalSulfurDioxide, na.rm=T)
test$pH[is.na(test$pH)] <- median(test$pH, na.rm=T)
test$Sulphates[is.na(test$Sulphates)] <- median(test$Sulphates, na.rm=T)
test$Alcohol[is.na(test$Alcohol)] <- median(test$Alcohol, na.rm=T)
test$STARS[is.na(test$STARS)] <- median(test$STARS, na.rm=T)

Data Models

I am going to split the variables into 2 categories. 1 model will have all the variables and the other will not use any of the chemical variables. My hypothesis is that Stars and Label Appeal will have the biggest impact on the model.

Multiple Regression

The first model with only 2 variables actually gives us a pretty good result, but I’d say including all the variables did help our R^2 value. In this case Id pick the second model.

model_mr1 <- lm(TARGET~STARS+LabelAppeal,data=train)

summary(model_mr1)
## 
## Call:
## lm(formula = TARGET ~ STARS + LabelAppeal, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.1498 -0.8165  0.4204  1.1835  4.4204 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  1.39516    0.04371   31.92   <2e-16 ***
## STARS        0.80712    0.02019   39.97   <2e-16 ***
## LabelAppeal  0.57019    0.01757   32.45   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.697 on 12792 degrees of freedom
## Multiple R-squared:  0.224,  Adjusted R-squared:  0.2239 
## F-statistic:  1846 on 2 and 12792 DF,  p-value: < 2.2e-16
model_mr2 <- lm(TARGET~FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train)

summary(model_mr2)
## 
## Call:
## lm(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.2211 -0.7540  0.3598  1.1254  4.3550 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.355e+00  5.517e-01   9.707  < 2e-16 ***
## FixedAcidity       -1.168e-03  2.315e-03  -0.505 0.613911    
## VolatileAcidity    -1.549e-01  1.838e-02  -8.429  < 2e-16 ***
## CitricAcid          3.976e-02  1.673e-02   2.377 0.017476 *  
## ResidualSugar       4.716e-04  4.371e-04   1.079 0.280670    
## Chlorides          -1.931e-01  4.638e-02  -4.164 3.15e-05 ***
## FreeSulfurDioxide   4.286e-04  9.941e-05   4.312 1.63e-05 ***
## TotalSulfurDioxide  3.098e-04  6.387e-05   4.851 1.25e-06 ***
## Density            -1.274e+00  5.427e-01  -2.347 0.018959 *  
## pH                 -6.387e-02  2.154e-02  -2.965 0.003028 ** 
## Sulphates          -5.485e-02  1.623e-02  -3.380 0.000728 ***
## Alcohol             1.883e-02  3.972e-03   4.739 2.17e-06 ***
## LabelAppeal         5.945e-01  1.686e-02  35.250  < 2e-16 ***
## AcidIndex          -3.259e-01  1.117e-02 -29.169  < 2e-16 ***
## STARS               7.478e-01  1.946e-02  38.431  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.626 on 12780 degrees of freedom
## Multiple R-squared:  0.2879, Adjusted R-squared:  0.2871 
## F-statistic: 369.1 on 14 and 12780 DF,  p-value: < 2.2e-16

Negative Binary Model

In our first model the chi^2 value is 22859-12794=10,065 and we have 2 degrees of freedom. This gives us an extremly low p value which means this is a good model for predicting. However our second model is even better because the residual deviance is smaller.

library(MASS)

model_nb1 <- glm.nb(TARGET~STARS+LabelAppeal,data=train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model_nb1)
## 
## Call:
## glm.nb(formula = TARGET ~ STARS + LabelAppeal, data = train, 
##     init.theta = 33588.12602, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9013  -0.3672   0.2576   0.6511   2.0647  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 0.573593   0.014922   38.44   <2e-16 ***
## STARS       0.243263   0.006440   37.78   <2e-16 ***
## LabelAppeal 0.188569   0.005998   31.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(33588.13) family taken to be 1)
## 
##     Null deviance: 22859  on 12794  degrees of freedom
## Residual deviance: 19470  on 12792  degrees of freedom
## AIC: 51422
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  33588 
##           Std. Err.:  63797 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -51413.64
model_nb2 <- glm.nb(TARGET~FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model_nb2)
## 
## Call:
## glm.nb(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = train, init.theta = 39421.17987, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4818  -0.5261   0.2041   0.6365   2.5503  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         2.017e+00  1.957e-01  10.305  < 2e-16 ***
## FixedAcidity       -4.515e-04  8.196e-04  -0.551 0.581705    
## VolatileAcidity    -5.045e-02  6.493e-03  -7.770 7.83e-15 ***
## CitricAcid          1.331e-02  5.892e-03   2.258 0.023931 *  
## ResidualSugar       1.443e-04  1.545e-04   0.934 0.350498    
## Chlorides          -6.005e-02  1.645e-02  -3.650 0.000262 ***
## FreeSulfurDioxide   1.424e-04  3.513e-05   4.054 5.04e-05 ***
## TotalSulfurDioxide  1.065e-04  2.269e-05   4.695 2.67e-06 ***
## Density            -4.315e-01  1.921e-01  -2.247 0.024655 *  
## pH                 -2.389e-02  7.639e-03  -3.128 0.001763 ** 
## Sulphates          -1.877e-02  5.739e-03  -3.271 0.001073 ** 
## Alcohol             5.368e-03  1.410e-03   3.806 0.000141 ***
## LabelAppeal         1.965e-01  6.021e-03  32.633  < 2e-16 ***
## AcidIndex          -1.222e-01  4.464e-03 -27.380  < 2e-16 ***
## STARS               2.198e-01  6.468e-03  33.984  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(39421.18) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 18419  on 12780  degrees of freedom
## AIC: 50394
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  39421 
##           Std. Err.:  59707 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -50361.62

Poisson Model

The deviance in the is almost the same as the Negative Binary Model.

model_p1 <- glm(TARGET~STARS+LabelAppeal,data=train, family='poisson')

summary(model_p1)
## 
## Call:
## glm(formula = TARGET ~ STARS + LabelAppeal, family = "poisson", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9014  -0.3672   0.2577   0.6512   2.0648  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) 0.573588   0.014922   38.44   <2e-16 ***
## STARS       0.243266   0.006439   37.78   <2e-16 ***
## LabelAppeal 0.188568   0.005998   31.44   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 19472  on 12792  degrees of freedom
## AIC: 51420
## 
## Number of Fisher Scoring iterations: 5
model_p2 <- glm(TARGET~FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train, family="poisson")

summary(model_p2)
## 
## Call:
## glm(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, family = "poisson", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.4819  -0.5261   0.2041   0.6365   2.5504  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         2.017e+00  1.957e-01  10.305  < 2e-16 ***
## FixedAcidity       -4.515e-04  8.195e-04  -0.551 0.581697    
## VolatileAcidity    -5.045e-02  6.492e-03  -7.771 7.82e-15 ***
## CitricAcid          1.331e-02  5.892e-03   2.258 0.023925 *  
## ResidualSugar       1.442e-04  1.545e-04   0.934 0.350499    
## Chlorides          -6.005e-02  1.645e-02  -3.650 0.000262 ***
## FreeSulfurDioxide   1.424e-04  3.513e-05   4.054 5.04e-05 ***
## TotalSulfurDioxide  1.065e-04  2.268e-05   4.695 2.67e-06 ***
## Density            -4.315e-01  1.921e-01  -2.247 0.024651 *  
## pH                 -2.389e-02  7.639e-03  -3.128 0.001762 ** 
## Sulphates          -1.877e-02  5.739e-03  -3.271 0.001073 ** 
## Alcohol             5.368e-03  1.410e-03   3.807 0.000141 ***
## LabelAppeal         1.965e-01  6.021e-03  32.634  < 2e-16 ***
## AcidIndex          -1.222e-01  4.463e-03 -27.381  < 2e-16 ***
## STARS               2.198e-01  6.468e-03  33.986  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 18419  on 12780  degrees of freedom
## AIC: 50391
## 
## Number of Fisher Scoring iterations: 5

Select Models

I am going to select the second Negative Binary Model I put together.

library(AER)
## Loading required package: car
## Loading required package: carData
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 4.0.5
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
## Loading required package: survival
dispersiontest(model_p2, trafo=1)
## 
##  Overdispersion test
## 
## data:  model_p2
## z = -2.8111, p-value = 0.9975
## alternative hypothesis: true alpha is greater than 0
## sample estimates:
##       alpha 
## -0.03690101

Here were are testing if our model is better than a random model and it is!

modelnull <- glm(TARGET~1, data=train, family='poisson')

anova(modelnull, model_p2, test="Chisq")
## Analysis of Deviance Table
## 
## Model 1: TARGET ~ 1
## Model 2: TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar + 
##     Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + Density + 
##     pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS
##   Resid. Df Resid. Dev Df Deviance  Pr(>Chi)    
## 1     12794      22861                          
## 2     12780      18420 14   4441.4 < 2.2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Last step is to predict the Target value for the test data.

test$TARGET <- predict(model_mr2, newdata=test)
head(test)
##   IN   TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1  3 3.169742          5.4          -0.860       0.27         -10.7     0.092
## 2  9 3.325782         12.4           0.385      -0.76         -19.7     1.169
## 3 10 1.718621          7.2           1.750       0.17         -33.0     0.065
## 4 18 1.709240          6.2           0.100       1.80           1.0    -0.179
## 5 21 2.217892         11.4           0.210       0.28           1.2     0.038
## 6 30 4.904551         17.6           0.040      -1.15           1.4     0.535
##   FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                23                398 0.98527 5.02      0.64   12.30
## 2               -37                 68 0.99048 3.37      1.09   16.00
## 3                 9                 76 1.04641 4.61      0.68    8.55
## 4               104                 89 0.98877 3.20      2.11   12.30
## 5                70                 53 1.02899 2.54     -0.07    4.80
## 6              -250                140 0.95028 3.06     -0.02   11.40
##   LabelAppeal AcidIndex STARS
## 1          -1         6     2
## 2           0         6     2
## 3           0         8     1
## 4          -1         8     1
## 5           0        10     2
## 6           1         8     4