First, i will import the data.

wtrain <- read.csv("wine-training-data.csv")
weval <- read.csv("wine-evaluation-data.csv")

DATA EXPLORATION

First, I will remove the index row. Then, I will look at the first rows, to get a sense of the data and see what types of variable each column contains.

wtrain <- wtrain[,-1]
head(wtrain)
##   TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1      3          3.2           1.160      -0.98          54.2    -0.567
## 2      3          4.5           0.160      -0.81          26.1    -0.425
## 3      5          7.1           2.640      -0.88          14.8     0.037
## 4      3          5.7           0.385       0.04          18.8    -0.425
## 5      4          8.0           0.330      -1.26           9.4        NA
## 6      0         11.3           0.320       0.59           2.2     0.556
##   FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                NA                268 0.99280 3.33     -0.59     9.9
## 2                15               -327 1.02792 3.38      0.70      NA
## 3               214                142 0.99518 3.12      0.48    22.0
## 4                22                115 0.99640 2.24      1.83     6.2
## 5              -167                108 0.99457 3.12      1.77    13.7
## 6               -37                 15 0.99940 3.20      1.29    15.4
##   LabelAppeal AcidIndex STARS
## 1           0         8     2
## 2          -1         7     3
## 3          -1         8     3
## 4          -1         6     1
## 5           0         9     2
## 6           0        11    NA
#apply same transformations to weval
weval <- weval[,-1]

I willuse a box plot for each variable so I can see the outliers.

boxplot(wtrain)

As we can see, CitricAcid, FreeSulfurDioxide and TotalSulfurDioxide have many outliers. So we will remove this and look again at the box plots of the other variables.

I will look at a summary of each variable below:

summary(wtrain)
##      TARGET       FixedAcidity     VolatileAcidity     CitricAcid     
##  Min.   :0.000   Min.   :-18.100   Min.   :-2.7900   Min.   :-3.2400  
##  1st Qu.:2.000   1st Qu.:  5.200   1st Qu.: 0.1300   1st Qu.: 0.0300  
##  Median :3.000   Median :  6.900   Median : 0.2800   Median : 0.3100  
##  Mean   :3.029   Mean   :  7.076   Mean   : 0.3241   Mean   : 0.3084  
##  3rd Qu.:4.000   3rd Qu.:  9.500   3rd Qu.: 0.6400   3rd Qu.: 0.5800  
##  Max.   :8.000   Max.   : 34.400   Max.   : 3.6800   Max.   : 3.8600  
##                                                                       
##  ResidualSugar        Chlorides       FreeSulfurDioxide TotalSulfurDioxide
##  Min.   :-127.800   Min.   :-1.1710   Min.   :-555.00   Min.   :-823.0    
##  1st Qu.:  -2.000   1st Qu.:-0.0310   1st Qu.:   0.00   1st Qu.:  27.0    
##  Median :   3.900   Median : 0.0460   Median :  30.00   Median : 123.0    
##  Mean   :   5.419   Mean   : 0.0548   Mean   :  30.85   Mean   : 120.7    
##  3rd Qu.:  15.900   3rd Qu.: 0.1530   3rd Qu.:  70.00   3rd Qu.: 208.0    
##  Max.   : 141.150   Max.   : 1.3510   Max.   : 623.00   Max.   :1057.0    
##  NA's   :616        NA's   :638       NA's   :647       NA's   :682       
##     Density             pH          Sulphates          Alcohol     
##  Min.   :0.8881   Min.   :0.480   Min.   :-3.1300   Min.   :-4.70  
##  1st Qu.:0.9877   1st Qu.:2.960   1st Qu.: 0.2800   1st Qu.: 9.00  
##  Median :0.9945   Median :3.200   Median : 0.5000   Median :10.40  
##  Mean   :0.9942   Mean   :3.208   Mean   : 0.5271   Mean   :10.49  
##  3rd Qu.:1.0005   3rd Qu.:3.470   3rd Qu.: 0.8600   3rd Qu.:12.40  
##  Max.   :1.0992   Max.   :6.130   Max.   : 4.2400   Max.   :26.50  
##                   NA's   :395     NA's   :1210      NA's   :653    
##   LabelAppeal          AcidIndex          STARS      
##  Min.   :-2.000000   Min.   : 4.000   Min.   :1.000  
##  1st Qu.:-1.000000   1st Qu.: 7.000   1st Qu.:1.000  
##  Median : 0.000000   Median : 8.000   Median :2.000  
##  Mean   :-0.009066   Mean   : 7.773   Mean   :2.042  
##  3rd Qu.: 1.000000   3rd Qu.: 8.000   3rd Qu.:3.000  
##  Max.   : 2.000000   Max.   :17.000   Max.   :4.000  
##                                       NA's   :3359

We can see that ResidualSugar, Chlorides, FreeSulfurDioxide, TotalSulfurDioxide, pH, Sulphates, Alcohol, and STARS have missing values. We will need to adjust those to the mean of that variable. I am choosing to adjust the missing values to the mean for each varaiable, because many variables a 0 rating wouldn’t make sense, and would skew the distribution of values.

wtrain$ResidualSugar[is.na(wtrain$ResidualSugar)] <- mean(wtrain$ResidualSugar, na.rm = TRUE)
wtrain$Chlorides[is.na(wtrain$Chlorides)] <- mean(wtrain$Chlorides, na.rm = TRUE)
wtrain$FreeSulfurDioxide[is.na(wtrain$FreeSulfurDioxide)] <- mean(wtrain$FreeSulfurDioxide, na.rm = TRUE)
wtrain$TotalSulfurDioxide[is.na(wtrain$TotalSulfurDioxide)] <- mean(wtrain$TotalSulfurDioxide, na.rm = TRUE)
wtrain$pH[is.na(wtrain$pH)] <- mean(wtrain$pH, na.rm = TRUE)
wtrain$Sulphates[is.na(wtrain$Sulphates)] <- mean(wtrain$Sulphates, na.rm = TRUE)
wtrain$Alcohol[is.na(wtrain$Alcohol)] <- mean(wtrain$Alcohol, na.rm = TRUE)
wtrain$STARS[is.na(wtrain$STARS)] <- mean(wtrain$STARS, na.rm = TRUE)

#apply same transformations to weval
weval$ResidualSugar[is.na(weval$ResidualSugar)] <- mean(weval$ResidualSugar, na.rm = TRUE)
weval$Chlorides[is.na(weval$Chlorides)] <- mean(weval$Chlorides, na.rm = TRUE)
weval$FreeSulfurDioxide[is.na(weval$FreeSulfurDioxide)] <- mean(weval$FreeSulfurDioxide, na.rm = TRUE)
weval$TotalSulfurDioxide[is.na(weval$TotalSulfurDioxide)] <- mean(weval$TotalSulfurDioxide, na.rm = TRUE)
weval$pH[is.na(weval$pH)] <- mean(weval$pH, na.rm = TRUE)
weval$Sulphates[is.na(weval$Sulphates)] <- mean(weval$Sulphates, na.rm = TRUE)
weval$Alcohol[is.na(weval$Alcohol)] <- mean(weval$Alcohol, na.rm = TRUE)
weval$STARS[is.na(weval$STARS)] <- mean(weval$STARS, na.rm = TRUE)

Below we will look at the correlation between each variable:

M <-cor(wtrain)
corrplot(M, type="upper", order="hclust",
         col=brewer.pal(n=8, name="RdYlBu"))

We can see that there is a slight correlation between TARGET and LabelAppeal, LabelAppeal and STARS, TARGET and STARS, and TARGET and AcidIndex. The correlations are not very strong.

Next, we can see that more than 20% of the TARGET values are 0, making this data set a good candidate for Zero-inflated Poisson regression.

table <- prop.table(table(wtrain$TARGET))
barplot(table, main="TARGET values")

DATA PREPARATION

First, we will clean up negative values, since it’s impossible for these variables to have true negative values. This does raise concern with the studies quality of data collection. Likely, these were typos. We will check the distriubution of the absolute value of the negative values and compare them to the positive values.

mel_train <-
  wtrain %>% 
  reshape2::melt() %>% 
  mutate(is_negative = as.factor(value < 0),
         abs_value = abs(value))
## No id variables; using all as measure variables
## Warning: The `printer` argument is deprecated as of rlang 0.3.0.
## This warning is displayed once per session.
  # create filter within ggplot to plot true false for question is negative
ggplot(data = mel_train, aes(x = variable, y = abs_value)) + 
  geom_boxplot(aes(fill = is_negative)) + 
  facet_wrap( ~ variable, scales = "free")

For all variables with negative values, the distributions for the positive and negative values are similar. Therefore, I will change the negative values to their absolute value.

wtrain <-abs(wtrain)
#apply same transformations to weval
weval[,2:15] <-abs(weval[,2:15]) #start from col 2 bc col 1 is target which are all NAs at this point

Then, we will check if we will transform the data mathematically for the various models:

#m1 data
m1d <- wtrain

#m2 data / squared variables
m2d <- (wtrain)^(2)
m2d <- droplevels(m2d)


#m3 data / squart root variables
m3d <- (wtrain)^(.5)


#m4 data / log variables
m4d <- log(wtrain + 1)

wine_cordf <- data.frame(matrix(0, ncol = 14, nrow = 4))
colnames(wine_cordf) <- colnames(wtrain[,2:15])
rownames(wine_cordf) <- c("model1", "model2", "model3", "model4")
wine_cordf[1,] <- cor(m1d$TARGET, m1d[,2:15])
wine_cordf[2,] <- cor(m2d$TARGET, m2d[,2:15])
wine_cordf[3,] <- cor(m3d$TARGET, m3d[,2:15])
wine_cordf[4,] <- cor(m4d$TARGET, m4d[,2:15])
wine_cordf
##        FixedAcidity VolatileAcidity  CitricAcid ResidualSugar   Chlorides
## model1  -0.05298434     -0.07019454 0.013953316   0.001761491 -0.02778189
## model2  -0.03792503     -0.04190950 0.005058022  -0.006142565 -0.02262462
## model3  -0.05321207     -0.08263212 0.029618651   0.014678978 -0.03597507
## model4  -0.04732340     -0.08119830 0.019977454   0.025074043 -0.02743230
##        FreeSulfurDioxide TotalSulfurDioxide     Density           pH
## model1       0.023598901         0.03334379 -0.03551750 -0.009280513
## model2       0.007765413         0.00148680 -0.03451527  0.002489103
## model3       0.045749617         0.07372928 -0.03117329 -0.017773911
## model4       0.078825226         0.10323804 -0.03168417 -0.017031483
##          Sulphates    Alcohol LabelAppeal  AcidIndex     STARS
## model1 -0.03127247 0.06172638 -0.00454383 -0.2460494 0.3866978
## model2 -0.01543401 0.07653891  0.06737239 -0.1943273 0.5016558
## model3 -0.04364746 0.03719616 -0.01865838 -0.2631733 0.2687399
## model4 -0.04293995 0.03395682 -0.02813892 -0.2579731 0.2602482

We will proceed with model 2, since there is the strogest correlation for model 2 with the Alcohol, LabelAppeal, AcidIndex, and STARS variables.

BUILD MODELS

We will build a binomial model based on the original data. First model:

m1 <- zeroinfl(formula = TARGET ~ . , data=m1d)

summary(m1)
## 
## Call:
## zeroinfl(formula = TARGET ~ ., data = m1d)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -1.8185 -0.5196  0.1127  0.5185  5.5603 
## 
## Count model coefficients (poisson with log link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.256e+00  2.080e-01   6.038 1.56e-09 ***
## FixedAcidity       -1.743e-04  1.091e-03  -0.160   0.8731    
## VolatileAcidity    -1.946e-02  9.786e-03  -1.989   0.0467 *  
## CitricAcid          4.018e-03  8.824e-03   0.455   0.6489    
## ResidualSugar      -1.393e-04  2.178e-04  -0.640   0.5224    
## Chlorides          -3.696e-02  2.328e-02  -1.588   0.1123    
## FreeSulfurDioxide   2.638e-05  4.900e-05   0.538   0.5904    
## TotalSulfurDioxide -5.241e-05  3.223e-05  -1.626   0.1039    
## Density            -3.513e-01  2.034e-01  -1.727   0.0841 .  
## pH                  1.303e-02  8.006e-03   1.627   0.1037    
## Sulphates           5.565e-03  8.509e-03   0.654   0.5131    
## Alcohol             7.328e-03  1.501e-03   4.884 1.04e-06 ***
## LabelAppeal        -1.411e-02  8.581e-03  -1.644   0.1001    
## AcidIndex          -1.105e-02  4.990e-03  -2.215   0.0268 *  
## STARS               1.903e-01  6.103e-03  31.178  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -5.9025248  1.0055680  -5.870 4.36e-09 ***
## FixedAcidity        0.0037153  0.0051678   0.719   0.4722    
## VolatileAcidity     0.2395493  0.0441675   5.424 5.84e-08 ***
## CitricAcid         -0.1136557  0.0454074  -2.503   0.0123 *  
## ResidualSugar      -0.0005434  0.0010747  -0.506   0.6131    
## Chlorides           0.0967292  0.1120046   0.864   0.3878    
## FreeSulfurDioxide  -0.0004919  0.0002496  -1.971   0.0488 *  
## TotalSulfurDioxide -0.0010105  0.0001809  -5.585 2.34e-08 ***
## Density             0.8559649  0.9853089   0.869   0.3850    
## pH                  0.1971050  0.0391356   5.036 4.74e-07 ***
## Sulphates           0.1621231  0.0387686   4.182 2.89e-05 ***
## Alcohol             0.0066050  0.0073618   0.897   0.3696    
## LabelAppeal         0.0069597  0.0419046   0.166   0.8681    
## AcidIndex           0.4847777  0.0191899  25.262  < 2e-16 ***
## STARS              -0.5120659  0.0353232 -14.497  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 36 
## Log-likelihood: -2.336e+04 on 30 Df

Then, we will build a model based on the second model data:

m2 <- glm(formula = TARGET ~ . , data=m2d)
summary(m2)
## 
## Call:
## glm(formula = TARGET ~ ., data = m2d)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -20.013   -7.452   -1.715    5.329   51.673  
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         1.372e+01  1.600e+00   8.575  < 2e-16 ***
## FixedAcidity       -5.314e-04  7.206e-04  -0.737  0.46086    
## VolatileAcidity    -2.682e-01  6.351e-02  -4.223 2.43e-05 ***
## CitricAcid          3.200e-02  5.415e-02   0.591  0.55457    
## ResidualSugar      -4.326e-05  3.744e-05  -1.155  0.24798    
## Chlorides          -1.138e+00  4.221e-01  -2.695  0.00704 ** 
## FreeSulfurDioxide   2.943e-06  1.915e-06   1.536  0.12445    
## TotalSulfurDioxide  2.216e-07  7.267e-07   0.305  0.76039    
## Density            -4.299e+00  1.579e+00  -2.723  0.00648 ** 
## pH                 -1.296e-02  1.894e-02  -0.684  0.49380    
## Sulphates          -4.927e-02  4.443e-02  -1.109  0.26742    
## Alcohol             6.383e-03  1.043e-03   6.122 9.49e-10 ***
## LabelAppeal         5.262e-01  7.957e-02   6.614 3.90e-11 ***
## AcidIndex          -7.219e-02  3.568e-03 -20.232  < 2e-16 ***
## STARS               1.508e+00  2.341e-02  64.405  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 88.55802)
## 
##     Null deviance: 1578705  on 12794  degrees of freedom
## Residual deviance: 1131771  on 12780  degrees of freedom
## AIC: 93696
## 
## Number of Fisher Scoring iterations: 2

This second model lowered the Pr(>|z|) value, so I will be proceeding with model 2. SELECT MODEL

wtrain$predict <- predict(m1, m1d, type='response')

I will plug in the evaluation data below:

#run
final_Pred =predict(m2, newdata=weval)
hist(final_Pred)

head(final_Pred)
##         1         2         3         4         5         6 
## 12.304513 10.631887  9.569285 10.682256 11.547331 15.053990

We can see the above distribution of the estimated cases purchased and the predictions of number cases bought for the first 5 wines of the weval dataframe.