Data621_HW4_Wine_Prediction

Wine Prediction

Our objective is to build a count regression model to predict the number of cases of wine that will be sold given certain properties of the wine.

Load the required libraries

library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(ggplot2)
library(stringr)
library(mice)

## Warning: package 'mice' was built under R version 3.6.2

## 
## Attaching package: 'mice'

## The following objects are masked from 'package:base':
## 
##     cbind, rbind

library(tidyr)
library(corrplot)

## Warning: package 'corrplot' was built under R version 3.6.3

## corrplot 0.84 loaded

library(VIM)

## Warning: package 'VIM' was built under R version 3.6.2

## Loading required package: colorspace

## Loading required package: grid

## Loading required package: data.table

## 
## Attaching package: 'data.table'

## The following objects are masked from 'package:dplyr':
## 
##     between, first, last

## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.

## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues

## 
## Attaching package: 'VIM'

## The following object is masked from 'package:datasets':
## 
##     sleep

library(MASS)

## 
## Attaching package: 'MASS'

## The following object is masked from 'package:dplyr':
## 
##     select

library(magrittr)

## 
## Attaching package: 'magrittr'

## The following object is masked from 'package:tidyr':
## 
##     extract

library(pscl)

## Warning: package 'pscl' was built under R version 3.6.3

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Loading data and doing the exploratory data analysis

wine_raw <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/HW-5/data/wine-training-data.csv",
                     header = TRUE, row.names = 1)

print(head(wine_raw))

##   TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1      3          3.2           1.160      -0.98          54.2    -0.567
## 2      3          4.5           0.160      -0.81          26.1    -0.425
## 4      5          7.1           2.640      -0.88          14.8     0.037
## 5      3          5.7           0.385       0.04          18.8    -0.425
## 6      4          8.0           0.330      -1.26           9.4        NA
## 7      0         11.3           0.320       0.59           2.2     0.556
##   FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                NA                268 0.99280 3.33     -0.59     9.9
## 2                15               -327 1.02792 3.38      0.70      NA
## 4               214                142 0.99518 3.12      0.48    22.0
## 5                22                115 0.99640 2.24      1.83     6.2
## 6              -167                108 0.99457 3.12      1.77    13.7
## 7               -37                 15 0.99940 3.20      1.29    15.4
##   LabelAppeal AcidIndex STARS
## 1           0         8     2
## 2          -1         7     3
## 4          -1         8     3
## 5          -1         6     1
## 6           0         9     2
## 7           0        11    NA

print(dim(wine_raw))

## [1] 12795    15

print(str(wine_raw))

## 'data.frame':    12795 obs. of  15 variables:
##  $ TARGET            : int  3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity      : num  3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
##  $ VolatileAcidity   : num  1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
##  $ CitricAcid        : num  -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
##  $ ResidualSugar     : num  54.2 26.1 14.8 18.8 9.4 ...
##  $ Chlorides         : num  -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
##  $ FreeSulfurDioxide : num  NA 15 214 22 -167 -37 287 523 -213 62 ...
##  $ TotalSulfurDioxide: num  268 -327 142 115 108 15 156 551 NA 180 ...
##  $ Density           : num  0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                : num  3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Sulphates         : num  -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
##  $ Alcohol           : num  9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal       : int  0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex         : int  8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS             : int  2 3 3 1 2 NA NA 3 NA 4 ...
## NULL

print(summary(wine_raw))

##      TARGET       FixedAcidity     VolatileAcidity     CitricAcid     
##  Min.   :0.000   Min.   :-18.100   Min.   :-2.7900   Min.   :-3.2400  
##  1st Qu.:2.000   1st Qu.:  5.200   1st Qu.: 0.1300   1st Qu.: 0.0300  
##  Median :3.000   Median :  6.900   Median : 0.2800   Median : 0.3100  
##  Mean   :3.029   Mean   :  7.076   Mean   : 0.3241   Mean   : 0.3084  
##  3rd Qu.:4.000   3rd Qu.:  9.500   3rd Qu.: 0.6400   3rd Qu.: 0.5800  
##  Max.   :8.000   Max.   : 34.400   Max.   : 3.6800   Max.   : 3.8600  
##                                                                       
##  ResidualSugar        Chlorides       FreeSulfurDioxide TotalSulfurDioxide
##  Min.   :-127.800   Min.   :-1.1710   Min.   :-555.00   Min.   :-823.0    
##  1st Qu.:  -2.000   1st Qu.:-0.0310   1st Qu.:   0.00   1st Qu.:  27.0    
##  Median :   3.900   Median : 0.0460   Median :  30.00   Median : 123.0    
##  Mean   :   5.419   Mean   : 0.0548   Mean   :  30.85   Mean   : 120.7    
##  3rd Qu.:  15.900   3rd Qu.: 0.1530   3rd Qu.:  70.00   3rd Qu.: 208.0    
##  Max.   : 141.150   Max.   : 1.3510   Max.   : 623.00   Max.   :1057.0    
##  NA's   :616        NA's   :638       NA's   :647       NA's   :682       
##     Density             pH          Sulphates          Alcohol     
##  Min.   :0.8881   Min.   :0.480   Min.   :-3.1300   Min.   :-4.70  
##  1st Qu.:0.9877   1st Qu.:2.960   1st Qu.: 0.2800   1st Qu.: 9.00  
##  Median :0.9945   Median :3.200   Median : 0.5000   Median :10.40  
##  Mean   :0.9942   Mean   :3.208   Mean   : 0.5271   Mean   :10.49  
##  3rd Qu.:1.0005   3rd Qu.:3.470   3rd Qu.: 0.8600   3rd Qu.:12.40  
##  Max.   :1.0992   Max.   :6.130   Max.   : 4.2400   Max.   :26.50  
##                   NA's   :395     NA's   :1210      NA's   :653    
##   LabelAppeal          AcidIndex          STARS      
##  Min.   :-2.000000   Min.   : 4.000   Min.   :1.000  
##  1st Qu.:-1.000000   1st Qu.: 7.000   1st Qu.:1.000  
##  Median : 0.000000   Median : 8.000   Median :2.000  
##  Mean   :-0.009066   Mean   : 7.773   Mean   :2.042  
##  3rd Qu.: 1.000000   3rd Qu.: 8.000   3rd Qu.:3.000  
##  Max.   : 2.000000   Max.   :17.000   Max.   :4.000  
##                                       NA's   :3359

Check blank or missing data

We are now checking if there is any blank or NA data

### Blank data
train_missing_df <- data.frame(apply(wine_raw, 2, function(x) length(which(x == ''))))
print(train_missing_df)

##                    apply.wine_raw..2..function.x..length.which.x.........
## TARGET                                                                  0
## FixedAcidity                                                            0
## VolatileAcidity                                                         0
## CitricAcid                                                              0
## ResidualSugar                                                           0
## Chlorides                                                               0
## FreeSulfurDioxide                                                       0
## TotalSulfurDioxide                                                      0
## Density                                                                 0
## pH                                                                      0
## Sulphates                                                               0
## Alcohol                                                                 0
## LabelAppeal                                                             0
## AcidIndex                                                               0
## STARS                                                                   0

### NA data
train_na_df1 <- data.frame(apply(wine_raw, 2, function(x) length(which(is.na(x)))))
print(train_na_df1)

##                    apply.wine_raw..2..function.x..length.which.is.na.x....
## TARGET                                                                   0
## FixedAcidity                                                             0
## VolatileAcidity                                                          0
## CitricAcid                                                               0
## ResidualSugar                                                          616
## Chlorides                                                              638
## FreeSulfurDioxide                                                      647
## TotalSulfurDioxide                                                     682
## Density                                                                  0
## pH                                                                     395
## Sulphates                                                             1210
## Alcohol                                                                653
## LabelAppeal                                                              0
## AcidIndex                                                                0
## STARS                                                                 3359

So there is no data with blank column data but there is data with missing or NA column data. We will work on imputing this missing / NA data, but let us first explore each column before that.

Target variable

print(unique(wine_raw$TARGET))

## [1] 3 5 4 0 6 7 2 1 8

table(wine_raw$TARGET)

## 
##    0    1    2    3    4    5    6    7    8 
## 2734  244 1091 2611 3177 2014  765  142   17

barplot(table(wine_raw$TARGET))

class(wine_raw$TARGET)

## [1] "integer"

summary(wine_raw$TARGET)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   3.000   3.029   4.000   8.000

FixedAcidity

summary(wine_raw$FixedAcidity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -18.100   5.200   6.900   7.076   9.500  34.400

boxplot(wine_raw$FixedAcidity)

ggplot(wine_raw, aes(FixedAcidity)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

VolatileAcidity

summary(wine_raw$VolatileAcidity)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -2.7900  0.1300  0.2800  0.3241  0.6400  3.6800

boxplot(wine_raw$VolatileAcidity)

ggplot(wine_raw, aes(VolatileAcidity)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

CitricAcid

summary(wine_raw$CitricAcid)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.2400  0.0300  0.3100  0.3084  0.5800  3.8600

boxplot(wine_raw$CitricAcid)

ggplot(wine_raw, aes(CitricAcid)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

ResidualSugar

summary(wine_raw$ResidualSugar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -127.800   -2.000    3.900    5.419   15.900  141.150      616

boxplot(wine_raw$ResidualSugar)

ggplot(wine_raw, aes(ResidualSugar)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 616 rows containing non-finite values (stat_bin).

chlorides

summary(wine_raw$Chlorides)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -1.1710 -0.0310  0.0460  0.0548  0.1530  1.3510     638

boxplot(wine_raw$Chlorides)

ggplot(wine_raw, aes(Chlorides)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 638 rows containing non-finite values (stat_bin).

FreeSulfurDioxide

summary(wine_raw$FreeSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -555.00    0.00   30.00   30.85   70.00  623.00     647

boxplot(wine_raw$FreeSulfurDioxide)

ggplot(wine_raw, aes(FreeSulfurDioxide)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 647 rows containing non-finite values (stat_bin).

TotalSulfurDioxide

summary(wine_raw$TotalSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -823.0    27.0   123.0   120.7   208.0  1057.0     682

boxplot(wine_raw$TotalSulfurDioxide)

ggplot(wine_raw, aes(TotalSulfurDioxide)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 682 rows containing non-finite values (stat_bin).

Density

summary(wine_raw$Density)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.8881  0.9877  0.9945  0.9942  1.0005  1.0992

boxplot(wine_raw$Density)

ggplot(wine_raw, aes(Density)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

pH

summary(wine_raw$pH)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.480   2.960   3.200   3.208   3.470   6.130     395

boxplot(wine_raw$pH)

ggplot(wine_raw, aes(pH)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 395 rows containing non-finite values (stat_bin).

Sulphates

summary(wine_raw$Sulphates)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -3.1300  0.2800  0.5000  0.5271  0.8600  4.2400    1210

boxplot(wine_raw$Sulphates)

ggplot(wine_raw, aes(Sulphates)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 1210 rows containing non-finite values (stat_bin).

Alcohol

summary(wine_raw$Alcohol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -4.70    9.00   10.40   10.49   12.40   26.50     653

boxplot(wine_raw$Alcohol)

ggplot(wine_raw, aes(Alcohol)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 653 rows containing non-finite values (stat_bin).

LabelAppeal

summary(wine_raw$LabelAppeal)

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max. 
## -2.000000 -1.000000  0.000000 -0.009066  1.000000  2.000000

unique(wine_raw$LabelAppeal)

## [1]  0 -1  1  2 -2

boxplot(wine_raw$LabelAppeal)

ggplot(wine_raw, aes(LabelAppeal)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

AcidIndex

summary(wine_raw$AcidIndex)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   4.000   7.000   8.000   7.773   8.000  17.000

unique(wine_raw$AcidIndex)

##  [1]  8  7  6  9 11  5 10 14 13 12 16 15 17  4

boxplot(wine_raw$AcidIndex)

ggplot(wine_raw, aes(AcidIndex)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

STARS

summary(wine_raw$STARS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.042   3.000   4.000    3359

unique(wine_raw$STARS)

## [1]  2  3  1 NA  4

boxplot(wine_raw$STARS)

ggplot(wine_raw, aes(STARS)) + geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## Warning: Removed 3359 rows containing non-finite values (stat_bin).

Imputing Missing data

### Imputing missing values
summary(wine_raw$ResidualSugar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -127.800   -2.000    3.900    5.419   15.900  141.150      616

summary(wine_raw$Chlorides)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -1.1710 -0.0310  0.0460  0.0548  0.1530  1.3510     638

summary(wine_raw$FreeSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -555.00    0.00   30.00   30.85   70.00  623.00     647

summary(wine_raw$TotalSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -823.0    27.0   123.0   120.7   208.0  1057.0     682

summary(wine_raw$pH)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.480   2.960   3.200   3.208   3.470   6.130     395

summary(wine_raw$Sulphates)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -3.1300  0.2800  0.5000  0.5271  0.8600  4.2400    1210

summary(wine_raw$Alcohol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -4.70    9.00   10.40   10.49   12.40   26.50     653

summary(wine_raw$STARS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.042   3.000   4.000    3359

As we see above the column STARS has a very big number of data missing, so we won’t impute this column. Rather we will remove this column first for our impute efforts and then will put it back to the imputed dataset.

wine_raw3 <- subset(wine_raw, select = -c(STARS))

aggr_plot3 <- aggr(wine_raw3, col=c('navyblue','red'), 
                  numbers=TRUE, sortVars=TRUE, 
                  labels=names(wine_raw3), cex.axis=.7, 
                  gap=3, ylab=c("Histogram of missing data","Pattern"))

## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

## 
##  Variables sorted by number of missings: 
##            Variable      Count
##           Sulphates 0.09456819
##  TotalSulfurDioxide 0.05330207
##             Alcohol 0.05103556
##   FreeSulfurDioxide 0.05056663
##           Chlorides 0.04986323
##       ResidualSugar 0.04814381
##                  pH 0.03087143
##              TARGET 0.00000000
##        FixedAcidity 0.00000000
##     VolatileAcidity 0.00000000
##          CitricAcid 0.00000000
##             Density 0.00000000
##         LabelAppeal 0.00000000
##           AcidIndex 0.00000000

wine_raw_imputed <- mice(data = wine_raw3, m = 1,
                         method = "pmm", maxit = 5, seed = 500)

## 
##  iter imp variable
##   1   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol
##   2   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol
##   3   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol
##   4   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol
##   5   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol

wine_raw_imputed_df <- mice::complete(wine_raw_imputed, 1)
wine_raw_imputed_df_bk <- wine_raw_imputed_df

wine_raw_imputed_df <- cbind(wine_raw_imputed_df, wine_raw[, "STARS"])

colnames(wine_raw_imputed_df)[colnames(wine_raw_imputed_df) == 'wine_raw[, "STARS"]'] <-
  "STARS"

We have imputed all fields except STARS. Let’s compare the raw dataset columns having NA and the same columns in the imputed dataset.

summary(wine_raw$ResidualSugar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max.     NA's 
## -127.800   -2.000    3.900    5.419   15.900  141.150      616

summary(wine_raw_imputed_df$ResidualSugar)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -127.800   -2.300    3.900    5.444   16.000  141.150

summary(wine_raw$Chlorides)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -1.1710 -0.0310  0.0460  0.0548  0.1530  1.3510     638

summary(wine_raw_imputed_df$Chlorides)

##     Min.  1st Qu.   Median     Mean  3rd Qu.     Max. 
## -1.17100 -0.02900  0.04600  0.05584  0.15600  1.35100

summary(wine_raw$FreeSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -555.00    0.00   30.00   30.85   70.00  623.00     647

summary(wine_raw_imputed_df$FreeSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -555.00    0.00   30.00   30.69   70.00  623.00

summary(wine_raw$TotalSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##  -823.0    27.0   123.0   120.7   208.0  1057.0     682

summary(wine_raw_imputed_df$TotalSulfurDioxide)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  -823.0    27.0   124.0   121.2   209.0  1057.0

summary(wine_raw$pH)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   0.480   2.960   3.200   3.208   3.470   6.130     395

summary(wine_raw_imputed_df$pH)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.480   2.960   3.200   3.208   3.470   6.130

summary(wine_raw$Sulphates)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
## -3.1300  0.2800  0.5000  0.5271  0.8600  4.2400    1210

summary(wine_raw_imputed_df$Sulphates)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
## -3.1300  0.2800  0.5000  0.5278  0.8600  4.2400

summary(wine_raw$Alcohol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   -4.70    9.00   10.40   10.49   12.40   26.50     653

summary(wine_raw_imputed_df$Alcohol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    -4.7     9.0    10.4    10.5    12.4    26.5

summary(wine_raw$STARS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.042   3.000   4.000    3359

summary(wine_raw_imputed_df$STARS)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   1.000   1.000   2.000   2.042   3.000   4.000    3359

Correlation checks

Let’s build the correlation plot.

corrMatrix <- round(cor(wine_raw_imputed_df_bk),4)

corrMatrix %>% corrplot(., method = "color", outline = T, 
                        addgrid.col = "darkgray", order="hclust", 
                        addrect = 4, rect.col = "black", 
                        rect.lwd = 5,cl.pos = "b", tl.col = "indianred4", 
                        tl.cex = 1.0, cl.cex = 1.0, addCoef.col = "white", 
                        number.digits = 2, number.cex = 0.8, 
                        col = colorRampPalette(c("darkred","white","dodgerblue4"))(100))

Splitting the dataset into test and train dataset

n <- nrow(wine_raw_imputed_df)
set.seed(123)
wine_random_index <- wine_raw_imputed_df[sample(n), ]

wine.train.df <- wine_raw_imputed_df[1:as.integer(0.7*n),]

wine.test.df <- wine_raw_imputed_df[as.integer(0.7*n +1):n, ]

table(wine.test.df$TARGET) / nrow(wine.test.df)

## 
##            0            1            2            3            4 
## 0.2159416515 0.0187548841 0.0862203699 0.1997916124 0.2448554311 
##            5            6            7            8 
## 0.1604584527 0.0617348268 0.0114613181 0.0007814535

table(wine.train.df$TARGET) / nrow(wine.train.df)

## 
##           0           1           2           3           4           5 
## 0.212706565 0.019205002 0.084859312 0.205895489 0.249776686 0.156096472 
##           6           7           8 
## 0.058954891 0.010942385 0.001563198

Building the first model for counts - using Poisson Regression

poisson_model1 <- glm(formula = TARGET ~ . - STARS,
                      data = wine.train.df,
                      family = poisson)

summary(poisson_model1)

## 
## Call:
## glm(formula = TARGET ~ . - STARS, family = poisson, data = wine.train.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5426  -0.2929   0.1064   0.3627   1.9301  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.993e+00  2.464e-01   8.087 6.10e-16 ***
## FixedAcidity        4.580e-04  1.040e-03   0.440   0.6597    
## VolatileAcidity    -3.367e-02  8.192e-03  -4.110 3.95e-05 ***
## CitricAcid          1.934e-03  7.384e-03   0.262   0.7934    
## ResidualSugar       1.951e-05  1.909e-04   0.102   0.9186    
## Chlorides          -1.083e-02  2.031e-02  -0.533   0.5940    
## FreeSulfurDioxide   8.108e-05  4.341e-05   1.868   0.0618 .  
## TotalSulfurDioxide  2.534e-05  2.798e-05   0.905   0.3652    
## Density            -3.144e-01  2.425e-01  -1.297   0.1948    
## pH                 -1.978e-03  9.628e-03  -0.205   0.8372    
## Sulphates          -9.199e-03  6.878e-03  -1.338   0.1810    
## Alcohol             7.056e-03  1.747e-03   4.039 5.36e-05 ***
## LabelAppeal         2.466e-01  7.340e-03  33.605  < 2e-16 ***
## AcidIndex          -6.289e-02  5.778e-03 -10.885  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6186.4  on 6640  degrees of freedom
## Residual deviance: 4870.8  on 6627  degrees of freedom
##   (2315 observations deleted due to missingness)
## AIC: 24583
## 
## Number of Fisher Scoring iterations: 5

length(predict(poisson_model1, newdata = wine.test.df))

## [1] 3839

predict_model1_test <- predict(poisson_model1, newdata = wine.test.df, 
                               type = "response") %>% round(0)

table(Predicted = predict_model1_test, Actual = wine.test.df$TARGET)

##          Actual
## Predicted   0   1   2   3   4   5   6   7   8
##         1   1   0   0   0   0   0   0   0   0
##         2  97  45  66  49  10   2   0   0   0
##         3 371  27 231 480 329 101  18   1   0
##         4 243   0  33 231 479 290  70   7   0
##         5  96   0   1   7 118 196 103  15   1
##         6  21   0   0   0   4  26  43  20   2
##         7   0   0   0   0   0   1   3   1   0

Second Poisson Regression model

Removing the statistically insignificant variables

poisson_model2 <- glm(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
                        AcidIndex + FreeSulfurDioxide,
                      data = wine.train.df,
                      family = poisson)

summary(poisson_model2)

## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal + 
##     AcidIndex + FreeSulfurDioxide, family = poisson, data = wine.train.df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8451  -0.4753   0.2246   0.6321   2.9385  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        2.050e+00  4.418e-02  46.409  < 2e-16 ***
## VolatileAcidity   -6.404e-02  7.726e-03  -8.290  < 2e-16 ***
## Alcohol            8.891e-03  1.641e-03   5.418 6.02e-08 ***
## LabelAppeal        2.582e-01  6.835e-03  37.772  < 2e-16 ***
## AcidIndex         -1.359e-01  5.181e-03 -26.228  < 2e-16 ***
## FreeSulfurDioxide  1.477e-04  4.095e-05   3.606 0.000311 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 15931  on 8955  degrees of freedom
## Residual deviance: 13620  on 8950  degrees of freedom
## AIC: 36010
## 
## Number of Fisher Scoring iterations: 5

predict_model2_test <- predict(poisson_model2, newdata = wine.test.df, 
                               type = "response") %>% round(0)

unique(wine.test.df$TARGET)

## [1] 3 2 0 4 5 6 7 1 8

table(Predicted = predict_model2_test, Actual = wine.test.df$TARGET)

##          Actual
## Predicted   0   1   2   3   4   5   6   7   8
##         1  40   7   4  10   2   2   0   0   0
##         2 295  54 214 278 115  41   4   0   0
##         3 323  11 103 411 560 239  56   6   0
##         4 125   0  10  66 239 268 104  12   1
##         5  37   0   0   2  20  54  60  16   1
##         6   9   0   0   0   4  10  10   9   1
##         7   0   0   0   0   0   2   3   0   0
##         8   0   0   0   0   0   0   0   1   0

Third model - negative Binomial

nb_model3 <- glm.nb(formula = TARGET ~ . - STARS,
                    data = wine.train.df)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(nb_model3)

## 
## Call:
## glm.nb(formula = TARGET ~ . - STARS, data = wine.train.df, init.theta = 130137.6579, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.5425  -0.2929   0.1064   0.3627   1.9301  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.993e+00  2.464e-01   8.087 6.10e-16 ***
## FixedAcidity        4.580e-04  1.040e-03   0.440   0.6597    
## VolatileAcidity    -3.367e-02  8.192e-03  -4.110 3.95e-05 ***
## CitricAcid          1.934e-03  7.384e-03   0.262   0.7934    
## ResidualSugar       1.951e-05  1.909e-04   0.102   0.9186    
## Chlorides          -1.083e-02  2.031e-02  -0.533   0.5940    
## FreeSulfurDioxide   8.108e-05  4.341e-05   1.868   0.0618 .  
## TotalSulfurDioxide  2.534e-05  2.798e-05   0.905   0.3652    
## Density            -3.144e-01  2.425e-01  -1.297   0.1948    
## pH                 -1.978e-03  9.629e-03  -0.205   0.8372    
## Sulphates          -9.199e-03  6.878e-03  -1.338   0.1810    
## Alcohol             7.056e-03  1.747e-03   4.039 5.36e-05 ***
## LabelAppeal         2.466e-01  7.340e-03  33.604  < 2e-16 ***
## AcidIndex          -6.290e-02  5.778e-03 -10.885  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(130137.7) family taken to be 1)
## 
##     Null deviance: 6186.3  on 6640  degrees of freedom
## Residual deviance: 4870.7  on 6627  degrees of freedom
##   (2315 observations deleted due to missingness)
## AIC: 24585
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  130138 
##           Std. Err.:  225828 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -24555.19

predict_model3_test <- predict(nb_model3, newdata = wine.test.df, 
                               type = "response") %>% round(0)

unique(wine.test.df$TARGET)

## [1] 3 2 0 4 5 6 7 1 8

table(Predicted = predict_model3_test, Actual = wine.test.df$TARGET)

##          Actual
## Predicted   0   1   2   3   4   5   6   7   8
##         1   1   0   0   0   0   0   0   0   0
##         2  97  45  66  49  10   2   0   0   0
##         3 371  27 231 480 329 101  18   1   0
##         4 243   0  33 231 479 290  70   7   0
##         5  96   0   1   7 118 196 103  15   1
##         6  21   0   0   0   4  26  43  20   2
##         7   0   0   0   0   0   1   3   1   0

Fourth model - negative Binomial

Keeping only statistically relevant columns

nb_model4 <- glm.nb(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
                      AcidIndex + FreeSulfurDioxide,
                    data = wine.train.df)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(nb_model4)

## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal + 
##     AcidIndex + FreeSulfurDioxide, data = wine.train.df, init.theta = 21776.5604, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.8448  -0.4752   0.2246   0.6320   2.9383  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        2.050e+00  4.418e-02  46.406  < 2e-16 ***
## VolatileAcidity   -6.404e-02  7.726e-03  -8.289  < 2e-16 ***
## Alcohol            8.891e-03  1.641e-03   5.418 6.03e-08 ***
## LabelAppeal        2.582e-01  6.836e-03  37.769  < 2e-16 ***
## AcidIndex         -1.359e-01  5.181e-03 -26.226  < 2e-16 ***
## FreeSulfurDioxide  1.477e-04  4.095e-05   3.606 0.000311 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(21776.56) family taken to be 1)
## 
##     Null deviance: 15929  on 8955  degrees of freedom
## Residual deviance: 13618  on 8950  degrees of freedom
## AIC: 36012
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  21777 
##           Std. Err.:  63257 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -35997.77

predict_model4_test <- predict(nb_model4, newdata = wine.test.df, 
                               type = "response") %>% round(0)


table(Predicted = predict_model4_test, Actual = wine.test.df$TARGET)

##          Actual
## Predicted   0   1   2   3   4   5   6   7   8
##         1  40   7   4  10   2   2   0   0   0
##         2 295  54 214 278 115  41   4   0   0
##         3 323  11 103 411 560 239  56   6   0
##         4 125   0  10  66 239 268 104  12   1
##         5  37   0   0   2  20  54  60  16   1
##         6   9   0   0   0   4  10  10   9   1
##         7   0   0   0   0   0   2   3   0   0
##         8   0   0   0   0   0   0   0   1   0

Fifth model - Zero Inflated Poisson regression

zero_infl_model5 <- zeroinfl(formula = TARGET ~ VolatileAcidity + Alcohol + 
                               LabelAppeal + AcidIndex + FreeSulfurDioxide,
                             data = wine.train.df)

summary(zero_infl_model5)

## 
## Call:
## zeroinfl(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal + 
##     AcidIndex + FreeSulfurDioxide, data = wine.train.df)
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.1865 -0.3292  0.1986  0.4740  4.8234 
## 
## Count model coefficients (poisson with log link):
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        1.353e+00  4.904e-02  27.588  < 2e-16 ***
## VolatileAcidity   -1.541e-02  8.109e-03  -1.901  0.05736 .  
## Alcohol            9.033e-03  1.720e-03   5.250 1.52e-07 ***
## LabelAppeal        2.821e-01  7.141e-03  39.507  < 2e-16 ***
## AcidIndex         -2.020e-02  5.822e-03  -3.470  0.00052 ***
## FreeSulfurDioxide  3.304e-05  4.209e-05   0.785  0.43237    
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)       -5.5461860  0.2111140 -26.271  < 2e-16 ***
## VolatileAcidity    0.2763853  0.0402520   6.866 6.59e-12 ***
## Alcohol           -0.0005266  0.0084825  -0.062 0.950496    
## LabelAppeal        0.1559517  0.0363520   4.290 1.79e-05 ***
## AcidIndex          0.4956530  0.0224337  22.094  < 2e-16 ***
## FreeSulfurDioxide -0.0007390  0.0002110  -3.502 0.000462 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 16 
## Log-likelihood: -1.6e+04 on 12 Df

predict_model5_test_prob <- predict(zero_infl_model5, newdata = wine.test.df, 
                               type = "prob")
predict_model5_test_final <- apply(predict_model5_test_prob,1,which.max)
predict_model5_test_final <- predict_model5_test_final - 1
table(predict_model5_test_final)

## predict_model5_test_final
##    0    1    2    3    4    5    6    7 
## 1899    9  617  887  277  120   28    2

table(Predicted = predict_model5_test_final, Actual = wine.test.df$TARGET)

##          Actual
## Predicted   0   1   2   3   4   5   6   7   8
##         0 536  32 126 320 418 308 136  22   1
##         1   1   4   4   0   0   0   0   0   0
##         2  98  36 168 220  79  15   1   0   0
##         3 125   0  32 220 335 152  23   0   0
##         4  46   0   0   5  92  90  40   4   0
##         5  18   0   1   2  13  45  31  10   0
##         6   5   0   0   0   3   6   5   7   2
##         7   0   0   0   0   0   0   1   1   0

Multiple Linear Regression models

### Multiple Linear Regression 1
linear_model6 <- lm(formula = TARGET ~ . - STARS,
                    data = wine.train.df)

summary(linear_model6)

## 
## Call:
## lm(formula = TARGET ~ . - STARS, data = wine.train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.7094 -0.5646  0.2050  0.6891  3.7765 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         6.124e+00  6.184e-01   9.903  < 2e-16 ***
## FixedAcidity        1.725e-03  2.615e-03   0.660  0.50953    
## VolatileAcidity    -1.221e-01  2.057e-02  -5.936 3.07e-09 ***
## CitricAcid          7.018e-03  1.867e-02   0.376  0.70702    
## ResidualSugar       2.141e-04  4.793e-04   0.447  0.65503    
## Chlorides          -4.094e-02  5.074e-02  -0.807  0.41973    
## FreeSulfurDioxide   2.935e-04  1.088e-04   2.697  0.00701 ** 
## TotalSulfurDioxide  9.876e-05  6.993e-05   1.412  0.15795    
## Density            -1.088e+00  6.088e-01  -1.787  0.07397 .  
## pH                 -2.964e-03  2.412e-02  -0.123  0.90218    
## Sulphates          -3.026e-02  1.731e-02  -1.748  0.08052 .  
## Alcohol             2.625e-02  4.354e-03   6.028 1.75e-09 ***
## LabelAppeal         9.008e-01  1.837e-02  49.049  < 2e-16 ***
## AcidIndex          -2.176e-01  1.375e-02 -15.822  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.313 on 6627 degrees of freedom
##   (2315 observations deleted due to missingness)
## Multiple R-squared:  0.2957, Adjusted R-squared:  0.2943 
## F-statistic:   214 on 13 and 6627 DF,  p-value: < 2.2e-16

### Multiple Linear Regression 2
linear_model7 <- lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
                      TotalSulfurDioxide + Density + Sulphates + Alcohol +
                      LabelAppeal + AcidIndex,
                    data = wine.train.df)

summary(linear_model7)

## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + Sulphates + Alcohol + LabelAppeal + AcidIndex, 
##     data = wine.train.df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.9198 -0.6702  0.3975  1.1269  5.0180 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         7.4095052  0.6841677  10.830  < 2e-16 ***
## VolatileAcidity    -0.1876416  0.0230711  -8.133 4.74e-16 ***
## FreeSulfurDioxide   0.0004566  0.0001218   3.750 0.000178 ***
## TotalSulfurDioxide  0.0003207  0.0000779   4.117 3.88e-05 ***
## Density            -1.8531822  0.6815777  -2.719 0.006561 ** 
## Sulphates          -0.0413100  0.0192825  -2.142 0.032192 *  
## Alcohol             0.0280679  0.0048590   5.776 7.88e-09 ***
## LabelAppeal         0.7825607  0.0202709  38.605  < 2e-16 ***
## AcidIndex          -0.3580132  0.0136153 -26.295  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.711 on 8947 degrees of freedom
## Multiple R-squared:  0.2088, Adjusted R-squared:  0.2081 
## F-statistic: 295.1 on 8 and 8947 DF,  p-value: < 2.2e-16

predict_model7_test <- predict(linear_model7, newdata = wine.test.df)
table(round(predict_model7_test,0))

## 
##   -1    0    1    2    3    4    5    6 
##    1   14  133  814 1674 1043  157    3

Based on the above analysis, we have decided to go with model5 which is zero Inflated Poisson model.