Link to the project in RPubs: http://rpubs.com/ofomicheva86/387855

#required packages
library(MASS)
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)
library(stringr)
library(vcd)
library(rcompanion)
library(lmtest)
  1. DATA EXPLORATION
# Read the training data set from GitHub.
# TRUE is spelled out because the shorthand T is an ordinary variable that can
# be reassigned. Strings are converted to factors, exactly as before.
data <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/wine-training-data.csv",
  stringsAsFactors = TRUE,
  header = TRUE
)

# Read the evaluation (testing) data set with the same options.
data_testing <- read.csv(
  file = "https://raw.githubusercontent.com/olga0503/DATA-621/master/wine-evaluation-data.csv",
  stringsAsFactors = TRUE,
  header = TRUE
)

# preview the first six rows of the training data
head(data, n = 6L)
##   INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1     1      3          3.2           1.160      -0.98          54.2
## 2     2      3          4.5           0.160      -0.81          26.1
## 3     4      5          7.1           2.640      -0.88          14.8
## 4     5      3          5.7           0.385       0.04          18.8
## 5     6      4          8.0           0.330      -1.26           9.4
## 6     7      0         11.3           0.320       0.59           2.2
##   Chlorides FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates
## 1    -0.567                NA                268 0.99280 3.33     -0.59
## 2    -0.425                15               -327 1.02792 3.38      0.70
## 3     0.037               214                142 0.99518 3.12      0.48
## 4    -0.425                22                115 0.99640 2.24      1.83
## 5        NA              -167                108 0.99457 3.12      1.77
## 6     0.556               -37                 15 0.99940 3.20      1.29
##   Alcohol LabelAppeal AcidIndex STARS
## 1     9.9           0         8     2
## 2      NA          -1         7     3
## 3    22.0          -1         8     3
## 4     6.2          -1         6     1
## 5    13.7           0         9     2
## 6    15.4           0        11    NA
# preview the first six rows of the testing data
head(data_testing, n = 6L)
##   IN TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1  3     NA          5.4          -0.860       0.27         -10.7
## 2  9     NA         12.4           0.385      -0.76         -19.7
## 3 10     NA          7.2           1.750       0.17         -33.0
## 4 18     NA          6.2           0.100       1.80           1.0
## 5 21     NA         11.4           0.210       0.28           1.2
## 6 30     NA         17.6           0.040      -1.15           1.4
##   Chlorides FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates
## 1     0.092                23                398 0.98527 5.02      0.64
## 2     1.169               -37                 68 0.99048 3.37      1.09
## 3     0.065                 9                 76 1.04641 4.61      0.68
## 4    -0.179               104                 89 0.98877 3.20      2.11
## 5     0.038                70                 53 1.02899 2.54     -0.07
## 6     0.535              -250                140 0.95028 3.06     -0.02
##   Alcohol LabelAppeal AcidIndex STARS
## 1   12.30          -1         6    NA
## 2   16.00           0         6     2
## 3    8.55           0         8     1
## 4   12.30          -1         8     1
## 5    4.80           0        10    NA
## 6   11.40           1         8     4
# find dimensions (rows, columns) of the training data
c(nrow(data), ncol(data))
## [1] 12795    16
#' Count missing values per column
#'
#' Builds a summary table with one row per inspected column, giving the number
#' of `NA` entries and the share of rows they represent. Rows are sorted by
#' descending percentage; tied rows keep their original column order.
#'
#' @param data A data frame.
#' @param first_col Position of the first column to inspect. Defaults to 2, so
#'   the leading column is skipped, which is what this function has always
#'   done. Pass 1 to inspect every column.
#' @return A data frame with columns `variable_name_column`,
#'   `number_missing_column` and `percentage` (rounded to two decimals). It has
#'   zero rows when `data` has fewer than `first_col` columns.
count_nas <- function(data, first_col = 2L) {
  # seq_len() avoids the descending 2:1 sequence that `2:ncol(data)` yields
  # for a one-column input (which then indexes a column that does not exist).
  inspected <- seq_len(ncol(data))
  inspected <- inspected[inspected >= first_col]

  variable_name_column <- names(data)[inspected]
  number_missing_column <- vapply(
    inspected,
    function(i) sum(is.na(data[[i]])),
    integer(1)
  )

  missing_table <- data.frame(variable_name_column, number_missing_column)
  missing_table$percentage <- round(
    missing_table$number_missing_column * 100 / nrow(data),
    2
  )

  # order() leaves ties in their original order, so equal percentages stay in
  # column order.
  missing_table <- missing_table[order(-missing_table$percentage), , drop = FALSE]
  rownames(missing_table) <- NULL
  missing_table
}


#' Count negative values per column
#'
#' Builds a summary table with one row per inspected column, giving the number
#' of strictly negative entries and the share of rows they represent.
#' Non-numeric columns (e.g. factors) and `NA` entries count as zero negatives.
#' Rows are sorted by descending percentage; tied rows keep their original
#' column order.
#'
#' @param data A data frame.
#' @param first_col Position of the first column to inspect. Defaults to 3, so
#'   the two leading columns are skipped, which is what this function has
#'   always done.
#' @return A data frame with columns `variable_name_column`,
#'   `number_negative_column` and `percentage` (rounded to two decimals). It
#'   has zero rows when `data` has fewer than `first_col` columns.
count_neg <- function(data, first_col = 3L) {
  # seq_len() keeps the range empty for narrow inputs instead of counting
  # backwards the way `3:ncol(data)` does.
  inspected <- seq_len(ncol(data))
  inspected <- inspected[inspected >= first_col]

  variable_name_column <- names(data)[inspected]

  # One vectorised comparison per column replaces the cell-by-cell scan, which
  # indexed the data frame once per row and column.
  number_negative_column <- vapply(
    inspected,
    function(i) {
      column <- data[[i]]
      if (is.numeric(column)) {
        as.numeric(sum(column < 0, na.rm = TRUE))
      } else {
        0
      }
    },
    numeric(1)
  )

  negative_table <- data.frame(variable_name_column, number_negative_column)
  negative_table$percentage <- round(
    negative_table$number_negative_column * 100 / nrow(data),
    2
  )

  # order() leaves ties in their original order, so equal percentages stay in
  # column order.
  negative_table <- negative_table[order(-negative_table$percentage), , drop = FALSE]
  rownames(negative_table) <- NULL
  negative_table
}

# plot the proportion and pattern of missing values for every column except
# the first one
aggr(
  data[-1],
  prop = TRUE,
  numbers = TRUE,
  cex.axis = .5,
  cex.numbers = 0.1,
  ylab = c("Proportion of missingness", "Missingness Pattern"),
  labels = names(data[-1])
)

# count missing values in the training predictors
# count_nas() skips the first column of whatever it receives, so only 'INDEX'
# is dropped here; 'TARGET' then becomes the skipped column. Dropping both
# 'INDEX' and 'TARGET' up front made the function skip 'FixedAcidity', which
# went missing from the report.
count_nas(data[-1])
##    variable_name_column number_missing_column percentage
## 1                 STARS                  3359      26.25
## 2             Sulphates                  1210       9.46
## 3    TotalSulfurDioxide                   682       5.33
## 4               Alcohol                   653       5.10
## 5     FreeSulfurDioxide                   647       5.06
## 6             Chlorides                   638       4.99
## 7         ResidualSugar                   616       4.81
## 8                    pH                   395       3.09
## 9       VolatileAcidity                     0       0.00
## 10           CitricAcid                     0       0.00
## 11              Density                     0       0.00
## 12          LabelAppeal                     0       0.00
## 13            AcidIndex                     0       0.00
# count missing values in the testing predictors
# count_nas() skips the first column of whatever it receives, so only 'IN' is
# dropped here; 'TARGET' then becomes the skipped column and 'FixedAcidity' is
# no longer left out of the report.
count_nas(data_testing[-1])
##    variable_name_column number_missing_column percentage
## 1                 STARS                   841      25.22
## 2             Sulphates                   310       9.30
## 3               Alcohol                   185       5.55
## 4         ResidualSugar                   168       5.04
## 5    TotalSulfurDioxide                   157       4.71
## 6     FreeSulfurDioxide                   152       4.56
## 7             Chlorides                   138       4.14
## 8                    pH                   104       3.12
## 9       VolatileAcidity                     0       0.00
## 10           CitricAcid                     0       0.00
## 11              Density                     0       0.00
## 12          LabelAppeal                     0       0.00
## 13            AcidIndex                     0       0.00
  1. DATA PREPARATION

Replace missing values of STARS with “NONE”.

# turn STARS into a factor in which a missing rating becomes the level "NONE"
data$STARS <- as.factor(ifelse(is.na(data$STARS), "NONE", data$STARS))
data_testing$STARS <- as.factor(
  ifelse(is.na(data_testing$STARS), "NONE", data_testing$STARS)
)

Replace negative values with their absolute values.

# tabulate negative values per column of the training data
count_neg(data = data)
##    variable_name_column number_negative_column percentage
## 1           LabelAppeal                   3640      28.45
## 2             Chlorides                   3197      24.99
## 3         ResidualSugar                   3136      24.51
## 4     FreeSulfurDioxide                   3036      23.73
## 5            CitricAcid                   2966      23.18
## 6       VolatileAcidity                   2827      22.09
## 7    TotalSulfurDioxide                   2504      19.57
## 8             Sulphates                   2361      18.45
## 9          FixedAcidity                   1621      12.67
## 10              Alcohol                    118       0.92
## 11              Density                      0       0.00
## 12                   pH                      0       0.00
## 13            AcidIndex                      0       0.00
## 14                STARS                      0       0.00
# tabulate negative values per column of the testing data
count_neg(data = data_testing)
##    variable_name_column number_negative_column percentage
## 1           LabelAppeal                    924      27.71
## 2         ResidualSugar                    828      24.83
## 3            CitricAcid                    804      24.11
## 4       VolatileAcidity                    788      23.63
## 5             Chlorides                    776      23.27
## 6     FreeSulfurDioxide                    774      23.21
## 7    TotalSulfurDioxide                    639      19.16
## 8             Sulphates                    594      17.81
## 9          FixedAcidity                    439      13.16
## 10              Alcohol                     25       0.75
## 11              Density                      0       0.00
## 12                   pH                      0       0.00
## 13            AcidIndex                      0       0.00
## 14                STARS                      0       0.00
# set 'STARS' and 'LabelAppeal' aside so abs() is not applied to them
stars_labelappeal <- data[c("STARS", "LabelAppeal")]
stars_labelappeal_testing <- data_testing[c("STARS", "LabelAppeal")]

# keep the remaining columns; the testing set also loses 'TARGET'
data <- data[setdiff(names(data), c("STARS", "LabelAppeal"))]
data_testing <- data_testing[
  setdiff(names(data_testing), c("STARS", "LabelAppeal", "TARGET"))
]

# replace negative values with their absolute values, then re-attach the
# columns that were set aside (they end up as the last two columns)
data <- data.frame(abs(data), stars_labelappeal)
data_testing <- data.frame(abs(data_testing), stars_labelappeal_testing)

# re-count negatives in the training data after the absolute-value step
count_neg(data = data)
##    variable_name_column number_negative_column percentage
## 1           LabelAppeal                   3640      28.45
## 2          FixedAcidity                      0       0.00
## 3       VolatileAcidity                      0       0.00
## 4            CitricAcid                      0       0.00
## 5         ResidualSugar                      0       0.00
## 6             Chlorides                      0       0.00
## 7     FreeSulfurDioxide                      0       0.00
## 8    TotalSulfurDioxide                      0       0.00
## 9               Density                      0       0.00
## 10                   pH                      0       0.00
## 11            Sulphates                      0       0.00
## 12              Alcohol                      0       0.00
## 13            AcidIndex                      0       0.00
## 14                STARS                      0       0.00
# re-count negatives in the testing data after the absolute-value step
count_neg(data = data_testing)
##    variable_name_column number_negative_column percentage
## 1           LabelAppeal                    924      27.71
## 2       VolatileAcidity                      0       0.00
## 3            CitricAcid                      0       0.00
## 4         ResidualSugar                      0       0.00
## 5             Chlorides                      0       0.00
## 6     FreeSulfurDioxide                      0       0.00
## 7    TotalSulfurDioxide                      0       0.00
## 8               Density                      0       0.00
## 9                    pH                      0       0.00
## 10            Sulphates                      0       0.00
## 11              Alcohol                      0       0.00
## 12            AcidIndex                      0       0.00
## 13                STARS                      0       0.00

Apply multiple imputation.

# apply multiple imputation to the training data set
# 'INDEX' and 'TARGET' are held out by name, so the split no longer depends on
# them being the first two columns
exclude <- c('INDEX','TARGET')
index_target <- data[exclude]
include <- setdiff(names(data), exclude)
data_include <- data[include]

# impute with predictive mean matching (method = 'pmm'), not mean imputation;
# a fixed seed makes the imputed values reproducible from run to run
imp.data <- mice(data_include, m=9, method='pmm', seed=621, printFlag=FALSE)

# merge imputed values with the held-out columns
# complete() with its default action returns the first of the m imputed sets
data <- complete(imp.data)
data <- data.frame(index_target,data)
head(data)
##   INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1     1      3          3.2           1.160       0.98          54.2
## 2     2      3          4.5           0.160       0.81          26.1
## 3     4      5          7.1           2.640       0.88          14.8
## 4     5      3          5.7           0.385       0.04          18.8
## 5     6      4          8.0           0.330       1.26           9.4
## 6     7      0         11.3           0.320       0.59           2.2
##   Chlorides FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates
## 1     0.567               222                268 0.99280 3.33      0.59
## 2     0.425                15                327 1.02792 3.38      0.70
## 3     0.037               214                142 0.99518 3.12      0.48
## 4     0.425                22                115 0.99640 2.24      1.83
## 5     0.256               167                108 0.99457 3.12      1.77
## 6     0.556                37                 15 0.99940 3.20      1.29
##   Alcohol AcidIndex STARS LabelAppeal
## 1     9.9         8     2           0
## 2    11.7         7     3          -1
## 3    22.0         8     3          -1
## 4     6.2         6     1          -1
## 5    13.7         9     2           0
## 6    15.4        11  NONE           0
# verify that the training data has no missing values left
count_nas(data = data)
##    variable_name_column number_missing_column percentage
## 1                TARGET                     0          0
## 2          FixedAcidity                     0          0
## 3       VolatileAcidity                     0          0
## 4            CitricAcid                     0          0
## 5         ResidualSugar                     0          0
## 6             Chlorides                     0          0
## 7     FreeSulfurDioxide                     0          0
## 8    TotalSulfurDioxide                     0          0
## 9               Density                     0          0
## 10                   pH                     0          0
## 11            Sulphates                     0          0
## 12              Alcohol                     0          0
## 13            AcidIndex                     0          0
## 14                STARS                     0          0
## 15          LabelAppeal                     0          0
# apply multiple imputation to the testing data set
exclude_testing <- c('IN','TARGET')
# Hold out only the excluded columns that are actually present, selected by
# name. Taking the first two columns by position picked up 'FixedAcidity'
# whenever 'TARGET' had already been removed, so it was both held out and
# imputed and came back duplicated as 'FixedAcidity.1' after the merge.
index_target_testing <- data_testing[
  intersect(names(data_testing), exclude_testing)
]
include_testing <- setdiff(names(data_testing), exclude_testing)
data_include_testing <- data_testing[include_testing]

# impute with predictive mean matching (method = 'pmm'), not mean imputation
imp.data_testing <- mice(data_include_testing, m=9, method='pmm', printFlag=FALSE)

# merge imputed values with the held-out columns
# complete() with its default action returns the first of the m imputed sets
data_testing <- complete(imp.data_testing)
data_testing <- data.frame(index_target_testing,data_testing)
head(data_testing)
##   IN FixedAcidity FixedAcidity.1 VolatileAcidity CitricAcid ResidualSugar
## 1  3          5.4            5.4           0.860       0.27          10.7
## 2  9         12.4           12.4           0.385       0.76          19.7
## 3 10          7.2            7.2           1.750       0.17          33.0
## 4 18          6.2            6.2           0.100       1.80           1.0
## 5 21         11.4           11.4           0.210       0.28           1.2
## 6 30         17.6           17.6           0.040       1.15           1.4
##   Chlorides FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates
## 1     0.092                23                398 0.98527 5.02      0.64
## 2     1.169                37                 68 0.99048 3.37      1.09
## 3     0.065                 9                 76 1.04641 4.61      0.68
## 4     0.179               104                 89 0.98877 3.20      2.11
## 5     0.038                70                 53 1.02899 2.54      0.07
## 6     0.535               250                140 0.95028 3.06      0.02
##   Alcohol AcidIndex STARS LabelAppeal
## 1   12.30         6  NONE          -1
## 2   16.00         6     2           0
## 3    8.55         8     1           0
## 4   12.30         8     1          -1
## 5    4.80        10  NONE           0
## 6   11.40         8     4           1
# verify that the testing data has no missing values left
count_nas(data = data_testing)
##    variable_name_column number_missing_column percentage
## 1          FixedAcidity                     0          0
## 2        FixedAcidity.1                     0          0
## 3       VolatileAcidity                     0          0
## 4            CitricAcid                     0          0
## 5         ResidualSugar                     0          0
## 6             Chlorides                     0          0
## 7     FreeSulfurDioxide                     0          0
## 8    TotalSulfurDioxide                     0          0
## 9               Density                     0          0
## 10                   pH                     0          0
## 11            Sulphates                     0          0
## 12              Alcohol                     0          0
## 13            AcidIndex                     0          0
## 14                STARS                     0          0
## 15          LabelAppeal                     0          0
# one boxplot per numeric column from the third column on, five panels per row
par(mfrow = c(1, 5))
for (i in 3:ncol(data)) {
  if (!is.numeric(data[[i]])) next
  boxplot(data[[i]], main = names(data)[i])
}

# mosaic plot of the TARGET-by-STARS contingency table
count <- table(data[["TARGET"]], data[["STARS"]])
mosaicplot(
  count,
  main = "Distribution of 'STARS'",
  xlab = "TARGET",
  ylab = "STARS",
  shade = TRUE,
  border = "black",
  las = 1
)

  1. DATA PREPARATION

The following assumptions must be verified for linear regression.

  a. Linearity assumption.
# scatter TARGET against every double-precision column, four panels per page,
# and overlay the fitted simple-regression line in red
par(mfrow = c(2, 2))
for (i in seq_len(ncol(data))) {
  if (!is.double(data[, i])) next
  plot(data$TARGET ~ data[,i], main = names(data)[i], xlab = names(data)[i])
  reg_line <- lm(data$TARGET ~ data[,i])
  abline(reg_line, col = "red")
}

  b. Normal distribution of dependent variables.
# for every double-precision column: a histogram with a normal curve next to a
# normal Q-Q plot with its reference line
for (i in seq_len(ncol(data))) {
  if (!is.double(data[, i])) next
  par(mfrow = c(1, 2))
  plotNormalHistogram(data[,i], main = names(data)[i])
  qqnorm(data[,i], main = names(data)[i])
  qqline(data[,i])
}

c. Multicollinearity assumption.

# pairwise correlations between the numeric columns (third column onwards),
# drawn as numbers in the upper triangle
corrplot(
  cor(Filter(is.numeric, data[3:ncol(data)])),
  type = "upper",
  method = "number",
  tl.cex = 0.8,
  tl.col = "black",
  number.cex = .5
)

  d. Homoscedasticity assumption.
# fit TARGET on all remaining columns and test the residuals for
# heteroscedasticity with the Breusch-Pagan test
model <- glm(TARGET ~ ., data = data)
bptest(model)
## 
##  studentized Breusch-Pagan test
## 
## data:  model
## BP = 1874.1, df = 18, p-value < 2.2e-16

Transforming data using Box-Cox approach.

# Box-Cox needs strictly positive values, so exact zeros in the
# double-precision columns (column 2 onwards) are nudged up to 0.01.
# Each column is handled in one vectorised step instead of visiting every cell
# with a scalar data-frame assignment; which() also skips NA entries, which
# previously raised an error inside if ().
for (i in 2:ncol(data)) {
  if (is.double(data[[i]])) {
    zero_rows <- which(data[[i]] == 0)
    data[[i]][zero_rows] <- 0.01
  }
}

# Box-Cox transform every double-precision column from column 2 onwards
for (i in 2:ncol(data)) {
  if (!is.double(data[[i]])) next

  # profile the log-likelihood of the column over lambda = -10, -9, ..., 10;
  # boxcox() also draws the profile because plotit is left at its default
  Box <- boxcox(data[,i] ~ 1, lambda = seq(-10, 10, 1))

  # lambda with the greatest log-likelihood (the first one if there are ties)
  lambda <- Box$x[which.max(Box$y)]

  # lambda = 0 is the log transform; the power formula would give 0/0 = NaN
  if (abs(lambda) < sqrt(.Machine$double.eps)) {
    data[,i] <- log(data[,i])
  } else {
    data[,i] <- (data[,i] ^ lambda - 1) / lambda
  }
}

  1. BUILD MODELS
  1. Build a multiple linear model.
# fit the intercept-only and the all-columns GLMs that bound the search
linear_model.null <- glm(TARGET ~ 1, data = data)
linear_model.full <- glm(TARGET ~ ., data = data)

# stepwise selection in both directions, starting from the null model
step(
  linear_model.null,
  scope = list(upper = linear_model.full),
  direction = "both",
  data = data
)
## Start:  AIC=53091.37
## TARGET ~ 1
## 
##                      Df Deviance   AIC
## + STARS               4    24729 44754
## + LabelAppeal         1    41443 51354
## + AcidIndex           1    44603 52294
## + VolatileAcidity     1    47141 53003
## + TotalSulfurDioxide  1    47268 53037
## + Alcohol             1    47299 53045
## + FreeSulfurDioxide   1    47326 53053
## + Chlorides           1    47356 53061
## + FixedAcidity        1    47366 53063
## + Sulphates           1    47398 53072
## + CitricAcid          1    47410 53075
## + Density             1    47417 53077
## + ResidualSugar       1    47458 53088
## <none>                     47477 53091
## + pH                  1    47473 53092
## + INDEX               1    47477 53093
## 
## Step:  AIC=44753.56
## TARGET ~ STARS
## 
##                      Df Deviance   AIC
## + LabelAppeal         1    22900 43772
## + AcidIndex           1    23983 44363
## + VolatileAcidity     1    24627 44703
## + TotalSulfurDioxide  1    24663 44721
## + FreeSulfurDioxide   1    24690 44735
## + Chlorides           1    24695 44738
## + Alcohol             1    24704 44743
## + CitricAcid          1    24708 44745
## + Sulphates           1    24711 44746
## + FixedAcidity        1    24713 44747
## + Density             1    24715 44748
## + ResidualSugar       1    24723 44753
## <none>                     24729 44754
## + INDEX               1    24726 44754
## + pH                  1    24729 44756
## - STARS               4    47477 53091
## 
## Step:  AIC=43772.25
## TARGET ~ STARS + LabelAppeal
## 
##                      Df Deviance   AIC
## + AcidIndex           1    21990 43255
## + VolatileAcidity     1    22801 43719
## + TotalSulfurDioxide  1    22815 43727
## + FreeSulfurDioxide   1    22864 43755
## + Chlorides           1    22866 43755
## + Alcohol             1    22867 43756
## + FixedAcidity        1    22881 43764
## + Sulphates           1    22881 43764
## + CitricAcid          1    22885 43766
## + Density             1    22887 43767
## + ResidualSugar       1    22893 43770
## <none>                     22900 43772
## + INDEX               1    22899 43774
## + pH                  1    22899 43774
## - LabelAppeal         1    24729 44754
## - STARS               4    41443 51354
## 
## Step:  AIC=43255.27
## TARGET ~ STARS + LabelAppeal + AcidIndex
## 
##                      Df Deviance   AIC
## + VolatileAcidity     1    21915 43214
## + TotalSulfurDioxide  1    21938 43227
## + CitricAcid          1    21954 43237
## + Alcohol             1    21964 43243
## + FreeSulfurDioxide   1    21970 43246
## + Chlorides           1    21973 43248
## + Sulphates           1    21978 43251
## + pH                  1    21983 43254
## + Density             1    21983 43254
## + ResidualSugar       1    21985 43255
## <none>                     21990 43255
## + INDEX               1    21989 43257
## + FixedAcidity        1    21989 43257
## - AcidIndex           1    22900 43772
## - LabelAppeal         1    23983 44363
## - STARS               4    38357 50366
## 
## Step:  AIC=43213.93
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity
## 
##                      Df Deviance   AIC
## + TotalSulfurDioxide  1    21869 43189
## + CitricAcid          1    21882 43197
## + Alcohol             1    21888 43200
## + FreeSulfurDioxide   1    21897 43205
## + Chlorides           1    21900 43207
## + Sulphates           1    21905 43210
## + Density             1    21908 43212
## + pH                  1    21910 43213
## + ResidualSugar       1    21911 43213
## <none>                     21915 43214
## + INDEX               1    21915 43216
## + FixedAcidity        1    21915 43216
## - VolatileAcidity     1    21990 43255
## - AcidIndex           1    22801 43719
## - LabelAppeal         1    23903 44323
## - STARS               4    38156 50301
## 
## Step:  AIC=43188.68
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide
## 
##                      Df Deviance   AIC
## + CitricAcid          1    21837 43172
## + Alcohol             1    21839 43173
## + FreeSulfurDioxide   1    21851 43181
## + Chlorides           1    21853 43182
## + Sulphates           1    21858 43185
## + Density             1    21861 43186
## + pH                  1    21863 43187
## + ResidualSugar       1    21865 43188
## <none>                     21869 43189
## + FixedAcidity        1    21868 43190
## + INDEX               1    21868 43190
## - TotalSulfurDioxide  1    21915 43214
## - VolatileAcidity     1    21938 43227
## - AcidIndex           1    22724 43678
## - LabelAppeal         1    23868 44306
## - STARS               4    38034 50262
## 
## Step:  AIC=43172.22
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid
## 
##                      Df Deviance   AIC
## + Alcohol             1    21807 43157
## + FreeSulfurDioxide   1    21821 43165
## + Chlorides           1    21822 43166
## + Sulphates           1    21826 43168
## + Density             1    21830 43170
## + pH                  1    21831 43171
## + ResidualSugar       1    21833 43172
## <none>                     21837 43172
## + FixedAcidity        1    21836 43174
## + INDEX               1    21837 43174
## - CitricAcid          1    21869 43189
## - TotalSulfurDioxide  1    21882 43197
## - VolatileAcidity     1    21904 43209
## - AcidIndex           1    22711 43673
## - LabelAppeal         1    23831 44288
## - STARS               4    37943 50233
## 
## Step:  AIC=43156.65
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol
## 
##                      Df Deviance   AIC
## + FreeSulfurDioxide   1    21790 43149
## + Chlorides           1    21793 43150
## + Sulphates           1    21797 43152
## + Density             1    21800 43155
## + pH                  1    21802 43156
## + ResidualSugar       1    21803 43156
## <none>                     21807 43157
## + FixedAcidity        1    21806 43158
## + INDEX               1    21807 43158
## - Alcohol             1    21837 43172
## - CitricAcid          1    21839 43173
## - TotalSulfurDioxide  1    21855 43183
## - VolatileAcidity     1    21876 43195
## - AcidIndex           1    22672 43652
## - LabelAppeal         1    23808 44278
## - STARS               4    37800 50187
## 
## Step:  AIC=43148.61
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide
## 
##                      Df Deviance   AIC
## + Chlorides           1    21776 43142
## + Sulphates           1    21780 43145
## + Density             1    21783 43147
## + pH                  1    21785 43148
## + ResidualSugar       1    21786 43148
## <none>                     21790 43149
## + FixedAcidity        1    21789 43150
## + INDEX               1    21790 43150
## - FreeSulfurDioxide   1    21807 43157
## - Alcohol             1    21821 43165
## - CitricAcid          1    21821 43165
## - TotalSulfurDioxide  1    21836 43174
## - VolatileAcidity     1    21857 43186
## - AcidIndex           1    22641 43637
## - LabelAppeal         1    23787 44269
## - STARS               4    37743 50170
## 
## Step:  AIC=43142.23
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides
## 
##                      Df Deviance   AIC
## + Sulphates           1    21766 43139
## + Density             1    21769 43140
## + pH                  1    21771 43141
## + ResidualSugar       1    21771 43142
## <none>                     21776 43142
## + FixedAcidity        1    21775 43144
## + INDEX               1    21775 43144
## - Chlorides           1    21790 43149
## - FreeSulfurDioxide   1    21793 43150
## - Alcohol             1    21806 43158
## - CitricAcid          1    21806 43158
## - TotalSulfurDioxide  1    21822 43167
## - VolatileAcidity     1    21842 43179
## - AcidIndex           1    22612 43622
## - LabelAppeal         1    23772 44262
## - STARS               4    37701 50157
## 
## Step:  AIC=43138.54
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates
## 
##                      Df Deviance   AIC
## + Density             1    21760 43137
## + pH                  1    21761 43138
## + ResidualSugar       1    21762 43138
## <none>                     21766 43139
## + FixedAcidity        1    21765 43140
## + INDEX               1    21766 43140
## - Sulphates           1    21776 43142
## - Chlorides           1    21780 43145
## - FreeSulfurDioxide   1    21783 43146
## - Alcohol             1    21797 43155
## - CitricAcid          1    21797 43155
## - TotalSulfurDioxide  1    21811 43163
## - VolatileAcidity     1    21832 43175
## - AcidIndex           1    22596 43616
## - LabelAppeal         1    23762 44259
## - STARS               4    37665 50147
## 
## Step:  AIC=43136.85
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + Density
## 
##                      Df Deviance   AIC
## + pH                  1    21755 43136
## + ResidualSugar       1    21756 43136
## <none>                     21760 43137
## + FixedAcidity        1    21759 43139
## - Density             1    21766 43139
## + INDEX               1    21759 43139
## - Sulphates           1    21769 43140
## - Chlorides           1    21773 43143
## - FreeSulfurDioxide   1    21776 43145
## - Alcohol             1    21790 43153
## - CitricAcid          1    21790 43153
## - TotalSulfurDioxide  1    21806 43162
## - VolatileAcidity     1    21826 43174
## - AcidIndex           1    22583 43610
## - LabelAppeal         1    23754 44257
## - STARS               4    37642 50141
## 
## Step:  AIC=43136.22
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + Density + pH
## 
##                      Df Deviance   AIC
## + ResidualSugar       1    21751 43136
## <none>                     21755 43136
## - pH                  1    21760 43137
## - Density             1    21761 43138
## + FixedAcidity        1    21755 43138
## + INDEX               1    21755 43138
## - Sulphates           1    21764 43140
## - Chlorides           1    21768 43142
## - FreeSulfurDioxide   1    21772 43144
## - Alcohol             1    21785 43152
## - CitricAcid          1    21786 43152
## - TotalSulfurDioxide  1    21801 43161
## - VolatileAcidity     1    21821 43173
## - AcidIndex           1    22583 43612
## - LabelAppeal         1    23752 44258
## - STARS               4    37617 50135
## 
## Step:  AIC=43135.8
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + Density + pH + ResidualSugar
## 
##                      Df Deviance   AIC
## <none>                     21751 43136
## - ResidualSugar       1    21755 43136
## - pH                  1    21756 43136
## + FixedAcidity        1    21751 43137
## - Density             1    21757 43137
## + INDEX               1    21751 43138
## - Sulphates           1    21760 43139
## - Chlorides           1    21764 43142
## - FreeSulfurDioxide   1    21768 43144
## - Alcohol             1    21782 43152
## - CitricAcid          1    21782 43152
## - TotalSulfurDioxide  1    21796 43160
## - VolatileAcidity     1    21817 43172
## - AcidIndex           1    22577 43611
## - LabelAppeal         1    23748 44258
## - STARS               4    37605 50133
## 
## Call:  glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + Density + pH + ResidualSugar, data = data)
## 
## Coefficients:
##        (Intercept)              STARS2              STARS3  
##           3.742207            1.024585            1.597538  
##             STARS4              STARS5         LabelAppeal  
##           2.293295           -1.361599            0.466437  
##          AcidIndex     VolatileAcidity  TotalSulfurDioxide  
##          -0.197985           -0.102806            0.015316  
##         CitricAcid             Alcohol   FreeSulfurDioxide  
##           0.065283            0.039848            0.008899  
##          Chlorides           Sulphates             Density  
##          -0.034437           -0.038933           -0.837728  
##                 pH       ResidualSugar  
##          -0.024235            0.006284  
## 
## Degrees of Freedom: 12794 Total (i.e. Null);  12778 Residual
## Null Deviance:       47480 
## Residual Deviance: 21750     AIC: 43140
  1. Build a negative binomial regression model.
#fit the two negative binomial models that bound the stepwise search:
#an intercept-only model and a model using every other column of data
#NOTE(review): theta.ml warns "iteration limit reached" for this fit and the
#selected model reports a very large theta - check whether the negative
#binomial fit differs meaningfully from the Poisson one
linear_model.null <- glm.nb(formula = TARGET ~ 1, data = data)

linear_model.full <- glm.nb(formula = TARGET ~ ., data = data)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
#stepwise selection in both directions, starting from the intercept-only
#model with the all-predictor model as the upper bound of the search
step(object = linear_model.null,
     scope = list(upper = linear_model.full),
     direction = "both",
     data = data)
## Start:  AIC=54331.58
## TARGET ~ 1
## 
##                      Df Deviance   AIC
## + STARS               4    12176 48386
## + LabelAppeal         1    16706 52911
## + AcidIndex           1    17361 53566
## + VolatileAcidity     1    18050 54255
## + TotalSulfurDioxide  1    18079 54283
## + Alcohol             1    18087 54292
## + FreeSulfurDioxide   1    18093 54298
## + Chlorides           1    18101 54305
## + FixedAcidity        1    18103 54307
## + Sulphates           1    18110 54315
## + CitricAcid          1    18113 54318
## + Density             1    18115 54319
## + ResidualSugar       1    18124 54329
## <none>                     18129 54332
## + pH                  1    18128 54333
## + INDEX               1    18129 54334
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=46643.61
## TARGET ~ STARS
## 
##                      Df Deviance   AIC
## + LabelAppeal         1    14060 46015
## + AcidIndex           1    14391 46346
## + VolatileAcidity     1    14657 46612
## + TotalSulfurDioxide  1    14668 46623
## + FreeSulfurDioxide   1    14677 46632
## + Chlorides           1    14680 46635
## + Alcohol             1    14682 46637
## + CitricAcid          1    14683 46638
## + Sulphates           1    14684 46640
## + FixedAcidity        1    14685 46640
## + Density             1    14686 46641
## <none>                     14690 46644
## + ResidualSugar       1    14689 46644
## + INDEX               1    14690 46645
## + pH                  1    14690 46646
## - STARS               4    22860 54805
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=46014.56
## TARGET ~ STARS + LabelAppeal
## 
##                      Df Deviance   AIC
## + AcidIndex           1    13716 45673
## + VolatileAcidity     1    14029 45986
## + TotalSulfurDioxide  1    14033 45990
## + Chlorides           1    14048 46005
## + Alcohol             1    14048 46005
## + FreeSulfurDioxide   1    14048 46005
## + FixedAcidity        1    14053 46010
## + Sulphates           1    14053 46010
## + CitricAcid          1    14055 46012
## + Density             1    14055 46012
## + ResidualSugar       1    14056 46014
## <none>                     14060 46015
## + INDEX               1    14059 46016
## + pH                  1    14059 46016
## - LabelAppeal         1    14691 46644
## - STARS               4    20867 52814
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45672.53
## TARGET ~ STARS + LabelAppeal + AcidIndex
## 
##                      Df Deviance   AIC
## + VolatileAcidity     1    13692 45651
## + TotalSulfurDioxide  1    13698 45657
## + CitricAcid          1    13704 45663
## + Alcohol             1    13709 45668
## + FreeSulfurDioxide   1    13709 45668
## + Chlorides           1    13710 45669
## + Sulphates           1    13711 45670
## + pH                  1    13713 45672
## + ResidualSugar       1    13713 45672
## + Density             1    13714 45673
## <none>                     13716 45673
## + INDEX               1    13715 45674
## + FixedAcidity        1    13716 45675
## - AcidIndex           1    14060 46015
## - LabelAppeal         1    14391 46346
## - STARS               4    19767 51716
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45650.9
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity
## 
##                      Df Deviance   AIC
## + TotalSulfurDioxide  1    13676 45637
## + CitricAcid          1    13682 45642
## + Alcohol             1    13684 45645
## + FreeSulfurDioxide   1    13686 45647
## + Chlorides           1    13686 45647
## + Sulphates           1    13688 45649
## + ResidualSugar       1    13689 45650
## + pH                  1    13689 45650
## + Density             1    13690 45651
## <none>                     13692 45651
## + INDEX               1    13692 45653
## + FixedAcidity        1    13692 45653
## - VolatileAcidity     1    13716 45673
## - AcidIndex           1    14029 45986
## - LabelAppeal         1    14364 46321
## - STARS               4    19697 51648
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45637.24
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide
## 
##                      Df Deviance   AIC
## + CitricAcid          1    13666 45629
## + Alcohol             1    13668 45631
## + FreeSulfurDioxide   1    13670 45633
## + Chlorides           1    13670 45633
## + Sulphates           1    13672 45635
## + pH                  1    13674 45637
## + Density             1    13674 45637
## + ResidualSugar       1    13674 45637
## <none>                     13676 45637
## + INDEX               1    13676 45639
## + FixedAcidity        1    13676 45639
## - TotalSulfurDioxide  1    13692 45651
## - VolatileAcidity     1    13698 45657
## - AcidIndex           1    14005 45964
## - LabelAppeal         1    14351 46310
## - STARS               4    19655 51608
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45629.15
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid
## 
##                      Df Deviance   AIC
## + Alcohol             1    13658 45623
## + FreeSulfurDioxide   1    13660 45625
## + Chlorides           1    13661 45626
## + Sulphates           1    13662 45627
## + pH                  1    13664 45628
## + Density             1    13664 45629
## + ResidualSugar       1    13664 45629
## <none>                     13666 45629
## + INDEX               1    13666 45631
## + FixedAcidity        1    13666 45631
## - CitricAcid          1    13676 45637
## - TotalSulfurDioxide  1    13682 45642
## - VolatileAcidity     1    13687 45648
## - AcidIndex           1    14001 45962
## - LabelAppeal         1    14338 46299
## - STARS               4    19624 51579
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45622.72
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol
## 
##                      Df Deviance   AIC
## + FreeSulfurDioxide   1    13652 45619
## + Chlorides           1    13652 45619
## + Sulphates           1    13654 45620
## + ResidualSugar       1    13655 45622
## + pH                  1    13655 45622
## + Density             1    13655 45622
## <none>                     13658 45623
## + INDEX               1    13658 45625
## + FixedAcidity        1    13658 45625
## - Alcohol             1    13666 45629
## - CitricAcid          1    13668 45631
## - TotalSulfurDioxide  1    13674 45637
## - VolatileAcidity     1    13680 45642
## - AcidIndex           1    13988 45951
## - LabelAppeal         1    14332 46295
## - STARS               4    19580 51537
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45618.71
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide
## 
##                      Df Deviance   AIC
## + Chlorides           1    13646 45615
## + Sulphates           1    13648 45617
## + ResidualSugar       1    13649 45618
## + Density             1    13649 45618
## + pH                  1    13649 45618
## <none>                     13652 45619
## + INDEX               1    13652 45621
## + FixedAcidity        1    13652 45621
## - FreeSulfurDioxide   1    13658 45623
## - Alcohol             1    13660 45625
## - CitricAcid          1    13662 45627
## - TotalSulfurDioxide  1    13667 45632
## - VolatileAcidity     1    13673 45638
## - AcidIndex           1    13978 45943
## - LabelAppeal         1    14324 46289
## - STARS               4    19560 51519
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45615.44
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides
## 
##                      Df Deviance   AIC
## + Sulphates           1    13642 45613
## + ResidualSugar       1    13644 45615
## + Density             1    13644 45615
## + pH                  1    13644 45615
## <none>                     13646 45615
## + INDEX               1    13646 45617
## + FixedAcidity        1    13646 45617
## - Chlorides           1    13652 45619
## - FreeSulfurDioxide   1    13652 45619
## - Alcohol             1    13655 45622
## - CitricAcid          1    13656 45623
## - TotalSulfurDioxide  1    13662 45629
## - VolatileAcidity     1    13668 45635
## - AcidIndex           1    13968 45935
## - LabelAppeal         1    14319 46286
## - STARS               4    19546 51507
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45613.42
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates
## 
##                      Df Deviance   AIC
## + ResidualSugar       1    13640 45613
## + Density             1    13640 45613
## + pH                  1    13640 45613
## <none>                     13642 45613
## + INDEX               1    13642 45615
## + FixedAcidity        1    13642 45615
## - Sulphates           1    13646 45615
## - Chlorides           1    13648 45617
## - FreeSulfurDioxide   1    13648 45617
## - Alcohol             1    13651 45620
## - CitricAcid          1    13652 45622
## - TotalSulfurDioxide  1    13658 45627
## - VolatileAcidity     1    13663 45632
## - AcidIndex           1    13962 45931
## - LabelAppeal         1    14315 46284
## - STARS               4    19533 51496
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45613.01
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar
## 
##                      Df Deviance   AIC
## + Density             1    13638 45613
## + pH                  1    13638 45613
## <none>                     13640 45613
## - ResidualSugar       1    13642 45613
## - Sulphates           1    13644 45615
## + INDEX               1    13640 45615
## + FixedAcidity        1    13640 45615
## - Chlorides           1    13645 45616
## - FreeSulfurDioxide   1    13646 45617
## - Alcohol             1    13649 45620
## - CitricAcid          1    13650 45621
## - TotalSulfurDioxide  1    13655 45626
## - VolatileAcidity     1    13661 45632
## - AcidIndex           1    13959 45930
## - LabelAppeal         1    14314 46285
## - STARS               4    19529 51494
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45612.67
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar + Density
## 
##                      Df Deviance   AIC
## + pH                  1    13635 45612
## <none>                     13638 45613
## - Density             1    13640 45613
## - ResidualSugar       1    13640 45613
## - Sulphates           1    13642 45615
## + INDEX               1    13638 45615
## + FixedAcidity        1    13638 45615
## - Chlorides           1    13643 45616
## - FreeSulfurDioxide   1    13644 45617
## - Alcohol             1    13646 45619
## - CitricAcid          1    13648 45621
## - TotalSulfurDioxide  1    13653 45626
## - VolatileAcidity     1    13659 45632
## - AcidIndex           1    13954 45927
## - LabelAppeal         1    14311 46284
## - STARS               4    19521 51488
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## 
## Step:  AIC=45612.36
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar + Density + pH
## 
##                      Df Deviance   AIC
## <none>                     13635 45612
## - pH                  1    13638 45613
## - Density             1    13638 45613
## - ResidualSugar       1    13638 45613
## - Sulphates           1    13639 45614
## + INDEX               1    13635 45614
## + FixedAcidity        1    13635 45614
## - Chlorides           1    13640 45615
## - FreeSulfurDioxide   1    13641 45616
## - Alcohol             1    13644 45619
## - CitricAcid          1    13645 45620
## - TotalSulfurDioxide  1    13651 45626
## - VolatileAcidity     1    13656 45631
## - AcidIndex           1    13954 45929
## - LabelAppeal         1    14310 46285
## - STARS               4    19511 51480
## 
## Call:  glm.nb(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar + Density + pH, data = data, 
##     init.theta = 40828.34765, link = log)
## 
## Coefficients:
##        (Intercept)              STARS2              STARS3  
##           1.428340            0.319746            0.439234  
##             STARS4              STARS5         LabelAppeal  
##           0.556300           -0.766483            0.159204  
##          AcidIndex     VolatileAcidity  TotalSulfurDioxide  
##          -0.079397           -0.033542            0.005287  
##         CitricAcid             Alcohol   FreeSulfurDioxide  
##           0.021508            0.012191            0.003073  
##          Chlorides           Sulphates       ResidualSugar  
##          -0.012047           -0.014395            0.002797  
##            Density                  pH  
##          -0.293281           -0.010107  
## 
## Degrees of Freedom: 12794 Total (i.e. Null);  12778 Residual
## Null Deviance:       22860 
## Residual Deviance: 13640     AIC: 45610
  1. Build Poisson regression model.
#fit the two Poisson models that bound the stepwise search:
#an intercept-only model and a model using every other column of data
linear_model.null <- glm(formula = TARGET ~ 1, family = "poisson", data = data)

linear_model.full <- glm(formula = TARGET ~ ., family = "poisson", data = data)

#stepwise selection in both directions, starting from the intercept-only
#model with the all-predictor model as the upper bound of the search
step(object = linear_model.null,
     scope = list(upper = linear_model.full),
     direction = "both",
     data = data)
## Start:  AIC=54804.92
## TARGET ~ 1
## 
##                      Df Deviance   AIC
## + STARS               4    14691 46643
## + LabelAppeal         1    20868 52814
## + AcidIndex           1    21825 53771
## + VolatileAcidity     1    22749 54695
## + TotalSulfurDioxide  1    22792 54738
## + Alcohol             1    22801 54747
## + FreeSulfurDioxide   1    22811 54757
## + Chlorides           1    22821 54767
## + FixedAcidity        1    22824 54770
## + Sulphates           1    22835 54781
## + CitricAcid          1    22839 54785
## + Density             1    22841 54787
## + ResidualSugar       1    22854 54800
## <none>                     22861 54805
## + pH                  1    22860 54805
## + INDEX               1    22861 54807
## 
## Step:  AIC=46643.21
## TARGET ~ STARS
## 
##                      Df Deviance   AIC
## + LabelAppeal         1    14060 46014
## + AcidIndex           1    14392 46346
## + VolatileAcidity     1    14657 46611
## + TotalSulfurDioxide  1    14668 46622
## + FreeSulfurDioxide   1    14678 46632
## + Chlorides           1    14680 46634
## + Alcohol             1    14683 46637
## + CitricAcid          1    14684 46638
## + Sulphates           1    14685 46639
## + FixedAcidity        1    14686 46640
## + Density             1    14687 46641
## <none>                     14691 46643
## + ResidualSugar       1    14689 46643
## + INDEX               1    14690 46644
## + pH                  1    14691 46645
## - STARS               4    22861 54805
## 
## Step:  AIC=46014.13
## TARGET ~ STARS + LabelAppeal
## 
##                      Df Deviance   AIC
## + AcidIndex           1    13716 45672
## + VolatileAcidity     1    14030 45986
## + TotalSulfurDioxide  1    14034 45990
## + Chlorides           1    14049 46005
## + Alcohol             1    14049 46005
## + FreeSulfurDioxide   1    14049 46005
## + FixedAcidity        1    14054 46009
## + Sulphates           1    14054 46010
## + CitricAcid          1    14055 46011
## + Density             1    14055 46011
## + ResidualSugar       1    14057 46013
## <none>                     14060 46014
## + INDEX               1    14060 46016
## + pH                  1    14060 46016
## - LabelAppeal         1    14691 46643
## - STARS               4    20868 52814
## 
## Step:  AIC=45672.11
## TARGET ~ STARS + LabelAppeal + AcidIndex
## 
##                      Df Deviance   AIC
## + VolatileAcidity     1    13692 45650
## + TotalSulfurDioxide  1    13699 45657
## + CitricAcid          1    13705 45663
## + Alcohol             1    13709 45667
## + FreeSulfurDioxide   1    13709 45667
## + Chlorides           1    13710 45668
## + Sulphates           1    13712 45670
## + pH                  1    13713 45671
## + ResidualSugar       1    13713 45671
## + Density             1    13714 45672
## <none>                     13716 45672
## + INDEX               1    13716 45674
## + FixedAcidity        1    13716 45674
## - AcidIndex           1    14060 46014
## - LabelAppeal         1    14392 46346
## - STARS               4    19768 51716
## 
## Step:  AIC=45650.48
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity
## 
##                      Df Deviance   AIC
## + TotalSulfurDioxide  1    13677 45637
## + CitricAcid          1    13682 45642
## + Alcohol             1    13685 45645
## + FreeSulfurDioxide   1    13686 45646
## + Chlorides           1    13687 45647
## + Sulphates           1    13688 45648
## + ResidualSugar       1    13690 45650
## + pH                  1    13690 45650
## + Density             1    13690 45650
## <none>                     13692 45650
## + INDEX               1    13692 45652
## + FixedAcidity        1    13692 45652
## - VolatileAcidity     1    13716 45672
## - AcidIndex           1    14030 45986
## - LabelAppeal         1    14365 46321
## - STARS               4    19698 51648
## 
## Step:  AIC=45636.82
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide
## 
##                      Df Deviance   AIC
## + CitricAcid          1    13667 45629
## + Alcohol             1    13668 45631
## + FreeSulfurDioxide   1    13671 45633
## + Chlorides           1    13671 45633
## + Sulphates           1    13673 45635
## + pH                  1    13674 45636
## + Density             1    13674 45636
## + ResidualSugar       1    13674 45636
## <none>                     13677 45637
## + INDEX               1    13677 45639
## + FixedAcidity        1    13677 45639
## - TotalSulfurDioxide  1    13692 45650
## - VolatileAcidity     1    13699 45657
## - AcidIndex           1    14006 45964
## - LabelAppeal         1    14351 46309
## - STARS               4    19656 51608
## 
## Step:  AIC=45628.73
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid
## 
##                      Df Deviance   AIC
## + Alcohol             1    13658 45622
## + FreeSulfurDioxide   1    13661 45625
## + Chlorides           1    13661 45625
## + Sulphates           1    13662 45626
## + pH                  1    13664 45628
## + Density             1    13664 45628
## + ResidualSugar       1    13664 45628
## <none>                     13667 45629
## + INDEX               1    13667 45631
## + FixedAcidity        1    13667 45631
## - CitricAcid          1    13677 45637
## - TotalSulfurDioxide  1    13682 45642
## - VolatileAcidity     1    13688 45648
## - AcidIndex           1    14002 45962
## - LabelAppeal         1    14339 46299
## - STARS               4    19625 51579
## 
## Step:  AIC=45622.3
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol
## 
##                      Df Deviance   AIC
## + FreeSulfurDioxide   1    13652 45618
## + Chlorides           1    13653 45619
## + Sulphates           1    13654 45620
## + ResidualSugar       1    13656 45622
## + pH                  1    13656 45622
## + Density             1    13656 45622
## <none>                     13658 45622
## + INDEX               1    13658 45624
## + FixedAcidity        1    13658 45624
## - Alcohol             1    13667 45629
## - CitricAcid          1    13668 45631
## - TotalSulfurDioxide  1    13674 45636
## - VolatileAcidity     1    13680 45642
## - AcidIndex           1    13988 45950
## - LabelAppeal         1    14333 46295
## - STARS               4    19581 51537
## 
## Step:  AIC=45618.29
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide
## 
##                      Df Deviance   AIC
## + Chlorides           1    13647 45615
## + Sulphates           1    13648 45616
## + ResidualSugar       1    13650 45618
## + Density             1    13650 45618
## + pH                  1    13650 45618
## <none>                     13652 45618
## + INDEX               1    13652 45620
## + FixedAcidity        1    13652 45620
## - FreeSulfurDioxide   1    13658 45622
## - Alcohol             1    13661 45625
## - CitricAcid          1    13662 45626
## - TotalSulfurDioxide  1    13668 45632
## - VolatileAcidity     1    13674 45638
## - AcidIndex           1    13979 45943
## - LabelAppeal         1    14325 46289
## - STARS               4    19560 51519
## 
## Step:  AIC=45615.02
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides
## 
##                      Df Deviance   AIC
## + Sulphates           1    13643 45613
## + ResidualSugar       1    13644 45614
## + Density             1    13645 45615
## + pH                  1    13645 45615
## <none>                     13647 45615
## + INDEX               1    13647 45617
## + FixedAcidity        1    13647 45617
## - Chlorides           1    13652 45618
## - FreeSulfurDioxide   1    13653 45619
## - Alcohol             1    13656 45622
## - CitricAcid          1    13657 45623
## - TotalSulfurDioxide  1    13663 45629
## - VolatileAcidity     1    13668 45634
## - AcidIndex           1    13968 45934
## - LabelAppeal         1    14319 46285
## - STARS               4    19547 51507
## 
## Step:  AIC=45613.01
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates
## 
##                      Df Deviance   AIC
## + ResidualSugar       1    13641 45613
## + Density             1    13641 45613
## + pH                  1    13641 45613
## <none>                     13643 45613
## + INDEX               1    13643 45615
## + FixedAcidity        1    13643 45615
## - Sulphates           1    13647 45615
## - Chlorides           1    13648 45616
## - FreeSulfurDioxide   1    13649 45617
## - Alcohol             1    13651 45619
## - CitricAcid          1    13653 45621
## - TotalSulfurDioxide  1    13658 45626
## - VolatileAcidity     1    13664 45632
## - AcidIndex           1    13962 45931
## - LabelAppeal         1    14316 46284
## - STARS               4    19534 51496
## 
## Step:  AIC=45612.59
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar
## 
##                      Df Deviance   AIC
## + Density             1    13638 45612
## + pH                  1    13638 45612
## <none>                     13641 45613
## - ResidualSugar       1    13643 45613
## - Sulphates           1    13644 45614
## + INDEX               1    13640 45615
## + FixedAcidity        1    13641 45615
## - Chlorides           1    13646 45616
## - FreeSulfurDioxide   1    13646 45616
## - Alcohol             1    13649 45619
## - CitricAcid          1    13650 45621
## - TotalSulfurDioxide  1    13656 45626
## - VolatileAcidity     1    13662 45632
## - AcidIndex           1    13960 45930
## - LabelAppeal         1    14314 46284
## - STARS               4    19530 51494
## 
## Step:  AIC=45612.26
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar + Density
## 
##                      Df Deviance   AIC
## + pH                  1    13636 45612
## <none>                     13638 45612
## - Density             1    13641 45613
## - ResidualSugar       1    13641 45613
## - Sulphates           1    13642 45614
## + INDEX               1    13638 45614
## + FixedAcidity        1    13638 45614
## - Chlorides           1    13643 45615
## - FreeSulfurDioxide   1    13644 45616
## - Alcohol             1    13647 45619
## - CitricAcid          1    13648 45620
## - TotalSulfurDioxide  1    13654 45626
## - VolatileAcidity     1    13660 45631
## - AcidIndex           1    13954 45927
## - LabelAppeal         1    14312 46284
## - STARS               4    19522 51488
## 
## Step:  AIC=45611.94
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar + Density + pH
## 
##                      Df Deviance   AIC
## <none>                     13636 45612
## - pH                  1    13638 45612
## - Density             1    13638 45612
## - ResidualSugar       1    13638 45612
## - Sulphates           1    13640 45614
## + INDEX               1    13636 45614
## + FixedAcidity        1    13636 45614
## - Chlorides           1    13641 45615
## - FreeSulfurDioxide   1    13642 45616
## - Alcohol             1    13644 45618
## - CitricAcid          1    13646 45620
## - TotalSulfurDioxide  1    13651 45625
## - VolatileAcidity     1    13657 45631
## - AcidIndex           1    13954 45928
## - LabelAppeal         1    14311 46285
## - STARS               4    19512 51480
## 
## Call:  glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide + 
##     Chlorides + Sulphates + ResidualSugar + Density + pH, family = "poisson", 
##     data = data)
## 
## Coefficients:
##        (Intercept)              STARS2              STARS3  
##           1.428325            0.319746            0.439234  
##             STARS4              STARS5         LabelAppeal  
##           0.556298           -0.766485            0.159205  
##          AcidIndex     VolatileAcidity  TotalSulfurDioxide  
##          -0.079394           -0.033540            0.005287  
##         CitricAcid             Alcohol   FreeSulfurDioxide  
##           0.021507            0.012192            0.003073  
##          Chlorides           Sulphates       ResidualSugar  
##          -0.012046           -0.014394            0.002797  
##            Density                  pH  
##          -0.293275           -0.010107  
## 
## Degrees of Freedom: 12794 Total (i.e. Null);  12778 Residual
## Null Deviance:       22860 
## Residual Deviance: 13640     AIC: 45610
  1. SELECT MODELS

Select optimal model.

#optimal model: Gaussian GLM of TARGET on the predictors kept for the final fit
#NOTE(review): ResidualSugar appears in the stepwise Gaussian formula printed
#earlier but is not included here - confirm the omission is intentional
final_model <- glm(
  formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
    TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide +
    Sulphates + Density + pH,
  family = "gaussian",
  data = data
)
summary(final_model)
## 
## Call:
## glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide + 
##     Sulphates + Density + pH, family = "gaussian", data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -4.7293  -0.8604   0.0184   0.8488   6.2032  
## 
## Coefficients:
##                     Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.770114   0.112361  33.554  < 2e-16 ***
## STARS2              1.024907   0.032559  31.478  < 2e-16 ***
## STARS3              1.598058   0.037628  42.469  < 2e-16 ***
## STARS4              2.293681   0.059599  38.485  < 2e-16 ***
## STARS5             -1.361671   0.032894 -41.396  < 2e-16 ***
## LabelAppeal         0.466342   0.013618  34.245  < 2e-16 ***
## AcidIndex          -0.198204   0.008986 -22.056  < 2e-16 ***
## VolatileAcidity    -0.102730   0.016552  -6.206 5.59e-10 ***
## TotalSulfurDioxide  0.015431   0.002966   5.202 2.00e-07 ***
## CitricAcid          0.065437   0.015394   4.251 2.15e-05 ***
## Alcohol             0.039553   0.009412   4.202 2.66e-05 ***
## Chlorides          -0.034384   0.012359  -2.782  0.00541 ** 
## FreeSulfurDioxide   0.008893   0.002848   3.122  0.00180 ** 
## Sulphates          -0.039273   0.016831  -2.333  0.01965 *  
## Density            -0.833203   0.436183  -1.910  0.05613 .  
## pH                 -0.024296   0.014984  -1.621  0.10495    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.70242)
## 
##     Null deviance: 47477  on 12794  degrees of freedom
## Residual deviance: 21755  on 12779  degrees of freedom
## AIC: 43136
## 
## Number of Fisher Scoring iterations: 2

Perform residual analysis.

#linearity: residuals plotted against the observed TARGET values
plot(final_model$residuals ~ data$TARGET,
     main="Residuals vs TARGET")
abline(h = 0, lty = 3)  #dotted horizontal reference line at y = 0

#normal residuals: histogram with density overlay beside a normal Q-Q plot
#par() returns the previous settings, kept so the 1x2 layout can be undone below
old_par <- par(mfrow = c(1, 2))
hist(final_model$residuals, probability = TRUE, col = "gray", border = "white",
     main = "Distribution of residuals")
d <- density(final_model$residuals)
lines(d, col = "red")

#normal probability plot
qqnorm(final_model$residuals)
qqline(final_model$residuals)

#restore the layout so the side-by-side grid does not leak into later plots
par(old_par)

#constant variability: built-in diagnostic plots for the fitted model
plot(final_model)

Run likelihood ratio test.

#alternative (reduced) model: same terms as final_model except
#Sulphates, Density and pH; family is left at glm()'s default
alternative_model <- glm(
  formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
    TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide,
  data = data
)

#likelihood ratio test: analysis-of-deviance comparison of the two fits
anova(final_model, alternative_model, test = "Chisq")
## Analysis of Deviance Table
## 
## Model 1: TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide + 
##     Sulphates + Density + pH
## Model 2: TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity + 
##     TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)   
## 1     12779      21755                        
## 2     12782      21776 -3  -20.428 0.007386 **
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Analyze deviance.

#residual deviance test: upper-tail chi-square probability of the model's
#residual deviance on its residual degrees of freedom
p_value <- 1 - pchisq(deviance(final_model), df.residual(final_model))
print(p_value)
## [1] 0

Hosmer-Lemeshow Test.

#Hosmer-Lemeshow goodness-of-fit test of observed TARGET against the fitted values
#NOTE(review): this test is normally used with binary-outcome (logistic) models;
#confirm it is meaningful for this Gaussian fit
hoslem.test(data$TARGET, fitted(final_model))
## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  data$TARGET, fitted(final_model)
## X-squared = 354.74, df = 8, p-value < 2.2e-16

Calculate predicted sales for the training and testing data sets.

#hard-coded coefficients used to score both data sets, listed in the order
#in which the terms are summed
#NOTE(review): these values differ from summary(final_model) (e.g. intercept
#3.778249 here vs 3.770114 there) and there is no Density or pH term; they
#appear to come from a different fit - confirm, or switch to
#predict(final_model, newdata = ...)
target_coefs <- c(
  "(Intercept)"      =  3.778249,
  STARS_2            =  1.025474,
  STARS_3            =  1.598549,
  STARS_4            =  2.293281,
  STARS_5            = -1.361213,
  LabelAppeal        =  0.465884,
  AcidIndex          = -0.197816,
  VolatileAcidity    = -0.103156,
  TotalSulfurDioxide =  0.014510,
  CitricAcid         =  0.065114,
  Alcohol            =  0.037178,
  Chlorides          = -0.039618,
  FreeSulfurDioxide  =  0.009917,
  Sulphates          = -0.036880
)

#add the STARS dummy variables and the rounded prediction 'TARGET_pred'
#  df:    data frame holding STARS and every predictor named in 'coefs'
#  coefs: named numeric vector; the first element is the intercept, the
#         remaining names are columns of 'df' (after the dummies are added)
#returns 'df' with the columns STARS_2..STARS_5 and TARGET_pred added
add_target_pred <- function(df, coefs = target_coefs) {
  #dummy variables: 1 when STARS equals the level, 0 otherwise (NA stays NA)
  for (lvl in 2:5) {
    df[[paste0("STARS_", lvl)]] <- ifelse(df$STARS == as.character(lvl), 1, 0)
  }

  #fail early with a clear message instead of a length-mismatch error later
  predictors <- names(coefs)[-1]
  missing_cols <- setdiff(predictors, names(df))
  if (length(missing_cols) > 0) {
    stop("missing predictor column(s): ",
         paste(missing_cols, collapse = ", "), call. = FALSE)
  }

  #accumulate intercept + coefficient * column, term by term in listed order
  pred <- coefs[[1]]
  for (term in predictors) {
    pred <- pred + coefs[[term]] * df[[term]]
  }

  df$TARGET_pred <- round(pred, 0)
  df
}

#calculate 'TARGET_pred' for the training and testing data sets
data <- add_target_pred(data)
data_testing <- add_target_pred(data_testing)


#plot actual TARGET (y axis) against the model's fitted values (x axis)
plot(predict(final_model), data$TARGET,
     xlab="Predicted", ylab="Actual",
     main="TARGET. Actual vs Predicted")
abline(a=0,b=1,col="red")  #reference line y = x

#significance of difference: two-sample Kolmogorov-Smirnov test comparing the
#distribution of actual TARGET with that of the rounded predictions TARGET_pred
ks.test(data$TARGET,data$TARGET_pred)
## Warning in ks.test(data$TARGET, data$TARGET_pred): p-value will be
## approximate in the presence of ties
## 
##  Two-sample Kolmogorov-Smirnov test
## 
## data:  data$TARGET and data$TARGET_pred
## D = 0.22415, p-value < 2.2e-16
## alternative hypothesis: two-sided
#export testing data with predicted sales as a comma-separated file
#(write.table's defaults would write a space-delimited file with an unnamed
#row-name column, which does not match the .csv extension)
write.csv(data_testing, file = "/Users/olga/downloads/wine-evaluation-data.csv",
          row.names = FALSE)