suppressMessages(suppressWarnings(library(knitr)))

suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(kableExtra)))
suppressMessages(suppressWarnings(library(formattable)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(stringr)))

#suppressMessages(suppressWarnings(library(alr3)))
suppressMessages(suppressWarnings(library(car))) #for residualPlots and mmps

suppressMessages(suppressWarnings(library(caret)))
suppressMessages(suppressWarnings(library(pROC)))

suppressMessages(suppressWarnings(library(rms))) # rms ols lrm


suppressMessages(suppressWarnings(library(AER)))
suppressMessages(suppressWarnings(library(pscl)))
suppressMessages(suppressWarnings(library(DHARMa)))
suppressMessages(suppressWarnings(library(faraway)))

suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(broom)))
DO_PERFORM_STEPS <- TRUE

suppressMessages(suppressWarnings(library(plotly)))
suppressMessages(suppressWarnings(library(stringr)))
suppressMessages(suppressWarnings(library(sandwich)))
suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(ROCR)))
suppressMessages(suppressWarnings(library(pscl)))
suppressMessages(suppressWarnings(library(boot)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(plyr)))
suppressMessages(suppressWarnings(library(psych)))

suppressMessages(suppressWarnings(library(kableExtra)))
wine_description <- read.csv("https://raw.githubusercontent.com/YunMai-SPS/DATA621_homework/master/data621_assignment5/wine_data_description.csv")

kable(wine_description,'html')%>% 
  kable_styling(bootstrap_options = c('bordered',full_width = T),font_size = 21) %>%
  column_spec(1, width = "3em",color='black') %>% 
  column_spec(2, width = "30em",color='black') %>% 
  column_spec(3, width = "30em",color='black')
VARIABLE.NAME DEFINITION THEORETICAL.EFFECT
INDEX Identification Variable (do not use) None
TARGET Number of Cases Purchased None
AcidIndex Proprietary method of testing total acidity of wine by using a weighted average
Alcohol Alcohol Content
Chlorides Chloride content of wine
CitricAcid Citric Acid Content
Density Density of Wine
FixedAcidity Fixed Acidity of Wine
FreeSulfurDioxide Sulfur Dioxide content of wine
LabelAppeal Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customes don’t like the design. Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales.
ResidualSugar Residual Sugar of wine
STARS Wine rating by a team of experts. 4 Stars = Excellent, 1 Star = Poor A high number of stars suggests high sales
Sulphates Sulfate conten of wine
TotalSulfurDioxide Total Sulfur Dioxide of Wine
VolatileAcidity Volatile Acid content of wine
pH pH of wine

1. DATA EXPLORATION (25 Points)

Specification:

Describe the size and the variables in the wine training data set. Consider that too much detail will cause a manager to lose interest while too little detail will make the manager consider that you aren’t doing your job. Some suggestions are given below. Please do NOT treat this as a check list of things to do to complete the assignment. You should have your own thoughts on what to tell the boss. These are just ideas. a. Mean / Standard Deviation / Median b. Bar Chart or Box Plot of the data c. Is the data correlated to the target variable (or to other variables?) d. Are any of the variables missing and need to be imputed “fixed”?

wine_train <- read.csv('https://raw.githubusercontent.com/YunMai-SPS/DATA621_homework/master/data621_assignment5/wine-training-data.csv')
colnames(wine_train)[1] <- 'INDEX'

wine_test <- read.csv('https://raw.githubusercontent.com/YunMai-SPS/DATA621_homework/master/data621_assignment5/wine-evaluation-data.csv')

kable(head(wine_train,10),'html') %>% 
  kable_styling(bootstrap_options = c('bordered','hover','condensed',full_width=F))
INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol LabelAppeal AcidIndex STARS
1 3 3.2 1.160 -0.98 54.20 -0.567 NA 268 0.99280 3.33 -0.59 9.9 0 8 2
2 3 4.5 0.160 -0.81 26.10 -0.425 15 -327 1.02792 3.38 0.70 NA -1 7 3
4 5 7.1 2.640 -0.88 14.80 0.037 214 142 0.99518 3.12 0.48 22.0 -1 8 3
5 3 5.7 0.385 0.04 18.80 -0.425 22 115 0.99640 2.24 1.83 6.2 -1 6 1
6 4 8.0 0.330 -1.26 9.40 NA -167 108 0.99457 3.12 1.77 13.7 0 9 2
7 0 11.3 0.320 0.59 2.20 0.556 -37 15 0.99940 3.20 1.29 15.4 0 11 NA
8 0 7.7 0.290 -0.40 21.50 0.060 287 156 0.99572 3.49 1.21 10.3 0 8 NA
11 4 6.5 -1.220 0.34 1.40 0.040 523 551 1.03236 3.20 NA 11.6 1 7 3
12 3 14.8 0.270 1.05 11.25 -0.007 -213 NA 0.99620 4.93 0.26 15.0 0 6 NA
13 6 5.5 -0.220 0.39 1.80 -0.277 62 180 0.94724 3.09 0.75 12.6 0 8 4
wine_training_data <- read.csv('https://raw.githubusercontent.com/chirag-vithlani/Business_Analytics_and_Data_Mining_DATA_621/master/Homework5/data/wine-training-data.csv', stringsAsFactors = FALSE)
intro <- read.csv('https://raw.githubusercontent.com/chirag-vithlani/Business_Analytics_and_Data_Mining_DATA_621/master/Homework5/data/wine_data_intro.csv', stringsAsFactors = FALSE)
names(intro)[2]<-"Description"
kable(intro, "html") %>%
  kable_styling(bootstrap_options = "striped", full_width = F)
Variable Description
Index Identification Variable (do not use)
Target Number of Cases Purchased
AcidIndex Proprietary method of testing total acidity of wine by using a weighted average
Alcohol Alcohol Content
Chlorides Chloride content of wine
CitricAcid Citric Acid Content
Density Density of Wine
FixedAcidity Fixed Acidity of Wine
FreeSulfurDioxide Sulfur Dioxide content of wine
LabelAppeal Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customes don’t like the design.Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales.
ResidualSugar Residual Sugar of wine
STARS Wine rating by a team of experts. 4 Stars = Excellent. 1 Star = Poor. A high number of stars suggests high sales
Sulphates Sulfate content of wine
TotalSulfurDioxide Total Sulfur Dioxide of Wine
VolatileAcidity Volatile Acid content of wine
pH pH of wine
do_factors <- function(wine_object)
{
  wine_object <- within(wine_object, {
      LabelAppeal <- factor(LabelAppeal)
      AcidIndex <- factor(AcidIndex)
      STARS <- factor(STARS)
  })
  return (wine_object)
}
colnames(wine_training_data)[1]<-"INDEX"

Objective here is to do analysis of chemical properties of wine data to maximize sales. So here independent varialbes are chemical properties of wines and its star rating. Dependent variable is number of sample cases. More samples are there, more that wine sold.

There are 12795 number of rows.

summary(wine_train)
##      INDEX           TARGET       FixedAcidity     VolatileAcidity  
##  Min.   :    1   Min.   :0.000   Min.   :-18.100   Min.   :-2.7900  
##  1st Qu.: 4038   1st Qu.:2.000   1st Qu.:  5.200   1st Qu.: 0.1300  
##  Median : 8110   Median :3.000   Median :  6.900   Median : 0.2800  
##  Mean   : 8070   Mean   :3.029   Mean   :  7.076   Mean   : 0.3241  
##  3rd Qu.:12106   3rd Qu.:4.000   3rd Qu.:  9.500   3rd Qu.: 0.6400  
##  Max.   :16129   Max.   :8.000   Max.   : 34.400   Max.   : 3.6800  
##                                                                     
##    CitricAcid      ResidualSugar        Chlorides       FreeSulfurDioxide
##  Min.   :-3.2400   Min.   :-127.800   Min.   :-1.1710   Min.   :-555.00  
##  1st Qu.: 0.0300   1st Qu.:  -2.000   1st Qu.:-0.0310   1st Qu.:   0.00  
##  Median : 0.3100   Median :   3.900   Median : 0.0460   Median :  30.00  
##  Mean   : 0.3084   Mean   :   5.419   Mean   : 0.0548   Mean   :  30.85  
##  3rd Qu.: 0.5800   3rd Qu.:  15.900   3rd Qu.: 0.1530   3rd Qu.:  70.00  
##  Max.   : 3.8600   Max.   : 141.150   Max.   : 1.3510   Max.   : 623.00  
##                    NA's   :616        NA's   :638       NA's   :647      
##  TotalSulfurDioxide    Density             pH          Sulphates      
##  Min.   :-823.0     Min.   :0.8881   Min.   :0.480   Min.   :-3.1300  
##  1st Qu.:  27.0     1st Qu.:0.9877   1st Qu.:2.960   1st Qu.: 0.2800  
##  Median : 123.0     Median :0.9945   Median :3.200   Median : 0.5000  
##  Mean   : 120.7     Mean   :0.9942   Mean   :3.208   Mean   : 0.5271  
##  3rd Qu.: 208.0     3rd Qu.:1.0005   3rd Qu.:3.470   3rd Qu.: 0.8600  
##  Max.   :1057.0     Max.   :1.0992   Max.   :6.130   Max.   : 4.2400  
##  NA's   :682                         NA's   :395     NA's   :1210     
##     Alcohol       LabelAppeal          AcidIndex          STARS      
##  Min.   :-4.70   Min.   :-2.000000   Min.   : 4.000   Min.   :1.000  
##  1st Qu.: 9.00   1st Qu.:-1.000000   1st Qu.: 7.000   1st Qu.:1.000  
##  Median :10.40   Median : 0.000000   Median : 8.000   Median :2.000  
##  Mean   :10.49   Mean   :-0.009066   Mean   : 7.773   Mean   :2.042  
##  3rd Qu.:12.40   3rd Qu.: 1.000000   3rd Qu.: 8.000   3rd Qu.:3.000  
##  Max.   :26.50   Max.   : 2.000000   Max.   :17.000   Max.   :4.000  
##  NA's   :653                                          NA's   :3359

Things we can note from summary.

1.1 Descriptive statistics for TARGET

table(wine_train$TARGET)
## 
##    0    1    2    3    4    5    6    7    8 
## 2734  244 1091 2611 3177 2014  765  142   17

1.2 Plot the numerical data and the categorical data separately

Statistic of the numerical data

suppressMessages(suppressWarnings(library(pastecs)))
options(scipen = 100)
options(digits = 3)

# numeric variables
num_var <- wine_train[-which(names(wine_train) %in% c('INDEX','TARGET','LabelAppeal','AcidIndex','STARS'))]

# categorical variables
cat_var <- wine_train[,-1]
cat_var <- cat_var[,-which(names(cat_var)%in% names(num_var))]

** Histagram and boxplot of numeric independent variables:**

par(mfrow=c(4,4),oma=c(1,1,0,0), mar=c(2,1,1,0)+.1, tcl=-0.01, mgp=c(5,1,0),pin=c(1.4,0.4))
for(i in 1:length(num_var)){
  hist(num_var[,i], probability = T, xlab = '', main = colnames(num_var)[i])
  d <- density(num_var[,i],na.rm= T)
  lines(d,col = 'red')
}

par(mfrow=c(1,4),oma=c(1,1,0,0), mar=c(2,1,1,0)+.1, tcl=-0.01, mgp=c(5,1,0),pin=c(1.4,0.4))
for(i in 1:length(cat_var)){
  hist(cat_var[,i],xlab = '', main = colnames(cat_var)[i])
}

par(mfrow=c(3,5),oma=c(1,1,1,1), mar=c(1, 3, 1.5, 2)+.1, tcl=-0.01, mgp=c(5,1,0))
for(i in 2:length(wine_train)){
  boxplot(wine_train[,i],main = colnames(wine_train)[i])
}

A basic analysis:

  • Our histograms show normally distributed variables
  • Our Box-Plots seem to have large amounts of instances outside of the the upper and lower whiskers. We will examine this further.
  • The dependent variable is “TARGET”, which is the number of cases purchased

** Categorical independent variables:**

cat_var <- as.data.frame(lapply(cat_var,as.factor))

# conditional densities plots
par(mfrow=c(1,3),mar=c(5,2.0,1,3.0))
for(i in 2:length(cat_var)){
  par(las=2,mgp=c(8,1,0.5)) #mgp=c(10,1.5,1), mar=c(12,10,1,10),
  cdplot(cat_var$TARGET ~ cat_var[,i], data=cat_var ,xlab=NULL, ylab="TARGET")
  title(names(cat_var)[i], adj = 0.5, line = 0.3, cex.main=0.9)
}

1.3 Missing values

1.3.1. The missing data for variables:

pMiss <- function(x){sum(is.na(x))/length(x)*100}
miss_feature <- as.data.frame(apply(wine_train[,-1],2,pMiss))
miss_feature_df <- miss_feature
miss_feature_df$predictors <- as.numeric(t(stat.desc(wine_train[,-1])[3,]))
miss_feature_df$missing_values <- rownames(miss_feature)
miss_feature_df <- miss_feature_df[,3:1]
colnames(miss_feature_df)<- c('variable_name','missing_values', 'missing_values(%)')

#kable(miss_feature_df)
kable(filter(miss_feature_df, miss_feature_df[,'missing_values(%)']!=0 ), "html") %>%
  kable_styling(bootstrap_options = c("striped", "bordered", "hover", "condensed"),full_width = F)
variable_name missing_values missing_values(%)
ResidualSugar 616 4.81
Chlorides 638 4.99
FreeSulfurDioxide 647 5.06
TotalSulfurDioxide 682 5.33
pH 395 3.09
Sulphates 1210 9.46
Alcohol 653 5.10
STARS 3359 26.25
#cat("The variables missed more than 5% datapoints are:",paste(rownames(miss_feature)[which(miss_feature > 5)],collapse = ',')) cat("\n")

(filter(miss_feature_df, miss_feature_df[,'missing_values(%)']!=0 )[,1])
## [1] "ResidualSugar"      "Chlorides"          "FreeSulfurDioxide" 
## [4] "TotalSulfurDioxide" "pH"                 "Sulphates"         
## [7] "Alcohol"            "STARS"

The variables missed more than 5% datapoints are: FreeSulfurDioxide,TotalSulfurDioxide,Sulphates,Alcohol,STARS

STAR is a concern because 26% 0f the values are missing. The the missing data will be Imputated.

1.3.2. The missing data for cases:

row_miss <- data.frame(apply(wine_train, 1, function(x) sum(is.na(x))))
colnames(row_miss) <- 'NAs'
row_miss$NA_percent <- row_miss$NAs/dim(wine_train)[2]
#kable(row_miss[-which(row_miss==0),])
#kable(row_miss[which(row_miss$NA_percent>0.05),])
table(row_miss[which(row_miss$NA_percent>0.05),])
##    NA_percent
## NAs 0.0625 0.125 0.1875 0.25
##   1   4780     0      0    0
##   2      0  1342      0    0
##   3      0     0    212    0
##   4      0     0      0   25

There are 4780 cases miss 6.25% values, 1342 cases miss 12.5%, 212 cases misses 18.75% values and 25 cases misses 25% values.

To preserve the information and meaning of the non-missing data, I will do the imputation for the missing values.

1.3.3. Explore NA’s through Missingness Map

The below table shows a summary of the NA values in the data. Only STARS had an NA frequency higher than 10%, so this was a concern. All NA values were thus replaced with samples from their respective collections, except for STARS, which required further analysis.

not_na_count <- sapply(wine_training_data, function(y) sum(length(which(!is.na(y)))))
na_count <- sapply(wine_training_data, function(y) sum(length(which(is.na(y)))))
na_pct <- na_count / (na_count + not_na_count)

na_summary_df <- data.frame(not_na_count,na_count,na_pct)
#Missingness Map
missmap(wine_training_data, main = "Missing Values Before Replacement" ,col=c("gray90", "black"),legend = F)

kable(na_summary_df)
not_na_count na_count na_pct
INDEX 12795 0 0.000
TARGET 12795 0 0.000
FixedAcidity 12795 0 0.000
VolatileAcidity 12795 0 0.000
CitricAcid 12795 0 0.000
ResidualSugar 12179 616 0.048
Chlorides 12157 638 0.050
FreeSulfurDioxide 12148 647 0.051
TotalSulfurDioxide 12113 682 0.053
Density 12795 0 0.000
pH 12400 395 0.031
Sulphates 11585 1210 0.095
Alcohol 12142 653 0.051
LabelAppeal 12795 0 0.000
AcidIndex 12795 0 0.000
STARS 9436 3359 0.263
wine_training_data$ResidualSugar[is.na(wine_training_data$ResidualSugar)] <- sample(wine_training_data$ResidualSugar[!is.na(wine_training_data$ResidualSugar)])
wine_training_data$Chlorides[is.na(wine_training_data$Chlorides)] <- sample(wine_training_data$Chlorides[!is.na(wine_training_data$Chlorides)])
wine_training_data$FreeSulfurDioxide[is.na(wine_training_data$FreeSulfurDioxide)] <- sample(wine_training_data$FreeSulfurDioxide[!is.na(wine_training_data$FreeSulfurDioxide)])
wine_training_data$TotalSulfurDioxide[is.na(wine_training_data$TotalSulfurDioxide)] <- sample(wine_training_data$TotalSulfurDioxide[!is.na(wine_training_data$TotalSulfurDioxide)])
wine_training_data$pH[is.na(wine_training_data$pH)] <- sample(wine_training_data$pH[!is.na(wine_training_data$pH)])
wine_training_data$Sulphates[is.na(wine_training_data$Sulphates)] <- sample(wine_training_data$Sulphates[!is.na(wine_training_data$Sulphates)])
wine_training_data$Alcohol[is.na(wine_training_data$Alcohol)] <- sample(wine_training_data$Alcohol[!is.na(wine_training_data$Alcohol)])
#Missingness Map
missmap(wine_training_data, main = "Missing Values After Replacement (Except STARS variable) " ,col=c("gray90", "black"),legend = F)

2. DATA PREPARATION

Specification:

Describe how you have transformed the data by changing the original variables or creating new variables. If you did transform the data or create new variables, discuss why you did this. Here are some possible transformations.

  1. Fix missing values (maybe with a Mean or Median value)

  2. Create flags to suggest if a variable was missing

  3. Transform data by putting it into buckets

  4. Mathematical transforms such as log or square root (or use Box-Cox)

  5. Combine variables (such as ratios or adding or multiplying) to create new variables

2.1 Imputation

2.1.1 Missingness pattern test

Before deciding how to impute the data, we will need to examine the missing data patterns. A visualization of the missing values in variables shown as follows:

suppressMessages(suppressWarnings(library (rpart)))
                                                                             suppressMessages(suppressWarnings(library (Hmisc)))
na.patterns <- naclus(wine_train)

naplot ( na.patterns , 'na per var')

#plot ( who.na , margin = .1 ) 
#text ( who.na,use.n=TRUE, all=TRUE, cex=.7 ) #too many text
                                                                               
plot ( na.patterns )

Upper panel shows the fraction of observations missing on each predictor. Lower panel depicts a hierarchical cluster analysis of missingness combinations. The similarity measure shown on the Y -axis is the fraction of observations for which both variables are missing.

MACR test

Before deciding how to impute the data, we will need to examine the missing data patterns. A visualization of the missing values in variables shown as follows:

suppressMessages(suppressWarnings(library(MissMech)))
out <- TestMCARNormality(data = wine_train[,c("ResidualSugar","Chlorides", "FreeSulfurDioxide","TotalSulfurDioxide","pH","Sulphates","Alcohol","STARS" )])
out
## Call:
## TestMCARNormality(data = wine_train[, c("ResidualSugar", "Chlorides", 
##     "FreeSulfurDioxide", "TotalSulfurDioxide", "pH", "Sulphates", 
##     "Alcohol", "STARS")])
## 
## Number of Patterns:  49 
## 
## Total number of cases used in the analysis:  12698 
## 
##  Pattern(s) used:
##            ResidualSugar   Chlorides   FreeSulfurDioxide
## group.1                1           1                  NA
## group.2                1           1                   1
## group.3                1           1                   1
## group.4                1          NA                   1
## group.5                1           1                   1
## group.6                1           1                   1
## group.7                1           1                   1
## group.8                1           1                   1
## group.9                1           1                   1
## group.10              NA           1                   1
## group.11               1           1                   1
## group.12              NA           1                   1
## group.13               1           1                   1
## group.14              NA           1                   1
## group.15              NA          NA                   1
## group.16               1           1                   1
## group.17               1           1                  NA
## group.18               1           1                   1
## group.19              NA           1                   1
## group.20               1           1                   1
## group.21               1           1                  NA
## group.22              NA           1                   1
## group.23              NA           1                   1
## group.24               1           1                  NA
## group.25               1          NA                   1
## group.26               1           1                   1
## group.27               1           1                  NA
## group.28               1           1                  NA
## group.29              NA           1                   1
## group.30               1           1                   1
## group.31               1           1                   1
## group.32               1           1                   1
## group.33               1          NA                   1
## group.34               1           1                   1
## group.35               1          NA                  NA
## group.36               1          NA                   1
## group.37               1          NA                   1
## group.38               1           1                   1
## group.39              NA          NA                   1
## group.40               1           1                   1
## group.41              NA           1                  NA
## group.42               1          NA                   1
## group.43               1           1                  NA
## group.44               1           1                   1
## group.45               1           1                  NA
## group.46               1          NA                   1
## group.47              NA           1                   1
## group.48               1           1                  NA
## group.49               1          NA                  NA
##            TotalSulfurDioxide   pH   Sulphates   Alcohol   STARS
## group.1                     1    1           1         1       1
## group.2                     1    1           1        NA       1
## group.3                     1    1           1         1       1
## group.4                     1    1           1         1       1
## group.5                     1    1           1         1      NA
## group.6                     1    1          NA         1       1
## group.7                    NA    1           1         1      NA
## group.8                    NA    1           1         1       1
## group.9                     1    1          NA         1      NA
## group.10                    1    1           1         1       1
## group.11                    1   NA           1         1       1
## group.12                    1    1           1        NA       1
## group.13                    1   NA           1         1      NA
## group.14                   NA    1           1         1       1
## group.15                    1    1           1         1       1
## group.16                    1    1           1        NA      NA
## group.17                    1    1          NA         1      NA
## group.18                    1    1          NA        NA       1
## group.19                    1    1           1         1      NA
## group.20                    1   NA          NA         1      NA
## group.21                    1    1           1         1      NA
## group.22                    1    1          NA         1       1
## group.23                    1    1          NA         1      NA
## group.24                    1   NA           1         1       1
## group.25                    1    1           1         1      NA
## group.26                   NA    1          NA         1       1
## group.27                   NA    1           1         1      NA
## group.28                    1    1          NA         1       1
## group.29                    1    1           1        NA      NA
## group.30                   NA   NA           1         1       1
## group.31                    1   NA           1        NA       1
## group.32                   NA    1          NA         1      NA
## group.33                   NA    1           1         1       1
## group.34                    1   NA          NA         1       1
## group.35                    1    1           1         1       1
## group.36                    1    1           1        NA       1
## group.37                    1    1          NA         1       1
## group.38                   NA    1           1        NA       1
## group.39                    1    1           1         1      NA
## group.40                   NA    1           1        NA      NA
## group.41                    1    1           1         1       1
## group.42                    1    1          NA         1      NA
## group.43                   NA    1           1         1       1
## group.44                    1    1          NA        NA      NA
## group.45                    1    1           1        NA       1
## group.46                    1    1           1        NA      NA
## group.47                    1   NA           1         1       1
## group.48                    1    1           1        NA      NA
## group.49                    1    1           1         1      NA
##            Number of cases
## group.1                338
## group.2                335
## group.3               6436
## group.4                350
## group.5               2239
## group.6                669
## group.7                123
## group.8                341
## group.9                247
## group.10               311
## group.11               197
## group.12                21
## group.13                81
## group.14                22
## group.15                25
## group.16               123
## group.17                14
## group.18                37
## group.19               108
## group.20                11
## group.21               124
## group.22                33
## group.23                15
## group.24                10
## group.25               113
## group.26                42
## group.27                16
## group.28                26
## group.29                 8
## group.30                13
## group.31                14
## group.32                16
## group.33                20
## group.34                22
## group.35                13
## group.36                17
## group.37                22
## group.38                21
## group.39                10
## group.40                 8
## group.41                17
## group.42                14
## group.43                19
## group.44                10
## group.45                15
## group.46                 7
## group.47                 9
## group.48                 7
## group.49                 9
## 
## 
##     Test of normality and Homoscedasticity:
##   -------------------------------------------
## 
## Hawkins Test:
## 
##     P-value for the Hawkins test of normality and homoscedasticity:  0 
## 
##     Either the test of multivariate normality or homoscedasticity (or both) is rejected.
##     Provided that normality can be assumed, the hypothesis of MCAR is 
##     rejected at 0.05 significance level. 
## 
## Non-Parametric Test:
## 
##     P-value for the non-parametric test of homoscedasticity:  0.752 
## 
##     Reject Normality at 0.05 significance level.
##     There is not sufficient evidence to reject MCAR at 0.05 significance level.

Hawkins Test:

P-value for the Hawkins test of normality and homoscedasticity:  0 

Either the test of multivariate normality or homoscedasticity (or both) is rejected.
Provided that normality can be assumed, the hypothesis of MCAR is 
rejected at 0.05 significance level. 

Non-Parametric Test:

P-value for the non-parametric test of homoscedasticity:  0.752 

Reject Normality at 0.05 significance level.
There is not sufficient evidence to reject MCAR at 0.05 significance level.

So we conclude that the assumption that missing values has a completely random pattern is valid.

2.1.2 Multivariate Imputation via Chained Equation

Impute the missing data for further analysis.

suppressMessages(suppressWarnings(library(mice)))

impData_0 <- wine_train
init <- mice(impData_0[,-1], maxit=0) 
meth <- init$method
predM <- init$predictorMatrix

impData_0[,c('LabelAppeal','AcidIndex','STARS')] <- lapply(impData_0[,c('LabelAppeal','AcidIndex','STARS')], as.factor)

#skip variables from imputation but still be predictors
#meth[c("TARGET ","FixedAcidity","VolatileAcidity","CitricAcid","Density"," LabelAppeal","AcidIndex")] <- ""

# specify the methods for imputing the missing values
meth[c("ResidualSugar","Chlorides","FreeSulfurDioxide","TotalSulfurDioxide","pH",        "Sulphates","Alcohol")]<-"norm" 
meth[c("STARS")]<-"polyreg"

# impute 5 data sets for the missing values in the data without removal of some records
impData <- mice(impData_0[,-1], m=5, method=meth, predictorMatrix=predM,printFlag=FALSE, maxit=50,seed=500)

#compare the distributions of original and imputed data
densityplot(impData)

# get back the completed dataset where the missing values have been replaced with the imputed values 
wine_impute <- complete(impData)


#write.csv(wine_impute,'wine_impute.csv')

#wine_impute <- read.csv('D:/Rstudio_projects/DATA621/wine_impute.csv')
#wine_impute <- wine_impute[,-1]

suppressMessages(suppressWarnings(library(DataExplorer)))
miss_plot <- plot_missing(wine_impute)

miss_plot
##               feature num_missing pct_missing group
## 1              TARGET           0           0  Good
## 2        FixedAcidity           0           0  Good
## 3     VolatileAcidity           0           0  Good
## 4          CitricAcid           0           0  Good
## 5       ResidualSugar           0           0  Good
## 6           Chlorides           0           0  Good
## 7   FreeSulfurDioxide           0           0  Good
## 8  TotalSulfurDioxide           0           0  Good
## 9             Density           0           0  Good
## 10                 pH           0           0  Good
## 11          Sulphates           0           0  Good
## 12            Alcohol           0           0  Good
## 13        LabelAppeal           0           0  Good
## 14          AcidIndex           0           0  Good
## 15              STARS           0           0  Good

2.2 Collinearity

# Plot a correlation graph
suppressMessages(suppressWarnings(library(corrplot)))

wine_impute[,c('LabelAppeal','AcidIndex','STARS')] <- lapply(wine_impute[,c('LabelAppeal','AcidIndex','STARS')], as.numeric)

# calculate Pearson correlation between predictors.
traincor <- cor(wine_impute,use = "na.or.complete")

cex.before <- par("cex")
par(cex = 0.6)
corrplot(traincor, method = "number",number.cex = .97, cl.cex = 1/par("cex"))

par(cex = cex.before)
#corr

VIF test

library(usdm)
## Loading required package: sp
## Loading required package: raster
## 
## Attaching package: 'raster'
## The following object is masked from 'package:pastecs':
## 
##     extract
## The following object is masked from 'package:plotly':
## 
##     select
## The following objects are masked from 'package:MASS':
## 
##     area, select
## The following objects are masked from 'package:Hmisc':
## 
##     mask, zoom
## The following object is masked from 'package:dplyr':
## 
##     select
## The following object is masked from 'package:formattable':
## 
##     area
## 
## Attaching package: 'usdm'
## The following object is masked from 'package:faraway':
## 
##     vif
## The following object is masked from 'package:rms':
## 
##     vif
## The following object is masked from 'package:car':
## 
##     vif
vif(wine_impute[,-c(1,2)])
##             Variables  VIF
## 1     VolatileAcidity 1.01
## 2          CitricAcid 1.01
## 3       ResidualSugar 1.00
## 4           Chlorides 1.01
## 5   FreeSulfurDioxide 1.01
## 6  TotalSulfurDioxide 1.00
## 7             Density 1.00
## 8                  pH 1.01
## 9           Sulphates 1.00
## 10            Alcohol 1.01
## 11        LabelAppeal 1.12
## 12          AcidIndex 1.04
## 13              STARS 1.14
Cor <- cor(wine_impute)
suppressMessages(suppressWarnings(library(caret)))
(highCor <- findCorrelation(Cor, cutoff = 0.75))
## integer(0)

Using findCorrelation function in caret package also suggests there is no collinearities needs to be watched and no variable needs to be eliminated.

We also treat’STARS’ differently to getn a different dataset.

Looking for Patterns in the ‘STARS’ NA Values:

Next, due to such a high NA percent (26.25%)

Note that there are no ZEROS in the STARS field, 1 is the min:

## [1] 1

Graphically, it seems that a blank stars field is analagous to a ZERO (see chart below).

wine_training_data$STARS[is.na(wine_training_data$STARS)] <- 0

#wine_training_data <- do_factors(wine_training_data)

m0 <- mean(wine_training_data$TARGET[wine_training_data$STARS == 0])
m1 <- mean(wine_training_data$TARGET[wine_training_data$STARS == 1])
m2 <- mean(wine_training_data$TARGET[wine_training_data$STARS == 2])
m3 <- mean(wine_training_data$TARGET[wine_training_data$STARS == 3])
m4 <- mean(wine_training_data$TARGET[wine_training_data$STARS == 4])

stars_summary_df <- data.frame(cbind(num_stars = c(0,1,2,3,4), mean_target = c(m0,m1,m2,m3,m4)))
stars_summary_df
##   num_stars mean_target
## 1         0        1.18
## 2         1        2.58
## 3         2        3.80
## 4         3        4.54
## 5         4        5.42
ggplot(stars_summary_df, aes(num_stars, mean_target)) + geom_point(shape=1, size=5) + geom_smooth(method=lm,se=FALSE)+theme(axis.text.x=element_text(size=rel(0.2)))

Though this calculation may be imperfect at the moment, we will show later that the calculation is to be discarded, and thus not worth figuring out a closer approximation for NA’s replacement in the STARS field.

After the fill, we have what looks like a Zero-Inflated model on our hands.

ggplot(wine_training_data, aes(TARGET, fill = STARS)) + geom_histogram(binwidth=1, bins = 8,  position="dodge",fill=I('grey'),col=I('black'))

3. BUILD MODELS (25 Points)

Specification: Using the training data set, build at least two different poisson regression models, at least two different negative binomial regression models, and at least two multiple linear regression models, using different variables (or the same variables with different transformations). Sometimes poisson and negative binomial regression models give the same results. If that is the case, comment on that. Consider changing the input variables if that occurs so that you get different models. Although not covered in class, you may also want to consider building zero-inflated poisson and negative binomial regression models. You may select the variables manually, use an approach such as Forward or Stepwise, use a different approach such as trees, or use a combination of techniques. Describe the techniques you used. If you manually selected a variable for inclusion into the model or exclusion into the model, indicate why this was done.

Discuss the coefficients in the models, do they make sense? In this case, about the only thing you can comment on is the number of stars and the wine label appeal. However, you might comment on the coefficient and magnitude of variables and how they are similar or different from model to model. For example, you might say “pH seems to have a major positive impact in my poisson regression model, but a negative effect in my multiple linear regression model”. Are you keeping the model even though it is counter intuitive? Why? The boss needs to know.

We will build 6 data sets from our training data: * TRAINING set where NAs were replaced by multivariate imputation. * TRAINING set where NAs were replaced by ZEROs. * TRAINING set where NAs were NOT replaced by ZEROs. * TEST set where NAs were replaced by multivariate imputation. * TEST set where NAs were replaced by ZEROs. * TEST set where NAs were NOT replaced by ZEROs.

library(grid)
library(vcd)
## 
## Attaching package: 'vcd'
## The following object is masked from 'package:raster':
## 
##     mosaic
 fit <- goodfit(wine_impute$TARGET)
summary(fit) 
## 
##   Goodness-of-fit test for poisson distribution
## 
##                   X^2 df P(> X^2)
## Likelihood Ratio 8648  7        0
fit <- goodfit(wine_impute$TARGET,type = "binomial")
## Warning in goodfit(wine_impute$TARGET, type = "binomial"): size was not
## given, taken as maximum count
summary(fit) 
## 
##   Goodness-of-fit test for binomial distribution
## 
##                    X^2 df P(> X^2)
## Likelihood Ratio 11450  7        0
fit <- goodfit(wine_impute$TARGET,type = "nbinomial")
summary(fit) 
## 
##   Goodness-of-fit test for nbinomial distribution
## 
##                   X^2 df P(> X^2)
## Likelihood Ratio 8174  6        0

3.1 Poisson GLM:“NAs replaced by imputation” and Zero-inflated Poinsson Model

In part 2 function, missing values were filled by multivariate impuation via chained equation. Now we will these dataset to calculate the coefficients.

Before we build models, we split the data into train and test.

require(caTools) 
## Loading required package: caTools
set.seed(123)  
sample <- sample.split(wine_impute,SplitRatio = 0.75) 
train <- subset(wine_impute,sample ==TRUE)
test <- subset(wine_impute, sample==FALSE)

3.1.1 Build the model

glm_full <- glm(TARGET ~  ., family = poisson, data = train)
glm_null <- glm(TARGET ~  1, family = poisson, data = train)
fit.poisson.imp <-step(glm_full,direction='backward',family = poisson, data = train, trace = FALSE)
  
summary(fit.poisson.imp)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##     AcidIndex + STARS, family = poisson, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.070  -0.684   0.124   0.630   2.672  
## 
## Coefficients:
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         0.4718976  0.0484096    9.75 < 0.0000000000000002 ***
## VolatileAcidity    -0.0397135  0.0075824   -5.24           0.00000016 ***
## CitricAcid          0.0146861  0.0069195    2.12              0.03380 *  
## Chlorides          -0.0540308  0.0186412   -2.90              0.00375 ** 
## FreeSulfurDioxide   0.0001564  0.0000398    3.92           0.00008696 ***
## TotalSulfurDioxide  0.0000962  0.0000258    3.73              0.00019 ***
## pH                 -0.0164628  0.0087435   -1.88              0.05972 .  
## Alcohol             0.0023545  0.0016139    1.46              0.14460    
## LabelAppeal         0.1475757  0.0070227   21.01 < 0.0000000000000002 ***
## AcidIndex          -0.1023197  0.0052214  -19.60 < 0.0000000000000002 ***
## STARS               0.3331119  0.0065148   51.13 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16775  on 9382  degrees of freedom
## Residual deviance: 11796  on 9372  degrees of freedom
## AIC: 35282
## 
## Number of Fisher Scoring iterations: 5

3.1.2 Model diagnostics and adjustment

#good-of-fitness test
p_poisson <- 1-pchisq(summary(fit.poisson.imp)$deviance, summary(fit.poisson.imp)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_1 <- deviance(fit.poisson.imp)/fit.poisson.imp$df.residual
dispersiontest(fit.poisson.imp)   # library(AER)
## 
##  Overdispersion test
## 
## data:  fit.poisson.imp
## z = -8, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##        0.9
#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.poisson.imp_zeroinf <- zeroinfl(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train, dist="poisson")   
#library(pscl)
names(vuong(fit.poisson.imp, fit.poisson.imp_zeroinf))
## Vuong Non-Nested Hypothesis Test-Statistic: 
## (test-statistic is asymptotically distributed N(0,1) under the
##  null that the models are indistinguishible)
## -------------------------------------------------------------
##               Vuong z-statistic             H_A             p-value
## Raw                       -37.9 model2 > model1 <0.0000000000000002
## AIC-corrected             -37.6 model2 > model1 <0.0000000000000002
## BIC-corrected             -36.4 model2 > model1 <0.0000000000000002
## NULL
summary(fit.poisson.imp_zeroinf)
## 
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = train, dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3435 -0.4623  0.0312  0.4408  6.5944 
## 
## Count model coefficients (poisson with log link):
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         0.6202519  0.2347889    2.64              0.00825 ** 
## FixedAcidity        0.0003694  0.0009894    0.37              0.70886    
## VolatileAcidity    -0.0117353  0.0078453   -1.50              0.13470    
## CitricAcid          0.0011571  0.0071043    0.16              0.87062    
## ResidualSugar      -0.0001388  0.0001806   -0.77              0.44220    
## Chlorides          -0.0283245  0.0193895   -1.46              0.14407    
## FreeSulfurDioxide   0.0000421  0.0000405    1.04              0.29796    
## TotalSulfurDioxide -0.0000194  0.0000260   -0.74              0.45636    
## Density            -0.2870284  0.2320234   -1.24              0.21606    
## pH                  0.0068736  0.0090770    0.76              0.44890    
## Sulphates          -0.0017970  0.0065813   -0.27              0.78481    
## Alcohol             0.0073102  0.0016554    4.42              0.00001 ***
## LabelAppeal         0.2373423  0.0073549   32.27 < 0.0000000000000002 ***
## AcidIndex          -0.0197320  0.0057969   -3.40              0.00066 ***
## STARS               0.1164804  0.0072201   16.13 < 0.0000000000000002 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        -2.912617   1.403483   -2.08               0.0380 *  
## FixedAcidity        0.000459   0.005835    0.08               0.9373    
## VolatileAcidity     0.208031   0.045626    4.56           0.00000513 ***
## CitricAcid         -0.087904   0.042756   -2.06               0.0398 *  
## ResidualSugar      -0.000311   0.001075   -0.29               0.7722    
## Chlorides           0.251197   0.113250    2.22               0.0265 *  
## FreeSulfurDioxide  -0.001044   0.000252   -4.14           0.00003436 ***
## TotalSulfurDioxide -0.000839   0.000160   -5.24           0.00000016 ***
## Density             0.297184   1.382742    0.21               0.8298    
## pH                  0.175923   0.053860    3.27               0.0011 ** 
## Sulphates           0.080522   0.039312    2.05               0.0405 *  
## Alcohol             0.027322   0.009713    2.81               0.0049 ** 
## LabelAppeal         0.676920   0.046334   14.61 < 0.0000000000000002 ***
## AcidIndex           0.476572   0.027843   17.12 < 0.0000000000000002 ***
## STARS              -2.887229   0.104365  -27.66 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 38 
## Log-likelihood: -1.55e+04 on 30 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.imp))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.imp, n = 250)  #library(DHARMa)
plot(simulationOutput)

par(mfrow=c(3,5))
for(i in 2:15){
  plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4

The GOF test indicates that the Poisson model does not fit the data (p < 0.05).

# outlier test
outlierTest(fit.poisson.imp)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 5766    -3.07            0.00213           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.poisson.imp)

##       StudRes     Hat    CookD
## 1630    -1.37 0.01301 0.001927
## 5766    -3.07 0.00174 0.000746
## 11289    2.68 0.00285 0.003199

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 11289, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.poisson.imp_11289 <- update(fit.poisson.imp,subset=c(-11289))
compareCoefs(fit.poisson.imp, fit.poisson.imp_11289 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, family = poisson, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, family = poisson, data = train, subset = c(-11289))
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)         0.4718976  0.0484096  0.4718976  0.0484096
## VolatileAcidity    -0.0397135  0.0075824 -0.0397135  0.0075824
## CitricAcid          0.0146861  0.0069195  0.0146861  0.0069195
## Chlorides          -0.0540308  0.0186412 -0.0540308  0.0186412
## FreeSulfurDioxide   0.0001564  0.0000398  0.0001564  0.0000398
## TotalSulfurDioxide  0.0000962  0.0000258  0.0000962  0.0000258
## pH                 -0.0164628  0.0087435 -0.0164628  0.0087435
## Alcohol             0.0023545  0.0016139  0.0023545  0.0016139
## LabelAppeal         0.1475757  0.0070227  0.1475757  0.0070227
## AcidIndex          -0.1023197  0.0052214 -0.1023197  0.0052214
## STARS               0.3331119  0.0065148  0.3331119  0.0065148

From the output table, we can see that coefficients are changed minimally, and the observation 11289 is not influential.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.poisson.imp,layout = c(3, 4),ask=F)

##                    Test stat Pr(>|t|)
## VolatileAcidity        1.861    0.173
## CitricAcid             0.046    0.831
## Chlorides              2.590    0.108
## FreeSulfurDioxide      0.106    0.744
## TotalSulfurDioxide     0.679    0.410
## pH                     0.634    0.426
## Alcohol                1.988    0.159
## LabelAppeal           41.788    0.000
## AcidIndex             51.749    0.000
## STARS                456.088    0.000
glm_full_aj <- glm(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), family = poisson, data = train) 

fit.poisson.imp.aj <-step(glm_full_aj,direction='backward',family = poisson, data = train, trace = FALSE)

residualPlots(fit.poisson.imp.aj,layout = c(3, 4),ask=F)

##                      Test stat Pr(>|t|)
## VolatileAcidity          0.797    0.372
## CitricAcid               0.029    0.865
## Chlorides                2.552    0.110
## FreeSulfurDioxide        0.056    0.813
## TotalSulfurDioxide       1.181    0.277
## Sulphates                0.519    0.471
## Alcohol                  2.367    0.124
## LabelAppeal              0.061    0.805
## AcidIndex                0.000    1.000
## STARS                    0.000    1.000
## log(LabelAppeal + 3)     0.055    0.814
## I(AcidIndex^2)          25.542    0.000
## I(STARS^2)              75.041    0.000
summary(fit.poisson.imp.aj)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##     LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##     I(AcidIndex^2) + I(STARS^2), family = poisson, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.153  -0.622   0.072   0.581   3.552  
## 
## Coefficients:
##                        Estimate Std. Error z value             Pr(>|z|)
## (Intercept)          -3.2212634  0.5444877   -5.92       0.000000003296
## VolatileAcidity      -0.0388835  0.0075996   -5.12       0.000000311254
## CitricAcid            0.0113935  0.0069222    1.65               0.0998
## Chlorides            -0.0598136  0.0186464   -3.21               0.0013
## FreeSulfurDioxide     0.0001475  0.0000397    3.71               0.0002
## TotalSulfurDioxide    0.0000775  0.0000258    3.00               0.0027
## Sulphates            -0.0093901  0.0063568   -1.48               0.1396
## Alcohol               0.0041796  0.0016160    2.59               0.0097
## LabelAppeal          -0.1755220  0.0698530   -2.51               0.0120
## AcidIndex             0.0619526  0.0245671    2.52               0.0117
## STARS                 0.9757016  0.0321168   30.38 < 0.0000000000000002
## log(LabelAppeal + 3)  1.9955698  0.4205452    4.75       0.000002083038
## I(AcidIndex^2)       -0.0148401  0.0022803   -6.51       0.000000000076
## I(STARS^2)           -0.1383156  0.0067808  -20.40 < 0.0000000000000002
##                         
## (Intercept)          ***
## VolatileAcidity      ***
## CitricAcid           .  
## Chlorides            ** 
## FreeSulfurDioxide    ***
## TotalSulfurDioxide   ** 
## Sulphates               
## Alcohol              ** 
## LabelAppeal          *  
## AcidIndex            *  
## STARS                ***
## log(LabelAppeal + 3) ***
## I(AcidIndex^2)       ***
## I(STARS^2)           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 16775  on 9382  degrees of freedom
## Residual deviance: 11269  on 9369  degrees of freedom
## AIC: 34761
## 
## Number of Fisher Scoring iterations: 6

3.1.3 Model Interpretation

coef <- summary(fit.poisson.imp.aj)$coefficients[,1]
var.name <- variable.names(fit.poisson.imp.aj) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) -3.22 0.04
VolatileAcidity -0.04 0.96
CitricAcid 0.01 1.01
Chlorides -0.06 0.94
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
Sulphates -0.01 0.99
Alcohol 0.00 1.00
LabelAppeal -0.18 0.84
AcidIndex 0.06 1.06
STARS 0.98 2.65
log(LabelAppeal + 3) 2.00 7.36
I(AcidIndex^2) -0.01 0.99
I(STARS^2) -0.14 0.87

3.1.4 new Model diagnostics

#good-of-fitness test
p_poisson_aj <- 1-pchisq(summary(fit.poisson.imp.aj)$deviance, summary(fit.poisson.imp)$df.residual)
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_1 <- deviance(fit.poisson.imp.aj)/fit.poisson.imp.aj$df.residual
dispersiontest(fit.poisson.imp.aj)   # library(AER)
## 
##  Overdispersion test
## 
## data:  fit.poisson.imp.aj
## z = -8, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##      0.897
#c = 0 equidispersion, c > 0 is overdispersed

c =1.2 >0. The assumption of equidispersion is not rejected.

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.imp.aj, n = 250)  #library(DHARMa)
plot(simulationOutput)

#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.imp.aj))  #library(faraway)

# outlier test
outlierTest(fit.poisson.imp.aj)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##       rstudent unadjusted p-value Bonferonni p
## 11289     3.58           0.000346           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.poisson.imp.aj)

##       StudRes     Hat  CookD
## 2729     3.37 0.02327 0.0441
## 11289    3.58 0.00652 0.0137

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 2729, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.poisson.imp_2729 <- update(fit.poisson.imp.aj,subset=c(-2729))
compareCoefs(fit.poisson.imp.aj, fit.poisson.imp_2729 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##   I(AcidIndex^2) + I(STARS^2), family = poisson, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##   I(AcidIndex^2) + I(STARS^2), family = poisson, data = train, subset = 
##   c(-2729))
##                          Est. 1       SE 1     Est. 2       SE 2
## (Intercept)          -3.2212634  0.5444877 -3.2196985  0.5444750
## VolatileAcidity      -0.0388835  0.0075996 -0.0388835  0.0075995
## CitricAcid            0.0113935  0.0069222  0.0113969  0.0069221
## Chlorides            -0.0598136  0.0186464 -0.0597513  0.0186467
## FreeSulfurDioxide     0.0001475  0.0000397  0.0001475  0.0000397
## TotalSulfurDioxide    0.0000775  0.0000258  0.0000774  0.0000258
## Sulphates            -0.0093901  0.0063568 -0.0093861  0.0063567
## Alcohol               0.0041796  0.0016160  0.0041749  0.0016160
## LabelAppeal          -0.1755220  0.0698530 -0.1754023  0.0698510
## AcidIndex             0.0619526  0.0245671  0.0619432  0.0245672
## STARS                 0.9757016  0.0321168  0.9758040  0.0321173
## log(LabelAppeal + 3)  1.9955698  0.4205452  1.9944809  0.4205346
## I(AcidIndex^2)       -0.0148401  0.0022803 -0.0148403  0.0022804
## I(STARS^2)           -0.1383156  0.0067808 -0.1383278  0.0067808

From the output table, we can see that coefficients are changed minimally, and the observation 2729 is not influential.

We then use rootogram approach to test the assessment of fit as it is an improved test for a count regression model

# check the good of fitness of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.poisson.imp.aj)

The bar hanging from each point on the theoretical Poisson fit (red line) represents the difference between expected and observed counts. When a bar hanging below 0, it is underfitting. When a bar hanging above 0, it is overfitting. We can see that all variables have either underfitting or overfitting problem except the No.7 variable, “TotalSulfurDioxide”.

3.1.5 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_1 <- 1 - fit.poisson.imp.aj$deviance / fit.poisson.imp.aj$null.deviance)
## [1] 0.328
 #or
#(pR2 <- 1- logLik(logitfit.1)/logLik(logit.null))

# Log likelihood
(logLik_1<-logLik(fit.poisson.imp.aj))
## 'log Lik.' -17367 (df=14)
# AIC
AIC_1 <- fit.poisson.imp.aj$aic
BIC_1 <- BIC(fit.poisson.imp.aj)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_1<- cbind(test, 
      Mean = predict(fit.poisson.imp.aj, newdata=test, type="response"), 
      SE = predict(fit.poisson.imp.aj, newdata=test, type="response", se.fit=T)$se.fit
      )

evl_df_1 <- as.data.frame(aggregate(ndf_1[, 16], list(ndf_1$TARGET), mean))
evl_df_1 <- cbind(evl_df_1,aggregate(ndf_1[, 17], list(ndf_1$TARGET), mean)[,2])
colnames(evl_df_1) <- c('TARGET','Mean','SE')
evl_df_1
##   TARGET Mean     SE
## 1      0 1.93 0.0444
## 2      1 1.51 0.0439
## 3      2 2.07 0.0462
## 4      3 2.69 0.0545
## 5      4 3.46 0.0687
## 6      5 4.16 0.0873
## 7      6 4.93 0.1107
## 8      7 5.37 0.1334
## 9      8 5.24 0.1467

3.1.5 Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
zeroinf_null <- update(fit.poisson.imp_zeroinf, . ~ 1) 
(pR2_1z <- 1- logLik(fit.poisson.imp_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.14
#dispersion
c_1z <- deviance(fit.poisson.imp_zeroinf)/fit.poisson.imp_zeroinf$df.residual

# Log likelihood
(logLik_1z<-logLik(fit.poisson.imp_zeroinf))
## 'log Lik.' -15471 (df=30)
# AIC
AIC_1z <- AIC(fit.poisson.imp_zeroinf)
BIC_1z <- BIC(fit.poisson.imp_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_1z<- cbind(test, 
      Count = predict(fit.poisson.imp_zeroinf, newdata=test, type = "count"), 
      Zero = predict(fit.poisson.imp_zeroinf, newdata=test, type = "zero")
      )

evl_df_1z <- as.data.frame(aggregate(ndf_1z[, 16], list(ndf_1z$TARGET), mean))
evl_df_1z <- cbind(evl_df_1z,aggregate(ndf_1z[, 17], list(ndf_1z$TARGET), mean)[,2])
colnames(evl_df_1z) <- c('TARGET','Count','Zero')
evl_df_1z
##   TARGET Count    Zero
## 1      0  3.29 0.44449
## 2      1  2.28 0.18082
## 3      2  2.69 0.17539
## 4      3  3.16 0.16233
## 5      4  3.83 0.12009
## 6      5  4.54 0.08155
## 7      6  5.34 0.03505
## 8      7  6.09 0.02496
## 9      8  6.44 0.00638

3.2 Quasibinomial GLM:“NAs replaced by imputation” and

To deal with the overdispersion, we will fit the modle with “quasipoisson”.

3.2.1 Build the model

fit.qp.imp <- update(fit.poisson.imp.aj,family=quasipoisson)

summary(fit.qp.imp)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##     LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##     I(AcidIndex^2) + I(STARS^2), family = quasipoisson, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.153  -0.622   0.072   0.581   3.552  
## 
## Coefficients:
##                        Estimate Std. Error t value             Pr(>|t|)
## (Intercept)          -3.2212634  0.5146492   -6.26      0.0000000004041
## VolatileAcidity      -0.0388835  0.0071831   -5.41      0.0000000634560
## CitricAcid            0.0113935  0.0065429    1.74              0.08165
## Chlorides            -0.0598136  0.0176246   -3.39              0.00069
## FreeSulfurDioxide     0.0001475  0.0000375    3.93      0.0000861509755
## TotalSulfurDioxide    0.0000775  0.0000244    3.18              0.00148
## Sulphates            -0.0093901  0.0060084   -1.56              0.11813
## Alcohol               0.0041796  0.0015275    2.74              0.00622
## LabelAppeal          -0.1755220  0.0660250   -2.66              0.00786
## AcidIndex             0.0619526  0.0232208    2.67              0.00764
## STARS                 0.9757016  0.0303567   32.14 < 0.0000000000000002
## log(LabelAppeal + 3)  1.9955698  0.3974989    5.02      0.0000005253650
## I(AcidIndex^2)       -0.0148401  0.0021554   -6.89      0.0000000000061
## I(STARS^2)           -0.1383156  0.0064092  -21.58 < 0.0000000000000002
##                         
## (Intercept)          ***
## VolatileAcidity      ***
## CitricAcid           .  
## Chlorides            ***
## FreeSulfurDioxide    ***
## TotalSulfurDioxide   ** 
## Sulphates               
## Alcohol              ** 
## LabelAppeal          ** 
## AcidIndex            ** 
## STARS                ***
## log(LabelAppeal + 3) ***
## I(AcidIndex^2)       ***
## I(STARS^2)           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for quasipoisson family taken to be 0.893)
## 
##     Null deviance: 16775  on 9382  degrees of freedom
## Residual deviance: 11269  on 9369  degrees of freedom
## AIC: NA
## 
## Number of Fisher Scoring iterations: 6
residualPlots(fit.qp.imp,layout = c(3, 4),ask=F)

##                      Test stat Pr(>|t|)
## VolatileAcidity          0.797    0.372
## CitricAcid               0.029    0.865
## Chlorides                2.552    0.110
## FreeSulfurDioxide        0.056    0.813
## TotalSulfurDioxide       1.181    0.277
## Sulphates                0.519    0.471
## Alcohol                  2.367    0.124
## LabelAppeal              0.061    0.805
## AcidIndex                0.000    1.000
## STARS                    0.000    1.000
## log(LabelAppeal + 3)     0.055    0.814
## I(AcidIndex^2)          25.542    0.000
## I(STARS^2)              75.041    0.000

The dispersion parameter is estimated to be 0.89, the dispersion isless than one. The conditional variance is smaller than the conditional mean. So we have under-dispersion.

3.2.2 Model diagnostics and adjustment

#good-of-fitness test
p_qp <- 1-pchisq(summary(fit.qp.imp)$deviance, summary(fit.qp.imp)$df.residual)
cat("p-value:" ,p_qp,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_2 <- deviance(fit.qp.imp)/fit.qp.imp$df.residual
#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.qp.imp_zeroinf<-fit.poisson.imp_zeroinf
AIC(fit.qp.imp, fit.poisson.imp_zeroinf)
##                         df   AIC
## fit.qp.imp              14    NA
## fit.poisson.imp_zeroinf 30 31002
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.qp.imp))  #library(faraway)

par(mfrow=c(3,5))
for(i in 2:15){
  plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4

The GOF test indicates that the Poisson model does not fit the data (p < 0.05).

# outlier test
outlierTest(fit.qp.imp)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##       rstudent unadjusted p-value Bonferonni p
## 11289     3.26             0.0011           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.qp.imp)

##       StudRes     Hat  CookD
## 2729     3.08 0.02327 0.0493
## 11289    3.26 0.00652 0.0153

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 11289, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.qp.imp_11289 <- update(fit.qp.imp,subset=c(-11289))
compareCoefs(fit.qp.imp, fit.qp.imp_11289 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##   I(AcidIndex^2) + I(STARS^2), family = quasipoisson, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##   I(AcidIndex^2) + I(STARS^2), family = quasipoisson, data = train, 
##   subset = c(-11289))
##                          Est. 1       SE 1     Est. 2       SE 2
## (Intercept)          -3.2212634  0.5146492 -3.2212634  0.5146492
## VolatileAcidity      -0.0388835  0.0071831 -0.0388835  0.0071831
## CitricAcid            0.0113935  0.0065429  0.0113935  0.0065429
## Chlorides            -0.0598136  0.0176246 -0.0598136  0.0176246
## FreeSulfurDioxide     0.0001475  0.0000375  0.0001475  0.0000375
## TotalSulfurDioxide    0.0000775  0.0000244  0.0000775  0.0000244
## Sulphates            -0.0093901  0.0060084 -0.0093901  0.0060084
## Alcohol               0.0041796  0.0015275  0.0041796  0.0015275
## LabelAppeal          -0.1755220  0.0660250 -0.1755220  0.0660250
## AcidIndex             0.0619526  0.0232208  0.0619526  0.0232208
## STARS                 0.9757016  0.0303567  0.9757016  0.0303567
## log(LabelAppeal + 3)  1.9955698  0.3974989  1.9955698  0.3974989
## I(AcidIndex^2)       -0.0148401  0.0021554 -0.0148401  0.0021554
## I(STARS^2)           -0.1383156  0.0064092 -0.1383156  0.0064092

From the output table, we can see that coefficients are changed minimally, and the observation 11289 is not influential.

3.2.4 Interpretation

coef <- summary(fit.qp.imp)$coefficients[,1]
var.name <- variable.names(fit.qp.imp) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) -3.22 0.04
VolatileAcidity -0.04 0.96
CitricAcid 0.01 1.01
Chlorides -0.06 0.94
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
Sulphates -0.01 0.99
Alcohol 0.00 1.00
LabelAppeal -0.18 0.84
AcidIndex 0.06 1.06
STARS 0.98 2.65
log(LabelAppeal + 3) 2.00 7.36
I(AcidIndex^2) -0.01 0.99
I(STARS^2) -0.14 0.87

3.2.5 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_2 <- 1 - fit.qp.imp$deviance / fit.qp.imp$null.deviance)
## [1] 0.328
# Log likelihood
(logLik_2<-logLik(fit.qp.imp))
## 'log Lik.' NA (df=14)
# AIC
AIC_2 <- AIC(fit.qp.imp)
BIC_2 <- BIC(fit.qp.imp)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_2<- cbind(test, 
      Mean = predict(fit.qp.imp, newdata=test, type="response"), 
      SE = predict(fit.qp.imp, newdata=test, type="response", se.fit=T)$se.fit
      )

evl_df_2 <- as.data.frame(aggregate(ndf_2[, 16], list(ndf_2$TARGET), mean))
evl_df_2 <- cbind(evl_df_2,aggregate(ndf_2[, 17], list(ndf_2$TARGET), mean)[,2])
colnames(evl_df_2) <- c('TARGET','Mean','SE')
evl_df_2
##   TARGET Mean     SE
## 1      0 1.93 0.0419
## 2      1 1.51 0.0415
## 3      2 2.07 0.0436
## 4      3 2.69 0.0515
## 5      4 3.46 0.0649
## 6      5 4.16 0.0825
## 7      6 4.93 0.1046
## 8      7 5.37 0.1261
## 9      8 5.24 0.1387

3.2.6 Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
zeroinf_null <- update(fit.qp.imp_zeroinf, . ~ 1) 
(pR2_2z <- 1- logLik(fit.qp.imp_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.14
# Log likelihood
(logLik_2z<-logLik(fit.qp.imp_zeroinf))
## 'log Lik.' -15471 (df=30)
#dispersion
c_2z <- deviance(fit.qp.imp_zeroinf)/fit.qp.imp_zeroinf$df.residual

# AIC
AIC_2z <- AIC(fit.qp.imp_zeroinf)
BIC_2z <- BIC(fit.qp.imp_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_2z<- cbind(test, 
      Count = predict(fit.qp.imp_zeroinf, newdata=test, type = "count"), 
      Zero = predict(fit.qp.imp_zeroinf, newdata=test, type = "zero")
      )

evl_df_2z <- as.data.frame(aggregate(ndf_2z[, 16], list(ndf_2z$TARGET), mean))
evl_df_2z <- cbind(evl_df_2z,aggregate(ndf_2z[, 17], list(ndf_2z$TARGET), mean)[,2])
colnames(evl_df_2z) <- c('TARGET','Count','Zero')
evl_df_2z
##   TARGET Count    Zero
## 1      0  3.29 0.44449
## 2      1  2.28 0.18082
## 3      2  2.69 0.17539
## 4      3  3.16 0.16233
## 5      4  3.83 0.12009
## 6      5  4.54 0.08155
## 7      6  5.34 0.03505
## 8      7  6.09 0.02496
## 9      8  6.44 0.00638

3.3 Negative Binomial GLM:“NAs replaced by imputation” and Zero-inflated Negative Binomial Model

In part 2 function, missing values were filled by multivariate impuation via chained equation. Now we will use the completed dataset to calculate the coefficients.

3.3.1 Build the model

glm_full_nb <- glm.nb(TARGET ~  ., data = train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
glm_null_nb <- glm.nb(TARGET ~  1, data = train)

fit.nb.imp <- step(glm_full_nb,direction="backward",trace=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(fit.nb.imp)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##     AcidIndex + STARS, data = train, init.theta = 48389.28899, 
##     link = log)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.070  -0.684   0.124   0.630   2.672  
## 
## Coefficients:
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         0.4719007  0.0484114    9.75 < 0.0000000000000002 ***
## VolatileAcidity    -0.0397145  0.0075827   -5.24           0.00000016 ***
## CitricAcid          0.0146862  0.0069198    2.12              0.03381 *  
## Chlorides          -0.0540321  0.0186419   -2.90              0.00375 ** 
## FreeSulfurDioxide   0.0001564  0.0000398    3.92           0.00008699 ***
## TotalSulfurDioxide  0.0000962  0.0000258    3.73              0.00019 ***
## pH                 -0.0164634  0.0087439   -1.88              0.05972 .  
## Alcohol             0.0023545  0.0016140    1.46              0.14462    
## LabelAppeal         0.1475755  0.0070230   21.01 < 0.0000000000000002 ***
## AcidIndex          -0.1023215  0.0052216  -19.60 < 0.0000000000000002 ***
## STARS               0.3331160  0.0065151   51.13 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(48389) family taken to be 1)
## 
##     Null deviance: 16774  on 9382  degrees of freedom
## Residual deviance: 11795  on 9372  degrees of freedom
## AIC: 35284
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  48389 
##           Std. Err.:  65368 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -35260

3.3.2 Model diagnostics and adjustment

#five-percent critical value for a chi-squared with 909 d.f. is
qchisq(0.95, df.residual(fit.nb.imp))
## [1] 9598
#good-of-fitness test
p_nb <- 1-pchisq(summary(fit.nb.imp)$deviance, summary(fit.nb.imp)$df.residual)
cat("p-value:" ,p_nb,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_3 <- deviance(fit.nb.imp)/fit.nb.imp$df.residual
#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.nb.imp_zeroinf <- zeroinfl(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train, dist="negbin")    #library(pscl)
AIC(fit.nb.imp, fit.nb.imp_zeroinf)
##                    df   AIC
## fit.nb.imp         12 35284
## fit.nb.imp_zeroinf 31 31004
summary(fit.nb.imp_zeroinf)
## 
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = train, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3435 -0.4623  0.0312  0.4408  6.5944 
## 
## Count model coefficients (negbin with log link):
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         0.6202857  0.2347889    2.64              0.00824 ** 
## FixedAcidity        0.0003694  0.0009894    0.37              0.70886    
## VolatileAcidity    -0.0117354  0.0078453   -1.50              0.13469    
## CitricAcid          0.0011567  0.0071043    0.16              0.87067    
## ResidualSugar      -0.0001388  0.0001806   -0.77              0.44220    
## Chlorides          -0.0283250  0.0193895   -1.46              0.14406    
## FreeSulfurDioxide   0.0000421  0.0000405    1.04              0.29796    
## TotalSulfurDioxide -0.0000194  0.0000260   -0.74              0.45638    
## Density            -0.2870626  0.2320234   -1.24              0.21601    
## pH                  0.0068736  0.0090770    0.76              0.44890    
## Sulphates          -0.0017971  0.0065813   -0.27              0.78480    
## Alcohol             0.0073102  0.0016554    4.42             0.000010 ***
## LabelAppeal         0.2373423  0.0073549   32.27 < 0.0000000000000002 ***
## AcidIndex          -0.0197324  0.0057969   -3.40              0.00066 ***
## STARS               0.1164807  0.0072201   16.13 < 0.0000000000000002 ***
## Log(theta)         17.6062211  3.9656460    4.44             0.000009 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        -2.910885   1.403479   -2.07               0.0381 *  
## FixedAcidity        0.000459   0.005835    0.08               0.9372    
## VolatileAcidity     0.208030   0.045625    4.56           0.00000513 ***
## CitricAcid         -0.087906   0.042756   -2.06               0.0398 *  
## ResidualSugar      -0.000311   0.001075   -0.29               0.7723    
## Chlorides           0.251195   0.113249    2.22               0.0266 *  
## FreeSulfurDioxide  -0.001044   0.000252   -4.14           0.00003437 ***
## TotalSulfurDioxide -0.000839   0.000160   -5.24           0.00000016 ***
## Density             0.295451   1.382743    0.21               0.8308    
## pH                  0.175922   0.053860    3.27               0.0011 ** 
## Sulphates           0.080520   0.039312    2.05               0.0405 *  
## Alcohol             0.027321   0.009713    2.81               0.0049 ** 
## LabelAppeal         0.676919   0.046334   14.61 < 0.0000000000000002 ***
## AcidIndex           0.476572   0.027843   17.12 < 0.0000000000000002 ***
## STARS              -2.887228   0.104365  -27.66 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 44287855.659 
## Number of iterations in BFGS optimization: 60 
## Log-likelihood: -1.55e+04 on 31 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.imp))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.nb.imp, n = 250)  #library(DHARMa)
plot(simulationOutput)

par(mfrow=c(3,5))
for(i in 2:15){
  plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4

The GOF test indicates that the Negative Binomial model does not fit the data (p < 0.05).

# outlier test
outlierTest(fit.nb.imp)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 5766    -2.74            0.00618           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.nb.imp)

##       StudRes     Hat    CookD
## 1630    -1.22 0.01301 0.001926
## 5766    -2.74 0.00174 0.000746
## 11289    2.39 0.00285 0.003199

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 11289, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.nb.imp_11289 <- update(fit.nb.imp,subset=c(-11289))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.imp, fit.nb.imp_11289 )
## 
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides 
##   + FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal 
##   + AcidIndex + STARS, data = train, init.theta = 48389.28899, link = 
##   log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides 
##   + FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal 
##   + AcidIndex + STARS, data = train, subset = c(-11289), init.theta = 
##   48389.28747, link = log)
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)         0.4719007  0.0484114  0.4719007  0.0484114
## VolatileAcidity    -0.0397145  0.0075827 -0.0397145  0.0075827
## CitricAcid          0.0146862  0.0069198  0.0146862  0.0069198
## Chlorides          -0.0540321  0.0186419 -0.0540321  0.0186419
## FreeSulfurDioxide   0.0001564  0.0000398  0.0001564  0.0000398
## TotalSulfurDioxide  0.0000962  0.0000258  0.0000962  0.0000258
## pH                 -0.0164634  0.0087439 -0.0164634  0.0087439
## Alcohol             0.0023545  0.0016140  0.0023545  0.0016140
## LabelAppeal         0.1475755  0.0070230  0.1475755  0.0070230
## AcidIndex          -0.1023215  0.0052216 -0.1023215  0.0052216
## STARS               0.3331160  0.0065151  0.3331160  0.0065151

From the output table, we can see that coefficients are changed minimally, and the observation 11289 is not influential.

Then adjust the model by adding high order terms or log form for “LabelAppeal”, “AcidIndex” and “STARS” to see whether we can improve the fit.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.nb.imp,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

##                    Test stat Pr(>|t|)
## VolatileAcidity        1.861    0.173
## CitricAcid             0.046    0.831
## Chlorides              2.590    0.108
## FreeSulfurDioxide      0.106    0.744
## TotalSulfurDioxide     0.679    0.410
## pH                     0.634    0.426
## Alcohol                1.988    0.159
## LabelAppeal           41.787    0.000
## AcidIndex             51.738    0.000
## STARS                456.048    0.000
glm_nb_full_aj <- glm.nb(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = train) 
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
fit.nb.imp.aj <-step(glm_nb_full_aj,direction='backward',data = train, trace = FALSE)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
residualPlots(fit.nb.imp.aj,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

##                      Test stat Pr(>|t|)
## VolatileAcidity          0.797    0.372
## CitricAcid               0.029    0.865
## Chlorides                2.552    0.110
## FreeSulfurDioxide        0.056    0.813
## TotalSulfurDioxide       1.181    0.277
## Sulphates                0.519    0.471
## Alcohol                  2.367    0.124
## LabelAppeal              0.061    0.805
## AcidIndex                0.000    0.999
## STARS                    0.000    0.999
## log(LabelAppeal + 3)     0.056    0.814
## I(AcidIndex^2)          25.544    0.000
## I(STARS^2)              75.031    0.000
summary(fit.nb.imp.aj)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##     LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##     I(AcidIndex^2) + I(STARS^2), data = train, init.theta = 45840.1185, 
##     link = log)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.153  -0.622   0.072   0.581   3.551  
## 
## Coefficients:
##                        Estimate Std. Error z value             Pr(>|z|)
## (Intercept)          -3.2212853  0.5445059   -5.92       0.000000003299
## VolatileAcidity      -0.0388844  0.0075999   -5.12       0.000000311381
## CitricAcid            0.0113937  0.0069225    1.65               0.0998
## Chlorides            -0.0598151  0.0186472   -3.21               0.0013
## FreeSulfurDioxide     0.0001475  0.0000397    3.71               0.0002
## TotalSulfurDioxide    0.0000775  0.0000258    3.00               0.0027
## Sulphates            -0.0093906  0.0063570   -1.48               0.1396
## Alcohol               0.0041795  0.0016161    2.59               0.0097
## LabelAppeal          -0.1755275  0.0698556   -2.51               0.0120
## AcidIndex             0.0619495  0.0245679    2.52               0.0117
## STARS                 0.9757061  0.0321180   30.38 < 0.0000000000000002
## log(LabelAppeal + 3)  1.9955949  0.4205597    4.75       0.000002084102
## I(AcidIndex^2)       -0.0148400  0.0022804   -6.51       0.000000000076
## I(STARS^2)           -0.1383163  0.0067811  -20.40 < 0.0000000000000002
##                         
## (Intercept)          ***
## VolatileAcidity      ***
## CitricAcid           .  
## Chlorides            ** 
## FreeSulfurDioxide    ***
## TotalSulfurDioxide   ** 
## Sulphates               
## Alcohol              ** 
## LabelAppeal          *  
## AcidIndex            *  
## STARS                ***
## log(LabelAppeal + 3) ***
## I(AcidIndex^2)       ***
## I(STARS^2)           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(45840) family taken to be 1)
## 
##     Null deviance: 16774  on 9382  degrees of freedom
## Residual deviance: 11269  on 9369  degrees of freedom
## AIC: 34764
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  45840 
##           Std. Err.:  54227 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -34734

3.3.3 new Model diagnostics

#good-of-fitness test
(p_nb <- 1-pchisq(summary(fit.nb.imp)$deviance, summary(fit.nb.imp)$df.residual))
## [1] 0
(p_nb_aj <- 1-pchisq(summary(fit.nb.imp.aj)$deviance, summary(fit.nb.imp.aj)$df.residual))
## [1] 0
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
deviance(fit.nb.imp.aj)/fit.nb.imp.aj$df.residual
## [1] 1.2
#c = 0 equidispersion, c > 0 is overdispersed

c =1.2 >0. The assumption of equidispersion is not rejected.

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.nb.imp.aj, n = 250)  #library(DHARMa)
plot(simulationOutput)

#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.imp.aj))  #library(faraway)

# outlier test
outlierTest(fit.nb.imp.aj)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##       rstudent unadjusted p-value Bonferonni p
## 11289     3.26             0.0011           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.nb.imp.aj)

##       StudRes     Hat  CookD
## 2729     3.08 0.02327 0.0441
## 11289    3.26 0.00652 0.0137

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 2729, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.nb.imp_11289 <- update(fit.nb.imp.aj,subset=c(-11289))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.imp.aj, fit.nb.imp_11289 )
## 
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides 
##   + FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##   I(AcidIndex^2) + I(STARS^2), data = train, init.theta = 45840.1185, 
##   link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides 
##   + FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) + 
##   I(AcidIndex^2) + I(STARS^2), data = train, subset = c(-11289), 
##   init.theta = 45840.15613, link = log)
##                          Est. 1       SE 1     Est. 2       SE 2
## (Intercept)          -3.2212853  0.5445059 -3.2212853  0.5445059
## VolatileAcidity      -0.0388844  0.0075999 -0.0388844  0.0075999
## CitricAcid            0.0113937  0.0069225  0.0113937  0.0069225
## Chlorides            -0.0598151  0.0186472 -0.0598151  0.0186472
## FreeSulfurDioxide     0.0001475  0.0000397  0.0001475  0.0000397
## TotalSulfurDioxide    0.0000775  0.0000258  0.0000775  0.0000258
## Sulphates            -0.0093906  0.0063570 -0.0093906  0.0063570
## Alcohol               0.0041795  0.0016161  0.0041795  0.0016161
## LabelAppeal          -0.1755275  0.0698556 -0.1755275  0.0698556
## AcidIndex             0.0619495  0.0245679  0.0619495  0.0245679
## STARS                 0.9757061  0.0321180  0.9757061  0.0321180
## log(LabelAppeal + 3)  1.9955949  0.4205597  1.9955949  0.4205597
## I(AcidIndex^2)       -0.0148400  0.0022804 -0.0148400  0.0022804
## I(STARS^2)           -0.1383163  0.0067811 -0.1383163  0.0067811

From the output table, we can see that coefficients are changed minimally, and the observation 11289 is not influential.

We then use rootogram approach to test the assessment of fit as it is an improved test for a count regression model

# check the good of fitness of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.nb.imp.aj)

The bar hanging from each point on the theoretical Poisson fit (red line) represents the difference between expected and observed counts. When a bar hanging below 0, it is underfitting. When a bar hanging above 0, it is overfitting. We can see that all variables have either underfitting or overfitting problem except the No.7 variable, “TotalSulfurDioxide”.

3.3.4 Model Interpretation

coef <- summary(fit.nb.imp.aj)$coefficients[,1]
var.name <- variable.names(fit.nb.imp.aj) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) -3.22 0.04
VolatileAcidity -0.04 0.96
CitricAcid 0.01 1.01
Chlorides -0.06 0.94
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
Sulphates -0.01 0.99
Alcohol 0.00 1.00
LabelAppeal -0.18 0.84
AcidIndex 0.06 1.06
STARS 0.98 2.65
log(LabelAppeal + 3) 2.00 7.36
I(AcidIndex^2) -0.01 0.99
I(STARS^2) -0.14 0.87

3.3.5 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_3 <- 1 - fit.nb.imp.aj$deviance / fit.nb.imp.aj$null.deviance)
## [1] 0.328
#or
#(pR2 <- 1- logLik(logitfit.1)/logLik(logit.null))

# Log likelihood
(logLik_3<-logLik(fit.nb.imp.aj))
## 'log Lik.' -17367 (df=15)
# AIC
AIC_3 <- fit.nb.imp.aj$aic
BIC_3 <- BIC(fit.nb.imp.aj)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_3<- cbind(test, 
      Mean = predict(fit.nb.imp.aj, newdata=test, type="response"), 
      SE = predict(fit.nb.imp.aj, newdata=test, type="response", se.fit=T)$se.fit
      )

evl_df_3 <- as.data.frame(aggregate(ndf_3[, 16], list(ndf_3$TARGET), mean))
evl_df_3 <- cbind(evl_df_3,aggregate(ndf_3[, 17], list(ndf_3$TARGET), mean)[,2])
colnames(evl_df_3) <- c('TARGET','Mean','SE')
evl_df_3
##   TARGET Mean     SE
## 1      0 1.93 0.0444
## 2      1 1.51 0.0439
## 3      2 2.07 0.0462
## 4      3 2.69 0.0545
## 5      4 3.46 0.0687
## 6      5 4.16 0.0873
## 7      6 4.93 0.1107
## 8      7 5.37 0.1334
## 9      8 5.24 0.1467

3.3.5 Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
zeroinf_null <-zeroinfl(TARGET ~ 1,train,dist="poisson")
(pR2_3z <- 1- logLik(fit.nb.imp_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.14
# Log likelihood
(logLik_3z<-logLik(fit.nb.imp_zeroinf))
## 'log Lik.' -15471 (df=31)
#dispersion
c_3z <- deviance(fit.nb.imp_zeroinf)/fit.nb.imp_zeroinf$df.residual

# AIC
AIC_3z <- AIC(fit.nb.imp_zeroinf)
BIC_3z <- BIC(fit.nb.imp_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_3z<- cbind(test, 
      Count = predict(fit.nb.imp_zeroinf, newdata=test, type = "count"), 
      Zero = predict(fit.nb.imp_zeroinf, newdata=test, type = "zero")
      )

evl_df_3z <- as.data.frame(aggregate(ndf_3z[, 16], list(ndf_3z$TARGET), mean))
evl_df_3z <- cbind(evl_df_3z,aggregate(ndf_3z[, 17], list(ndf_3z$TARGET), mean)[,2])
colnames(evl_df_3z) <- c('TARGET','Count','Zero')
evl_df_3z
##   TARGET Count    Zero
## 1      0  3.29 0.44449
## 2      1  2.28 0.18082
## 3      2  2.69 0.17539
## 4      3  3.16 0.16233
## 5      4  3.83 0.12009
## 6      5  4.54 0.08155
## 7      6  5.34 0.03505
## 8      7  6.09 0.02496
## 9      8  6.44 0.00638

3.4 Multiple Linear Regression Model:“NAs replaced by imputation”

In part 2 function, missing values were filled by multivariate impuation via chained equation. Now we will use the completed dataset to calculate the coefficients.

3.4.1 Build the model

glm_full_lm <- glm(TARGET ~ .,data=train, family = gaussian)
glm_null_lm <- glm(TARGET ~ 1,data=train, family = gaussian)
fit.lm.imp <- step(glm_full_lm,direction="backward",trace=F)
summary(fit.lm.imp)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##     AcidIndex + STARS, family = gaussian, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.807  -1.005   0.174   1.032   4.474  
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         0.7999365  0.1161236    6.89       0.000000000006 ***
## VolatileAcidity    -0.1215584  0.0187777   -6.47       0.000000000101 ***
## CitricAcid          0.0392234  0.0172436    2.27              0.02295 *  
## Chlorides          -0.1744872  0.0460703   -3.79              0.00015 ***
## FreeSulfurDioxide   0.0004523  0.0000990    4.57       0.000004942649 ***
## TotalSulfurDioxide  0.0002618  0.0000638    4.10       0.000041100043 ***
## pH                 -0.0392113  0.0217106   -1.81              0.07094 .  
## Alcohol             0.0111803  0.0039823    2.81              0.00500 ** 
## LabelAppeal         0.4511456  0.0171990   26.23 < 0.0000000000000002 ***
## AcidIndex          -0.2610259  0.0114903  -22.72 < 0.0000000000000002 ***
## STARS               1.1383662  0.0174134   65.37 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 2.04)
## 
##     Null deviance: 34969  on 9382  degrees of freedom
## Residual deviance: 19089  on 9372  degrees of freedom
## AIC: 33316
## 
## Number of Fisher Scoring iterations: 2

3.4.2 Model diagnostics and adjustment

#good-of-fitness test
p_lm <-1-pchisq(summary(fit.lm.imp)$deviance, summary(fit.lm.imp)$df.residual)
cat("p-value:" ,p_lm,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_4 <- deviance(fit.lm.imp)/fit.lm.imp$df.residual
#c = 0 equidispersion, c > 0 is overdispersed

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.lm.imp, n = 250)  #library(DHARMa)
plot(simulationOutput)

#plot a half normal probability plot of residuals
halfnorm(residuals(fit.lm.imp))  #library(faraway)

par(mfrow=c(3,5))
for(i in 2:15){
  plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.lm.imp,layout = c(3, 4),ask=F)

##                    Test stat Pr(>|t|)
## VolatileAcidity        2.237    0.135
## CitricAcid             0.193    0.660
## Chlorides              5.716    0.017
## FreeSulfurDioxide      0.030    0.862
## TotalSulfurDioxide     2.131    0.144
## pH                     2.210    0.137
## Alcohol                7.377    0.007
## LabelAppeal            4.047    0.044
## AcidIndex             23.512    0.000
## STARS                403.613    0.000

The GOF test indicates that the Poisson model does not fit the data (p < 0.05).

# outlier test
outlierTest(fit.lm.imp)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 5766    -3.37           0.000747           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.lm.imp)

##       StudRes     Hat   CookD
## 5766    -3.37 0.00118 0.00122
## 8447    -1.79 0.00811 0.00238
## 11289    3.14 0.00455 0.00410

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 11289, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.lm.imp_11289 <- update(fit.lm.imp,subset=c(-11289))
compareCoefs(fit.lm.imp, fit.lm.imp_11289 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, family = gaussian, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, family = gaussian, data = train, subset = c(-11289))
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)         0.7999365  0.1161236  0.7999365  0.1161236
## VolatileAcidity    -0.1215584  0.0187777 -0.1215584  0.0187777
## CitricAcid          0.0392234  0.0172436  0.0392234  0.0172436
## Chlorides          -0.1744872  0.0460703 -0.1744872  0.0460703
## FreeSulfurDioxide   0.0004523  0.0000990  0.0004523  0.0000990
## TotalSulfurDioxide  0.0002618  0.0000638  0.0002618  0.0000638
## pH                 -0.0392113  0.0217106 -0.0392113  0.0217106
## Alcohol             0.0111803  0.0039823  0.0111803  0.0039823
## LabelAppeal         0.4511456  0.0171990  0.4511456  0.0171990
## AcidIndex          -0.2610259  0.0114903 -0.2610259  0.0114903
## STARS               1.1383662  0.0174134  1.1383662  0.0174134

From the output table, we can see that coefficients are changed minimally, and the observation 11289 is not influential.

Then adjust the model by adding high order terms or log form for “LabelAppeal”, “AcidIndex” and “STARS” to see whether we can improve the fit.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.lm.imp,layout = c(3, 4),ask=F)

##                    Test stat Pr(>|t|)
## VolatileAcidity        2.237    0.135
## CitricAcid             0.193    0.660
## Chlorides              5.716    0.017
## FreeSulfurDioxide      0.030    0.862
## TotalSulfurDioxide     2.131    0.144
## pH                     2.210    0.137
## Alcohol                7.377    0.007
## LabelAppeal            4.047    0.044
## AcidIndex             23.512    0.000
## STARS                403.613    0.000
glm_lm_full_aj <- glm(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = train,family=gaussian) 

fit.lm.imp.aj <-step(glm_lm_full_aj,direction='backward',data = train, trace = FALSE)

residualPlots(fit.lm.imp.aj,layout = c(3, 4),ask=F)

##                    Test stat Pr(>|t|)
## VolatileAcidity        1.727    0.189
## CitricAcid             0.121    0.728
## Chlorides              5.649    0.017
## FreeSulfurDioxide      0.143    0.705
## TotalSulfurDioxide     3.010    0.083
## pH                     1.897    0.168
## Sulphates              1.051    0.305
## Alcohol                7.764    0.005
## LabelAppeal            0.081    0.776
## AcidIndex              0.000    1.000
## STARS                  0.000    1.000
## I(AcidIndex^2)        47.095    0.000
## I(STARS^2)            76.025    0.000
summary(fit.lm.imp.aj)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##     FreeSulfurDioxide + TotalSulfurDioxide + pH + Sulphates + 
##     Alcohol + LabelAppeal + AcidIndex + STARS + I(AcidIndex^2) + 
##     I(STARS^2), family = gaussian, data = train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -4.881  -0.972   0.133   0.998   4.806  
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)        -0.6908343  0.1884270   -3.67              0.00025 ***
## VolatileAcidity    -0.1185089  0.0185706   -6.38        0.00000000018 ***
## CitricAcid          0.0346696  0.0170564    2.03              0.04212 *  
## Chlorides          -0.1886135  0.0455670   -4.14        0.00003514795 ***
## FreeSulfurDioxide   0.0004492  0.0000979    4.59        0.00000450212 ***
## TotalSulfurDioxide  0.0002454  0.0000631    3.89              0.00010 ***
## pH                 -0.0321789  0.0214819   -1.50              0.13418    
## Sulphates          -0.0248422  0.0156354   -1.59              0.11213    
## Alcohol             0.0136121  0.0039418    3.45              0.00056 ***
## LabelAppeal         0.4624279  0.0170248   27.16 < 0.0000000000000002 ***
## AcidIndex          -0.0997936  0.0483298   -2.06              0.03896 *  
## STARS               2.2516914  0.0801398   28.10 < 0.0000000000000002 ***
## I(AcidIndex^2)     -0.0132245  0.0040381   -3.27              0.00106 ** 
## I(STARS^2)         -0.2564967  0.0180314  -14.22 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for gaussian family taken to be 1.99)
## 
##     Null deviance: 34969  on 9382  degrees of freedom
## Residual deviance: 18659  on 9369  degrees of freedom
## AIC: 33108
## 
## Number of Fisher Scoring iterations: 2

3.4.3 Interpretation

coef <- summary(fit.lm.imp.aj)$coefficients[,1]
var.name <- variable.names(fit.lm.imp.aj) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) -0.69 0.50
VolatileAcidity -0.12 0.89
CitricAcid 0.03 1.04
Chlorides -0.19 0.83
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
pH -0.03 0.97
Sulphates -0.02 0.98
Alcohol 0.01 1.01
LabelAppeal 0.46 1.59
AcidIndex -0.10 0.91
STARS 2.25 9.50
I(AcidIndex^2) -0.01 0.99
I(STARS^2) -0.26 0.77

3.4.4 new Model diagnostics

#good-of-fitness test
(p_lm <- 1-pchisq(summary(fit.lm.imp)$deviance, summary(fit.lm.imp)$df.residual))
## [1] 0
(p_lm_aj <- 1-pchisq(summary(fit.lm.imp.aj)$deviance, summary(fit.lm.imp.aj)$df.residual))
## [1] 0
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_4 <-deviance(fit.lm.imp.aj)/fit.lm.imp.aj$df.residual
#c = 0 equidispersion, c > 0 is overdispersed

c =1.2 >0. The assumption of equidispersion is not rejected.

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.lm.imp.aj, n = 250)  #library(DHARMa)
plot(simulationOutput)

#plot a half normal probability plot of residuals
halfnorm(residuals(fit.lm.imp.aj))  #library(faraway)

# outlier test
outlierTest(fit.lm.imp.aj)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 5766    -3.46           0.000535           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.lm.imp.aj)

##      StudRes    Hat   CookD
## 2729    2.21 0.0399 0.01447
## 5766   -3.46 0.0014 0.00120
## 8447   -1.33 0.0414 0.00548

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 2729, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.lm.imp_11289 <- update(fit.lm.imp.aj,subset=c(-11289))
compareCoefs(fit.lm.imp.aj, fit.lm.imp_11289 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + pH + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + I(AcidIndex^2) + I(STARS^2), family =
##    gaussian, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides + 
##   FreeSulfurDioxide + TotalSulfurDioxide + pH + Sulphates + Alcohol + 
##   LabelAppeal + AcidIndex + STARS + I(AcidIndex^2) + I(STARS^2), family =
##    gaussian, data = train, subset = c(-11289))
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)        -0.6908343  0.1884270 -0.6908343  0.1884270
## VolatileAcidity    -0.1185089  0.0185706 -0.1185089  0.0185706
## CitricAcid          0.0346696  0.0170564  0.0346696  0.0170564
## Chlorides          -0.1886135  0.0455670 -0.1886135  0.0455670
## FreeSulfurDioxide   0.0004492  0.0000979  0.0004492  0.0000979
## TotalSulfurDioxide  0.0002454  0.0000631  0.0002454  0.0000631
## pH                 -0.0321789  0.0214819 -0.0321789  0.0214819
## Sulphates          -0.0248422  0.0156354 -0.0248422  0.0156354
## Alcohol             0.0136121  0.0039418  0.0136121  0.0039418
## LabelAppeal         0.4624279  0.0170248  0.4624279  0.0170248
## AcidIndex          -0.0997936  0.0483298 -0.0997936  0.0483298
## STARS               2.2516914  0.0801398  2.2516914  0.0801398
## I(AcidIndex^2)     -0.0132245  0.0040381 -0.0132245  0.0040381
## I(STARS^2)         -0.2564967  0.0180314 -0.2564967  0.0180314

From the output table, we can see that coefficients are changed minimally, and the observation 11289 is not influential.

We then use rootogram approach to test the assessment of fit as it is an improved test for a count regression model

# check the good of fitness of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.lm.imp.aj)

3.4.4 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_4 <- 1 - fit.lm.imp.aj$deviance / fit.lm.imp.aj$null.deviance)
## [1] 0.466
# Log likelihood
(logLik_4<-logLik(fit.lm.imp.aj))
## 'log Lik.' -16539 (df=15)
# AIC
AIC_4 <- fit.lm.imp.aj$aic
BIC_4 <- BIC(fit.lm.imp.aj)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_4<- cbind(test, 
      Mean = predict(fit.lm.imp.aj, newdata=test, type="response"), 
      SE = predict(fit.lm.imp.aj, newdata=test, type="response", se.fit=T)$se.fit
      )

evl_df_4 <- as.data.frame(aggregate(ndf_4[, 16], list(ndf_4$TARGET), mean))
evl_df_4 <- cbind(evl_df_4,aggregate(ndf_4[, 17], list(ndf_4$TARGET), mean)[,2])
colnames(evl_df_4) <- c('TARGET','Mean','SE')
evl_df_4
##   TARGET Mean     SE
## 1      0 1.91 0.0550
## 2      1 1.39 0.0545
## 3      2 2.07 0.0512
## 4      3 2.71 0.0497
## 5      4 3.48 0.0503
## 6      5 4.14 0.0547
## 7      6 4.80 0.0566
## 8      7 5.25 0.0637
## 9      8 5.31 0.0683

3.5 Multinomial Logistic Regression Model:“NAs replaced by imputation”

suppressMessages(suppressWarnings(library(mlogit)))

train_fct <- train
train_fct[,c('TARGET','LabelAppeal','STARS')] <- lapply(train_fct[,c('TARGET','LabelAppeal','STARS')] ,as.factor)
  
# Reshaping the data from wide to long format
#mydata$mode<-as.factor(mydata$mode)
mldata<-mlogit.data(train, varying=NULL, choice="TARGET", shape="wide")

# Multinomial logit model 
fit.ml.imp <- mlogit(TARGET ~ 1 | FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS,data=mldata, reflevel=levels(train_fct$TARGET)[1])
summary(fit.ml.imp)
## 
## Call:
## mlogit(formula = TARGET ~ 1 | FixedAcidity + VolatileAcidity + 
##     CitricAcid + ResidualSugar + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Density + pH + Sulphates + Alcohol + 
##     LabelAppeal + AcidIndex + STARS, data = mldata, reflevel = levels(train_fct$TARGET)[1], 
##     method = "nr", print.level = 0)
## 
## Frequencies of alternatives:
##       0       1       2       3       4       5       6       7       8 
## 0.21262 0.01982 0.08601 0.20015 0.24726 0.16093 0.06064 0.01130 0.00128 
## 
## nr method
## 12 iterations, 0h:0m:29s 
## g'(-H)^-1g = 0.000238 
## successive function values within tolerance limits 
## 
## Coefficients :
##                         Estimate  Std. Error t-value             Pr(>|t|)
## 1:(intercept)          4.7870282   3.3059821    1.45              0.14762
## 2:(intercept)          3.4055895   1.8283995    1.86              0.06252
## 3:(intercept)          3.1215290   1.3896201    2.25              0.02468
## 4:(intercept)         -0.4547947   1.3967838   -0.33              0.74473
## 5:(intercept)         -4.7076611   1.6809479   -2.80              0.00510
## 6:(intercept)         -9.8805514   2.3224478   -4.25        0.00002096387
## 7:(intercept)        -25.0778856   4.5129474   -5.56        0.00000002746
## 8:(intercept)        -72.1819360  17.1989239   -4.20        0.00002706092
## 1:FixedAcidity         0.0141287   0.0137268    1.03              0.30335
## 2:FixedAcidity         0.0075548   0.0076568    0.99              0.32380
## 3:FixedAcidity        -0.0043701   0.0058176   -0.75              0.45254
## 4:FixedAcidity         0.0027660   0.0058309    0.47              0.63524
## 5:FixedAcidity        -0.0028177   0.0070285   -0.40              0.68849
## 6:FixedAcidity         0.0113328   0.0099616    1.14              0.25527
## 7:FixedAcidity         0.0050450   0.0189260    0.27              0.78980
## 8:FixedAcidity         0.0143643   0.0551516    0.26              0.79452
## 1:VolatileAcidity      0.0276175   0.1066811    0.26              0.79573
## 2:VolatileAcidity     -0.1549679   0.0602681   -2.57              0.01013
## 3:VolatileAcidity     -0.2250579   0.0461910   -4.87        0.00000110289
## 4:VolatileAcidity     -0.2000726   0.0463893   -4.31        0.00001611218
## 5:VolatileAcidity     -0.2852203   0.0561188   -5.08        0.00000037263
## 6:VolatileAcidity     -0.3984951   0.0787249   -5.06        0.00000041517
## 7:VolatileAcidity     -0.4451098   0.1507908   -2.95              0.00316
## 8:VolatileAcidity     -0.0507834   0.4425565   -0.11              0.90864
## 1:CitricAcid          -0.0146047   0.0988840   -0.15              0.88258
## 2:CitricAcid           0.0974399   0.0558488    1.74              0.08104
## 3:CitricAcid           0.0502363   0.0426221    1.18              0.23854
## 4:CitricAcid           0.1072923   0.0428707    2.50              0.01233
## 5:CitricAcid           0.0169388   0.0512751    0.33              0.74113
## 6:CitricAcid           0.1527696   0.0706395    2.16              0.03057
## 7:CitricAcid           0.0540372   0.1371310    0.39              0.69354
## 8:CitricAcid           0.1346172   0.4588807    0.29              0.76925
## 1:ResidualSugar        0.0011370   0.0025845    0.44              0.65999
## 2:ResidualSugar        0.0001580   0.0014225    0.11              0.91153
## 3:ResidualSugar        0.0000431   0.0010792    0.04              0.96811
## 4:ResidualSugar       -0.0000473   0.0010836   -0.04              0.96521
## 5:ResidualSugar       -0.0009913   0.0012948   -0.77              0.44390
## 6:ResidualSugar       -0.0041261   0.0017897   -2.31              0.02114
## 7:ResidualSugar       -0.0015363   0.0035033   -0.44              0.66101
## 8:ResidualSugar       -0.0006326   0.0113460   -0.06              0.95553
## 1:Chlorides            0.3012341   0.2579027    1.17              0.24280
## 2:Chlorides           -0.1525528   0.1473225   -1.04              0.30043
## 3:Chlorides           -0.2380903   0.1124278   -2.12              0.03420
## 4:Chlorides           -0.3902899   0.1135848   -3.44              0.00059
## 5:Chlorides           -0.4408292   0.1374200   -3.21              0.00134
## 6:Chlorides           -0.7137845   0.1946205   -3.67              0.00024
## 7:Chlorides           -0.2136066   0.3667652   -0.58              0.56029
## 8:Chlorides           -1.2297747   1.0431889   -1.18              0.23845
## 1:FreeSulfurDioxide    0.0003433   0.0005732    0.60              0.54919
## 2:FreeSulfurDioxide    0.0005711   0.0003201    1.78              0.07438
## 3:FreeSulfurDioxide    0.0009415   0.0002467    3.82              0.00013
## 4:FreeSulfurDioxide    0.0009277   0.0002484    3.73              0.00019
## 5:FreeSulfurDioxide    0.0013386   0.0002971    4.51        0.00000663132
## 6:FreeSulfurDioxide    0.0013485   0.0004109    3.28              0.00103
## 7:FreeSulfurDioxide    0.0008096   0.0007822    1.04              0.30065
## 8:FreeSulfurDioxide    0.0040577   0.0020075    2.02              0.04325
## 1:TotalSulfurDioxide   0.0008029   0.0003749    2.14              0.03223
## 2:TotalSulfurDioxide   0.0011688   0.0002052    5.70        0.00000001227
## 3:TotalSulfurDioxide   0.0008090   0.0001571    5.15        0.00000026053
## 4:TotalSulfurDioxide   0.0006021   0.0001583    3.80              0.00014
## 5:TotalSulfurDioxide   0.0005527   0.0001905    2.90              0.00371
## 6:TotalSulfurDioxide   0.0003578   0.0002675    1.34              0.18094
## 7:TotalSulfurDioxide   0.0004065   0.0005247    0.77              0.43854
## 8:TotalSulfurDioxide   0.0016133   0.0015107    1.07              0.28554
## 1:Density              1.1729196   3.2580898    0.36              0.71885
## 2:Density              2.1358931   1.8001822    1.19              0.23543
## 3:Density             -0.7493329   1.3690747   -0.55              0.58415
## 4:Density             -1.1796625   1.3754673   -0.86              0.39109
## 5:Density             -2.7984471   1.6544651   -1.69              0.09075
## 6:Density             -4.4633322   2.2886590   -1.95              0.05115
## 7:Density             -0.8950092   4.3676843   -0.20              0.83764
## 8:Density             27.7019279  13.7868518    2.01              0.04451
## 1:pH                  -0.1110001   0.1259424   -0.88              0.37813
## 2:pH                  -0.2221153   0.0698344   -3.18              0.00147
## 3:pH                  -0.1427254   0.0530212   -2.69              0.00711
## 4:pH                  -0.1455688   0.0535361   -2.72              0.00655
## 5:pH                  -0.0853234   0.0648453   -1.32              0.18824
## 6:pH                  -0.1337259   0.0913237   -1.46              0.14311
## 7:pH                   0.5931078   0.1714457    3.46              0.00054
## 8:pH                  -0.0242286   0.6296618   -0.04              0.96931
## 1:Sulphates           -0.0469788   0.0900183   -0.52              0.60175
## 2:Sulphates           -0.0697013   0.0511690   -1.36              0.17314
## 3:Sulphates           -0.0836028   0.0390217   -2.14              0.03216
## 4:Sulphates           -0.1055276   0.0392502   -2.69              0.00718
## 5:Sulphates           -0.0580567   0.0471147   -1.23              0.21786
## 6:Sulphates           -0.0754956   0.0657450   -1.15              0.25084
## 7:Sulphates           -0.1033194   0.1237260   -0.84              0.40368
## 8:Sulphates            0.2172434   0.3170582    0.69              0.49323
## 1:Alcohol             -0.0755915   0.0228737   -3.30              0.00095
## 2:Alcohol             -0.0656105   0.0127449   -5.15        0.00000026329
## 3:Alcohol             -0.0361017   0.0097415   -3.71              0.00021
## 4:Alcohol             -0.0033549   0.0098544   -0.34              0.73352
## 5:Alcohol              0.0488693   0.0119669    4.08        0.00004432430
## 6:Alcohol              0.0861031   0.0169369    5.08        0.00000037002
## 7:Alcohol              0.1384143   0.0330889    4.18        0.00002875531
## 8:Alcohol              0.0362501   0.0899947    0.40              0.68709
## 1:LabelAppeal         -3.2147837   0.1489685  -21.58 < 0.0000000000000002
## 2:LabelAppeal         -2.1028377   0.0766622  -27.43 < 0.0000000000000002
## 3:LabelAppeal         -0.8732574   0.0512919  -17.03 < 0.0000000000000002
## 4:LabelAppeal          0.3002931   0.0503210    5.97        0.00000000241
## 5:LabelAppeal          1.3487300   0.0643861   20.95 < 0.0000000000000002
## 6:LabelAppeal          2.4205164   0.0958261   25.26 < 0.0000000000000002
## 7:LabelAppeal          3.4036200   0.1968572   17.29 < 0.0000000000000002
## 8:LabelAppeal          6.2300415   1.0978081    5.67        0.00000001387
## 1:AcidIndex           -0.2491778   0.0691287   -3.60              0.00031
## 2:AcidIndex           -0.3927096   0.0400295   -9.81 < 0.0000000000000002
## 3:AcidIndex           -0.3868888   0.0289201  -13.38 < 0.0000000000000002
## 4:AcidIndex           -0.5075611   0.0297350  -17.07 < 0.0000000000000002
## 5:AcidIndex           -0.5742915   0.0369153  -15.56 < 0.0000000000000002
## 6:AcidIndex           -0.6531131   0.0538578  -12.13 < 0.0000000000000002
## 7:AcidIndex           -0.6459091   0.1032328   -6.26        0.00000000039
## 8:AcidIndex           -0.7485220   0.2872183   -2.61              0.00916
## 1:STARS                0.7690696   0.1856058    4.14        0.00003419488
## 2:STARS                1.5022602   0.0927297   16.20 < 0.0000000000000002
## 3:STARS                2.0263099   0.0756487   26.79 < 0.0000000000000002
## 4:STARS                2.6079573   0.0753535   34.61 < 0.0000000000000002
## 5:STARS                3.2832005   0.0821925   39.95 < 0.0000000000000002
## 6:STARS                3.9495261   0.0987426   40.00 < 0.0000000000000002
## 7:STARS                4.8452709   0.1775129   27.30 < 0.0000000000000002
## 8:STARS                6.4768358   0.7639198    8.48 < 0.0000000000000002
##                         
## 1:(intercept)           
## 2:(intercept)        .  
## 3:(intercept)        *  
## 4:(intercept)           
## 5:(intercept)        ** 
## 6:(intercept)        ***
## 7:(intercept)        ***
## 8:(intercept)        ***
## 1:FixedAcidity          
## 2:FixedAcidity          
## 3:FixedAcidity          
## 4:FixedAcidity          
## 5:FixedAcidity          
## 6:FixedAcidity          
## 7:FixedAcidity          
## 8:FixedAcidity          
## 1:VolatileAcidity       
## 2:VolatileAcidity    *  
## 3:VolatileAcidity    ***
## 4:VolatileAcidity    ***
## 5:VolatileAcidity    ***
## 6:VolatileAcidity    ***
## 7:VolatileAcidity    ** 
## 8:VolatileAcidity       
## 1:CitricAcid            
## 2:CitricAcid         .  
## 3:CitricAcid            
## 4:CitricAcid         *  
## 5:CitricAcid            
## 6:CitricAcid         *  
## 7:CitricAcid            
## 8:CitricAcid            
## 1:ResidualSugar         
## 2:ResidualSugar         
## 3:ResidualSugar         
## 4:ResidualSugar         
## 5:ResidualSugar         
## 6:ResidualSugar      *  
## 7:ResidualSugar         
## 8:ResidualSugar         
## 1:Chlorides             
## 2:Chlorides             
## 3:Chlorides          *  
## 4:Chlorides          ***
## 5:Chlorides          ** 
## 6:Chlorides          ***
## 7:Chlorides             
## 8:Chlorides             
## 1:FreeSulfurDioxide     
## 2:FreeSulfurDioxide  .  
## 3:FreeSulfurDioxide  ***
## 4:FreeSulfurDioxide  ***
## 5:FreeSulfurDioxide  ***
## 6:FreeSulfurDioxide  ** 
## 7:FreeSulfurDioxide     
## 8:FreeSulfurDioxide  *  
## 1:TotalSulfurDioxide *  
## 2:TotalSulfurDioxide ***
## 3:TotalSulfurDioxide ***
## 4:TotalSulfurDioxide ***
## 5:TotalSulfurDioxide ** 
## 6:TotalSulfurDioxide    
## 7:TotalSulfurDioxide    
## 8:TotalSulfurDioxide    
## 1:Density               
## 2:Density               
## 3:Density               
## 4:Density               
## 5:Density            .  
## 6:Density            .  
## 7:Density               
## 8:Density            *  
## 1:pH                    
## 2:pH                 ** 
## 3:pH                 ** 
## 4:pH                 ** 
## 5:pH                    
## 6:pH                    
## 7:pH                 ***
## 8:pH                    
## 1:Sulphates             
## 2:Sulphates             
## 3:Sulphates          *  
## 4:Sulphates          ** 
## 5:Sulphates             
## 6:Sulphates             
## 7:Sulphates             
## 8:Sulphates             
## 1:Alcohol            ***
## 2:Alcohol            ***
## 3:Alcohol            ***
## 4:Alcohol               
## 5:Alcohol            ***
## 6:Alcohol            ***
## 7:Alcohol            ***
## 8:Alcohol               
## 1:LabelAppeal        ***
## 2:LabelAppeal        ***
## 3:LabelAppeal        ***
## 4:LabelAppeal        ***
## 5:LabelAppeal        ***
## 6:LabelAppeal        ***
## 7:LabelAppeal        ***
## 8:LabelAppeal        ***
## 1:AcidIndex          ***
## 2:AcidIndex          ***
## 3:AcidIndex          ***
## 4:AcidIndex          ***
## 5:AcidIndex          ***
## 6:AcidIndex          ***
## 7:AcidIndex          ***
## 8:AcidIndex          ** 
## 1:STARS              ***
## 2:STARS              ***
## 3:STARS              ***
## 4:STARS              ***
## 5:STARS              ***
## 6:STARS              ***
## 7:STARS              ***
## 8:STARS              ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Log-Likelihood: -12200
## McFadden R^2:  0.282 
## Likelihood ratio test : chisq = 9570 (p.value = <0.0000000000000002)
 # goodness of fit: pseudo R squared
# McFadden's R squared
ml_null <-mlogit(TARGET ~ 1,data=mldata, reflevel=levels(train_fct$TARGET)[1])
(pR2_5 <- 1- logLik(fit.ml.imp)[1]/logLik(ml_null)[1])
## [1] 0.282
# Log likelihood
(logLik_5<-logLik(fit.lm.imp.aj))
## 'log Lik.' -16539 (df=15)
# AIC
AIC_5 <- fit.lm.imp.aj$aic
BIC_5 <- BIC(fit.lm.imp.aj)

a<-as.data.frame(fitted(fit.ml.imp,type = "outcome",outcome = F) )
b<-fitted(fit.ml.imp,type = "outcome",outcome = T)
pred <- list()
for(i in 1:length(b)){
  pred <- append(pred,names(a[i,])[apply(a[i,], 1, function(x) which(x == b[i]))])
}

ndf_5<- cbind(train, unlist(pred),b)
colnames(ndf_5)[which(colnames(ndf_5)=='unlist(pred)')]<-'pred'
ndf_5$pred <- as.numeric(as.character(ndf_5$pred))

evl_df_5 <- as.data.frame(aggregate(ndf_5[, 16], list(ndf_5$TARGET), mean))
colnames(evl_df_5) <- c('TARGET','pred')
evl_df_5
##   TARGET pred
## 1      0    0
## 2      1    1
## 3      2    2
## 4      3    3
## 5      4    4
## 6      5    5
## 7      6    6
## 8      7    7
## 9      8    8

Simple Step Selection will be used for attribute selection, and we will build 3 models for both sets, plus a “zerinfl” negative binomial, yielding a total of 7 models:

wine_training_data.zeros <- wine_training_data
wine_training_data.nozeros <- wine_training_data[wine_training_data$STARS != 0,]

cutoff.zeros <- nrow(wine_training_data.zeros)*.75
wine_training_data.zeros.train <- wine_training_data.zeros[1:cutoff.zeros,]
wine_training_data.zeros.test <- wine_training_data.zeros[(cutoff.zeros+1):nrow(wine_training_data.zeros),]

cutoff.nozeros <- nrow(wine_training_data.nozeros)*.75
wine_training_data.nozeros.train <- wine_training_data.nozeros[1:cutoff.nozeros,]
wine_training_data.nozeros.test <- wine_training_data.nozeros[(cutoff.nozeros+1):nrow(wine_training_data.nozeros),]




#wine_training_data.zeros <- do_factors(wine_training_data.zeros)
#wine_training_data.nozeros <- do_factors(wine_training_data.nozeros)
#wine_training_data.zeros.train <- do_factors(wine_training_data.zeros.train)
#wine_training_data.zeros.test <- do_factors(wine_training_data.zeros.test)
#wine_training_data.nozeros.train <- do_factors(wine_training_data.nozeros.train)
#wine_training_data.nozeros.test <- do_factors(wine_training_data.nozeros.test)
  • A Poisson GLM with SCORE: “Zeros-for-NAs”
  • A Poisson GLM with SCORE: “NAs Removed”
  • A Negative Binomial GLM with SCORE: “Zeros-for-NAs”
  • A Negative Binomial GLM with SCORE: “NAs Removed”
  • A Multiple Linear Regression Model with SCORE: “Zeros-for-NAs”
  • A Multiple Linear Regression Model with SCORE: “NAs Removed”
  • A Negative Binomial Model via “zeroinfl()”: “Zeros-for-NAs”

Summaries for these Models are below:

(To show our point regarding coefficients, we will show summaries for the 2 Negative Binomial Distributions. Summaries of all 7 models, however, are located in the appendix).

3.6 Poisson GLM and Zero-inflated Poisson:“Zeros-for-NAs”

Poinsson glm and Zero-inflated Poisson regression will be fit using dataset in which NAs are filled by zero.

fit.poisson.zeros <- step(glm(TARGET ~ . , family = poisson, data = wine_training_data.zeros.train), trace = FALSE)
summary(fit.poisson.zeros)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##     AcidIndex + STARS, family = poisson, data = wine_training_data.zeros.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.987  -0.696   0.071   0.578   3.205  
## 
## Coefficients:
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         1.2255668  0.0556031   22.04 < 0.0000000000000002 ***
## VolatileAcidity    -0.0351524  0.0075417   -4.66            0.0000031 ***
## FreeSulfurDioxide   0.0001297  0.0000398    3.26               0.0011 ** 
## TotalSulfurDioxide  0.0000721  0.0000254    2.84               0.0046 ** 
## pH                 -0.0129234  0.0087058   -1.48               0.1377    
## Sulphates          -0.0118122  0.0062359   -1.89               0.0582 .  
## Alcohol             0.0026729  0.0015981    1.67               0.0944 .  
## LabelAppeal         0.1382714  0.0070001   19.75 < 0.0000000000000002 ***
## AcidIndex          -0.0854964  0.0051553  -16.58 < 0.0000000000000002 ***
## STARS               0.3079500  0.0052369   58.80 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 17042  on 9595  degrees of freedom
## Residual deviance: 11034  on 9586  degrees of freedom
## AIC: 35048
## 
## Number of Fisher Scoring iterations: 5

3.6.1 Model diagnostics and adjustment

#good-of-fitness test
p_poisson_zero <- 1-pchisq(summary(fit.poisson.zeros)$deviance, summary(fit.poisson.zeros)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_6<- deviance(fit.poisson.zeros)/fit.poisson.zeros$df.residual
dispersiontest(fit.poisson.zeros)   # library(AER)
## 
##  Overdispersion test
## 
## data:  fit.poisson.zeros
## z = -10, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##      0.865
#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.poisson.zeros_zeroinf <- zeroinfl(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.zeros.train, dist="poisson")    #library(pscl)
AIC(fit.poisson.zeros, fit.poisson.zeros_zeroinf)
##                           df   AIC
## fit.poisson.zeros         10 35048
## fit.poisson.zeros_zeroinf 30 30759
summary(fit.poisson.zeros_zeroinf)
## 
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = wine_training_data.zeros.train, dist = "poisson")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.12031 -0.40961 -0.00471  0.36947  5.90173 
## 
## Count model coefficients (poisson with log link):
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         1.4523000  0.2321671    6.26         0.0000000004 ***
## FixedAcidity        0.0004044  0.0009795    0.41                0.680    
## VolatileAcidity    -0.0125236  0.0077475   -1.62                0.106    
## CitricAcid         -0.0015902  0.0069195   -0.23                0.818    
## ResidualSugar      -0.0000348  0.0001792   -0.19                0.846    
## Chlorides          -0.0126578  0.0191607   -0.66                0.509    
## FreeSulfurDioxide   0.0000406  0.0000403    1.01                0.314    
## TotalSulfurDioxide -0.0000132  0.0000255   -0.52                0.604    
## Density            -0.3125233  0.2280031   -1.37                0.170    
## pH                  0.0090692  0.0089678    1.01                0.312    
## Sulphates          -0.0015214  0.0064036   -0.24                0.812    
## Alcohol             0.0068260  0.0016343    4.18         0.0000295754 ***
## LabelAppeal         0.2336540  0.0072663   32.16 < 0.0000000000000002 ***
## AcidIndex          -0.0173737  0.0056128   -3.10                0.002 ** 
## STARS               0.1005935  0.0059930   16.79 < 0.0000000000000002 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        -4.133307   1.513854   -2.73              0.00633 ** 
## FixedAcidity        0.002360   0.006395    0.37              0.71209    
## VolatileAcidity     0.183783   0.050341    3.65              0.00026 ***
## CitricAcid         -0.053114   0.045920   -1.16              0.24741    
## ResidualSugar      -0.000896   0.001163   -0.77              0.44091    
## Chlorides          -0.002500   0.123390   -0.02              0.98384    
## FreeSulfurDioxide  -0.000654   0.000271   -2.41              0.01590 *  
## TotalSulfurDioxide -0.000798   0.000172   -4.64            0.0000034 ***
## Density             0.117083   1.491072    0.08              0.93741    
## pH                  0.215505   0.058039    3.71              0.00020 ***
## Sulphates           0.120421   0.041803    2.88              0.00397 ** 
## Alcohol             0.028319   0.010613    2.67              0.00762 ** 
## LabelAppeal         0.673191   0.048865   13.78 < 0.0000000000000002 ***
## AcidIndex           0.439117   0.029592   14.84 < 0.0000000000000002 ***
## STARS              -2.340599   0.068029  -34.41 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 38 
## Log-likelihood: -1.53e+04 on 30 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.zeros))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.zeros, n = 250)  #library(DHARMa)
plot(simulationOutput)

The GOF test indicates that the Poisson model fit the data (p > 0.05).

# outlier test
outlierTest(fit.poisson.zeros)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 8887     3.21            0.00134           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.poisson.zeros)

##      StudRes     Hat   CookD
## 1630   -1.23 0.00927 0.00122
## 3953    2.60 0.00227 0.00245
## 8887    3.21 0.00059 0.00108

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 1630, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.poisson.zeros_1630 <- update(fit.poisson.zeros,subset=c(-1630))
compareCoefs(fit.poisson.zeros, fit.poisson.zeros_1630 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, family = poisson, data = 
##   wine_training_data.zeros.train)
## 2: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, family = poisson, data = 
##   wine_training_data.zeros.train, subset = c(-1630))
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)         1.2255668  0.0556031  1.2265651  0.0556119
## VolatileAcidity    -0.0351524  0.0075417 -0.0357254  0.0075578
## FreeSulfurDioxide   0.0001297  0.0000398  0.0001296  0.0000398
## TotalSulfurDioxide  0.0000721  0.0000254  0.0000712  0.0000254
## pH                 -0.0129234  0.0087058 -0.0128748  0.0087054
## Sulphates          -0.0118122  0.0062359 -0.0119223  0.0062362
## Alcohol             0.0026729  0.0015981  0.0027332  0.0015988
## LabelAppeal         0.1382714  0.0070001  0.1383440  0.0070001
## AcidIndex          -0.0854964  0.0051553 -0.0856982  0.0051590
## STARS               0.3079500  0.0052369  0.3080394  0.0052372

From the output table, we can see that coefficients are changed minimally, and the observation 1630 is not influential.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.poisson.zeros,layout = c(3, 4),ask=F)

##                    Test stat Pr(>|t|)
## VolatileAcidity        0.910    0.340
## FreeSulfurDioxide      0.124    0.725
## TotalSulfurDioxide     0.570    0.450
## pH                     0.633    0.426
## Sulphates              0.079    0.779
## Alcohol                1.369    0.242
## LabelAppeal           21.183    0.000
## AcidIndex             43.615    0.000
## STARS                666.096    0.000
glm_full_aj <- glm(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), family = poisson, data = wine_training_data.zeros.train) 

fit.poisson.zeros_aj <-step(glm_full_aj,direction='backward',family = poisson, data = wine_training_data.zeros.train, trace = FALSE)

residualPlots(fit.poisson.zeros_aj,layout = c(3, 4),ask=F)

##                      Test stat Pr(>|t|)
## VolatileAcidity          0.468    0.494
## FreeSulfurDioxide        0.021    0.884
## TotalSulfurDioxide       0.669    0.413
## Sulphates                0.000    0.999
## Alcohol                  0.822    0.365
## LabelAppeal              0.209    0.648
## AcidIndex                0.000    1.000
## STARS                    0.000    1.000
## log(LabelAppeal + 3)     0.145    0.703
## I(AcidIndex^2)          13.065    0.000
## I(STARS^2)              91.112    0.000
summary(fit.poisson.zeros_aj)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + 
##     AcidIndex + STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + 
##     I(STARS^2), family = poisson, data = wine_training_data.zeros.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.213  -0.659  -0.002   0.493   3.564  
## 
## Coefficients:
##                        Estimate Std. Error z value             Pr(>|z|)
## (Intercept)          -0.1982206  0.1812114   -1.09               0.2740
## VolatileAcidity      -0.0324472  0.0075509   -4.30          0.000017301
## FreeSulfurDioxide     0.0000952  0.0000397    2.40               0.0165
## TotalSulfurDioxide    0.0000613  0.0000254    2.41               0.0160
## Sulphates            -0.0121488  0.0062274   -1.95               0.0511
## Alcohol               0.0038121  0.0015964    2.39               0.0169
## LabelAppeal           0.0669199  0.0306619    2.18               0.0291
## AcidIndex             0.1187903  0.0369791    3.21               0.0013
## STARS                 0.7040393  0.0171154   41.13 < 0.0000000000000002
## log(LabelAppeal + 3)  0.2670993  0.0867050    3.08               0.0021
## I(AcidIndex^2)       -0.0118800  0.0022055   -5.39          0.000000072
## I(STARS^2)           -0.1042046  0.0042411  -24.57 < 0.0000000000000002
##                         
## (Intercept)             
## VolatileAcidity      ***
## FreeSulfurDioxide    *  
## TotalSulfurDioxide   *  
## Sulphates            .  
## Alcohol              *  
## LabelAppeal          *  
## AcidIndex            ** 
## STARS                ***
## log(LabelAppeal + 3) ** 
## I(AcidIndex^2)       ***
## I(STARS^2)           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 17042  on 9595  degrees of freedom
## Residual deviance: 10329  on 9584  degrees of freedom
## AIC: 34347
## 
## Number of Fisher Scoring iterations: 6

3.6.3 Model Interpretation

coef <- summary(fit.poisson.zeros_aj)$coefficients[,1]
var.name <- variable.names(fit.poisson.zeros_aj) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) -0.20 0.82
VolatileAcidity -0.03 0.97
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
Sulphates -0.01 0.99
Alcohol 0.00 1.00
LabelAppeal 0.07 1.07
AcidIndex 0.12 1.13
STARS 0.70 2.02
log(LabelAppeal + 3) 0.27 1.31
I(AcidIndex^2) -0.01 0.99
I(STARS^2) -0.10 0.90

3.6.3 new Model diagnostics

#good-of-fitness test
p_poisson_zero_aj <- 1-pchisq(summary(fit.poisson.zeros_aj)$deviance, summary(fit.poisson.zeros_aj)$df.residual)
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_6 <-deviance(fit.poisson.zeros_aj)/fit.poisson.zeros_aj$df.residual
dispersiontest(fit.poisson.zeros_aj)   # library(AER)
## 
##  Overdispersion test
## 
## data:  fit.poisson.zeros_aj
## z = -9, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##      0.876
#c = 0 equidispersion, c > 0 is overdispersed

c =1.15 >0. The assumption of equidispersion is not rejected.

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.zeros_aj, n = 250)  #library(DHARMa)
plot(simulationOutput)

#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.zeros_aj))  #library(faraway)

# outlier test
outlierTest(fit.poisson.zeros_aj)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 8887     3.57           0.000362           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.poisson.zeros_aj)

##      StudRes      Hat   CookD
## 2729    2.74 0.033751 0.03931
## 8887    3.57 0.000692 0.00143

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 2729, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.poisson.zeros_aj_2729 <- update(fit.poisson.zeros_aj,subset=c(-2729))
compareCoefs(fit.poisson.zeros_aj, fit.poisson.zeros_aj_2729 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##   STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), family = 
##   poisson, data = wine_training_data.zeros.train)
## 2: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##   STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), family = 
##   poisson, data = wine_training_data.zeros.train, subset = c(-2729))
##                          Est. 1       SE 1     Est. 2       SE 2
## (Intercept)          -0.1982206  0.1812114 -0.2917920  0.1843297
## VolatileAcidity      -0.0324472  0.0075509 -0.0324703  0.0075515
## FreeSulfurDioxide     0.0000952  0.0000397  0.0000956  0.0000397
## TotalSulfurDioxide    0.0000613  0.0000254  0.0000608  0.0000254
## Sulphates            -0.0121488  0.0062274 -0.0121160  0.0062275
## Alcohol               0.0038121  0.0015964  0.0038365  0.0015966
## LabelAppeal           0.0669199  0.0306619  0.0651507  0.0306764
## AcidIndex             0.1187903  0.0369791  0.1419165  0.0379481
## STARS                 0.7040393  0.0171154  0.7030149  0.0171176
## log(LabelAppeal + 3)  0.2670993  0.0867050  0.2716263  0.0867434
## I(AcidIndex^2)       -0.0118800  0.0022055 -0.0133409  0.0022713
## I(STARS^2)           -0.1042046  0.0042411 -0.1040184  0.0042416

From the output table, we can see that coefficients are changed minimally, and the observation 2729 is not influential.

We then use rootogram approach to test the assessment of fit as it is an improved test for a count regression model

# check the good of fitness of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.poisson.zeros_aj)

The bar hanging from each point on the theoretical Poisson fit (red line) represents the difference between expected and observed counts. When a bar hanging below 0, it is underfitting. When a bar hanging above 0, it is overfitting. We can see that all variables have either underfitting or overfitting problem except the No.7 variable, “TotalSulfurDioxide”.

3.6.4 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_zero_1 <- 1 - fit.poisson.zeros_aj$deviance / fit.poisson.zeros_aj$null.deviance)
## [1] 0.394
#or
#(pR2 <- 1- logLik(logitfit.1)/logLik(logit.null))

# Log likelihood
(logLik_zero_1 <-logLik(fit.poisson.zeros_aj))
## 'log Lik.' -17161 (df=12)
# AIC
AIC_zero_1 <- fit.poisson.zeros_aj$aic
BIC_zero_1 <- BIC(fit.poisson.zeros_aj)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_zero_1<- cbind(wine_training_data.zeros.test, 
      Mean = predict(fit.poisson.zeros, newdata=wine_training_data.zeros.test, type="response"), 
      SE = predict(fit.poisson.zeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
      )

evl_df_zero_1 <- as.data.frame(aggregate(ndf_zero_1[, 17], list(ndf_zero_1$TARGET), mean))
evl_df_zero_1 <- cbind(evl_df_zero_1,aggregate(ndf_zero_1[, 18], list(ndf_zero_1 $TARGET), mean)[,2])
colnames(evl_df_zero_1) <- c('TARGET','Mean','SE')

Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
(pR2_zero_1z <- 1- logLik(fit.poisson.zeros_zeroinf)[1]/logLik(fit.poisson.zeros_zeroinf)[1])
## [1] 0
# Log likelihood
(logLik_zero_1z<-logLik(fit.poisson.zeros_zeroinf))
## 'log Lik.' -15349 (df=30)
#dispersion
c_zero_1z <- deviance(fit.poisson.zeros_zeroinf)/fit.poisson.zeros_zeroinf$df.residual

# AIC
AIC_zero_1z <- AIC(fit.poisson.zeros_zeroinf)
BIC_zero_1z <- BIC(fit.poisson.zeros_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_zero_1z<- cbind(wine_training_data.zeros.test, 
      Count = predict(fit.poisson.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "count"), 
      Zero = predict(fit.poisson.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "zero")
      )

evl_df_zero_1z <- as.data.frame(aggregate(ndf_zero_1z[, 17], list(ndf_zero_1z$TARGET), mean))
evl_df_zero_1z <- cbind(evl_df_zero_1z,aggregate(ndf_zero_1z[, 18], list(ndf_zero_1z$TARGET), mean)[,2])
colnames(evl_df_zero_1z) <- c('TARGET','Count','Zero')

3.7 Poisson GLM: “NAs Removed”

3.7.1 Build the initial model

fit.poisson.nozeros <- step(glm(TARGET ~ . , family = poisson, data = wine_training_data.nozeros.train), trace = FALSE)
summary(fit.poisson.nozeros)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     Alcohol + LabelAppeal + AcidIndex + STARS, family = poisson, 
##     data = wine_training_data.nozeros.train)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.228  -0.275   0.070   0.374   1.727  
## 
## Coefficients:
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        1.1958931  0.0505635   23.65 < 0.0000000000000002 ***
## VolatileAcidity   -0.0271442  0.0079918   -3.40              0.00068 ***
## FreeSulfurDioxide  0.0000892  0.0000421    2.12              0.03405 *  
## Alcohol            0.0041172  0.0016916    2.43              0.01494 *  
## LabelAppeal        0.1809400  0.0075738   23.89 < 0.0000000000000002 ***
## AcidIndex         -0.0483117  0.0055739   -8.67 < 0.0000000000000002 ***
## STARS              0.1895270  0.0070857   26.75 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6583.0  on 7076  degrees of freedom
## Residual deviance: 4473.7  on 7070  degrees of freedom
## AIC: 25466
## 
## Number of Fisher Scoring iterations: 5

3.7.2 Interpretation

coef <- summary(fit.poisson.nozeros)$coefficients[,1]
var.name <- variable.names(fit.poisson.nozeros) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) 1.20 3.31
VolatileAcidity -0.03 0.97
FreeSulfurDioxide 0.00 1.00
Alcohol 0.00 1.00
LabelAppeal 0.18 1.20
AcidIndex -0.05 0.95
STARS 0.19 1.21

3.7.3 Model diagnostics and adjustment

#good-of-fitness test
p_poisson_nozero <- 1-pchisq(summary(fit.poisson.nozeros)$deviance, summary(fit.poisson.nozeros)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_7<-deviance(fit.poisson.nozeros)/fit.poisson.nozeros$df.residual
dispersiontest(fit.poisson.nozeros)   # library(AER)
## 
##  Overdispersion test
## 
## data:  fit.poisson.nozeros
## z = -50, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##      0.431
#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.poisson.nozeros_zeroinf <- zeroinfl(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.nozeros.train, dist="poisson")    #library(pscl)
AIC(fit.poisson.nozeros, fit.poisson.nozeros_zeroinf)
##                             df   AIC
## fit.poisson.nozeros          7 25466
## fit.poisson.nozeros_zeroinf 30 24548
summary(fit.poisson.nozeros_zeroinf)
## 
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = wine_training_data.nozeros.train, dist = "poisson")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3326 -0.2911  0.0461  0.3422  4.0587 
## 
## Count model coefficients (poisson with log link):
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         1.3979183  0.2431431    5.75          0.000000009 ***
## FixedAcidity        0.0004540  0.0010371    0.44              0.66156    
## VolatileAcidity    -0.0101632  0.0081336   -1.25              0.21147    
## CitricAcid         -0.0010763  0.0072655   -0.15              0.88223    
## ResidualSugar      -0.0000995  0.0001884   -0.53              0.59741    
## Chlorides          -0.0189518  0.0201411   -0.94              0.34673    
## FreeSulfurDioxide   0.0000370  0.0000426    0.87              0.38450    
## TotalSulfurDioxide -0.0000129  0.0000270   -0.48              0.63413    
## Density            -0.2854740  0.2387480   -1.20              0.23181    
## pH                  0.0080308  0.0094516    0.85              0.39550    
## Sulphates          -0.0015614  0.0067169   -0.23              0.81618    
## Alcohol             0.0064211  0.0017188    3.74              0.00019 ***
## LabelAppeal         0.2134811  0.0077528   27.54 < 0.0000000000000002 ***
## AcidIndex          -0.0170951  0.0059300   -2.88              0.00394 ** 
## STARS               0.1153341  0.0074931   15.39 < 0.0000000000000002 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        -4.907803   2.665825   -1.84              0.06562 .  
## FixedAcidity       -0.005840   0.011406   -0.51              0.60867    
## VolatileAcidity     0.330304   0.086690    3.81              0.00014 ***
## CitricAcid         -0.049234   0.079298   -0.62              0.53469    
## ResidualSugar      -0.001095   0.002088   -0.52              0.59988    
## Chlorides          -0.245534   0.225846   -1.09              0.27696    
## FreeSulfurDioxide  -0.001305   0.000496   -2.63              0.00843 ** 
## TotalSulfurDioxide -0.000749   0.000306   -2.45              0.01447 *  
## Density             2.019178   2.600799    0.78              0.43753    
## pH                  0.180433   0.103810    1.74              0.08219 .  
## Sulphates           0.122848   0.073702    1.67              0.09555 .  
## Alcohol             0.047983   0.018248    2.63              0.00855 ** 
## LabelAppeal         0.734222   0.089350    8.22 < 0.0000000000000002 ***
## AcidIndex           0.527523   0.050415   10.46 < 0.0000000000000002 ***
## STARS              -4.012970   0.422826   -9.49 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Number of iterations in BFGS optimization: 42 
## Log-likelihood: -1.22e+04 on 30 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.nozeros))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.nozeros, n = 250)  #library(DHARMa)
plot(simulationOutput)

#residual plots
residualPlots(fit.poisson.nozeros)

##                   Test stat Pr(>|t|)
## VolatileAcidity       0.170    0.680
## FreeSulfurDioxide     0.213    0.644
## Alcohol               0.739    0.390
## LabelAppeal          20.185    0.000
## AcidIndex             6.483    0.011
## STARS                77.088    0.000

The GOF test indicates that the Poisson model fit the data (p > 0.05).

# outlier test
outlierTest(fit.poisson.nozeros)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 4922    -3.23            0.00124           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.poisson.nozeros)

##      StudRes     Hat   CookD
## 369   -3.127 0.00320 0.00224
## 2729   0.611 0.01127 0.00067
## 4922  -3.229 0.00139 0.00104

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 369, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.poisson.nozeros_369 <- update(fit.poisson.nozeros,subset=c(-369))
compareCoefs(fit.poisson.nozeros, fit.poisson.nozeros_369 )
## 
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   Alcohol + LabelAppeal + AcidIndex + STARS, family = poisson, data = 
##   wine_training_data.nozeros.train)
## 2: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   Alcohol + LabelAppeal + AcidIndex + STARS, family = poisson, data = 
##   wine_training_data.nozeros.train, subset = c(-369))
##                       Est. 1       SE 1     Est. 2       SE 2
## (Intercept)        1.1958931  0.0505635  1.1957484  0.0505657
## VolatileAcidity   -0.0271442  0.0079918 -0.0271426  0.0079919
## FreeSulfurDioxide  0.0000892  0.0000421  0.0000893  0.0000421
## Alcohol            0.0041172  0.0016916  0.0041150  0.0016917
## LabelAppeal        0.1809400  0.0075738  0.1809053  0.0075746
## AcidIndex         -0.0483117  0.0055739 -0.0482984  0.0055740
## STARS              0.1895270  0.0070857  0.1895478  0.0070860

From the output table, we can see that coefficients are changed minimally, and the observation 369 is not influential.

3.7.4 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_nozero_1 <- 1 - fit.poisson.nozeros$deviance / fit.poisson.nozeros$null.deviance)
## [1] 0.32
#or
#(pR2 <- 1- logLik(logitfit.1)/logLik(logit.null))

# Log likelihood
(logLik_nozero_1 <-logLik(fit.poisson.nozeros))
## 'log Lik.' -12726 (df=7)
# AIC
AIC_nozero_1 <- fit.poisson.nozeros$aic
BIC_nozero_1 <- BIC(fit.poisson.nozeros)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_nozero_1<- cbind(wine_training_data.nozeros.test, 
      Mean = predict(fit.poisson.nozeros, newdata=wine_training_data.nozeros.test, type="response"), 
      SE = predict(fit.poisson.nozeros, newdata=wine_training_data.nozeros.test, type="response", se.fit=T)$se.fit
      )

evl_df_nozero_1 <- as.data.frame(aggregate(ndf_nozero_1[, 17], list(ndf_nozero_1$TARGET), mean))
evl_df_nozero_1 <- cbind(evl_df_nozero_1,aggregate(ndf_nozero_1[, 18], list(ndf_nozero_1$TARGET), mean)[,2])
colnames(evl_df_nozero_1) <- c('TARGET','Mean','SE')
evl_df_nozero_1
##   TARGET Mean     SE
## 1      0 2.85 0.0527
## 2      1 2.35 0.0444
## 3      2 2.63 0.0445
## 4      3 3.07 0.0472
## 5      4 3.69 0.0552
## 6      5 4.35 0.0673
## 7      6 5.22 0.0896
## 8      7 6.04 0.1056
## 9      8 6.56 0.1237

Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
zeroinf_null <-zeroinfl(TARGET ~ 1,wine_training_data.nozeros.train,dist="poisson")
(pR2_nozero_1z <- 1- logLik(fit.poisson.nozeros_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.0931
# Log likelihood
(logLik_nozero_1z<-logLik(fit.poisson.nozeros_zeroinf))
## 'log Lik.' -12244 (df=30)
#dispersion
c_nozero_1z <- deviance(fit.poisson.nozeros_zeroinf)/fit.poisson.nozeros_zeroinf$df.residual

# AIC
AIC_nozero_1z <- AIC(fit.poisson.nozeros_zeroinf)
BIC_nozero_1z <- BIC(fit.poisson.nozeros_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_nozero_1z<- cbind(wine_training_data.nozeros.test, 
      Count = predict(fit.poisson.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "count"), 
      Zero = predict(fit.poisson.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "zero")
      )

evl_df_nozero_1z <- as.data.frame(aggregate(ndf_nozero_1z[, 17], list(ndf_nozero_1z$TARGET), mean))
evl_df_nozero_1z <- cbind(evl_df_nozero_1z,aggregate(ndf_nozero_1z[, 18], list(ndf_nozero_1z$TARGET), mean)[,2])
colnames(evl_df_nozero_1z) <- c('TARGET','Count','Zero')
evl_df_nozero_1z
##   TARGET Count     Zero
## 1      0  3.31 0.222407
## 2      1  2.53 0.044353
## 3      2  2.79 0.048695
## 4      3  3.25 0.052220
## 5      4  3.87 0.042633
## 6      5  4.50 0.029627
## 7      6  5.31 0.015806
## 8      7  6.04 0.001156
## 9      8  6.56 0.000152

3.8 Negative Binomial GLM with SCORE: “Zeros-for-NAs”

3.8.1 Build the initial model

fit.nb.zeros <- step(glm.nb(TARGET ~ . , data = wine_training_data.zeros.train), trace = FALSE)
summary(fit.nb.zeros)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##     AcidIndex + STARS, data = wine_training_data.zeros.train, 
##     init.theta = 49395.31414, link = log)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -2.987  -0.696   0.071   0.578   3.205  
## 
## Coefficients:
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         1.2255779  0.0556051   22.04 < 0.0000000000000002 ***
## VolatileAcidity    -0.0351533  0.0075420   -4.66            0.0000031 ***
## FreeSulfurDioxide   0.0001297  0.0000398    3.26               0.0011 ** 
## TotalSulfurDioxide  0.0000721  0.0000254    2.84               0.0046 ** 
## pH                 -0.0129241  0.0087062   -1.48               0.1377    
## Sulphates          -0.0118126  0.0062361   -1.89               0.0582 .  
## Alcohol             0.0026729  0.0015981    1.67               0.0944 .  
## LabelAppeal         0.1382709  0.0070003   19.75 < 0.0000000000000002 ***
## AcidIndex          -0.0854983  0.0051555  -16.58 < 0.0000000000000002 ***
## STARS               0.3079539  0.0052371   58.80 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(49395) family taken to be 1)
## 
##     Null deviance: 17041  on 9595  degrees of freedom
## Residual deviance: 11034  on 9586  degrees of freedom
## AIC: 35050
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  49395 
##           Std. Err.:  59121 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -35028

3.8.2 Model diagnostics and adjustment

#good-of-fitness test
p_nb_zero <- 1-pchisq(summary(fit.nb.zeros)$deviance, summary(fit.nb.zeros)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_8<-deviance(fit.nb.zeros)/fit.nb.zeros$df.residual

#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.nb.zeros_zeroinf <- zeroinfl(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.zeros.train, dist="negbin")    #library(pscl)
AIC(fit.nb.zeros, fit.nb.zeros_zeroinf)
##                      df   AIC
## fit.nb.zeros         11 35050
## fit.nb.zeros_zeroinf 31 30761
summary(fit.nb.zeros_zeroinf)
## 
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = wine_training_data.zeros.train, dist = "negbin")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.12026 -0.40960 -0.00474  0.36951  5.90131 
## 
## Count model coefficients (negbin with log link):
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         1.4524248  0.2321669    6.26         0.0000000004 ***
## FixedAcidity        0.0004040  0.0009795    0.41                0.680    
## VolatileAcidity    -0.0125228  0.0077475   -1.62                0.106    
## CitricAcid         -0.0016002  0.0069195   -0.23                0.817    
## ResidualSugar      -0.0000348  0.0001792   -0.19                0.846    
## Chlorides          -0.0126423  0.0191607   -0.66                0.509    
## FreeSulfurDioxide   0.0000406  0.0000403    1.01                0.314    
## TotalSulfurDioxide -0.0000132  0.0000255   -0.52                0.604    
## Density            -0.3125951  0.2280030   -1.37                0.170    
## pH                  0.0090600  0.0089678    1.01                0.312    
## Sulphates          -0.0015182  0.0064036   -0.24                0.813    
## Alcohol             0.0068259  0.0016343    4.18         0.0000295868 ***
## LabelAppeal         0.2336509  0.0072663   32.16 < 0.0000000000000002 ***
## AcidIndex          -0.0173759  0.0056128   -3.10                0.002 ** 
## STARS               0.1005929  0.0059930   16.78 < 0.0000000000000002 ***
## Log(theta)         18.1863991  2.0191302    9.01 < 0.0000000000000002 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        -4.126611   1.513832   -2.73              0.00641 ** 
## FixedAcidity        0.002360   0.006395    0.37              0.71210    
## VolatileAcidity     0.183788   0.050341    3.65              0.00026 ***
## CitricAcid         -0.053108   0.045920   -1.16              0.24746    
## ResidualSugar      -0.000896   0.001163   -0.77              0.44087    
## Chlorides          -0.002491   0.123390   -0.02              0.98389    
## FreeSulfurDioxide  -0.000654   0.000271   -2.41              0.01591 *  
## TotalSulfurDioxide -0.000798   0.000172   -4.64            0.0000034 ***
## Density             0.110502   1.491063    0.07              0.94092    
## pH                  0.215475   0.058039    3.71              0.00021 ***
## Sulphates           0.120458   0.041803    2.88              0.00396 ** 
## Alcohol             0.028322   0.010613    2.67              0.00762 ** 
## LabelAppeal         0.673224   0.048866   13.78 < 0.0000000000000002 ***
## AcidIndex           0.439101   0.029591   14.84 < 0.0000000000000002 ***
## STARS              -2.340593   0.068029  -34.41 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 79113896.367 
## Number of iterations in BFGS optimization: 64 
## Log-likelihood: -1.53e+04 on 31 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.zeros))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.nb.zeros, n = 250)  #library(DHARMa)
plot(simulationOutput)

The GOF test indicates that the Poisson model fit the data (p > 0.05).

# outlier test
outlierTest(fit.nb.zeros)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 8887     2.99             0.0028           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.nb.zeros)

##      StudRes     Hat   CookD
## 1630   -1.14 0.00927 0.00122
## 3953    2.42 0.00227 0.00245
## 8887    2.99 0.00059 0.00108

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 1630, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.nb.zeros_1630 <- update(fit.nb.zeros,subset=c(-1630))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.zeros, fit.nb.zeros_1630 )
## 
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, data = wine_training_data.zeros.train, init.theta = 
##   49395.31414, link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal + 
##   AcidIndex + STARS, data = wine_training_data.zeros.train, subset = 
##   c(-1630), init.theta = 49384.41221, link = log)
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)         1.2255779  0.0556051  1.2265761  0.0556140
## VolatileAcidity    -0.0351533  0.0075420 -0.0357263  0.0075581
## FreeSulfurDioxide   0.0001297  0.0000398  0.0001296  0.0000398
## TotalSulfurDioxide  0.0000721  0.0000254  0.0000712  0.0000254
## pH                 -0.0129241  0.0087062 -0.0128755  0.0087058
## Sulphates          -0.0118126  0.0062361 -0.0119226  0.0062364
## Alcohol             0.0026729  0.0015981  0.0027331  0.0015989
## LabelAppeal         0.1382709  0.0070003  0.1383436  0.0070003
## AcidIndex          -0.0854983  0.0051555 -0.0857001  0.0051592
## STARS               0.3079539  0.0052371  0.3080433  0.0052374

From the output table, we can see that coefficients are changed minimally, and the observation 1630 is not influential.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.nb.zeros,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

##                    Test stat Pr(>|t|)
## VolatileAcidity        0.910    0.340
## FreeSulfurDioxide      0.124    0.725
## TotalSulfurDioxide     0.570    0.450
## pH                     0.633    0.426
## Sulphates              0.079    0.779
## Alcohol                1.369    0.242
## LabelAppeal           21.183    0.000
## AcidIndex             43.607    0.000
## STARS                666.019    0.000
glm_full_aero_aj <- glm.nb(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = wine_training_data.zeros.train) 
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
fit.nb.zeros_aj <-step(glm_full_aero_aj,direction='backward',data = wine_training_data.zeros.train, trace = FALSE)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
residualPlots(fit.nb.zeros_aj,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

##                      Test stat Pr(>|t|)
## VolatileAcidity          0.468    0.494
## FreeSulfurDioxide        0.021    0.884
## TotalSulfurDioxide       0.669    0.413
## Sulphates                0.000    0.999
## Alcohol                  0.822    0.365
## LabelAppeal              0.209    0.648
## AcidIndex                0.000    1.000
## STARS                    0.000    1.000
## log(LabelAppeal + 3)     0.145    0.703
## I(AcidIndex^2)          13.067    0.000
## I(STARS^2)              91.089    0.000
summary(fit.nb.zeros_aj)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + 
##     AcidIndex + STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + 
##     I(STARS^2), data = wine_training_data.zeros.train, init.theta = 42997.68096, 
##     link = log)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.213  -0.659  -0.002   0.493   3.563  
## 
## Coefficients:
##                        Estimate Std. Error z value             Pr(>|z|)
## (Intercept)          -0.1982094  0.1812179   -1.09               0.2741
## VolatileAcidity      -0.0324480  0.0075512   -4.30          0.000017307
## FreeSulfurDioxide     0.0000952  0.0000397    2.40               0.0165
## TotalSulfurDioxide    0.0000613  0.0000254    2.41               0.0160
## Sulphates            -0.0121492  0.0062277   -1.95               0.0511
## Alcohol               0.0038120  0.0015965    2.39               0.0170
## LabelAppeal           0.0669180  0.0306631    2.18               0.0291
## AcidIndex             0.1187880  0.0369803    3.21               0.0013
## STARS                 0.7040426  0.0171160   41.13 < 0.0000000000000002
## log(LabelAppeal + 3)  0.2671015  0.0867079    3.08               0.0021
## I(AcidIndex^2)       -0.0118800  0.0022055   -5.39          0.000000072
## I(STARS^2)           -0.1042053  0.0042413  -24.57 < 0.0000000000000002
##                         
## (Intercept)             
## VolatileAcidity      ***
## FreeSulfurDioxide    *  
## TotalSulfurDioxide   *  
## Sulphates            .  
## Alcohol              *  
## LabelAppeal          *  
## AcidIndex            ** 
## STARS                ***
## log(LabelAppeal + 3) ** 
## I(AcidIndex^2)       ***
## I(STARS^2)           ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(42998) family taken to be 1)
## 
##     Null deviance: 17041  on 9595  degrees of freedom
## Residual deviance: 10328  on 9584  degrees of freedom
## AIC: 34349
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  42998 
##           Std. Err.:  43490 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -34323

3.8.3 Interpretation

coef <- summary(fit.nb.zeros_aj)$coefficients[,1]
var.name <- variable.names(fit.nb.zeros_aj) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) -0.20 0.82
VolatileAcidity -0.03 0.97
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
Sulphates -0.01 0.99
Alcohol 0.00 1.00
LabelAppeal 0.07 1.07
AcidIndex 0.12 1.13
STARS 0.70 2.02
log(LabelAppeal + 3) 0.27 1.31
I(AcidIndex^2) -0.01 0.99
I(STARS^2) -0.10 0.90

3.8.4 new Model diagnostics

#good-of-fitness test
p_nb_zero_aj <- 1-pchisq(summary(fit.nb.zeros_aj)$deviance, summary(fit.nb.zeros_aj)$df.residual)
  cat("p_nb_zero_aj:" ,p_nb_zero_aj,"\n\n")
## p_nb_zero_aj: 0.0000000768
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_8 <- deviance(fit.nb.zeros_aj)/fit.nb.zeros_aj$df.residual
#c = 0 equidispersion, c > 0 is overdispersed

c =1.07 >0. The assumption of equidispersion is not rejected.

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.nb.zeros_aj, n = 250)  #library(DHARMa)
plot(simulationOutput)

#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.zeros_aj))  #library(faraway)

# outlier test
outlierTest(fit.nb.zeros_aj)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 8887     3.44           0.000591           NA

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.nb.zeros_aj)

##      StudRes      Hat   CookD
## 2729    2.64 0.033751 0.03931
## 8887    3.44 0.000692 0.00143

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 2729, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.nb.zeros_aj_2729 <- update(fit.nb.zeros_aj,subset=c(-2729))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.zeros_aj, fit.nb.zeros_aj_2729 )
## 
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##   STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), data = 
##   wine_training_data.zeros.train, init.theta = 42997.68096, link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##   STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), data = 
##   wine_training_data.zeros.train, subset = c(-2729), init.theta = 
##   42887.79241, link = log)
##                          Est. 1       SE 1     Est. 2       SE 2
## (Intercept)          -0.1982094  0.1812179 -0.2917839  0.1843363
## VolatileAcidity      -0.0324480  0.0075512 -0.0324711  0.0075519
## FreeSulfurDioxide     0.0000952  0.0000397  0.0000956  0.0000397
## TotalSulfurDioxide    0.0000613  0.0000254  0.0000608  0.0000254
## Sulphates            -0.0121492  0.0062277 -0.0121164  0.0062278
## Alcohol               0.0038120  0.0015965  0.0038364  0.0015966
## LabelAppeal           0.0669180  0.0306631  0.0651486  0.0306776
## AcidIndex             0.1187880  0.0369803  0.1419150  0.0379494
## STARS                 0.7040426  0.0171160  0.7030182  0.0171182
## log(LabelAppeal + 3)  0.2671015  0.0867079  0.2716288  0.0867463
## I(AcidIndex^2)       -0.0118800  0.0022055 -0.0133410  0.0022714
## I(STARS^2)           -0.1042053  0.0042413 -0.1040191  0.0042418

From the output table, we can see that coefficients are changed minimally, and the observation 2729 is not influential.

We then use rootogram approach to test the assessment of fit as it is an improved test for a count regression model

# check the good of fitness of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.nb.zeros_aj)

The bar hanging from each point on the theoretical Poisson fit (red line) represents the difference between expected and observed counts. When a bar hanging below 0, it is underfitting. When a bar hanging above 0, it is overfitting. We can see that all variables have either underfitting or overfitting problem except the No.7 variable, “TotalSulfurDioxide”.

3.8.5 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_zero_2 <- 1 - fit.nb.zeros_aj$deviance / fit.nb.zeros_aj$null.deviance)
## [1] 0.394
# Log likelihood
(logLik_zero_2 <-logLik(fit.nb.zeros_aj))
## 'log Lik.' -17161 (df=13)
# AIC
AIC_zero_2 <- fit.nb.zeros_aj$aic
BIC_zero_2 <- BIC(fit.nb.zeros_aj)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_zero_2<- cbind(wine_training_data.zeros.test, 
      Mean = predict(fit.nb.zeros_aj, newdata=wine_training_data.zeros.test, type="response"), 
      SE = predict(fit.nb.zeros_aj, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
      )

evl_df_zero_2 <- as.data.frame(aggregate(ndf_zero_2[, 17], list(ndf_zero_2$TARGET), mean))
evl_df_zero_2 <- cbind(evl_df_zero_2,aggregate(ndf_zero_2[, 18], list(ndf_zero_2$TARGET), mean)[,2])
colnames(evl_df_zero_2) <- c('TARGET','Mean','SE')
evl_df_zero_2
##   TARGET Mean     SE
## 1      0 1.57 0.0361
## 2      1 1.42 0.0427
## 3      2 2.06 0.0444
## 4      3 2.75 0.0499
## 5      4 3.59 0.0627
## 6      5 4.25 0.0779
## 7      6 4.86 0.1019
## 8      7 5.41 0.1151
## 9      8 6.00 0.1608

Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
zero.zeroinf_null <- zeroinfl(TARGET ~ 1, data=wine_training_data.zeros.train, dist="negbin")
## Warning in sqrt(diag(vc)[np]): NaNs produced
(pR2_zero_2z <- 1- logLik(fit.nb.zeros_zeroinf)[1]/logLik(zero.zeroinf_null)[1] )
## [1] 0.164
# Log likelihood
(logLik_zero_2z<-logLik(fit.nb.zeros_zeroinf))
## 'log Lik.' -15349 (df=31)
#dispersion
c_zero_2z <- deviance(fit.nb.zeros_zeroinf)/fit.nb.zeros_zeroinf$df.residual

# AIC
AIC_zero_2z <- AIC(fit.nb.zeros_zeroinf)
BIC_zero_2z <- BIC(fit.nb.zeros_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_zero_2z<- cbind(wine_training_data.zeros.test, 
      Count = predict(fit.nb.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "count"), 
      Zero = predict(fit.nb.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "zero")
      )

evl_df_zero_2z <- as.data.frame(aggregate(ndf_zero_2z[, 17], list(ndf_zero_2z$TARGET), mean))
evl_df_zero_2z <- cbind(evl_df_zero_2z,aggregate(ndf_zero_2z[, 18], list(ndf_zero_2z$TARGET), mean)[,2])
colnames(evl_df_zero_2z) <- c('TARGET','Count','Zero')

3.9 Negative Binomial GLM with SCORE: “NAs Removed”

3.9.1 Build the initial model

fit.nb.nozeros <- step(glm.nb(TARGET ~ . , data = wine_training_data.nozeros.train), trace = FALSE)
summary(fit.nb.nozeros)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     Alcohol + LabelAppeal + AcidIndex + STARS, data = wine_training_data.nozeros.train, 
##     init.theta = 137870.7222, link = log)
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -3.228  -0.275   0.070   0.374   1.727  
## 
## Coefficients:
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        1.1958944  0.0505641   23.65 < 0.0000000000000002 ***
## VolatileAcidity   -0.0271444  0.0079920   -3.40              0.00068 ***
## FreeSulfurDioxide  0.0000892  0.0000421    2.12              0.03405 *  
## Alcohol            0.0041172  0.0016917    2.43              0.01494 *  
## LabelAppeal        0.1809402  0.0075739   23.89 < 0.0000000000000002 ***
## AcidIndex         -0.0483120  0.0055740   -8.67 < 0.0000000000000002 ***
## STARS              0.1895274  0.0070858   26.75 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(137871) family taken to be 1)
## 
##     Null deviance: 6582.8  on 7076  degrees of freedom
## Residual deviance: 4473.6  on 7070  degrees of freedom
## AIC: 25468
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  137871 
##           Std. Err.:  218980 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -25452

3.9.2 Interpretation

coef <- summary(fit.nb.nozeros)$coefficients[,1]
var.name <- variable.names(fit.nb.nozeros) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) 1.20 3.31
VolatileAcidity -0.03 0.97
FreeSulfurDioxide 0.00 1.00
Alcohol 0.00 1.00
LabelAppeal 0.18 1.20
AcidIndex -0.05 0.95
STARS 0.19 1.21

3.9.3 Model diagnostics and adjustment

#good-of-fitness test
p_nb_nozero <- 1-pchisq(summary(fit.nb.nozeros)$deviance, summary(fit.nb.nozeros)$df.residual)
cat("p-value:" ,p_nb_nozero,"\n\n")
## p-value: 1
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_9<-summary(fit.nb.nozeros)$deviance/summary(fit.nb.nozeros)$df.residual
#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
fit.nb.nozeros_zeroinf <- zeroinfl(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.nozeros.train, dist="negbin")    #library(pscl)
AIC(fit.nb.nozeros, fit.nb.nozeros_zeroinf)
##                        df   AIC
## fit.nb.nozeros          8 25468
## fit.nb.nozeros_zeroinf 31 24550
summary(fit.nb.nozeros_zeroinf)
## 
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = wine_training_data.nozeros.train, dist = "negbin")
## 
## Pearson residuals:
##     Min      1Q  Median      3Q     Max 
## -2.3326 -0.2911  0.0461  0.3422  4.0573 
## 
## Count model coefficients (negbin with log link):
##                      Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)         1.3977723  0.2431430    5.75         0.0000000090 ***
## FixedAcidity        0.0004545  0.0010371    0.44              0.66122    
## VolatileAcidity    -0.0101546  0.0081336   -1.25              0.21186    
## CitricAcid         -0.0010742  0.0072655   -0.15              0.88246    
## ResidualSugar      -0.0000994  0.0001884   -0.53              0.59779    
## Chlorides          -0.0189427  0.0201410   -0.94              0.34696    
## FreeSulfurDioxide   0.0000370  0.0000426    0.87              0.38447    
## TotalSulfurDioxide -0.0000129  0.0000270   -0.48              0.63406    
## Density            -0.2853505  0.2387478   -1.20              0.23201    
## pH                  0.0080346  0.0094516    0.85              0.39528    
## Sulphates          -0.0015687  0.0067169   -0.23              0.81534    
## Alcohol             0.0064208  0.0017188    3.74              0.00019 ***
## LabelAppeal         0.2134860  0.0077528   27.54 < 0.0000000000000002 ***
## AcidIndex          -0.0170927  0.0059300   -2.88              0.00395 ** 
## STARS               0.1153298  0.0074932   15.39 < 0.0000000000000002 ***
## Log(theta)         17.9661270  3.0007605    5.99         0.0000000021 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##                     Estimate Std. Error z value             Pr(>|z|)    
## (Intercept)        -4.923766   2.666214   -1.85              0.06479 .  
## FixedAcidity       -0.005837   0.011407   -0.51              0.60884    
## VolatileAcidity     0.330347   0.086696    3.81              0.00014 ***
## CitricAcid         -0.049216   0.079304   -0.62              0.53486    
## ResidualSugar      -0.001095   0.002088   -0.52              0.60007    
## Chlorides          -0.245550   0.225862   -1.09              0.27696    
## FreeSulfurDioxide  -0.001305   0.000496   -2.63              0.00843 ** 
## TotalSulfurDioxide -0.000750   0.000306   -2.45              0.01445 *  
## Density             2.035288   2.601093    0.78              0.43394    
## pH                  0.180500   0.103816    1.74              0.08210 .  
## Sulphates           0.122905   0.073707    1.67              0.09542 .  
## Alcohol             0.047988   0.018250    2.63              0.00855 ** 
## LabelAppeal         0.734345   0.089361    8.22 < 0.0000000000000002 ***
## AcidIndex           0.527558   0.050420   10.46 < 0.0000000000000002 ***
## STARS              -4.013751   0.423096   -9.49 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 63473116.874 
## Number of iterations in BFGS optimization: 64 
## Log-likelihood: -1.22e+04 on 31 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.nozeros))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.nb.nozeros, n = 250)  #library(DHARMa)
plot(simulationOutput)

The GOF test indicates that the Poisson model fit the data (p =1 > 0.05).

# outlier test
outlierTest(fit.nb.nozeros)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 4922    -4.06          0.0000489        0.346

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.nb.nozeros)

##      StudRes     Hat   CookD
## 369   -3.935 0.00320 0.00224
## 2729   0.768 0.01127 0.00067
## 4922  -4.063 0.00139 0.00104

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 369, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.nb.nozeros_369 <- update(fit.nb.nozeros,subset=c(-369))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.nozeros, fit.nb.nozeros_369 )
## 
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   Alcohol + LabelAppeal + AcidIndex + STARS, data = 
##   wine_training_data.nozeros.train, init.theta = 137870.7222, link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   Alcohol + LabelAppeal + AcidIndex + STARS, data = 
##   wine_training_data.nozeros.train, subset = c(-369), init.theta = 
##   137848.1397, link = log)
##                       Est. 1       SE 1     Est. 2       SE 2
## (Intercept)        1.1958944  0.0505641  1.1957497  0.0505664
## VolatileAcidity   -0.0271444  0.0079920 -0.0271428  0.0079921
## FreeSulfurDioxide  0.0000892  0.0000421  0.0000893  0.0000421
## Alcohol            0.0041172  0.0016917  0.0041150  0.0016917
## LabelAppeal        0.1809402  0.0075739  0.1809055  0.0075747
## AcidIndex         -0.0483120  0.0055740 -0.0482987  0.0055741
## STARS              0.1895274  0.0070858  0.1895483  0.0070861

From the output table, we can see that coefficients are changed minimally, and the observation 1630 is not influential.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.nb.nozeros,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

##                   Test stat Pr(>|t|)
## VolatileAcidity       0.170    0.680
## FreeSulfurDioxide     0.213    0.644
## Alcohol               0.739    0.390
## LabelAppeal          20.184    0.000
## AcidIndex             6.482    0.011
## STARS                77.082    0.000
#glm_full_aero_aj <- glm.nb(TARGET ~  FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = wine_training_data.zeros.train) 

#fit.nb.nozeros_aj <-step(glm_full_aero_aj,direction='backward',data = wine_training_data.zeros.train, trace = FALSE)

#residualPlots(fit.nb.nozeros_aj,layout = c(3, 4),ask=F)

#summary(fit.nb.nozeros_aj)

3.9.4 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_nozero_2 <- 1 - fit.nb.nozeros$deviance / fit.nb.nozeros$null.deviance)
## [1] 0.32
# Log likelihood
(logLik_nozero_2 <-logLik(fit.nb.nozeros))
## 'log Lik.' -12726 (df=8)
# AIC
AIC_nozero_2 <- fit.nb.nozeros$aic
BIC_nozero_2 <- BIC(fit.nb.nozeros)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_nozero_2<- cbind(wine_training_data.zeros.test, 
      Mean = predict(fit.nb.nozeros, newdata=wine_training_data.zeros.test, type="response"), 
      SE = predict(fit.nb.nozeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
      )

evl_df_nozero_2 <- as.data.frame(aggregate(ndf_nozero_2[, 17], list(ndf_nozero_2$TARGET), mean))
evl_df_nozero_2 <- cbind(evl_df_nozero_2,aggregate(ndf_nozero_2[, 18], list(ndf_nozero_2$TARGET), mean)[,2])
colnames(evl_df_nozero_2) <- c('TARGET','Mean','SE')
evl_df_nozero_2
##   TARGET Mean     SE
## 1      0 2.46 0.0532
## 2      1 1.99 0.0413
## 3      2 2.41 0.0433
## 4      3 2.90 0.0469
## 5      4 3.59 0.0548
## 6      5 4.27 0.0673
## 7      6 5.13 0.0890
## 8      7 5.94 0.1054
## 9      8 6.56 0.1237

Evaluation of the zeroinflated model

# goodness of fit: pseudo R squared
zero.nozeroinf_null <- zeroinfl(TARGET ~ 1, data=wine_training_data.nozeros.train, dist="negbin")
## Warning in sqrt(diag(vc)[np]): NaNs produced
(pR2_nozero_2z <- 1- logLik(fit.nb.nozeros_zeroinf)[1]/logLik(zero.nozeroinf_null)[1])
## [1] 0.0931
# Log likelihood
(logLik_nozero_2z<-logLik(fit.nb.nozeros_zeroinf))
## 'log Lik.' -12244 (df=31)
#dispersion
c_nozero_2z <- deviance(fit.nb.nozeros_zeroinf)/fit.nb.nozeros_zeroinf$df.residual

# AIC
AIC_nozero_2z <- AIC(fit.nb.nozeros_zeroinf)
BIC_nozero_2z <- BIC(fit.nb.nozeros_zeroinf)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_nozero_2z<- cbind(wine_training_data.nozeros.test, 
      Count = predict(fit.nb.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "count"), 
      Zero = predict(fit.nb.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "zero")
      )

evl_df_nozero_2z <- as.data.frame(aggregate(ndf_nozero_2z[, 17], list(ndf_nozero_2z$TARGET), mean))
evl_df_nozero_2z <- cbind(evl_df_nozero_2z, aggregate(ndf_nozero_2z[, 18], list(ndf_nozero_2z$TARGET), mean)[,2])
colnames(evl_df_nozero_2z) <- c('TARGET','Count','Zero')
evl_df_nozero_2z
##   TARGET Count     Zero
## 1      0  3.31 0.222407
## 2      1  2.53 0.044343
## 3      2  2.79 0.048684
## 4      3  3.25 0.052214
## 5      4  3.87 0.042630
## 6      5  4.50 0.029624
## 7      6  5.31 0.015804
## 8      7  6.04 0.001156
## 9      8  6.56 0.000152

3.10 Multiple Linear Regression Model with SCORE: “Zeros-for-NAs”

3.10.1 Build the initial Model

fit.mlr.zeros <- step(lm(TARGET ~ ., data = wine_training_data.zeros.train), trace = FALSE)
summary(fit.mlr.zeros)
## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex + 
##     STARS, data = wine_training_data.zeros.train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.549 -0.950  0.070  0.909  5.967 
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         4.0749449  0.5176253    7.87   0.0000000000000039 ***
## VolatileAcidity    -0.1031860  0.0173251   -5.96   0.0000000026778358 ***
## FreeSulfurDioxide   0.0003207  0.0000917    3.50              0.00047 ***
## TotalSulfurDioxide  0.0002007  0.0000586    3.42              0.00062 ***
## Density            -0.8932507  0.5094355   -1.75              0.07956 .  
## pH                 -0.0312522  0.0201038   -1.55              0.12009    
## Sulphates          -0.0322132  0.0144170   -2.23              0.02548 *  
## Alcohol             0.0108430  0.0036472    2.97              0.00296 ** 
## LabelAppeal         0.4464705  0.0158185   28.22 < 0.0000000000000002 ***
## AcidIndex          -0.2069486  0.0104436  -19.82 < 0.0000000000000002 ***
## STARS               0.9673370  0.0121174   79.83 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.33 on 9585 degrees of freedom
## Multiple R-squared:  0.524,  Adjusted R-squared:  0.523 
## F-statistic: 1.05e+03 on 10 and 9585 DF,  p-value: <0.0000000000000002

3.10.2 Interpretation

coef <- summary(fit.mlr.zeros)$coefficients[,1]
var.name <- variable.names(fit.mlr.zeros) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) 4.07 58.85
VolatileAcidity -0.10 0.90
FreeSulfurDioxide 0.00 1.00
TotalSulfurDioxide 0.00 1.00
Density -0.89 0.41
pH -0.03 0.97
Sulphates -0.03 0.97
Alcohol 0.01 1.01
LabelAppeal 0.45 1.56
AcidIndex -0.21 0.81
STARS 0.97 2.63

3.10.3 Model diagnostics and adjustment

#good-of-fitness test
p_mlr_zero <- 1-pchisq(deviance(fit.mlr.zeros), fit.mlr.zeros$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_10<-deviance(fit.mlr.zeros)/fit.mlr.zeros$df.residual

#c = 0 equidispersion, c > 0 is overdispersed

#Check for zero inflation
AIC(fit.mlr.zeros, fit.nb.zeros_zeroinf)
##                      df   AIC
## fit.mlr.zeros        12 32672
## fit.nb.zeros_zeroinf 31 30761
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.mlr.zeros))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.mlr.zeros, n = 250)  #library(DHARMa)
plot(simulationOutput)

The GOF test indicates that the Poisson model fit the data (p > 0.05).

# outlier test
outlierTest(fit.mlr.zeros)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 8887      4.5         0.00000676       0.0649

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.mlr.zeros)

##      StudRes      Hat   CookD
## 2729    1.29 0.007447 0.00114
## 3953    3.54 0.003087 0.00353
## 8887    4.50 0.000825 0.00152

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 2729, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.mlr.zeros_2729 <- update(fit.mlr.zeros,subset=c(-2729))
compareCoefs(fit.mlr.zeros, fit.mlr.zeros_2729 )
## 
## Call:
## 1: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + Density + pH + Sulphates + Alcohol + LabelAppeal 
##   + AcidIndex + STARS, data = wine_training_data.zeros.train)
## 2: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##   TotalSulfurDioxide + Density + pH + Sulphates + Alcohol + LabelAppeal 
##   + AcidIndex + STARS, data = wine_training_data.zeros.train, subset = 
##   c(-2729))
##                        Est. 1       SE 1     Est. 2       SE 2
## (Intercept)         4.0749449  0.5176253  4.1071606  0.5182061
## VolatileAcidity    -0.1031860  0.0173251 -0.1032255  0.0173245
## FreeSulfurDioxide   0.0003207  0.0000917  0.0003215  0.0000917
## TotalSulfurDioxide  0.0002007  0.0000586  0.0002008  0.0000586
## Density            -0.8932507  0.5094355 -0.9176445  0.5097667
## pH                 -0.0312522  0.0201038 -0.0312538  0.0201030
## Sulphates          -0.0322132  0.0144170 -0.0322240  0.0144165
## Alcohol             0.0108430  0.0036472  0.0108289  0.0036471
## LabelAppeal         0.4464705  0.0158185  0.4461516  0.0158198
## AcidIndex          -0.2069486  0.0104436 -0.2079136  0.0104699
## STARS               0.9673370  0.0121174  0.9670056  0.0121196

From the output table, we can see that coefficients are changed minimally, and the observation 2729 is not influential.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.mlr.zeros,layout = c(3, 4),ask=F)

##                    Test stat Pr(>|t|)
## VolatileAcidity       -0.747    0.455
## FreeSulfurDioxide      0.152    0.879
## TotalSulfurDioxide    -0.977    0.328
## Density               -1.195    0.232
## pH                    -0.681    0.496
## Sulphates              0.414    0.679
## Alcohol                1.193    0.233
## LabelAppeal            1.308    0.191
## AcidIndex             -2.067    0.039
## STARS                -14.836    0.000
## Tukey test            -6.748    0.000

3.10.4 Evaluation of the model

# goodness of fit: pseudo R squared
(pR2_zero_3 <- 1- logLik(fit.mlr.zeros)[1]/logLik(fit.mlr.zeros)[1])
## [1] 0
# Log likelihood
(logLik_zero_3 <-logLik(fit.mlr.zeros))
## 'log Lik.' -16324 (df=12)
# AIC
AIC_zero_3 <- AIC(fit.mlr.zeros)
BIC_zero_3 <- BIC(fit.mlr.zeros)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_zero_3<- cbind(wine_training_data.zeros.test, 
      Mean = predict(fit.mlr.zeros, newdata=wine_training_data.zeros.test, type="response"), 
      SE = predict(fit.mlr.zeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
      )

evl_df_zero_3 <- as.data.frame(aggregate(ndf_zero_3[, 17], list(ndf_zero_3$TARGET), mean))
evl_df_zero_3 <- cbind(evl_df_zero_3,aggregate(ndf_zero_3[, 18], list(ndf_zero_3$TARGET), mean)[,2])
colnames(evl_df_zero_3) <- c('TARGET','Mean','SE')
evl_df_zero_3
##   TARGET Mean     SE
## 1      0 1.68 0.0464
## 2      1 1.33 0.0485
## 3      2 2.07 0.0432
## 4      3 2.73 0.0416
## 5      4 3.54 0.0411
## 6      5 4.20 0.0432
## 7      6 4.89 0.0471
## 8      7 5.40 0.0484
## 9      8 5.77 0.0597

3.11 Multiple Linear Regression Model with SCORE: “NAs Removed”

3.11.1 Build the initial model

fit.mlr.nozeros <- step(lm(TARGET ~ ., data = wine_training_data.nozeros.train), trace = FALSE)
summary(fit.mlr.nozeros)
## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + Density + 
##     Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, data = wine_training_data.nozeros.train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -5.043 -0.516  0.129  0.725  3.492 
## 
## Coefficients:
##                     Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)        4.2449571  0.5219676    8.13  0.00000000000000049 ***
## VolatileAcidity   -0.0987890  0.0175540   -5.63  0.00000001896040200 ***
## FreeSulfurDioxide  0.0003134  0.0000928    3.38              0.00074 ***
## Density           -0.9930115  0.5168318   -1.92              0.05473 .  
## Sulphates         -0.0221099  0.0146409   -1.51              0.13105    
## Alcohol            0.0158474  0.0036907    4.29  0.00001779645443424 ***
## LabelAppeal        0.6558181  0.0165919   39.53 < 0.0000000000000002 ***
## AcidIndex         -0.1643256  0.0116752  -14.07 < 0.0000000000000002 ***
## STARS              0.7355710  0.0161978   45.41 < 0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.15 on 7068 degrees of freedom
## Multiple R-squared:  0.455,  Adjusted R-squared:  0.455 
## F-statistic:  739 on 8 and 7068 DF,  p-value: <0.0000000000000002

3.10.2 Interpretation

coef <- summary(fit.mlr.nozeros)$coefficients[,1]
var.name <- variable.names(fit.mlr.nozeros) 
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef effects
(Intercept) 4.24 69.75
VolatileAcidity -0.10 0.91
FreeSulfurDioxide 0.00 1.00
Density -0.99 0.37
Sulphates -0.02 0.98
Alcohol 0.02 1.02
LabelAppeal 0.66 1.93
AcidIndex -0.16 0.85
STARS 0.74 2.09

3.11.3 Model diagnostics and adjustment

#good-of-fitness test
p_mlr_nozero <- 1-pchisq(deviance(fit.mlr.nozeros), fit.mlr.zeros$df.residual)
cat("p-value:" ,p_nb_nozero,"\n\n")
## p-value: 1
#Check for zero inflation
AIC(fit.mlr.nozeros, fit.nb.zeros_zeroinf)
## Warning in AIC.default(fit.mlr.nozeros, fit.nb.zeros_zeroinf): models are
## not all fitted to the same number of observations
##                      df   AIC
## fit.mlr.nozeros      10 22112
## fit.nb.zeros_zeroinf 31 30761
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.mlr.nozeros))  #library(faraway)

#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor) 
simulationOutput <- simulateResiduals(fittedModel = fit.mlr.nozeros, n = 250)  #library(DHARMa)
plot(simulationOutput)

The GOF test indicates that the Poisson model fit the data (p =1 > 0.05).

# outlier test
outlierTest(fit.mlr.nozeros)
## 
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
##      rstudent unadjusted p-value Bonferonni p
## 4922    -4.38          0.0000119       0.0844

The results show that there is no outlier as judged by Bonferonni p.

#Check for influential and leverage points
influencePlot(fit.mlr.nozeros)

##      StudRes     Hat   CookD
## 369   -4.250 0.00262 0.00525
## 2729   0.918 0.01177 0.00111
## 4922  -4.382 0.00135 0.00288

The above table shows some points with large studentized residuals, hat-values or Cook’s distances from the influential plots. To test if the case, 369, which has the largest Cook’s distances is influential, I remove it from the model and compare the coefficients.

fit.mlr.nozeros_369 <- update(fit.mlr.nozeros,subset=c(-369))
compareCoefs(fit.mlr.nozeros, fit.mlr.nozeros_369 )
## 
## Call:
## 1: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + Density 
##   + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, data = 
##   wine_training_data.nozeros.train)
## 2: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + Density 
##   + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, data = 
##   wine_training_data.nozeros.train, subset = c(-369))
##                       Est. 1       SE 1     Est. 2       SE 2
## (Intercept)        4.2449571  0.5219676  4.2442624  0.5219960
## VolatileAcidity   -0.0987890  0.0175540 -0.0987817  0.0175549
## FreeSulfurDioxide  0.0003134  0.0000928  0.0003135  0.0000928
## Density           -0.9930115  0.5168318 -0.9927579  0.5168585
## Sulphates         -0.0221099  0.0146409 -0.0220984  0.0146416
## Alcohol            0.0158474  0.0036907  0.0158399  0.0036910
## LabelAppeal        0.6558181  0.0165919  0.6556927  0.0165944
## AcidIndex         -0.1643256  0.0116752 -0.1642833  0.0116760
## STARS              0.7355710  0.0161978  0.7356227  0.0161989

From the output table, we can see that coefficients are changed minimally, and the observation 1630 is not influential.

#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.mlr.nozeros,layout = c(3, 4),ask=F)

##                   Test stat Pr(>|t|)
## VolatileAcidity      -0.273    0.785
## FreeSulfurDioxide    -0.852    0.394
## Density              -0.613    0.540
## Sulphates             0.653    0.514
## Alcohol               1.465    0.143
## LabelAppeal           0.610    0.542
## AcidIndex            -2.583    0.010
## STARS                -7.606    0.000
## Tukey test           -1.663    0.096

3.11.4 Evaluation of the model

# goodness of fit: pseudo R squared
fit.mlr.nozeros.null <- lm(TARGET ~ 1, data = wine_training_data.nozeros.train)
(pR2_nozero_3 <- 1 - logLik(fit.mlr.nozeros)[1]/logLik(fit.mlr.nozeros.null)[1])
## [1] 0.163
# Log likelihood
(logLik_nozero_3 <-logLik(fit.mlr.nozeros))
## 'log Lik.' -11046 (df=10)
# AIC
AIC_nozero_3 <- AIC(fit.mlr.nozeros)
BIC_nozero_3 <- BIC(fit.mlr.nozeros)

#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_nozero_3<- cbind(wine_training_data.zeros.test, 
      Mean = predict(fit.mlr.nozeros, newdata=wine_training_data.zeros.test, type="response"), 
      SE = predict(fit.mlr.nozeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
      )

evl_df_nozero_3 <- as.data.frame(aggregate(ndf_nozero_3[, 17], list(ndf_nozero_3$TARGET), mean))
evl_df_nozero_3 <- cbind(evl_df_nozero_3,aggregate(ndf_nozero_3[, 18], list(ndf_nozero_3$TARGET), mean)[,2])
colnames(evl_df_nozero_3) <- c('TARGET','Mean','SE')
evl_df_nozero_3
##   TARGET Mean     SE
## 1      0 2.20 0.0506
## 2      1 1.44 0.0486
## 3      2 2.16 0.0425
## 4      3 2.84 0.0398
## 5      4 3.64 0.0384
## 6      5 4.30 0.0400
## 7      6 4.98 0.0442
## 8      7 5.53 0.0456
## 9      8 5.96 0.0489

4. SELECT MODELS (25 Points)

Specification:

Decide on the criteria for selecting the best count regression model. Will you select models with slightly worse performance if it makes more sense or is more parsimonious? Discuss why you selected your models.

For the count regression model, will you use a metric such as AIC, average squared error, etc.? Be sure to explain how you can make inferences from the model, and discuss other relevant model output. If you like the multiple linear regression model the best, please say why. However, you must select a count regression model for model deployment. Using the training data set, evaluate the performance of the count regression model. Make predictions using the evaluation data set.

Summary of the models we built:

model-1 fit.poisson.imp fit.poisson.imp.aj Poisson GLM: imputation model-2 fit.poisson.imp_zeroinf Zero-inflated Poisson: imputation model-3 fit.qp.imp Quasibinomial GLM: imputation model-4 fit.qp.imp_zeroinf Zero-inflated Poisson: imputation (repeated) model-5 fit.nb.imp fit.nb.imp.aj Negative Binomial GLM:imputation model-6 fit.nb.imp_zeroinf Zero-inflated Negative Binomial: imputation
model-7 fit.lm.imp fit.lm.imp.aj Multiple Linear Regression Model: imputation
model-8 fit.ml.imp Multinomial Logistic Regression Model: imputation
model-9 fit.poisson.zeros fit.poisson.zeros_aj Poisson GLM: NAs replaced by 0 model-10 fit.poisson.zeros_zeroinf Zero-inflated Poisson: NAs replaced by 0 model-11 fit.poisson.nozeros Poisson GLM: NAs Removed model-12 fit.poisson.nozeros_zeroinf Zero-inflated Poisson: NAs Removed model-13 fit.nb.zeros Negative Binomial GLM: NAs replaced by 0 model-14 fit.nb.zeros_zeroinf Zero-inflated Negative Binomial: NAs replaced by 0 model-15 fit.nb.nozeros Negative Binomial GLM: NAs Removed model-16 fit.nb.nozeros_zeroinf Zero-inflated Negative Binomial: NAs Removed model-17 fit.mlr.zeros Multiple Linear Regression: NAs replaced by 0 model-18 fit.mlr.nozeros Multiple Linear Regression: NAs Removed

Some Notes on the Models

  • Though the 5th and 6th models are not GLMs, and thus should be used with normal distributions, we need to remember that the TARGET variable did have a normal distribution before the ZEROS replaced the NAs.

  • There seem to be MORE significant fields in the “with zeros” model than in the “no zeros” model. Will this mean higher accuracy?

The Output of these Models:

calc_sd <- function(fit, data){
  prediction <- predict(fit, newdata=data, type='response')
  difference <- (prediction - mean(data$TARGET))
  difference_squared <- difference * difference
  return (mean(sqrt(difference_squared)))
}

calc_se <- function(fit, data){
  prediction <- predict(fit, newdata=data, type='response')
  difference <- (prediction - data$TARGET)
  difference_squared <- difference * difference
  return (sqrt(mean(difference_squared)))
}

# Function that returns Root Mean Squared Error
rmse <- function(error)
{
    sqrt(mean(error^2))
}
 
# Function that returns Mean Absolute Error
mae <- function(error)
{
    mean(abs(error))
}
##################################################################
# SD Calcs:
##################################################################
sd.poisson.imp.aj <- calc_sd(fit.poisson.imp.aj, test)
sd.poisson.imp_zeroinf <- calc_sd(fit.poisson.imp_zeroinf, test)
sd.fit.qp.imp <- calc_sd(fit.qp.imp, test)
#sd.qp.imp_zeroinf <- calc_sd(fit.qp.imp_zeroinf, test)
sd.nb.imp.aj <- calc_sd(fit.nb.imp.aj, test)
sd.nb.imp_zeroinf <- calc_sd(fit.nb.imp_zeroinf, test)
sd.lm.imp.aj <- calc_sd(fit.poisson.imp, test)
sd.ml.imp <- calc_sd(fit.ml.imp, mldata)

sd.poisson.nozeros <- calc_sd(fit.poisson.nozeros, wine_training_data.nozeros.test)
sd.nb.nozeros <- calc_sd(fit.nb.nozeros, wine_training_data.nozeros.test)
sd.mlr.nozeros <- calc_sd(fit.mlr.nozeros, wine_training_data.nozeros.test)

sd.poisson.zeros <- calc_sd(fit.poisson.zeros, wine_training_data.zeros.test)
sd.nb.zeros <- calc_sd(fit.nb.zeros, wine_training_data.zeros.test)
sd.mlr.zeros <- calc_sd(fit.mlr.zeros, wine_training_data.zeros.test)

sd.poisson.zeros_zeroinf <- calc_sd(fit.poisson.zeros_zeroinf, wine_training_data.zeros.test)
sd.poisson.nozeros_zeroinf <- calc_sd(fit.poisson.nozeros_zeroinf, wine_training_data.zeros.test)
sd.nb.zeros_zeroinf <- calc_sd(fit.nb.zeros_zeroinf , wine_training_data.zeros.test)
sd.nb.nozeros_zeroinf <- calc_sd(fit.nb.nozeros_zeroinf, wine_training_data.zeros.test)

sd.mlr.nozeros <- calc_sd(fit.mlr.nozeros, wine_training_data.zeros.test)

SD <- format(c(sd.poisson.imp.aj,sd.poisson.imp_zeroinf,sd.fit.qp.imp,sd.nb.imp.aj,sd.nb.imp_zeroinf,sd.lm.imp.aj,sd.ml.imp,sd.poisson.zeros,sd.poisson.zeros_zeroinf,sd.poisson.nozeros,sd.poisson.nozeros_zeroinf,sd.nb.zeros,sd.nb.zeros_zeroinf,sd.nb.nozeros,sd.nb.nozeros_zeroinf,sd.mlr.zeros,sd.mlr.nozeros), digits=2, nsmall=2)

##################################################################
# RMSE Calcs:
##################################################################
rmse.poisson.imp.aj <- rmse(fit.poisson.imp.aj$residuals)
rmse.poisson.imp_zeroinf <- rmse(fit.poisson.imp_zeroinf$residuals)
rmse.fit.qp.imp <- rmse(fit.qp.imp$residuals)
#rmse.qp.imp_zeroinf <- rmse(fit.qp.imp_zeroinf$residuals)
rmse.nb.imp.aj <- rmse(fit.nb.imp.aj$residuals)
rmse.nb.imp_zeroinf <- rmse(fit.nb.imp_zeroinf$residuals)
rmse.lm.imp.aj <- rmse(fit.lm.imp$residuals)
rmse.ml.imp <- rmse(fit.ml.imp$residuals)

rmse.poisson.zeros <- rmse(fit.poisson.zeros_aj$residuals)
rmse.poisson.zeros_zeroinf <- rmse(fit.poisson.zeros_zeroinf$residuals)
rmse.poisson.nozeros <- rmse(fit.poisson.nozeros$residuals)
rmse.poisson.nozeros_zeroinf <- rmse(fit.poisson.nozeros_zeroinf$residuals)
rmse.nb.zeros <- rmse(fit.nb.zeros$residuals)
rmse.nb.zeros_zeroinf <- rmse(fit.nb.zeros_zeroinf$residuals)
rmse.nb.nozeros <- rmse(fit.nb.nozeros$residuals)
rmse.nb.nozeros_zeroinf <- rmse(fit.nb.nozeros_zeroinf$residuals)
rmse.mlr.zeros <- rmse(fit.mlr.zeros$residuals)
rmse.mlr.nozeros <- rmse(fit.mlr.nozeros$residuals)

RMSE <- format(c(rmse.poisson.imp.aj,rmse.poisson.imp_zeroinf ,rmse.fit.qp.imp,rmse.nb.imp.aj,rmse.nb.imp.aj,rmse.lm.imp.aj,rmse.ml.imp,                 rmse.poisson.zeros,rmse.poisson.zeros_zeroinf,rmse.poisson.nozeros,rmse.poisson.nozeros_zeroinf,rmse.nb.zeros,rmse.nb.zeros_zeroinf,rmse.nb.nozeros,rmse.nb.nozeros_zeroinf,rmse.mlr.zeros,rmse.mlr.nozeros), digits=2, nsmall=2)

##################################################################
# SE Calcs:
##################################################################
se.poisson.imp.aj <- calc_se(fit.poisson.imp.aj, test)
se.poisson.imp_zeroinf <- calc_se(fit.poisson.imp_zeroinf, test)
se.fit.qp.imp <- calc_se(fit.qp.imp, test)
#se.qp.imp_zeroinf <- calc_se(fit.qp.imp_zeroinf, test)
se.nb.imp.aj <- calc_se(fit.nb.imp.aj, test)
se.nb.imp_zeroinf <- calc_se(fit.nb.imp_zeroinf, test)
se.lm.imp.aj <- calc_se(fit.lm.imp, test)
se.ml.imp <- calc_se(fit.ml.imp, mldata)

se.poisson.nozeros <- calc_se(fit.poisson.nozeros, wine_training_data.nozeros.test)
se.nb.nozeros <- calc_se(fit.nb.nozeros, wine_training_data.nozeros.test)
se.mlr.nozeros <- calc_se(fit.mlr.nozeros, wine_training_data.nozeros.test)

se.poisson.zeros <- calc_se(fit.poisson.zeros, wine_training_data.zeros.test)
se.nb.zeros <- calc_se(fit.nb.zeros, wine_training_data.zeros.test)
se.mlr.zeros <- calc_se(fit.mlr.zeros, wine_training_data.zeros.test)

se.poisson.zeros_zeroinf <- calc_se(fit.poisson.zeros_zeroinf, wine_training_data.zeros.test)
se.poisson.nozeros_zeroinf <- calc_se(fit.poisson.nozeros_zeroinf, wine_training_data.zeros.test)
se.nb.zeros_zeroinf <- calc_se(fit.nb.zeros_zeroinf , wine_training_data.zeros.test)
se.nb.nozeros_zeroinf <- calc_se(fit.nb.nozeros_zeroinf, wine_training_data.zeros.test)

se.mlr.nozeros <- calc_se(fit.mlr.nozeros, wine_training_data.zeros.test)

SE <- format(c(se.poisson.imp.aj,se.poisson.imp_zeroinf,se.fit.qp.imp,se.nb.imp.aj,se.nb.imp_zeroinf,se.lm.imp.aj,sd.ml.imp,se.poisson.zeros,se.poisson.zeros_zeroinf,se.poisson.nozeros,se.poisson.nozeros_zeroinf,se.nb.zeros,se.nb.zeros_zeroinf,se.nb.nozeros,se.nb.nozeros_zeroinf,se.mlr.zeros,se.mlr.nozeros), digits=2, nsmall=2)

##################################################################
# AIC Calcs:
##################################################################
AIC <- format(c(AIC_1,AIC_1z,AIC_2,AIC_3,AIC_3z,AIC_4,AIC_5,AIC_zero_1,AIC_zero_1z,AIC_nozero_1,AIC_nozero_1z,AIC_zero_2,AIC_zero_2z,AIC_nozero_2,AIC_nozero_2z,AIC_zero_3,AIC_nozero_3), digits=2, nsmall=2)

##################################################################
# BIC Calcs:
##################################################################
BIC <- format(c(BIC_1,BIC_1z,BIC_2,BIC_3,BIC_3z,BIC_4,BIC_5,BIC_zero_1,BIC_zero_1z,BIC_nozero_1,BIC_nozero_1z,BIC_zero_2,BIC_zero_2z,BIC_nozero_2,BIC_nozero_2z,BIC_zero_3,BIC_nozero_3), digits=2, nsmall=2)

##################################################################
# MDL Co-efficients:
##################################################################
#all_fits <- c(fit.poisson.nozeros,fit.nb.nozeros,fit.mlr.nozeros,fit.poisson.zeros,fit.nb.zeros,fit.mlr.zeros,fit.nb.zeroinfl)

##################################################################
# LogLik Calcs:
##################################################################
LogLik <- format(c(logLik_1,logLik_1z,logLik_2,logLik_3,logLik_3z,logLik_4,logLik_5,logLik_zero_1,logLik_zero_1z,logLik_nozero_1,logLik_nozero_1z,
logLik_zero_2,logLik_zero_2z,logLik_nozero_2,logLik_nozero_2z,logLik_zero_3,logLik_nozero_3), digits=2, nsmall=2)

##################################################################
# R.squared Calcs:
##################################################################
R.squared <- format(round(c(pR2_1,pR2_1z,pR2_2,pR2_3,pR2_3z,pR2_4,pR2_5,pR2_zero_1,pR2_zero_1z,pR2_nozero_1,pR2_nozero_1z,pR2_zero_2,pR2_zero_2z,pR2_nozero_2,pR2_nozero_2z,pR2_zero_3,pR2_nozero_3),2))

##################################################################
# good-of-fitness:
##################################################################
GOF <- format(round(c(p_poisson_aj,NA,p_qp,p_nb_aj,NA,p_lm_aj ,NA,p_poisson_zero_aj,NA,p_poisson_nozero,NA,p_nb_zero_aj,NA,p_nb_nozero,NA,p_mlr_zero,p_mlr_nozero),4))

##################################################################
# Dispersion Test: 
##################################################################
DispersionTest <- format(c(c_1,NA,c_2,c_3,NA,c_4,NA,c_6,NA,c_7,NA,c_8,NA,c_9,NA,c_10,NA))
Model <- c("Poisson imputation","Zero-inflated Poisson imputation","Quasibinomial imputation","Negative Binomial imputation","Zero-inflated Negative Binomial imputation","Multiple Linear imputation"," Multinomial Logistic imputation",
           "Poisson w/ 0s", "Zero-inflated Poisson w/ 0s","Poisson no 0s", "Zero-inflated Poisson no 0s","Negative Binomial w/0s","Zero-inflated Negative Binomial w/0s","Negative Binomial no 0s",  "Zero-inflated Negative Binomial no 0s", "Multiple Linear Regression w/ 0s", "Multiple Linear Regression no 0s" )
kable(model_summary<- cbind(Model, GOF,DispersionTest,R.squared ,RMSE, SE, SD, AIC, BIC, LogLik))
Model GOF DispersionTest R.squared RMSE SE SD AIC BIC LogLik
Poisson imputation 0.00 1.203 0.33 0.68 1.41 1.15 34761.48 34861.54 -17366.74
Zero-inflated Poisson imputation NA NA 0.14 1.37 1.37 1.16 31001.86 31216.26 -15470.93
Quasibinomial imputation 0.00 1.203 0.33 0.68 1.41 1.15 NA NA NA
Negative Binomial imputation 0.00 1.259 0.33 0.68 1.41 1.15 34763.70 34870.90 -17366.85
Zero-inflated Negative Binomial imputation NA NA 0.14 0.68 1.37 1.16 31003.86 31225.40 -15470.93
Multiple Linear imputation 0.00 1.992 0.47 1.43 1.43 1.02 33107.94 33215.14 -16538.97
Multinomial Logistic imputation NA NA 0.28 0.42 0.13 0.13 33107.94 33215.14 -16538.97
Poisson w/ 0s 0.00 1.078 0.39 0.73 1.40 1.14 34346.54 34432.57 -17161.27
Zero-inflated Poisson w/ 0s NA NA 0.00 1.28 1.25 1.23 30758.54 30973.61 -15349.27
Poisson no 0s 1.00 0.633 0.32 0.38 1.16 0.87 25466.10 25514.15 -12726.05
Zero-inflated Poisson no 0s NA NA 0.09 1.12 1.33 1.47 24547.98 24753.91 -12243.99
Negative Binomial w/0s 0.00 1.078 0.39 0.65 1.40 1.14 34348.83 34442.03 -17161.41
Zero-inflated Negative Binomial w/0s NA NA 0.16 1.28 1.25 1.23 30760.54 30982.78 -15349.27
Negative Binomial no 0s 1.00 0.633 0.32 0.38 1.16 0.87 25468.21 25523.13 -12726.11
Zero-inflated Negative Binomial no 0s NA NA 0.09 1.12 1.33 1.47 24549.98 24762.78 -12243.99
Multiple Linear Regression w/ 0s 0.00 1.760 0.00 1.33 1.32 1.18 32671.79 32757.82 -16323.90
Multiple Linear Regression no 0s 0.91 NA 0.16 1.15 1.38 1.02 22112.27 22180.92 -11046.14
model_id <- format(c("M1", "M1Z","M2","M3","M3Z","M4","M5","M6","M6z","M7","M7z","M8","M8z","M9","M9z","M10","M11"))
model_name<- format(c("Poisson GLM: imputation","Zero-inflated Poisson: imputation","Quasibinomial GLM: imputation","Negative Binomial GLM:imputation","Zero-inflated Negative Binomial: imputation","Multiple Linear Regression Model: imputation"," Multinomial Logistic Regression Model: imputation","Poisson GLM: NAs replaced by 0 ","Zero-inflated Poisson: NAs replaced by 0 ","Poisson GLM: NAs Removed","Zero-inflated Poisson: NAs Removed","Negative Binomial GLM: NAs replaced by 0 ","Zero-inflated Negative Binomial: NAs replaced by 0","Negative Binomial GLM: NAs Removed","Zero-inflated Negative Binomial: NAs Removed","Multiple Linear Regression: NAs replaced by 0","Multiple Linear Regression: NAs Removed"))

kable(model_name <- cbind(model_id,model_name))
model_id model_name
M1 Poisson GLM: imputation
M1Z Zero-inflated Poisson: imputation
M2 Quasibinomial GLM: imputation
M3 Negative Binomial GLM:imputation
M3Z Zero-inflated Negative Binomial: imputation
M4 Multiple Linear Regression Model: imputation
M5 Multinomial Logistic Regression Model: imputation
M6 Poisson GLM: NAs replaced by 0
M6z Zero-inflated Poisson: NAs replaced by 0
M7 Poisson GLM: NAs Removed
M7z Zero-inflated Poisson: NAs Removed
M8 Negative Binomial GLM: NAs replaced by 0
M8z Zero-inflated Negative Binomial: NAs replaced by 0
M9 Negative Binomial GLM: NAs Removed
M9z Zero-inflated Negative Binomial: NAs Removed
M10 Multiple Linear Regression: NAs replaced by 0
M11 Multiple Linear Regression: NAs Removed
evl_df_combine <- cbind(evl_df_1,evl_df_1z[,c(2,3)],evl_df_2[,c(2,3)],evl_df_3[,c(2,3)],evl_df_3z[,c(2,3)],evl_df_4[,c(2,3)],evl_df_5[,2],evl_df_zero_1[,c(2,3)],evl_df_zero_1z[,c(2,3)],evl_df_nozero_1[,c(2,3)],evl_df_nozero_1z[,c(2,3)],evl_df_zero_2[,c(2,3)],evl_df_zero_2z[,c(2,3)],evl_df_nozero_2[,c(2,3)],evl_df_nozero_2z[,c(2,3)],evl_df_zero_3[,c(2,3)],evl_df_nozero_3[,c(2,3)])

evl_df_combine <- cbind(seq(0,8),evl_df_combine)

colnames(evl_df_combine)<-c("ID","TARGET","M1.Mean", "M1.SE","M1Z.Count", "M1Z.Zero","M2.Mean", "M2.SE","M3.Mean", "M3.SE","M3Z.Count","M3Z.Zero","M4.Mean", "M4.SE","M5.pred","M6.Mean","M6.SE","M6z.Count","M6z.Zero","M7.Mean","M7.SE","M7z.Count","M7z.Zero","M8.Mean","M8.SE","M8z.Count","M8z.Zero","M9.Mean","M9.SE","M9z.Count","M9z.Zero","M10.Mean","M10.SE","M11.Mean","M11.SE")


kable(round(evl_df_combine,2),'html') %>% 
  kable_styling(bootstrap_options = c('bordered','hover','condensed',full_width=F))
ID TARGET M1.Mean M1.SE M1Z.Count M1Z.Zero M2.Mean M2.SE M3.Mean M3.SE M3Z.Count M3Z.Zero M4.Mean M4.SE M5.pred M6.Mean M6.SE M6z.Count M6z.Zero M7.Mean M7.SE M7z.Count M7z.Zero M8.Mean M8.SE M8z.Count M8z.Zero M9.Mean M9.SE M9z.Count M9z.Zero M10.Mean M10.SE M11.Mean M11.SE
0 0 1.93 0.04 3.29 0.44 1.93 0.04 1.93 0.04 3.29 0.44 1.91 0.05 0 1.81 0.04 3.16 0.55 2.85 0.05 3.31 0.22 1.57 0.04 3.16 0.55 2.46 0.05 3.31 0.22 1.68 0.05 2.20 0.05
1 1 1.51 0.04 2.28 0.18 1.51 0.04 1.51 0.04 2.28 0.18 1.39 0.05 1 1.66 0.03 2.25 0.25 2.35 0.04 2.53 0.04 1.42 0.04 2.25 0.25 1.99 0.04 2.53 0.04 1.33 0.05 1.44 0.05
2 2 2.07 0.05 2.69 0.18 2.07 0.04 2.07 0.05 2.69 0.18 2.07 0.05 2 2.12 0.04 2.65 0.18 2.63 0.04 2.79 0.05 2.06 0.04 2.65 0.18 2.41 0.04 2.79 0.05 2.07 0.04 2.16 0.04
3 3 2.69 0.05 3.16 0.16 2.69 0.05 2.69 0.05 3.16 0.16 2.71 0.05 3 2.63 0.05 3.15 0.14 3.07 0.05 3.25 0.05 2.75 0.05 3.15 0.14 2.90 0.05 3.25 0.05 2.73 0.04 2.84 0.04
4 4 3.46 0.07 3.83 0.12 3.46 0.06 3.46 0.07 3.83 0.12 3.48 0.05 4 3.40 0.06 3.83 0.08 3.69 0.06 3.87 0.04 3.59 0.06 3.83 0.08 3.59 0.05 3.87 0.04 3.54 0.04 3.64 0.04
5 5 4.16 0.09 4.54 0.08 4.16 0.08 4.16 0.09 4.54 0.08 4.14 0.05 5 4.18 0.07 4.50 0.06 4.35 0.07 4.50 0.03 4.25 0.08 4.50 0.06 4.27 0.07 4.50 0.03 4.20 0.04 4.30 0.04
6 6 4.93 0.11 5.34 0.04 4.93 0.10 4.93 0.11 5.34 0.04 4.80 0.06 6 5.19 0.10 5.32 0.04 5.22 0.09 5.31 0.02 4.86 0.10 5.32 0.04 5.13 0.09 5.31 0.02 4.89 0.05 4.98 0.04
7 7 5.37 0.13 6.09 0.02 5.37 0.13 5.37 0.13 6.09 0.02 5.25 0.06 7 6.11 0.12 6.08 0.05 6.04 0.11 6.04 0.00 5.41 0.12 6.08 0.05 5.94 0.11 6.04 0.00 5.40 0.05 5.53 0.05
8 8 5.24 0.15 6.44 0.01 5.24 0.14 5.24 0.15 6.44 0.01 5.31 0.07 8 6.70 0.16 6.72 0.00 6.56 0.12 6.56 0.00 6.00 0.16 6.72 0.00 6.56 0.12 6.56 0.00 5.77 0.06 5.96 0.05
# Generated palette with rich rainbow and dark (12, s = 0.6, v = 0.75)
mixcolor <- c("#1B9E77", "#666666", "#5e1ea6", "#7570B3", "#A6761D", "#D95F02", "#E6AB02", "#E7298A",  "#000040", "#000093", "#0d4701", "#0076FF", "#0ff4ff",  "#49FB25",  "#FEEA02",  "#FF3300","#eeff24","#ff24ef")
richcolor <- c("#000041", "#FF3300", "#0081FF",  "#80FE1A", "#FDEE02", "#FFAB00" )

ggplot(evl_df_combine,aes(x=ID,y=TARGET))+
  geom_line(aes(y = M1.Mean, colour = "Poisson-imputation"),size = 1)+
  geom_line(aes(y = M1Z.Count, colour = "Zero-inflated Poisson-imputation"),size = 1)+
  geom_line(aes(y = M2.Mean, colour = "Quasibinomial-imputation"),size = 1) + 
  geom_line(aes(y = M3.Mean, colour = "Negative Binomial-imputation"),size = 1) + 
  geom_line(aes(y = M3Z.Count, colour = "Zero-inflated Negative Binomial-imputation"),size = 1)+
  geom_line(aes(y = M4.Mean, colour = "Multiple Linear-imputation"),size = 1) + 
  geom_line(aes(y = M5.pred, colour = "Multinomial Logistic-imputation"),size = 1)+
  
  geom_line(aes(y = M6.Mean, colour = "Poisson w/ 0s"),size = 1) + 
  geom_line(aes(y = M6z.Count, colour = "Zero-inflated Poisson w/ 0s"),size = 1) + 
  geom_line(aes(y = M7.Mean, colour = "Poisson no 0s"),size = 1) + 
  geom_line(aes(y = M7z.Count, colour = "Zero-inflated Poisson no 0s"),size = 1) + 
  geom_line(aes(y = M8.Mean, colour = "Negative Binomial w/0s"),size = 1) + 
  geom_line(aes(y = M8z.Count, colour = "Zero-inflated Negative Binomial w/0s"),size = 1) + 
  geom_line(aes(y = M9.Mean, colour = "Negative Binomial no 0s"),size = 1) +
  geom_line(aes(y = M9z.Count, colour = "Zero-inflated Negative Binomial no 0s"),size = 1) + 
  geom_line(aes(y = M10.Mean, colour = "Multiple Linear Regression w/0s"),size = 1)+ 
  geom_line(aes(y = M11.Mean, colour = "Multiple Linear Regression no 0s"),size = 1)+ 
  
  geom_bar(stat="identity",fill="orange",color="grey",alpha=0.2)+
  theme_bw()+
  scale_color_manual(values = mixcolor)

ggplot(evl_df_combine,aes(x=ID,y=TARGET))+
  geom_line(aes(y = M3.Mean, colour = "Negative Binomial-imputation"),size = 1.2) + 
  geom_line(aes(y = M5.pred, colour = "Multinomial Logistic-imputation"),size = 1.2)+
  geom_line(aes(y = M7.Mean, colour = "Poisson no 0s"),size = 1.2) + 
  geom_line(aes(y = M8.Mean, colour = "Negative Binomial w/0s"),size = 1.2) + 
  geom_line(aes(y = M9.Mean, colour = "Negative Binomial no 0s"),size = 1.5) +

  
  geom_bar(stat="identity",fill="green",color="grey",alpha=0.3)+
  theme_bw()+
  scale_color_manual(values = richcolor)

Before selecting a model, a quick explanation of why the “no ZEROs” models performed better:

One might think that either removing or keeping a value (such as the “Perfect-Graphical-Fit” Zeros-for-NAs) would possibly improve a model’s accuracy, but at least maintain it. In this case, however, we saw a relatively large drop in performance of the models due to the inclusion of this attribute. Why would this be?

One probable explanation is that true customers that actually BOUGHT the product took the time to fill their surveys out accurately. Customers who didn’t purchase the product (with less stake in the game and/or lack of knowledge of the product) simply did not contribute such useful data. Due to this, the Zero-Inflated model is basically gone, and a linear model with a normal distribution again seems reasonable.

We know that Poisson Regression is actually a special case of Negative Binomial Regression (where the mean and the variance are equal), but in this case, the Poisson Regression did not yield more accurate results. We know that if there is overdispersion in the Poisson, then the estimates from the Poisson regression model are consistent but inefficient. It seems from our Box Plots in DATA Exploration that these overdispersions may have occurred.

When comparing models fitted by maximum likelihood to the same data, the smaller the AIC or BIC, the better the fit.

Based on this and the above results table, it seems that the Negative Binomial no 0s is our winner.

###########################################################
#Compare the pattern of missing valus
###########################################################
summary(wine_train)
##      INDEX           TARGET      FixedAcidity   VolatileAcidity
##  Min.   :    1   Min.   :0.00   Min.   :-18.1   Min.   :-2.79  
##  1st Qu.: 4038   1st Qu.:2.00   1st Qu.:  5.2   1st Qu.: 0.13  
##  Median : 8110   Median :3.00   Median :  6.9   Median : 0.28  
##  Mean   : 8070   Mean   :3.03   Mean   :  7.1   Mean   : 0.32  
##  3rd Qu.:12106   3rd Qu.:4.00   3rd Qu.:  9.5   3rd Qu.: 0.64  
##  Max.   :16129   Max.   :8.00   Max.   : 34.4   Max.   : 3.68  
##                                                                
##    CitricAcid    ResidualSugar    Chlorides   FreeSulfurDioxide
##  Min.   :-3.24   Min.   :-128   Min.   :-1    Min.   :-555     
##  1st Qu.: 0.03   1st Qu.:  -2   1st Qu.: 0    1st Qu.:   0     
##  Median : 0.31   Median :   4   Median : 0    Median :  30     
##  Mean   : 0.31   Mean   :   5   Mean   : 0    Mean   :  31     
##  3rd Qu.: 0.58   3rd Qu.:  16   3rd Qu.: 0    3rd Qu.:  70     
##  Max.   : 3.86   Max.   : 141   Max.   : 1    Max.   : 623     
##                  NA's   :616    NA's   :638   NA's   :647      
##  TotalSulfurDioxide    Density            pH        Sulphates   
##  Min.   :-823       Min.   :0.888   Min.   :0     Min.   :-3    
##  1st Qu.:  27       1st Qu.:0.988   1st Qu.:3     1st Qu.: 0    
##  Median : 123       Median :0.994   Median :3     Median : 0    
##  Mean   : 121       Mean   :0.994   Mean   :3     Mean   : 1    
##  3rd Qu.: 208       3rd Qu.:1.001   3rd Qu.:3     3rd Qu.: 1    
##  Max.   :1057       Max.   :1.099   Max.   :6     Max.   : 4    
##  NA's   :682                        NA's   :395   NA's   :1210  
##     Alcohol     LabelAppeal       AcidIndex         STARS     
##  Min.   :-5    Min.   :-2.000   Min.   : 4.00   Min.   :1     
##  1st Qu.: 9    1st Qu.:-1.000   1st Qu.: 7.00   1st Qu.:1     
##  Median :10    Median : 0.000   Median : 8.00   Median :2     
##  Mean   :10    Mean   :-0.009   Mean   : 7.77   Mean   :2     
##  3rd Qu.:12    3rd Qu.: 1.000   3rd Qu.: 8.00   3rd Qu.:3     
##  Max.   :26    Max.   : 2.000   Max.   :17.00   Max.   :4     
##  NA's   :653                                    NA's   :3359
summary(wine_test)
##        IN         TARGET         FixedAcidity   VolatileAcidity
##  Min.   :    3   Mode:logical   Min.   :-18.2   Min.   :-2.83  
##  1st Qu.: 4018   NA's:3335      1st Qu.:  5.2   1st Qu.: 0.08  
##  Median : 7906                  Median :  6.9   Median : 0.28  
##  Mean   : 8048                  Mean   :  6.9   Mean   : 0.31  
##  3rd Qu.:12061                  3rd Qu.:  9.0   3rd Qu.: 0.63  
##  Max.   :16130                  Max.   : 33.5   Max.   : 3.61  
##                                                                
##    CitricAcid    ResidualSugar      Chlorides    FreeSulfurDioxide
##  Min.   :-3.12   Min.   :-128.3   Min.   :-1.1   Min.   :-563     
##  1st Qu.: 0.00   1st Qu.:  -2.6   1st Qu.: 0.0   1st Qu.:   3     
##  Median : 0.31   Median :   3.6   Median : 0.0   Median :  30     
##  Mean   : 0.31   Mean   :   5.3   Mean   : 0.1   Mean   :  35     
##  3rd Qu.: 0.60   3rd Qu.:  17.2   3rd Qu.: 0.2   3rd Qu.:  79     
##  Max.   : 3.76   Max.   : 145.4   Max.   : 1.3   Max.   : 617     
##                  NA's   :168      NA's   :138    NA's   :152      
##  TotalSulfurDioxide    Density            pH        Sulphates   
##  Min.   :-769       Min.   :0.890   Min.   :0.6   Min.   :-3.1  
##  1st Qu.:  27       1st Qu.:0.988   1st Qu.:3.0   1st Qu.: 0.3  
##  Median : 124       Median :0.995   Median :3.2   Median : 0.5  
##  Mean   : 123       Mean   :0.995   Mean   :3.2   Mean   : 0.5  
##  3rd Qu.: 210       3rd Qu.:1.001   3rd Qu.:3.5   3rd Qu.: 0.8  
##  Max.   :1004       Max.   :1.100   Max.   :6.2   Max.   : 4.2  
##  NA's   :157                        NA's   :104   NA's   :310   
##     Alcohol      LabelAppeal       AcidIndex         STARS    
##  Min.   :-4.2   Min.   :-2.000   Min.   : 5.00   Min.   :1    
##  1st Qu.: 9.0   1st Qu.:-1.000   1st Qu.: 7.00   1st Qu.:1    
##  Median :10.4   Median : 0.000   Median : 8.00   Median :2    
##  Mean   :10.6   Mean   : 0.013   Mean   : 7.75   Mean   :2    
##  3rd Qu.:12.5   3rd Qu.: 1.000   3rd Qu.: 8.00   3rd Qu.:3    
##  Max.   :25.6   Max.   : 2.000   Max.   :17.00   Max.   :4    
##  NA's   :185                                     NA's   :841
#Missingness Map
miss_plot <- plot_missing(wine_train)

miss_plot <- plot_missing(wine_test[,-2])

wine_test$ResidualSugar[is.na(wine_test$ResidualSugar)] <- sample(wine_test$ResidualSugar[!is.na(wine_test$ResidualSugar)])
## Warning in wine_test$ResidualSugar[is.na(wine_test$ResidualSugar)] <-
## sample(wine_test$ResidualSugar[!is.na(wine_test$ResidualSugar)]): number of
## items to replace is not a multiple of replacement length
wine_test$Chlorides[is.na(wine_test$Chlorides)] <- sample(wine_test$Chlorides[!is.na(wine_test$Chlorides)])
## Warning in wine_test$Chlorides[is.na(wine_test$Chlorides)] <-
## sample(wine_test$Chlorides[!is.na(wine_test$Chlorides)]): number of items
## to replace is not a multiple of replacement length
wine_test$FreeSulfurDioxide[is.na(wine_test$FreeSulfurDioxide)] <- sample(wine_test$FreeSulfurDioxide[!is.na(wine_test$FreeSulfurDioxide)])
## Warning in wine_test$FreeSulfurDioxide[is.na(wine_test
## $FreeSulfurDioxide)] <- sample(wine_test$FreeSulfurDioxide[!is.na(wine_test
## $FreeSulfurDioxide)]): number of items to replace is not a multiple of
## replacement length
wine_test$TotalSulfurDioxide[is.na(wine_test$TotalSulfurDioxide)] <- sample(wine_test$TotalSulfurDioxide[!is.na(wine_test$TotalSulfurDioxide)])
## Warning in wine_test$TotalSulfurDioxide[is.na(wine_test
## $TotalSulfurDioxide)] <- sample(wine_test$TotalSulfurDioxide[!
## is.na(wine_test$TotalSulfurDioxide)]): number of items to replace is not a
## multiple of replacement length
wine_test$pH[is.na(wine_test$pH)] <- sample(wine_test$pH[!is.na(wine_test$pH)])
## Warning in wine_test$pH[is.na(wine_test$pH)] <- sample(wine_test$pH[!
## is.na(wine_test$pH)]): number of items to replace is not a multiple of
## replacement length
wine_test$Sulphates[is.na(wine_test$Sulphates)] <- sample(wine_test$Sulphates[!is.na(wine_test$Sulphates)])
## Warning in wine_test$Sulphates[is.na(wine_test$Sulphates)] <-
## sample(wine_test$Sulphates[!is.na(wine_test$Sulphates)]): number of items
## to replace is not a multiple of replacement length
wine_test$Alcohol[is.na(wine_test$Alcohol)] <- sample(wine_test$Alcohol[!is.na(wine_test$Alcohol)])
## Warning in wine_test$Alcohol[is.na(wine_test$Alcohol)] <- sample(wine_test
## $Alcohol[!is.na(wine_test$Alcohol)]): number of items to replace is not a
## multiple of replacement length
wine_test$STARS[is.na(wine_test$STARS)] <- 0
wine_test.zeros <- wine_test
wine_test.nozeros <- wine_test[wine_test$STARS != 0,]

#Missingness Map
miss_plot <- plot_missing(wine_test[,-2])

 pred_evalation <- predict(fit.nb.nozeros, newdata=wine_test, type="response")

pred_evalation_df <- cbind(wine_test$IN, pred_evalation,wine_test[,-c(1,2)])
colnames(pred_evalation_df)[colnames(pred_evalation_df)=='wine_test$IN']<- 'IN'

#write.csv(pred_evalation_df,"predicted_evalation.csv")

kable(head(pred_evalation_df,10))
IN pred_evalation FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol LabelAppeal AcidIndex STARS
3 2.23 5.4 -0.860 0.27 -10.7 0.092 23 398 0.985 5.02 0.64 12.30 -1 6 0
9 3.81 12.4 0.385 -0.76 -19.7 1.169 -37 68 0.990 3.37 1.09 16.00 0 6 2
10 2.68 7.2 1.750 0.17 -33.0 0.065 9 76 1.046 4.61 0.68 8.55 0 8 1
18 2.40 6.2 0.100 1.80 1.0 -0.179 104 89 0.989 3.20 2.11 12.30 -1 8 1
21 2.08 11.4 0.210 0.28 1.2 0.038 70 53 1.029 2.54 -0.07 4.80 0 10 0
30 5.88 17.6 0.040 -1.15 1.4 0.535 -250 140 0.950 3.06 -0.02 11.40 1 8 4
31 3.34 15.5 0.530 -0.53 4.6 1.263 10 17 1.000 3.07 0.75 8.50 0 12 3
37 2.90 15.9 1.190 1.14 31.9 -0.299 115 381 1.034 2.99 0.31 11.40 1 7 0
39 1.84 11.6 0.320 0.55 -50.9 0.076 35 83 1.000 3.32 2.18 -0.50 0 12 0
47 2.46 3.8 0.220 0.31 -7.7 0.039 40 129 0.906 4.72 -0.64 10.90 0 7 0
kable(head(pred_evalation_df,10),'html') %>% 
  kable_styling(bootstrap_options = c('bordered','hover','condensed',full_width=F))
IN pred_evalation FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol LabelAppeal AcidIndex STARS
3 2.23 5.4 -0.860 0.27 -10.7 0.092 23 398 0.985 5.02 0.64 12.30 -1 6 0
9 3.81 12.4 0.385 -0.76 -19.7 1.169 -37 68 0.990 3.37 1.09 16.00 0 6 2
10 2.68 7.2 1.750 0.17 -33.0 0.065 9 76 1.046 4.61 0.68 8.55 0 8 1
18 2.40 6.2 0.100 1.80 1.0 -0.179 104 89 0.989 3.20 2.11 12.30 -1 8 1
21 2.08 11.4 0.210 0.28 1.2 0.038 70 53 1.029 2.54 -0.07 4.80 0 10 0
30 5.88 17.6 0.040 -1.15 1.4 0.535 -250 140 0.950 3.06 -0.02 11.40 1 8 4
31 3.34 15.5 0.530 -0.53 4.6 1.263 10 17 1.000 3.07 0.75 8.50 0 12 3
37 2.90 15.9 1.190 1.14 31.9 -0.299 115 381 1.034 2.99 0.31 11.40 1 7 0
39 1.84 11.6 0.320 0.55 -50.9 0.076 35 83 1.000 3.32 2.18 -0.50 0 12 0
47 2.46 3.8 0.220 0.31 -7.7 0.039 40 129 0.906 4.72 -0.64 10.90 0 7 0