suppressMessages(suppressWarnings(library(knitr)))
suppressMessages(suppressWarnings(library(ggplot2)))
suppressMessages(suppressWarnings(library(kableExtra)))
suppressMessages(suppressWarnings(library(formattable)))
suppressMessages(suppressWarnings(library(dplyr)))
suppressMessages(suppressWarnings(library(stringr)))
#suppressMessages(suppressWarnings(library(alr3)))
suppressMessages(suppressWarnings(library(car))) #for residualPlots and mmps
suppressMessages(suppressWarnings(library(caret)))
suppressMessages(suppressWarnings(library(pROC)))
suppressMessages(suppressWarnings(library(rms))) # rms ols lrm
suppressMessages(suppressWarnings(library(AER)))
suppressMessages(suppressWarnings(library(pscl)))
suppressMessages(suppressWarnings(library(DHARMa)))
suppressMessages(suppressWarnings(library(faraway)))
suppressMessages(suppressWarnings(library(MASS)))
suppressMessages(suppressWarnings(library(broom)))
DO_PERFORM_STEPS <- TRUE
suppressMessages(suppressWarnings(library(plotly)))
suppressMessages(suppressWarnings(library(sandwich)))
suppressMessages(suppressWarnings(library(ROCR)))
suppressMessages(suppressWarnings(library(boot)))
suppressMessages(suppressWarnings(library(gridExtra)))
suppressMessages(suppressWarnings(library(Amelia)))
suppressMessages(suppressWarnings(library(plyr)))
suppressMessages(suppressWarnings(library(psych)))
wine_description <- read.csv("https://raw.githubusercontent.com/YunMai-SPS/DATA621_homework/master/data621_assignment5/wine_data_description.csv")
kable(wine_description,'html')%>%
kable_styling(bootstrap_options = 'bordered', full_width = T, font_size = 21) %>%
column_spec(1, width = "3em",color='black') %>%
column_spec(2, width = "30em",color='black') %>%
column_spec(3, width = "30em",color='black')
VARIABLE.NAME | DEFINITION | THEORETICAL.EFFECT |
---|---|---|
INDEX | Identification Variable (do not use) | None |
TARGET | Number of Cases Purchased | None |
AcidIndex | Proprietary method of testing total acidity of wine by using a weighted average | |
Alcohol | Alcohol Content | |
Chlorides | Chloride content of wine | |
CitricAcid | Citric Acid Content | |
Density | Density of Wine | |
FixedAcidity | Fixed Acidity of Wine | |
FreeSulfurDioxide | Sulfur Dioxide content of wine | |
LabelAppeal | Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customers don’t like the design. | Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales. |
ResidualSugar | Residual Sugar of wine | |
STARS | Wine rating by a team of experts. 4 Stars = Excellent, 1 Star = Poor | A high number of stars suggests high sales |
Sulphates | Sulfate content of wine | |
TotalSulfurDioxide | Total Sulfur Dioxide of Wine | |
VolatileAcidity | Volatile Acid content of wine | |
pH | pH of wine | |
Specification:
Describe the size and the variables in the wine training data set. Consider that too much detail will cause a manager to lose interest, while too little detail will make the manager consider that you aren’t doing your job. Some suggestions are given below. Please do NOT treat this as a check list of things to do to complete the assignment. You should have your own thoughts on what to tell the boss. These are just ideas.
a. Mean / Standard Deviation / Median
b. Bar Chart or Box Plot of the data
c. Is the data correlated to the target variable (or to other variables?)
d. Are any of the variables missing and need to be imputed “fixed”?
wine_train <- read.csv('https://raw.githubusercontent.com/YunMai-SPS/DATA621_homework/master/data621_assignment5/wine-training-data.csv')
colnames(wine_train)[1] <- 'INDEX'
wine_test <- read.csv('https://raw.githubusercontent.com/YunMai-SPS/DATA621_homework/master/data621_assignment5/wine-evaluation-data.csv')
kable(head(wine_train,10),'html') %>%
kable_styling(bootstrap_options = c('bordered','hover','condensed'), full_width = F)
INDEX | TARGET | FixedAcidity | VolatileAcidity | CitricAcid | ResidualSugar | Chlorides | FreeSulfurDioxide | TotalSulfurDioxide | Density | pH | Sulphates | Alcohol | LabelAppeal | AcidIndex | STARS |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
1 | 3 | 3.2 | 1.160 | -0.98 | 54.20 | -0.567 | NA | 268 | 0.99280 | 3.33 | -0.59 | 9.9 | 0 | 8 | 2 |
2 | 3 | 4.5 | 0.160 | -0.81 | 26.10 | -0.425 | 15 | -327 | 1.02792 | 3.38 | 0.70 | NA | -1 | 7 | 3 |
4 | 5 | 7.1 | 2.640 | -0.88 | 14.80 | 0.037 | 214 | 142 | 0.99518 | 3.12 | 0.48 | 22.0 | -1 | 8 | 3 |
5 | 3 | 5.7 | 0.385 | 0.04 | 18.80 | -0.425 | 22 | 115 | 0.99640 | 2.24 | 1.83 | 6.2 | -1 | 6 | 1 |
6 | 4 | 8.0 | 0.330 | -1.26 | 9.40 | NA | -167 | 108 | 0.99457 | 3.12 | 1.77 | 13.7 | 0 | 9 | 2 |
7 | 0 | 11.3 | 0.320 | 0.59 | 2.20 | 0.556 | -37 | 15 | 0.99940 | 3.20 | 1.29 | 15.4 | 0 | 11 | NA |
8 | 0 | 7.7 | 0.290 | -0.40 | 21.50 | 0.060 | 287 | 156 | 0.99572 | 3.49 | 1.21 | 10.3 | 0 | 8 | NA |
11 | 4 | 6.5 | -1.220 | 0.34 | 1.40 | 0.040 | 523 | 551 | 1.03236 | 3.20 | NA | 11.6 | 1 | 7 | 3 |
12 | 3 | 14.8 | 0.270 | 1.05 | 11.25 | -0.007 | -213 | NA | 0.99620 | 4.93 | 0.26 | 15.0 | 0 | 6 | NA |
13 | 6 | 5.5 | -0.220 | 0.39 | 1.80 | -0.277 | 62 | 180 | 0.94724 | 3.09 | 0.75 | 12.6 | 0 | 8 | 4 |
wine_training_data <- read.csv('https://raw.githubusercontent.com/chirag-vithlani/Business_Analytics_and_Data_Mining_DATA_621/master/Homework5/data/wine-training-data.csv', stringsAsFactors = FALSE)
intro <- read.csv('https://raw.githubusercontent.com/chirag-vithlani/Business_Analytics_and_Data_Mining_DATA_621/master/Homework5/data/wine_data_intro.csv', stringsAsFactors = FALSE)
names(intro)[2]<-"Description"
kable(intro, "html") %>%
kable_styling(bootstrap_options = "striped", full_width = F)
Variable | Description |
---|---|
Index | Identification Variable (do not use) |
Target | Number of Cases Purchased |
AcidIndex | Proprietary method of testing total acidity of wine by using a weighted average |
Alcohol | Alcohol Content |
Chlorides | Chloride content of wine |
CitricAcid | Citric Acid Content |
Density | Density of Wine |
FixedAcidity | Fixed Acidity of Wine |
FreeSulfurDioxide | Sulfur Dioxide content of wine |
LabelAppeal | Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customers don’t like the design. Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales. |
ResidualSugar | Residual Sugar of wine |
STARS | Wine rating by a team of experts. 4 Stars = Excellent. 1 Star = Poor. A high number of stars suggests high sales |
Sulphates | Sulfate content of wine |
TotalSulfurDioxide | Total Sulfur Dioxide of Wine |
VolatileAcidity | Volatile Acid content of wine |
pH | pH of wine |
do_factors <- function(wine_object)
{
wine_object <- within(wine_object, {
LabelAppeal <- factor(LabelAppeal)
AcidIndex <- factor(AcidIndex)
STARS <- factor(STARS)
})
return (wine_object)
}
colnames(wine_training_data)[1]<-"INDEX"
The objective is to analyze the chemical properties of the wines in order to maximize sales. The independent variables are the wines’ chemical properties together with the label appeal and star rating; the dependent variable is the number of sample cases purchased. The more sample cases purchased, the more of that wine is sold.
There are 12,795 rows.
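As a quick sanity check of the size (a minimal sketch; wine_train is the object read in above):
# confirm the dimensions of the training set
dim(wine_train)   # 12795 rows, 16 columns (INDEX, TARGET and 14 predictors)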
summary(wine_train)
## INDEX TARGET FixedAcidity VolatileAcidity
## Min. : 1 Min. :0.000 Min. :-18.100 Min. :-2.7900
## 1st Qu.: 4038 1st Qu.:2.000 1st Qu.: 5.200 1st Qu.: 0.1300
## Median : 8110 Median :3.000 Median : 6.900 Median : 0.2800
## Mean : 8070 Mean :3.029 Mean : 7.076 Mean : 0.3241
## 3rd Qu.:12106 3rd Qu.:4.000 3rd Qu.: 9.500 3rd Qu.: 0.6400
## Max. :16129 Max. :8.000 Max. : 34.400 Max. : 3.6800
##
## CitricAcid ResidualSugar Chlorides FreeSulfurDioxide
## Min. :-3.2400 Min. :-127.800 Min. :-1.1710 Min. :-555.00
## 1st Qu.: 0.0300 1st Qu.: -2.000 1st Qu.:-0.0310 1st Qu.: 0.00
## Median : 0.3100 Median : 3.900 Median : 0.0460 Median : 30.00
## Mean : 0.3084 Mean : 5.419 Mean : 0.0548 Mean : 30.85
## 3rd Qu.: 0.5800 3rd Qu.: 15.900 3rd Qu.: 0.1530 3rd Qu.: 70.00
## Max. : 3.8600 Max. : 141.150 Max. : 1.3510 Max. : 623.00
## NA's :616 NA's :638 NA's :647
## TotalSulfurDioxide Density pH Sulphates
## Min. :-823.0 Min. :0.8881 Min. :0.480 Min. :-3.1300
## 1st Qu.: 27.0 1st Qu.:0.9877 1st Qu.:2.960 1st Qu.: 0.2800
## Median : 123.0 Median :0.9945 Median :3.200 Median : 0.5000
## Mean : 120.7 Mean :0.9942 Mean :3.208 Mean : 0.5271
## 3rd Qu.: 208.0 3rd Qu.:1.0005 3rd Qu.:3.470 3rd Qu.: 0.8600
## Max. :1057.0 Max. :1.0992 Max. :6.130 Max. : 4.2400
## NA's :682 NA's :395 NA's :1210
## Alcohol LabelAppeal AcidIndex STARS
## Min. :-4.70 Min. :-2.000000 Min. : 4.000 Min. :1.000
## 1st Qu.: 9.00 1st Qu.:-1.000000 1st Qu.: 7.000 1st Qu.:1.000
## Median :10.40 Median : 0.000000 Median : 8.000 Median :2.000
## Mean :10.49 Mean :-0.009066 Mean : 7.773 Mean :2.042
## 3rd Qu.:12.40 3rd Qu.: 1.000000 3rd Qu.: 8.000 3rd Qu.:3.000
## Max. :26.50 Max. : 2.000000 Max. :17.000 Max. :4.000
## NA's :653 NA's :3359
A few things to note from the summary: most of the chemistry variables take negative as well as positive values, several variables contain NA’s, and STARS has by far the most missing values (3,359). The distribution of TARGET is:
table(wine_train$TARGET)
##
## 0 1 2 3 4 5 6 7 8
## 2734 244 1091 2611 3177 2014 765 142 17
Statistics of the numerical data
suppressMessages(suppressWarnings(library(pastecs)))
options(scipen = 100)
options(digits = 3)
# numeric variables
num_var <- wine_train[-which(names(wine_train) %in% c('INDEX','TARGET','LabelAppeal','AcidIndex','STARS'))]
# categorical variables
cat_var <- wine_train[,-1]
cat_var <- cat_var[,-which(names(cat_var)%in% names(num_var))]
**Histogram and boxplot of numeric independent variables:**
par(mfrow=c(4,4),oma=c(1,1,0,0), mar=c(2,1,1,0)+.1, tcl=-0.01, mgp=c(5,1,0),pin=c(1.4,0.4))
for(i in 1:length(num_var)){
hist(num_var[,i], probability = T, xlab = '', main = colnames(num_var)[i])
d <- density(num_var[,i],na.rm= T)
lines(d,col = 'red')
}
par(mfrow=c(1,4),oma=c(1,1,0,0), mar=c(2,1,1,0)+.1, tcl=-0.01, mgp=c(5,1,0),pin=c(1.4,0.4))
for(i in 1:length(cat_var)){
hist(cat_var[,i],xlab = '', main = colnames(cat_var)[i])
}
par(mfrow=c(3,5),oma=c(1,1,1,1), mar=c(1, 3, 1.5, 2)+.1, tcl=-0.01, mgp=c(5,1,0))
for(i in 2:length(wine_train)){
boxplot(wine_train[,i],main = colnames(wine_train)[i])
}
A basic analysis of the distributions follows.
**Categorical independent variables:**
cat_var <- as.data.frame(lapply(cat_var,as.factor))
# conditional densities plots
par(mfrow=c(1,3),mar=c(5,2.0,1,3.0))
for(i in 2:length(cat_var)){
par(las=2,mgp=c(8,1,0.5)) #mgp=c(10,1.5,1), mar=c(12,10,1,10),
cdplot(cat_var$TARGET ~ cat_var[,i], data=cat_var ,xlab=NULL, ylab="TARGET")
title(names(cat_var)[i], adj = 0.5, line = 0.3, cex.main=0.9)
}
1.3.1. The missing data for variables:
pMiss <- function(x){sum(is.na(x))/length(x)*100}
miss_feature <- as.data.frame(apply(wine_train[,-1],2,pMiss))
miss_feature_df <- miss_feature
miss_feature_df$predictors <- as.numeric(t(stat.desc(wine_train[,-1])[3,]))
miss_feature_df$missing_values <- rownames(miss_feature)
miss_feature_df <- miss_feature_df[,3:1]
colnames(miss_feature_df)<- c('variable_name','missing_values', 'missing_values(%)')
#kable(miss_feature_df)
kable(filter(miss_feature_df, miss_feature_df[,'missing_values(%)']!=0 ), "html") %>%
kable_styling(bootstrap_options = c("striped", "bordered", "hover", "condensed"),full_width = F)
variable_name | missing_values | missing_values(%) |
---|---|---|
ResidualSugar | 616 | 4.81 |
Chlorides | 638 | 4.99 |
FreeSulfurDioxide | 647 | 5.06 |
TotalSulfurDioxide | 682 | 5.33 |
pH | 395 | 3.09 |
Sulphates | 1210 | 9.46 |
Alcohol | 653 | 5.10 |
STARS | 3359 | 26.25 |
#cat("The variables missed more than 5% datapoints are:",paste(rownames(miss_feature)[which(miss_feature > 5)],collapse = ',')) cat("\n")
(filter(miss_feature_df, miss_feature_df[,'missing_values(%)']!=0 )[,1])
## [1] "ResidualSugar" "Chlorides" "FreeSulfurDioxide"
## [4] "TotalSulfurDioxide" "pH" "Sulphates"
## [7] "Alcohol" "STARS"
The variables missing more than 5% of their data points are: FreeSulfurDioxide, TotalSulfurDioxide, Sulphates, Alcohol, STARS.
STARS is a concern because 26% of its values are missing. The missing data will be imputed.
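Before imputing, one cheap transformation the specification suggests is a missing-value flag. A minimal sketch (wine_flags and STARS_missing are hypothetical names, not used in the models below):
# sketch: flag whether STARS was observed, on a copy of the data
wine_flags <- transform(wine_train, STARS_missing = as.integer(is.na(STARS)))
table(wine_flags$STARS_missing)   # should show 3359 flagged cases, matching the NA count above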
1.3.2. The missing data for cases:
row_miss <- data.frame(apply(wine_train, 1, function(x) sum(is.na(x))))
colnames(row_miss) <- 'NAs'
row_miss$NA_percent <- row_miss$NAs/dim(wine_train)[2]
#kable(row_miss[-which(row_miss==0),])
#kable(row_miss[which(row_miss$NA_percent>0.05),])
table(row_miss[which(row_miss$NA_percent>0.05),])
## NA_percent
## NAs 0.0625 0.125 0.1875 0.25
## 1 4780 0 0 0
## 2 0 1342 0 0
## 3 0 0 212 0
## 4 0 0 0 25
There are 4,780 cases missing 6.25% of their values, 1,342 missing 12.5%, 212 missing 18.75%, and 25 missing 25%.
To preserve the information in the non-missing data, we will impute the missing values rather than drop these cases.
1.3.3. Explore NA’s through Missingness Map
The table below summarizes the NA values in the data. Only STARS had an NA frequency higher than 10%, which is a concern. All other NA values were replaced by sampling from the observed (non-missing) values of the same variable, while STARS required further analysis.
not_na_count <- sapply(wine_training_data, function(y) sum(length(which(!is.na(y)))))
na_count <- sapply(wine_training_data, function(y) sum(length(which(is.na(y)))))
na_pct <- na_count / (na_count + not_na_count)
na_summary_df <- data.frame(not_na_count,na_count,na_pct)
#Missingness Map
missmap(wine_training_data, main = "Missing Values Before Replacement" ,col=c("gray90", "black"),legend = F)
kable(na_summary_df)
not_na_count | na_count | na_pct | |
---|---|---|---|
INDEX | 12795 | 0 | 0.000 |
TARGET | 12795 | 0 | 0.000 |
FixedAcidity | 12795 | 0 | 0.000 |
VolatileAcidity | 12795 | 0 | 0.000 |
CitricAcid | 12795 | 0 | 0.000 |
ResidualSugar | 12179 | 616 | 0.048 |
Chlorides | 12157 | 638 | 0.050 |
FreeSulfurDioxide | 12148 | 647 | 0.051 |
TotalSulfurDioxide | 12113 | 682 | 0.053 |
Density | 12795 | 0 | 0.000 |
pH | 12400 | 395 | 0.031 |
Sulphates | 11585 | 1210 | 0.095 |
Alcohol | 12142 | 653 | 0.051 |
LabelAppeal | 12795 | 0 | 0.000 |
AcidIndex | 12795 | 0 | 0.000 |
STARS | 9436 | 3359 | 0.263 |
# replace NAs by sampling (without replacement) from each variable's observed values
fill_cols <- c("ResidualSugar", "Chlorides", "FreeSulfurDioxide",
               "TotalSulfurDioxide", "pH", "Sulphates", "Alcohol")
for (col in fill_cols) {
  miss <- is.na(wine_training_data[[col]])
  wine_training_data[[col]][miss] <- sample(wine_training_data[[col]][!miss], sum(miss))
}
#Missingness Map
missmap(wine_training_data, main = "Missing Values After Replacement (Except STARS variable) " ,col=c("gray90", "black"),legend = F)
Specification:
Describe how you have transformed the data by changing the original variables or creating new variables. If you did transform the data or create new variables, discuss why you did this. Here are some possible transformations.
Fix missing values (maybe with a Mean or Median value)
Create flags to suggest if a variable was missing
Transform data by putting it into buckets
Mathematical transforms such as log or square root (or use Box-Cox)
Combine variables (such as ratios or adding or multiplying) to create new variables
Before deciding how to impute the data, we will need to examine the missing data patterns. A visualization of the missing values in the variables is shown below:
suppressMessages(suppressWarnings(library (rpart)))
suppressMessages(suppressWarnings(library (Hmisc)))
na.patterns <- naclus(wine_train)
naplot ( na.patterns , 'na per var')
#plot ( who.na , margin = .1 )
#text ( who.na,use.n=TRUE, all=TRUE, cex=.7 ) #too many text
plot ( na.patterns )
The upper panel shows the fraction of observations missing on each predictor. The lower panel depicts a hierarchical cluster analysis of missingness combinations; the similarity measure on the Y-axis is the fraction of observations for which both variables are missing.
Next, we test whether the missingness is completely at random (MCAR) using the MissMech package:
suppressMessages(suppressWarnings(library(MissMech)))
out <- TestMCARNormality(data = wine_train[,c("ResidualSugar","Chlorides", "FreeSulfurDioxide","TotalSulfurDioxide","pH","Sulphates","Alcohol","STARS" )])
out
## Call:
## TestMCARNormality(data = wine_train[, c("ResidualSugar", "Chlorides",
## "FreeSulfurDioxide", "TotalSulfurDioxide", "pH", "Sulphates",
## "Alcohol", "STARS")])
##
## Number of Patterns: 49
##
## Total number of cases used in the analysis: 12698
##
## Pattern(s) used:
## ResidualSugar Chlorides FreeSulfurDioxide
## group.1 1 1 NA
## group.2 1 1 1
## group.3 1 1 1
## group.4 1 NA 1
## group.5 1 1 1
## group.6 1 1 1
## group.7 1 1 1
## group.8 1 1 1
## group.9 1 1 1
## group.10 NA 1 1
## group.11 1 1 1
## group.12 NA 1 1
## group.13 1 1 1
## group.14 NA 1 1
## group.15 NA NA 1
## group.16 1 1 1
## group.17 1 1 NA
## group.18 1 1 1
## group.19 NA 1 1
## group.20 1 1 1
## group.21 1 1 NA
## group.22 NA 1 1
## group.23 NA 1 1
## group.24 1 1 NA
## group.25 1 NA 1
## group.26 1 1 1
## group.27 1 1 NA
## group.28 1 1 NA
## group.29 NA 1 1
## group.30 1 1 1
## group.31 1 1 1
## group.32 1 1 1
## group.33 1 NA 1
## group.34 1 1 1
## group.35 1 NA NA
## group.36 1 NA 1
## group.37 1 NA 1
## group.38 1 1 1
## group.39 NA NA 1
## group.40 1 1 1
## group.41 NA 1 NA
## group.42 1 NA 1
## group.43 1 1 NA
## group.44 1 1 1
## group.45 1 1 NA
## group.46 1 NA 1
## group.47 NA 1 1
## group.48 1 1 NA
## group.49 1 NA NA
## TotalSulfurDioxide pH Sulphates Alcohol STARS
## group.1 1 1 1 1 1
## group.2 1 1 1 NA 1
## group.3 1 1 1 1 1
## group.4 1 1 1 1 1
## group.5 1 1 1 1 NA
## group.6 1 1 NA 1 1
## group.7 NA 1 1 1 NA
## group.8 NA 1 1 1 1
## group.9 1 1 NA 1 NA
## group.10 1 1 1 1 1
## group.11 1 NA 1 1 1
## group.12 1 1 1 NA 1
## group.13 1 NA 1 1 NA
## group.14 NA 1 1 1 1
## group.15 1 1 1 1 1
## group.16 1 1 1 NA NA
## group.17 1 1 NA 1 NA
## group.18 1 1 NA NA 1
## group.19 1 1 1 1 NA
## group.20 1 NA NA 1 NA
## group.21 1 1 1 1 NA
## group.22 1 1 NA 1 1
## group.23 1 1 NA 1 NA
## group.24 1 NA 1 1 1
## group.25 1 1 1 1 NA
## group.26 NA 1 NA 1 1
## group.27 NA 1 1 1 NA
## group.28 1 1 NA 1 1
## group.29 1 1 1 NA NA
## group.30 NA NA 1 1 1
## group.31 1 NA 1 NA 1
## group.32 NA 1 NA 1 NA
## group.33 NA 1 1 1 1
## group.34 1 NA NA 1 1
## group.35 1 1 1 1 1
## group.36 1 1 1 NA 1
## group.37 1 1 NA 1 1
## group.38 NA 1 1 NA 1
## group.39 1 1 1 1 NA
## group.40 NA 1 1 NA NA
## group.41 1 1 1 1 1
## group.42 1 1 NA 1 NA
## group.43 NA 1 1 1 1
## group.44 1 1 NA NA NA
## group.45 1 1 1 NA 1
## group.46 1 1 1 NA NA
## group.47 1 NA 1 1 1
## group.48 1 1 1 NA NA
## group.49 1 1 1 1 NA
## Number of cases
## group.1 338
## group.2 335
## group.3 6436
## group.4 350
## group.5 2239
## group.6 669
## group.7 123
## group.8 341
## group.9 247
## group.10 311
## group.11 197
## group.12 21
## group.13 81
## group.14 22
## group.15 25
## group.16 123
## group.17 14
## group.18 37
## group.19 108
## group.20 11
## group.21 124
## group.22 33
## group.23 15
## group.24 10
## group.25 113
## group.26 42
## group.27 16
## group.28 26
## group.29 8
## group.30 13
## group.31 14
## group.32 16
## group.33 20
## group.34 22
## group.35 13
## group.36 17
## group.37 22
## group.38 21
## group.39 10
## group.40 8
## group.41 17
## group.42 14
## group.43 19
## group.44 10
## group.45 15
## group.46 7
## group.47 9
## group.48 7
## group.49 9
##
##
## Test of normality and Homoscedasticity:
## -------------------------------------------
##
## Hawkins Test:
##
## P-value for the Hawkins test of normality and homoscedasticity: 0
##
## Either the test of multivariate normality or homoscedasticity (or both) is rejected.
## Provided that normality can be assumed, the hypothesis of MCAR is
## rejected at 0.05 significance level.
##
## Non-Parametric Test:
##
## P-value for the non-parametric test of homoscedasticity: 0.752
##
## Reject Normality at 0.05 significance level.
## There is not sufficient evidence to reject MCAR at 0.05 significance level.
The Hawkins test rejects multivariate normality or homoscedasticity (or both). Since normality itself is rejected, the non-parametric test is the relevant one, and it does not reject MCAR. So we conclude that the assumption that the missing values have a completely random pattern is valid.
Impute the missing data for further analysis.
suppressMessages(suppressWarnings(library(mice)))
impData_0 <- wine_train
init <- mice(impData_0[,-1], maxit=0)
meth <- init$method
predM <- init$predictorMatrix
impData_0[,c('LabelAppeal','AcidIndex','STARS')] <- lapply(impData_0[,c('LabelAppeal','AcidIndex','STARS')], as.factor)
#skip variables from imputation but still be predictors
#meth[c("TARGET ","FixedAcidity","VolatileAcidity","CitricAcid","Density"," LabelAppeal","AcidIndex")] <- ""
# specify the methods for imputing the missing values
meth[c("ResidualSugar","Chlorides","FreeSulfurDioxide","TotalSulfurDioxide","pH", "Sulphates","Alcohol")]<-"norm"
meth[c("STARS")]<-"polyreg"
# impute 5 data sets for the missing values in the data without removal of some records
impData <- mice(impData_0[,-1], m=5, method=meth, predictorMatrix=predM,printFlag=FALSE, maxit=50,seed=500)
#compare the distributions of original and imputed data
densityplot(impData)
# get back the completed dataset where the missing values have been replaced with the imputed values
wine_impute <- complete(impData)
#write.csv(wine_impute,'wine_impute.csv')
#wine_impute <- read.csv('D:/Rstudio_projects/DATA621/wine_impute.csv')
#wine_impute <- wine_impute[,-1]
suppressMessages(suppressWarnings(library(DataExplorer)))
miss_plot <- plot_missing(wine_impute)
miss_plot
## feature num_missing pct_missing group
## 1 TARGET 0 0 Good
## 2 FixedAcidity 0 0 Good
## 3 VolatileAcidity 0 0 Good
## 4 CitricAcid 0 0 Good
## 5 ResidualSugar 0 0 Good
## 6 Chlorides 0 0 Good
## 7 FreeSulfurDioxide 0 0 Good
## 8 TotalSulfurDioxide 0 0 Good
## 9 Density 0 0 Good
## 10 pH 0 0 Good
## 11 Sulphates 0 0 Good
## 12 Alcohol 0 0 Good
## 13 LabelAppeal 0 0 Good
## 14 AcidIndex 0 0 Good
## 15 STARS 0 0 Good
# Plot a correlation graph
suppressMessages(suppressWarnings(library(corrplot)))
wine_impute[,c('LabelAppeal','AcidIndex','STARS')] <- lapply(wine_impute[,c('LabelAppeal','AcidIndex','STARS')], as.numeric)
# calculate Pearson correlation between predictors.
traincor <- cor(wine_impute,use = "na.or.complete")
cex.before <- par("cex")
par(cex = 0.6)
corrplot(traincor, method = "number",number.cex = .97, cl.cex = 1/par("cex"))
par(cex = cex.before)
#corr
library(usdm)
## Loading required package: sp
## Loading required package: raster
##
## Attaching package: 'raster'
## The following object is masked from 'package:pastecs':
##
## extract
## The following object is masked from 'package:plotly':
##
## select
## The following objects are masked from 'package:MASS':
##
## area, select
## The following objects are masked from 'package:Hmisc':
##
## mask, zoom
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:formattable':
##
## area
##
## Attaching package: 'usdm'
## The following object is masked from 'package:faraway':
##
## vif
## The following object is masked from 'package:rms':
##
## vif
## The following object is masked from 'package:car':
##
## vif
vif(wine_impute[,-c(1,2)])
## Variables VIF
## 1 VolatileAcidity 1.01
## 2 CitricAcid 1.01
## 3 ResidualSugar 1.00
## 4 Chlorides 1.01
## 5 FreeSulfurDioxide 1.01
## 6 TotalSulfurDioxide 1.00
## 7 Density 1.00
## 8 pH 1.01
## 9 Sulphates 1.00
## 10 Alcohol 1.01
## 11 LabelAppeal 1.12
## 12 AcidIndex 1.04
## 13 STARS 1.14
Cor <- cor(wine_impute)
suppressMessages(suppressWarnings(library(caret)))
(highCor <- findCorrelation(Cor, cutoff = 0.75))
## integer(0)
Using the findCorrelation function in the caret package also suggests that there are no collinearities to watch and no variable needs to be eliminated.
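For reference, a minimal sketch of how findCorrelation reports candidates for removal with a stricter cutoff (0.5 is an arbitrary, purely illustrative threshold; output not shown):
# sketch: findCorrelation returns the column indices it would drop
findCorrelation(Cor, cutoff = 0.5)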
We also treat ‘STARS’ differently to get a second dataset, since its NA percentage is so high (26.25%).
Note that there are no ZEROS in the STARS field; 1 is the minimum:
## [1] 1
Graphically, it seems that a blank STARS field is analogous to a ZERO (see chart below).
wine_training_data$STARS[is.na(wine_training_data$STARS)] <- 0
#wine_training_data <- do_factors(wine_training_data)
# mean TARGET at each star level
stars_means <- tapply(wine_training_data$TARGET, wine_training_data$STARS, mean)
stars_summary_df <- data.frame(num_stars = as.numeric(names(stars_means)),
                               mean_target = as.numeric(stars_means))
stars_summary_df
## num_stars mean_target
## 1 0 1.18
## 2 1 2.58
## 3 2 3.80
## 4 3 4.54
## 5 4 5.42
ggplot(stars_summary_df, aes(num_stars, mean_target)) + geom_point(shape=1, size=5) + geom_smooth(method=lm,se=FALSE)+theme(axis.text.x=element_text(size=rel(0.2)))
Though this replacement may be imperfect, we will show later that it is ultimately discarded, so it is not worth working out a closer approximation for the NA replacement in the STARS field.
After the fill, we have what looks like a zero-inflated distribution on our hands.
ggplot(wine_training_data, aes(TARGET, fill = STARS)) + geom_histogram(binwidth=1, bins = 8, position="dodge",fill=I('grey'),col=I('black'))
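A quick numeric check of the zero inflation (a minimal sketch): compare the observed share of zeros with what a Poisson distribution of the same mean would imply.
# sketch: observed zeros vs. zeros expected under Poisson(mean(TARGET))
lambda_hat <- mean(wine_training_data$TARGET)
c(observed = mean(wine_training_data$TARGET == 0),   # about 0.21 (2734/12795)
  poisson_expected = dpois(0, lambda_hat))           # about 0.05
The observed share of zeros is roughly four times what a plain Poisson would predict, which motivates the zero-inflated models below.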
Specification: Using the training data set, build at least two different poisson regression models, at least two different negative binomial regression models, and at least two multiple linear regression models, using different variables (or the same variables with different transformations). Sometimes poisson and negative binomial regression models give the same results. If that is the case, comment on that. Consider changing the input variables if that occurs so that you get different models. Although not covered in class, you may also want to consider building zero-inflated poisson and negative binomial regression models. You may select the variables manually, use an approach such as Forward or Stepwise, use a different approach such as trees, or use a combination of techniques. Describe the techniques you used. If you manually selected a variable for inclusion into the model or exclusion into the model, indicate why this was done.
Discuss the coefficients in the models, do they make sense? In this case, about the only thing you can comment on is the number of stars and the wine label appeal. However, you might comment on the coefficient and magnitude of variables and how they are similar or different from model to model. For example, you might say “pH seems to have a major positive impact in my poisson regression model, but a negative effect in my multiple linear regression model”. Are you keeping the model even though it is counter intuitive? Why? The boss needs to know.
We will build 6 data sets from our training data:
* TRAINING set where NAs were replaced by multivariate imputation.
* TRAINING set where NAs were replaced by ZEROs.
* TRAINING set where NAs were NOT replaced by ZEROs.
* TEST set where NAs were replaced by multivariate imputation.
* TEST set where NAs were replaced by ZEROs.
* TEST set where NAs were NOT replaced by ZEROs.
library(grid)
library(vcd)
##
## Attaching package: 'vcd'
## The following object is masked from 'package:raster':
##
## mosaic
fit <- goodfit(wine_impute$TARGET)
summary(fit)
##
## Goodness-of-fit test for poisson distribution
##
## X^2 df P(> X^2)
## Likelihood Ratio 8648 7 0
fit <- goodfit(wine_impute$TARGET,type = "binomial")
## Warning in goodfit(wine_impute$TARGET, type = "binomial"): size was not
## given, taken as maximum count
summary(fit)
##
## Goodness-of-fit test for binomial distribution
##
## X^2 df P(> X^2)
## Likelihood Ratio 11450 7 0
fit <- goodfit(wine_impute$TARGET,type = "nbinomial")
summary(fit)
##
## Goodness-of-fit test for nbinomial distribution
##
## X^2 df P(> X^2)
## Likelihood Ratio 8174 6 0
In Part 2, missing values were filled in by multivariate imputation via chained equations. Now we will use this completed dataset to calculate the coefficients.
Before we build models, we split the data into train and test.
require(caTools)
## Loading required package: caTools
set.seed(123)
# sample.split expects the outcome vector rather than the whole data frame
sample <- sample.split(wine_impute$TARGET, SplitRatio = 0.75)
train <- subset(wine_impute, sample == TRUE)
test <- subset(wine_impute, sample == FALSE)
glm_full <- glm(TARGET ~ ., family = poisson, data = train)
glm_null <- glm(TARGET ~ 1, family = poisson, data = train)
fit.poisson.imp <- step(glm_full, direction = 'backward', trace = FALSE)
summary(fit.poisson.imp)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = poisson, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.070 -0.684 0.124 0.630 2.672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4718976 0.0484096 9.75 < 0.0000000000000002 ***
## VolatileAcidity -0.0397135 0.0075824 -5.24 0.00000016 ***
## CitricAcid 0.0146861 0.0069195 2.12 0.03380 *
## Chlorides -0.0540308 0.0186412 -2.90 0.00375 **
## FreeSulfurDioxide 0.0001564 0.0000398 3.92 0.00008696 ***
## TotalSulfurDioxide 0.0000962 0.0000258 3.73 0.00019 ***
## pH -0.0164628 0.0087435 -1.88 0.05972 .
## Alcohol 0.0023545 0.0016139 1.46 0.14460
## LabelAppeal 0.1475757 0.0070227 21.01 < 0.0000000000000002 ***
## AcidIndex -0.1023197 0.0052214 -19.60 < 0.0000000000000002 ***
## STARS 0.3331119 0.0065148 51.13 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16775 on 9382 degrees of freedom
## Residual deviance: 11796 on 9372 degrees of freedom
## AIC: 35282
##
## Number of Fisher Scoring iterations: 5
# goodness-of-fit test
p_poisson <- 1-pchisq(summary(fit.poisson.imp)$deviance, summary(fit.poisson.imp)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_1 <- deviance(fit.poisson.imp)/fit.poisson.imp$df.residual
dispersiontest(fit.poisson.imp) # library(AER)
##
## Overdispersion test
##
## data: fit.poisson.imp
## z = -8, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.9
#c close to 1 indicates equidispersion; c > 1 indicates overdispersion
#Check for zero inflation
fit.poisson.imp_zeroinf <- zeroinfl(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train, dist="poisson")
#library(pscl)
vuong(fit.poisson.imp, fit.poisson.imp_zeroinf)
## Vuong Non-Nested Hypothesis Test-Statistic:
## (test-statistic is asymptotically distributed N(0,1) under the
## null that the models are indistinguishible)
## -------------------------------------------------------------
## Vuong z-statistic H_A p-value
## Raw -37.9 model2 > model1 <0.0000000000000002
## AIC-corrected -37.6 model2 > model1 <0.0000000000000002
## BIC-corrected -36.4 model2 > model1 <0.0000000000000002
summary(fit.poisson.imp_zeroinf)
##
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = train, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.3435 -0.4623 0.0312 0.4408 6.5944
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6202519 0.2347889 2.64 0.00825 **
## FixedAcidity 0.0003694 0.0009894 0.37 0.70886
## VolatileAcidity -0.0117353 0.0078453 -1.50 0.13470
## CitricAcid 0.0011571 0.0071043 0.16 0.87062
## ResidualSugar -0.0001388 0.0001806 -0.77 0.44220
## Chlorides -0.0283245 0.0193895 -1.46 0.14407
## FreeSulfurDioxide 0.0000421 0.0000405 1.04 0.29796
## TotalSulfurDioxide -0.0000194 0.0000260 -0.74 0.45636
## Density -0.2870284 0.2320234 -1.24 0.21606
## pH 0.0068736 0.0090770 0.76 0.44890
## Sulphates -0.0017970 0.0065813 -0.27 0.78481
## Alcohol 0.0073102 0.0016554 4.42 0.00001 ***
## LabelAppeal 0.2373423 0.0073549 32.27 < 0.0000000000000002 ***
## AcidIndex -0.0197320 0.0057969 -3.40 0.00066 ***
## STARS 0.1164804 0.0072201 16.13 < 0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.912617 1.403483 -2.08 0.0380 *
## FixedAcidity 0.000459 0.005835 0.08 0.9373
## VolatileAcidity 0.208031 0.045626 4.56 0.00000513 ***
## CitricAcid -0.087904 0.042756 -2.06 0.0398 *
## ResidualSugar -0.000311 0.001075 -0.29 0.7722
## Chlorides 0.251197 0.113250 2.22 0.0265 *
## FreeSulfurDioxide -0.001044 0.000252 -4.14 0.00003436 ***
## TotalSulfurDioxide -0.000839 0.000160 -5.24 0.00000016 ***
## Density 0.297184 1.382742 0.21 0.8298
## pH 0.175923 0.053860 3.27 0.0011 **
## Sulphates 0.080522 0.039312 2.05 0.0405 *
## Alcohol 0.027322 0.009713 2.81 0.0049 **
## LabelAppeal 0.676920 0.046334 14.61 < 0.0000000000000002 ***
## AcidIndex 0.476572 0.027843 17.12 < 0.0000000000000002 ***
## STARS -2.887229 0.104365 -27.66 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 38
## Log-likelihood: -1.55e+04 on 30 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.imp)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.imp, n = 250) #library(DHARMa)
plot(simulationOutput)
par(mfrow=c(3,5))
for(i in 2:15){
plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4
The GOF test indicates that the Poisson model does not fit the data (p < 0.05).
# outlier test
outlierTest(fit.poisson.imp)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 5766 -3.07 0.00213 NA
The results show that there is no outlier as judged by the Bonferroni p-value.
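For intuition, the Bonferroni p reported by outlierTest is the unadjusted p-value scaled by the number of observations tested; values above 1 are printed as NA. A sketch of the arithmetic:
# sketch: Bonferroni adjustment behind the NA above
0.00213 * nrow(train)   # roughly 20, far above 1, hence reported as NA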
#Check for influential and leverage points
influencePlot(fit.poisson.imp)
## StudRes Hat CookD
## 1630 -1.37 0.01301 0.001927
## 5766 -3.07 0.00174 0.000746
## 11289 2.68 0.00285 0.003199
The table above shows the points with the largest studentized residuals, hat-values, and Cook’s distances from the influence plot. To test whether case 11289, which has the largest Cook’s distance, is influential, we remove it from the model and compare the coefficients.
fit.poisson.imp_11289 <- update(fit.poisson.imp,subset=c(-11289))
compareCoefs(fit.poisson.imp, fit.poisson.imp_11289 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = poisson, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = poisson, data = train, subset = c(-11289))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 0.4718976 0.0484096 0.4718976 0.0484096
## VolatileAcidity -0.0397135 0.0075824 -0.0397135 0.0075824
## CitricAcid 0.0146861 0.0069195 0.0146861 0.0069195
## Chlorides -0.0540308 0.0186412 -0.0540308 0.0186412
## FreeSulfurDioxide 0.0001564 0.0000398 0.0001564 0.0000398
## TotalSulfurDioxide 0.0000962 0.0000258 0.0000962 0.0000258
## pH -0.0164628 0.0087435 -0.0164628 0.0087435
## Alcohol 0.0023545 0.0016139 0.0023545 0.0016139
## LabelAppeal 0.1475757 0.0070227 0.1475757 0.0070227
## AcidIndex -0.1023197 0.0052214 -0.1023197 0.0052214
## STARS 0.3331119 0.0065148 0.3331119 0.0065148
From the output table, the coefficients change minimally, so observation 11289 is not influential.
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.poisson.imp,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 1.861 0.173
## CitricAcid 0.046 0.831
## Chlorides 2.590 0.108
## FreeSulfurDioxide 0.106 0.744
## TotalSulfurDioxide 0.679 0.410
## pH 0.634 0.426
## Alcohol 1.988 0.159
## LabelAppeal 41.788 0.000
## AcidIndex 51.749 0.000
## STARS 456.088 0.000
glm_full_aj <- glm(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), family = poisson, data = train)
fit.poisson.imp.aj <- step(glm_full_aj, direction = 'backward', trace = FALSE)
residualPlots(fit.poisson.imp.aj,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 0.797 0.372
## CitricAcid 0.029 0.865
## Chlorides 2.552 0.110
## FreeSulfurDioxide 0.056 0.813
## TotalSulfurDioxide 1.181 0.277
## Sulphates 0.519 0.471
## Alcohol 2.367 0.124
## LabelAppeal 0.061 0.805
## AcidIndex 0.000 1.000
## STARS 0.000 1.000
## log(LabelAppeal + 3) 0.055 0.814
## I(AcidIndex^2) 25.542 0.000
## I(STARS^2) 75.041 0.000
summary(fit.poisson.imp.aj)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), family = poisson, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.153 -0.622 0.072 0.581 3.552
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.2212634 0.5444877 -5.92 0.000000003296
## VolatileAcidity -0.0388835 0.0075996 -5.12 0.000000311254
## CitricAcid 0.0113935 0.0069222 1.65 0.0998
## Chlorides -0.0598136 0.0186464 -3.21 0.0013
## FreeSulfurDioxide 0.0001475 0.0000397 3.71 0.0002
## TotalSulfurDioxide 0.0000775 0.0000258 3.00 0.0027
## Sulphates -0.0093901 0.0063568 -1.48 0.1396
## Alcohol 0.0041796 0.0016160 2.59 0.0097
## LabelAppeal -0.1755220 0.0698530 -2.51 0.0120
## AcidIndex 0.0619526 0.0245671 2.52 0.0117
## STARS 0.9757016 0.0321168 30.38 < 0.0000000000000002
## log(LabelAppeal + 3) 1.9955698 0.4205452 4.75 0.000002083038
## I(AcidIndex^2) -0.0148401 0.0022803 -6.51 0.000000000076
## I(STARS^2) -0.1383156 0.0067808 -20.40 < 0.0000000000000002
##
## (Intercept) ***
## VolatileAcidity ***
## CitricAcid .
## Chlorides **
## FreeSulfurDioxide ***
## TotalSulfurDioxide **
## Sulphates
## Alcohol **
## LabelAppeal *
## AcidIndex *
## STARS ***
## log(LabelAppeal + 3) ***
## I(AcidIndex^2) ***
## I(STARS^2) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 16775 on 9382 degrees of freedom
## Residual deviance: 11269 on 9369 degrees of freedom
## AIC: 34761
##
## Number of Fisher Scoring iterations: 6
coef <- summary(fit.poisson.imp.aj)$coefficients[,1]
var.name <- variable.names(fit.poisson.imp.aj)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef | effects | |
---|---|---|
(Intercept) | -3.22 | 0.04 |
VolatileAcidity | -0.04 | 0.96 |
CitricAcid | 0.01 | 1.01 |
Chlorides | -0.06 | 0.94 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
Sulphates | -0.01 | 0.99 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | -0.18 | 0.84 |
AcidIndex | 0.06 | 1.06 |
STARS | 0.98 | 2.65 |
log(LabelAppeal + 3) | 2.00 | 7.36 |
I(AcidIndex^2) | -0.01 | 0.99 |
I(STARS^2) | -0.14 | 0.87 |
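Because STARS also enters through I(STARS^2), the effect of one more star depends on the current level: the rate ratio from s to s+1 stars is exp(b1 + b2*((s+1)^2 - s^2)), where b1 and b2 are the STARS and I(STARS^2) coefficients. A minimal sketch of that calculation (star_rr is a hypothetical helper):
# sketch: level-dependent rate ratio for one additional STAR
b <- coef(fit.poisson.imp.aj)
star_rr <- function(s) exp(b["STARS"] + b["I(STARS^2)"] * ((s + 1)^2 - s^2))
sapply(1:3, star_rr)   # about 1.75, 1.33, 1.01: the boost shrinks at higher star levels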
# goodness-of-fit test
p_poisson_aj <- 1-pchisq(summary(fit.poisson.imp.aj)$deviance, summary(fit.poisson.imp.aj)$df.residual)
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_1 <- deviance(fit.poisson.imp.aj)/fit.poisson.imp.aj$df.residual
dispersiontest(fit.poisson.imp.aj) # library(AER)
##
## Overdispersion test
##
## data: fit.poisson.imp.aj
## z = -8, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.897
#c close to 1 indicates equidispersion; c > 1 indicates overdispersion
The deviance/df ratio c is about 1.2, only slightly above 1, and the formal dispersion test estimates the dispersion at 0.90 and does not reject equidispersion (p = 1). So the assumption of equidispersion is not rejected.
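A Pearson-based estimate gives the same picture (a minimal sketch):
# sketch: Pearson chi-square / residual df; values near 1 indicate equidispersion
sum(residuals(fit.poisson.imp.aj, type = "pearson")^2) / df.residual(fit.poisson.imp.aj)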
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.imp.aj, n = 250) #library(DHARMa)
plot(simulationOutput)
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.imp.aj)) #library(faraway)
# outlier test
outlierTest(fit.poisson.imp.aj)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 11289 3.58 0.000346 NA
The results show that there is no outlier as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.poisson.imp.aj)
## StudRes Hat CookD
## 2729 3.37 0.02327 0.0441
## 11289 3.58 0.00652 0.0137
The table above shows the points with the largest studentized residuals, hat-values, and Cook’s distances from the influence plot. To test whether case 2729, which has the largest Cook’s distance, is influential, we remove it from the model and compare the coefficients.
fit.poisson.imp_2729 <- update(fit.poisson.imp.aj,subset=c(-2729))
compareCoefs(fit.poisson.imp.aj, fit.poisson.imp_2729 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), family = poisson, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), family = poisson, data = train, subset =
## c(-2729))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) -3.2212634 0.5444877 -3.2196985 0.5444750
## VolatileAcidity -0.0388835 0.0075996 -0.0388835 0.0075995
## CitricAcid 0.0113935 0.0069222 0.0113969 0.0069221
## Chlorides -0.0598136 0.0186464 -0.0597513 0.0186467
## FreeSulfurDioxide 0.0001475 0.0000397 0.0001475 0.0000397
## TotalSulfurDioxide 0.0000775 0.0000258 0.0000774 0.0000258
## Sulphates -0.0093901 0.0063568 -0.0093861 0.0063567
## Alcohol 0.0041796 0.0016160 0.0041749 0.0016160
## LabelAppeal -0.1755220 0.0698530 -0.1754023 0.0698510
## AcidIndex 0.0619526 0.0245671 0.0619432 0.0245672
## STARS 0.9757016 0.0321168 0.9758040 0.0321173
## log(LabelAppeal + 3) 1.9955698 0.4205452 1.9944809 0.4205346
## I(AcidIndex^2) -0.0148401 0.0022803 -0.0148403 0.0022804
## I(STARS^2) -0.1383156 0.0067808 -0.1383278 0.0067808
From the output table, the coefficients change minimally, so observation 2729 is not influential.
We then use a rootogram to assess the fit, as it is an improved graphical check for a count regression model.
# check the goodness of fit of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.poisson.imp.aj)
The bar hanging from each point on the theoretical Poisson fit (red curve) represents the difference between expected and observed frequencies for each count of TARGET. A bar extending below 0 indicates that the count is underfit (observed exceeds expected); a bar stopping short of 0 indicates overfitting. Nearly every count shows some degree of under- or overfitting; only the seventh bar sits close to the zero line.
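The frequencies behind the rootogram can also be tabulated directly (a minimal sketch comparing observed counts of TARGET in the training split with the Poisson-expected frequencies from the fitted means):
# sketch: observed vs. expected frequencies underlying the rootogram
mu_hat <- fitted(fit.poisson.imp.aj)
expected <- sapply(0:8, function(k) sum(dpois(k, mu_hat)))
observed <- as.numeric(table(factor(train$TARGET, levels = 0:8)))
round(rbind(observed, expected))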
# goodness of fit: pseudo R squared
(pR2_1 <- 1 - fit.poisson.imp.aj$deviance / fit.poisson.imp.aj$null.deviance)
## [1] 0.328
#or
#(pR2 <- 1- logLik(logitfit.1)/logLik(logit.null))
# Log likelihood
(logLik_1<-logLik(fit.poisson.imp.aj))
## 'log Lik.' -17367 (df=14)
# AIC
AIC_1 <- fit.poisson.imp.aj$aic
BIC_1 <- BIC(fit.poisson.imp.aj)
#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_1<- cbind(test,
Mean = predict(fit.poisson.imp.aj, newdata=test, type="response"),
SE = predict(fit.poisson.imp.aj, newdata=test, type="response", se.fit=T)$se.fit
)
evl_df_1 <- as.data.frame(aggregate(ndf_1[, 16], list(ndf_1$TARGET), mean))
evl_df_1 <- cbind(evl_df_1,aggregate(ndf_1[, 17], list(ndf_1$TARGET), mean)[,2])
colnames(evl_df_1) <- c('TARGET','Mean','SE')
evl_df_1
## TARGET Mean SE
## 1 0 1.93 0.0444
## 2 1 1.51 0.0439
## 3 2 2.07 0.0462
## 4 3 2.69 0.0545
## 5 4 3.46 0.0687
## 6 5 4.16 0.0873
## 7 6 4.93 0.1107
## 8 7 5.37 0.1334
## 9 8 5.24 0.1467
# goodness of fit: pseudo R squared
zeroinf_null <- update(fit.poisson.imp_zeroinf, . ~ 1)
(pR2_1z <- 1- logLik(fit.poisson.imp_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.14
#dispersion
c_1z <- deviance(fit.poisson.imp_zeroinf)/fit.poisson.imp_zeroinf$df.residual
# Log likelihood
(logLik_1z<-logLik(fit.poisson.imp_zeroinf))
## 'log Lik.' -15471 (df=30)
# AIC
AIC_1z <- AIC(fit.poisson.imp_zeroinf)
BIC_1z <- BIC(fit.poisson.imp_zeroinf)
#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_1z<- cbind(test,
Count = predict(fit.poisson.imp_zeroinf, newdata=test, type = "count"),
Zero = predict(fit.poisson.imp_zeroinf, newdata=test, type = "zero")
)
evl_df_1z <- as.data.frame(aggregate(ndf_1z[, 16], list(ndf_1z$TARGET), mean))
evl_df_1z <- cbind(evl_df_1z,aggregate(ndf_1z[, 17], list(ndf_1z$TARGET), mean)[,2])
colnames(evl_df_1z) <- c('TARGET','Count','Zero')
evl_df_1z
## TARGET Count Zero
## 1 0 3.29 0.44449
## 2 1 2.28 0.18082
## 3 2 2.69 0.17539
## 4 3 3.16 0.16233
## 5 4 3.83 0.12009
## 6 5 4.54 0.08155
## 7 6 5.34 0.03505
## 8 7 6.09 0.02496
## 9 8 6.44 0.00638
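The two components combine into an overall expected count, E[y] = (1 - pi) * mu, where pi is the structural-zero probability (type = "zero") and mu is the count-model mean (type = "count"). A minimal sketch (the column name Overall is hypothetical):
# sketch: per-case overall expectation, then averaged by observed TARGET
ndf_1z$Overall <- (1 - ndf_1z$Zero) * ndf_1z$Count
aggregate(Overall ~ TARGET, data = ndf_1z, mean)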
To let the dispersion be estimated rather than fixed at 1, we refit the model with the quasipoisson family.
fit.qp.imp <- update(fit.poisson.imp.aj,family=quasipoisson)
summary(fit.qp.imp)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), family = quasipoisson, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.153 -0.622 0.072 0.581 3.552
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.2212634 0.5146492 -6.26 0.0000000004041
## VolatileAcidity -0.0388835 0.0071831 -5.41 0.0000000634560
## CitricAcid 0.0113935 0.0065429 1.74 0.08165
## Chlorides -0.0598136 0.0176246 -3.39 0.00069
## FreeSulfurDioxide 0.0001475 0.0000375 3.93 0.0000861509755
## TotalSulfurDioxide 0.0000775 0.0000244 3.18 0.00148
## Sulphates -0.0093901 0.0060084 -1.56 0.11813
## Alcohol 0.0041796 0.0015275 2.74 0.00622
## LabelAppeal -0.1755220 0.0660250 -2.66 0.00786
## AcidIndex 0.0619526 0.0232208 2.67 0.00764
## STARS 0.9757016 0.0303567 32.14 < 0.0000000000000002
## log(LabelAppeal + 3) 1.9955698 0.3974989 5.02 0.0000005253650
## I(AcidIndex^2) -0.0148401 0.0021554 -6.89 0.0000000000061
## I(STARS^2) -0.1383156 0.0064092 -21.58 < 0.0000000000000002
##
## (Intercept) ***
## VolatileAcidity ***
## CitricAcid .
## Chlorides ***
## FreeSulfurDioxide ***
## TotalSulfurDioxide **
## Sulphates
## Alcohol **
## LabelAppeal **
## AcidIndex **
## STARS ***
## log(LabelAppeal + 3) ***
## I(AcidIndex^2) ***
## I(STARS^2) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for quasipoisson family taken to be 0.893)
##
## Null deviance: 16775 on 9382 degrees of freedom
## Residual deviance: 11269 on 9369 degrees of freedom
## AIC: NA
##
## Number of Fisher Scoring iterations: 6
residualPlots(fit.qp.imp,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 0.797 0.372
## CitricAcid 0.029 0.865
## Chlorides 2.552 0.110
## FreeSulfurDioxide 0.056 0.813
## TotalSulfurDioxide 1.181 0.277
## Sulphates 0.519 0.471
## Alcohol 2.367 0.124
## LabelAppeal 0.061 0.805
## AcidIndex 0.000 1.000
## STARS 0.000 1.000
## log(LabelAppeal + 3) 0.055 0.814
## I(AcidIndex^2) 25.542 0.000
## I(STARS^2) 75.041 0.000
The dispersion parameter is estimated to be 0.89, which is less than one: the conditional variance is smaller than the conditional mean, so we have mild under-dispersion.
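Since quasipoisson simply rescales the Poisson standard errors by the square root of the estimated dispersion, the (slight) shrinkage can be checked directly (a minimal sketch):
# sketch: quasipoisson SEs = Poisson SEs * sqrt(dispersion)
phi <- summary(fit.qp.imp)$dispersion   # about 0.893, per the summary above
sqrt(phi)                               # about 0.95, so SEs shrink slightly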
# goodness-of-fit test
p_qp <- 1-pchisq(summary(fit.qp.imp)$deviance, summary(fit.qp.imp)$df.residual)
cat("p-value:" ,p_qp,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_2 <- deviance(fit.qp.imp)/fit.qp.imp$df.residual
#c close to 1 indicates equidispersion; c > 1 indicates overdispersion
#Check for zero inflation
fit.qp.imp_zeroinf<-fit.poisson.imp_zeroinf
AIC(fit.qp.imp, fit.poisson.imp_zeroinf)
## df AIC
## fit.qp.imp 14 NA
## fit.poisson.imp_zeroinf 30 31002
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.qp.imp)) #library(faraway)
par(mfrow=c(3,5))
for(i in 2:15){
plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4
The GOF test indicates that the Poisson model does not fit the data (p < 0.05).
# outlier test
outlierTest(fit.qp.imp)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 11289 3.26 0.0011 NA
The results show that there is no outlier as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.qp.imp)
## StudRes Hat CookD
## 2729 3.08 0.02327 0.0493
## 11289 3.26 0.00652 0.0153
The table above shows the points with the largest studentized residuals, hat-values, and Cook’s distances from the influence plot. To test whether case 11289 is influential, we remove it from the model and compare the coefficients.
fit.qp.imp_11289 <- update(fit.qp.imp,subset=c(-11289))
compareCoefs(fit.qp.imp, fit.qp.imp_11289 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), family = quasipoisson, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), family = quasipoisson, data = train,
## subset = c(-11289))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) -3.2212634 0.5146492 -3.2212634 0.5146492
## VolatileAcidity -0.0388835 0.0071831 -0.0388835 0.0071831
## CitricAcid 0.0113935 0.0065429 0.0113935 0.0065429
## Chlorides -0.0598136 0.0176246 -0.0598136 0.0176246
## FreeSulfurDioxide 0.0001475 0.0000375 0.0001475 0.0000375
## TotalSulfurDioxide 0.0000775 0.0000244 0.0000775 0.0000244
## Sulphates -0.0093901 0.0060084 -0.0093901 0.0060084
## Alcohol 0.0041796 0.0015275 0.0041796 0.0015275
## LabelAppeal -0.1755220 0.0660250 -0.1755220 0.0660250
## AcidIndex 0.0619526 0.0232208 0.0619526 0.0232208
## STARS 0.9757016 0.0303567 0.9757016 0.0303567
## log(LabelAppeal + 3) 1.9955698 0.3974989 1.9955698 0.3974989
## I(AcidIndex^2) -0.0148401 0.0021554 -0.0148401 0.0021554
## I(STARS^2) -0.1383156 0.0064092 -0.1383156 0.0064092
From the output table, the coefficients change minimally, so observation 11289 is not influential.
coef <- summary(fit.qp.imp)$coefficients[,1]
var.name <- variable.names(fit.qp.imp)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef | effects | |
---|---|---|
(Intercept) | -3.22 | 0.04 |
VolatileAcidity | -0.04 | 0.96 |
CitricAcid | 0.01 | 1.01 |
Chlorides | -0.06 | 0.94 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
Sulphates | -0.01 | 0.99 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | -0.18 | 0.84 |
AcidIndex | 0.06 | 1.06 |
STARS | 0.98 | 2.65 |
log(LabelAppeal + 3) | 2.00 | 7.36 |
I(AcidIndex^2) | -0.01 | 0.99 |
I(STARS^2) | -0.14 | 0.87 |
# goodness of fit: pseudo R squared
(pR2_2 <- 1 - fit.qp.imp$deviance / fit.qp.imp$null.deviance)
## [1] 0.328
# Log likelihood
(logLik_2<-logLik(fit.qp.imp))
## 'log Lik.' NA (df=14)
# AIC
AIC_2 <- AIC(fit.qp.imp)
BIC_2 <- BIC(fit.qp.imp)
#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_2<- cbind(test,
Mean = predict(fit.qp.imp, newdata=test, type="response"),
SE = predict(fit.qp.imp, newdata=test, type="response", se.fit=T)$se.fit
)
evl_df_2 <- as.data.frame(aggregate(ndf_2[, 16], list(ndf_2$TARGET), mean))
evl_df_2 <- cbind(evl_df_2,aggregate(ndf_2[, 17], list(ndf_2$TARGET), mean)[,2])
colnames(evl_df_2) <- c('TARGET','Mean','SE')
evl_df_2
## TARGET Mean SE
## 1 0 1.93 0.0419
## 2 1 1.51 0.0415
## 3 2 2.07 0.0436
## 4 3 2.69 0.0515
## 5 4 3.46 0.0649
## 6 5 4.16 0.0825
## 7 6 4.93 0.1046
## 8 7 5.37 0.1261
## 9 8 5.24 0.1387
# goodness of fit: pseudo R squared
zeroinf_null <- update(fit.qp.imp_zeroinf, . ~ 1)
(pR2_2z <- 1- logLik(fit.qp.imp_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.14
# Log likelihood
(logLik_2z<-logLik(fit.qp.imp_zeroinf))
## 'log Lik.' -15471 (df=30)
#dispersion
c_2z <- deviance(fit.qp.imp_zeroinf)/fit.qp.imp_zeroinf$df.residual
# AIC
AIC_2z <- AIC(fit.qp.imp_zeroinf)
BIC_2z <- BIC(fit.qp.imp_zeroinf)
#use the model to predict mean counts for each treatment and standard errors for each parameter.
ndf_2z<- cbind(test,
Count = predict(fit.qp.imp_zeroinf, newdata=test, type = "count"),
Zero = predict(fit.qp.imp_zeroinf, newdata=test, type = "zero")
)
evl_df_2z <- as.data.frame(aggregate(ndf_2z[, 16], list(ndf_2z$TARGET), mean))
evl_df_2z <- cbind(evl_df_2z,aggregate(ndf_2z[, 17], list(ndf_2z$TARGET), mean)[,2])
colnames(evl_df_2z) <- c('TARGET','Count','Zero')
evl_df_2z
## TARGET Count Zero
## 1 0 3.29 0.44449
## 2 1 2.28 0.18082
## 3 2 2.69 0.17539
## 4 3 3.16 0.16233
## 5 4 3.83 0.12009
## 6 5 4.54 0.08155
## 7 6 5.34 0.03505
## 8 7 6.09 0.02496
## 9 8 6.44 0.00638
In Part 2, missing values were filled in by multivariate imputation via chained equations. Now we will use the completed dataset to calculate the coefficients.
glm_full_nb <- glm.nb(TARGET ~ ., data = train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
glm_null_nb <- glm.nb(TARGET ~ 1, data = train)
fit.nb.imp <- step(glm_full_nb,direction="backward",trace=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(fit.nb.imp)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, data = train, init.theta = 48389.28899,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.070 -0.684 0.124 0.630 2.672
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.4719007 0.0484114 9.75 < 0.0000000000000002 ***
## VolatileAcidity -0.0397145 0.0075827 -5.24 0.00000016 ***
## CitricAcid 0.0146862 0.0069198 2.12 0.03381 *
## Chlorides -0.0540321 0.0186419 -2.90 0.00375 **
## FreeSulfurDioxide 0.0001564 0.0000398 3.92 0.00008699 ***
## TotalSulfurDioxide 0.0000962 0.0000258 3.73 0.00019 ***
## pH -0.0164634 0.0087439 -1.88 0.05972 .
## Alcohol 0.0023545 0.0016140 1.46 0.14462
## LabelAppeal 0.1475755 0.0070230 21.01 < 0.0000000000000002 ***
## AcidIndex -0.1023215 0.0052216 -19.60 < 0.0000000000000002 ***
## STARS 0.3331160 0.0065151 51.13 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(48389) family taken to be 1)
##
## Null deviance: 16774 on 9382 degrees of freedom
## Residual deviance: 11795 on 9372 degrees of freedom
## AIC: 35284
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 48389
## Std. Err.: 65368
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -35260
The enormous estimated theta (48389, with an even larger standard error) indicates that the negative binomial fit is effectively collapsing to a Poisson model; this also explains the repeated theta.ml iteration-limit warnings.
#five-percent critical value for a chi-squared with the residual degrees of freedom (9372) is
qchisq(0.95, df.residual(fit.nb.imp))
## [1] 9598
#goodness-of-fit test
p_nb <- 1-pchisq(summary(fit.nb.imp)$deviance, summary(fit.nb.imp)$df.residual)
cat("p-value:" ,p_nb,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_3 <- deviance(fit.nb.imp)/fit.nb.imp$df.residual
#c close to 1 indicates equidispersion; c substantially greater than 1 indicates overdispersion
#Check for zero inflation
fit.nb.imp_zeroinf <- zeroinfl(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=train, dist="negbin") #library(pscl)
AIC(fit.nb.imp, fit.nb.imp_zeroinf)
## df AIC
## fit.nb.imp 12 35284
## fit.nb.imp_zeroinf 31 31004
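Beyond the AIC comparison, the Vuong test in pscl formally compares these two non-nested fits; a sketch (a large positive statistic favors the first model listed):
vuong(fit.nb.imp_zeroinf, fit.nb.imp) #library(pscl)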
summary(fit.nb.imp_zeroinf)
##
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = train, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.3435 -0.4623 0.0312 0.4408 6.5944
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.6202857 0.2347889 2.64 0.00824 **
## FixedAcidity 0.0003694 0.0009894 0.37 0.70886
## VolatileAcidity -0.0117354 0.0078453 -1.50 0.13469
## CitricAcid 0.0011567 0.0071043 0.16 0.87067
## ResidualSugar -0.0001388 0.0001806 -0.77 0.44220
## Chlorides -0.0283250 0.0193895 -1.46 0.14406
## FreeSulfurDioxide 0.0000421 0.0000405 1.04 0.29796
## TotalSulfurDioxide -0.0000194 0.0000260 -0.74 0.45638
## Density -0.2870626 0.2320234 -1.24 0.21601
## pH 0.0068736 0.0090770 0.76 0.44890
## Sulphates -0.0017971 0.0065813 -0.27 0.78480
## Alcohol 0.0073102 0.0016554 4.42 0.000010 ***
## LabelAppeal 0.2373423 0.0073549 32.27 < 0.0000000000000002 ***
## AcidIndex -0.0197324 0.0057969 -3.40 0.00066 ***
## STARS 0.1164807 0.0072201 16.13 < 0.0000000000000002 ***
## Log(theta) 17.6062211 3.9656460 4.44 0.000009 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.910885 1.403479 -2.07 0.0381 *
## FixedAcidity 0.000459 0.005835 0.08 0.9372
## VolatileAcidity 0.208030 0.045625 4.56 0.00000513 ***
## CitricAcid -0.087906 0.042756 -2.06 0.0398 *
## ResidualSugar -0.000311 0.001075 -0.29 0.7723
## Chlorides 0.251195 0.113249 2.22 0.0266 *
## FreeSulfurDioxide -0.001044 0.000252 -4.14 0.00003437 ***
## TotalSulfurDioxide -0.000839 0.000160 -5.24 0.00000016 ***
## Density 0.295451 1.382743 0.21 0.8308
## pH 0.175922 0.053860 3.27 0.0011 **
## Sulphates 0.080520 0.039312 2.05 0.0405 *
## Alcohol 0.027321 0.009713 2.81 0.0049 **
## LabelAppeal 0.676919 0.046334 14.61 < 0.0000000000000002 ***
## AcidIndex 0.476572 0.027843 17.12 < 0.0000000000000002 ***
## STARS -2.887228 0.104365 -27.66 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 44287855.659
## Number of iterations in BFGS optimization: 60
## Log-likelihood: -1.55e+04 on 31 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.imp)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.nb.imp, n = 250) #library(DHARMa)
plot(simulationOutput)
par(mfrow=c(3,5))
for(i in 2:15){
plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4
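The DHARMa messages above suggest treating low-cardinality predictors as factors; a minimal sketch for one such predictor, reusing simulationOutput from above:
plotResiduals(factor(train$STARS), simulationOutput$scaledResiduals, xlab = "STARS")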
The GOF test indicates that the Negative Binomial model does not fit the data (p < 0.05).
# outlier test
outlierTest(fit.nb.imp)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 5766 -2.74 0.00618 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.nb.imp)
## StudRes Hat CookD
## 1630 -1.22 0.01301 0.001926
## 5766 -2.74 0.00174 0.000746
## 11289 2.39 0.00285 0.003199
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 11289, which has the largest Cook's distance, is influential, I remove it from the model and compare the coefficients.
fit.nb.imp_11289 <- update(fit.nb.imp,subset=c(-11289))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.imp, fit.nb.imp_11289 )
##
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides
## + FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal
## + AcidIndex + STARS, data = train, init.theta = 48389.28899, link =
## log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides
## + FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal
## + AcidIndex + STARS, data = train, subset = c(-11289), init.theta =
## 48389.28747, link = log)
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 0.4719007 0.0484114 0.4719007 0.0484114
## VolatileAcidity -0.0397145 0.0075827 -0.0397145 0.0075827
## CitricAcid 0.0146862 0.0069198 0.0146862 0.0069198
## Chlorides -0.0540321 0.0186419 -0.0540321 0.0186419
## FreeSulfurDioxide 0.0001564 0.0000398 0.0001564 0.0000398
## TotalSulfurDioxide 0.0000962 0.0000258 0.0000962 0.0000258
## pH -0.0164634 0.0087439 -0.0164634 0.0087439
## Alcohol 0.0023545 0.0016140 0.0023545 0.0016140
## LabelAppeal 0.1475755 0.0070230 0.1475755 0.0070230
## AcidIndex -0.1023215 0.0052216 -0.1023215 0.0052216
## STARS 0.3331160 0.0065151 0.3331160 0.0065151
From the output table we can see that the coefficients change only minimally, so observation 11289 is not influential.
We then adjust the model by adding higher-order terms or a log transform for “LabelAppeal”, “AcidIndex” and “STARS” to see whether the fit improves.
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.nb.imp,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Test stat Pr(>|t|)
## VolatileAcidity 1.861 0.173
## CitricAcid 0.046 0.831
## Chlorides 2.590 0.108
## FreeSulfurDioxide 0.106 0.744
## TotalSulfurDioxide 0.679 0.410
## pH 0.634 0.426
## Alcohol 1.988 0.159
## LabelAppeal 41.787 0.000
## AcidIndex 51.738 0.000
## STARS 456.048 0.000
glm_nb_full_aj <- glm.nb(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
fit.nb.imp.aj <-step(glm_nb_full_aj,direction='backward',data = train, trace = FALSE)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
residualPlots(fit.nb.imp.aj,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Test stat Pr(>|t|)
## VolatileAcidity 0.797 0.372
## CitricAcid 0.029 0.865
## Chlorides 2.552 0.110
## FreeSulfurDioxide 0.056 0.813
## TotalSulfurDioxide 1.181 0.277
## Sulphates 0.519 0.471
## Alcohol 2.367 0.124
## LabelAppeal 0.061 0.805
## AcidIndex 0.000 0.999
## STARS 0.000 0.999
## log(LabelAppeal + 3) 0.056 0.814
## I(AcidIndex^2) 25.544 0.000
## I(STARS^2) 75.031 0.000
summary(fit.nb.imp.aj)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), data = train, init.theta = 45840.1185,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.153 -0.622 0.072 0.581 3.551
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.2212853 0.5445059 -5.92 0.000000003299
## VolatileAcidity -0.0388844 0.0075999 -5.12 0.000000311381
## CitricAcid 0.0113937 0.0069225 1.65 0.0998
## Chlorides -0.0598151 0.0186472 -3.21 0.0013
## FreeSulfurDioxide 0.0001475 0.0000397 3.71 0.0002
## TotalSulfurDioxide 0.0000775 0.0000258 3.00 0.0027
## Sulphates -0.0093906 0.0063570 -1.48 0.1396
## Alcohol 0.0041795 0.0016161 2.59 0.0097
## LabelAppeal -0.1755275 0.0698556 -2.51 0.0120
## AcidIndex 0.0619495 0.0245679 2.52 0.0117
## STARS 0.9757061 0.0321180 30.38 < 0.0000000000000002
## log(LabelAppeal + 3) 1.9955949 0.4205597 4.75 0.000002084102
## I(AcidIndex^2) -0.0148400 0.0022804 -6.51 0.000000000076
## I(STARS^2) -0.1383163 0.0067811 -20.40 < 0.0000000000000002
##
## (Intercept) ***
## VolatileAcidity ***
## CitricAcid .
## Chlorides **
## FreeSulfurDioxide ***
## TotalSulfurDioxide **
## Sulphates
## Alcohol **
## LabelAppeal *
## AcidIndex *
## STARS ***
## log(LabelAppeal + 3) ***
## I(AcidIndex^2) ***
## I(STARS^2) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(45840) family taken to be 1)
##
## Null deviance: 16774 on 9382 degrees of freedom
## Residual deviance: 11269 on 9369 degrees of freedom
## AIC: 34764
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 45840
## Std. Err.: 54227
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -34734
#goodness-of-fit test
(p_nb <- 1-pchisq(summary(fit.nb.imp)$deviance, summary(fit.nb.imp)$df.residual))
## [1] 0
(p_nb_aj <- 1-pchisq(summary(fit.nb.imp.aj)$deviance, summary(fit.nb.imp.aj)$df.residual))
## [1] 0
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
deviance(fit.nb.imp.aj)/fit.nb.imp.aj$df.residual
## [1] 1.2
#c close to 1 indicates equidispersion; c substantially greater than 1 indicates overdispersion
c = 1.2 is only slightly above 1, so there is no strong evidence of overdispersion.
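As a cross-check, the Pearson-based dispersion estimate should be close to the deviance-based ratio above:
sum(residuals(fit.nb.imp.aj, type = "pearson")^2) / df.residual(fit.nb.imp.aj)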
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.nb.imp.aj, n = 250) #library(DHARMa)
plot(simulationOutput)
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.imp.aj)) #library(faraway)
# outlier test
outlierTest(fit.nb.imp.aj)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 11289 3.26 0.0011 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.nb.imp.aj)
## StudRes Hat CookD
## 2729 3.08 0.02327 0.0441
## 11289 3.26 0.00652 0.0137
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 11289, which has the largest studentized residual, is influential, I remove it from the model and compare the coefficients.
fit.nb.imp_11289 <- update(fit.nb.imp.aj,subset=c(-11289))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.imp.aj, fit.nb.imp_11289 )
##
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides
## + FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), data = train, init.theta = 45840.1185,
## link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides
## + FreeSulfurDioxide + TotalSulfurDioxide + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + log(LabelAppeal + 3) +
## I(AcidIndex^2) + I(STARS^2), data = train, subset = c(-11289),
## init.theta = 45840.15613, link = log)
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) -3.2212853 0.5445059 -3.2212853 0.5445059
## VolatileAcidity -0.0388844 0.0075999 -0.0388844 0.0075999
## CitricAcid 0.0113937 0.0069225 0.0113937 0.0069225
## Chlorides -0.0598151 0.0186472 -0.0598151 0.0186472
## FreeSulfurDioxide 0.0001475 0.0000397 0.0001475 0.0000397
## TotalSulfurDioxide 0.0000775 0.0000258 0.0000775 0.0000258
## Sulphates -0.0093906 0.0063570 -0.0093906 0.0063570
## Alcohol 0.0041795 0.0016161 0.0041795 0.0016161
## LabelAppeal -0.1755275 0.0698556 -0.1755275 0.0698556
## AcidIndex 0.0619495 0.0245679 0.0619495 0.0245679
## STARS 0.9757061 0.0321180 0.9757061 0.0321180
## log(LabelAppeal + 3) 1.9955949 0.4205597 1.9955949 0.4205597
## I(AcidIndex^2) -0.0148400 0.0022804 -0.0148400 0.0022804
## I(STARS^2) -0.1383163 0.0067811 -0.1383163 0.0067811
From the output table we can see that the coefficients change only minimally, so observation 11289 is not influential.
We then use a rootogram to assess the fit, since it is a more informative diagnostic for count regression models.
# check the goodness of fit of the adjusted negative binomial model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.nb.imp.aj)
In the hanging rootogram, each bar hangs from the theoretical fit (the red curve), and its distance from the zero line shows the gap between expected and observed frequencies for that count. A bar dipping below 0 means the model under-predicts that count; a bar stopping above 0 means it over-predicts it. Note that the x-axis indexes the count values (0–8), not the predictors; nearly every count shows some degree of under- or over-fitting.
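If the default hanging display is hard to read, rootogram also accepts a display style and a cap on the plotted counts (a sketch; arguments as documented in countreg):
countreg::rootogram(fit.nb.imp.aj, style = "standing", max = 8)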
coef <- summary(fit.nb.imp.aj)$coefficients[,1]
var.name <- variable.names(fit.nb.imp.aj)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
 | coef | effects |
---|---|---|
(Intercept) | -3.22 | 0.04 |
VolatileAcidity | -0.04 | 0.96 |
CitricAcid | 0.01 | 1.01 |
Chlorides | -0.06 | 0.94 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
Sulphates | -0.01 | 0.99 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | -0.18 | 0.84 |
AcidIndex | 0.06 | 1.06 |
STARS | 0.98 | 2.65 |
log(LabelAppeal + 3) | 2.00 | 7.36 |
I(AcidIndex^2) | -0.01 | 0.99 |
I(STARS^2) | -0.14 | 0.87 |
# goodness of fit: pseudo R squared
(pR2_3 <- 1 - fit.nb.imp.aj$deviance / fit.nb.imp.aj$null.deviance)
## [1] 0.328
#or
#(pR2 <- 1 - logLik(fit.nb.imp.aj)/logLik(glm_null_nb))
# Log likelihood
(logLik_3<-logLik(fit.nb.imp.aj))
## 'log Lik.' -17367 (df=15)
# AIC
AIC_3 <- fit.nb.imp.aj$aic
BIC_3 <- BIC(fit.nb.imp.aj)
#use the model to predict mean counts on the test set, with standard errors for each prediction
ndf_3<- cbind(test,
Mean = predict(fit.nb.imp.aj, newdata=test, type="response"),
SE = predict(fit.nb.imp.aj, newdata=test, type="response", se.fit=T)$se.fit
)
evl_df_3 <- as.data.frame(aggregate(ndf_3[, 16], list(ndf_3$TARGET), mean))
evl_df_3 <- cbind(evl_df_3,aggregate(ndf_3[, 17], list(ndf_3$TARGET), mean)[,2])
colnames(evl_df_3) <- c('TARGET','Mean','SE')
evl_df_3
## TARGET Mean SE
## 1 0 1.93 0.0444
## 2 1 1.51 0.0439
## 3 2 2.07 0.0462
## 4 3 2.69 0.0545
## 5 4 3.46 0.0687
## 6 5 4.16 0.0873
## 7 6 4.93 0.1107
## 8 7 5.37 0.1334
## 9 8 5.24 0.1467
# goodness of fit: pseudo R squared
zeroinf_null <- zeroinfl(TARGET ~ 1, train, dist="poisson") # note: a dist="negbin" null would be the strictly comparable baseline here
(pR2_3z <- 1- logLik(fit.nb.imp_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.14
# Log likelihood
(logLik_3z<-logLik(fit.nb.imp_zeroinf))
## 'log Lik.' -15471 (df=31)
#dispersion
c_3z <- deviance(fit.nb.imp_zeroinf)/fit.nb.imp_zeroinf$df.residual
# AIC
AIC_3z <- AIC(fit.nb.imp_zeroinf)
BIC_3z <- BIC(fit.nb.imp_zeroinf)
#use the model to predict the count-model mean and the zero-inflation probability on the test set
ndf_3z<- cbind(test,
Count = predict(fit.nb.imp_zeroinf, newdata=test, type = "count"),
Zero = predict(fit.nb.imp_zeroinf, newdata=test, type = "zero")
)
evl_df_3z <- as.data.frame(aggregate(ndf_3z[, 16], list(ndf_3z$TARGET), mean))
evl_df_3z <- cbind(evl_df_3z,aggregate(ndf_3z[, 17], list(ndf_3z$TARGET), mean)[,2])
colnames(evl_df_3z) <- c('TARGET','Count','Zero')
evl_df_3z
## TARGET Count Zero
## 1 0 3.29 0.44449
## 2 1 2.28 0.18082
## 3 2 2.69 0.17539
## 4 3 3.16 0.16233
## 5 4 3.83 0.12009
## 6 5 4.54 0.08155
## 7 6 5.34 0.03505
## 8 7 6.09 0.02496
## 9 8 6.44 0.00638
As before, missing values were filled in by multivariate imputation via chained equations in Part 2. Now we will use the completed dataset to fit a linear (Gaussian) model.
glm_full_lm <- glm(TARGET ~ .,data=train, family = gaussian)
glm_null_lm <- glm(TARGET ~ 1,data=train, family = gaussian)
fit.lm.imp <- step(glm_full_lm,direction="backward",trace=F)
summary(fit.lm.imp)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = gaussian, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.807 -1.005 0.174 1.032 4.474
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.7999365 0.1161236 6.89 0.000000000006 ***
## VolatileAcidity -0.1215584 0.0187777 -6.47 0.000000000101 ***
## CitricAcid 0.0392234 0.0172436 2.27 0.02295 *
## Chlorides -0.1744872 0.0460703 -3.79 0.00015 ***
## FreeSulfurDioxide 0.0004523 0.0000990 4.57 0.000004942649 ***
## TotalSulfurDioxide 0.0002618 0.0000638 4.10 0.000041100043 ***
## pH -0.0392113 0.0217106 -1.81 0.07094 .
## Alcohol 0.0111803 0.0039823 2.81 0.00500 **
## LabelAppeal 0.4511456 0.0171990 26.23 < 0.0000000000000002 ***
## AcidIndex -0.2610259 0.0114903 -22.72 < 0.0000000000000002 ***
## STARS 1.1383662 0.0174134 65.37 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 2.04)
##
## Null deviance: 34969 on 9382 degrees of freedom
## Residual deviance: 19089 on 9372 degrees of freedom
## AIC: 33316
##
## Number of Fisher Scoring iterations: 2
#goodness-of-fit test
p_lm <-1-pchisq(summary(fit.lm.imp)$deviance, summary(fit.lm.imp)$df.residual)
cat("p-value:" ,p_lm,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_4 <- deviance(fit.lm.imp)/fit.lm.imp$df.residual
#c close to 1 indicates equidispersion; c substantially greater than 1 indicates overdispersion
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.lm.imp, n = 250) #library(DHARMa)
plot(simulationOutput)
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.lm.imp)) #library(faraway)
par(mfrow=c(3,5))
for(i in 2:15){
plotResiduals(train[,i], simulationOutput$scaledResiduals,xlab=colnames(train)[i])
}
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 5
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## DHARMa::plotResiduals - low number of unique predictor values, consider setting asFactor = T
## Warning in smooth.spline(pred, res, df = 10): not using invalid df; must
## have 1 < df <= n := #{unique x} = 4
The GOF test indicates that the Gaussian model does not fit the data (p < 0.05).
# outlier test
outlierTest(fit.lm.imp)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 5766 -3.37 0.000747 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.lm.imp)
## StudRes Hat CookD
## 5766 -3.37 0.00118 0.00122
## 8447 -1.79 0.00811 0.00238
## 11289 3.14 0.00455 0.00410
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 11289, which has the largest Cook's distance, is influential, I remove it from the model and compare the coefficients.
fit.lm.imp_11289 <- update(fit.lm.imp,subset=c(-11289))
compareCoefs(fit.lm.imp, fit.lm.imp_11289 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = gaussian, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = gaussian, data = train, subset = c(-11289))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 0.7999365 0.1161236 0.7999365 0.1161236
## VolatileAcidity -0.1215584 0.0187777 -0.1215584 0.0187777
## CitricAcid 0.0392234 0.0172436 0.0392234 0.0172436
## Chlorides -0.1744872 0.0460703 -0.1744872 0.0460703
## FreeSulfurDioxide 0.0004523 0.0000990 0.0004523 0.0000990
## TotalSulfurDioxide 0.0002618 0.0000638 0.0002618 0.0000638
## pH -0.0392113 0.0217106 -0.0392113 0.0217106
## Alcohol 0.0111803 0.0039823 0.0111803 0.0039823
## LabelAppeal 0.4511456 0.0171990 0.4511456 0.0171990
## AcidIndex -0.2610259 0.0114903 -0.2610259 0.0114903
## STARS 1.1383662 0.0174134 1.1383662 0.0174134
From the output table we can see that the coefficients change only minimally, so observation 11289 is not influential.
We then adjust the model by adding higher-order terms or a log transform for “LabelAppeal”, “AcidIndex” and “STARS” to see whether the fit improves.
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.lm.imp,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 2.237 0.135
## CitricAcid 0.193 0.660
## Chlorides 5.716 0.017
## FreeSulfurDioxide 0.030 0.862
## TotalSulfurDioxide 2.131 0.144
## pH 2.210 0.137
## Alcohol 7.377 0.007
## LabelAppeal 4.047 0.044
## AcidIndex 23.512 0.000
## STARS 403.613 0.000
glm_lm_full_aj <- glm(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = train,family=gaussian)
fit.lm.imp.aj <-step(glm_lm_full_aj,direction='backward',data = train, trace = FALSE)
residualPlots(fit.lm.imp.aj,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 1.727 0.189
## CitricAcid 0.121 0.728
## Chlorides 5.649 0.017
## FreeSulfurDioxide 0.143 0.705
## TotalSulfurDioxide 3.010 0.083
## pH 1.897 0.168
## Sulphates 1.051 0.305
## Alcohol 7.764 0.005
## LabelAppeal 0.081 0.776
## AcidIndex 0.000 1.000
## STARS 0.000 1.000
## I(AcidIndex^2) 47.095 0.000
## I(STARS^2) 76.025 0.000
summary(fit.lm.imp.aj)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Sulphates +
## Alcohol + LabelAppeal + AcidIndex + STARS + I(AcidIndex^2) +
## I(STARS^2), family = gaussian, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.881 -0.972 0.133 0.998 4.806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.6908343 0.1884270 -3.67 0.00025 ***
## VolatileAcidity -0.1185089 0.0185706 -6.38 0.00000000018 ***
## CitricAcid 0.0346696 0.0170564 2.03 0.04212 *
## Chlorides -0.1886135 0.0455670 -4.14 0.00003514795 ***
## FreeSulfurDioxide 0.0004492 0.0000979 4.59 0.00000450212 ***
## TotalSulfurDioxide 0.0002454 0.0000631 3.89 0.00010 ***
## pH -0.0321789 0.0214819 -1.50 0.13418
## Sulphates -0.0248422 0.0156354 -1.59 0.11213
## Alcohol 0.0136121 0.0039418 3.45 0.00056 ***
## LabelAppeal 0.4624279 0.0170248 27.16 < 0.0000000000000002 ***
## AcidIndex -0.0997936 0.0483298 -2.06 0.03896 *
## STARS 2.2516914 0.0801398 28.10 < 0.0000000000000002 ***
## I(AcidIndex^2) -0.0132245 0.0040381 -3.27 0.00106 **
## I(STARS^2) -0.2564967 0.0180314 -14.22 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.99)
##
## Null deviance: 34969 on 9382 degrees of freedom
## Residual deviance: 18659 on 9369 degrees of freedom
## AIC: 33108
##
## Number of Fisher Scoring iterations: 2
coef <- summary(fit.lm.imp.aj)$coefficients[,1]
var.name <- variable.names(fit.lm.imp.aj)
# note: this model uses the identity link, so exp(coef) has no multiplicative
# interpretation here; the raw coefficients are already additive effects
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
 | coef | effects |
---|---|---|
(Intercept) | -0.69 | 0.50 |
VolatileAcidity | -0.12 | 0.89 |
CitricAcid | 0.03 | 1.04 |
Chlorides | -0.19 | 0.83 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
pH | -0.03 | 0.97 |
Sulphates | -0.02 | 0.98 |
Alcohol | 0.01 | 1.01 |
LabelAppeal | 0.46 | 1.59 |
AcidIndex | -0.10 | 0.91 |
STARS | 2.25 | 9.50 |
I(AcidIndex^2) | -0.01 | 0.99 |
I(STARS^2) | -0.26 | 0.77 |
#goodness-of-fit test
(p_lm <- 1-pchisq(summary(fit.lm.imp)$deviance, summary(fit.lm.imp)$df.residual))
## [1] 0
(p_lm_aj <- 1-pchisq(summary(fit.lm.imp.aj)$deviance, summary(fit.lm.imp.aj)$df.residual))
## [1] 0
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_4 <-deviance(fit.lm.imp.aj)/fit.lm.imp.aj$df.residual
#c close to 1 indicates equidispersion; c substantially greater than 1 indicates overdispersion
For the Gaussian model this ratio (≈ 1.99) is simply the estimated residual variance, so the Poisson-style equidispersion check does not apply here.
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.lm.imp.aj, n = 250) #library(DHARMa)
plot(simulationOutput)
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.lm.imp.aj)) #library(faraway)
# outlier test
outlierTest(fit.lm.imp.aj)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 5766 -3.46 0.000535 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.lm.imp.aj)
## StudRes Hat CookD
## 2729 2.21 0.0399 0.01447
## 5766 -3.46 0.0014 0.00120
## 8447 -1.33 0.0414 0.00548
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. As before, I remove case 11289, which was flagged in the unadjusted model, and compare the coefficients.
fit.lm.imp_11289 <- update(fit.lm.imp.aj,subset=c(-11289))
compareCoefs(fit.lm.imp.aj, fit.lm.imp_11289 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + I(AcidIndex^2) + I(STARS^2), family =
## gaussian, data = train)
## 2: glm(formula = TARGET ~ VolatileAcidity + CitricAcid + Chlorides +
## FreeSulfurDioxide + TotalSulfurDioxide + pH + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS + I(AcidIndex^2) + I(STARS^2), family =
## gaussian, data = train, subset = c(-11289))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) -0.6908343 0.1884270 -0.6908343 0.1884270
## VolatileAcidity -0.1185089 0.0185706 -0.1185089 0.0185706
## CitricAcid 0.0346696 0.0170564 0.0346696 0.0170564
## Chlorides -0.1886135 0.0455670 -0.1886135 0.0455670
## FreeSulfurDioxide 0.0004492 0.0000979 0.0004492 0.0000979
## TotalSulfurDioxide 0.0002454 0.0000631 0.0002454 0.0000631
## pH -0.0321789 0.0214819 -0.0321789 0.0214819
## Sulphates -0.0248422 0.0156354 -0.0248422 0.0156354
## Alcohol 0.0136121 0.0039418 0.0136121 0.0039418
## LabelAppeal 0.4624279 0.0170248 0.4624279 0.0170248
## AcidIndex -0.0997936 0.0483298 -0.0997936 0.0483298
## STARS 2.2516914 0.0801398 2.2516914 0.0801398
## I(AcidIndex^2) -0.0132245 0.0040381 -0.0132245 0.0040381
## I(STARS^2) -0.2564967 0.0180314 -0.2564967 0.0180314
From the output table we can see that the coefficients change only minimally, so observation 11289 is not influential.
We again use a rootogram to assess the fit; note, however, that rootograms are designed for count models, so the display for a Gaussian fit should be read with caution.
# check the goodness of fit of the linear model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.lm.imp.aj)
# goodness of fit: pseudo R squared
(pR2_4 <- 1 - fit.lm.imp.aj$deviance / fit.lm.imp.aj$null.deviance)
## [1] 0.466
# Log likelihood
(logLik_4<-logLik(fit.lm.imp.aj))
## 'log Lik.' -16539 (df=15)
# AIC
AIC_4 <- fit.lm.imp.aj$aic
BIC_4 <- BIC(fit.lm.imp.aj)
#use the model to predict the mean response on the test set, with standard errors for each prediction
ndf_4<- cbind(test,
Mean = predict(fit.lm.imp.aj, newdata=test, type="response"),
SE = predict(fit.lm.imp.aj, newdata=test, type="response", se.fit=T)$se.fit
)
evl_df_4 <- as.data.frame(aggregate(ndf_4[, 16], list(ndf_4$TARGET), mean))
evl_df_4 <- cbind(evl_df_4,aggregate(ndf_4[, 17], list(ndf_4$TARGET), mean)[,2])
colnames(evl_df_4) <- c('TARGET','Mean','SE')
evl_df_4
## TARGET Mean SE
## 1 0 1.91 0.0550
## 2 1 1.39 0.0545
## 3 2 2.07 0.0512
## 4 3 2.71 0.0497
## 5 4 3.48 0.0503
## 6 5 4.14 0.0547
## 7 6 4.80 0.0566
## 8 7 5.25 0.0637
## 9 8 5.31 0.0683
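Collecting the statistics computed above into one table gives a quick side-by-side view (a sketch reusing the objects from the preceding chunks; the quasi-Poisson AIC/BIC are NA because quasi-likelihood defines no true likelihood):
data.frame(model    = c("quasi-Poisson", "negative binomial", "gaussian"),
           pseudoR2 = c(pR2_2, pR2_3, pR2_4),
           AIC      = c(AIC_2, AIC_3, AIC_4),
           BIC      = c(BIC_2, BIC_3, BIC_4))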
suppressMessages(suppressWarnings(library(mlogit)))
train_fct <- train
train_fct[,c('TARGET','LabelAppeal','STARS')] <- lapply(train_fct[,c('TARGET','LabelAppeal','STARS')] ,as.factor)
# Reshaping the data from wide to long format
mldata<-mlogit.data(train, varying=NULL, choice="TARGET", shape="wide")
# Multinomial logit model
fit.ml.imp <- mlogit(TARGET ~ 1 | FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS,data=mldata, reflevel=levels(train_fct$TARGET)[1])
summary(fit.ml.imp)
##
## Call:
## mlogit(formula = TARGET ~ 1 | FixedAcidity + VolatileAcidity +
## CitricAcid + ResidualSugar + Chlorides + FreeSulfurDioxide +
## TotalSulfurDioxide + Density + pH + Sulphates + Alcohol +
## LabelAppeal + AcidIndex + STARS, data = mldata, reflevel = levels(train_fct$TARGET)[1],
## method = "nr", print.level = 0)
##
## Frequencies of alternatives:
## 0 1 2 3 4 5 6 7 8
## 0.21262 0.01982 0.08601 0.20015 0.24726 0.16093 0.06064 0.01130 0.00128
##
## nr method
## 12 iterations, 0h:0m:29s
## g'(-H)^-1g = 0.000238
## successive function values within tolerance limits
##
## Coefficients :
## Estimate Std. Error t-value Pr(>|t|)
## 1:(intercept) 4.7870282 3.3059821 1.45 0.14762
## 2:(intercept) 3.4055895 1.8283995 1.86 0.06252
## 3:(intercept) 3.1215290 1.3896201 2.25 0.02468
## 4:(intercept) -0.4547947 1.3967838 -0.33 0.74473
## 5:(intercept) -4.7076611 1.6809479 -2.80 0.00510
## 6:(intercept) -9.8805514 2.3224478 -4.25 0.00002096387
## 7:(intercept) -25.0778856 4.5129474 -5.56 0.00000002746
## 8:(intercept) -72.1819360 17.1989239 -4.20 0.00002706092
## 1:FixedAcidity 0.0141287 0.0137268 1.03 0.30335
## 2:FixedAcidity 0.0075548 0.0076568 0.99 0.32380
## 3:FixedAcidity -0.0043701 0.0058176 -0.75 0.45254
## 4:FixedAcidity 0.0027660 0.0058309 0.47 0.63524
## 5:FixedAcidity -0.0028177 0.0070285 -0.40 0.68849
## 6:FixedAcidity 0.0113328 0.0099616 1.14 0.25527
## 7:FixedAcidity 0.0050450 0.0189260 0.27 0.78980
## 8:FixedAcidity 0.0143643 0.0551516 0.26 0.79452
## 1:VolatileAcidity 0.0276175 0.1066811 0.26 0.79573
## 2:VolatileAcidity -0.1549679 0.0602681 -2.57 0.01013
## 3:VolatileAcidity -0.2250579 0.0461910 -4.87 0.00000110289
## 4:VolatileAcidity -0.2000726 0.0463893 -4.31 0.00001611218
## 5:VolatileAcidity -0.2852203 0.0561188 -5.08 0.00000037263
## 6:VolatileAcidity -0.3984951 0.0787249 -5.06 0.00000041517
## 7:VolatileAcidity -0.4451098 0.1507908 -2.95 0.00316
## 8:VolatileAcidity -0.0507834 0.4425565 -0.11 0.90864
## 1:CitricAcid -0.0146047 0.0988840 -0.15 0.88258
## 2:CitricAcid 0.0974399 0.0558488 1.74 0.08104
## 3:CitricAcid 0.0502363 0.0426221 1.18 0.23854
## 4:CitricAcid 0.1072923 0.0428707 2.50 0.01233
## 5:CitricAcid 0.0169388 0.0512751 0.33 0.74113
## 6:CitricAcid 0.1527696 0.0706395 2.16 0.03057
## 7:CitricAcid 0.0540372 0.1371310 0.39 0.69354
## 8:CitricAcid 0.1346172 0.4588807 0.29 0.76925
## 1:ResidualSugar 0.0011370 0.0025845 0.44 0.65999
## 2:ResidualSugar 0.0001580 0.0014225 0.11 0.91153
## 3:ResidualSugar 0.0000431 0.0010792 0.04 0.96811
## 4:ResidualSugar -0.0000473 0.0010836 -0.04 0.96521
## 5:ResidualSugar -0.0009913 0.0012948 -0.77 0.44390
## 6:ResidualSugar -0.0041261 0.0017897 -2.31 0.02114
## 7:ResidualSugar -0.0015363 0.0035033 -0.44 0.66101
## 8:ResidualSugar -0.0006326 0.0113460 -0.06 0.95553
## 1:Chlorides 0.3012341 0.2579027 1.17 0.24280
## 2:Chlorides -0.1525528 0.1473225 -1.04 0.30043
## 3:Chlorides -0.2380903 0.1124278 -2.12 0.03420
## 4:Chlorides -0.3902899 0.1135848 -3.44 0.00059
## 5:Chlorides -0.4408292 0.1374200 -3.21 0.00134
## 6:Chlorides -0.7137845 0.1946205 -3.67 0.00024
## 7:Chlorides -0.2136066 0.3667652 -0.58 0.56029
## 8:Chlorides -1.2297747 1.0431889 -1.18 0.23845
## 1:FreeSulfurDioxide 0.0003433 0.0005732 0.60 0.54919
## 2:FreeSulfurDioxide 0.0005711 0.0003201 1.78 0.07438
## 3:FreeSulfurDioxide 0.0009415 0.0002467 3.82 0.00013
## 4:FreeSulfurDioxide 0.0009277 0.0002484 3.73 0.00019
## 5:FreeSulfurDioxide 0.0013386 0.0002971 4.51 0.00000663132
## 6:FreeSulfurDioxide 0.0013485 0.0004109 3.28 0.00103
## 7:FreeSulfurDioxide 0.0008096 0.0007822 1.04 0.30065
## 8:FreeSulfurDioxide 0.0040577 0.0020075 2.02 0.04325
## 1:TotalSulfurDioxide 0.0008029 0.0003749 2.14 0.03223
## 2:TotalSulfurDioxide 0.0011688 0.0002052 5.70 0.00000001227
## 3:TotalSulfurDioxide 0.0008090 0.0001571 5.15 0.00000026053
## 4:TotalSulfurDioxide 0.0006021 0.0001583 3.80 0.00014
## 5:TotalSulfurDioxide 0.0005527 0.0001905 2.90 0.00371
## 6:TotalSulfurDioxide 0.0003578 0.0002675 1.34 0.18094
## 7:TotalSulfurDioxide 0.0004065 0.0005247 0.77 0.43854
## 8:TotalSulfurDioxide 0.0016133 0.0015107 1.07 0.28554
## 1:Density 1.1729196 3.2580898 0.36 0.71885
## 2:Density 2.1358931 1.8001822 1.19 0.23543
## 3:Density -0.7493329 1.3690747 -0.55 0.58415
## 4:Density -1.1796625 1.3754673 -0.86 0.39109
## 5:Density -2.7984471 1.6544651 -1.69 0.09075
## 6:Density -4.4633322 2.2886590 -1.95 0.05115
## 7:Density -0.8950092 4.3676843 -0.20 0.83764
## 8:Density 27.7019279 13.7868518 2.01 0.04451
## 1:pH -0.1110001 0.1259424 -0.88 0.37813
## 2:pH -0.2221153 0.0698344 -3.18 0.00147
## 3:pH -0.1427254 0.0530212 -2.69 0.00711
## 4:pH -0.1455688 0.0535361 -2.72 0.00655
## 5:pH -0.0853234 0.0648453 -1.32 0.18824
## 6:pH -0.1337259 0.0913237 -1.46 0.14311
## 7:pH 0.5931078 0.1714457 3.46 0.00054
## 8:pH -0.0242286 0.6296618 -0.04 0.96931
## 1:Sulphates -0.0469788 0.0900183 -0.52 0.60175
## 2:Sulphates -0.0697013 0.0511690 -1.36 0.17314
## 3:Sulphates -0.0836028 0.0390217 -2.14 0.03216
## 4:Sulphates -0.1055276 0.0392502 -2.69 0.00718
## 5:Sulphates -0.0580567 0.0471147 -1.23 0.21786
## 6:Sulphates -0.0754956 0.0657450 -1.15 0.25084
## 7:Sulphates -0.1033194 0.1237260 -0.84 0.40368
## 8:Sulphates 0.2172434 0.3170582 0.69 0.49323
## 1:Alcohol -0.0755915 0.0228737 -3.30 0.00095
## 2:Alcohol -0.0656105 0.0127449 -5.15 0.00000026329
## 3:Alcohol -0.0361017 0.0097415 -3.71 0.00021
## 4:Alcohol -0.0033549 0.0098544 -0.34 0.73352
## 5:Alcohol 0.0488693 0.0119669 4.08 0.00004432430
## 6:Alcohol 0.0861031 0.0169369 5.08 0.00000037002
## 7:Alcohol 0.1384143 0.0330889 4.18 0.00002875531
## 8:Alcohol 0.0362501 0.0899947 0.40 0.68709
## 1:LabelAppeal -3.2147837 0.1489685 -21.58 < 0.0000000000000002
## 2:LabelAppeal -2.1028377 0.0766622 -27.43 < 0.0000000000000002
## 3:LabelAppeal -0.8732574 0.0512919 -17.03 < 0.0000000000000002
## 4:LabelAppeal 0.3002931 0.0503210 5.97 0.00000000241
## 5:LabelAppeal 1.3487300 0.0643861 20.95 < 0.0000000000000002
## 6:LabelAppeal 2.4205164 0.0958261 25.26 < 0.0000000000000002
## 7:LabelAppeal 3.4036200 0.1968572 17.29 < 0.0000000000000002
## 8:LabelAppeal 6.2300415 1.0978081 5.67 0.00000001387
## 1:AcidIndex -0.2491778 0.0691287 -3.60 0.00031
## 2:AcidIndex -0.3927096 0.0400295 -9.81 < 0.0000000000000002
## 3:AcidIndex -0.3868888 0.0289201 -13.38 < 0.0000000000000002
## 4:AcidIndex -0.5075611 0.0297350 -17.07 < 0.0000000000000002
## 5:AcidIndex -0.5742915 0.0369153 -15.56 < 0.0000000000000002
## 6:AcidIndex -0.6531131 0.0538578 -12.13 < 0.0000000000000002
## 7:AcidIndex -0.6459091 0.1032328 -6.26 0.00000000039
## 8:AcidIndex -0.7485220 0.2872183 -2.61 0.00916
## 1:STARS 0.7690696 0.1856058 4.14 0.00003419488
## 2:STARS 1.5022602 0.0927297 16.20 < 0.0000000000000002
## 3:STARS 2.0263099 0.0756487 26.79 < 0.0000000000000002
## 4:STARS 2.6079573 0.0753535 34.61 < 0.0000000000000002
## 5:STARS 3.2832005 0.0821925 39.95 < 0.0000000000000002
## 6:STARS 3.9495261 0.0987426 40.00 < 0.0000000000000002
## 7:STARS 4.8452709 0.1775129 27.30 < 0.0000000000000002
## 8:STARS 6.4768358 0.7639198 8.48 < 0.0000000000000002
##
## 1:(intercept)
## 2:(intercept) .
## 3:(intercept) *
## 4:(intercept)
## 5:(intercept) **
## 6:(intercept) ***
## 7:(intercept) ***
## 8:(intercept) ***
## 1:FixedAcidity
## 2:FixedAcidity
## 3:FixedAcidity
## 4:FixedAcidity
## 5:FixedAcidity
## 6:FixedAcidity
## 7:FixedAcidity
## 8:FixedAcidity
## 1:VolatileAcidity
## 2:VolatileAcidity *
## 3:VolatileAcidity ***
## 4:VolatileAcidity ***
## 5:VolatileAcidity ***
## 6:VolatileAcidity ***
## 7:VolatileAcidity **
## 8:VolatileAcidity
## 1:CitricAcid
## 2:CitricAcid .
## 3:CitricAcid
## 4:CitricAcid *
## 5:CitricAcid
## 6:CitricAcid *
## 7:CitricAcid
## 8:CitricAcid
## 1:ResidualSugar
## 2:ResidualSugar
## 3:ResidualSugar
## 4:ResidualSugar
## 5:ResidualSugar
## 6:ResidualSugar *
## 7:ResidualSugar
## 8:ResidualSugar
## 1:Chlorides
## 2:Chlorides
## 3:Chlorides *
## 4:Chlorides ***
## 5:Chlorides **
## 6:Chlorides ***
## 7:Chlorides
## 8:Chlorides
## 1:FreeSulfurDioxide
## 2:FreeSulfurDioxide .
## 3:FreeSulfurDioxide ***
## 4:FreeSulfurDioxide ***
## 5:FreeSulfurDioxide ***
## 6:FreeSulfurDioxide **
## 7:FreeSulfurDioxide
## 8:FreeSulfurDioxide *
## 1:TotalSulfurDioxide *
## 2:TotalSulfurDioxide ***
## 3:TotalSulfurDioxide ***
## 4:TotalSulfurDioxide ***
## 5:TotalSulfurDioxide **
## 6:TotalSulfurDioxide
## 7:TotalSulfurDioxide
## 8:TotalSulfurDioxide
## 1:Density
## 2:Density
## 3:Density
## 4:Density
## 5:Density .
## 6:Density .
## 7:Density
## 8:Density *
## 1:pH
## 2:pH **
## 3:pH **
## 4:pH **
## 5:pH
## 6:pH
## 7:pH ***
## 8:pH
## 1:Sulphates
## 2:Sulphates
## 3:Sulphates *
## 4:Sulphates **
## 5:Sulphates
## 6:Sulphates
## 7:Sulphates
## 8:Sulphates
## 1:Alcohol ***
## 2:Alcohol ***
## 3:Alcohol ***
## 4:Alcohol
## 5:Alcohol ***
## 6:Alcohol ***
## 7:Alcohol ***
## 8:Alcohol
## 1:LabelAppeal ***
## 2:LabelAppeal ***
## 3:LabelAppeal ***
## 4:LabelAppeal ***
## 5:LabelAppeal ***
## 6:LabelAppeal ***
## 7:LabelAppeal ***
## 8:LabelAppeal ***
## 1:AcidIndex ***
## 2:AcidIndex ***
## 3:AcidIndex ***
## 4:AcidIndex ***
## 5:AcidIndex ***
## 6:AcidIndex ***
## 7:AcidIndex ***
## 8:AcidIndex **
## 1:STARS ***
## 2:STARS ***
## 3:STARS ***
## 4:STARS ***
## 5:STARS ***
## 6:STARS ***
## 7:STARS ***
## 8:STARS ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Log-Likelihood: -12200
## McFadden R^2: 0.282
## Likelihood ratio test : chisq = 9570 (p.value = <0.0000000000000002)
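Each coefficient block is a separate log-odds equation against the reference level (TARGET = 0), so exponentiating a coefficient gives an odds multiplier. A sketch, using a coefficient name as printed in the summary above:
exp(coef(fit.ml.imp)["4:STARS"]) # odds of TARGET = 4 (vs. 0) per additional star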
# goodness of fit: pseudo R squared
# McFadden's R squared
ml_null <-mlogit(TARGET ~ 1,data=mldata, reflevel=levels(train_fct$TARGET)[1])
(pR2_5 <- 1- logLik(fit.ml.imp)[1]/logLik(ml_null)[1])
## [1] 0.282
# Log likelihood
# (these lines reference fit.lm.imp.aj from the previous section; for the
# multinomial model itself, logLik(fit.ml.imp), AIC(fit.ml.imp) and
# BIC(fit.ml.imp) would be the corresponding quantities)
(logLik_5<-logLik(fit.lm.imp.aj))
## 'log Lik.' -16539 (df=15)
# AIC
AIC_5 <- fit.lm.imp.aj$aic
BIC_5 <- BIC(fit.lm.imp.aj)
# fitted(..., outcome = F) returns the full matrix of class probabilities;
# fitted(..., outcome = T) returns the probability of the *observed* outcome,
# so matching b back into a recovers the observed class by construction
a<-as.data.frame(fitted(fit.ml.imp,type = "outcome",outcome = F) )
b<-fitted(fit.ml.imp,type = "outcome",outcome = T)
pred <- list()
for(i in 1:length(b)){
pred <- append(pred,names(a[i,])[apply(a[i,], 1, function(x) which(x == b[i]))])
}
ndf_5<- cbind(train, unlist(pred),b)
colnames(ndf_5)[which(colnames(ndf_5)=='unlist(pred)')]<-'pred'
ndf_5$pred <- as.numeric(as.character(ndf_5$pred))
evl_df_5 <- as.data.frame(aggregate(ndf_5[, 16], list(ndf_5$TARGET), mean))
colnames(evl_df_5) <- c('TARGET','pred')
evl_df_5
## TARGET pred
## 1 0 0
## 2 1 1
## 3 2 2
## 4 3 3
## 5 4 4
## 6 5 5
## 7 6 6
## 8 7 7
## 9 8 8
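The perfect agreement above is therefore expected and does not measure predictive accuracy. A minimal sketch of a genuine class prediction, picking the most probable class from the probability matrix a built above:
pred_class <- colnames(a)[max.col(as.matrix(a))] # most probable TARGET level per row
table(observed = train$TARGET, predicted = pred_class)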
Simple stepwise selection will be used for attribute selection, and we will build 3 models for each of the two sets, plus a “zeroinfl” negative binomial, yielding a total of 7 models:
wine_training_data.zeros <- wine_training_data
wine_training_data.nozeros <- wine_training_data[wine_training_data$STARS != 0,]
cutoff.zeros <- floor(nrow(wine_training_data.zeros)*.75) # floor() keeps the cutoff integral
wine_training_data.zeros.train <- wine_training_data.zeros[1:cutoff.zeros,]
wine_training_data.zeros.test <- wine_training_data.zeros[(cutoff.zeros+1):nrow(wine_training_data.zeros),]
cutoff.nozeros <- floor(nrow(wine_training_data.nozeros)*.75)
wine_training_data.nozeros.train <- wine_training_data.nozeros[1:cutoff.nozeros,]
wine_training_data.nozeros.test <- wine_training_data.nozeros[(cutoff.nozeros+1):nrow(wine_training_data.nozeros),]
#wine_training_data.zeros <- do_factors(wine_training_data.zeros)
#wine_training_data.nozeros <- do_factors(wine_training_data.nozeros)
#wine_training_data.zeros.train <- do_factors(wine_training_data.zeros.train)
#wine_training_data.zeros.test <- do_factors(wine_training_data.zeros.test)
#wine_training_data.nozeros.train <- do_factors(wine_training_data.nozeros.train)
#wine_training_data.nozeros.test <- do_factors(wine_training_data.nozeros.test)
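A sequential 75/25 split like the one above assumes the rows are already in random order; a randomized split avoids that assumption (a sketch, with a hypothetical seed):
set.seed(123) # hypothetical seed, for reproducibility
idx <- sample(nrow(wine_training_data.zeros), floor(0.75 * nrow(wine_training_data.zeros)))
zeros.train.rnd <- wine_training_data.zeros[idx, ]
zeros.test.rnd <- wine_training_data.zeros[-idx, ]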
Summaries for these Models are below:
(To illustrate our point regarding coefficients, we show summaries for the two negative binomial models; summaries of all 7 models are located in the appendix.)
A Poisson GLM and a zero-inflated Poisson regression will be fit using the dataset in which NAs are filled with zeros.
fit.poisson.zeros <- step(glm(TARGET ~ . , family = poisson, data = wine_training_data.zeros.train), trace = FALSE)
summary(fit.poisson.zeros)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = poisson, data = wine_training_data.zeros.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.987 -0.696 0.071 0.578 3.205
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2255668 0.0556031 22.04 < 0.0000000000000002 ***
## VolatileAcidity -0.0351524 0.0075417 -4.66 0.0000031 ***
## FreeSulfurDioxide 0.0001297 0.0000398 3.26 0.0011 **
## TotalSulfurDioxide 0.0000721 0.0000254 2.84 0.0046 **
## pH -0.0129234 0.0087058 -1.48 0.1377
## Sulphates -0.0118122 0.0062359 -1.89 0.0582 .
## Alcohol 0.0026729 0.0015981 1.67 0.0944 .
## LabelAppeal 0.1382714 0.0070001 19.75 < 0.0000000000000002 ***
## AcidIndex -0.0854964 0.0051553 -16.58 < 0.0000000000000002 ***
## STARS 0.3079500 0.0052369 58.80 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 17042 on 9595 degrees of freedom
## Residual deviance: 11034 on 9586 degrees of freedom
## AIC: 35048
##
## Number of Fisher Scoring iterations: 5
#goodness-of-fit test
p_poisson_zero <- 1-pchisq(summary(fit.poisson.zeros)$deviance, summary(fit.poisson.zeros)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_6<- deviance(fit.poisson.zeros)/fit.poisson.zeros$df.residual
dispersiontest(fit.poisson.zeros) # library(AER)
##
## Overdispersion test
##
## data: fit.poisson.zeros
## z = -10, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.865
#c close to 1 indicates equidispersion; c substantially greater than 1 indicates overdispersion
#Check for zero inflation
fit.poisson.zeros_zeroinf <- zeroinfl(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.zeros.train, dist="poisson") #library(pscl)
AIC(fit.poisson.zeros, fit.poisson.zeros_zeroinf)
## df AIC
## fit.poisson.zeros 10 35048
## fit.poisson.zeros_zeroinf 30 30759
summary(fit.poisson.zeros_zeroinf)
##
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = wine_training_data.zeros.train, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.12031 -0.40961 -0.00471 0.36947 5.90173
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4523000 0.2321671 6.26 0.0000000004 ***
## FixedAcidity 0.0004044 0.0009795 0.41 0.680
## VolatileAcidity -0.0125236 0.0077475 -1.62 0.106
## CitricAcid -0.0015902 0.0069195 -0.23 0.818
## ResidualSugar -0.0000348 0.0001792 -0.19 0.846
## Chlorides -0.0126578 0.0191607 -0.66 0.509
## FreeSulfurDioxide 0.0000406 0.0000403 1.01 0.314
## TotalSulfurDioxide -0.0000132 0.0000255 -0.52 0.604
## Density -0.3125233 0.2280031 -1.37 0.170
## pH 0.0090692 0.0089678 1.01 0.312
## Sulphates -0.0015214 0.0064036 -0.24 0.812
## Alcohol 0.0068260 0.0016343 4.18 0.0000295754 ***
## LabelAppeal 0.2336540 0.0072663 32.16 < 0.0000000000000002 ***
## AcidIndex -0.0173737 0.0056128 -3.10 0.002 **
## STARS 0.1005935 0.0059930 16.79 < 0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.133307 1.513854 -2.73 0.00633 **
## FixedAcidity 0.002360 0.006395 0.37 0.71209
## VolatileAcidity 0.183783 0.050341 3.65 0.00026 ***
## CitricAcid -0.053114 0.045920 -1.16 0.24741
## ResidualSugar -0.000896 0.001163 -0.77 0.44091
## Chlorides -0.002500 0.123390 -0.02 0.98384
## FreeSulfurDioxide -0.000654 0.000271 -2.41 0.01590 *
## TotalSulfurDioxide -0.000798 0.000172 -4.64 0.0000034 ***
## Density 0.117083 1.491072 0.08 0.93741
## pH 0.215505 0.058039 3.71 0.00020 ***
## Sulphates 0.120421 0.041803 2.88 0.00397 **
## Alcohol 0.028319 0.010613 2.67 0.00762 **
## LabelAppeal 0.673191 0.048865 13.78 < 0.0000000000000002 ***
## AcidIndex 0.439117 0.029592 14.84 < 0.0000000000000002 ***
## STARS -2.340599 0.068029 -34.41 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 38
## Log-likelihood: -1.53e+04 on 30 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.zeros)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.zeros, n = 250) #library(DHARMa)
plot(simulationOutput)
The deviance-based GOF test still rejects (p < 0.05), although the dispersion test above finds no evidence of overdispersion (estimated dispersion 0.865).
# outlier test
outlierTest(fit.poisson.zeros)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 8887 3.21 0.00134 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.poisson.zeros)
## StudRes Hat CookD
## 1630 -1.23 0.00927 0.00122
## 3953 2.60 0.00227 0.00245
## 8887 3.21 0.00059 0.00108
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 1630, which has the largest hat-value (leverage), is influential, I remove it and compare the refitted coefficients.
fit.poisson.zeros_1630 <- update(fit.poisson.zeros,subset=c(-1630))
compareCoefs(fit.poisson.zeros, fit.poisson.zeros_1630 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = poisson, data =
## wine_training_data.zeros.train)
## 2: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS, family = poisson, data =
## wine_training_data.zeros.train, subset = c(-1630))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 1.2255668 0.0556031 1.2265651 0.0556119
## VolatileAcidity -0.0351524 0.0075417 -0.0357254 0.0075578
## FreeSulfurDioxide 0.0001297 0.0000398 0.0001296 0.0000398
## TotalSulfurDioxide 0.0000721 0.0000254 0.0000712 0.0000254
## pH -0.0129234 0.0087058 -0.0128748 0.0087054
## Sulphates -0.0118122 0.0062359 -0.0119223 0.0062362
## Alcohol 0.0026729 0.0015981 0.0027332 0.0015988
## LabelAppeal 0.1382714 0.0070001 0.1383440 0.0070001
## AcidIndex -0.0854964 0.0051553 -0.0856982 0.0051590
## STARS 0.3079500 0.0052369 0.3080394 0.0052372
From the output table we can see that the coefficients change only minimally, so observation 1630 is not influential.
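The same screening can be done directly with cooks.distance(); the 4/n cutoff below is a common rule of thumb, added here as an illustration rather than part of the original analysis.
cd <- cooks.distance(fit.poisson.zeros)
head(sort(cd, decreasing = TRUE), 3) #largest Cook's distances
which(cd > 4 / length(cd))           #cases above the rough 4/n cutoff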
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.poisson.zeros,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 0.910 0.340
## FreeSulfurDioxide 0.124 0.725
## TotalSulfurDioxide 0.570 0.450
## pH 0.633 0.426
## Sulphates 0.079 0.779
## Alcohol 1.369 0.242
## LabelAppeal 21.183 0.000
## AcidIndex 43.615 0.000
## STARS 666.096 0.000
glm_full_aj <- glm(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), family = poisson, data = wine_training_data.zeros.train)
fit.poisson.zeros_aj <-step(glm_full_aj,direction='backward',family = poisson, data = wine_training_data.zeros.train, trace = FALSE)
residualPlots(fit.poisson.zeros_aj,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity 0.468 0.494
## FreeSulfurDioxide 0.021 0.884
## TotalSulfurDioxide 0.669 0.413
## Sulphates 0.000 0.999
## Alcohol 0.822 0.365
## LabelAppeal 0.209 0.648
## AcidIndex 0.000 1.000
## STARS 0.000 1.000
## log(LabelAppeal + 3) 0.145 0.703
## I(AcidIndex^2) 13.065 0.000
## I(STARS^2) 91.112 0.000
summary(fit.poisson.zeros_aj)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS + log(LabelAppeal + 3) + I(AcidIndex^2) +
## I(STARS^2), family = poisson, data = wine_training_data.zeros.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.213 -0.659 -0.002 0.493 3.564
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1982206 0.1812114 -1.09 0.2740
## VolatileAcidity -0.0324472 0.0075509 -4.30 0.000017301
## FreeSulfurDioxide 0.0000952 0.0000397 2.40 0.0165
## TotalSulfurDioxide 0.0000613 0.0000254 2.41 0.0160
## Sulphates -0.0121488 0.0062274 -1.95 0.0511
## Alcohol 0.0038121 0.0015964 2.39 0.0169
## LabelAppeal 0.0669199 0.0306619 2.18 0.0291
## AcidIndex 0.1187903 0.0369791 3.21 0.0013
## STARS 0.7040393 0.0171154 41.13 < 0.0000000000000002
## log(LabelAppeal + 3) 0.2670993 0.0867050 3.08 0.0021
## I(AcidIndex^2) -0.0118800 0.0022055 -5.39 0.000000072
## I(STARS^2) -0.1042046 0.0042411 -24.57 < 0.0000000000000002
##
## (Intercept)
## VolatileAcidity ***
## FreeSulfurDioxide *
## TotalSulfurDioxide *
## Sulphates .
## Alcohol *
## LabelAppeal *
## AcidIndex **
## STARS ***
## log(LabelAppeal + 3) **
## I(AcidIndex^2) ***
## I(STARS^2) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 17042 on 9595 degrees of freedom
## Residual deviance: 10329 on 9584 degrees of freedom
## AIC: 34347
##
## Number of Fisher Scoring iterations: 6
coef <- summary(fit.poisson.zeros_aj)$coefficients[,1]
var.name <- variable.names(fit.poisson.zeros_aj)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
 | coef | effects |
---|---|---|
(Intercept) | -0.20 | 0.82 |
VolatileAcidity | -0.03 | 0.97 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
Sulphates | -0.01 | 0.99 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | 0.07 | 1.07 |
AcidIndex | 0.12 | 1.13 |
STARS | 0.70 | 2.02 |
log(LabelAppeal + 3) | 0.27 | 1.31 |
I(AcidIndex^2) | -0.01 | 0.99 |
I(STARS^2) | -0.10 | 0.90 |
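Because STARS also enters the adjusted model through I(STARS^2), exp(0.70) alone overstates its effect; the rate ratio for one extra star depends on the current star level. A worked example with the coefficients above:
#rate ratio for moving from s to s+1 stars: exp(b1 + b2*(2*s + 1))
b1 <- 0.7040393; b2 <- -0.1042046
s <- 2
exp(b1 + b2 * (2 * s + 1)) #~1.20: going from 2 to 3 stars raises expected cases by ~20%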
#goodness-of-fit test
p_poisson_zero_aj <- 1-pchisq(summary(fit.poisson.zeros_aj)$deviance, summary(fit.poisson.zeros_aj)$df.residual)
cat("p-value_aj:" ,p_poisson_aj,"\n\n")
## p-value_aj: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_6 <-deviance(fit.poisson.zeros_aj)/fit.poisson.zeros_aj$df.residual
dispersiontest(fit.poisson.zeros_aj) # library(AER)
##
## Overdispersion test
##
## data: fit.poisson.zeros_aj
## z = -9, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.876
#c close to 1 indicates equidispersion; c substantially > 1 suggests overdispersion
Here c = 10329/9584 ≈ 1.08, close to 1, and the formal test (dispersion estimate 0.876, p = 1) does not reject equidispersion.
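The Pearson-based dispersion estimate can also be computed by hand as a quick cross-check; values near 1 indicate equidispersion.
phi_hat <- sum(residuals(fit.poisson.zeros_aj, type = "pearson")^2) /
  df.residual(fit.poisson.zeros_aj)
phi_hat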
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.zeros_aj, n = 250) #library(DHARMa)
plot(simulationOutput)
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.zeros_aj)) #library(faraway)
# outlier test
outlierTest(fit.poisson.zeros_aj)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 8887 3.57 0.000362 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.poisson.zeros_aj)
## StudRes Hat CookD
## 2729 2.74 0.033751 0.03931
## 8887 3.57 0.000692 0.00143
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 2729, which has the largest Cook's distance, is influential, I remove it and compare the refitted coefficients.
fit.poisson.zeros_aj_2729 <- update(fit.poisson.zeros_aj,subset=c(-2729))
compareCoefs(fit.poisson.zeros_aj, fit.poisson.zeros_aj_2729 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), family =
## poisson, data = wine_training_data.zeros.train)
## 2: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), family =
## poisson, data = wine_training_data.zeros.train, subset = c(-2729))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) -0.1982206 0.1812114 -0.2917920 0.1843297
## VolatileAcidity -0.0324472 0.0075509 -0.0324703 0.0075515
## FreeSulfurDioxide 0.0000952 0.0000397 0.0000956 0.0000397
## TotalSulfurDioxide 0.0000613 0.0000254 0.0000608 0.0000254
## Sulphates -0.0121488 0.0062274 -0.0121160 0.0062275
## Alcohol 0.0038121 0.0015964 0.0038365 0.0015966
## LabelAppeal 0.0669199 0.0306619 0.0651507 0.0306764
## AcidIndex 0.1187903 0.0369791 0.1419165 0.0379481
## STARS 0.7040393 0.0171154 0.7030149 0.0171176
## log(LabelAppeal + 3) 0.2670993 0.0867050 0.2716263 0.0867434
## I(AcidIndex^2) -0.0118800 0.0022055 -0.0133409 0.0022713
## I(STARS^2) -0.1042046 0.0042411 -0.1040184 0.0042416
From the output table we can see that the coefficients change only minimally, so observation 2729 is not influential.
We then use a rootogram to assess the fit, as it is a more informative visual goodness-of-fit check for a count regression model.
# check the goodness of fit of the poisson model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.poisson.zeros_aj)
Each bar hangs from the theoretical Poisson fit (red curve) down by the square root of the observed frequency, and the x-axis shows the counts 0-8, not the predictors. A bar that dips below 0 is a count the model under-predicts; a bar that stops short of 0 is a count it over-predicts. Several counts show noticeable misfit in one direction or the other.
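The quantities behind the rootogram can be reconstructed directly, as a sketch: the expected frequency of each count is the sum of Poisson probabilities across the fitted means.
obs <- table(factor(wine_training_data.zeros.train$TARGET, levels = 0:8))
mu <- fitted(fit.poisson.zeros_aj)
exp_k <- sapply(0:8, function(k) sum(dpois(k, mu))) #expected frequency of count k
round(rbind(observed = as.numeric(obs), expected = exp_k))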
# goodness of fit: pseudo R squared
(pR2_zero_1 <- 1 - fit.poisson.zeros_aj$deviance / fit.poisson.zeros_aj$null.deviance)
## [1] 0.394
#or, equivalently, via log-likelihoods against a null model:
#(pR2 <- 1 - logLik(fit)[1]/logLik(fit.null)[1])
# Log likelihood
(logLik_zero_1 <-logLik(fit.poisson.zeros_aj))
## 'log Lik.' -17161 (df=12)
# AIC
AIC_zero_1 <- fit.poisson.zeros_aj$aic
BIC_zero_1 <- BIC(fit.poisson.zeros_aj)
#use the model to predict mean counts and their standard errors on the test set
ndf_zero_1<- cbind(wine_training_data.zeros.test,
Mean = predict(fit.poisson.zeros_aj, newdata=wine_training_data.zeros.test, type="response"),
SE = predict(fit.poisson.zeros_aj, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
)
evl_df_zero_1 <- as.data.frame(aggregate(ndf_zero_1[, 17], list(ndf_zero_1$TARGET), mean))
evl_df_zero_1 <- cbind(evl_df_zero_1,aggregate(ndf_zero_1[, 18], list(ndf_zero_1$TARGET), mean)[,2])
colnames(evl_df_zero_1) <- c('TARGET','Mean','SE')
# goodness of fit: pseudo R squared (vs. an intercept-only zero-inflated model)
zeroinf_null_zeros <- zeroinfl(TARGET ~ 1, data=wine_training_data.zeros.train, dist="poisson")
(pR2_zero_1z <- 1- logLik(fit.poisson.zeros_zeroinf)[1]/logLik(zeroinf_null_zeros)[1])
# Log likelihood
(logLik_zero_1z<-logLik(fit.poisson.zeros_zeroinf))
## 'log Lik.' -15349 (df=30)
#dispersion
c_zero_1z <- deviance(fit.poisson.zeros_zeroinf)/fit.poisson.zeros_zeroinf$df.residual
# AIC
AIC_zero_1z <- AIC(fit.poisson.zeros_zeroinf)
BIC_zero_1z <- BIC(fit.poisson.zeros_zeroinf)
#use the model to predict the count-part mean and the zero-part probability on the test set
ndf_zero_1z<- cbind(wine_training_data.zeros.test,
Count = predict(fit.poisson.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "count"),
Zero = predict(fit.poisson.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "zero")
)
evl_df_zero_1z <- as.data.frame(aggregate(ndf_zero_1z[, 17], list(ndf_zero_1z$TARGET), mean))
evl_df_zero_1z <- cbind(evl_df_zero_1z,aggregate(ndf_zero_1z[, 18], list(ndf_zero_1z$TARGET), mean)[,2])
colnames(evl_df_zero_1z) <- c('TARGET','Count','Zero')
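The two parts combine into the overall predicted mean, E[TARGET] = (1 - pi) * mu, which is also what predict(..., type = "response") would return; the sketch below adds that combined column.
ndf_zero_1z$Expected <- (1 - ndf_zero_1z$Zero) * ndf_zero_1z$Count
head(ndf_zero_1z[, c("TARGET", "Count", "Zero", "Expected")])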
fit.poisson.nozeros <- step(glm(TARGET ~ . , family = poisson, data = wine_training_data.nozeros.train), trace = FALSE)
summary(fit.poisson.nozeros)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## Alcohol + LabelAppeal + AcidIndex + STARS, family = poisson,
## data = wine_training_data.nozeros.train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.228 -0.275 0.070 0.374 1.727
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1958931 0.0505635 23.65 < 0.0000000000000002 ***
## VolatileAcidity -0.0271442 0.0079918 -3.40 0.00068 ***
## FreeSulfurDioxide 0.0000892 0.0000421 2.12 0.03405 *
## Alcohol 0.0041172 0.0016916 2.43 0.01494 *
## LabelAppeal 0.1809400 0.0075738 23.89 < 0.0000000000000002 ***
## AcidIndex -0.0483117 0.0055739 -8.67 < 0.0000000000000002 ***
## STARS 0.1895270 0.0070857 26.75 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6583.0 on 7076 degrees of freedom
## Residual deviance: 4473.7 on 7070 degrees of freedom
## AIC: 25466
##
## Number of Fisher Scoring iterations: 5
coef <- summary(fit.poisson.nozeros)$coefficients[,1]
var.name <- variable.names(fit.poisson.nozeros)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
 | coef | effects |
---|---|---|
(Intercept) | 1.20 | 3.31 |
VolatileAcidity | -0.03 | 0.97 |
FreeSulfurDioxide | 0.00 | 1.00 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | 0.18 | 1.20 |
AcidIndex | -0.05 | 0.95 |
STARS | 0.19 | 1.21 |
#goodness-of-fit test
p_poisson_nozero <- 1-pchisq(summary(fit.poisson.nozeros)$deviance, summary(fit.poisson.nozeros)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_7<-deviance(fit.poisson.nozeros)/fit.poisson.nozeros$df.residual
dispersiontest(fit.poisson.nozeros) # library(AER)
##
## Overdispersion test
##
## data: fit.poisson.nozeros
## z = -50, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.431
#c close to 1 indicates equidispersion; c substantially > 1 suggests overdispersion
#Check for zero inflation
fit.poisson.nozeros_zeroinf <- zeroinfl(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.nozeros.train, dist="poisson") #library(pscl)
AIC(fit.poisson.nozeros, fit.poisson.nozeros_zeroinf)
## df AIC
## fit.poisson.nozeros 7 25466
## fit.poisson.nozeros_zeroinf 30 24548
summary(fit.poisson.nozeros_zeroinf)
##
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = wine_training_data.nozeros.train, dist = "poisson")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.3326 -0.2911 0.0461 0.3422 4.0587
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.3979183 0.2431431 5.75 0.000000009 ***
## FixedAcidity 0.0004540 0.0010371 0.44 0.66156
## VolatileAcidity -0.0101632 0.0081336 -1.25 0.21147
## CitricAcid -0.0010763 0.0072655 -0.15 0.88223
## ResidualSugar -0.0000995 0.0001884 -0.53 0.59741
## Chlorides -0.0189518 0.0201411 -0.94 0.34673
## FreeSulfurDioxide 0.0000370 0.0000426 0.87 0.38450
## TotalSulfurDioxide -0.0000129 0.0000270 -0.48 0.63413
## Density -0.2854740 0.2387480 -1.20 0.23181
## pH 0.0080308 0.0094516 0.85 0.39550
## Sulphates -0.0015614 0.0067169 -0.23 0.81618
## Alcohol 0.0064211 0.0017188 3.74 0.00019 ***
## LabelAppeal 0.2134811 0.0077528 27.54 < 0.0000000000000002 ***
## AcidIndex -0.0170951 0.0059300 -2.88 0.00394 **
## STARS 0.1153341 0.0074931 15.39 < 0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.907803 2.665825 -1.84 0.06562 .
## FixedAcidity -0.005840 0.011406 -0.51 0.60867
## VolatileAcidity 0.330304 0.086690 3.81 0.00014 ***
## CitricAcid -0.049234 0.079298 -0.62 0.53469
## ResidualSugar -0.001095 0.002088 -0.52 0.59988
## Chlorides -0.245534 0.225846 -1.09 0.27696
## FreeSulfurDioxide -0.001305 0.000496 -2.63 0.00843 **
## TotalSulfurDioxide -0.000749 0.000306 -2.45 0.01447 *
## Density 2.019178 2.600799 0.78 0.43753
## pH 0.180433 0.103810 1.74 0.08219 .
## Sulphates 0.122848 0.073702 1.67 0.09555 .
## Alcohol 0.047983 0.018248 2.63 0.00855 **
## LabelAppeal 0.734222 0.089350 8.22 < 0.0000000000000002 ***
## AcidIndex 0.527523 0.050415 10.46 < 0.0000000000000002 ***
## STARS -4.012970 0.422826 -9.49 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 42
## Log-likelihood: -1.22e+04 on 30 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.poisson.nozeros)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.poisson.nozeros, n = 250) #library(DHARMa)
plot(simulationOutput)
#residual plots
residualPlots(fit.poisson.nozeros)
## Test stat Pr(>|t|)
## VolatileAcidity 0.170 0.680
## FreeSulfurDioxide 0.213 0.644
## Alcohol 0.739 0.390
## LabelAppeal 20.185 0.000
## AcidIndex 6.483 0.011
## STARS 77.088 0.000
The deviance GOF test indicates that the Poisson model fits the data (p > 0.05), though the curvature tests above flag LabelAppeal, AcidIndex, and STARS.
# outlier test
outlierTest(fit.poisson.nozeros)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 4922 -3.23 0.00124 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.poisson.nozeros)
## StudRes Hat CookD
## 369 -3.127 0.00320 0.00224
## 2729 0.611 0.01127 0.00067
## 4922 -3.229 0.00139 0.00104
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 369, which has the largest Cook's distance, is influential, I remove it and compare the refitted coefficients.
fit.poisson.nozeros_369 <- update(fit.poisson.nozeros,subset=c(-369))
compareCoefs(fit.poisson.nozeros, fit.poisson.nozeros_369 )
##
## Call:
## 1: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## Alcohol + LabelAppeal + AcidIndex + STARS, family = poisson, data =
## wine_training_data.nozeros.train)
## 2: glm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## Alcohol + LabelAppeal + AcidIndex + STARS, family = poisson, data =
## wine_training_data.nozeros.train, subset = c(-369))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 1.1958931 0.0505635 1.1957484 0.0505657
## VolatileAcidity -0.0271442 0.0079918 -0.0271426 0.0079919
## FreeSulfurDioxide 0.0000892 0.0000421 0.0000893 0.0000421
## Alcohol 0.0041172 0.0016916 0.0041150 0.0016917
## LabelAppeal 0.1809400 0.0075738 0.1809053 0.0075746
## AcidIndex -0.0483117 0.0055739 -0.0482984 0.0055740
## STARS 0.1895270 0.0070857 0.1895478 0.0070860
From the output table we can see that the coefficients change only minimally, so observation 369 is not influential.
# goodness of fit: pseudo R squared
(pR2_nozero_1 <- 1 - fit.poisson.nozeros$deviance / fit.poisson.nozeros$null.deviance)
## [1] 0.32
#or, equivalently, via log-likelihoods against a null model:
#(pR2 <- 1 - logLik(fit)[1]/logLik(fit.null)[1])
# Log likelihood
(logLik_nozero_1 <-logLik(fit.poisson.nozeros))
## 'log Lik.' -12726 (df=7)
# AIC
AIC_nozero_1 <- fit.poisson.nozeros$aic
BIC_nozero_1 <- BIC(fit.poisson.nozeros)
#use the model to predict mean counts and their standard errors on the test set
ndf_nozero_1<- cbind(wine_training_data.nozeros.test,
Mean = predict(fit.poisson.nozeros, newdata=wine_training_data.nozeros.test, type="response"),
SE = predict(fit.poisson.nozeros, newdata=wine_training_data.nozeros.test, type="response", se.fit=T)$se.fit
)
evl_df_nozero_1 <- as.data.frame(aggregate(ndf_nozero_1[, 17], list(ndf_nozero_1$TARGET), mean))
evl_df_nozero_1 <- cbind(evl_df_nozero_1,aggregate(ndf_nozero_1[, 18], list(ndf_nozero_1$TARGET), mean)[,2])
colnames(evl_df_nozero_1) <- c('TARGET','Mean','SE')
evl_df_nozero_1
## TARGET Mean SE
## 1 0 2.85 0.0527
## 2 1 2.35 0.0444
## 3 2 2.63 0.0445
## 4 3 3.07 0.0472
## 5 4 3.69 0.0552
## 6 5 4.35 0.0673
## 7 6 5.22 0.0896
## 8 7 6.04 0.1056
## 9 8 6.56 0.1237
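A simple holdout summary (added here as a sketch) is the RMSE of the predicted means against the observed TARGET on the test split:
sqrt(mean((ndf_nozero_1$TARGET - ndf_nozero_1$Mean)^2))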
# goodness of fit: pseudo R squared
zeroinf_null <-zeroinfl(TARGET ~ 1,wine_training_data.nozeros.train,dist="poisson")
(pR2_nozero_1z <- 1- logLik(fit.poisson.nozeros_zeroinf)[1]/logLik(zeroinf_null)[1])
## [1] 0.0931
# Log likelihood
(logLik_nozero_1z<-logLik(fit.poisson.nozeros_zeroinf))
## 'log Lik.' -12244 (df=30)
#dispersion
c_nozero_1z <- deviance(fit.poisson.nozeros_zeroinf)/fit.poisson.nozeros_zeroinf$df.residual
# AIC
AIC_nozero_1z <- AIC(fit.poisson.nozeros_zeroinf)
BIC_nozero_1z <- BIC(fit.poisson.nozeros_zeroinf)
#use the model to predict the count-part mean and the zero-part probability on the test set
ndf_nozero_1z<- cbind(wine_training_data.nozeros.test,
Count = predict(fit.poisson.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "count"),
Zero = predict(fit.poisson.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "zero")
)
evl_df_nozero_1z <- as.data.frame(aggregate(ndf_nozero_1z[, 17], list(ndf_nozero_1z$TARGET), mean))
evl_df_nozero_1z <- cbind(evl_df_nozero_1z,aggregate(ndf_nozero_1z[, 18], list(ndf_nozero_1z$TARGET), mean)[,2])
colnames(evl_df_nozero_1z) <- c('TARGET','Count','Zero')
evl_df_nozero_1z
## TARGET Count Zero
## 1 0 3.31 0.222407
## 2 1 2.53 0.044353
## 3 2 2.79 0.048695
## 4 3 3.25 0.052220
## 5 4 3.87 0.042633
## 6 5 4.50 0.029627
## 7 6 5.31 0.015806
## 8 7 6.04 0.001156
## 9 8 6.56 0.000152
fit.nb.zeros <- step(glm.nb(TARGET ~ . , data = wine_training_data.zeros.train), trace = FALSE)
summary(fit.nb.zeros)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS, data = wine_training_data.zeros.train,
## init.theta = 49395.31414, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.987 -0.696 0.071 0.578 3.205
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.2255779 0.0556051 22.04 < 0.0000000000000002 ***
## VolatileAcidity -0.0351533 0.0075420 -4.66 0.0000031 ***
## FreeSulfurDioxide 0.0001297 0.0000398 3.26 0.0011 **
## TotalSulfurDioxide 0.0000721 0.0000254 2.84 0.0046 **
## pH -0.0129241 0.0087062 -1.48 0.1377
## Sulphates -0.0118126 0.0062361 -1.89 0.0582 .
## Alcohol 0.0026729 0.0015981 1.67 0.0944 .
## LabelAppeal 0.1382709 0.0070003 19.75 < 0.0000000000000002 ***
## AcidIndex -0.0854983 0.0051555 -16.58 < 0.0000000000000002 ***
## STARS 0.3079539 0.0052371 58.80 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(49395) family taken to be 1)
##
## Null deviance: 17041 on 9595 degrees of freedom
## Residual deviance: 11034 on 9586 degrees of freedom
## AIC: 35050
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 49395
## Std. Err.: 59121
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -35028
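The enormous Theta (~49,395) and the iteration-limit warning indicate that theta is diverging: there is no extra-Poisson variation for the negative binomial to absorb, so this fit is essentially the Poisson model with one extra parameter, as the AICs confirm.
AIC(fit.poisson.zeros, fit.nb.zeros) #35048 vs 35050: same fit, one extra df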
#goodness-of-fit test
p_nb_zero <- 1-pchisq(summary(fit.nb.zeros)$deviance, summary(fit.nb.zeros)$df.residual)
cat("p-value:" ,p_poisson,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_8<-deviance(fit.nb.zeros)/fit.nb.zeros$df.residual
#c close to 1 indicates equidispersion; c substantially > 1 suggests overdispersion
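A formal boundary-corrected likelihood-ratio test of the negative binomial dispersion parameter against the Poisson null is also available; the call below is a sketch using pscl's odTest().
odTest(fit.nb.zeros) #library(pscl)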
#Check for zero inflation
fit.nb.zeros_zeroinf <- zeroinfl(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.zeros.train, dist="negbin") #library(pscl)
AIC(fit.nb.zeros, fit.nb.zeros_zeroinf)
## df AIC
## fit.nb.zeros 11 35050
## fit.nb.zeros_zeroinf 31 30761
summary(fit.nb.zeros_zeroinf)
##
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = wine_training_data.zeros.train, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.12026 -0.40960 -0.00474 0.36951 5.90131
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.4524248 0.2321669 6.26 0.0000000004 ***
## FixedAcidity 0.0004040 0.0009795 0.41 0.680
## VolatileAcidity -0.0125228 0.0077475 -1.62 0.106
## CitricAcid -0.0016002 0.0069195 -0.23 0.817
## ResidualSugar -0.0000348 0.0001792 -0.19 0.846
## Chlorides -0.0126423 0.0191607 -0.66 0.509
## FreeSulfurDioxide 0.0000406 0.0000403 1.01 0.314
## TotalSulfurDioxide -0.0000132 0.0000255 -0.52 0.604
## Density -0.3125951 0.2280030 -1.37 0.170
## pH 0.0090600 0.0089678 1.01 0.312
## Sulphates -0.0015182 0.0064036 -0.24 0.813
## Alcohol 0.0068259 0.0016343 4.18 0.0000295868 ***
## LabelAppeal 0.2336509 0.0072663 32.16 < 0.0000000000000002 ***
## AcidIndex -0.0173759 0.0056128 -3.10 0.002 **
## STARS 0.1005929 0.0059930 16.78 < 0.0000000000000002 ***
## Log(theta) 18.1863991 2.0191302 9.01 < 0.0000000000000002 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.126611 1.513832 -2.73 0.00641 **
## FixedAcidity 0.002360 0.006395 0.37 0.71210
## VolatileAcidity 0.183788 0.050341 3.65 0.00026 ***
## CitricAcid -0.053108 0.045920 -1.16 0.24746
## ResidualSugar -0.000896 0.001163 -0.77 0.44087
## Chlorides -0.002491 0.123390 -0.02 0.98389
## FreeSulfurDioxide -0.000654 0.000271 -2.41 0.01591 *
## TotalSulfurDioxide -0.000798 0.000172 -4.64 0.0000034 ***
## Density 0.110502 1.491063 0.07 0.94092
## pH 0.215475 0.058039 3.71 0.00021 ***
## Sulphates 0.120458 0.041803 2.88 0.00396 **
## Alcohol 0.028322 0.010613 2.67 0.00762 **
## LabelAppeal 0.673224 0.048866 13.78 < 0.0000000000000002 ***
## AcidIndex 0.439101 0.029591 14.84 < 0.0000000000000002 ***
## STARS -2.340593 0.068029 -34.41 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 79113896.367
## Number of iterations in BFGS optimization: 64
## Log-likelihood: -1.53e+04 on 31 Df
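Here too the fitted Theta is astronomical (~7.9e7); since the NB2 variance is mu + mu^2/theta, the count part is numerically indistinguishable from Poisson, which is why these coefficients match the ZIP fit almost exactly. A one-line illustration:
mu <- 3; theta <- 79113896.367
c(poisson_var = mu, negbin_var = mu + mu^2 / theta) #effectively identical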
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.zeros)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.nb.zeros, n = 250) #library(DHARMa)
plot(simulationOutput)
The half-normal plot and simulated residuals above are used to judge the fit of the negative binomial model; note that the formal deviance GOF test rejects (p ≈ 0), just as it did for the Poisson fit on these data.
# outlier test
outlierTest(fit.nb.zeros)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 8887 2.99 0.0028 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.nb.zeros)
## StudRes Hat CookD
## 1630 -1.14 0.00927 0.00122
## 3953 2.42 0.00227 0.00245
## 8887 2.99 0.00059 0.00108
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 1630, which has the largest hat-value (leverage), is influential, I remove it and compare the refitted coefficients.
fit.nb.zeros_1630 <- update(fit.nb.zeros,subset=c(-1630))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.zeros, fit.nb.zeros_1630 )
##
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS, data = wine_training_data.zeros.train, init.theta =
## 49395.31414, link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + pH + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS, data = wine_training_data.zeros.train, subset =
## c(-1630), init.theta = 49384.41221, link = log)
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 1.2255779 0.0556051 1.2265761 0.0556140
## VolatileAcidity -0.0351533 0.0075420 -0.0357263 0.0075581
## FreeSulfurDioxide 0.0001297 0.0000398 0.0001296 0.0000398
## TotalSulfurDioxide 0.0000721 0.0000254 0.0000712 0.0000254
## pH -0.0129241 0.0087062 -0.0128755 0.0087058
## Sulphates -0.0118126 0.0062361 -0.0119226 0.0062364
## Alcohol 0.0026729 0.0015981 0.0027331 0.0015989
## LabelAppeal 0.1382709 0.0070003 0.1383436 0.0070003
## AcidIndex -0.0854983 0.0051555 -0.0857001 0.0051592
## STARS 0.3079539 0.0052371 0.3080433 0.0052374
From the output table we can see that the coefficients change only minimally, so observation 1630 is not influential.
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.nb.zeros,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## (warning repeated 18 times)
## Test stat Pr(>|t|)
## VolatileAcidity 0.910 0.340
## FreeSulfurDioxide 0.124 0.725
## TotalSulfurDioxide 0.570 0.450
## pH 0.633 0.426
## Sulphates 0.079 0.779
## Alcohol 1.369 0.242
## LabelAppeal 21.183 0.000
## AcidIndex 43.607 0.000
## STARS 666.019 0.000
glm_full_aero_aj <- glm.nb(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = wine_training_data.zeros.train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
fit.nb.zeros_aj <-step(glm_full_aero_aj,direction='backward',data = wine_training_data.zeros.train, trace = FALSE)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## (warning repeated 12 times)
residualPlots(fit.nb.zeros_aj,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## (warning repeated 22 times)
## Test stat Pr(>|t|)
## VolatileAcidity 0.468 0.494
## FreeSulfurDioxide 0.021 0.884
## TotalSulfurDioxide 0.669 0.413
## Sulphates 0.000 0.999
## Alcohol 0.822 0.365
## LabelAppeal 0.209 0.648
## AcidIndex 0.000 1.000
## STARS 0.000 1.000
## log(LabelAppeal + 3) 0.145 0.703
## I(AcidIndex^2) 13.067 0.000
## I(STARS^2) 91.089 0.000
summary(fit.nb.zeros_aj)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal +
## AcidIndex + STARS + log(LabelAppeal + 3) + I(AcidIndex^2) +
## I(STARS^2), data = wine_training_data.zeros.train, init.theta = 42997.68096,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.213 -0.659 -0.002 0.493 3.563
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.1982094 0.1812179 -1.09 0.2741
## VolatileAcidity -0.0324480 0.0075512 -4.30 0.000017307
## FreeSulfurDioxide 0.0000952 0.0000397 2.40 0.0165
## TotalSulfurDioxide 0.0000613 0.0000254 2.41 0.0160
## Sulphates -0.0121492 0.0062277 -1.95 0.0511
## Alcohol 0.0038120 0.0015965 2.39 0.0170
## LabelAppeal 0.0669180 0.0306631 2.18 0.0291
## AcidIndex 0.1187880 0.0369803 3.21 0.0013
## STARS 0.7040426 0.0171160 41.13 < 0.0000000000000002
## log(LabelAppeal + 3) 0.2671015 0.0867079 3.08 0.0021
## I(AcidIndex^2) -0.0118800 0.0022055 -5.39 0.000000072
## I(STARS^2) -0.1042053 0.0042413 -24.57 < 0.0000000000000002
##
## (Intercept)
## VolatileAcidity ***
## FreeSulfurDioxide *
## TotalSulfurDioxide *
## Sulphates .
## Alcohol *
## LabelAppeal *
## AcidIndex **
## STARS ***
## log(LabelAppeal + 3) **
## I(AcidIndex^2) ***
## I(STARS^2) ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(42998) family taken to be 1)
##
## Null deviance: 17041 on 9595 degrees of freedom
## Residual deviance: 10328 on 9584 degrees of freedom
## AIC: 34349
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 42998
## Std. Err.: 43490
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -34323
coef <- summary(fit.nb.zeros_aj)$coefficients[,1]
var.name <- variable.names(fit.nb.zeros_aj)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
 | coef | effects |
---|---|---|
(Intercept) | -0.20 | 0.82 |
VolatileAcidity | -0.03 | 0.97 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
Sulphates | -0.01 | 0.99 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | 0.07 | 1.07 |
AcidIndex | 0.12 | 1.13 |
STARS | 0.70 | 2.02 |
log(LabelAppeal + 3) | 0.27 | 1.31 |
I(AcidIndex^2) | -0.01 | 0.99 |
I(STARS^2) | -0.10 | 0.90 |
#goodness-of-fit test
p_nb_zero_aj <- 1-pchisq(summary(fit.nb.zeros_aj)$deviance, summary(fit.nb.zeros_aj)$df.residual)
cat("p_nb_zero_aj:" ,p_nb_zero_aj,"\n\n")
## p_nb_zero_aj: 0.0000000768
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_8 <- deviance(fit.nb.zeros_aj)/fit.nb.zeros_aj$df.residual
#c close to 1 indicates equidispersion; c substantially > 1 suggests overdispersion
Here c = 10328/9584 ≈ 1.08, close to 1, so the assumption of equidispersion is not rejected.
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.nb.zeros_aj, n = 250) #library(DHARMa)
plot(simulationOutput)
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.zeros_aj)) #library(faraway)
# outlier test
outlierTest(fit.nb.zeros_aj)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 8887 3.44 0.000591 NA
The results show no outliers as judged by the Bonferroni p-value.
#Check for influential and leverage points
influencePlot(fit.nb.zeros_aj)
## StudRes Hat CookD
## 2729 2.64 0.033751 0.03931
## 8887 3.44 0.000692 0.00143
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 2729, which has the largest Cook's distance, is influential, I remove it and compare the refitted coefficients.
fit.nb.zeros_aj_2729 <- update(fit.nb.zeros_aj,subset=c(-2729))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.zeros_aj, fit.nb.zeros_aj_2729 )
##
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), data =
## wine_training_data.zeros.train, init.theta = 42997.68096, link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS + log(LabelAppeal + 3) + I(AcidIndex^2) + I(STARS^2), data =
## wine_training_data.zeros.train, subset = c(-2729), init.theta =
## 42887.79241, link = log)
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) -0.1982094 0.1812179 -0.2917839 0.1843363
## VolatileAcidity -0.0324480 0.0075512 -0.0324711 0.0075519
## FreeSulfurDioxide 0.0000952 0.0000397 0.0000956 0.0000397
## TotalSulfurDioxide 0.0000613 0.0000254 0.0000608 0.0000254
## Sulphates -0.0121492 0.0062277 -0.0121164 0.0062278
## Alcohol 0.0038120 0.0015965 0.0038364 0.0015966
## LabelAppeal 0.0669180 0.0306631 0.0651486 0.0306776
## AcidIndex 0.1187880 0.0369803 0.1419150 0.0379494
## STARS 0.7040426 0.0171160 0.7030182 0.0171182
## log(LabelAppeal + 3) 0.2671015 0.0867079 0.2716288 0.0867463
## I(AcidIndex^2) -0.0118800 0.0022055 -0.0133410 0.0022714
## I(STARS^2) -0.1042053 0.0042413 -0.1040191 0.0042418
From the output table we can see that the coefficients change only minimally, so observation 2729 is not influential.
We again use a rootogram to assess the fit of the count model.
# check the goodness of fit of the negative binomial model
#install.packages("countreg", repos="http://R-Forge.R-project.org")
countreg::rootogram(fit.nb.zeros_aj)
As before, each bar hangs from the theoretical fit (red curve) and the x-axis shows the counts 0-8, not the predictors; bars dipping below 0 are counts the model under-predicts, and bars stopping short of 0 are counts it over-predicts. The pattern is essentially identical to the Poisson rootogram, as expected given the effectively infinite theta.
# goodness of fit: pseudo R squared
(pR2_zero_2 <- 1 - fit.nb.zeros_aj$deviance / fit.nb.zeros_aj$null.deviance)
## [1] 0.394
# Log likelihood
(logLik_zero_2 <-logLik(fit.nb.zeros_aj))
## 'log Lik.' -17161 (df=13)
# AIC
AIC_zero_2 <- fit.nb.zeros_aj$aic
BIC_zero_2 <- BIC(fit.nb.zeros_aj)
#use the model to predict mean counts and their standard errors on the test set
ndf_zero_2<- cbind(wine_training_data.zeros.test,
Mean = predict(fit.nb.zeros_aj, newdata=wine_training_data.zeros.test, type="response"),
SE = predict(fit.nb.zeros_aj, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
)
evl_df_zero_2 <- as.data.frame(aggregate(ndf_zero_2[, 17], list(ndf_zero_2$TARGET), mean))
evl_df_zero_2 <- cbind(evl_df_zero_2,aggregate(ndf_zero_2[, 18], list(ndf_zero_2$TARGET), mean)[,2])
colnames(evl_df_zero_2) <- c('TARGET','Mean','SE')
evl_df_zero_2
## TARGET Mean SE
## 1 0 1.57 0.0361
## 2 1 1.42 0.0427
## 3 2 2.06 0.0444
## 4 3 2.75 0.0499
## 5 4 3.59 0.0627
## 6 5 4.25 0.0779
## 7 6 4.86 0.1019
## 8 7 5.41 0.1151
## 9 8 6.00 0.1608
# goodness of fit: pseudo R squared
zero.zeroinf_null <- zeroinfl(TARGET ~ 1, data=wine_training_data.zeros.train, dist="negbin")
## Warning in sqrt(diag(vc)[np]): NaNs produced
(pR2_zero_2z <- 1- logLik(fit.nb.zeros_zeroinf)[1]/logLik(zero.zeroinf_null)[1] )
## [1] 0.164
# Log likelihood
(logLik_zero_2z<-logLik(fit.nb.zeros_zeroinf))
## 'log Lik.' -15349 (df=31)
#dispersion
c_zero_2z <- deviance(fit.nb.zeros_zeroinf)/fit.nb.zeros_zeroinf$df.residual
# AIC
AIC_zero_2z <- AIC(fit.nb.zeros_zeroinf)
BIC_zero_2z <- BIC(fit.nb.zeros_zeroinf)
#use the model to predict the count-part mean and the zero-part probability on the test set
ndf_zero_2z<- cbind(wine_training_data.zeros.test,
Count = predict(fit.nb.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "count"),
Zero = predict(fit.nb.zeros_zeroinf, newdata=wine_training_data.zeros.test, type = "zero")
)
evl_df_zero_2z <- as.data.frame(aggregate(ndf_zero_2z[, 17], list(ndf_zero_2z$TARGET), mean))
evl_df_zero_2z <- cbind(evl_df_zero_2z,aggregate(ndf_zero_2z[, 18], list(ndf_zero_2z$TARGET), mean)[,2])
colnames(evl_df_zero_2z) <- c('TARGET','Count','Zero')
fit.nb.nozeros <- step(glm.nb(TARGET ~ . , data = wine_training_data.nozeros.train), trace = FALSE)
summary(fit.nb.nozeros)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## Alcohol + LabelAppeal + AcidIndex + STARS, data = wine_training_data.nozeros.train,
## init.theta = 137870.7222, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.228 -0.275 0.070 0.374 1.727
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.1958944 0.0505641 23.65 < 0.0000000000000002 ***
## VolatileAcidity -0.0271444 0.0079920 -3.40 0.00068 ***
## FreeSulfurDioxide 0.0000892 0.0000421 2.12 0.03405 *
## Alcohol 0.0041172 0.0016917 2.43 0.01494 *
## LabelAppeal 0.1809402 0.0075739 23.89 < 0.0000000000000002 ***
## AcidIndex -0.0483120 0.0055740 -8.67 < 0.0000000000000002 ***
## STARS 0.1895274 0.0070858 26.75 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(137871) family taken to be 1)
##
## Null deviance: 6582.8 on 7076 degrees of freedom
## Residual deviance: 4473.6 on 7070 degrees of freedom
## AIC: 25468
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 137871
## Std. Err.: 218980
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -25452
coef <- summary(fit.nb.nozeros)$coefficients[,1]
var.name <- variable.names(fit.nb.nozeros)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
 | coef | effects |
---|---|---|
(Intercept) | 1.20 | 3.31 |
VolatileAcidity | -0.03 | 0.97 |
FreeSulfurDioxide | 0.00 | 1.00 |
Alcohol | 0.00 | 1.00 |
LabelAppeal | 0.18 | 1.20 |
AcidIndex | -0.05 | 0.95 |
STARS | 0.19 | 1.21 |
#goodness-of-fit test
p_nb_nozero <- 1-pchisq(summary(fit.nb.nozeros)$deviance, summary(fit.nb.nozeros)$df.residual)
cat("p-value:" ,p_nb_nozero,"\n\n")
## p-value: 1
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_9<-summary(fit.nb.nozeros)$deviance/summary(fit.nb.nozeros)$df.residual
#c close to 1 indicates equidispersion; c substantially > 1 suggests overdispersion
#Check for zero inflation
fit.nb.nozeros_zeroinf <- zeroinfl(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS, data=wine_training_data.nozeros.train, dist="negbin") #library(pscl)
AIC(fit.nb.nozeros, fit.nb.nozeros_zeroinf)
## df AIC
## fit.nb.nozeros 8 25468
## fit.nb.nozeros_zeroinf 31 24550
summary(fit.nb.nozeros_zeroinf)
##
## Call:
## zeroinfl(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = wine_training_data.nozeros.train, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.3326 -0.2911 0.0461 0.3422 4.0573
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.3977723 0.2431430 5.75 0.0000000090 ***
## FixedAcidity 0.0004545 0.0010371 0.44 0.66122
## VolatileAcidity -0.0101546 0.0081336 -1.25 0.21186
## CitricAcid -0.0010742 0.0072655 -0.15 0.88246
## ResidualSugar -0.0000994 0.0001884 -0.53 0.59779
## Chlorides -0.0189427 0.0201410 -0.94 0.34696
## FreeSulfurDioxide 0.0000370 0.0000426 0.87 0.38447
## TotalSulfurDioxide -0.0000129 0.0000270 -0.48 0.63406
## Density -0.2853505 0.2387478 -1.20 0.23201
## pH 0.0080346 0.0094516 0.85 0.39528
## Sulphates -0.0015687 0.0067169 -0.23 0.81534
## Alcohol 0.0064208 0.0017188 3.74 0.00019 ***
## LabelAppeal 0.2134860 0.0077528 27.54 < 0.0000000000000002 ***
## AcidIndex -0.0170927 0.0059300 -2.88 0.00395 **
## STARS 0.1153298 0.0074932 15.39 < 0.0000000000000002 ***
## Log(theta) 17.9661270 3.0007605 5.99 0.0000000021 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.923766 2.666214 -1.85 0.06479 .
## FixedAcidity -0.005837 0.011407 -0.51 0.60884
## VolatileAcidity 0.330347 0.086696 3.81 0.00014 ***
## CitricAcid -0.049216 0.079304 -0.62 0.53486
## ResidualSugar -0.001095 0.002088 -0.52 0.60007
## Chlorides -0.245550 0.225862 -1.09 0.27696
## FreeSulfurDioxide -0.001305 0.000496 -2.63 0.00843 **
## TotalSulfurDioxide -0.000750 0.000306 -2.45 0.01445 *
## Density 2.035288 2.601093 0.78 0.43394
## pH 0.180500 0.103816 1.74 0.08210 .
## Sulphates 0.122905 0.073707 1.67 0.09542 .
## Alcohol 0.047988 0.018250 2.63 0.00855 **
## LabelAppeal 0.734345 0.089361 8.22 < 0.0000000000000002 ***
## AcidIndex 0.527558 0.050420 10.46 < 0.0000000000000002 ***
## STARS -4.013751 0.423096 -9.49 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 63473116.874
## Number of iterations in BFGS optimization: 64
## Log-likelihood: -1.22e+04 on 31 Df
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.nb.nozeros)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.nb.nozeros, n = 250) #library(DHARMa)
plot(simulationOutput)
The deviance GOF test indicates that the negative binomial model fits the data (p = 1 > 0.05).
# outlier test
outlierTest(fit.nb.nozeros)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 4922 -4.06 0.0000489 0.346
The results show no outliers as judged by the Bonferroni p-value (0.346 > 0.05).
#Check for influential and leverage points
influencePlot(fit.nb.nozeros)
## StudRes Hat CookD
## 369 -3.935 0.00320 0.00224
## 2729 0.768 0.01127 0.00067
## 4922 -4.063 0.00139 0.00104
The influence plot above flags points with large studentized residuals, hat-values, or Cook's distances. To test whether case 369, which has the largest Cook's distance, is influential, I remove it and compare the refitted coefficients.
fit.nb.nozeros_369 <- update(fit.nb.nozeros,subset=c(-369))
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
compareCoefs(fit.nb.nozeros, fit.nb.nozeros_369 )
##
## Call:
## 1: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## Alcohol + LabelAppeal + AcidIndex + STARS, data =
## wine_training_data.nozeros.train, init.theta = 137870.7222, link = log)
## 2: glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## Alcohol + LabelAppeal + AcidIndex + STARS, data =
## wine_training_data.nozeros.train, subset = c(-369), init.theta =
## 137848.1397, link = log)
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 1.1958944 0.0505641 1.1957497 0.0505664
## VolatileAcidity -0.0271444 0.0079920 -0.0271428 0.0079921
## FreeSulfurDioxide 0.0000892 0.0000421 0.0000893 0.0000421
## Alcohol 0.0041172 0.0016917 0.0041150 0.0016917
## LabelAppeal 0.1809402 0.0075739 0.1809055 0.0075747
## AcidIndex -0.0483120 0.0055740 -0.0482987 0.0055741
## STARS 0.1895274 0.0070858 0.1895483 0.0070861
From the output table we can see that the coefficients change only minimally, so observation 369 is not influential.
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.nb.nozeros,layout = c(3, 4),ask=F)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## (warning repeated 12 times)
## Test stat Pr(>|t|)
## VolatileAcidity 0.170 0.680
## FreeSulfurDioxide 0.213 0.644
## Alcohol 0.739 0.390
## LabelAppeal 20.184 0.000
## AcidIndex 6.482 0.011
## STARS 77.082 0.000
#glm_full_aero_aj <- glm.nb(TARGET ~ FixedAcidity+VolatileAcidity+CitricAcid+ResidualSugar+Chlorides+FreeSulfurDioxide+TotalSulfurDioxide+Density+pH+Sulphates+Alcohol+LabelAppeal+AcidIndex+STARS+log(LabelAppeal+3)+I(AcidIndex^2)+I(STARS^2), data = wine_training_data.zeros.train)
#fit.nb.nozeros_aj <-step(glm_full_aero_aj,direction='backward',data = wine_training_data.zeros.train, trace = FALSE)
#residualPlots(fit.nb.nozeros_aj,layout = c(3, 4),ask=F)
#summary(fit.nb.nozeros_aj)
# goodness of fit: pseudo R squared
(pR2_nozero_2 <- 1 - fit.nb.nozeros$deviance / fit.nb.nozeros$null.deviance)
## [1] 0.32
# Log likelihood
(logLik_nozero_2 <-logLik(fit.nb.nozeros))
## 'log Lik.' -12726 (df=8)
# AIC
AIC_nozero_2 <- fit.nb.nozeros$aic
BIC_nozero_2 <- BIC(fit.nb.nozeros)
#use the model to predict mean counts and their standard errors; note that this
#block evaluates on wine_training_data.zeros.test (the split that retains zeros),
#unlike the other nozeros evaluations
ndf_nozero_2<- cbind(wine_training_data.zeros.test,
Mean = predict(fit.nb.nozeros, newdata=wine_training_data.zeros.test, type="response"),
SE = predict(fit.nb.nozeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
)
evl_df_nozero_2 <- as.data.frame(aggregate(ndf_nozero_2[, 17], list(ndf_nozero_2$TARGET), mean))
evl_df_nozero_2 <- cbind(evl_df_nozero_2,aggregate(ndf_nozero_2[, 18], list(ndf_nozero_2$TARGET), mean)[,2])
colnames(evl_df_nozero_2) <- c('TARGET','Mean','SE')
evl_df_nozero_2
## TARGET Mean SE
## 1 0 2.46 0.0532
## 2 1 1.99 0.0413
## 3 2 2.41 0.0433
## 4 3 2.90 0.0469
## 5 4 3.59 0.0548
## 6 5 4.27 0.0673
## 7 6 5.13 0.0890
## 8 7 5.94 0.1054
## 9 8 6.56 0.1237
# goodness of fit: pseudo R squared
zero.nozeroinf_null <- zeroinfl(TARGET ~ 1, data=wine_training_data.nozeros.train, dist="negbin")
## Warning in sqrt(diag(vc)[np]): NaNs produced
(pR2_nozero_2z <- 1- logLik(fit.nb.nozeros_zeroinf)[1]/logLik(zero.nozeroinf_null)[1])
## [1] 0.0931
# Log likelihood
(logLik_nozero_2z<-logLik(fit.nb.nozeros_zeroinf))
## 'log Lik.' -12244 (df=31)
#dispersion
c_nozero_2z <- deviance(fit.nb.nozeros_zeroinf)/fit.nb.nozeros_zeroinf$df.residual
# AIC
AIC_nozero_2z <- AIC(fit.nb.nozeros_zeroinf)
BIC_nozero_2z <- BIC(fit.nb.nozeros_zeroinf)
#use the model to predict the count-part mean and the zero-part probability on the test set
ndf_nozero_2z<- cbind(wine_training_data.nozeros.test,
Count = predict(fit.nb.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "count"),
Zero = predict(fit.nb.nozeros_zeroinf, newdata=wine_training_data.nozeros.test, type = "zero")
)
evl_df_nozero_2z <- as.data.frame(aggregate(ndf_nozero_2z[, 17], list(ndf_nozero_2z$TARGET), mean))
evl_df_nozero_2z <- cbind(evl_df_nozero_2z, aggregate(ndf_nozero_2z[, 18], list(ndf_nozero_2z$TARGET), mean)[,2])
colnames(evl_df_nozero_2z) <- c('TARGET','Count','Zero')
evl_df_nozero_2z
## TARGET Count Zero
## 1 0 3.31 0.222407
## 2 1 2.53 0.044343
## 3 2 2.79 0.048684
## 4 3 3.25 0.052214
## 5 4 3.87 0.042630
## 6 5 4.50 0.029624
## 7 6 5.31 0.015804
## 8 7 6.04 0.001156
## 9 8 6.56 0.000152
fit.mlr.zeros <- step(lm(TARGET ~ ., data = wine_training_data.zeros.train), trace = FALSE)
summary(fit.mlr.zeros)
##
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + pH + Sulphates + Alcohol + LabelAppeal + AcidIndex +
## STARS, data = wine_training_data.zeros.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.549 -0.950 0.070 0.909 5.967
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.0749449 0.5176253 7.87 0.0000000000000039 ***
## VolatileAcidity -0.1031860 0.0173251 -5.96 0.0000000026778358 ***
## FreeSulfurDioxide 0.0003207 0.0000917 3.50 0.00047 ***
## TotalSulfurDioxide 0.0002007 0.0000586 3.42 0.00062 ***
## Density -0.8932507 0.5094355 -1.75 0.07956 .
## pH -0.0312522 0.0201038 -1.55 0.12009
## Sulphates -0.0322132 0.0144170 -2.23 0.02548 *
## Alcohol 0.0108430 0.0036472 2.97 0.00296 **
## LabelAppeal 0.4464705 0.0158185 28.22 < 0.0000000000000002 ***
## AcidIndex -0.2069486 0.0104436 -19.82 < 0.0000000000000002 ***
## STARS 0.9673370 0.0121174 79.83 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.33 on 9585 degrees of freedom
## Multiple R-squared: 0.524, Adjusted R-squared: 0.523
## F-statistic: 1.05e+03 on 10 and 9585 DF, p-value: <0.0000000000000002
coef <- summary(fit.mlr.zeros)$coefficients[,1]
var.name <- variable.names(fit.mlr.zeros)
# exponentiated coefficients are shown only for comparison with the log-link count
# models; for a linear model the raw coefficients are directly interpretable as
# changes in the expected number of cases purchased
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef | effects | |
---|---|---|
(Intercept) | 4.07 | 58.85 |
VolatileAcidity | -0.10 | 0.90 |
FreeSulfurDioxide | 0.00 | 1.00 |
TotalSulfurDioxide | 0.00 | 1.00 |
Density | -0.89 | 0.41 |
pH | -0.03 | 0.97 |
Sulphates | -0.03 | 0.97 |
Alcohol | 0.01 | 1.01 |
LabelAppeal | 0.45 | 1.56 |
AcidIndex | -0.21 | 0.81 |
STARS | 0.97 | 2.63 |
#goodness-of-fit test
p_mlr_zero <- 1-pchisq(deviance(fit.mlr.zeros), fit.mlr.zeros$df.residual)
cat("p-value:" ,p_mlr_zero,"\n\n")
## p-value: 0
#Check for over / underdispersion by looking at residual deviance/df or at a formal test statistic
c_10<-deviance(fit.mlr.zeros)/fit.mlr.zeros$df.residual
#c close to 1 indicates equidispersion, c > 1 overdispersion, c < 1 underdispersion
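As a formal alternative to the deviance/df ratio, the AER package provides a score test for overdispersion; note it applies to the Poisson GLMs rather than to this linear model. A minimal sketch, assuming fit.poisson.zeros from the earlier section is still in scope:
# score test of H0: equidispersion against H1: overdispersion (Poisson GLMs only)
dispersiontest(fit.poisson.zeros, alternative = "greater")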
#Check for zero inflation
AIC(fit.mlr.zeros, fit.nb.zeros_zeroinf)
## df AIC
## fit.mlr.zeros 12 32672
## fit.nb.zeros_zeroinf 31 30761
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.mlr.zeros)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.mlr.zeros, n = 250) #library(DHARMa)
plot(simulationOutput)
The deviance-based GOF test returns p < 0.05, suggesting lack of fit; note, though, that for a linear model the deviance is simply the residual sum of squares, so this chi-square test should be interpreted with caution.
# outlier test
outlierTest(fit.mlr.zeros)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 8887 4.5 0.00000676 0.0649
The results show that there is no outlier as judged by the Bonferroni p-value (adjusted p > 0.05).
#Check for influential and leverage points
influencePlot(fit.mlr.zeros)
## StudRes Hat CookD
## 2729 1.29 0.007447 0.00114
## 3953 3.54 0.003087 0.00353
## 8887 4.50 0.000825 0.00152
The above table shows the points with the largest studentized residuals, hat-values, or Cook’s distances from the influence plot. To test whether case 2729, which has the largest hat-value, is influential, I remove it, refit the model, and compare the coefficients.
fit.mlr.zeros_2729 <- update(fit.mlr.zeros,subset=c(-2729))
compareCoefs(fit.mlr.zeros, fit.mlr.zeros_2729 )
##
## Call:
## 1: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Density + pH + Sulphates + Alcohol + LabelAppeal
## + AcidIndex + STARS, data = wine_training_data.zeros.train)
## 2: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Density + pH + Sulphates + Alcohol + LabelAppeal
## + AcidIndex + STARS, data = wine_training_data.zeros.train, subset =
## c(-2729))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 4.0749449 0.5176253 4.1071606 0.5182061
## VolatileAcidity -0.1031860 0.0173251 -0.1032255 0.0173245
## FreeSulfurDioxide 0.0003207 0.0000917 0.0003215 0.0000917
## TotalSulfurDioxide 0.0002007 0.0000586 0.0002008 0.0000586
## Density -0.8932507 0.5094355 -0.9176445 0.5097667
## pH -0.0312522 0.0201038 -0.0312538 0.0201030
## Sulphates -0.0322132 0.0144170 -0.0322240 0.0144165
## Alcohol 0.0108430 0.0036472 0.0108289 0.0036471
## LabelAppeal 0.4464705 0.0158185 0.4461516 0.0158198
## AcidIndex -0.2069486 0.0104436 -0.2079136 0.0104699
## STARS 0.9673370 0.0121174 0.9670056 0.0121196
From the output table, we can see that the coefficients change only minimally, so observation 2729 is not influential.
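As a complementary bulk screen, Cook's distances can be compared against the common 4/n rule of thumb (the cutoff is a convention, not part of the analysis above):
# flag observations whose Cook's distance exceeds the 4/n rule of thumb
cd <- cooks.distance(fit.mlr.zeros)
head(sort(cd[cd > 4/length(cd)], decreasing = TRUE))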
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.mlr.zeros,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity -0.747 0.455
## FreeSulfurDioxide 0.152 0.879
## TotalSulfurDioxide -0.977 0.328
## Density -1.195 0.232
## pH -0.681 0.496
## Sulphates 0.414 0.679
## Alcohol 1.193 0.233
## LabelAppeal 1.308 0.191
## AcidIndex -2.067 0.039
## STARS -14.836 0.000
## Tukey test -6.748 0.000
# goodness of fit: pseudo R squared (against an intercept-only null model)
fit.mlr.zeros.null <- lm(TARGET ~ 1, data = wine_training_data.zeros.train)
(pR2_zero_3 <- 1 - logLik(fit.mlr.zeros)[1]/logLik(fit.mlr.zeros.null)[1])
# Log likelihood
(logLik_zero_3 <-logLik(fit.mlr.zeros))
## 'log Lik.' -16324 (df=12)
# AIC
AIC_zero_3 <- AIC(fit.mlr.zeros)
BIC_zero_3 <- BIC(fit.mlr.zeros)
#use the model to predict mean counts and their standard errors on the test set, then average by observed TARGET
ndf_zero_3<- cbind(wine_training_data.zeros.test,
Mean = predict(fit.mlr.zeros, newdata=wine_training_data.zeros.test, type="response"),
SE = predict(fit.mlr.zeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
)
evl_df_zero_3 <- as.data.frame(aggregate(ndf_zero_3[, 17], list(ndf_zero_3$TARGET), mean))
evl_df_zero_3 <- cbind(evl_df_zero_3,aggregate(ndf_zero_3[, 18], list(ndf_zero_3$TARGET), mean)[,2])
colnames(evl_df_zero_3) <- c('TARGET','Mean','SE')
evl_df_zero_3
## TARGET Mean SE
## 1 0 1.68 0.0464
## 2 1 1.33 0.0485
## 3 2 2.07 0.0432
## 4 3 2.73 0.0416
## 5 4 3.54 0.0411
## 6 5 4.20 0.0432
## 7 6 4.89 0.0471
## 8 7 5.40 0.0484
## 9 8 5.77 0.0597
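For the linear models, predict() can also return full prediction intervals rather than just standard errors of the fitted mean. A quick sketch on the same test set; note the intervals treat TARGET as continuous, so lower bounds can dip below zero:
# 95% prediction intervals for individual observations (head only)
head(predict(fit.mlr.zeros, newdata = wine_training_data.zeros.test,
             interval = "prediction", level = 0.95))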
fit.mlr.nozeros <- step(lm(TARGET ~ ., data = wine_training_data.nozeros.train), trace = FALSE)
summary(fit.mlr.nozeros)
##
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + Density +
## Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, data = wine_training_data.nozeros.train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.043 -0.516 0.129 0.725 3.492
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.2449571 0.5219676 8.13 0.00000000000000049 ***
## VolatileAcidity -0.0987890 0.0175540 -5.63 0.00000001896040200 ***
## FreeSulfurDioxide 0.0003134 0.0000928 3.38 0.00074 ***
## Density -0.9930115 0.5168318 -1.92 0.05473 .
## Sulphates -0.0221099 0.0146409 -1.51 0.13105
## Alcohol 0.0158474 0.0036907 4.29 0.00001779645443424 ***
## LabelAppeal 0.6558181 0.0165919 39.53 < 0.0000000000000002 ***
## AcidIndex -0.1643256 0.0116752 -14.07 < 0.0000000000000002 ***
## STARS 0.7355710 0.0161978 45.41 < 0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.15 on 7068 degrees of freedom
## Multiple R-squared: 0.455, Adjusted R-squared: 0.455
## F-statistic: 739 on 8 and 7068 DF, p-value: <0.0000000000000002
coef <- summary(fit.mlr.nozeros)$coefficients[,1]
var.name <- variable.names(fit.mlr.nozeros)
effects <- exp(coef)
kable(round(interpretation <- cbind(coef,effects),2))
coef | effects | |
---|---|---|
(Intercept) | 4.24 | 69.75 |
VolatileAcidity | -0.10 | 0.91 |
FreeSulfurDioxide | 0.00 | 1.00 |
Density | -0.99 | 0.37 |
Sulphates | -0.02 | 0.98 |
Alcohol | 0.02 | 1.02 |
LabelAppeal | 0.66 | 1.93 |
AcidIndex | -0.16 | 0.85 |
STARS | 0.74 | 2.09 |
#goodness-of-fit test
p_mlr_nozero <- 1-pchisq(deviance(fit.mlr.nozeros), fit.mlr.nozeros$df.residual)
cat("p-value:" ,p_mlr_nozero,"\n\n")
## p-value: 0
#Check for zero inflation
AIC(fit.mlr.nozeros, fit.nb.zeros_zeroinf)
## Warning in AIC.default(fit.mlr.nozeros, fit.nb.zeros_zeroinf): models are
## not all fitted to the same number of observations
## df AIC
## fit.mlr.nozeros 10 22112
## fit.nb.zeros_zeroinf 31 30761
#plot a half normal probability plot of residuals
halfnorm(residuals(fit.mlr.nozeros)) #library(faraway)
#Plot the scaled residuals vs. the (log) predicted values (or the linear predictor)
simulationOutput <- simulateResiduals(fittedModel = fit.mlr.nozeros, n = 250) #library(DHARMa)
plot(simulationOutput)
As with the zeros model, the deviance-based GOF test suggests lack of fit; again, for a linear model the deviance is simply the residual sum of squares, so this chi-square test should be interpreted with caution.
# outlier test
outlierTest(fit.mlr.nozeros)
##
## No Studentized residuals with Bonferonni p < 0.05
## Largest |rstudent|:
## rstudent unadjusted p-value Bonferonni p
## 4922 -4.38 0.0000119 0.0844
The results show that there is no outlier as judged by the Bonferroni p-value (adjusted p > 0.05).
#Check for influential and leverage points
influencePlot(fit.mlr.nozeros)
## StudRes Hat CookD
## 369 -4.250 0.00262 0.00525
## 2729 0.918 0.01177 0.00111
## 4922 -4.382 0.00135 0.00288
The above table shows the points with the largest studentized residuals, hat-values, or Cook’s distances from the influence plot. To test whether case 369, which has the largest Cook’s distance, is influential, I remove it, refit the model, and compare the coefficients.
fit.mlr.nozeros_369 <- update(fit.mlr.nozeros,subset=c(-369))
compareCoefs(fit.mlr.nozeros, fit.mlr.nozeros_369 )
##
## Call:
## 1: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + Density
## + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, data =
## wine_training_data.nozeros.train)
## 2: lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + Density
## + Sulphates + Alcohol + LabelAppeal + AcidIndex + STARS, data =
## wine_training_data.nozeros.train, subset = c(-369))
## Est. 1 SE 1 Est. 2 SE 2
## (Intercept) 4.2449571 0.5219676 4.2442624 0.5219960
## VolatileAcidity -0.0987890 0.0175540 -0.0987817 0.0175549
## FreeSulfurDioxide 0.0003134 0.0000928 0.0003135 0.0000928
## Density -0.9930115 0.5168318 -0.9927579 0.5168585
## Sulphates -0.0221099 0.0146409 -0.0220984 0.0146416
## Alcohol 0.0158474 0.0036907 0.0158399 0.0036910
## LabelAppeal 0.6558181 0.0165919 0.6556927 0.0165944
## AcidIndex -0.1643256 0.0116752 -0.1642833 0.0116760
## STARS 0.7355710 0.0161978 0.7356227 0.0161989
From the output table, we can see that the coefficients change only minimally, so observation 369 is not influential.
#suppressMessages(suppressWarnings(library(car)))
residualPlots(fit.mlr.nozeros,layout = c(3, 4),ask=F)
## Test stat Pr(>|t|)
## VolatileAcidity -0.273 0.785
## FreeSulfurDioxide -0.852 0.394
## Density -0.613 0.540
## Sulphates 0.653 0.514
## Alcohol 1.465 0.143
## LabelAppeal 0.610 0.542
## AcidIndex -2.583 0.010
## STARS -7.606 0.000
## Tukey test -1.663 0.096
# goodness of fit: pseudo R squared
fit.mlr.nozeros.null <- lm(TARGET ~ 1, data = wine_training_data.nozeros.train)
(pR2_nozero_3 <- 1 - logLik(fit.mlr.nozeros)[1]/logLik(fit.mlr.nozeros.null)[1])
## [1] 0.163
# Log likelihood
(logLik_nozero_3 <-logLik(fit.mlr.nozeros))
## 'log Lik.' -11046 (df=10)
# AIC
AIC_nozero_3 <- AIC(fit.mlr.nozeros)
BIC_nozero_3 <- BIC(fit.mlr.nozeros)
#use the model to predict mean counts and their standard errors on the zeros test set (which retains the zero-count cases), then average by observed TARGET
ndf_nozero_3<- cbind(wine_training_data.zeros.test,
Mean = predict(fit.mlr.nozeros, newdata=wine_training_data.zeros.test, type="response"),
SE = predict(fit.mlr.nozeros, newdata=wine_training_data.zeros.test, type="response", se.fit=T)$se.fit
)
evl_df_nozero_3 <- as.data.frame(aggregate(ndf_nozero_3[, 17], list(ndf_nozero_3$TARGET), mean))
evl_df_nozero_3 <- cbind(evl_df_nozero_3,aggregate(ndf_nozero_3[, 18], list(ndf_nozero_3$TARGET), mean)[,2])
colnames(evl_df_nozero_3) <- c('TARGET','Mean','SE')
evl_df_nozero_3
## TARGET Mean SE
## 1 0 2.20 0.0506
## 2 1 1.44 0.0486
## 3 2 2.16 0.0425
## 4 3 2.84 0.0398
## 5 4 3.64 0.0384
## 6 5 4.30 0.0400
## 7 6 4.98 0.0442
## 8 7 5.53 0.0456
## 9 8 5.96 0.0489
Specification:
Decide on the criteria for selecting the best count regression model. Will you select models with slightly worse performance if it makes more sense or is more parsimonious? Discuss why you selected your models.
For the count regression model, will you use a metric such as AIC, average squared error, etc.? Be sure to explain how you can make inferences from the model, and discuss other relevant model output. If you like the multiple linear regression model the best, please say why. However, you must select a count regression model for model deployment. Using the training data set, evaluate the performance of the count regression model. Make predictions using the evaluation data set.
model-1 fit.poisson.imp fit.poisson.imp.aj Poisson GLM: imputation
model-2 fit.poisson.imp_zeroinf Zero-inflated Poisson: imputation
model-3 fit.qp.imp Quasibinomial GLM: imputation
model-4 fit.qp.imp_zeroinf Zero-inflated Poisson: imputation (repeated)
model-5 fit.nb.imp fit.nb.imp.aj Negative Binomial GLM: imputation
model-6 fit.nb.imp_zeroinf Zero-inflated Negative Binomial: imputation
model-7 fit.lm.imp fit.lm.imp.aj Multiple Linear Regression Model: imputation
model-8 fit.ml.imp Multinomial Logistic Regression Model: imputation
model-9 fit.poisson.zeros fit.poisson.zeros_aj Poisson GLM: NAs replaced by 0
model-10 fit.poisson.zeros_zeroinf Zero-inflated Poisson: NAs replaced by 0
model-11 fit.poisson.nozeros Poisson GLM: NAs removed
model-12 fit.poisson.nozeros_zeroinf Zero-inflated Poisson: NAs removed
model-13 fit.nb.zeros Negative Binomial GLM: NAs replaced by 0
model-14 fit.nb.zeros_zeroinf Zero-inflated Negative Binomial: NAs replaced by 0
model-15 fit.nb.nozeros Negative Binomial GLM: NAs removed
model-16 fit.nb.nozeros_zeroinf Zero-inflated Negative Binomial: NAs removed
model-17 fit.mlr.zeros Multiple Linear Regression: NAs replaced by 0
model-18 fit.mlr.nozeros Multiple Linear Regression: NAs removed
Though the multiple linear regression models are not GLMs and assume a normally distributed response, we should remember that the TARGET variable was roughly normally distributed before the zeros replaced the NAs.
There seem to be more significant predictors in the “with zeros” models than in the “no zeros” models. Does this translate into higher accuracy?
# Mean absolute deviation of the predictions from the observed mean of TARGET
# (reported as "SD" in the summary table; note this is not a true standard deviation)
calc_sd <- function(fit, data){
prediction <- predict(fit, newdata=data, type='response')
difference <- prediction - mean(data$TARGET)
return (mean(abs(difference)))
}
# Root mean squared error of the predictions on the supplied (held-out) data
calc_se <- function(fit, data){
prediction <- predict(fit, newdata=data, type='response')
difference <- prediction - data$TARGET
return (sqrt(mean(difference^2)))
}
# Function that returns Root Mean Squared Error
rmse <- function(error)
{
sqrt(mean(error^2))
}
# Function that returns Mean Absolute Error
mae <- function(error)
{
mean(abs(error))
}
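As a quick illustration of these helpers, the test-set errors of the no-zeros negative binomial fit can be summarized in one line (a sketch, assuming the fit and test set are still in scope):
# test-set error of the no-zeros NB model via the helper functions
err <- wine_training_data.nozeros.test$TARGET -
  predict(fit.nb.nozeros, newdata = wine_training_data.nozeros.test, type = "response")
c(RMSE = rmse(err), MAE = mae(err))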
##################################################################
# SD Calcs:
##################################################################
sd.poisson.imp.aj <- calc_sd(fit.poisson.imp.aj, test)
sd.poisson.imp_zeroinf <- calc_sd(fit.poisson.imp_zeroinf, test)
sd.fit.qp.imp <- calc_sd(fit.qp.imp, test)
#sd.qp.imp_zeroinf <- calc_sd(fit.qp.imp_zeroinf, test)
sd.nb.imp.aj <- calc_sd(fit.nb.imp.aj, test)
sd.nb.imp_zeroinf <- calc_sd(fit.nb.imp_zeroinf, test)
sd.lm.imp.aj <- calc_sd(fit.lm.imp, test)
sd.ml.imp <- calc_sd(fit.ml.imp, mldata)
sd.poisson.nozeros <- calc_sd(fit.poisson.nozeros, wine_training_data.nozeros.test)
sd.nb.nozeros <- calc_sd(fit.nb.nozeros, wine_training_data.nozeros.test)
sd.poisson.zeros <- calc_sd(fit.poisson.zeros, wine_training_data.zeros.test)
sd.nb.zeros <- calc_sd(fit.nb.zeros, wine_training_data.zeros.test)
sd.mlr.zeros <- calc_sd(fit.mlr.zeros, wine_training_data.zeros.test)
sd.poisson.zeros_zeroinf <- calc_sd(fit.poisson.zeros_zeroinf, wine_training_data.zeros.test)
sd.poisson.nozeros_zeroinf <- calc_sd(fit.poisson.nozeros_zeroinf, wine_training_data.zeros.test)
sd.nb.zeros_zeroinf <- calc_sd(fit.nb.zeros_zeroinf , wine_training_data.zeros.test)
sd.nb.nozeros_zeroinf <- calc_sd(fit.nb.nozeros_zeroinf, wine_training_data.zeros.test)
sd.mlr.nozeros <- calc_sd(fit.mlr.nozeros, wine_training_data.zeros.test)
SD <- format(c(sd.poisson.imp.aj,sd.poisson.imp_zeroinf,sd.fit.qp.imp,sd.nb.imp.aj,sd.nb.imp_zeroinf,sd.lm.imp.aj,sd.ml.imp,sd.poisson.zeros,sd.poisson.zeros_zeroinf,sd.poisson.nozeros,sd.poisson.nozeros_zeroinf,sd.nb.zeros,sd.nb.zeros_zeroinf,sd.nb.nozeros,sd.nb.nozeros_zeroinf,sd.mlr.zeros,sd.mlr.nozeros), digits=2, nsmall=2)
##################################################################
# RMSE Calcs:
##################################################################
rmse.poisson.imp.aj <- rmse(fit.poisson.imp.aj$residuals)
rmse.poisson.imp_zeroinf <- rmse(fit.poisson.imp_zeroinf$residuals)
rmse.fit.qp.imp <- rmse(fit.qp.imp$residuals)
#rmse.qp.imp_zeroinf <- rmse(fit.qp.imp_zeroinf$residuals)
rmse.nb.imp.aj <- rmse(fit.nb.imp.aj$residuals)
rmse.nb.imp_zeroinf <- rmse(fit.nb.imp_zeroinf$residuals)
rmse.lm.imp.aj <- rmse(fit.lm.imp$residuals)
rmse.ml.imp <- rmse(fit.ml.imp$residuals)
rmse.poisson.zeros <- rmse(fit.poisson.zeros_aj$residuals)
rmse.poisson.zeros_zeroinf <- rmse(fit.poisson.zeros_zeroinf$residuals)
rmse.poisson.nozeros <- rmse(fit.poisson.nozeros$residuals)
rmse.poisson.nozeros_zeroinf <- rmse(fit.poisson.nozeros_zeroinf$residuals)
rmse.nb.zeros <- rmse(fit.nb.zeros$residuals)
rmse.nb.zeros_zeroinf <- rmse(fit.nb.zeros_zeroinf$residuals)
rmse.nb.nozeros <- rmse(fit.nb.nozeros$residuals)
rmse.nb.nozeros_zeroinf <- rmse(fit.nb.nozeros_zeroinf$residuals)
rmse.mlr.zeros <- rmse(fit.mlr.zeros$residuals)
rmse.mlr.nozeros <- rmse(fit.mlr.nozeros$residuals)
RMSE <- format(c(rmse.poisson.imp.aj,rmse.poisson.imp_zeroinf,rmse.fit.qp.imp,rmse.nb.imp.aj,rmse.nb.imp_zeroinf,rmse.lm.imp.aj,rmse.ml.imp, rmse.poisson.zeros,rmse.poisson.zeros_zeroinf,rmse.poisson.nozeros,rmse.poisson.nozeros_zeroinf,rmse.nb.zeros,rmse.nb.zeros_zeroinf,rmse.nb.nozeros,rmse.nb.nozeros_zeroinf,rmse.mlr.zeros,rmse.mlr.nozeros), digits=2, nsmall=2)
##################################################################
# SE Calcs:
##################################################################
se.poisson.imp.aj <- calc_se(fit.poisson.imp.aj, test)
se.poisson.imp_zeroinf <- calc_se(fit.poisson.imp_zeroinf, test)
se.fit.qp.imp <- calc_se(fit.qp.imp, test)
#se.qp.imp_zeroinf <- calc_se(fit.qp.imp_zeroinf, test)
se.nb.imp.aj <- calc_se(fit.nb.imp.aj, test)
se.nb.imp_zeroinf <- calc_se(fit.nb.imp_zeroinf, test)
se.lm.imp.aj <- calc_se(fit.lm.imp, test)
se.ml.imp <- calc_se(fit.ml.imp, mldata)
se.poisson.nozeros <- calc_se(fit.poisson.nozeros, wine_training_data.nozeros.test)
se.nb.nozeros <- calc_se(fit.nb.nozeros, wine_training_data.nozeros.test)
se.poisson.zeros <- calc_se(fit.poisson.zeros, wine_training_data.zeros.test)
se.nb.zeros <- calc_se(fit.nb.zeros, wine_training_data.zeros.test)
se.mlr.zeros <- calc_se(fit.mlr.zeros, wine_training_data.zeros.test)
se.poisson.zeros_zeroinf <- calc_se(fit.poisson.zeros_zeroinf, wine_training_data.zeros.test)
se.poisson.nozeros_zeroinf <- calc_se(fit.poisson.nozeros_zeroinf, wine_training_data.zeros.test)
se.nb.zeros_zeroinf <- calc_se(fit.nb.zeros_zeroinf , wine_training_data.zeros.test)
se.nb.nozeros_zeroinf <- calc_se(fit.nb.nozeros_zeroinf, wine_training_data.zeros.test)
se.mlr.nozeros <- calc_se(fit.mlr.nozeros, wine_training_data.zeros.test)
SE <- format(c(se.poisson.imp.aj,se.poisson.imp_zeroinf,se.fit.qp.imp,se.nb.imp.aj,se.nb.imp_zeroinf,se.lm.imp.aj,se.ml.imp,se.poisson.zeros,se.poisson.zeros_zeroinf,se.poisson.nozeros,se.poisson.nozeros_zeroinf,se.nb.zeros,se.nb.zeros_zeroinf,se.nb.nozeros,se.nb.nozeros_zeroinf,se.mlr.zeros,se.mlr.nozeros), digits=2, nsmall=2)
##################################################################
# AIC Calcs:
##################################################################
AIC <- format(c(AIC_1,AIC_1z,AIC_2,AIC_3,AIC_3z,AIC_4,AIC_5,AIC_zero_1,AIC_zero_1z,AIC_nozero_1,AIC_nozero_1z,AIC_zero_2,AIC_zero_2z,AIC_nozero_2,AIC_nozero_2z,AIC_zero_3,AIC_nozero_3), digits=2, nsmall=2)
##################################################################
# BIC Calcs:
##################################################################
BIC <- format(c(BIC_1,BIC_1z,BIC_2,BIC_3,BIC_3z,BIC_4,BIC_5,BIC_zero_1,BIC_zero_1z,BIC_nozero_1,BIC_nozero_1z,BIC_zero_2,BIC_zero_2z,BIC_nozero_2,BIC_nozero_2z,BIC_zero_3,BIC_nozero_3), digits=2, nsmall=2)
##################################################################
# MDL Co-efficients:
##################################################################
#all_fits <- c(fit.poisson.nozeros,fit.nb.nozeros,fit.mlr.nozeros,fit.poisson.zeros,fit.nb.zeros,fit.mlr.zeros,fit.nb.zeroinfl)
##################################################################
# LogLik Calcs:
##################################################################
LogLik <- format(c(logLik_1,logLik_1z,logLik_2,logLik_3,logLik_3z,logLik_4,logLik_5,logLik_zero_1,logLik_zero_1z,logLik_nozero_1,logLik_nozero_1z,
logLik_zero_2,logLik_zero_2z,logLik_nozero_2,logLik_nozero_2z,logLik_zero_3,logLik_nozero_3), digits=2, nsmall=2)
##################################################################
# R.squared Calcs:
##################################################################
R.squared <- format(round(c(pR2_1,pR2_1z,pR2_2,pR2_3,pR2_3z,pR2_4,pR2_5,pR2_zero_1,pR2_zero_1z,pR2_nozero_1,pR2_nozero_1z,pR2_zero_2,pR2_zero_2z,pR2_nozero_2,pR2_nozero_2z,pR2_zero_3,pR2_nozero_3),2))
##################################################################
# goodness-of-fit:
##################################################################
GOF <- format(round(c(p_poisson_aj,NA,p_qp,p_nb_aj,NA,p_lm_aj ,NA,p_poisson_zero_aj,NA,p_poisson_nozero,NA,p_nb_zero_aj,NA,p_nb_nozero,NA,p_mlr_zero,p_mlr_nozero),4))
##################################################################
# Dispersion Test:
##################################################################
DispersionTest <- format(c(c_1,NA,c_2,c_3,NA,c_4,NA,c_6,NA,c_7,NA,c_8,NA,c_9,NA,c_10,NA))
Model <- c("Poisson imputation","Zero-inflated Poisson imputation","Quasibinomial imputation","Negative Binomial imputation","Zero-inflated Negative Binomial imputation","Multiple Linear imputation"," Multinomial Logistic imputation",
"Poisson w/ 0s", "Zero-inflated Poisson w/ 0s","Poisson no 0s", "Zero-inflated Poisson no 0s","Negative Binomial w/0s","Zero-inflated Negative Binomial w/0s","Negative Binomial no 0s", "Zero-inflated Negative Binomial no 0s", "Multiple Linear Regression w/ 0s", "Multiple Linear Regression no 0s" )
kable(model_summary<- cbind(Model, GOF,DispersionTest,R.squared ,RMSE, SE, SD, AIC, BIC, LogLik))
Model | GOF | DispersionTest | R.squared | RMSE | SE | SD | AIC | BIC | LogLik |
---|---|---|---|---|---|---|---|---|---|
Poisson imputation | 0.00 | 1.203 | 0.33 | 0.68 | 1.41 | 1.15 | 34761.48 | 34861.54 | -17366.74 |
Zero-inflated Poisson imputation | NA | NA | 0.14 | 1.37 | 1.37 | 1.16 | 31001.86 | 31216.26 | -15470.93 |
Quasibinomial imputation | 0.00 | 1.203 | 0.33 | 0.68 | 1.41 | 1.15 | NA | NA | NA |
Negative Binomial imputation | 0.00 | 1.259 | 0.33 | 0.68 | 1.41 | 1.15 | 34763.70 | 34870.90 | -17366.85 |
Zero-inflated Negative Binomial imputation | NA | NA | 0.14 | 0.68 | 1.37 | 1.16 | 31003.86 | 31225.40 | -15470.93 |
Multiple Linear imputation | 0.00 | 1.992 | 0.47 | 1.43 | 1.43 | 1.02 | 33107.94 | 33215.14 | -16538.97 |
Multinomial Logistic imputation | NA | NA | 0.28 | 0.42 | 0.13 | 0.13 | 33107.94 | 33215.14 | -16538.97 |
Poisson w/ 0s | 0.00 | 1.078 | 0.39 | 0.73 | 1.40 | 1.14 | 34346.54 | 34432.57 | -17161.27 |
Zero-inflated Poisson w/ 0s | NA | NA | 0.00 | 1.28 | 1.25 | 1.23 | 30758.54 | 30973.61 | -15349.27 |
Poisson no 0s | 1.00 | 0.633 | 0.32 | 0.38 | 1.16 | 0.87 | 25466.10 | 25514.15 | -12726.05 |
Zero-inflated Poisson no 0s | NA | NA | 0.09 | 1.12 | 1.33 | 1.47 | 24547.98 | 24753.91 | -12243.99 |
Negative Binomial w/0s | 0.00 | 1.078 | 0.39 | 0.65 | 1.40 | 1.14 | 34348.83 | 34442.03 | -17161.41 |
Zero-inflated Negative Binomial w/0s | NA | NA | 0.16 | 1.28 | 1.25 | 1.23 | 30760.54 | 30982.78 | -15349.27 |
Negative Binomial no 0s | 1.00 | 0.633 | 0.32 | 0.38 | 1.16 | 0.87 | 25468.21 | 25523.13 | -12726.11 |
Zero-inflated Negative Binomial no 0s | NA | NA | 0.09 | 1.12 | 1.33 | 1.47 | 24549.98 | 24762.78 | -12243.99 |
Multiple Linear Regression w/ 0s | 0.00 | 1.760 | 0.00 | 1.33 | 1.32 | 1.18 | 32671.79 | 32757.82 | -16323.90 |
Multiple Linear Regression no 0s | 0.91 | NA | 0.16 | 1.15 | 1.38 | 1.02 | 22112.27 | 22180.92 | -11046.14 |
model_id <- format(c("M1", "M1Z","M2","M3","M3Z","M4","M5","M6","M6z","M7","M7z","M8","M8z","M9","M9z","M10","M11"))
model_name<- format(c("Poisson GLM: imputation","Zero-inflated Poisson: imputation","Quasibinomial GLM: imputation","Negative Binomial GLM:imputation","Zero-inflated Negative Binomial: imputation","Multiple Linear Regression Model: imputation"," Multinomial Logistic Regression Model: imputation","Poisson GLM: NAs replaced by 0 ","Zero-inflated Poisson: NAs replaced by 0 ","Poisson GLM: NAs Removed","Zero-inflated Poisson: NAs Removed","Negative Binomial GLM: NAs replaced by 0 ","Zero-inflated Negative Binomial: NAs replaced by 0","Negative Binomial GLM: NAs Removed","Zero-inflated Negative Binomial: NAs Removed","Multiple Linear Regression: NAs replaced by 0","Multiple Linear Regression: NAs Removed"))
kable(model_name <- cbind(model_id,model_name))
model_id | model_name |
---|---|
M1 | Poisson GLM: imputation |
M1Z | Zero-inflated Poisson: imputation |
M2 | Quasibinomial GLM: imputation |
M3 | Negative Binomial GLM:imputation |
M3Z | Zero-inflated Negative Binomial: imputation |
M4 | Multiple Linear Regression Model: imputation |
M5 | Multinomial Logistic Regression Model: imputation |
M6 | Poisson GLM: NAs replaced by 0 |
M6z | Zero-inflated Poisson: NAs replaced by 0 |
M7 | Poisson GLM: NAs Removed |
M7z | Zero-inflated Poisson: NAs Removed |
M8 | Negative Binomial GLM: NAs replaced by 0 |
M8z | Zero-inflated Negative Binomial: NAs replaced by 0 |
M9 | Negative Binomial GLM: NAs Removed |
M9z | Zero-inflated Negative Binomial: NAs Removed |
M10 | Multiple Linear Regression: NAs replaced by 0 |
M11 | Multiple Linear Regression: NAs Removed |
evl_df_combine <- cbind(evl_df_1,evl_df_1z[,c(2,3)],evl_df_2[,c(2,3)],evl_df_3[,c(2,3)],evl_df_3z[,c(2,3)],evl_df_4[,c(2,3)],evl_df_5[,2],evl_df_zero_1[,c(2,3)],evl_df_zero_1z[,c(2,3)],evl_df_nozero_1[,c(2,3)],evl_df_nozero_1z[,c(2,3)],evl_df_zero_2[,c(2,3)],evl_df_zero_2z[,c(2,3)],evl_df_nozero_2[,c(2,3)],evl_df_nozero_2z[,c(2,3)],evl_df_zero_3[,c(2,3)],evl_df_nozero_3[,c(2,3)])
evl_df_combine <- cbind(seq(0,8),evl_df_combine)
colnames(evl_df_combine)<-c("ID","TARGET","M1.Mean", "M1.SE","M1Z.Count", "M1Z.Zero","M2.Mean", "M2.SE","M3.Mean", "M3.SE","M3Z.Count","M3Z.Zero","M4.Mean", "M4.SE","M5.pred","M6.Mean","M6.SE","M6z.Count","M6z.Zero","M7.Mean","M7.SE","M7z.Count","M7z.Zero","M8.Mean","M8.SE","M8z.Count","M8z.Zero","M9.Mean","M9.SE","M9z.Count","M9z.Zero","M10.Mean","M10.SE","M11.Mean","M11.SE")
kable(round(evl_df_combine,2),'html') %>%
kable_styling(bootstrap_options = c('bordered','hover','condensed',full_width=F))
ID | TARGET | M1.Mean | M1.SE | M1Z.Count | M1Z.Zero | M2.Mean | M2.SE | M3.Mean | M3.SE | M3Z.Count | M3Z.Zero | M4.Mean | M4.SE | M5.pred | M6.Mean | M6.SE | M6z.Count | M6z.Zero | M7.Mean | M7.SE | M7z.Count | M7z.Zero | M8.Mean | M8.SE | M8z.Count | M8z.Zero | M9.Mean | M9.SE | M9z.Count | M9z.Zero | M10.Mean | M10.SE | M11.Mean | M11.SE |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
0 | 0 | 1.93 | 0.04 | 3.29 | 0.44 | 1.93 | 0.04 | 1.93 | 0.04 | 3.29 | 0.44 | 1.91 | 0.05 | 0 | 1.81 | 0.04 | 3.16 | 0.55 | 2.85 | 0.05 | 3.31 | 0.22 | 1.57 | 0.04 | 3.16 | 0.55 | 2.46 | 0.05 | 3.31 | 0.22 | 1.68 | 0.05 | 2.20 | 0.05 |
1 | 1 | 1.51 | 0.04 | 2.28 | 0.18 | 1.51 | 0.04 | 1.51 | 0.04 | 2.28 | 0.18 | 1.39 | 0.05 | 1 | 1.66 | 0.03 | 2.25 | 0.25 | 2.35 | 0.04 | 2.53 | 0.04 | 1.42 | 0.04 | 2.25 | 0.25 | 1.99 | 0.04 | 2.53 | 0.04 | 1.33 | 0.05 | 1.44 | 0.05 |
2 | 2 | 2.07 | 0.05 | 2.69 | 0.18 | 2.07 | 0.04 | 2.07 | 0.05 | 2.69 | 0.18 | 2.07 | 0.05 | 2 | 2.12 | 0.04 | 2.65 | 0.18 | 2.63 | 0.04 | 2.79 | 0.05 | 2.06 | 0.04 | 2.65 | 0.18 | 2.41 | 0.04 | 2.79 | 0.05 | 2.07 | 0.04 | 2.16 | 0.04 |
3 | 3 | 2.69 | 0.05 | 3.16 | 0.16 | 2.69 | 0.05 | 2.69 | 0.05 | 3.16 | 0.16 | 2.71 | 0.05 | 3 | 2.63 | 0.05 | 3.15 | 0.14 | 3.07 | 0.05 | 3.25 | 0.05 | 2.75 | 0.05 | 3.15 | 0.14 | 2.90 | 0.05 | 3.25 | 0.05 | 2.73 | 0.04 | 2.84 | 0.04 |
4 | 4 | 3.46 | 0.07 | 3.83 | 0.12 | 3.46 | 0.06 | 3.46 | 0.07 | 3.83 | 0.12 | 3.48 | 0.05 | 4 | 3.40 | 0.06 | 3.83 | 0.08 | 3.69 | 0.06 | 3.87 | 0.04 | 3.59 | 0.06 | 3.83 | 0.08 | 3.59 | 0.05 | 3.87 | 0.04 | 3.54 | 0.04 | 3.64 | 0.04 |
5 | 5 | 4.16 | 0.09 | 4.54 | 0.08 | 4.16 | 0.08 | 4.16 | 0.09 | 4.54 | 0.08 | 4.14 | 0.05 | 5 | 4.18 | 0.07 | 4.50 | 0.06 | 4.35 | 0.07 | 4.50 | 0.03 | 4.25 | 0.08 | 4.50 | 0.06 | 4.27 | 0.07 | 4.50 | 0.03 | 4.20 | 0.04 | 4.30 | 0.04 |
6 | 6 | 4.93 | 0.11 | 5.34 | 0.04 | 4.93 | 0.10 | 4.93 | 0.11 | 5.34 | 0.04 | 4.80 | 0.06 | 6 | 5.19 | 0.10 | 5.32 | 0.04 | 5.22 | 0.09 | 5.31 | 0.02 | 4.86 | 0.10 | 5.32 | 0.04 | 5.13 | 0.09 | 5.31 | 0.02 | 4.89 | 0.05 | 4.98 | 0.04 |
7 | 7 | 5.37 | 0.13 | 6.09 | 0.02 | 5.37 | 0.13 | 5.37 | 0.13 | 6.09 | 0.02 | 5.25 | 0.06 | 7 | 6.11 | 0.12 | 6.08 | 0.05 | 6.04 | 0.11 | 6.04 | 0.00 | 5.41 | 0.12 | 6.08 | 0.05 | 5.94 | 0.11 | 6.04 | 0.00 | 5.40 | 0.05 | 5.53 | 0.05 |
8 | 8 | 5.24 | 0.15 | 6.44 | 0.01 | 5.24 | 0.14 | 5.24 | 0.15 | 6.44 | 0.01 | 5.31 | 0.07 | 8 | 6.70 | 0.16 | 6.72 | 0.00 | 6.56 | 0.12 | 6.56 | 0.00 | 6.00 | 0.16 | 6.72 | 0.00 | 6.56 | 0.12 | 6.56 | 0.00 | 5.77 | 0.06 | 5.96 | 0.05 |
# Generated palette with rich rainbow and dark (12, s = 0.6, v = 0.75)
mixcolor <- c("#1B9E77", "#666666", "#5e1ea6", "#7570B3", "#A6761D", "#D95F02", "#E6AB02", "#E7298A", "#000040", "#000093", "#0d4701", "#0076FF", "#0ff4ff", "#49FB25", "#FEEA02", "#FF3300","#eeff24","#ff24ef")
richcolor <- c("#000041", "#FF3300", "#0081FF", "#80FE1A", "#FDEE02", "#FFAB00" )
ggplot(evl_df_combine,aes(x=ID,y=TARGET))+
geom_line(aes(y = M1.Mean, colour = "Poisson-imputation"),size = 1)+
geom_line(aes(y = M1Z.Count, colour = "Zero-inflated Poisson-imputation"),size = 1)+
geom_line(aes(y = M2.Mean, colour = "Quasibinomial-imputation"),size = 1) +
geom_line(aes(y = M3.Mean, colour = "Negative Binomial-imputation"),size = 1) +
geom_line(aes(y = M3Z.Count, colour = "Zero-inflated Negative Binomial-imputation"),size = 1)+
geom_line(aes(y = M4.Mean, colour = "Multiple Linear-imputation"),size = 1) +
geom_line(aes(y = M5.pred, colour = "Multinomial Logistic-imputation"),size = 1)+
geom_line(aes(y = M6.Mean, colour = "Poisson w/ 0s"),size = 1) +
geom_line(aes(y = M6z.Count, colour = "Zero-inflated Poisson w/ 0s"),size = 1) +
geom_line(aes(y = M7.Mean, colour = "Poisson no 0s"),size = 1) +
geom_line(aes(y = M7z.Count, colour = "Zero-inflated Poisson no 0s"),size = 1) +
geom_line(aes(y = M8.Mean, colour = "Negative Binomial w/0s"),size = 1) +
geom_line(aes(y = M8z.Count, colour = "Zero-inflated Negative Binomial w/0s"),size = 1) +
geom_line(aes(y = M9.Mean, colour = "Negative Binomial no 0s"),size = 1) +
geom_line(aes(y = M9z.Count, colour = "Zero-inflated Negative Binomial no 0s"),size = 1) +
geom_line(aes(y = M10.Mean, colour = "Multiple Linear Regression w/0s"),size = 1)+
geom_line(aes(y = M11.Mean, colour = "Multiple Linear Regression no 0s"),size = 1)+
geom_bar(stat="identity",fill="orange",color="grey",alpha=0.2)+
theme_bw()+
scale_color_manual(values = mixcolor)
ggplot(evl_df_combine,aes(x=ID,y=TARGET))+
geom_line(aes(y = M3.Mean, colour = "Negative Binomial-imputation"),size = 1.2) +
geom_line(aes(y = M5.pred, colour = "Multinomial Logistic-imputation"),size = 1.2)+
geom_line(aes(y = M7.Mean, colour = "Poisson no 0s"),size = 1.2) +
geom_line(aes(y = M8.Mean, colour = "Negative Binomial w/0s"),size = 1.2) +
geom_line(aes(y = M9.Mean, colour = "Negative Binomial no 0s"),size = 1.5) +
geom_bar(stat="identity",fill="green",color="grey",alpha=0.3)+
theme_bw()+
scale_color_manual(values = richcolor)
Before selecting a model, a quick explanation of why the “no ZEROs” models performed better:
One might think that keeping the Zeros-for-NAs records (rather than removing them) would at worst leave a model’s accuracy unchanged. In this case, however, we saw a relatively large drop in the models’ performance when those records were included. Why would this be?
One probable explanation is that customers who actually bought the product took the time to fill out their surveys accurately, while customers who didn’t purchase (with less stake in the game and/or less knowledge of the product) simply did not contribute such useful data. Once those records are removed, the rationale for a zero-inflated model largely disappears, and a linear model with a normal response again seems reasonable.
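A formal check of this intuition is the Vuong test from pscl, which compares a zero-inflated fit against its ordinary counterpart on the same data. A sketch, assuming both zeros-data Poisson fits are still in scope:
# Vuong non-nested test: a positive statistic favors the first (zero-inflated) model
vuong(fit.poisson.zeros_zeroinf, fit.poisson.zeros)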
We know that Poisson regression is a special case of negative binomial regression (the limiting case in which the variance equals the mean), but here the Poisson regression did not yield more accurate results. If there is overdispersion, the estimates from a Poisson model are consistent but inefficient, and the box plots in the Data Exploration section suggest that overdispersion may indeed be present.
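Because the Poisson model is nested in the negative binomial, a likelihood-ratio test gives a quick formal check for overdispersion. A sketch on the no-zeros fits, assuming both use the same predictors; the dispersion parameter sits on the boundary of its space, so the chi-square tail is conventionally halved:
# LR test of Poisson (H0) against negative binomial (H1)
lr <- 2 * (as.numeric(logLik(fit.nb.nozeros)) - as.numeric(logLik(fit.poisson.nozeros)))
c(LR = lr, p.value = 0.5 * pchisq(lr, df = 1, lower.tail = FALSE))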
When comparing models fitted by maximum likelihood to the same data, the smaller the AIC or BIC, the better the fit.
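For reference, both criteria penalize the maximized log-likelihood by model size, AIC = 2k - 2 log L and BIC = k log(n) - 2 log L, so they can be reproduced by hand from any fit; a sketch on the no-zeros NB model:
# AIC/BIC by hand: k = number of estimated parameters, n = number of observations
ll <- logLik(fit.nb.nozeros)
k  <- attr(ll, "df")
n  <- nobs(fit.nb.nozeros)
c(AIC = 2*k - 2*as.numeric(ll), BIC = log(n)*k - 2*as.numeric(ll))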
Based on this and the above results table, it seems that the Negative Binomial no 0s is our winner.
###########################################################
#Compare the pattern of missing values
###########################################################
summary(wine_train)
## INDEX TARGET FixedAcidity VolatileAcidity
## Min. : 1 Min. :0.00 Min. :-18.1 Min. :-2.79
## 1st Qu.: 4038 1st Qu.:2.00 1st Qu.: 5.2 1st Qu.: 0.13
## Median : 8110 Median :3.00 Median : 6.9 Median : 0.28
## Mean : 8070 Mean :3.03 Mean : 7.1 Mean : 0.32
## 3rd Qu.:12106 3rd Qu.:4.00 3rd Qu.: 9.5 3rd Qu.: 0.64
## Max. :16129 Max. :8.00 Max. : 34.4 Max. : 3.68
##
## CitricAcid ResidualSugar Chlorides FreeSulfurDioxide
## Min. :-3.24 Min. :-128 Min. :-1 Min. :-555
## 1st Qu.: 0.03 1st Qu.: -2 1st Qu.: 0 1st Qu.: 0
## Median : 0.31 Median : 4 Median : 0 Median : 30
## Mean : 0.31 Mean : 5 Mean : 0 Mean : 31
## 3rd Qu.: 0.58 3rd Qu.: 16 3rd Qu.: 0 3rd Qu.: 70
## Max. : 3.86 Max. : 141 Max. : 1 Max. : 623
## NA's :616 NA's :638 NA's :647
## TotalSulfurDioxide Density pH Sulphates
## Min. :-823 Min. :0.888 Min. :0 Min. :-3
## 1st Qu.: 27 1st Qu.:0.988 1st Qu.:3 1st Qu.: 0
## Median : 123 Median :0.994 Median :3 Median : 0
## Mean : 121 Mean :0.994 Mean :3 Mean : 1
## 3rd Qu.: 208 3rd Qu.:1.001 3rd Qu.:3 3rd Qu.: 1
## Max. :1057 Max. :1.099 Max. :6 Max. : 4
## NA's :682 NA's :395 NA's :1210
## Alcohol LabelAppeal AcidIndex STARS
## Min. :-5 Min. :-2.000 Min. : 4.00 Min. :1
## 1st Qu.: 9 1st Qu.:-1.000 1st Qu.: 7.00 1st Qu.:1
## Median :10 Median : 0.000 Median : 8.00 Median :2
## Mean :10 Mean :-0.009 Mean : 7.77 Mean :2
## 3rd Qu.:12 3rd Qu.: 1.000 3rd Qu.: 8.00 3rd Qu.:3
## Max. :26 Max. : 2.000 Max. :17.00 Max. :4
## NA's :653 NA's :3359
summary(wine_test)
## IN TARGET FixedAcidity VolatileAcidity
## Min. : 3 Mode:logical Min. :-18.2 Min. :-2.83
## 1st Qu.: 4018 NA's:3335 1st Qu.: 5.2 1st Qu.: 0.08
## Median : 7906 Median : 6.9 Median : 0.28
## Mean : 8048 Mean : 6.9 Mean : 0.31
## 3rd Qu.:12061 3rd Qu.: 9.0 3rd Qu.: 0.63
## Max. :16130 Max. : 33.5 Max. : 3.61
##
## CitricAcid ResidualSugar Chlorides FreeSulfurDioxide
## Min. :-3.12 Min. :-128.3 Min. :-1.1 Min. :-563
## 1st Qu.: 0.00 1st Qu.: -2.6 1st Qu.: 0.0 1st Qu.: 3
## Median : 0.31 Median : 3.6 Median : 0.0 Median : 30
## Mean : 0.31 Mean : 5.3 Mean : 0.1 Mean : 35
## 3rd Qu.: 0.60 3rd Qu.: 17.2 3rd Qu.: 0.2 3rd Qu.: 79
## Max. : 3.76 Max. : 145.4 Max. : 1.3 Max. : 617
## NA's :168 NA's :138 NA's :152
## TotalSulfurDioxide Density pH Sulphates
## Min. :-769 Min. :0.890 Min. :0.6 Min. :-3.1
## 1st Qu.: 27 1st Qu.:0.988 1st Qu.:3.0 1st Qu.: 0.3
## Median : 124 Median :0.995 Median :3.2 Median : 0.5
## Mean : 123 Mean :0.995 Mean :3.2 Mean : 0.5
## 3rd Qu.: 210 3rd Qu.:1.001 3rd Qu.:3.5 3rd Qu.: 0.8
## Max. :1004 Max. :1.100 Max. :6.2 Max. : 4.2
## NA's :157 NA's :104 NA's :310
## Alcohol LabelAppeal AcidIndex STARS
## Min. :-4.2 Min. :-2.000 Min. : 5.00 Min. :1
## 1st Qu.: 9.0 1st Qu.:-1.000 1st Qu.: 7.00 1st Qu.:1
## Median :10.4 Median : 0.000 Median : 8.00 Median :2
## Mean :10.6 Mean : 0.013 Mean : 7.75 Mean :2
## 3rd Qu.:12.5 3rd Qu.: 1.000 3rd Qu.: 8.00 3rd Qu.:3
## Max. :25.6 Max. : 2.000 Max. :17.00 Max. :4
## NA's :185 NA's :841
#Missingness Map
miss_plot <- plot_missing(wine_train)
miss_plot <- plot_missing(wine_test[,-2])
wine_test$ResidualSugar[is.na(wine_test$ResidualSugar)] <- sample(wine_test$ResidualSugar[!is.na(wine_test$ResidualSugar)])
## Warning in wine_test$ResidualSugar[is.na(wine_test$ResidualSugar)] <-
## sample(wine_test$ResidualSugar[!is.na(wine_test$ResidualSugar)]): number of
## items to replace is not a multiple of replacement length
wine_test$Chlorides[is.na(wine_test$Chlorides)] <- sample(wine_test$Chlorides[!is.na(wine_test$Chlorides)])
## Warning in wine_test$Chlorides[is.na(wine_test$Chlorides)] <-
## sample(wine_test$Chlorides[!is.na(wine_test$Chlorides)]): number of items
## to replace is not a multiple of replacement length
wine_test$FreeSulfurDioxide[is.na(wine_test$FreeSulfurDioxide)] <- sample(wine_test$FreeSulfurDioxide[!is.na(wine_test$FreeSulfurDioxide)])
## Warning in wine_test$FreeSulfurDioxide[is.na(wine_test
## $FreeSulfurDioxide)] <- sample(wine_test$FreeSulfurDioxide[!is.na(wine_test
## $FreeSulfurDioxide)]): number of items to replace is not a multiple of
## replacement length
wine_test$TotalSulfurDioxide[is.na(wine_test$TotalSulfurDioxide)] <- sample(wine_test$TotalSulfurDioxide[!is.na(wine_test$TotalSulfurDioxide)])
## Warning in wine_test$TotalSulfurDioxide[is.na(wine_test
## $TotalSulfurDioxide)] <- sample(wine_test$TotalSulfurDioxide[!
## is.na(wine_test$TotalSulfurDioxide)]): number of items to replace is not a
## multiple of replacement length
wine_test$pH[is.na(wine_test$pH)] <- sample(wine_test$pH[!is.na(wine_test$pH)])
## Warning in wine_test$pH[is.na(wine_test$pH)] <- sample(wine_test$pH[!
## is.na(wine_test$pH)]): number of items to replace is not a multiple of
## replacement length
wine_test$Sulphates[is.na(wine_test$Sulphates)] <- sample(wine_test$Sulphates[!is.na(wine_test$Sulphates)])
## Warning in wine_test$Sulphates[is.na(wine_test$Sulphates)] <-
## sample(wine_test$Sulphates[!is.na(wine_test$Sulphates)]): number of items
## to replace is not a multiple of replacement length
wine_test$Alcohol[is.na(wine_test$Alcohol)] <- sample(wine_test$Alcohol[!is.na(wine_test$Alcohol)])
## Warning in wine_test$Alcohol[is.na(wine_test$Alcohol)] <- sample(wine_test
## $Alcohol[!is.na(wine_test$Alcohol)]): number of items to replace is not a
## multiple of replacement length
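The recycling warnings above arise because sample() returns a permutation of all observed values, which is then truncated to fill the NA slots. Drawing exactly one value per missing entry avoids the warning; a sketch of the pattern with a hypothetical helper (replace = TRUE also guards against having more NAs than observed values):
# hypothetical helper: draw exactly one observed value per NA
impute_sample <- function(x) {
  idx <- is.na(x)
  x[idx] <- sample(x[!idx], size = sum(idx), replace = TRUE)
  x
}
# e.g. wine_test$ResidualSugar <- impute_sample(wine_test$ResidualSugar)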
wine_test$STARS[is.na(wine_test$STARS)] <- 0
wine_test.zeros <- wine_test
wine_test.nozeros <- wine_test[wine_test$STARS != 0,]
#Missingness Map
miss_plot <- plot_missing(wine_test[,-2])
pred_evalation <- predict(fit.nb.nozeros, newdata=wine_test, type="response")
pred_evalation_df <- cbind(wine_test$IN, pred_evalation,wine_test[,-c(1,2)])
colnames(pred_evalation_df)[colnames(pred_evalation_df)=='wine_test$IN']<- 'IN'
#write.csv(pred_evalation_df,"predicted_evalation.csv")
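Since TARGET is a number of cases purchased, the deployed predictions can also be rounded to whole cases before export; a quick sketch, with a frequency table as a sanity check:
# round predicted means to whole cases and inspect their distribution
pred_cases <- round(pred_evalation)
table(pred_cases)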
kable(head(pred_evalation_df,10),'html') %>%
kable_styling(bootstrap_options = c('bordered','hover','condensed',full_width=F))
IN | pred_evalation | FixedAcidity | VolatileAcidity | CitricAcid | ResidualSugar | Chlorides | FreeSulfurDioxide | TotalSulfurDioxide | Density | pH | Sulphates | Alcohol | LabelAppeal | AcidIndex | STARS |
---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
3 | 2.23 | 5.4 | -0.860 | 0.27 | -10.7 | 0.092 | 23 | 398 | 0.985 | 5.02 | 0.64 | 12.30 | -1 | 6 | 0 |
9 | 3.81 | 12.4 | 0.385 | -0.76 | -19.7 | 1.169 | -37 | 68 | 0.990 | 3.37 | 1.09 | 16.00 | 0 | 6 | 2 |
10 | 2.68 | 7.2 | 1.750 | 0.17 | -33.0 | 0.065 | 9 | 76 | 1.046 | 4.61 | 0.68 | 8.55 | 0 | 8 | 1 |
18 | 2.40 | 6.2 | 0.100 | 1.80 | 1.0 | -0.179 | 104 | 89 | 0.989 | 3.20 | 2.11 | 12.30 | -1 | 8 | 1 |
21 | 2.08 | 11.4 | 0.210 | 0.28 | 1.2 | 0.038 | 70 | 53 | 1.029 | 2.54 | -0.07 | 4.80 | 0 | 10 | 0 |
30 | 5.88 | 17.6 | 0.040 | -1.15 | 1.4 | 0.535 | -250 | 140 | 0.950 | 3.06 | -0.02 | 11.40 | 1 | 8 | 4 |
31 | 3.34 | 15.5 | 0.530 | -0.53 | 4.6 | 1.263 | 10 | 17 | 1.000 | 3.07 | 0.75 | 8.50 | 0 | 12 | 3 |
37 | 2.90 | 15.9 | 1.190 | 1.14 | 31.9 | -0.299 | 115 | 381 | 1.034 | 2.99 | 0.31 | 11.40 | 1 | 7 | 0 |
39 | 1.84 | 11.6 | 0.320 | 0.55 | -50.9 | 0.076 | 35 | 83 | 1.000 | 3.32 | 2.18 | -0.50 | 0 | 12 | 0 |
47 | 2.46 | 3.8 | 0.220 | 0.31 | -7.7 | 0.039 | 40 | 129 | 0.906 | 4.72 | -0.64 | 10.90 | 0 | 7 | 0 |