library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(stringr)
library(mice)
## Warning: package 'mice' was built under R version 3.6.2
##
## Attaching package: 'mice'
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(tidyr)
library(corrplot)
## Warning: package 'corrplot' was built under R version 3.6.3
## corrplot 0.84 loaded
library(VIM)
## Warning: package 'VIM' was built under R version 3.6.2
## Loading required package: colorspace
## Loading required package: grid
## Loading required package: data.table
##
## Attaching package: 'data.table'
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(magrittr)
##
## Attaching package: 'magrittr'
## The following object is masked from 'package:tidyr':
##
## extract
library(pscl)
## Warning: package 'pscl' was built under R version 3.6.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
wine_raw <- read.csv("https://raw.githubusercontent.com/deepakmongia/Data621/master/HW-5/data/wine-training-data.csv",
header = TRUE, row.names = 1)
print(head(wine_raw))
## TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1 3 3.2 1.160 -0.98 54.2 -0.567
## 2 3 4.5 0.160 -0.81 26.1 -0.425
## 4 5 7.1 2.640 -0.88 14.8 0.037
## 5 3 5.7 0.385 0.04 18.8 -0.425
## 6 4 8.0 0.330 -1.26 9.4 NA
## 7 0 11.3 0.320 0.59 2.2 0.556
## FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol
## 1 NA 268 0.99280 3.33 -0.59 9.9
## 2 15 -327 1.02792 3.38 0.70 NA
## 4 214 142 0.99518 3.12 0.48 22.0
## 5 22 115 0.99640 2.24 1.83 6.2
## 6 -167 108 0.99457 3.12 1.77 13.7
## 7 -37 15 0.99940 3.20 1.29 15.4
## LabelAppeal AcidIndex STARS
## 1 0 8 2
## 2 -1 7 3
## 4 -1 8 3
## 5 -1 6 1
## 6 0 9 2
## 7 0 11 NA
print(dim(wine_raw))
## [1] 12795 15
print(str(wine_raw))
## 'data.frame': 12795 obs. of 15 variables:
## $ TARGET : int 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
## $ VolatileAcidity : num 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
## $ CitricAcid : num -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
## $ ResidualSugar : num 54.2 26.1 14.8 18.8 9.4 ...
## $ Chlorides : num -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
## $ FreeSulfurDioxide : num NA 15 214 22 -167 -37 287 523 -213 62 ...
## $ TotalSulfurDioxide: num 268 -327 142 115 108 15 156 551 NA 180 ...
## $ Density : num 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Sulphates : num -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
## $ Alcohol : num 9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : int 2 3 3 1 2 NA NA 3 NA 4 ...
## NULL
print(summary(wine_raw))
## TARGET FixedAcidity VolatileAcidity CitricAcid
## Min. :0.000 Min. :-18.100 Min. :-2.7900 Min. :-3.2400
## 1st Qu.:2.000 1st Qu.: 5.200 1st Qu.: 0.1300 1st Qu.: 0.0300
## Median :3.000 Median : 6.900 Median : 0.2800 Median : 0.3100
## Mean :3.029 Mean : 7.076 Mean : 0.3241 Mean : 0.3084
## 3rd Qu.:4.000 3rd Qu.: 9.500 3rd Qu.: 0.6400 3rd Qu.: 0.5800
## Max. :8.000 Max. : 34.400 Max. : 3.6800 Max. : 3.8600
##
## ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide
## Min. :-127.800 Min. :-1.1710 Min. :-555.00 Min. :-823.0
## 1st Qu.: -2.000 1st Qu.:-0.0310 1st Qu.: 0.00 1st Qu.: 27.0
## Median : 3.900 Median : 0.0460 Median : 30.00 Median : 123.0
## Mean : 5.419 Mean : 0.0548 Mean : 30.85 Mean : 120.7
## 3rd Qu.: 15.900 3rd Qu.: 0.1530 3rd Qu.: 70.00 3rd Qu.: 208.0
## Max. : 141.150 Max. : 1.3510 Max. : 623.00 Max. :1057.0
## NA's :616 NA's :638 NA's :647 NA's :682
## Density pH Sulphates Alcohol
## Min. :0.8881 Min. :0.480 Min. :-3.1300 Min. :-4.70
## 1st Qu.:0.9877 1st Qu.:2.960 1st Qu.: 0.2800 1st Qu.: 9.00
## Median :0.9945 Median :3.200 Median : 0.5000 Median :10.40
## Mean :0.9942 Mean :3.208 Mean : 0.5271 Mean :10.49
## 3rd Qu.:1.0005 3rd Qu.:3.470 3rd Qu.: 0.8600 3rd Qu.:12.40
## Max. :1.0992 Max. :6.130 Max. : 4.2400 Max. :26.50
## NA's :395 NA's :1210 NA's :653
## LabelAppeal AcidIndex STARS
## Min. :-2.000000 Min. : 4.000 Min. :1.000
## 1st Qu.:-1.000000 1st Qu.: 7.000 1st Qu.:1.000
## Median : 0.000000 Median : 8.000 Median :2.000
## Mean :-0.009066 Mean : 7.773 Mean :2.042
## 3rd Qu.: 1.000000 3rd Qu.: 8.000 3rd Qu.:3.000
## Max. : 2.000000 Max. :17.000 Max. :4.000
## NA's :3359
We are now checking if there is any blank or NA data
### Blank data
train_missing_df <- data.frame(apply(wine_raw, 2, function(x) length(which(x == ''))))
print(train_missing_df)
## apply.wine_raw..2..function.x..length.which.x.........
## TARGET 0
## FixedAcidity 0
## VolatileAcidity 0
## CitricAcid 0
## ResidualSugar 0
## Chlorides 0
## FreeSulfurDioxide 0
## TotalSulfurDioxide 0
## Density 0
## pH 0
## Sulphates 0
## Alcohol 0
## LabelAppeal 0
## AcidIndex 0
## STARS 0
### NA data
train_na_df1 <- data.frame(apply(wine_raw, 2, function(x) length(which(is.na(x)))))
print(train_na_df1)
## apply.wine_raw..2..function.x..length.which.is.na.x....
## TARGET 0
## FixedAcidity 0
## VolatileAcidity 0
## CitricAcid 0
## ResidualSugar 616
## Chlorides 638
## FreeSulfurDioxide 647
## TotalSulfurDioxide 682
## Density 0
## pH 395
## Sulphates 1210
## Alcohol 653
## LabelAppeal 0
## AcidIndex 0
## STARS 3359
So there is no data with blank column data but there is data with missing or NA column data. We will work on imputing this missing / NA data, but let us first explore each column before that.
print(unique(wine_raw$TARGET))
## [1] 3 5 4 0 6 7 2 1 8
table(wine_raw$TARGET)
##
## 0 1 2 3 4 5 6 7 8
## 2734 244 1091 2611 3177 2014 765 142 17
barplot(table(wine_raw$TARGET))
class(wine_raw$TARGET)
## [1] "integer"
summary(wine_raw$TARGET)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 3.000 3.029 4.000 8.000
summary(wine_raw$FixedAcidity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -18.100 5.200 6.900 7.076 9.500 34.400
boxplot(wine_raw$FixedAcidity)
ggplot(wine_raw, aes(FixedAcidity)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(wine_raw$VolatileAcidity)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.7900 0.1300 0.2800 0.3241 0.6400 3.6800
boxplot(wine_raw$VolatileAcidity)
ggplot(wine_raw, aes(VolatileAcidity)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(wine_raw$CitricAcid)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.2400 0.0300 0.3100 0.3084 0.5800 3.8600
boxplot(wine_raw$CitricAcid)
ggplot(wine_raw, aes(CitricAcid)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(wine_raw$ResidualSugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -127.800 -2.000 3.900 5.419 15.900 141.150 616
boxplot(wine_raw$ResidualSugar)
ggplot(wine_raw, aes(ResidualSugar)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 616 rows containing non-finite values (stat_bin).
summary(wine_raw$Chlorides)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.1710 -0.0310 0.0460 0.0548 0.1530 1.3510 638
boxplot(wine_raw$Chlorides)
ggplot(wine_raw, aes(Chlorides)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 638 rows containing non-finite values (stat_bin).
summary(wine_raw$FreeSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -555.00 0.00 30.00 30.85 70.00 623.00 647
boxplot(wine_raw$FreeSulfurDioxide)
ggplot(wine_raw, aes(FreeSulfurDioxide)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 647 rows containing non-finite values (stat_bin).
summary(wine_raw$TotalSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -823.0 27.0 123.0 120.7 208.0 1057.0 682
boxplot(wine_raw$TotalSulfurDioxide)
ggplot(wine_raw, aes(TotalSulfurDioxide)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 682 rows containing non-finite values (stat_bin).
summary(wine_raw$Density)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.8881 0.9877 0.9945 0.9942 1.0005 1.0992
boxplot(wine_raw$Density)
ggplot(wine_raw, aes(Density)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(wine_raw$pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.480 2.960 3.200 3.208 3.470 6.130 395
boxplot(wine_raw$pH)
ggplot(wine_raw, aes(pH)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 395 rows containing non-finite values (stat_bin).
summary(wine_raw$Sulphates)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.1300 0.2800 0.5000 0.5271 0.8600 4.2400 1210
boxplot(wine_raw$Sulphates)
ggplot(wine_raw, aes(Sulphates)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 1210 rows containing non-finite values (stat_bin).
summary(wine_raw$Alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4.70 9.00 10.40 10.49 12.40 26.50 653
boxplot(wine_raw$Alcohol)
ggplot(wine_raw, aes(Alcohol)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 653 rows containing non-finite values (stat_bin).
summary(wine_raw$LabelAppeal)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.000000 -1.000000 0.000000 -0.009066 1.000000 2.000000
unique(wine_raw$LabelAppeal)
## [1] 0 -1 1 2 -2
boxplot(wine_raw$LabelAppeal)
ggplot(wine_raw, aes(LabelAppeal)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(wine_raw$AcidIndex)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 4.000 7.000 8.000 7.773 8.000 17.000
unique(wine_raw$AcidIndex)
## [1] 8 7 6 9 11 5 10 14 13 12 16 15 17 4
boxplot(wine_raw$AcidIndex)
ggplot(wine_raw, aes(AcidIndex)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
summary(wine_raw$STARS)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.042 3.000 4.000 3359
unique(wine_raw$STARS)
## [1] 2 3 1 NA 4
boxplot(wine_raw$STARS)
ggplot(wine_raw, aes(STARS)) + geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## Warning: Removed 3359 rows containing non-finite values (stat_bin).
### Imputing missing values
summary(wine_raw$ResidualSugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -127.800 -2.000 3.900 5.419 15.900 141.150 616
summary(wine_raw$Chlorides)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.1710 -0.0310 0.0460 0.0548 0.1530 1.3510 638
summary(wine_raw$FreeSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -555.00 0.00 30.00 30.85 70.00 623.00 647
summary(wine_raw$TotalSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -823.0 27.0 123.0 120.7 208.0 1057.0 682
summary(wine_raw$pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.480 2.960 3.200 3.208 3.470 6.130 395
summary(wine_raw$Sulphates)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.1300 0.2800 0.5000 0.5271 0.8600 4.2400 1210
summary(wine_raw$Alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4.70 9.00 10.40 10.49 12.40 26.50 653
summary(wine_raw$STARS)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.042 3.000 4.000 3359
As we see above the column STARS has a very big number of data missing, so we won’t impute this column. Rather we will remove this column first for our impute efforts and then will put it back to the imputed dataset.
wine_raw3 <- subset(wine_raw, select = -c(STARS))
aggr_plot3 <- aggr(wine_raw3, col=c('navyblue','red'),
numbers=TRUE, sortVars=TRUE,
labels=names(wine_raw3), cex.axis=.7,
gap=3, ylab=c("Histogram of missing data","Pattern"))
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)
##
## Variables sorted by number of missings:
## Variable Count
## Sulphates 0.09456819
## TotalSulfurDioxide 0.05330207
## Alcohol 0.05103556
## FreeSulfurDioxide 0.05056663
## Chlorides 0.04986323
## ResidualSugar 0.04814381
## pH 0.03087143
## TARGET 0.00000000
## FixedAcidity 0.00000000
## VolatileAcidity 0.00000000
## CitricAcid 0.00000000
## Density 0.00000000
## LabelAppeal 0.00000000
## AcidIndex 0.00000000
wine_raw_imputed <- mice(data = wine_raw3, m = 1,
method = "pmm", maxit = 5, seed = 500)
##
## iter imp variable
## 1 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol
## 2 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol
## 3 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol
## 4 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol
## 5 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol
wine_raw_imputed_df <- mice::complete(wine_raw_imputed, 1)
wine_raw_imputed_df_bk <- wine_raw_imputed_df
wine_raw_imputed_df <- cbind(wine_raw_imputed_df, wine_raw[, "STARS"])
colnames(wine_raw_imputed_df)[colnames(wine_raw_imputed_df) == 'wine_raw[, "STARS"]'] <-
"STARS"
We have imputed all fields except STARS. Let’s compare the raw dataset columns having NA and the same columns in the imputed dataset.
summary(wine_raw$ResidualSugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -127.800 -2.000 3.900 5.419 15.900 141.150 616
summary(wine_raw_imputed_df$ResidualSugar)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -127.800 -2.300 3.900 5.444 16.000 141.150
summary(wine_raw$Chlorides)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -1.1710 -0.0310 0.0460 0.0548 0.1530 1.3510 638
summary(wine_raw_imputed_df$Chlorides)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.17100 -0.02900 0.04600 0.05584 0.15600 1.35100
summary(wine_raw$FreeSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -555.00 0.00 30.00 30.85 70.00 623.00 647
summary(wine_raw_imputed_df$FreeSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -555.00 0.00 30.00 30.69 70.00 623.00
summary(wine_raw$TotalSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -823.0 27.0 123.0 120.7 208.0 1057.0 682
summary(wine_raw_imputed_df$TotalSulfurDioxide)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -823.0 27.0 124.0 121.2 209.0 1057.0
summary(wine_raw$pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 0.480 2.960 3.200 3.208 3.470 6.130 395
summary(wine_raw_imputed_df$pH)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.480 2.960 3.200 3.208 3.470 6.130
summary(wine_raw$Sulphates)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -3.1300 0.2800 0.5000 0.5271 0.8600 4.2400 1210
summary(wine_raw_imputed_df$Sulphates)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.1300 0.2800 0.5000 0.5278 0.8600 4.2400
summary(wine_raw$Alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4.70 9.00 10.40 10.49 12.40 26.50 653
summary(wine_raw_imputed_df$Alcohol)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -4.7 9.0 10.4 10.5 12.4 26.5
summary(wine_raw$STARS)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.042 3.000 4.000 3359
summary(wine_raw_imputed_df$STARS)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 1.000 1.000 2.000 2.042 3.000 4.000 3359
Let’s build the correlation plot.
corrMatrix <- round(cor(wine_raw_imputed_df_bk),4)
corrMatrix %>% corrplot(., method = "color", outline = T,
addgrid.col = "darkgray", order="hclust",
addrect = 4, rect.col = "black",
rect.lwd = 5,cl.pos = "b", tl.col = "indianred4",
tl.cex = 1.0, cl.cex = 1.0, addCoef.col = "white",
number.digits = 2, number.cex = 0.8,
col = colorRampPalette(c("darkred","white","dodgerblue4"))(100))
n <- nrow(wine_raw_imputed_df)
set.seed(123)
wine_random_index <- wine_raw_imputed_df[sample(n), ]
wine.train.df <- wine_raw_imputed_df[1:as.integer(0.7*n),]
wine.test.df <- wine_raw_imputed_df[as.integer(0.7*n +1):n, ]
table(wine.test.df$TARGET) / nrow(wine.test.df)
##
## 0 1 2 3 4
## 0.2159416515 0.0187548841 0.0862203699 0.1997916124 0.2448554311
## 5 6 7 8
## 0.1604584527 0.0617348268 0.0114613181 0.0007814535
table(wine.train.df$TARGET) / nrow(wine.train.df)
##
## 0 1 2 3 4 5
## 0.212706565 0.019205002 0.084859312 0.205895489 0.249776686 0.156096472
## 6 7 8
## 0.058954891 0.010942385 0.001563198
poisson_model1 <- glm(formula = TARGET ~ . - STARS,
data = wine.train.df,
family = poisson)
summary(poisson_model1)
##
## Call:
## glm(formula = TARGET ~ . - STARS, family = poisson, data = wine.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5426 -0.2929 0.1064 0.3627 1.9301
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.993e+00 2.464e-01 8.087 6.10e-16 ***
## FixedAcidity 4.580e-04 1.040e-03 0.440 0.6597
## VolatileAcidity -3.367e-02 8.192e-03 -4.110 3.95e-05 ***
## CitricAcid 1.934e-03 7.384e-03 0.262 0.7934
## ResidualSugar 1.951e-05 1.909e-04 0.102 0.9186
## Chlorides -1.083e-02 2.031e-02 -0.533 0.5940
## FreeSulfurDioxide 8.108e-05 4.341e-05 1.868 0.0618 .
## TotalSulfurDioxide 2.534e-05 2.798e-05 0.905 0.3652
## Density -3.144e-01 2.425e-01 -1.297 0.1948
## pH -1.978e-03 9.628e-03 -0.205 0.8372
## Sulphates -9.199e-03 6.878e-03 -1.338 0.1810
## Alcohol 7.056e-03 1.747e-03 4.039 5.36e-05 ***
## LabelAppeal 2.466e-01 7.340e-03 33.605 < 2e-16 ***
## AcidIndex -6.289e-02 5.778e-03 -10.885 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6186.4 on 6640 degrees of freedom
## Residual deviance: 4870.8 on 6627 degrees of freedom
## (2315 observations deleted due to missingness)
## AIC: 24583
##
## Number of Fisher Scoring iterations: 5
length(predict(poisson_model1, newdata = wine.test.df))
## [1] 3839
predict_model1_test <- predict(poisson_model1, newdata = wine.test.df,
type = "response") %>% round(0)
table(Predicted = predict_model1_test, Actual = wine.test.df$TARGET)
## Actual
## Predicted 0 1 2 3 4 5 6 7 8
## 1 1 0 0 0 0 0 0 0 0
## 2 97 45 66 49 10 2 0 0 0
## 3 371 27 231 480 329 101 18 1 0
## 4 243 0 33 231 479 290 70 7 0
## 5 96 0 1 7 118 196 103 15 1
## 6 21 0 0 0 4 26 43 20 2
## 7 0 0 0 0 0 1 3 1 0
Removing the statistically insignificant variables
poisson_model2 <- glm(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
AcidIndex + FreeSulfurDioxide,
data = wine.train.df,
family = poisson)
summary(poisson_model2)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
## AcidIndex + FreeSulfurDioxide, family = poisson, data = wine.train.df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8451 -0.4753 0.2246 0.6321 2.9385
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.050e+00 4.418e-02 46.409 < 2e-16 ***
## VolatileAcidity -6.404e-02 7.726e-03 -8.290 < 2e-16 ***
## Alcohol 8.891e-03 1.641e-03 5.418 6.02e-08 ***
## LabelAppeal 2.582e-01 6.835e-03 37.772 < 2e-16 ***
## AcidIndex -1.359e-01 5.181e-03 -26.228 < 2e-16 ***
## FreeSulfurDioxide 1.477e-04 4.095e-05 3.606 0.000311 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 15931 on 8955 degrees of freedom
## Residual deviance: 13620 on 8950 degrees of freedom
## AIC: 36010
##
## Number of Fisher Scoring iterations: 5
predict_model2_test <- predict(poisson_model2, newdata = wine.test.df,
type = "response") %>% round(0)
unique(wine.test.df$TARGET)
## [1] 3 2 0 4 5 6 7 1 8
table(Predicted = predict_model2_test, Actual = wine.test.df$TARGET)
## Actual
## Predicted 0 1 2 3 4 5 6 7 8
## 1 40 7 4 10 2 2 0 0 0
## 2 295 54 214 278 115 41 4 0 0
## 3 323 11 103 411 560 239 56 6 0
## 4 125 0 10 66 239 268 104 12 1
## 5 37 0 0 2 20 54 60 16 1
## 6 9 0 0 0 4 10 10 9 1
## 7 0 0 0 0 0 2 3 0 0
## 8 0 0 0 0 0 0 0 1 0
nb_model3 <- glm.nb(formula = TARGET ~ . - STARS,
data = wine.train.df)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(nb_model3)
##
## Call:
## glm.nb(formula = TARGET ~ . - STARS, data = wine.train.df, init.theta = 130137.6579,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.5425 -0.2929 0.1064 0.3627 1.9301
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.993e+00 2.464e-01 8.087 6.10e-16 ***
## FixedAcidity 4.580e-04 1.040e-03 0.440 0.6597
## VolatileAcidity -3.367e-02 8.192e-03 -4.110 3.95e-05 ***
## CitricAcid 1.934e-03 7.384e-03 0.262 0.7934
## ResidualSugar 1.951e-05 1.909e-04 0.102 0.9186
## Chlorides -1.083e-02 2.031e-02 -0.533 0.5940
## FreeSulfurDioxide 8.108e-05 4.341e-05 1.868 0.0618 .
## TotalSulfurDioxide 2.534e-05 2.798e-05 0.905 0.3652
## Density -3.144e-01 2.425e-01 -1.297 0.1948
## pH -1.978e-03 9.629e-03 -0.205 0.8372
## Sulphates -9.199e-03 6.878e-03 -1.338 0.1810
## Alcohol 7.056e-03 1.747e-03 4.039 5.36e-05 ***
## LabelAppeal 2.466e-01 7.340e-03 33.604 < 2e-16 ***
## AcidIndex -6.290e-02 5.778e-03 -10.885 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(130137.7) family taken to be 1)
##
## Null deviance: 6186.3 on 6640 degrees of freedom
## Residual deviance: 4870.7 on 6627 degrees of freedom
## (2315 observations deleted due to missingness)
## AIC: 24585
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 130138
## Std. Err.: 225828
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -24555.19
predict_model3_test <- predict(nb_model3, newdata = wine.test.df,
type = "response") %>% round(0)
unique(wine.test.df$TARGET)
## [1] 3 2 0 4 5 6 7 1 8
table(Predicted = predict_model3_test, Actual = wine.test.df$TARGET)
## Actual
## Predicted 0 1 2 3 4 5 6 7 8
## 1 1 0 0 0 0 0 0 0 0
## 2 97 45 66 49 10 2 0 0 0
## 3 371 27 231 480 329 101 18 1 0
## 4 243 0 33 231 479 290 70 7 0
## 5 96 0 1 7 118 196 103 15 1
## 6 21 0 0 0 4 26 43 20 2
## 7 0 0 0 0 0 1 3 1 0
Keeping only statistically relevant columns
nb_model4 <- glm.nb(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
AcidIndex + FreeSulfurDioxide,
data = wine.train.df)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(nb_model4)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
## AcidIndex + FreeSulfurDioxide, data = wine.train.df, init.theta = 21776.5604,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.8448 -0.4752 0.2246 0.6320 2.9383
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.050e+00 4.418e-02 46.406 < 2e-16 ***
## VolatileAcidity -6.404e-02 7.726e-03 -8.289 < 2e-16 ***
## Alcohol 8.891e-03 1.641e-03 5.418 6.03e-08 ***
## LabelAppeal 2.582e-01 6.836e-03 37.769 < 2e-16 ***
## AcidIndex -1.359e-01 5.181e-03 -26.226 < 2e-16 ***
## FreeSulfurDioxide 1.477e-04 4.095e-05 3.606 0.000311 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(21776.56) family taken to be 1)
##
## Null deviance: 15929 on 8955 degrees of freedom
## Residual deviance: 13618 on 8950 degrees of freedom
## AIC: 36012
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 21777
## Std. Err.: 63257
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -35997.77
predict_model4_test <- predict(nb_model4, newdata = wine.test.df,
type = "response") %>% round(0)
table(Predicted = predict_model4_test, Actual = wine.test.df$TARGET)
## Actual
## Predicted 0 1 2 3 4 5 6 7 8
## 1 40 7 4 10 2 2 0 0 0
## 2 295 54 214 278 115 41 4 0 0
## 3 323 11 103 411 560 239 56 6 0
## 4 125 0 10 66 239 268 104 12 1
## 5 37 0 0 2 20 54 60 16 1
## 6 9 0 0 0 4 10 10 9 1
## 7 0 0 0 0 0 2 3 0 0
## 8 0 0 0 0 0 0 0 1 0
zero_infl_model5 <- zeroinfl(formula = TARGET ~ VolatileAcidity + Alcohol +
LabelAppeal + AcidIndex + FreeSulfurDioxide,
data = wine.train.df)
summary(zero_infl_model5)
##
## Call:
## zeroinfl(formula = TARGET ~ VolatileAcidity + Alcohol + LabelAppeal +
## AcidIndex + FreeSulfurDioxide, data = wine.train.df)
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.1865 -0.3292 0.1986 0.4740 4.8234
##
## Count model coefficients (poisson with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.353e+00 4.904e-02 27.588 < 2e-16 ***
## VolatileAcidity -1.541e-02 8.109e-03 -1.901 0.05736 .
## Alcohol 9.033e-03 1.720e-03 5.250 1.52e-07 ***
## LabelAppeal 2.821e-01 7.141e-03 39.507 < 2e-16 ***
## AcidIndex -2.020e-02 5.822e-03 -3.470 0.00052 ***
## FreeSulfurDioxide 3.304e-05 4.209e-05 0.785 0.43237
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.5461860 0.2111140 -26.271 < 2e-16 ***
## VolatileAcidity 0.2763853 0.0402520 6.866 6.59e-12 ***
## Alcohol -0.0005266 0.0084825 -0.062 0.950496
## LabelAppeal 0.1559517 0.0363520 4.290 1.79e-05 ***
## AcidIndex 0.4956530 0.0224337 22.094 < 2e-16 ***
## FreeSulfurDioxide -0.0007390 0.0002110 -3.502 0.000462 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Number of iterations in BFGS optimization: 16
## Log-likelihood: -1.6e+04 on 12 Df
predict_model5_test_prob <- predict(zero_infl_model5, newdata = wine.test.df,
type = "prob")
predict_model5_test_final <- apply(predict_model5_test_prob,1,which.max)
predict_model5_test_final <- predict_model5_test_final - 1
table(predict_model5_test_final)
## predict_model5_test_final
## 0 1 2 3 4 5 6 7
## 1899 9 617 887 277 120 28 2
table(Predicted = predict_model5_test_final, Actual = wine.test.df$TARGET)
## Actual
## Predicted 0 1 2 3 4 5 6 7 8
## 0 536 32 126 320 418 308 136 22 1
## 1 1 4 4 0 0 0 0 0 0
## 2 98 36 168 220 79 15 1 0 0
## 3 125 0 32 220 335 152 23 0 0
## 4 46 0 0 5 92 90 40 4 0
## 5 18 0 1 2 13 45 31 10 0
## 6 5 0 0 0 3 6 5 7 2
## 7 0 0 0 0 0 0 1 1 0
### Multiple Linear Regression 1
linear_model6 <- lm(formula = TARGET ~ . - STARS,
data = wine.train.df)
summary(linear_model6)
##
## Call:
## lm(formula = TARGET ~ . - STARS, data = wine.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7094 -0.5646 0.2050 0.6891 3.7765
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.124e+00 6.184e-01 9.903 < 2e-16 ***
## FixedAcidity 1.725e-03 2.615e-03 0.660 0.50953
## VolatileAcidity -1.221e-01 2.057e-02 -5.936 3.07e-09 ***
## CitricAcid 7.018e-03 1.867e-02 0.376 0.70702
## ResidualSugar 2.141e-04 4.793e-04 0.447 0.65503
## Chlorides -4.094e-02 5.074e-02 -0.807 0.41973
## FreeSulfurDioxide 2.935e-04 1.088e-04 2.697 0.00701 **
## TotalSulfurDioxide 9.876e-05 6.993e-05 1.412 0.15795
## Density -1.088e+00 6.088e-01 -1.787 0.07397 .
## pH -2.964e-03 2.412e-02 -0.123 0.90218
## Sulphates -3.026e-02 1.731e-02 -1.748 0.08052 .
## Alcohol 2.625e-02 4.354e-03 6.028 1.75e-09 ***
## LabelAppeal 9.008e-01 1.837e-02 49.049 < 2e-16 ***
## AcidIndex -2.176e-01 1.375e-02 -15.822 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.313 on 6627 degrees of freedom
## (2315 observations deleted due to missingness)
## Multiple R-squared: 0.2957, Adjusted R-squared: 0.2943
## F-statistic: 214 on 13 and 6627 DF, p-value: < 2.2e-16
### Multiple Linear Regression 2
linear_model7 <- lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
TotalSulfurDioxide + Density + Sulphates + Alcohol +
LabelAppeal + AcidIndex,
data = wine.train.df)
summary(linear_model7)
##
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide +
## Density + Sulphates + Alcohol + LabelAppeal + AcidIndex,
## data = wine.train.df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.9198 -0.6702 0.3975 1.1269 5.0180
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.4095052 0.6841677 10.830 < 2e-16 ***
## VolatileAcidity -0.1876416 0.0230711 -8.133 4.74e-16 ***
## FreeSulfurDioxide 0.0004566 0.0001218 3.750 0.000178 ***
## TotalSulfurDioxide 0.0003207 0.0000779 4.117 3.88e-05 ***
## Density -1.8531822 0.6815777 -2.719 0.006561 **
## Sulphates -0.0413100 0.0192825 -2.142 0.032192 *
## Alcohol 0.0280679 0.0048590 5.776 7.88e-09 ***
## LabelAppeal 0.7825607 0.0202709 38.605 < 2e-16 ***
## AcidIndex -0.3580132 0.0136153 -26.295 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.711 on 8947 degrees of freedom
## Multiple R-squared: 0.2088, Adjusted R-squared: 0.2081
## F-statistic: 295.1 on 8 and 8947 DF, p-value: < 2.2e-16
predict_model7_test <- predict(linear_model7, newdata = wine.test.df)
table(round(predict_model7_test,0))
##
## -1 0 1 2 3 4 5 6
## 1 14 133 814 1674 1043 157 3
Based on the above analysis, we have decided to go with model5 which is zero Inflated Poisson model.