Abstract
The purpose of this assignment is to explore, analyze, and model a dataset containing approximately 12,000 commercially available wines. The variables are mostly related to the chemical properties of the wine being sold. The response variable (TARGET) is the number of sample cases of wine that were purchased by wine distribution companies after sampling a bottle of wine. These cases would be used to provide tasting samples to restaurants and wine stores around the United States. The more sample cases purchased, the more likely a wine is to be sold at a high-end restaurant. A large wine manufacturer wants to study the data to predict the number of wine cases that would be ordered based on the wine characteristics.
If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust their wine offering to maximize sales. Our objective is to build a count regression model to predict the number of cases of wine that will be sold given the properties of the wine. Sometimes, the fact that a variable is missing is itself predictive of the target. For building the various models, we will use only the variables that are part of the dataset (or variables derived from them).
Keywords: wine, data621
Data Exploration
knitr::opts_chunk$set(echo = TRUE)
library(e1071)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.4
library(VIF)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.4.4
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.4.4
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
##
## src, summarize
## The following object is masked from 'package:e1071':
##
## impute
## The following objects are masked from 'package:base':
##
## format.pval, units
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(binr)
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(AER)
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
##
## Attaching package: 'car'
## The following object is masked from 'package:VIF':
##
## vif
## The following object is masked from 'package:purrr':
##
## some
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.4.4
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
## Loading required package: sandwich
# Load the wine training set from the project's data directory and report
# its dimensions (rows, columns).
train <- read.csv(file = "data/wine-training-data.csv")
dim(train)
## [1] 12795 16
# Column-wise summary statistics of the raw training data, rendered as a
# styled table.
kable_styling(kable(summary(train)))
|
INDEX </th>
|
TARGET </th>
|
FixedAcidity
|
VolatileAcidity
|
CitricAcid
|
ResidualSugar
|
Chlorides
|
FreeSulfurDioxide
|
TotalSulfurDioxide
|
Density </th>
|
pH </th>
|
Sulphates
|
Alcohol </th>
|
LabelAppeal
|
AcidIndex
|
STARS </th>
|
|
Min. : 1
|
Min. :0.000
|
Min. :-18.100
|
Min. :-2.7900
|
Min. :-3.2400
|
Min. :-127.800
|
Min. :-1.1710
|
Min. :-555.00
|
Min. :-823.0
|
Min. :0.8881
|
Min. :0.480
|
Min. :-3.1300
|
Min. :-4.70
|
Min. :-2.000000
|
Min. : 4.000
|
Min. :1.000
|
|
1st Qu.: 4038
|
1st Qu.:2.000
|
1st Qu.: 5.200
|
1st Qu.: 0.1300
|
1st Qu.: 0.0300
|
1st Qu.: -2.000
|
1st Qu.:-0.0310
|
1st Qu.: 0.00
|
1st Qu.: 27.0
|
1st Qu.:0.9877
|
1st Qu.:2.960
|
1st Qu.: 0.2800
|
1st Qu.: 9.00
|
1st Qu.:-1.000000
|
1st Qu.: 7.000
|
1st Qu.:1.000
|
|
Median : 8110
|
Median :3.000
|
Median : 6.900
|
Median : 0.2800
|
Median : 0.3100
|
Median : 3.900
|
Median : 0.0460
|
Median : 30.00
|
Median : 123.0
|
Median :0.9945
|
Median :3.200
|
Median : 0.5000
|
Median :10.40
|
Median : 0.000000
|
Median : 8.000
|
Median :2.000
|
|
Mean : 8070
|
Mean :3.029
|
Mean : 7.076
|
Mean : 0.3241
|
Mean : 0.3084
|
Mean : 5.419
|
Mean : 0.0548
|
Mean : 30.85
|
Mean : 120.7
|
Mean :0.9942
|
Mean :3.208
|
Mean : 0.5271
|
Mean :10.49
|
Mean :-0.009066
|
Mean : 7.773
|
Mean :2.042
|
|
3rd Qu.:12106
|
3rd Qu.:4.000
|
3rd Qu.: 9.500
|
3rd Qu.: 0.6400
|
3rd Qu.: 0.5800
|
3rd Qu.: 15.900
|
3rd Qu.: 0.1530
|
3rd Qu.: 70.00
|
3rd Qu.: 208.0
|
3rd Qu.:1.0005
|
3rd Qu.:3.470
|
3rd Qu.: 0.8600
|
3rd Qu.:12.40
|
3rd Qu.: 1.000000
|
3rd Qu.: 8.000
|
3rd Qu.:3.000
|
|
Max. :16129
|
Max. :8.000
|
Max. : 34.400
|
Max. : 3.6800
|
Max. : 3.8600
|
Max. : 141.150
|
Max. : 1.3510
|
Max. : 623.00
|
Max. :1057.0
|
Max. :1.0992
|
Max. :6.130
|
Max. : 4.2400
|
Max. :26.50
|
Max. : 2.000000
|
Max. :17.000
|
Max. :4.000
|
|
NA
|
NA
|
NA
|
NA
|
NA
|
NA’s :616
|
NA’s :638
|
NA’s :647
|
NA’s :682
|
NA
|
NA’s :395
|
NA’s :1210
|
NA’s :653
|
NA
|
NA
|
NA’s :3359
|
# Show each column's storage type and its first few values.
str(train)
## 'data.frame': 12795 obs. of 16 variables:
## $ INDEX : int 1 2 4 5 6 7 8 11 12 13 ...
## $ TARGET : int 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
## $ VolatileAcidity : num 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
## $ CitricAcid : num -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
## $ ResidualSugar : num 54.2 26.1 14.8 18.8 9.4 ...
## $ Chlorides : num -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
## $ FreeSulfurDioxide : num NA 15 214 22 -167 -37 287 523 -213 62 ...
## $ TotalSulfurDioxide: num 268 -327 142 115 108 15 156 551 NA 180 ...
## $ Density : num 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Sulphates : num -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
## $ Alcohol : num 9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : int 2 3 3 1 2 NA NA 3 NA 4 ...
# Count missing values per column. vapply() pins the result to exactly one
# integer per column, so its shape cannot silently change the way sapply()'s
# can.
vapply(train, function(column) sum(is.na(column)), integer(1)) %>%
  kable() %>%
  kable_styling()
|
x
|
INDEX
|
0
|
TARGET
|
0
|
FixedAcidity
|
0
|
VolatileAcidity
|
0
|
CitricAcid
|
0
|
ResidualSugar
|
616
|
Chlorides
|
638
|
FreeSulfurDioxide
|
647
|
TotalSulfurDioxide
|
682
|
Density
|
0
|
pH
|
395
|
Sulphates
|
1210
|
Alcohol
|
653
|
LabelAppeal
|
0
|
AcidIndex
|
0
|
STARS
|
3359
|
library(UpSetR)
##
## Attaching package: 'UpSetR'
## The following object is masked from 'package:lattice':
##
## histogram
library(naniar)
## Warning: package 'naniar' was built under R version 3.4.4
# Plot the missingness of each variable, broken down by the value of TARGET.
gg_miss_fct(x = train, fct = TARGET)

# UpSet plot of the missingness indicators of train, showing which variables
# are missing together (nsets = 24).
upset(as_shadow_upset(train), nsets = 24)

# Density plot of every numeric column, one free-scaled panel per variable.
# select_if() already restricts ntrain to numeric columns, so no second type
# filter is needed before reshaping.
ntrain <- select_if(train, is.numeric)
ntrain %>%
  gather() %>% # Reshape to long key/value pairs
  ggplot(aes(value)) + # Plot the values
  facet_wrap(~key, scales = "free") + # One panel per variable
  geom_density()
## Warning: Removed 8200 rows containing non-finite values (stat_density).

# Build a table of summary statistics for the numeric columns of a data frame.
#
# Args:
#   df: a data frame; non-numeric columns are ignored.
#
# Returns:
#   The psych::describe() result for the numeric columns, reduced to the
#   columns n, unique_values, min, Q.1st, median, mean, Q.3rd, max, range,
#   sd, skew and kurtosis.
summary_metrics <- function(df) {
  is_metric <- vapply(df, is.numeric, logical(1))
  # drop = FALSE keeps a data frame even when exactly one column is numeric;
  # without it the selection collapses to a bare vector and the per-column
  # step below fails.
  metrics_only <- df[, is_metric, drop = FALSE]
  df_metrics <- psych::describe(metrics_only, quant = c(.25, .75))
  # Number of distinct values per column (NA counts as its own value).
  df_metrics$unique_values <- vapply(
    metrics_only,
    function(column) length(unique(column)),
    integer(1)
  )
  dplyr::select(
    df_metrics,
    n, unique_values, min,
    Q.1st = Q0.25, median, mean, Q.3rd = Q0.75,
    max, range, sd, skew, kurtosis
  )
}
# Summary statistics for every numeric variable in the training set.
metrics_df <- summary_metrics(train)

# Long-format copy of the variables with fewer than 15 distinct values,
# with TARGET kept as the id column.
boxplot_data <- reshape2::melt(
  dplyr::select(train, rownames(metrics_df)[metrics_df$unique_values < 15]),
  id.vars = "TARGET"
)

# One panel per variable: distribution of TARGET at each level of it.
ggplot(boxplot_data, aes(x = factor(value), y = TARGET)) +
  geom_boxplot() +
  facet_wrap(~variable, scales = "free") +
  coord_flip() +
  ggthemes::theme_fivethirtyeight()

# Complete-case correlations: keep only rows without any NA, leave out the
# INDEX column, then compute pairwise correlations with their p-values.
trainc <- train[complete.cases(train), colnames(train) != "INDEX"]
rcorr(as.matrix(trainc))
## TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET 1.00 -0.01 -0.08 0.00
## FixedAcidity -0.01 1.00 0.02 0.01
## VolatileAcidity -0.08 0.02 1.00 -0.02
## CitricAcid 0.00 0.01 -0.02 1.00
## ResidualSugar 0.00 -0.02 0.00 -0.01
## Chlorides -0.03 -0.01 0.01 -0.03
## FreeSulfurDioxide 0.02 0.02 -0.01 0.01
## TotalSulfurDioxide 0.02 -0.02 0.00 -0.01
## Density -0.05 0.01 0.01 -0.02
## pH 0.00 0.00 0.01 0.00
## Sulphates -0.02 0.04 0.00 -0.01
## Alcohol 0.07 -0.01 0.00 0.02
## LabelAppeal 0.50 0.01 -0.02 0.02
## AcidIndex -0.17 0.15 0.03 0.05
## STARS 0.55 0.00 -0.04 0.01
## ResidualSugar Chlorides FreeSulfurDioxide
## TARGET 0.00 -0.03 0.02
## FixedAcidity -0.02 -0.01 0.02
## VolatileAcidity 0.00 0.01 -0.01
## CitricAcid -0.01 -0.03 0.01
## ResidualSugar 1.00 0.00 0.02
## Chlorides 0.00 1.00 -0.02
## FreeSulfurDioxide 0.02 -0.02 1.00
## TotalSulfurDioxide 0.02 0.00 0.01
## Density -0.01 0.02 -0.01
## pH 0.02 -0.02 0.00
## Sulphates 0.00 0.00 0.03
## Alcohol -0.02 -0.02 -0.02
## LabelAppeal 0.00 -0.01 0.01
## AcidIndex -0.02 0.00 -0.01
## STARS 0.02 -0.01 -0.02
## TotalSulfurDioxide Density pH Sulphates Alcohol
## TARGET 0.02 -0.05 0.00 -0.02 0.07
## FixedAcidity -0.02 0.01 0.00 0.04 -0.01
## VolatileAcidity 0.00 0.01 0.01 0.00 0.00
## CitricAcid -0.01 -0.02 0.00 -0.01 0.02
## ResidualSugar 0.02 -0.01 0.02 0.00 -0.02
## Chlorides 0.00 0.02 -0.02 0.00 -0.02
## FreeSulfurDioxide 0.01 -0.01 0.00 0.03 -0.02
## TotalSulfurDioxide 1.00 0.02 0.00 0.00 -0.02
## Density 0.02 1.00 0.00 -0.01 -0.01
## pH 0.00 0.00 1.00 0.01 -0.01
## Sulphates 0.00 -0.01 0.01 1.00 0.01
## Alcohol -0.02 -0.01 -0.01 0.01 1.00
## LabelAppeal 0.00 -0.02 0.00 0.00 0.00
## AcidIndex -0.02 0.05 -0.05 0.03 -0.06
## STARS 0.02 -0.03 0.00 -0.02 0.06
## LabelAppeal AcidIndex STARS
## TARGET 0.50 -0.17 0.55
## FixedAcidity 0.01 0.15 0.00
## VolatileAcidity -0.02 0.03 -0.04
## CitricAcid 0.02 0.05 0.01
## ResidualSugar 0.00 -0.02 0.02
## Chlorides -0.01 0.00 -0.01
## FreeSulfurDioxide 0.01 -0.01 -0.02
## TotalSulfurDioxide 0.00 -0.02 0.02
## Density -0.02 0.05 -0.03
## pH 0.00 -0.05 0.00
## Sulphates 0.00 0.03 -0.02
## Alcohol 0.00 -0.06 0.06
## LabelAppeal 1.00 0.01 0.32
## AcidIndex 0.01 1.00 -0.10
## STARS 0.32 -0.10 1.00
##
## n= 6436
##
##
## P
## TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET 0.3146 0.0000 0.8508
## FixedAcidity 0.3146 0.1273 0.2614
## VolatileAcidity 0.0000 0.1273 0.0602
## CitricAcid 0.8508 0.2614 0.0602
## ResidualSugar 0.7777 0.2158 0.9025 0.4298
## Chlorides 0.0146 0.6244 0.2336 0.0071
## FreeSulfurDioxide 0.0693 0.2156 0.3588 0.3312
## TotalSulfurDioxide 0.0831 0.0613 0.9525 0.4263
## Density 0.0001 0.3532 0.2934 0.1729
## pH 0.9859 0.7149 0.5634 0.9515
## Sulphates 0.0887 0.0007 0.9032 0.2473
## Alcohol 0.0000 0.2939 0.9833 0.1730
## LabelAppeal 0.0000 0.3615 0.1044 0.2188
## AcidIndex 0.0000 0.0000 0.0445 0.0000
## STARS 0.0000 0.6921 0.0012 0.5668
## ResidualSugar Chlorides FreeSulfurDioxide
## TARGET 0.7777 0.0146 0.0693
## FixedAcidity 0.2158 0.6244 0.2156
## VolatileAcidity 0.9025 0.2336 0.3588
## CitricAcid 0.4298 0.0071 0.3312
## ResidualSugar 0.7410 0.0781
## Chlorides 0.7410 0.1002
## FreeSulfurDioxide 0.0781 0.1002
## TotalSulfurDioxide 0.1719 0.9732 0.2802
## Density 0.5679 0.0973 0.4871
## pH 0.1589 0.1494 0.8720
## Sulphates 0.8282 0.8336 0.0314
## Alcohol 0.1286 0.0664 0.0555
## LabelAppeal 0.7134 0.6084 0.2301
## AcidIndex 0.1034 0.8907 0.2373
## STARS 0.1147 0.6120 0.2170
## TotalSulfurDioxide Density pH Sulphates Alcohol
## TARGET 0.0831 0.0001 0.9859 0.0887 0.0000
## FixedAcidity 0.0613 0.3532 0.7149 0.0007 0.2939
## VolatileAcidity 0.9525 0.2934 0.5634 0.9032 0.9833
## CitricAcid 0.4263 0.1729 0.9515 0.2473 0.1730
## ResidualSugar 0.1719 0.5679 0.1589 0.8282 0.1286
## Chlorides 0.9732 0.0973 0.1494 0.8336 0.0664
## FreeSulfurDioxide 0.2802 0.4871 0.8720 0.0314 0.0555
## TotalSulfurDioxide 0.0631 0.7837 0.8408 0.1765
## Density 0.0631 0.8713 0.3948 0.6230
## pH 0.7837 0.8713 0.4019 0.3276
## Sulphates 0.8408 0.3948 0.4019 0.3844
## Alcohol 0.1765 0.6230 0.3276 0.3844
## LabelAppeal 0.8271 0.1467 0.9860 0.7624 0.9587
## AcidIndex 0.0759 0.0001 0.0000 0.0127 0.0000
## STARS 0.0763 0.0223 0.7241 0.0635 0.0000
## LabelAppeal AcidIndex STARS
## TARGET 0.0000 0.0000 0.0000
## FixedAcidity 0.3615 0.0000 0.6921
## VolatileAcidity 0.1044 0.0445 0.0012
## CitricAcid 0.2188 0.0000 0.5668
## ResidualSugar 0.7134 0.1034 0.1147
## Chlorides 0.6084 0.8907 0.6120
## FreeSulfurDioxide 0.2301 0.2373 0.2170
## TotalSulfurDioxide 0.8271 0.0759 0.0763
## Density 0.1467 0.0001 0.0223
## pH 0.9860 0.0000 0.7241
## Sulphates 0.7624 0.0127 0.0635
## Alcohol 0.9587 0.0000 0.0000
## LabelAppeal 0.4087 0.0000
## AcidIndex 0.4087 0.0000
## STARS 0.0000 0.0000
# Draw the correlation matrix of trainc as a grid of squares.
corrplot(cor(trainc), method="square")

library(VIM)
## Loading required package: colorspace
##
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
##
## coords
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.4
##
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
##
## transpose
## The following objects are masked from 'package:dplyr':
##
## between, first, last
## VIM is ready to use.
## Since version 4.0.0 the GUI is in its own package VIMGUI.
##
## Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
##
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
##
## sleep
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)
# Aggregate missing values per variable and per missingness pattern, and draw
# the accompanying plot with variables sorted by their number of missings.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
missing_plot <- VIM::aggr(
  train,
  numbers = TRUE,
  sortVars = TRUE,
  col = c("lightgreen", "darkred", "orange"),
  labels = names(train),
  ylab = c("Missing Value Counts", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

##
## Variables sorted by number of missings:
## Variable Count
## STARS 0.26252442
## Sulphates 0.09456819
## TotalSulfurDioxide 0.05330207
## Alcohol 0.05103556
## FreeSulfurDioxide 0.05056663
## Chlorides 0.04986323
## ResidualSugar 0.04814381
## pH 0.03087143
## INDEX 0.00000000
## TARGET 0.00000000
## FixedAcidity 0.00000000
## VolatileAcidity 0.00000000
## CitricAcid 0.00000000
## Density 0.00000000
## LabelAppeal 0.00000000
## AcidIndex 0.00000000
# Tabulate missing counts per variable and per combination of variables.
summary(missing_plot)
##
## Missings per variable:
## Variable Count
## INDEX 0
## TARGET 0
## FixedAcidity 0
## VolatileAcidity 0
## CitricAcid 0
## ResidualSugar 616
## Chlorides 638
## FreeSulfurDioxide 647
## TotalSulfurDioxide 682
## Density 0
## pH 395
## Sulphates 1210
## Alcohol 653
## LabelAppeal 0
## AcidIndex 0
## STARS 3359
##
## Missings in combinations of variables:
## Combinations Count Percent
## 0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0 6436 50.300898789
## 0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:1 2239 17.499023056
## 0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:0 335 2.618210238
## 0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:1 123 0.961313013
## 0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0 669 5.228604924
## 0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:1 247 1.930441579
## 0:0:0:0:0:0:0:0:0:0:0:1:1:0:0:0 37 0.289175459
## 0:0:0:0:0:0:0:0:0:0:0:1:1:0:0:1 10 0.078155530
## 0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0 197 1.539663931
## 0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:1 81 0.633059789
## 0:0:0:0:0:0:0:0:0:0:1:0:1:0:0:0 14 0.109417741
## 0:0:0:0:0:0:0:0:0:0:1:0:1:0:0:1 5 0.039077765
## 0:0:0:0:0:0:0:0:0:0:1:1:0:0:0:0 22 0.171942165
## 0:0:0:0:0:0:0:0:0:0:1:1:0:0:0:1 11 0.085971082
## 0:0:0:0:0:0:0:0:1:0:0:0:0:0:0:0 341 2.665103556
## 0:0:0:0:0:0:0:0:1:0:0:0:0:0:0:1 123 0.961313013
## 0:0:0:0:0:0:0:0:1:0:0:0:1:0:0:0 21 0.164126612
## 0:0:0:0:0:0:0:0:1:0:0:0:1:0:0:1 8 0.062524424
## 0:0:0:0:0:0:0:0:1:0:0:1:0:0:0:0 42 0.328253224
## 0:0:0:0:0:0:0:0:1:0:0:1:0:0:0:1 16 0.125048847
## 0:0:0:0:0:0:0:0:1:0:0:1:1:0:0:0 3 0.023446659
## 0:0:0:0:0:0:0:0:1:0:0:1:1:0:0:1 1 0.007815553
## 0:0:0:0:0:0:0:0:1:0:1:0:0:0:0:0 13 0.101602188
## 0:0:0:0:0:0:0:0:1:0:1:0:0:0:0:1 6 0.046893318
## 0:0:0:0:0:0:0:0:1:0:1:0:1:0:0:0 1 0.007815553
## 0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:0 338 2.641656897
## 0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:1 124 0.969128566
## 0:0:0:0:0:0:0:1:0:0:0:0:1:0:0:0 15 0.117233294
## 0:0:0:0:0:0:0:1:0:0:0:0:1:0:0:1 7 0.054708871
## 0:0:0:0:0:0:0:1:0:0:0:1:0:0:0:0 26 0.203204377
## 0:0:0:0:0:0:0:1:0:0:0:1:0:0:0:1 14 0.109417741
## 0:0:0:0:0:0:0:1:0:0:0:1:1:0:0:0 2 0.015631106
## 0:0:0:0:0:0:0:1:0:0:0:1:1:0:0:1 2 0.015631106
## 0:0:0:0:0:0:0:1:0:0:1:0:0:0:0:0 10 0.078155530
## 0:0:0:0:0:0:0:1:0:0:1:0:0:0:0:1 4 0.031262212
## 0:0:0:0:0:0:0:1:0:0:1:1:0:0:0:0 2 0.015631106
## 0:0:0:0:0:0:0:1:1:0:0:0:0:0:0:0 19 0.148495506
## 0:0:0:0:0:0:0:1:1:0:0:0:0:0:0:1 16 0.125048847
## 0:0:0:0:0:0:0:1:1:0:0:0:1:0:0:1 1 0.007815553
## 0:0:0:0:0:0:0:1:1:0:0:1:0:0:0:0 2 0.015631106
## 0:0:0:0:0:0:0:1:1:0:0:1:0:0:0:1 2 0.015631106
## 0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:0 350 2.735443533
## 0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:1 113 0.883157483
## 0:0:0:0:0:0:1:0:0:0:0:0:1:0:0:0 17 0.132864400
## 0:0:0:0:0:0:1:0:0:0:0:0:1:0:0:1 7 0.054708871
## 0:0:0:0:0:0:1:0:0:0:0:1:0:0:0:0 22 0.171942165
## 0:0:0:0:0:0:1:0:0:0:0:1:0:0:0:1 14 0.109417741
## 0:0:0:0:0:0:1:0:0:0:1:0:0:0:0:0 5 0.039077765
## 0:0:0:0:0:0:1:0:0:0:1:0:0:0:0:1 3 0.023446659
## 0:0:0:0:0:0:1:0:0:0:1:0:1:0:0:0 2 0.015631106
## 0:0:0:0:0:0:1:0:1:0:0:0:0:0:0:0 20 0.156311059
## 0:0:0:0:0:0:1:0:1:0:0:0:0:0:0:1 5 0.039077765
## 0:0:0:0:0:0:1:0:1:0:0:0:1:0:0:0 1 0.007815553
## 0:0:0:0:0:0:1:0:1:0:0:0:1:0:0:1 1 0.007815553
## 0:0:0:0:0:0:1:0:1:0:0:1:0:0:0:0 2 0.015631106
## 0:0:0:0:0:0:1:1:0:0:0:0:0:0:0:0 13 0.101602188
## 0:0:0:0:0:0:1:1:0:0:0:0:0:0:0:1 9 0.070339977
## 0:0:0:0:0:0:1:1:0:0:0:0:1:0:0:0 1 0.007815553
## 0:0:0:0:0:0:1:1:0:0:0:0:1:0:0:1 2 0.015631106
## 0:0:0:0:0:0:1:1:0:0:0:1:0:0:0:0 2 0.015631106
## 0:0:0:0:0:0:1:1:0:0:0:1:0:0:0:1 1 0.007815553
## 0:0:0:0:0:0:1:1:0:0:1:0:0:0:0:0 1 0.007815553
## 0:0:0:0:0:0:1:1:1:0:0:0:0:0:0:0 3 0.023446659
## 0:0:0:0:0:1:0:0:0:0:0:0:0:0:0:0 311 2.430636968
## 0:0:0:0:0:1:0:0:0:0:0:0:0:0:0:1 108 0.844079719
## 0:0:0:0:0:1:0:0:0:0:0:0:1:0:0:0 21 0.164126612
## 0:0:0:0:0:1:0:0:0:0:0:0:1:0:0:1 8 0.062524424
## 0:0:0:0:0:1:0:0:0:0:0:1:0:0:0:0 33 0.257913247
## 0:0:0:0:0:1:0:0:0:0:0:1:0:0:0:1 15 0.117233294
## 0:0:0:0:0:1:0:0:0:0:0:1:1:0:0:1 1 0.007815553
## 0:0:0:0:0:1:0:0:0:0:1:0:0:0:0:0 9 0.070339977
## 0:0:0:0:0:1:0:0:0:0:1:0:0:0:0:1 2 0.015631106
## 0:0:0:0:0:1:0:0:0:0:1:0:1:0:0:0 2 0.015631106
## 0:0:0:0:0:1:0:0:0:0:1:1:0:0:0:0 3 0.023446659
## 0:0:0:0:0:1:0:0:1:0:0:0:0:0:0:0 22 0.171942165
## 0:0:0:0:0:1:0:0:1:0:0:0:0:0:0:1 5 0.039077765
## 0:0:0:0:0:1:0:0:1:0:0:1:0:0:0:1 2 0.015631106
## 0:0:0:0:0:1:0:0:1:0:1:0:0:0:0:0 1 0.007815553
## 0:0:0:0:0:1:0:1:0:0:0:0:0:0:0:0 17 0.132864400
## 0:0:0:0:0:1:0:1:0:0:0:0:0:0:0:1 3 0.023446659
## 0:0:0:0:0:1:0:1:0:0:0:0:1:0:0:1 2 0.015631106
## 0:0:0:0:0:1:0:1:0:0:0:1:0:0:0:0 3 0.023446659
## 0:0:0:0:0:1:0:1:0:0:0:1:0:0:0:1 1 0.007815553
## 0:0:0:0:0:1:0:1:0:0:1:0:0:0:0:1 1 0.007815553
## 0:0:0:0:0:1:0:1:1:0:0:0:0:0:0:0 2 0.015631106
## 0:0:0:0:0:1:1:0:0:0:0:0:0:0:0:0 25 0.195388824
## 0:0:0:0:0:1:1:0:0:0:0:0:0:0:0:1 10 0.078155530
## 0:0:0:0:0:1:1:0:0:0:0:0:1:0:0:1 2 0.015631106
## 0:0:0:0:0:1:1:0:0:0:0:1:0:0:0:0 1 0.007815553
## 0:0:0:0:0:1:1:0:0:0:0:1:0:0:0:1 1 0.007815553
## 0:0:0:0:0:1:1:0:0:0:0:1:1:0:0:0 1 0.007815553
## 0:0:0:0:0:1:1:0:1:0:0:0:0:0:0:1 2 0.015631106
## 0:0:0:0:0:1:1:1:0:0:0:0:0:0:0:1 1 0.007815553
## 0:0:0:0:0:1:1:1:1:0:0:0:0:0:0:0 1 0.007815553
# Table of the variables that have missing values, with the share of rows
# affected, ordered from most to least missing.
missing_plot$missings %>%
  mutate(pct_missing = Count / nrow(train)) %>%
  arrange(desc(pct_missing)) %>%
  filter(pct_missing > 0) %>%
  kable(digits = 3, row.names = TRUE, caption = "Variables Missing Values")
Variables Missing Values
|
Variable
|
Count
|
pct_missing
|
1
|
STARS
|
3359
|
0.263
|
2
|
Sulphates
|
1210
|
0.095
|
3
|
TotalSulfurDioxide
|
682
|
0.053
|
4
|
Alcohol
|
653
|
0.051
|
5
|
FreeSulfurDioxide
|
647
|
0.051
|
6
|
Chlorides
|
638
|
0.050
|
7
|
ResidualSugar
|
616
|
0.048
|
8
|
pH
|
395
|
0.031
|
Data Preparation
# Share of negative readings in each variable that has more than 15 distinct
# values and a minimum below zero.
vars_neg_values <-
  dplyr::select(
    train,
    intersect(
      rownames(metrics_df)[metrics_df$unique_values > 15],
      rownames(metrics_df)[metrics_df$min < 0]
    )
  )
# Build the FALSE/TRUE proportion matrix column by column. Computing the share
# directly (instead of via table()) always yields exactly two entries per
# variable, so the result stays rectangular even if a variable has no
# non-negative values. Missing values are excluded from the denominator.
neg_proportions <- t(vapply(
  vars_neg_values,
  function(column) {
    share_negative <- mean(column < 0, na.rm = TRUE)
    c(1 - share_negative, share_negative)
  },
  c("FALSE" = 0, "TRUE" = 0)
))
data.frame(
  Var = rownames(neg_proportions),
  is_negative = neg_proportions[, 2]
) %>%
  arrange(desc(is_negative)) %>%
  kable(digits = 2)
Var
|
is_negative
|
Chlorides
|
0.26
|
ResidualSugar
|
0.26
|
FreeSulfurDioxide
|
0.25
|
CitricAcid
|
0.23
|
VolatileAcidity
|
0.22
|
TotalSulfurDioxide
|
0.21
|
Sulphates
|
0.20
|
FixedAcidity
|
0.13
|
Alcohol
|
0.01
|
# Derived variable: total minus free sulfur dioxide, taken from the values as
# they stand at this point (before imputation and before abs()). Rows where
# either input is NA stay NA here and are filled by the imputation below.
train$BoundSulfurDioxide <- train$TotalSulfurDioxide - train$FreeSulfurDioxide

# Mean imputation: replace each missing value with the mean of the observed
# values of the same column.
# NOTE(review): STARS is entered as a factor in the model further down, so its
# imputed (non-integer) mean effectively forms a separate level for wines
# without a rating -- confirm this is intended.
mean_imputed_vars <- c(
  "STARS", "Alcohol", "Sulphates", "pH", "TotalSulfurDioxide",
  "FreeSulfurDioxide", "BoundSulfurDioxide", "Chlorides", "ResidualSugar"
)
for (var_name in mean_imputed_vars) {
  is_missing <- is.na(train[[var_name]])
  train[[var_name]][is_missing] <- mean(train[[var_name]], na.rm = TRUE)
}

# Sign correction: negative values are replaced by their absolute value
# (original author's note: converted to positive based upon literature).
abs_vars <- c(
  "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar",
  "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide",
  "BoundSulfurDioxide", "Sulphates", "Alcohol"
)
train[abs_vars] <- lapply(train[abs_vars], abs)

# Volatile share of (fixed + volatile) acidity. Created only after abs();
# the original note gives avoiding NaN and Inf as the reason.
train$PerVol <-
  train$VolatileAcidity / (train$FixedAcidity + train$VolatileAcidity)

# Shift the categorical LabelAppeal codes up by 2.
train$LabelAppeal <- train$LabelAppeal + 2

# train2: copy taken while INDEX is still present, with STARS as a factor.
train2 <- train
train2$STARS <- as.factor(train2$STARS)

# Remove the INDEX column from train.
train <- train[, colnames(train) != "INDEX"]
#
# #create variable
# train$new <- train$tax / (train$medv*10)
#
# Pairwise correlations (with p-values) between all numeric columns of train.
trainnum <- train[vapply(train, is.numeric, logical(1))]
rcorr(as.matrix(trainnum))
## TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET 1.00 -0.05 -0.07 0.01
## FixedAcidity -0.05 1.00 0.01 0.00
## VolatileAcidity -0.07 0.01 1.00 0.00
## CitricAcid 0.01 0.00 0.00 1.00
## ResidualSugar 0.00 0.00 0.00 -0.01
## Chlorides -0.03 0.00 0.01 0.00
## FreeSulfurDioxide 0.02 0.00 -0.01 0.01
## TotalSulfurDioxide 0.03 -0.01 -0.03 0.01
## Density -0.04 0.00 0.00 -0.01
## pH -0.01 0.00 0.02 0.00
## Sulphates -0.03 0.02 0.01 0.01
## Alcohol 0.06 -0.01 0.01 -0.01
## LabelAppeal 0.36 0.00 -0.02 0.02
## AcidIndex -0.25 0.18 0.04 0.04
## STARS 0.39 -0.02 -0.03 0.00
## BoundSulfurDioxide 0.01 0.00 -0.03 0.02
## PerVol -0.03 -0.49 0.47 0.00
## ResidualSugar Chlorides FreeSulfurDioxide
## TARGET 0.00 -0.03 0.02
## FixedAcidity 0.00 0.00 0.00
## VolatileAcidity 0.00 0.01 -0.01
## CitricAcid -0.01 0.00 0.01
## ResidualSugar 1.00 0.00 -0.01
## Chlorides 0.00 1.00 0.00
## FreeSulfurDioxide -0.01 0.00 1.00
## TotalSulfurDioxide 0.01 -0.01 0.01
## Density 0.00 0.02 0.00
## pH 0.00 0.01 -0.01
## Sulphates -0.01 0.02 0.00
## Alcohol -0.01 0.00 -0.01
## LabelAppeal 0.00 -0.01 0.01
## AcidIndex -0.01 0.03 -0.02
## STARS 0.01 0.00 0.00
## BoundSulfurDioxide 0.01 -0.01 0.28
## PerVol 0.00 0.02 -0.01
## TotalSulfurDioxide Density pH Sulphates Alcohol
## TARGET 0.03 -0.04 -0.01 -0.03 0.06
## FixedAcidity -0.01 0.00 0.00 0.02 -0.01
## VolatileAcidity -0.03 0.00 0.02 0.01 0.01
## CitricAcid 0.01 -0.01 0.00 0.01 -0.01
## ResidualSugar 0.01 0.00 0.00 -0.01 -0.01
## Chlorides -0.01 0.02 0.01 0.02 0.00
## FreeSulfurDioxide 0.01 0.00 -0.01 0.00 -0.01
## TotalSulfurDioxide 1.00 0.01 0.01 -0.01 -0.03
## Density 0.01 1.00 0.01 0.01 -0.01
## pH 0.01 0.01 1.00 0.01 -0.01
## Sulphates -0.01 0.01 0.01 1.00 0.00
## Alcohol -0.03 -0.01 -0.01 0.00 1.00
## LabelAppeal -0.01 -0.01 0.00 0.00 0.00
## AcidIndex -0.04 0.04 -0.06 0.03 -0.04
## STARS 0.00 -0.02 0.00 0.00 0.05
## BoundSulfurDioxide 0.72 0.00 0.00 -0.01 -0.02
## PerVol -0.02 0.00 0.02 0.01 0.02
## LabelAppeal AcidIndex STARS BoundSulfurDioxide PerVol
## TARGET 0.36 -0.25 0.39 0.01 -0.03
## FixedAcidity 0.00 0.18 -0.02 0.00 -0.49
## VolatileAcidity -0.02 0.04 -0.03 -0.03 0.47
## CitricAcid 0.02 0.04 0.00 0.02 0.00
## ResidualSugar 0.00 -0.01 0.01 0.01 0.00
## Chlorides -0.01 0.03 0.00 -0.01 0.02
## FreeSulfurDioxide 0.01 -0.02 0.00 0.28 -0.01
## TotalSulfurDioxide -0.01 -0.04 0.00 0.72 -0.02
## Density -0.01 0.04 -0.02 0.00 0.00
## pH 0.00 -0.06 0.00 0.00 0.02
## Sulphates 0.00 0.03 0.00 -0.01 0.01
## Alcohol 0.00 -0.04 0.05 -0.02 0.02
## LabelAppeal 1.00 0.02 0.28 -0.01 -0.01
## AcidIndex 0.02 1.00 -0.07 0.00 -0.03
## STARS 0.28 -0.07 1.00 0.00 -0.01
## BoundSulfurDioxide -0.01 0.00 0.00 1.00 -0.03
## PerVol -0.01 -0.03 -0.01 -0.03 1.00
##
## n= 12795
##
##
## P
## TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET 0.0000 0.0000 0.1145
## FixedAcidity 0.0000 0.2489 0.6205
## VolatileAcidity 0.0000 0.2489 0.7764
## CitricAcid 0.1145 0.6205 0.7764
## ResidualSugar 0.8421 0.6514 0.8201 0.1535
## Chlorides 0.0017 0.9436 0.2518 0.5935
## FreeSulfurDioxide 0.0076 0.5887 0.3005 0.4925
## TotalSulfurDioxide 0.0002 0.2290 0.0000 0.3276
## Density 0.0000 0.9949 0.6341 0.2196
## pH 0.2939 0.7960 0.0702 0.7142
## Sulphates 0.0004 0.0348 0.4347 0.1073
## Alcohol 0.0000 0.2991 0.1197 0.4935
## LabelAppeal 0.0000 0.8000 0.0825 0.0501
## AcidIndex 0.0000 0.0000 0.0000 0.0000
## STARS 0.0000 0.0761 0.0037 0.8749
## BoundSulfurDioxide 0.4382 0.7819 0.0005 0.0220
## PerVol 0.0039 0.0000 0.0000 0.7383
## ResidualSugar Chlorides FreeSulfurDioxide
## TARGET 0.8421 0.0017 0.0076
## FixedAcidity 0.6514 0.9436 0.5887
## VolatileAcidity 0.8201 0.2518 0.3005
## CitricAcid 0.1535 0.5935 0.4925
## ResidualSugar 0.7835 0.2116
## Chlorides 0.7835 0.6498
## FreeSulfurDioxide 0.2116 0.6498
## TotalSulfurDioxide 0.1240 0.1500 0.1881
## Density 0.6577 0.0806 0.5978
## pH 0.9352 0.3766 0.4808
## Sulphates 0.4925 0.0628 0.8712
## Alcohol 0.4881 0.7274 0.1716
## LabelAppeal 0.9877 0.3329 0.1483
## AcidIndex 0.2682 0.0006 0.0269
## STARS 0.4869 0.8433 0.9593
## BoundSulfurDioxide 0.4458 0.1961 0.0000
## PerVol 0.8960 0.0649 0.4008
## TotalSulfurDioxide Density pH Sulphates Alcohol
## TARGET 0.0002 0.0000 0.2939 0.0004 0.0000
## FixedAcidity 0.2290 0.9949 0.7960 0.0348 0.2991
## VolatileAcidity 0.0000 0.6341 0.0702 0.4347 0.1197
## CitricAcid 0.3276 0.2196 0.7142 0.1073 0.4935
## ResidualSugar 0.1240 0.6577 0.9352 0.4925 0.4881
## Chlorides 0.1500 0.0806 0.3766 0.0628 0.7274
## FreeSulfurDioxide 0.1881 0.5978 0.4808 0.8712 0.1716
## TotalSulfurDioxide 0.1650 0.2828 0.1891 0.0005
## Density 0.1650 0.5219 0.2056 0.3870
## pH 0.2828 0.5219 0.2369 0.3094
## Sulphates 0.1891 0.2056 0.2369 0.9549
## Alcohol 0.0005 0.3870 0.3094 0.9549
## LabelAppeal 0.0949 0.2892 0.6451 0.9147 0.7877
## AcidIndex 0.0000 0.0000 0.0000 0.0001 0.0000
## STARS 0.8309 0.0756 0.9634 0.9707 0.0000
## BoundSulfurDioxide 0.0000 0.6587 0.6544 0.3406 0.0365
## PerVol 0.0503 0.8528 0.0291 0.3313 0.0387
## LabelAppeal AcidIndex STARS BoundSulfurDioxide PerVol
## TARGET 0.0000 0.0000 0.0000 0.4382 0.0039
## FixedAcidity 0.8000 0.0000 0.0761 0.7819 0.0000
## VolatileAcidity 0.0825 0.0000 0.0037 0.0005 0.0000
## CitricAcid 0.0501 0.0000 0.8749 0.0220 0.7383
## ResidualSugar 0.9877 0.2682 0.4869 0.4458 0.8960
## Chlorides 0.3329 0.0006 0.8433 0.1961 0.0649
## FreeSulfurDioxide 0.1483 0.0269 0.9593 0.0000 0.4008
## TotalSulfurDioxide 0.0949 0.0000 0.8309 0.0000 0.0503
## Density 0.2892 0.0000 0.0756 0.6587 0.8528
## pH 0.6451 0.0000 0.9634 0.6544 0.0291
## Sulphates 0.9147 0.0001 0.9707 0.3406 0.3313
## Alcohol 0.7877 0.0000 0.0000 0.0365 0.0387
## LabelAppeal 0.0051 0.0000 0.4144 0.2185
## AcidIndex 0.0051 0.0000 0.7082 0.0013
## STARS 0.0000 0.0000 0.5931 0.2366
## BoundSulfurDioxide 0.4144 0.7082 0.5931 0.0041
## PerVol 0.2185 0.0013 0.2366 0.0041
# Draw the correlation matrix of trainnum as a grid of squares.
corrplot(cor(trainnum), method="square")

Build Models Poisson 2
# Poisson model 1 (model1): all predictors, with LabelAppeal, AcidIndex and
# STARS entered as categorical factors.
model1 <- glm(
  formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
    ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
    BoundSulfurDioxide + Density + pH + Sulphates + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  family = poisson(),
  data = train
)
summary(model1)
##
## Call:
## glm(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## BoundSulfurDioxide + Density + pH + Sulphates + Alcohol +
## as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
## PerVol, family = poisson(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2127 -0.6516 -0.0030 0.4432 3.6940
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 1.06253675 0.37156470 2.860
## FixedAcidity -0.00072647 0.00126206 -0.576
## VolatileAcidity -0.02903800 0.01123171 -2.585
## CitricAcid 0.00869022 0.00834783 1.041
## ResidualSugar -0.00001575 0.00020681 -0.076
## Chlorides -0.03234449 0.02218266 -1.458
## FreeSulfurDioxide 0.00006404 0.00005138 1.246
## TotalSulfurDioxide 0.00011812 0.00004809 2.456
## BoundSulfurDioxide -0.00006537 0.00004433 -1.474
## Density -0.29557735 0.19193416 -1.540
## pH -0.00983182 0.00765360 -1.285
## Sulphates -0.01153271 0.00817562 -1.411
## Alcohol 0.00461643 0.00144659 3.191
## as.factor(LabelAppeal)1 0.23924089 0.03800031 6.296
## as.factor(LabelAppeal)2 0.42916835 0.03706591 11.579
## as.factor(LabelAppeal)3 0.56226154 0.03771537 14.908
## as.factor(LabelAppeal)4 0.69766946 0.04245421 16.433
## as.factor(AcidIndex)5 -0.13380941 0.32271890 -0.415
## as.factor(AcidIndex)6 -0.10034777 0.31725980 -0.316
## as.factor(AcidIndex)7 -0.13264716 0.31700855 -0.418
## as.factor(AcidIndex)8 -0.16430607 0.31706657 -0.518
## as.factor(AcidIndex)9 -0.27397521 0.31739070 -0.863
## as.factor(AcidIndex)10 -0.43449994 0.31848283 -1.364
## as.factor(AcidIndex)11 -0.79602036 0.32208457 -2.471
## as.factor(AcidIndex)12 -0.80895430 0.32774169 -2.468
## as.factor(AcidIndex)13 -0.64343858 0.33066231 -1.946
## as.factor(AcidIndex)14 -0.74416112 0.34328561 -2.168
## as.factor(AcidIndex)15 -0.30132160 0.40394479 -0.746
## as.factor(AcidIndex)16 -0.95688354 0.54863387 -1.744
## as.factor(AcidIndex)17 -1.18518604 0.54861237 -2.160
## as.factor(STARS)2 0.31833077 0.01436884 22.154
## as.factor(STARS)2.04175498092412 -0.75685033 0.01956973 -38.675
## as.factor(STARS)3 0.43713915 0.01562442 27.978
## as.factor(STARS)4 0.55871107 0.02166437 25.789
## PerVol -0.05516995 0.05207826 -1.059
## Pr(>|z|)
## (Intercept) 0.00424 **
## FixedAcidity 0.56487
## VolatileAcidity 0.00973 **
## CitricAcid 0.29787
## ResidualSugar 0.93930
## Chlorides 0.14481
## FreeSulfurDioxide 0.21263
## TotalSulfurDioxide 0.01404 *
## BoundSulfurDioxide 0.14035
## Density 0.12356
## pH 0.19893
## Sulphates 0.15836
## Alcohol 0.00142 **
## as.factor(LabelAppeal)1 0.000000000306 ***
## as.factor(LabelAppeal)2 < 0.0000000000000002 ***
## as.factor(LabelAppeal)3 < 0.0000000000000002 ***
## as.factor(LabelAppeal)4 < 0.0000000000000002 ***
## as.factor(AcidIndex)5 0.67841
## as.factor(AcidIndex)6 0.75178
## as.factor(AcidIndex)7 0.67563
## as.factor(AcidIndex)8 0.60431
## as.factor(AcidIndex)9 0.38802
## as.factor(AcidIndex)10 0.17248
## as.factor(AcidIndex)11 0.01346 *
## as.factor(AcidIndex)12 0.01358 *
## as.factor(AcidIndex)13 0.05167 .
## as.factor(AcidIndex)14 0.03018 *
## as.factor(AcidIndex)15 0.45570
## as.factor(AcidIndex)16 0.08114 .
## as.factor(AcidIndex)17 0.03075 *
## as.factor(STARS)2 < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3 < 0.0000000000000002 ***
## as.factor(STARS)4 < 0.0000000000000002 ***
## PerVol 0.28943
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 22861 on 12794 degrees of freedom
## Residual deviance: 13549 on 12760 degrees of freedom
## AIC: 45561
##
## Number of Fisher Scoring iterations: 6
# Fitted mean counts on the training rows, stored both as a standalone vector
# and as the pred1 column of train2.
predmodel1 <- predict(object = model1, type = "response")
train2$pred1 <- predmodel1
# Cross-tabulate the observed TARGET against the fitted means rounded down.
table(true = train$TARGET, pred = floor(fitted(model1))) %>%
  kable() %>%
  kable_styling()
|
0
|
1
|
2
|
3
|
4
|
5
|
6
|
7
|
0
|
495
|
1670
|
383
|
166
|
18
|
2
|
0
|
0
|
1
|
82
|
105
|
49
|
8
|
0
|
0
|
0
|
0
|
2
|
91
|
355
|
423
|
212
|
10
|
0
|
0
|
0
|
3
|
54
|
476
|
927
|
887
|
261
|
6
|
0
|
0
|
4
|
7
|
277
|
573
|
1098
|
1071
|
148
|
3
|
0
|
5
|
3
|
105
|
137
|
400
|
912
|
430
|
27
|
0
|
6
|
0
|
33
|
15
|
53
|
267
|
344
|
53
|
0
|
7
|
0
|
8
|
1
|
4
|
20
|
79
|
28
|
2
|
8
|
0
|
2
|
0
|
0
|
0
|
7
|
7
|
1
|
# Side-by-side histograms: observed TARGET vs. model1 fitted means (pred1)
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred1)

# Standard glm diagnostic plots for model1, arranged on a 2x2 grid
par(mfrow=c(2,2))
plot(model1)

# Overdispersion test for model1 (H1: true dispersion is greater than 1)
dispersiontest(model1)
##
## Overdispersion test
##
## data: model1
## z = -9.3583, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.8801366
# Poisson model 2 (model2): reduced predictor set, keeping LabelAppeal,
# AcidIndex and STARS as categorical factors.
model2 <- glm(
  formula = TARGET ~ VolatileAcidity + TotalSulfurDioxide + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  family = poisson(),
  data = train
)
summary(model2)
##
## Call:
## glm(formula = TARGET ~ VolatileAcidity + TotalSulfurDioxide +
## Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) +
## as.factor(STARS) + PerVol, family = poisson(), data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2471 -0.6496 -0.0005 0.4355 3.6907
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.71353025 0.31931670 2.235
## VolatileAcidity -0.03085150 0.01067228 -2.891
## TotalSulfurDioxide 0.00006467 0.00003195 2.024
## Alcohol 0.00461529 0.00144657 3.191
## as.factor(LabelAppeal)1 0.23988496 0.03799700 6.313
## as.factor(LabelAppeal)2 0.42949634 0.03706428 11.588
## as.factor(LabelAppeal)3 0.56362465 0.03770892 14.947
## as.factor(LabelAppeal)4 0.69761429 0.04244584 16.435
## as.factor(AcidIndex)5 -0.12466124 0.32238208 -0.387
## as.factor(AcidIndex)6 -0.08925265 0.31691690 -0.282
## as.factor(AcidIndex)7 -0.12199358 0.31663296 -0.385
## as.factor(AcidIndex)8 -0.15350050 0.31666560 -0.485
## as.factor(AcidIndex)9 -0.26427415 0.31696999 -0.834
## as.factor(AcidIndex)10 -0.42663465 0.31805212 -1.341
## as.factor(AcidIndex)11 -0.79005656 0.32162571 -2.456
## as.factor(AcidIndex)12 -0.80327975 0.32728632 -2.454
## as.factor(AcidIndex)13 -0.63916256 0.33019908 -1.936
## as.factor(AcidIndex)14 -0.73826506 0.34274553 -2.154
## as.factor(AcidIndex)15 -0.28283782 0.40345858 -0.701
## as.factor(AcidIndex)16 -0.95458004 0.54800017 -1.742
## as.factor(AcidIndex)17 -1.19689236 0.54811293 -2.184
## as.factor(STARS)2 0.31814639 0.01436122 22.153
## as.factor(STARS)2.04175498092412 -0.75871740 0.01956057 -38.788
## as.factor(STARS)3 0.43756789 0.01561931 28.015
## as.factor(STARS)4 0.55870679 0.02166337 25.790
## PerVol -0.04074099 0.04313558 -0.944
## Pr(>|z|)
## (Intercept) 0.02545 *
## VolatileAcidity 0.00384 **
## TotalSulfurDioxide 0.04295 *
## Alcohol 0.00142 **
## as.factor(LabelAppeal)1 0.000000000273 ***
## as.factor(LabelAppeal)2 < 0.0000000000000002 ***
## as.factor(LabelAppeal)3 < 0.0000000000000002 ***
## as.factor(LabelAppeal)4 < 0.0000000000000002 ***
## as.factor(AcidIndex)5 0.69899
## as.factor(AcidIndex)6 0.77823
## as.factor(AcidIndex)7 0.70003
## as.factor(AcidIndex)8 0.62786
## as.factor(AcidIndex)9 0.40442
## as.factor(AcidIndex)10 0.17979
## as.factor(AcidIndex)11 0.01403 *
## as.factor(AcidIndex)12 0.01411 *
## as.factor(AcidIndex)13 0.05291 .
## as.factor(AcidIndex)14 0.03124 *
## as.factor(AcidIndex)15 0.48328
## as.factor(AcidIndex)16 0.08152 .
## as.factor(AcidIndex)17 0.02899 *
## as.factor(STARS)2 < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3 < 0.0000000000000002 ***
## as.factor(STARS)4 < 0.0000000000000002 ***
## PerVol 0.34492
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 22861 on 12794 degrees of freedom
## Residual deviance: 13562 on 12769 degrees of freedom
## AIC: 45556
##
## Number of Fisher Scoring iterations: 6
# Fitted mean counts from model2, stored both as a standalone vector and as
# the pred2 column of train2.
predmodel2 <- predict(object = model2, type = "response")
train2$pred2 <- predmodel2
# Cross-tabulate the observed TARGET against the fitted means rounded down.
table(true = train$TARGET, pred = floor(fitted(model2))) %>%
  kable() %>%
  kable_styling()
|
0
|
1
|
2
|
3
|
4
|
5
|
6
|
7
|
0
|
500
|
1663
|
385
|
166
|
17
|
3
|
0
|
0
|
1
|
83
|
103
|
49
|
9
|
0
|
0
|
0
|
0
|
2
|
94
|
349
|
426
|
212
|
10
|
0
|
0
|
0
|
3
|
58
|
467
|
945
|
885
|
251
|
5
|
0
|
0
|
4
|
7
|
280
|
569
|
1119
|
1049
|
152
|
1
|
0
|
5
|
3
|
104
|
141
|
403
|
897
|
443
|
23
|
0
|
6
|
0
|
32
|
16
|
55
|
264
|
346
|
51
|
1
|
7
|
0
|
8
|
1
|
4
|
16
|
85
|
27
|
1
|
8
|
0
|
2
|
0
|
0
|
0
|
7
|
8
|
0
|
# Side-by-side histograms: observed TARGET vs. model2 fitted means (pred2)
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred2)

# Standard glm diagnostic plots for model2, arranged on a 2x2 grid
par(mfrow=c(2,2))
plot(model2)

# Overdispersion test for model2 (H1: true dispersion is greater than 1)
dispersiontest(model2)
##
## Overdispersion test
##
## data: model2
## z = -9.2966, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion
## 0.880733
Build Models Neg Bin Reg 2
# glm.nb() is provided by MASS.
library(MASS)
# Negative binomial model 1 (model3): same predictor set as model1.
model3 <- glm.nb(
  formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
    ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
    BoundSulfurDioxide + Density + pH + Sulphates + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  data = train
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model3)
##
## Call:
## glm.nb(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid +
## ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide +
## BoundSulfurDioxide + Density + pH + Sulphates + Alcohol +
## as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
## PerVol, data = train, init.theta = 40922.4051, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2126 -0.6516 -0.0030 0.4431 3.6939
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 1.06256763 0.37158580 2.860
## FixedAcidity -0.00072650 0.00126212 -0.576
## VolatileAcidity -0.02903909 0.01123219 -2.585
## CitricAcid 0.00869043 0.00834821 1.041
## ResidualSugar -0.00001575 0.00020682 -0.076
## Chlorides -0.03234520 0.02218366 -1.458
## FreeSulfurDioxide 0.00006404 0.00005138 1.246
## TotalSulfurDioxide 0.00011812 0.00004809 2.456
## BoundSulfurDioxide -0.00006537 0.00004434 -1.474
## Density -0.29558179 0.19194284 -1.540
## pH -0.00983270 0.00765394 -1.285
## Sulphates -0.01153325 0.00817598 -1.411
## Alcohol 0.00461635 0.00144666 3.191
## as.factor(LabelAppeal)1 0.23924036 0.03800119 6.296
## as.factor(LabelAppeal)2 0.42916666 0.03706677 11.578
## as.factor(LabelAppeal)3 0.56225795 0.03771630 14.908
## as.factor(LabelAppeal)4 0.69766527 0.04245561 16.433
## as.factor(AcidIndex)5 -0.13382982 0.32273859 -0.415
## as.factor(AcidIndex)6 -0.10036698 0.31727930 -0.316
## as.factor(AcidIndex)7 -0.13266700 0.31702803 -0.418
## as.factor(AcidIndex)8 -0.16432660 0.31708605 -0.518
## as.factor(AcidIndex)9 -0.27399928 0.31741019 -0.863
## as.factor(AcidIndex)10 -0.43452627 0.31850230 -1.364
## as.factor(AcidIndex)11 -0.79605119 0.32210392 -2.471
## as.factor(AcidIndex)12 -0.80898684 0.32776092 -2.468
## as.factor(AcidIndex)13 -0.64346919 0.33068163 -1.946
## as.factor(AcidIndex)14 -0.74418859 0.34330458 -2.168
## as.factor(AcidIndex)15 -0.30134808 0.40396482 -0.746
## as.factor(AcidIndex)16 -0.95692082 0.54865195 -1.744
## as.factor(AcidIndex)17 -1.18522561 0.54862910 -2.160
## as.factor(STARS)2 0.31833116 0.01436939 22.153
## as.factor(STARS)2.04175498092412 -0.75684944 0.01957014 -38.674
## as.factor(STARS)3 0.43714031 0.01562508 27.977
## as.factor(STARS)4 0.55871330 0.02166558 25.788
## PerVol -0.05517181 0.05208054 -1.059
## Pr(>|z|)
## (Intercept) 0.00424 **
## FixedAcidity 0.56487
## VolatileAcidity 0.00973 **
## CitricAcid 0.29788
## ResidualSugar 0.93929
## Chlorides 0.14482
## FreeSulfurDioxide 0.21263
## TotalSulfurDioxide 0.01404 *
## BoundSulfurDioxide 0.14035
## Density 0.12357
## pH 0.19891
## Sulphates 0.15836
## Alcohol 0.00142 **
## as.factor(LabelAppeal)1 0.000000000306 ***
## as.factor(LabelAppeal)2 < 0.0000000000000002 ***
## as.factor(LabelAppeal)3 < 0.0000000000000002 ***
## as.factor(LabelAppeal)4 < 0.0000000000000002 ***
## as.factor(AcidIndex)5 0.67838
## as.factor(AcidIndex)6 0.75175
## as.factor(AcidIndex)7 0.67560
## as.factor(AcidIndex)8 0.60429
## as.factor(AcidIndex)9 0.38801
## as.factor(AcidIndex)10 0.17248
## as.factor(AcidIndex)11 0.01346 *
## as.factor(AcidIndex)12 0.01358 *
## as.factor(AcidIndex)13 0.05167 .
## as.factor(AcidIndex)14 0.03018 *
## as.factor(AcidIndex)15 0.45568
## as.factor(AcidIndex)16 0.08114 .
## as.factor(AcidIndex)17 0.03075 *
## as.factor(STARS)2 < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3 < 0.0000000000000002 ***
## as.factor(STARS)4 < 0.0000000000000002 ***
## PerVol 0.28944
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(40922.41) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 13549 on 12760 degrees of freedom
## AIC: 45564
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 40922
## Std. Err.: 34326
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -45491.65
# Fitted mean counts from model3, stored both as a standalone vector and as
# the pred3 column of train2.
predmodel3 <- predict(object = model3, type = "response")
train2$pred3 <- predmodel3
# Cross-tabulate the observed TARGET against the fitted means rounded down.
table(true = train$TARGET, pred = floor(fitted(model3))) %>%
  kable() %>%
  kable_styling()
|
0
|
1
|
2
|
3
|
4
|
5
|
6
|
7
|
0
|
495
|
1670
|
383
|
166
|
18
|
2
|
0
|
0
|
1
|
82
|
105
|
49
|
8
|
0
|
0
|
0
|
0
|
2
|
91
|
355
|
423
|
212
|
10
|
0
|
0
|
0
|
3
|
54
|
476
|
927
|
887
|
261
|
6
|
0
|
0
|
4
|
7
|
277
|
573
|
1098
|
1071
|
148
|
3
|
0
|
5
|
3
|
105
|
137
|
400
|
912
|
430
|
27
|
0
|
6
|
0
|
33
|
15
|
53
|
267
|
344
|
53
|
0
|
7
|
0
|
8
|
1
|
4
|
20
|
79
|
28
|
2
|
8
|
0
|
2
|
0
|
0
|
0
|
7
|
7
|
1
|
# Side-by-side histograms: observed TARGET vs. model3 fitted means (pred3)
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred3)

# Standard diagnostic plots for model3, arranged on a 2x2 grid
par(mfrow=c(2,2))
plot(model3)

# Negative binomial model 2 (model4): same reduced predictor set as model2.
model4 <- glm.nb(
  formula = TARGET ~ VolatileAcidity + TotalSulfurDioxide + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  data = train
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model4)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + TotalSulfurDioxide +
## Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) +
## as.factor(STARS) + PerVol, data = train, init.theta = 40886.26992,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2470 -0.6496 -0.0005 0.4354 3.6906
##
## Coefficients:
## Estimate Std. Error z value
## (Intercept) 0.71355226 0.31933620 2.234
## VolatileAcidity -0.03085266 0.01067275 -2.891
## TotalSulfurDioxide 0.00006468 0.00003195 2.024
## Alcohol 0.00461521 0.00144663 3.190
## as.factor(LabelAppeal)1 0.23988448 0.03799789 6.313
## as.factor(LabelAppeal)2 0.42949469 0.03706514 11.588
## as.factor(LabelAppeal)3 0.56362112 0.03770984 14.946
## as.factor(LabelAppeal)4 0.69761012 0.04244725 16.435
## as.factor(AcidIndex)5 -0.12468053 0.32240182 -0.387
## as.factor(AcidIndex)6 -0.08927068 0.31693642 -0.282
## as.factor(AcidIndex)7 -0.12201221 0.31665248 -0.385
## as.factor(AcidIndex)8 -0.15351977 0.31668512 -0.485
## as.factor(AcidIndex)9 -0.26429701 0.31698951 -0.834
## as.factor(AcidIndex)10 -0.42665984 0.31807163 -1.341
## as.factor(AcidIndex)11 -0.79008634 0.32164510 -2.456
## as.factor(AcidIndex)12 -0.80331117 0.32730558 -2.454
## as.factor(AcidIndex)13 -0.63919219 0.33021843 -1.936
## as.factor(AcidIndex)14 -0.73829156 0.34276453 -2.154
## as.factor(AcidIndex)15 -0.28286247 0.40347863 -0.701
## as.factor(AcidIndex)16 -0.95461650 0.54801814 -1.742
## as.factor(AcidIndex)17 -1.19693172 0.54812968 -2.184
## as.factor(STARS)2 0.31814672 0.01436176 22.152
## as.factor(STARS)2.04175498092412 -0.75871659 0.01956098 -38.787
## as.factor(STARS)3 0.43756899 0.01561998 28.013
## as.factor(STARS)4 0.55870896 0.02166458 25.789
## PerVol -0.04074236 0.04313746 -0.944
## Pr(>|z|)
## (Intercept) 0.02545 *
## VolatileAcidity 0.00384 **
## TotalSulfurDioxide 0.04294 *
## Alcohol 0.00142 **
## as.factor(LabelAppeal)1 0.000000000273 ***
## as.factor(LabelAppeal)2 < 0.0000000000000002 ***
## as.factor(LabelAppeal)3 < 0.0000000000000002 ***
## as.factor(LabelAppeal)4 < 0.0000000000000002 ***
## as.factor(AcidIndex)5 0.69896
## as.factor(AcidIndex)6 0.77820
## as.factor(AcidIndex)7 0.70000
## as.factor(AcidIndex)8 0.62784
## as.factor(AcidIndex)9 0.40441
## as.factor(AcidIndex)10 0.17979
## as.factor(AcidIndex)11 0.01403 *
## as.factor(AcidIndex)12 0.01412 *
## as.factor(AcidIndex)13 0.05291 .
## as.factor(AcidIndex)14 0.03125 *
## as.factor(AcidIndex)15 0.48327
## as.factor(AcidIndex)16 0.08152 .
## as.factor(AcidIndex)17 0.02899 *
## as.factor(STARS)2 < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3 < 0.0000000000000002 ***
## as.factor(STARS)4 < 0.0000000000000002 ***
## PerVol 0.34493
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(40886.27) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 13561 on 12769 degrees of freedom
## AIC: 45558
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 40886
## Std. Err.: 34285
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -45504.06
# Fitted mean counts from the reduced negative binomial model. These have to
# come from model4: calling predict() on model2 here would make pred4 a silent
# copy of the Poisson predictions already stored in pred2.
predmodel4 <- predict(model4, type = "response")
train2$pred4 <- predmodel4
# Cross-tabulate the observed TARGET against the fitted means rounded down.
table(true = train$TARGET, pred = floor(fitted(model4))) %>%
  kable() %>%
  kable_styling()
|
0
|
1
|
2
|
3
|
4
|
5
|
6
|
7
|
0
|
500
|
1663
|
385
|
166
|
17
|
3
|
0
|
0
|
1
|
83
|
103
|
49
|
9
|
0
|
0
|
0
|
0
|
2
|
94
|
349
|
426
|
212
|
10
|
0
|
0
|
0
|
3
|
58
|
467
|
945
|
885
|
251
|
5
|
0
|
0
|
4
|
7
|
280
|
569
|
1119
|
1049
|
152
|
1
|
0
|
5
|
3
|
104
|
141
|
403
|
897
|
443
|
23
|
0
|
6
|
0
|
32
|
16
|
55
|
264
|
346
|
51
|
1
|
7
|
0
|
8
|
1
|
4
|
16
|
85
|
27
|
1
|
8
|
0
|
2
|
0
|
0
|
0
|
7
|
8
|
0
|
# Side-by-side histograms: observed TARGET vs. the pred4 column of train2
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred4)

# Standard diagnostic plots for model4, arranged on a 2x2 grid
par(mfrow=c(2,2))
plot(model4)

Build Models Linear 2
# Linear model 1 (model5): ordinary least squares of TARGET on every other
# column of train.
model5 <- lm(formula = TARGET ~ ., data = train)
summary(model5)
##
## Call:
## lm(formula = TARGET ~ ., data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0189 -0.7380 0.3737 1.1294 4.6454
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3757840 0.5573755 7.851 0.00000000000000447 ***
## FixedAcidity -0.0036799 0.0035701 -1.031 0.302683
## VolatileAcidity -0.1559722 0.0312290 -4.994 0.00000059778214213 ***
## CitricAcid 0.0551764 0.0239298 2.306 0.021140 *
## ResidualSugar -0.0001641 0.0005880 -0.279 0.780230
## Chlorides -0.1415407 0.0626819 -2.258 0.023957 *
## FreeSulfurDioxide 0.0004751 0.0001478 3.214 0.001312 **
## TotalSulfurDioxide 0.0007186 0.0001376 5.222 0.00000017982823371 ***
## Density -1.3781218 0.5464768 -2.522 0.011687 *
## pH -0.0633858 0.0216939 -2.922 0.003486 **
## Sulphates -0.0665696 0.0229855 -2.896 0.003784 **
## Alcohol 0.0210964 0.0041090 5.134 0.00000028748895271 ***
## LabelAppeal 0.6034374 0.0169723 35.554 < 0.0000000000000002 ***
## AcidIndex -0.3300359 0.0112552 -29.323 < 0.0000000000000002 ***
## STARS 0.7178055 0.0195731 36.673 < 0.0000000000000002 ***
## BoundSulfurDioxide -0.0004583 0.0001280 -3.581 0.000343 ***
## PerVol -0.1285625 0.1472273 -0.873 0.382557
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.638 on 12778 degrees of freedom
## Multiple R-squared: 0.2779, Adjusted R-squared: 0.277
## F-statistic: 307.3 on 16 and 12778 DF, p-value: < 0.00000000000000022
# Left: residuals against fitted values; right: fitted values against the
# observed TARGET, both for model5
par(mfrow=c(1,2))
plot(model5$residuals ~ model5$fitted.values)
plot(model5$fitted.values,train$TARGET)

# Standard lm diagnostic plots for model5, arranged on a 2x2 grid
par(mfrow=c(2,2))
plot(model5)

# Extract the terms of model5 that are significant at the 5% level and refit
# the linear model on just those predictors.
model5_coefs <- coef(summary(model5))
is_sig <- model5_coefs[, "Pr(>|t|)"] <= 0.05
# One row per significant term: its name and its p-value.
sigvars <- data.frame(
  vars = rownames(model5_coefs)[is_sig],
  p_value = unname(model5_coefs[is_sig, "Pr(>|t|)"]),
  stringsAsFactors = FALSE
)
# The intercept is not a column of train; drop it by name rather than by
# position so the selection stays correct if the set of significant terms
# changes.
colist <- setdiff(sigvars$vars, "(Intercept)")
idx <- match(colist, names(train))
# Every remaining term must map onto a column of train.
stopifnot(!anyNA(idx))
trainmod2 <- cbind(train[, idx, drop = FALSE], train["TARGET"])
# Linear model 2 (model6): significant predictors only
model6 <- lm(formula = TARGET ~ ., data = trainmod2)
summary(model6)
##
## Call:
## lm(formula = TARGET ~ ., data = trainmod2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.0101 -0.7355 0.3733 1.1267 4.6520
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.3497981 0.5567548 7.813 0.00000000000000603 ***
## VolatileAcidity -0.1710061 0.0261125 -6.549 0.00000000006021103 ***
## CitricAcid 0.0554216 0.0239248 2.316 0.020547 *
## Chlorides -0.1420447 0.0626707 -2.267 0.023436 *
## FreeSulfurDioxide 0.0004756 0.0001478 3.218 0.001294 **
## TotalSulfurDioxide 0.0007177 0.0001376 5.216 0.00000018534856613 ***
## Density -1.3735422 0.5464180 -2.514 0.011959 *
## pH -0.0638339 0.0216882 -2.943 0.003254 **
## Sulphates -0.0670812 0.0229781 -2.919 0.003514 **
## Alcohol 0.0210671 0.0041083 5.128 0.00000029718809899 ***
## LabelAppeal 0.6036101 0.0169705 35.568 < 0.0000000000000002 ***
## AcidIndex -0.3319226 0.0110520 -30.033 < 0.0000000000000002 ***
## STARS 0.7178463 0.0195712 36.679 < 0.0000000000000002 ***
## BoundSulfurDioxide -0.0004568 0.0001279 -3.570 0.000358 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.638 on 12781 degrees of freedom
## Multiple R-squared: 0.2778, Adjusted R-squared: 0.2771
## F-statistic: 378.2 on 13 and 12781 DF, p-value: < 0.00000000000000022
# Left: residuals against fitted values; right: fitted values against the
# observed TARGET, both for model6. Two panels only, so use a 1x2 layout
# (a 2x2 grid would leave the bottom row of the figure empty).
par(mfrow = c(1, 2))
plot(model6$residuals ~ model6$fitted.values)
plot(model6$fitted.values,train$TARGET)

# Standard lm diagnostic plots for model6, arranged on a 2x2 grid
par(mfrow = c(2, 2))
plot(model6)

# Residuals vs. fitted for the reduced model (model6) next to the full model
# (model5), each with a horizontal reference line at zero
par(mfrow = c(1, 2))
plot(model6$residuals ~ model6$fitted.values, main="New Reduced Var Model")
abline(h = 0)
plot(model5$residuals ~ model5$fitted.values, main="Original Model All Vars")
abline(h = 0)

Select Models
# Load the evaluation data; test2 starts out as a copy of the raw frame.
test <- read.csv(file = "data/wine-evaluation-data.csv")
test2 <- test
# Report the number of rows and columns that were read.
dim(test)
## [1] 3335 16
# Derived variable, computed from the raw columns (so it is NA wherever
# either input is missing).
test$BoundSulfurDioxide <- test$TotalSulfurDioxide - test$FreeSulfurDioxide

# Mean imputation: each listed column has its NAs replaced by that column's
# own mean in the evaluation data.
mean_imputed_cols <- c(
  "STARS", "Alcohol", "Sulphates", "pH", "TotalSulfurDioxide",
  "FreeSulfurDioxide", "BoundSulfurDioxide", "Chlorides", "ResidualSugar"
)
test[mean_imputed_cols] <- lapply(
  test[mean_imputed_cols],
  function(values) {
    values[is.na(values)] <- mean(values, na.rm = TRUE)
    values
  }
)

# Negative measurements are replaced by their absolute value
# (converted to positive based upon literature).
abs_cols <- c(
  "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar",
  "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide",
  "BoundSulfurDioxide", "Sulphates", "Alcohol"
)
test[abs_cols] <- lapply(test[abs_cols], abs)

# Ratio variable, built only after abs() to avoid NaN and Inf.
test$PerVol <- with(test, VolatileAcidity / (FixedAcidity + VolatileAcidity))

# Shift the categorical LabelAppeal scale up by 2.
test$LabelAppeal <- test$LabelAppeal + 2

# Snapshot of the cleaned frame with STARS held as a factor.
test2 <- test
test2$STARS <- as.factor(test2$STARS)

# Drop the identifier columns and add a placeholder TARGET column.
test <- test[, !(colnames(test) %in% c("INDEX", "IN"))]
test$TARGET <- 0

# Any STARS value strictly between 2 and 3 is set to 2.04175498092412, the
# extra as.factor(STARS) level that model2 was fitted with.
# NOTE(review): presumably this only touches the mean-imputed rows - confirm
# that the evaluation mean of STARS always falls inside (2, 3).
test$STARS[test$STARS > 2 & test$STARS < 3] <- 2.04175498092412

# Score the evaluation data with model2, then round the predicted means down
# to whole cases and tabulate them over the levels 0 to 8.
test$TARGET <- predict(model2, newdata = test, type = "response")
y_pred_num <- floor(test$TARGET)
y_pred <- factor(y_pred_num, levels = 0:8)
summary(y_pred)
## 0 1 2 3 4 5 6 7 8
## 165 773 703 763 611 288 32 0 0
# Histograms of the predicted TARGET on the evaluation data and of the
# observed TARGET on the training data, drawn on a 2x2 grid
par(mfrow=c(2,2))
hist(test$TARGET)
hist(train$TARGET)
