Abstract

The purpose of this assignment is to explore, analyze, and model a dataset containing approximately 12,000 commercially available wines. The variables are mostly related to the chemical properties of the wine being sold. The response variable (TARGET) is the number of sample cases of wine that were purchased by wine distribution companies after sampling a bottle of wine. These cases would be used to provide tasting samples to restaurants and wine stores around the United States. The more sample cases purchased, the more likely a wine is to be sold at a high-end restaurant. A large wine manufacturer wants to study the data to predict the number of wine cases that would be ordered based on the wine characteristics.

If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust its wine offering to maximize sales. Our objective is to build a count regression model to predict the number of cases of wine that will be sold given the properties of the wine. Sometimes, the fact that a variable is missing is itself predictive of the target. To build the various models, we use only the variables that are part of the dataset (or variables derived from them).

Keywords: wine, data621

Data Exploration

knitr::opts_chunk$set(echo = TRUE)
library(e1071)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(purrr)
library(tidyr)
library(ggplot2)
library(corrplot)
## corrplot 0.84 loaded
library(FactoMineR)
## Warning: package 'FactoMineR' was built under R version 3.4.4
library(VIF)
library(knitr)
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 3.4.4
library(Hmisc)
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Warning: package 'Formula' was built under R version 3.4.4
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## The following object is masked from 'package:e1071':
## 
##     impute
## The following objects are masked from 'package:base':
## 
##     format.pval, units
library(pROC)
## Warning: package 'pROC' was built under R version 3.4.4
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
library(binr)
library(MASS)
## Warning: package 'MASS' was built under R version 3.4.4
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
library(AER)
## Loading required package: car
## Warning: package 'car' was built under R version 3.4.4
## Loading required package: carData
## Warning: package 'carData' was built under R version 3.4.4
## 
## Attaching package: 'car'
## The following object is masked from 'package:VIF':
## 
##     vif
## The following object is masked from 'package:purrr':
## 
##     some
## The following object is masked from 'package:dplyr':
## 
##     recode
## Loading required package: lmtest
## Warning: package 'lmtest' was built under R version 3.4.4
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: sandwich
# Load the wine training set and report its dimensions (rows, columns).
train <- read.csv(file = "data/wine-training-data.csv")
dim(train)

# Per-column summary statistics, rendered as a styled table.
kable_styling(kable(summary(train)))
INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol LabelAppeal AcidIndex STARS
Min. : 1 Min. :0.000 Min. :-18.100 Min. :-2.7900 Min. :-3.2400 Min. :-127.800 Min. :-1.1710 Min. :-555.00 Min. :-823.0 Min. :0.8881 Min. :0.480 Min. :-3.1300 Min. :-4.70 Min. :-2.000000 Min. : 4.000 Min. :1.000
1st Qu.: 4038 1st Qu.:2.000 1st Qu.: 5.200 1st Qu.: 0.1300 1st Qu.: 0.0300 1st Qu.: -2.000 1st Qu.:-0.0310 1st Qu.: 0.00 1st Qu.: 27.0 1st Qu.:0.9877 1st Qu.:2.960 1st Qu.: 0.2800 1st Qu.: 9.00 1st Qu.:-1.000000 1st Qu.: 7.000 1st Qu.:1.000
Median : 8110 Median :3.000 Median : 6.900 Median : 0.2800 Median : 0.3100 Median : 3.900 Median : 0.0460 Median : 30.00 Median : 123.0 Median :0.9945 Median :3.200 Median : 0.5000 Median :10.40 Median : 0.000000 Median : 8.000 Median :2.000
Mean : 8070 Mean :3.029 Mean : 7.076 Mean : 0.3241 Mean : 0.3084 Mean : 5.419 Mean : 0.0548 Mean : 30.85 Mean : 120.7 Mean :0.9942 Mean :3.208 Mean : 0.5271 Mean :10.49 Mean :-0.009066 Mean : 7.773 Mean :2.042
3rd Qu.:12106 3rd Qu.:4.000 3rd Qu.: 9.500 3rd Qu.: 0.6400 3rd Qu.: 0.5800 3rd Qu.: 15.900 3rd Qu.: 0.1530 3rd Qu.: 70.00 3rd Qu.: 208.0 3rd Qu.:1.0005 3rd Qu.:3.470 3rd Qu.: 0.8600 3rd Qu.:12.40 3rd Qu.: 1.000000 3rd Qu.: 8.000 3rd Qu.:3.000
Max. :16129 Max. :8.000 Max. : 34.400 Max. : 3.6800 Max. : 3.8600 Max. : 141.150 Max. : 1.3510 Max. : 623.00 Max. :1057.0 Max. :1.0992 Max. :6.130 Max. : 4.2400 Max. :26.50 Max. : 2.000000 Max. :17.000 Max. :4.000
NA NA NA NA NA NA’s :616 NA’s :638 NA’s :647 NA’s :682 NA NA’s :395 NA’s :1210 NA’s :653 NA NA NA’s :3359
# Compact structure listing: each column's type and leading values.
str(object = train)
## 'data.frame':    12795 obs. of  16 variables:
##  $ INDEX             : int  1 2 4 5 6 7 8 11 12 13 ...
##  $ TARGET            : int  3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity      : num  3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
##  $ VolatileAcidity   : num  1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
##  $ CitricAcid        : num  -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
##  $ ResidualSugar     : num  54.2 26.1 14.8 18.8 9.4 ...
##  $ Chlorides         : num  -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
##  $ FreeSulfurDioxide : num  NA 15 214 22 -167 -37 287 523 -213 62 ...
##  $ TotalSulfurDioxide: num  268 -327 142 115 108 15 156 551 NA 180 ...
##  $ Density           : num  0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                : num  3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Sulphates         : num  -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
##  $ Alcohol           : num  9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal       : int  0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex         : int  8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS             : int  2 3 3 1 2 NA NA 3 NA 4 ...
# Number of missing values in each column, rendered as a styled table.
vapply(train, function(column) sum(is.na(column)), integer(1)) %>%
  kable() %>%
  kable_styling()
x
INDEX 0
TARGET 0
FixedAcidity 0
VolatileAcidity 0
CitricAcid 0
ResidualSugar 616
Chlorides 638
FreeSulfurDioxide 647
TotalSulfurDioxide 682
Density 0
pH 395
Sulphates 1210
Alcohol 653
LabelAppeal 0
AcidIndex 0
STARS 3359
library(UpSetR)
## 
## Attaching package: 'UpSetR'
## The following object is masked from 'package:lattice':
## 
##     histogram
library(naniar)
## Warning: package 'naniar' was built under R version 3.4.4
# Missingness of each variable, split by the value of TARGET.
gg_miss_fct(x = train, fct = TARGET)

# UpSet plot of which variables are missing together.
upset(as_shadow_upset(train), nsets = 24)

# Density of every numeric variable, one free-scaled panel per variable.
ntrain <- select_if(train, is.numeric)
ggplot(
  gather(keep(ntrain, is.numeric)),  # long key/value form of the numeric columns
  aes(value)
) +
  facet_wrap(~ key, scales = "free") +
  geom_density()
## Warning: Removed 8200 rows containing non-finite values (stat_density).

# Build a table of descriptive statistics for the numeric columns of `df`.
#
# Args:
#   df: a data frame; non-numeric columns are ignored.
#
# Returns:
#   The psych::describe() result for the numeric columns, restricted to
#   n, unique_values, min, Q.1st, median, mean, Q.3rd, max, range, sd,
#   skew and kurtosis (the quartiles are renamed from Q0.25 / Q0.75).
summary_metrics <- function(df) {
  # drop = FALSE keeps a data frame even when exactly one column is numeric;
  # without it the subset collapses to a bare vector and the per-column
  # unique-value count below cannot iterate over columns.
  is_numeric_col <- vapply(df, is.numeric, logical(1))
  metrics_only <- df[, is_numeric_col, drop = FALSE]

  df_metrics <- psych::describe(metrics_only, quant = c(.25, .75))

  # Distinct values per column; NA, if present, counts as one distinct value.
  df_metrics$unique_values <- vapply(
    metrics_only,
    function(x) length(unique(x)),
    integer(1)
  )

  dplyr::select(
    df_metrics,
    n, unique_values, min, Q.1st = Q0.25, median, mean, Q.3rd = Q0.75,
    max, range, sd, skew, kurtosis
  )
}


# Descriptive statistics for every numeric column of the training data.
metrics_df <- summary_metrics(train)

# Keep the variables with fewer than 15 distinct values and reshape them to
# long form, carrying TARGET along as the id column.
boxplot_data <- reshape2::melt(
  dplyr::select(train, rownames(metrics_df)[metrics_df$unique_values < 15]),
  id.vars = "TARGET"
)

# Distribution of TARGET at each level of every low-cardinality variable.
boxplot_data %>%
  ggplot(aes(x = factor(value), y = TARGET)) +
  geom_boxplot() +
  facet_wrap(~ variable, scales = "free") +
  coord_flip() +
  ggthemes::theme_fivethirtyeight()

# Complete cases only, without the INDEX identifier column.
trainc <- train[complete.cases(train), names(train) != "INDEX"]

# Pairwise correlations with their p-values.
rcorr(as.matrix(trainc))
##                    TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET               1.00        -0.01           -0.08       0.00
## FixedAcidity        -0.01         1.00            0.02       0.01
## VolatileAcidity     -0.08         0.02            1.00      -0.02
## CitricAcid           0.00         0.01           -0.02       1.00
## ResidualSugar        0.00        -0.02            0.00      -0.01
## Chlorides           -0.03        -0.01            0.01      -0.03
## FreeSulfurDioxide    0.02         0.02           -0.01       0.01
## TotalSulfurDioxide   0.02        -0.02            0.00      -0.01
## Density             -0.05         0.01            0.01      -0.02
## pH                   0.00         0.00            0.01       0.00
## Sulphates           -0.02         0.04            0.00      -0.01
## Alcohol              0.07        -0.01            0.00       0.02
## LabelAppeal          0.50         0.01           -0.02       0.02
## AcidIndex           -0.17         0.15            0.03       0.05
## STARS                0.55         0.00           -0.04       0.01
##                    ResidualSugar Chlorides FreeSulfurDioxide
## TARGET                      0.00     -0.03              0.02
## FixedAcidity               -0.02     -0.01              0.02
## VolatileAcidity             0.00      0.01             -0.01
## CitricAcid                 -0.01     -0.03              0.01
## ResidualSugar               1.00      0.00              0.02
## Chlorides                   0.00      1.00             -0.02
## FreeSulfurDioxide           0.02     -0.02              1.00
## TotalSulfurDioxide          0.02      0.00              0.01
## Density                    -0.01      0.02             -0.01
## pH                          0.02     -0.02              0.00
## Sulphates                   0.00      0.00              0.03
## Alcohol                    -0.02     -0.02             -0.02
## LabelAppeal                 0.00     -0.01              0.01
## AcidIndex                  -0.02      0.00             -0.01
## STARS                       0.02     -0.01             -0.02
##                    TotalSulfurDioxide Density    pH Sulphates Alcohol
## TARGET                           0.02   -0.05  0.00     -0.02    0.07
## FixedAcidity                    -0.02    0.01  0.00      0.04   -0.01
## VolatileAcidity                  0.00    0.01  0.01      0.00    0.00
## CitricAcid                      -0.01   -0.02  0.00     -0.01    0.02
## ResidualSugar                    0.02   -0.01  0.02      0.00   -0.02
## Chlorides                        0.00    0.02 -0.02      0.00   -0.02
## FreeSulfurDioxide                0.01   -0.01  0.00      0.03   -0.02
## TotalSulfurDioxide               1.00    0.02  0.00      0.00   -0.02
## Density                          0.02    1.00  0.00     -0.01   -0.01
## pH                               0.00    0.00  1.00      0.01   -0.01
## Sulphates                        0.00   -0.01  0.01      1.00    0.01
## Alcohol                         -0.02   -0.01 -0.01      0.01    1.00
## LabelAppeal                      0.00   -0.02  0.00      0.00    0.00
## AcidIndex                       -0.02    0.05 -0.05      0.03   -0.06
## STARS                            0.02   -0.03  0.00     -0.02    0.06
##                    LabelAppeal AcidIndex STARS
## TARGET                    0.50     -0.17  0.55
## FixedAcidity              0.01      0.15  0.00
## VolatileAcidity          -0.02      0.03 -0.04
## CitricAcid                0.02      0.05  0.01
## ResidualSugar             0.00     -0.02  0.02
## Chlorides                -0.01      0.00 -0.01
## FreeSulfurDioxide         0.01     -0.01 -0.02
## TotalSulfurDioxide        0.00     -0.02  0.02
## Density                  -0.02      0.05 -0.03
## pH                        0.00     -0.05  0.00
## Sulphates                 0.00      0.03 -0.02
## Alcohol                   0.00     -0.06  0.06
## LabelAppeal               1.00      0.01  0.32
## AcidIndex                 0.01      1.00 -0.10
## STARS                     0.32     -0.10  1.00
## 
## n= 6436 
## 
## 
## P
##                    TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET                    0.3146       0.0000          0.8508    
## FixedAcidity       0.3146              0.1273          0.2614    
## VolatileAcidity    0.0000 0.1273                       0.0602    
## CitricAcid         0.8508 0.2614       0.0602                    
## ResidualSugar      0.7777 0.2158       0.9025          0.4298    
## Chlorides          0.0146 0.6244       0.2336          0.0071    
## FreeSulfurDioxide  0.0693 0.2156       0.3588          0.3312    
## TotalSulfurDioxide 0.0831 0.0613       0.9525          0.4263    
## Density            0.0001 0.3532       0.2934          0.1729    
## pH                 0.9859 0.7149       0.5634          0.9515    
## Sulphates          0.0887 0.0007       0.9032          0.2473    
## Alcohol            0.0000 0.2939       0.9833          0.1730    
## LabelAppeal        0.0000 0.3615       0.1044          0.2188    
## AcidIndex          0.0000 0.0000       0.0445          0.0000    
## STARS              0.0000 0.6921       0.0012          0.5668    
##                    ResidualSugar Chlorides FreeSulfurDioxide
## TARGET             0.7777        0.0146    0.0693           
## FixedAcidity       0.2158        0.6244    0.2156           
## VolatileAcidity    0.9025        0.2336    0.3588           
## CitricAcid         0.4298        0.0071    0.3312           
## ResidualSugar                    0.7410    0.0781           
## Chlorides          0.7410                  0.1002           
## FreeSulfurDioxide  0.0781        0.1002                     
## TotalSulfurDioxide 0.1719        0.9732    0.2802           
## Density            0.5679        0.0973    0.4871           
## pH                 0.1589        0.1494    0.8720           
## Sulphates          0.8282        0.8336    0.0314           
## Alcohol            0.1286        0.0664    0.0555           
## LabelAppeal        0.7134        0.6084    0.2301           
## AcidIndex          0.1034        0.8907    0.2373           
## STARS              0.1147        0.6120    0.2170           
##                    TotalSulfurDioxide Density pH     Sulphates Alcohol
## TARGET             0.0831             0.0001  0.9859 0.0887    0.0000 
## FixedAcidity       0.0613             0.3532  0.7149 0.0007    0.2939 
## VolatileAcidity    0.9525             0.2934  0.5634 0.9032    0.9833 
## CitricAcid         0.4263             0.1729  0.9515 0.2473    0.1730 
## ResidualSugar      0.1719             0.5679  0.1589 0.8282    0.1286 
## Chlorides          0.9732             0.0973  0.1494 0.8336    0.0664 
## FreeSulfurDioxide  0.2802             0.4871  0.8720 0.0314    0.0555 
## TotalSulfurDioxide                    0.0631  0.7837 0.8408    0.1765 
## Density            0.0631                     0.8713 0.3948    0.6230 
## pH                 0.7837             0.8713         0.4019    0.3276 
## Sulphates          0.8408             0.3948  0.4019           0.3844 
## Alcohol            0.1765             0.6230  0.3276 0.3844           
## LabelAppeal        0.8271             0.1467  0.9860 0.7624    0.9587 
## AcidIndex          0.0759             0.0001  0.0000 0.0127    0.0000 
## STARS              0.0763             0.0223  0.7241 0.0635    0.0000 
##                    LabelAppeal AcidIndex STARS 
## TARGET             0.0000      0.0000    0.0000
## FixedAcidity       0.3615      0.0000    0.6921
## VolatileAcidity    0.1044      0.0445    0.0012
## CitricAcid         0.2188      0.0000    0.5668
## ResidualSugar      0.7134      0.1034    0.1147
## Chlorides          0.6084      0.8907    0.6120
## FreeSulfurDioxide  0.2301      0.2373    0.2170
## TotalSulfurDioxide 0.8271      0.0759    0.0763
## Density            0.1467      0.0001    0.0223
## pH                 0.9860      0.0000    0.7241
## Sulphates          0.7624      0.0127    0.0635
## Alcohol            0.9587      0.0000    0.0000
## LabelAppeal                    0.4087    0.0000
## AcidIndex          0.4087                0.0000
## STARS              0.0000      0.0000
# Correlation matrix of the complete-case data drawn as coloured squares.
trainc %>%
  cor() %>%
  corrplot(method = "square")

library(VIM)
## Loading required package: colorspace
## 
## Attaching package: 'colorspace'
## The following object is masked from 'package:pROC':
## 
##     coords
## Loading required package: grid
## Loading required package: data.table
## Warning: package 'data.table' was built under R version 3.4.4
## 
## Attaching package: 'data.table'
## The following object is masked from 'package:purrr':
## 
##     transpose
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## VIM is ready to use. 
##  Since version 4.0.0 the GUI is in its own package VIMGUI.
## 
##           Please use the package to use the new (and old) GUI.
## Suggestions and bug-reports can be submitted at: https://github.com/alexkowa/VIM/issues
## 
## Attaching package: 'VIM'
## The following object is masked from 'package:datasets':
## 
##     sleep
library(stringr)
## Warning: package 'stringr' was built under R version 3.4.4
# Strongly prefer fixed over scientific notation when printing numbers.
options(scipen = 999)

# Aggregate missing-value counts and missingness patterns per variable,
# sorted by the amount of missing data.
missing_plot <- VIM::aggr(
  train,
  numbers = TRUE,
  sortVars = TRUE,
  col = c("lightgreen", "darkred", "orange"),
  labels = names(train),
  ylab = c("Missing Value Counts", "Pattern")
)
## Warning in plot.aggr(res, ...): not enough vertical space to display
## frequencies (too many combinations)

## 
##  Variables sorted by number of missings: 
##            Variable      Count
##               STARS 0.26252442
##           Sulphates 0.09456819
##  TotalSulfurDioxide 0.05330207
##             Alcohol 0.05103556
##   FreeSulfurDioxide 0.05056663
##           Chlorides 0.04986323
##       ResidualSugar 0.04814381
##                  pH 0.03087143
##               INDEX 0.00000000
##              TARGET 0.00000000
##        FixedAcidity 0.00000000
##     VolatileAcidity 0.00000000
##          CitricAcid 0.00000000
##             Density 0.00000000
##         LabelAppeal 0.00000000
##           AcidIndex 0.00000000
# Missing counts per variable and per combination of variables.
missing_plot %>% summary()
## 
##  Missings per variable: 
##            Variable Count
##               INDEX     0
##              TARGET     0
##        FixedAcidity     0
##     VolatileAcidity     0
##          CitricAcid     0
##       ResidualSugar   616
##           Chlorides   638
##   FreeSulfurDioxide   647
##  TotalSulfurDioxide   682
##             Density     0
##                  pH   395
##           Sulphates  1210
##             Alcohol   653
##         LabelAppeal     0
##           AcidIndex     0
##               STARS  3359
## 
##  Missings in combinations of variables: 
##                     Combinations Count      Percent
##  0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:0  6436 50.300898789
##  0:0:0:0:0:0:0:0:0:0:0:0:0:0:0:1  2239 17.499023056
##  0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:0   335  2.618210238
##  0:0:0:0:0:0:0:0:0:0:0:0:1:0:0:1   123  0.961313013
##  0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:0   669  5.228604924
##  0:0:0:0:0:0:0:0:0:0:0:1:0:0:0:1   247  1.930441579
##  0:0:0:0:0:0:0:0:0:0:0:1:1:0:0:0    37  0.289175459
##  0:0:0:0:0:0:0:0:0:0:0:1:1:0:0:1    10  0.078155530
##  0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:0   197  1.539663931
##  0:0:0:0:0:0:0:0:0:0:1:0:0:0:0:1    81  0.633059789
##  0:0:0:0:0:0:0:0:0:0:1:0:1:0:0:0    14  0.109417741
##  0:0:0:0:0:0:0:0:0:0:1:0:1:0:0:1     5  0.039077765
##  0:0:0:0:0:0:0:0:0:0:1:1:0:0:0:0    22  0.171942165
##  0:0:0:0:0:0:0:0:0:0:1:1:0:0:0:1    11  0.085971082
##  0:0:0:0:0:0:0:0:1:0:0:0:0:0:0:0   341  2.665103556
##  0:0:0:0:0:0:0:0:1:0:0:0:0:0:0:1   123  0.961313013
##  0:0:0:0:0:0:0:0:1:0:0:0:1:0:0:0    21  0.164126612
##  0:0:0:0:0:0:0:0:1:0:0:0:1:0:0:1     8  0.062524424
##  0:0:0:0:0:0:0:0:1:0:0:1:0:0:0:0    42  0.328253224
##  0:0:0:0:0:0:0:0:1:0:0:1:0:0:0:1    16  0.125048847
##  0:0:0:0:0:0:0:0:1:0:0:1:1:0:0:0     3  0.023446659
##  0:0:0:0:0:0:0:0:1:0:0:1:1:0:0:1     1  0.007815553
##  0:0:0:0:0:0:0:0:1:0:1:0:0:0:0:0    13  0.101602188
##  0:0:0:0:0:0:0:0:1:0:1:0:0:0:0:1     6  0.046893318
##  0:0:0:0:0:0:0:0:1:0:1:0:1:0:0:0     1  0.007815553
##  0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:0   338  2.641656897
##  0:0:0:0:0:0:0:1:0:0:0:0:0:0:0:1   124  0.969128566
##  0:0:0:0:0:0:0:1:0:0:0:0:1:0:0:0    15  0.117233294
##  0:0:0:0:0:0:0:1:0:0:0:0:1:0:0:1     7  0.054708871
##  0:0:0:0:0:0:0:1:0:0:0:1:0:0:0:0    26  0.203204377
##  0:0:0:0:0:0:0:1:0:0:0:1:0:0:0:1    14  0.109417741
##  0:0:0:0:0:0:0:1:0:0:0:1:1:0:0:0     2  0.015631106
##  0:0:0:0:0:0:0:1:0:0:0:1:1:0:0:1     2  0.015631106
##  0:0:0:0:0:0:0:1:0:0:1:0:0:0:0:0    10  0.078155530
##  0:0:0:0:0:0:0:1:0:0:1:0:0:0:0:1     4  0.031262212
##  0:0:0:0:0:0:0:1:0:0:1:1:0:0:0:0     2  0.015631106
##  0:0:0:0:0:0:0:1:1:0:0:0:0:0:0:0    19  0.148495506
##  0:0:0:0:0:0:0:1:1:0:0:0:0:0:0:1    16  0.125048847
##  0:0:0:0:0:0:0:1:1:0:0:0:1:0:0:1     1  0.007815553
##  0:0:0:0:0:0:0:1:1:0:0:1:0:0:0:0     2  0.015631106
##  0:0:0:0:0:0:0:1:1:0:0:1:0:0:0:1     2  0.015631106
##  0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:0   350  2.735443533
##  0:0:0:0:0:0:1:0:0:0:0:0:0:0:0:1   113  0.883157483
##  0:0:0:0:0:0:1:0:0:0:0:0:1:0:0:0    17  0.132864400
##  0:0:0:0:0:0:1:0:0:0:0:0:1:0:0:1     7  0.054708871
##  0:0:0:0:0:0:1:0:0:0:0:1:0:0:0:0    22  0.171942165
##  0:0:0:0:0:0:1:0:0:0:0:1:0:0:0:1    14  0.109417741
##  0:0:0:0:0:0:1:0:0:0:1:0:0:0:0:0     5  0.039077765
##  0:0:0:0:0:0:1:0:0:0:1:0:0:0:0:1     3  0.023446659
##  0:0:0:0:0:0:1:0:0:0:1:0:1:0:0:0     2  0.015631106
##  0:0:0:0:0:0:1:0:1:0:0:0:0:0:0:0    20  0.156311059
##  0:0:0:0:0:0:1:0:1:0:0:0:0:0:0:1     5  0.039077765
##  0:0:0:0:0:0:1:0:1:0:0:0:1:0:0:0     1  0.007815553
##  0:0:0:0:0:0:1:0:1:0:0:0:1:0:0:1     1  0.007815553
##  0:0:0:0:0:0:1:0:1:0:0:1:0:0:0:0     2  0.015631106
##  0:0:0:0:0:0:1:1:0:0:0:0:0:0:0:0    13  0.101602188
##  0:0:0:0:0:0:1:1:0:0:0:0:0:0:0:1     9  0.070339977
##  0:0:0:0:0:0:1:1:0:0:0:0:1:0:0:0     1  0.007815553
##  0:0:0:0:0:0:1:1:0:0:0:0:1:0:0:1     2  0.015631106
##  0:0:0:0:0:0:1:1:0:0:0:1:0:0:0:0     2  0.015631106
##  0:0:0:0:0:0:1:1:0:0:0:1:0:0:0:1     1  0.007815553
##  0:0:0:0:0:0:1:1:0:0:1:0:0:0:0:0     1  0.007815553
##  0:0:0:0:0:0:1:1:1:0:0:0:0:0:0:0     3  0.023446659
##  0:0:0:0:0:1:0:0:0:0:0:0:0:0:0:0   311  2.430636968
##  0:0:0:0:0:1:0:0:0:0:0:0:0:0:0:1   108  0.844079719
##  0:0:0:0:0:1:0:0:0:0:0:0:1:0:0:0    21  0.164126612
##  0:0:0:0:0:1:0:0:0:0:0:0:1:0:0:1     8  0.062524424
##  0:0:0:0:0:1:0:0:0:0:0:1:0:0:0:0    33  0.257913247
##  0:0:0:0:0:1:0:0:0:0:0:1:0:0:0:1    15  0.117233294
##  0:0:0:0:0:1:0:0:0:0:0:1:1:0:0:1     1  0.007815553
##  0:0:0:0:0:1:0:0:0:0:1:0:0:0:0:0     9  0.070339977
##  0:0:0:0:0:1:0:0:0:0:1:0:0:0:0:1     2  0.015631106
##  0:0:0:0:0:1:0:0:0:0:1:0:1:0:0:0     2  0.015631106
##  0:0:0:0:0:1:0:0:0:0:1:1:0:0:0:0     3  0.023446659
##  0:0:0:0:0:1:0:0:1:0:0:0:0:0:0:0    22  0.171942165
##  0:0:0:0:0:1:0:0:1:0:0:0:0:0:0:1     5  0.039077765
##  0:0:0:0:0:1:0:0:1:0:0:1:0:0:0:1     2  0.015631106
##  0:0:0:0:0:1:0:0:1:0:1:0:0:0:0:0     1  0.007815553
##  0:0:0:0:0:1:0:1:0:0:0:0:0:0:0:0    17  0.132864400
##  0:0:0:0:0:1:0:1:0:0:0:0:0:0:0:1     3  0.023446659
##  0:0:0:0:0:1:0:1:0:0:0:0:1:0:0:1     2  0.015631106
##  0:0:0:0:0:1:0:1:0:0:0:1:0:0:0:0     3  0.023446659
##  0:0:0:0:0:1:0:1:0:0:0:1:0:0:0:1     1  0.007815553
##  0:0:0:0:0:1:0:1:0:0:1:0:0:0:0:1     1  0.007815553
##  0:0:0:0:0:1:0:1:1:0:0:0:0:0:0:0     2  0.015631106
##  0:0:0:0:0:1:1:0:0:0:0:0:0:0:0:0    25  0.195388824
##  0:0:0:0:0:1:1:0:0:0:0:0:0:0:0:1    10  0.078155530
##  0:0:0:0:0:1:1:0:0:0:0:0:1:0:0:1     2  0.015631106
##  0:0:0:0:0:1:1:0:0:0:0:1:0:0:0:0     1  0.007815553
##  0:0:0:0:0:1:1:0:0:0:0:1:0:0:0:1     1  0.007815553
##  0:0:0:0:0:1:1:0:0:0:0:1:1:0:0:0     1  0.007815553
##  0:0:0:0:0:1:1:0:1:0:0:0:0:0:0:1     2  0.015631106
##  0:0:0:0:0:1:1:1:0:0:0:0:0:0:0:1     1  0.007815553
##  0:0:0:0:0:1:1:1:1:0:0:0:0:0:0:0     1  0.007815553
# Share of rows missing for each variable, largest first; variables with no
# missing values are left out of the table.
missing_plot$missings %>%
  mutate(pct_missing = Count / nrow(train)) %>%
  arrange(desc(pct_missing)) %>%
  filter(pct_missing > 0) %>%
  kable(digits = 3, row.names = TRUE, caption = "Variables Missing Values")
Variables Missing Values
Variable Count pct_missing
1 STARS 3359 0.263
2 Sulphates 1210 0.095
3 TotalSulfurDioxide 682 0.053
4 Alcohol 653 0.051
5 FreeSulfurDioxide 647 0.051
6 Chlorides 638 0.050
7 ResidualSugar 616 0.048
8 pH 395 0.031

Data Preparation

# Negative values: pick the variables with more than 15 distinct values whose
# minimum is below zero.
vars_neg_values <- dplyr::select(
  train,
  rownames(metrics_df)[metrics_df$unique_values > 15 & metrics_df$min < 0]
)

# One row per variable; columns are the shares of non-missing values that are
# non-negative ("FALSE") and negative ("TRUE").
neg_proportions <- t(vapply(
  vars_neg_values,
  function(column) prop.table(table(column < 0)),
  numeric(2)
))

# Share of negative values per variable, largest first.
data.frame(
  Var = rownames(neg_proportions),
  is_negative = neg_proportions[, 2]
) %>%
  arrange(desc(is_negative)) %>%
  kable(digits = 2)
Var is_negative
Chlorides 0.26
ResidualSugar 0.26
FreeSulfurDioxide 0.25
CitricAcid 0.23
VolatileAcidity 0.22
TotalSulfurDioxide 0.21
Sulphates 0.20
FixedAcidity 0.13
Alcohol 0.01
# Derived variable: total minus free sulfur dioxide. It is computed before
# imputation, so it is NA wherever either input is NA.
train$BoundSulfurDioxide <- train$TotalSulfurDioxide - train$FreeSulfurDioxide

# Mean imputation: every missing value is replaced by the mean of the
# observed values in the same column.
mean_imputed_cols <- c(
  "STARS", "Alcohol", "Sulphates", "pH", "TotalSulfurDioxide",
  "FreeSulfurDioxide", "BoundSulfurDioxide", "Chlorides", "ResidualSugar"
)
train[mean_imputed_cols] <- lapply(
  train[mean_imputed_cols],
  function(column) replace(column, is.na(column), mean(column, na.rm = TRUE))
)

# Negative measurements are replaced by their absolute value (per the
# original author's note, this conversion is based upon literature).
abs_cols <- c(
  "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar",
  "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide",
  "BoundSulfurDioxide", "Sulphates", "Alcohol"
)
train[abs_cols] <- lapply(train[abs_cols], abs)

# Volatile share of (fixed + volatile) acidity; built after abs() to avoid
# NaN and Inf.
train$PerVol <- with(train, VolatileAcidity / (FixedAcidity + VolatileAcidity))

# Shift the categorical LabelAppeal scale up by 2.
train$LabelAppeal <- train$LabelAppeal + 2

# Copy of the prepared data with STARS as a factor.
# NOTE(review): train2 is copied before INDEX is dropped from train below, so
# train2 still carries the INDEX column - confirm this is intended.
train2 <- train
train2$STARS <- as.factor(train2$STARS)

# Drop the row identifier from the modelling data.
train$INDEX <- NULL

# Correlations (with p-values) among the numeric columns of the prepared data.
trainnum <- dplyr::select_if(train, is.numeric)

rcorr(as.matrix(trainnum))
##                    TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET               1.00        -0.05           -0.07       0.01
## FixedAcidity        -0.05         1.00            0.01       0.00
## VolatileAcidity     -0.07         0.01            1.00       0.00
## CitricAcid           0.01         0.00            0.00       1.00
## ResidualSugar        0.00         0.00            0.00      -0.01
## Chlorides           -0.03         0.00            0.01       0.00
## FreeSulfurDioxide    0.02         0.00           -0.01       0.01
## TotalSulfurDioxide   0.03        -0.01           -0.03       0.01
## Density             -0.04         0.00            0.00      -0.01
## pH                  -0.01         0.00            0.02       0.00
## Sulphates           -0.03         0.02            0.01       0.01
## Alcohol              0.06        -0.01            0.01      -0.01
## LabelAppeal          0.36         0.00           -0.02       0.02
## AcidIndex           -0.25         0.18            0.04       0.04
## STARS                0.39        -0.02           -0.03       0.00
## BoundSulfurDioxide   0.01         0.00           -0.03       0.02
## PerVol              -0.03        -0.49            0.47       0.00
##                    ResidualSugar Chlorides FreeSulfurDioxide
## TARGET                      0.00     -0.03              0.02
## FixedAcidity                0.00      0.00              0.00
## VolatileAcidity             0.00      0.01             -0.01
## CitricAcid                 -0.01      0.00              0.01
## ResidualSugar               1.00      0.00             -0.01
## Chlorides                   0.00      1.00              0.00
## FreeSulfurDioxide          -0.01      0.00              1.00
## TotalSulfurDioxide          0.01     -0.01              0.01
## Density                     0.00      0.02              0.00
## pH                          0.00      0.01             -0.01
## Sulphates                  -0.01      0.02              0.00
## Alcohol                    -0.01      0.00             -0.01
## LabelAppeal                 0.00     -0.01              0.01
## AcidIndex                  -0.01      0.03             -0.02
## STARS                       0.01      0.00              0.00
## BoundSulfurDioxide          0.01     -0.01              0.28
## PerVol                      0.00      0.02             -0.01
##                    TotalSulfurDioxide Density    pH Sulphates Alcohol
## TARGET                           0.03   -0.04 -0.01     -0.03    0.06
## FixedAcidity                    -0.01    0.00  0.00      0.02   -0.01
## VolatileAcidity                 -0.03    0.00  0.02      0.01    0.01
## CitricAcid                       0.01   -0.01  0.00      0.01   -0.01
## ResidualSugar                    0.01    0.00  0.00     -0.01   -0.01
## Chlorides                       -0.01    0.02  0.01      0.02    0.00
## FreeSulfurDioxide                0.01    0.00 -0.01      0.00   -0.01
## TotalSulfurDioxide               1.00    0.01  0.01     -0.01   -0.03
## Density                          0.01    1.00  0.01      0.01   -0.01
## pH                               0.01    0.01  1.00      0.01   -0.01
## Sulphates                       -0.01    0.01  0.01      1.00    0.00
## Alcohol                         -0.03   -0.01 -0.01      0.00    1.00
## LabelAppeal                     -0.01   -0.01  0.00      0.00    0.00
## AcidIndex                       -0.04    0.04 -0.06      0.03   -0.04
## STARS                            0.00   -0.02  0.00      0.00    0.05
## BoundSulfurDioxide               0.72    0.00  0.00     -0.01   -0.02
## PerVol                          -0.02    0.00  0.02      0.01    0.02
##                    LabelAppeal AcidIndex STARS BoundSulfurDioxide PerVol
## TARGET                    0.36     -0.25  0.39               0.01  -0.03
## FixedAcidity              0.00      0.18 -0.02               0.00  -0.49
## VolatileAcidity          -0.02      0.04 -0.03              -0.03   0.47
## CitricAcid                0.02      0.04  0.00               0.02   0.00
## ResidualSugar             0.00     -0.01  0.01               0.01   0.00
## Chlorides                -0.01      0.03  0.00              -0.01   0.02
## FreeSulfurDioxide         0.01     -0.02  0.00               0.28  -0.01
## TotalSulfurDioxide       -0.01     -0.04  0.00               0.72  -0.02
## Density                  -0.01      0.04 -0.02               0.00   0.00
## pH                        0.00     -0.06  0.00               0.00   0.02
## Sulphates                 0.00      0.03  0.00              -0.01   0.01
## Alcohol                   0.00     -0.04  0.05              -0.02   0.02
## LabelAppeal               1.00      0.02  0.28              -0.01  -0.01
## AcidIndex                 0.02      1.00 -0.07               0.00  -0.03
## STARS                     0.28     -0.07  1.00               0.00  -0.01
## BoundSulfurDioxide       -0.01      0.00  0.00               1.00  -0.03
## PerVol                   -0.01     -0.03 -0.01              -0.03   1.00
## 
## n= 12795 
## 
## 
## P
##                    TARGET FixedAcidity VolatileAcidity CitricAcid
## TARGET                    0.0000       0.0000          0.1145    
## FixedAcidity       0.0000              0.2489          0.6205    
## VolatileAcidity    0.0000 0.2489                       0.7764    
## CitricAcid         0.1145 0.6205       0.7764                    
## ResidualSugar      0.8421 0.6514       0.8201          0.1535    
## Chlorides          0.0017 0.9436       0.2518          0.5935    
## FreeSulfurDioxide  0.0076 0.5887       0.3005          0.4925    
## TotalSulfurDioxide 0.0002 0.2290       0.0000          0.3276    
## Density            0.0000 0.9949       0.6341          0.2196    
## pH                 0.2939 0.7960       0.0702          0.7142    
## Sulphates          0.0004 0.0348       0.4347          0.1073    
## Alcohol            0.0000 0.2991       0.1197          0.4935    
## LabelAppeal        0.0000 0.8000       0.0825          0.0501    
## AcidIndex          0.0000 0.0000       0.0000          0.0000    
## STARS              0.0000 0.0761       0.0037          0.8749    
## BoundSulfurDioxide 0.4382 0.7819       0.0005          0.0220    
## PerVol             0.0039 0.0000       0.0000          0.7383    
##                    ResidualSugar Chlorides FreeSulfurDioxide
## TARGET             0.8421        0.0017    0.0076           
## FixedAcidity       0.6514        0.9436    0.5887           
## VolatileAcidity    0.8201        0.2518    0.3005           
## CitricAcid         0.1535        0.5935    0.4925           
## ResidualSugar                    0.7835    0.2116           
## Chlorides          0.7835                  0.6498           
## FreeSulfurDioxide  0.2116        0.6498                     
## TotalSulfurDioxide 0.1240        0.1500    0.1881           
## Density            0.6577        0.0806    0.5978           
## pH                 0.9352        0.3766    0.4808           
## Sulphates          0.4925        0.0628    0.8712           
## Alcohol            0.4881        0.7274    0.1716           
## LabelAppeal        0.9877        0.3329    0.1483           
## AcidIndex          0.2682        0.0006    0.0269           
## STARS              0.4869        0.8433    0.9593           
## BoundSulfurDioxide 0.4458        0.1961    0.0000           
## PerVol             0.8960        0.0649    0.4008           
##                    TotalSulfurDioxide Density pH     Sulphates Alcohol
## TARGET             0.0002             0.0000  0.2939 0.0004    0.0000 
## FixedAcidity       0.2290             0.9949  0.7960 0.0348    0.2991 
## VolatileAcidity    0.0000             0.6341  0.0702 0.4347    0.1197 
## CitricAcid         0.3276             0.2196  0.7142 0.1073    0.4935 
## ResidualSugar      0.1240             0.6577  0.9352 0.4925    0.4881 
## Chlorides          0.1500             0.0806  0.3766 0.0628    0.7274 
## FreeSulfurDioxide  0.1881             0.5978  0.4808 0.8712    0.1716 
## TotalSulfurDioxide                    0.1650  0.2828 0.1891    0.0005 
## Density            0.1650                     0.5219 0.2056    0.3870 
## pH                 0.2828             0.5219         0.2369    0.3094 
## Sulphates          0.1891             0.2056  0.2369           0.9549 
## Alcohol            0.0005             0.3870  0.3094 0.9549           
## LabelAppeal        0.0949             0.2892  0.6451 0.9147    0.7877 
## AcidIndex          0.0000             0.0000  0.0000 0.0001    0.0000 
## STARS              0.8309             0.0756  0.9634 0.9707    0.0000 
## BoundSulfurDioxide 0.0000             0.6587  0.6544 0.3406    0.0365 
## PerVol             0.0503             0.8528  0.0291 0.3313    0.0387 
##                    LabelAppeal AcidIndex STARS  BoundSulfurDioxide PerVol
## TARGET             0.0000      0.0000    0.0000 0.4382             0.0039
## FixedAcidity       0.8000      0.0000    0.0761 0.7819             0.0000
## VolatileAcidity    0.0825      0.0000    0.0037 0.0005             0.0000
## CitricAcid         0.0501      0.0000    0.8749 0.0220             0.7383
## ResidualSugar      0.9877      0.2682    0.4869 0.4458             0.8960
## Chlorides          0.3329      0.0006    0.8433 0.1961             0.0649
## FreeSulfurDioxide  0.1483      0.0269    0.9593 0.0000             0.4008
## TotalSulfurDioxide 0.0949      0.0000    0.8309 0.0000             0.0503
## Density            0.2892      0.0000    0.0756 0.6587             0.8528
## pH                 0.6451      0.0000    0.9634 0.6544             0.0291
## Sulphates          0.9147      0.0001    0.9707 0.3406             0.3313
## Alcohol            0.7877      0.0000    0.0000 0.0365             0.0387
## LabelAppeal                    0.0051    0.0000 0.4144             0.2185
## AcidIndex          0.0051                0.0000 0.7082             0.0013
## STARS              0.0000      0.0000           0.5931             0.2366
## BoundSulfurDioxide 0.4144      0.7082    0.5931                    0.0041
## PerVol             0.2185      0.0013    0.2366 0.0041
# Correlation matrix of the numeric columns, drawn as coloured squares.
corrplot(
  cor(trainnum),
  method = "square"
)

Build Models Poisson 2

# MODEL 1: Poisson regression of TARGET on the full predictor set listed
# below. LabelAppeal, AcidIndex and STARS enter as factors, so each of their
# distinct values gets its own coefficient.
model1 <- glm(
  TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar +
    Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + BoundSulfurDioxide +
    Density + pH + Sulphates + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  data = train,
  family = poisson()
)

# Coefficient table, deviances and AIC.
summary(model1)
## 
## Call:
## glm(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     BoundSulfurDioxide + Density + pH + Sulphates + Alcohol + 
##     as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) + 
##     PerVol, family = poisson(), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2127  -0.6516  -0.0030   0.4432   3.6940  
## 
## Coefficients:
##                                     Estimate  Std. Error z value
## (Intercept)                       1.06253675  0.37156470   2.860
## FixedAcidity                     -0.00072647  0.00126206  -0.576
## VolatileAcidity                  -0.02903800  0.01123171  -2.585
## CitricAcid                        0.00869022  0.00834783   1.041
## ResidualSugar                    -0.00001575  0.00020681  -0.076
## Chlorides                        -0.03234449  0.02218266  -1.458
## FreeSulfurDioxide                 0.00006404  0.00005138   1.246
## TotalSulfurDioxide                0.00011812  0.00004809   2.456
## BoundSulfurDioxide               -0.00006537  0.00004433  -1.474
## Density                          -0.29557735  0.19193416  -1.540
## pH                               -0.00983182  0.00765360  -1.285
## Sulphates                        -0.01153271  0.00817562  -1.411
## Alcohol                           0.00461643  0.00144659   3.191
## as.factor(LabelAppeal)1           0.23924089  0.03800031   6.296
## as.factor(LabelAppeal)2           0.42916835  0.03706591  11.579
## as.factor(LabelAppeal)3           0.56226154  0.03771537  14.908
## as.factor(LabelAppeal)4           0.69766946  0.04245421  16.433
## as.factor(AcidIndex)5            -0.13380941  0.32271890  -0.415
## as.factor(AcidIndex)6            -0.10034777  0.31725980  -0.316
## as.factor(AcidIndex)7            -0.13264716  0.31700855  -0.418
## as.factor(AcidIndex)8            -0.16430607  0.31706657  -0.518
## as.factor(AcidIndex)9            -0.27397521  0.31739070  -0.863
## as.factor(AcidIndex)10           -0.43449994  0.31848283  -1.364
## as.factor(AcidIndex)11           -0.79602036  0.32208457  -2.471
## as.factor(AcidIndex)12           -0.80895430  0.32774169  -2.468
## as.factor(AcidIndex)13           -0.64343858  0.33066231  -1.946
## as.factor(AcidIndex)14           -0.74416112  0.34328561  -2.168
## as.factor(AcidIndex)15           -0.30132160  0.40394479  -0.746
## as.factor(AcidIndex)16           -0.95688354  0.54863387  -1.744
## as.factor(AcidIndex)17           -1.18518604  0.54861237  -2.160
## as.factor(STARS)2                 0.31833077  0.01436884  22.154
## as.factor(STARS)2.04175498092412 -0.75685033  0.01956973 -38.675
## as.factor(STARS)3                 0.43713915  0.01562442  27.978
## as.factor(STARS)4                 0.55871107  0.02166437  25.789
## PerVol                           -0.05516995  0.05207826  -1.059
##                                              Pr(>|z|)    
## (Intercept)                                   0.00424 ** 
## FixedAcidity                                  0.56487    
## VolatileAcidity                               0.00973 ** 
## CitricAcid                                    0.29787    
## ResidualSugar                                 0.93930    
## Chlorides                                     0.14481    
## FreeSulfurDioxide                             0.21263    
## TotalSulfurDioxide                            0.01404 *  
## BoundSulfurDioxide                            0.14035    
## Density                                       0.12356    
## pH                                            0.19893    
## Sulphates                                     0.15836    
## Alcohol                                       0.00142 ** 
## as.factor(LabelAppeal)1                0.000000000306 ***
## as.factor(LabelAppeal)2          < 0.0000000000000002 ***
## as.factor(LabelAppeal)3          < 0.0000000000000002 ***
## as.factor(LabelAppeal)4          < 0.0000000000000002 ***
## as.factor(AcidIndex)5                         0.67841    
## as.factor(AcidIndex)6                         0.75178    
## as.factor(AcidIndex)7                         0.67563    
## as.factor(AcidIndex)8                         0.60431    
## as.factor(AcidIndex)9                         0.38802    
## as.factor(AcidIndex)10                        0.17248    
## as.factor(AcidIndex)11                        0.01346 *  
## as.factor(AcidIndex)12                        0.01358 *  
## as.factor(AcidIndex)13                        0.05167 .  
## as.factor(AcidIndex)14                        0.03018 *  
## as.factor(AcidIndex)15                        0.45570    
## as.factor(AcidIndex)16                        0.08114 .  
## as.factor(AcidIndex)17                        0.03075 *  
## as.factor(STARS)2                < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3                < 0.0000000000000002 ***
## as.factor(STARS)4                < 0.0000000000000002 ***
## PerVol                                        0.28943    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 13549  on 12760  degrees of freedom
## AIC: 45561
## 
## Number of Fisher Scoring iterations: 6
# Fitted mean counts on the training data (response scale), kept both as a
# stand-alone vector and as a column of train2.
predmodel1 <- predict(model1, type = "response")
train2$pred1 <- predmodel1

# Observed TARGET (rows) against the fitted means truncated to integers
# (columns).
# NOTE(review): floor() always rounds down -- confirm that round() was not
# intended here.
table(true = train$TARGET, pred = floor(fitted(model1))) %>%
  kable() %>%
  kable_styling()
0 1 2 3 4 5 6 7
0 495 1670 383 166 18 2 0 0
1 82 105 49 8 0 0 0 0
2 91 355 423 212 10 0 0 0
3 54 476 927 887 261 6 0 0
4 7 277 573 1098 1071 148 3 0
5 3 105 137 400 912 430 27 0
6 0 33 15 53 267 344 53 0
7 0 8 1 4 20 79 28 2
8 0 2 0 0 0 7 7 1
# Side-by-side histograms: observed TARGET (left) and the fitted values of
# model1 stored in train2$pred1 (right).
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred1)

#plots for Model 1
# 2 x 2 panel layout for the diagnostic plots produced by plot() on the fit.
par(mfrow=c(2,2))
plot(model1)

# Test of the Poisson fit against the alternative that the dispersion is
# greater than 1.
# NOTE(review): dispersiontest() is not a base R function; presumably it comes
# from the AER package attached elsewhere in the file -- verify.
dispersiontest(model1)
## 
##  Overdispersion test
## 
## data:  model1
## z = -9.3583, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##  0.8801366
# MODEL 2: reduced Poisson regression. Of the continuous predictors it keeps
# VolatileAcidity, TotalSulfurDioxide and Alcohol (the ones flagged at the 5%
# level in the model1 summary) plus PerVol, together with the three factor
# terms.
model2 <- glm(
  TARGET ~ VolatileAcidity + TotalSulfurDioxide + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  data = train,
  family = poisson()
)

# Coefficient table, deviances and AIC.
summary(model2)
## 
## Call:
## glm(formula = TARGET ~ VolatileAcidity + TotalSulfurDioxide + 
##     Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) + 
##     as.factor(STARS) + PerVol, family = poisson(), data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2471  -0.6496  -0.0005   0.4355   3.6907  
## 
## Coefficients:
##                                     Estimate  Std. Error z value
## (Intercept)                       0.71353025  0.31931670   2.235
## VolatileAcidity                  -0.03085150  0.01067228  -2.891
## TotalSulfurDioxide                0.00006467  0.00003195   2.024
## Alcohol                           0.00461529  0.00144657   3.191
## as.factor(LabelAppeal)1           0.23988496  0.03799700   6.313
## as.factor(LabelAppeal)2           0.42949634  0.03706428  11.588
## as.factor(LabelAppeal)3           0.56362465  0.03770892  14.947
## as.factor(LabelAppeal)4           0.69761429  0.04244584  16.435
## as.factor(AcidIndex)5            -0.12466124  0.32238208  -0.387
## as.factor(AcidIndex)6            -0.08925265  0.31691690  -0.282
## as.factor(AcidIndex)7            -0.12199358  0.31663296  -0.385
## as.factor(AcidIndex)8            -0.15350050  0.31666560  -0.485
## as.factor(AcidIndex)9            -0.26427415  0.31696999  -0.834
## as.factor(AcidIndex)10           -0.42663465  0.31805212  -1.341
## as.factor(AcidIndex)11           -0.79005656  0.32162571  -2.456
## as.factor(AcidIndex)12           -0.80327975  0.32728632  -2.454
## as.factor(AcidIndex)13           -0.63916256  0.33019908  -1.936
## as.factor(AcidIndex)14           -0.73826506  0.34274553  -2.154
## as.factor(AcidIndex)15           -0.28283782  0.40345858  -0.701
## as.factor(AcidIndex)16           -0.95458004  0.54800017  -1.742
## as.factor(AcidIndex)17           -1.19689236  0.54811293  -2.184
## as.factor(STARS)2                 0.31814639  0.01436122  22.153
## as.factor(STARS)2.04175498092412 -0.75871740  0.01956057 -38.788
## as.factor(STARS)3                 0.43756789  0.01561931  28.015
## as.factor(STARS)4                 0.55870679  0.02166337  25.790
## PerVol                           -0.04074099  0.04313558  -0.944
##                                              Pr(>|z|)    
## (Intercept)                                   0.02545 *  
## VolatileAcidity                               0.00384 ** 
## TotalSulfurDioxide                            0.04295 *  
## Alcohol                                       0.00142 ** 
## as.factor(LabelAppeal)1                0.000000000273 ***
## as.factor(LabelAppeal)2          < 0.0000000000000002 ***
## as.factor(LabelAppeal)3          < 0.0000000000000002 ***
## as.factor(LabelAppeal)4          < 0.0000000000000002 ***
## as.factor(AcidIndex)5                         0.69899    
## as.factor(AcidIndex)6                         0.77823    
## as.factor(AcidIndex)7                         0.70003    
## as.factor(AcidIndex)8                         0.62786    
## as.factor(AcidIndex)9                         0.40442    
## as.factor(AcidIndex)10                        0.17979    
## as.factor(AcidIndex)11                        0.01403 *  
## as.factor(AcidIndex)12                        0.01411 *  
## as.factor(AcidIndex)13                        0.05291 .  
## as.factor(AcidIndex)14                        0.03124 *  
## as.factor(AcidIndex)15                        0.48328    
## as.factor(AcidIndex)16                        0.08152 .  
## as.factor(AcidIndex)17                        0.02899 *  
## as.factor(STARS)2                < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3                < 0.0000000000000002 ***
## as.factor(STARS)4                < 0.0000000000000002 ***
## PerVol                                        0.34492    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 13562  on 12769  degrees of freedom
## AIC: 45556
## 
## Number of Fisher Scoring iterations: 6
# Fitted mean counts of model2 on the training data (response scale), kept
# both as a stand-alone vector and as a column of train2.
predmodel2 <- predict(model2, type = "response")
train2$pred2 <- predmodel2

# Observed TARGET (rows) against the fitted means truncated to integers
# (columns).
# NOTE(review): floor() always rounds down -- confirm that round() was not
# intended here.
table(true = train$TARGET, pred = floor(fitted(model2))) %>%
  kable() %>%
  kable_styling()
0 1 2 3 4 5 6 7
0 500 1663 385 166 17 3 0 0
1 83 103 49 9 0 0 0 0
2 94 349 426 212 10 0 0 0
3 58 467 945 885 251 5 0 0
4 7 280 569 1119 1049 152 1 0
5 3 104 141 403 897 443 23 0
6 0 32 16 55 264 346 51 1
7 0 8 1 4 16 85 27 1
8 0 2 0 0 0 7 8 0
# Side-by-side histograms: observed TARGET (left) and the fitted values of
# model2 stored in train2$pred2 (right).
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred2)

#plots for Model 2
# 2 x 2 panel layout for the diagnostic plots produced by plot() on the fit.
par(mfrow=c(2,2))
plot(model2)

# Test of the Poisson fit against the alternative that the dispersion is
# greater than 1.
# NOTE(review): dispersiontest() is not a base R function; presumably it comes
# from the AER package attached elsewhere in the file -- verify.
dispersiontest(model2)
## 
##  Overdispersion test
## 
## data:  model2
## z = -9.2966, p-value = 1
## alternative hypothesis: true dispersion is greater than 1
## sample estimates:
## dispersion 
##   0.880733

Build Models Neg Bin Reg 2

# glm.nb() is provided by MASS.
library(MASS)

# Negative binomial MODEL 1 (stored as model3): same formula as the Poisson
# model1, fitted with glm.nb().
# NOTE(review): the fit warns that the theta iteration limit was reached and
# the summary reports a very large theta, with coefficients almost identical
# to those of model1 -- check whether this model adds anything over the
# Poisson fit.
model3 <- glm.nb(
  TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + ResidualSugar +
    Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + BoundSulfurDioxide +
    Density + pH + Sulphates + Alcohol +
    as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) +
    PerVol,
  data = train
)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Coefficient table, deviances, AIC and the estimated theta.
summary(model3)
## 
## Call:
## glm.nb(formula = TARGET ~ FixedAcidity + VolatileAcidity + CitricAcid + 
##     ResidualSugar + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + 
##     BoundSulfurDioxide + Density + pH + Sulphates + Alcohol + 
##     as.factor(LabelAppeal) + as.factor(AcidIndex) + as.factor(STARS) + 
##     PerVol, data = train, init.theta = 40922.4051, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2126  -0.6516  -0.0030   0.4431   3.6939  
## 
## Coefficients:
##                                     Estimate  Std. Error z value
## (Intercept)                       1.06256763  0.37158580   2.860
## FixedAcidity                     -0.00072650  0.00126212  -0.576
## VolatileAcidity                  -0.02903909  0.01123219  -2.585
## CitricAcid                        0.00869043  0.00834821   1.041
## ResidualSugar                    -0.00001575  0.00020682  -0.076
## Chlorides                        -0.03234520  0.02218366  -1.458
## FreeSulfurDioxide                 0.00006404  0.00005138   1.246
## TotalSulfurDioxide                0.00011812  0.00004809   2.456
## BoundSulfurDioxide               -0.00006537  0.00004434  -1.474
## Density                          -0.29558179  0.19194284  -1.540
## pH                               -0.00983270  0.00765394  -1.285
## Sulphates                        -0.01153325  0.00817598  -1.411
## Alcohol                           0.00461635  0.00144666   3.191
## as.factor(LabelAppeal)1           0.23924036  0.03800119   6.296
## as.factor(LabelAppeal)2           0.42916666  0.03706677  11.578
## as.factor(LabelAppeal)3           0.56225795  0.03771630  14.908
## as.factor(LabelAppeal)4           0.69766527  0.04245561  16.433
## as.factor(AcidIndex)5            -0.13382982  0.32273859  -0.415
## as.factor(AcidIndex)6            -0.10036698  0.31727930  -0.316
## as.factor(AcidIndex)7            -0.13266700  0.31702803  -0.418
## as.factor(AcidIndex)8            -0.16432660  0.31708605  -0.518
## as.factor(AcidIndex)9            -0.27399928  0.31741019  -0.863
## as.factor(AcidIndex)10           -0.43452627  0.31850230  -1.364
## as.factor(AcidIndex)11           -0.79605119  0.32210392  -2.471
## as.factor(AcidIndex)12           -0.80898684  0.32776092  -2.468
## as.factor(AcidIndex)13           -0.64346919  0.33068163  -1.946
## as.factor(AcidIndex)14           -0.74418859  0.34330458  -2.168
## as.factor(AcidIndex)15           -0.30134808  0.40396482  -0.746
## as.factor(AcidIndex)16           -0.95692082  0.54865195  -1.744
## as.factor(AcidIndex)17           -1.18522561  0.54862910  -2.160
## as.factor(STARS)2                 0.31833116  0.01436939  22.153
## as.factor(STARS)2.04175498092412 -0.75684944  0.01957014 -38.674
## as.factor(STARS)3                 0.43714031  0.01562508  27.977
## as.factor(STARS)4                 0.55871330  0.02166558  25.788
## PerVol                           -0.05517181  0.05208054  -1.059
##                                              Pr(>|z|)    
## (Intercept)                                   0.00424 ** 
## FixedAcidity                                  0.56487    
## VolatileAcidity                               0.00973 ** 
## CitricAcid                                    0.29788    
## ResidualSugar                                 0.93929    
## Chlorides                                     0.14482    
## FreeSulfurDioxide                             0.21263    
## TotalSulfurDioxide                            0.01404 *  
## BoundSulfurDioxide                            0.14035    
## Density                                       0.12357    
## pH                                            0.19891    
## Sulphates                                     0.15836    
## Alcohol                                       0.00142 ** 
## as.factor(LabelAppeal)1                0.000000000306 ***
## as.factor(LabelAppeal)2          < 0.0000000000000002 ***
## as.factor(LabelAppeal)3          < 0.0000000000000002 ***
## as.factor(LabelAppeal)4          < 0.0000000000000002 ***
## as.factor(AcidIndex)5                         0.67838    
## as.factor(AcidIndex)6                         0.75175    
## as.factor(AcidIndex)7                         0.67560    
## as.factor(AcidIndex)8                         0.60429    
## as.factor(AcidIndex)9                         0.38801    
## as.factor(AcidIndex)10                        0.17248    
## as.factor(AcidIndex)11                        0.01346 *  
## as.factor(AcidIndex)12                        0.01358 *  
## as.factor(AcidIndex)13                        0.05167 .  
## as.factor(AcidIndex)14                        0.03018 *  
## as.factor(AcidIndex)15                        0.45568    
## as.factor(AcidIndex)16                        0.08114 .  
## as.factor(AcidIndex)17                        0.03075 *  
## as.factor(STARS)2                < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3                < 0.0000000000000002 ***
## as.factor(STARS)4                < 0.0000000000000002 ***
## PerVol                                        0.28944    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(40922.41) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 13549  on 12760  degrees of freedom
## AIC: 45564
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  40922 
##           Std. Err.:  34326 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -45491.65
# Fitted mean counts of model3 on the training data (response scale), kept
# both as a stand-alone vector and as a column of train2.
predmodel3 <- predict(model3, type = "response")
train2$pred3 <- predmodel3

# Observed TARGET (rows) against the fitted means truncated to integers
# (columns).
# NOTE(review): floor() always rounds down -- confirm that round() was not
# intended here.
table(true = train$TARGET, pred = floor(fitted(model3))) %>%
  kable() %>%
  kable_styling()
0 1 2 3 4 5 6 7
0 495 1670 383 166 18 2 0 0
1 82 105 49 8 0 0 0 0
2 91 355 423 212 10 0 0 0
3 54 476 927 887 261 6 0 0
4 7 277 573 1098 1071 148 3 0
5 3 105 137 400 912 430 27 0
6 0 33 15 53 267 344 53 0
7 0 8 1 4 20 79 28 2
8 0 2 0 0 0 7 7 1
# Side-by-side histograms: observed TARGET (left) and the fitted values of
# model3 stored in train2$pred3 (right).
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred3)

#plots for negative binomial Model 1 (object model3)
# 2 x 2 panel layout for the diagnostic plots produced by plot() on the fit.
par(mfrow=c(2,2))
plot(model3)

#MODEL 2
# Negative binomial fit (glm.nb) of TARGET on selected predictors;
# LabelAppeal, AcidIndex and STARS enter the formula as factors.

model4 <- glm.nb(TARGET~ VolatileAcidity+TotalSulfurDioxide+Alcohol+as.factor(LabelAppeal)+as.factor(AcidIndex) + as.factor(STARS)+PerVol,data=train)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# NOTE(review): theta estimation stops at its iteration limit (warnings above)
# and the summary reports a very large Theta; presumably the counts show little
# overdispersion beyond Poisson -- confirm whether a Poisson fit would suffice.
summary(model4)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + TotalSulfurDioxide + 
##     Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) + 
##     as.factor(STARS) + PerVol, data = train, init.theta = 40886.26992, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2470  -0.6496  -0.0005   0.4354   3.6906  
## 
## Coefficients:
##                                     Estimate  Std. Error z value
## (Intercept)                       0.71355226  0.31933620   2.234
## VolatileAcidity                  -0.03085266  0.01067275  -2.891
## TotalSulfurDioxide                0.00006468  0.00003195   2.024
## Alcohol                           0.00461521  0.00144663   3.190
## as.factor(LabelAppeal)1           0.23988448  0.03799789   6.313
## as.factor(LabelAppeal)2           0.42949469  0.03706514  11.588
## as.factor(LabelAppeal)3           0.56362112  0.03770984  14.946
## as.factor(LabelAppeal)4           0.69761012  0.04244725  16.435
## as.factor(AcidIndex)5            -0.12468053  0.32240182  -0.387
## as.factor(AcidIndex)6            -0.08927068  0.31693642  -0.282
## as.factor(AcidIndex)7            -0.12201221  0.31665248  -0.385
## as.factor(AcidIndex)8            -0.15351977  0.31668512  -0.485
## as.factor(AcidIndex)9            -0.26429701  0.31698951  -0.834
## as.factor(AcidIndex)10           -0.42665984  0.31807163  -1.341
## as.factor(AcidIndex)11           -0.79008634  0.32164510  -2.456
## as.factor(AcidIndex)12           -0.80331117  0.32730558  -2.454
## as.factor(AcidIndex)13           -0.63919219  0.33021843  -1.936
## as.factor(AcidIndex)14           -0.73829156  0.34276453  -2.154
## as.factor(AcidIndex)15           -0.28286247  0.40347863  -0.701
## as.factor(AcidIndex)16           -0.95461650  0.54801814  -1.742
## as.factor(AcidIndex)17           -1.19693172  0.54812968  -2.184
## as.factor(STARS)2                 0.31814672  0.01436176  22.152
## as.factor(STARS)2.04175498092412 -0.75871659  0.01956098 -38.787
## as.factor(STARS)3                 0.43756899  0.01561998  28.013
## as.factor(STARS)4                 0.55870896  0.02166458  25.789
## PerVol                           -0.04074236  0.04313746  -0.944
##                                              Pr(>|z|)    
## (Intercept)                                   0.02545 *  
## VolatileAcidity                               0.00384 ** 
## TotalSulfurDioxide                            0.04294 *  
## Alcohol                                       0.00142 ** 
## as.factor(LabelAppeal)1                0.000000000273 ***
## as.factor(LabelAppeal)2          < 0.0000000000000002 ***
## as.factor(LabelAppeal)3          < 0.0000000000000002 ***
## as.factor(LabelAppeal)4          < 0.0000000000000002 ***
## as.factor(AcidIndex)5                         0.69896    
## as.factor(AcidIndex)6                         0.77820    
## as.factor(AcidIndex)7                         0.70000    
## as.factor(AcidIndex)8                         0.62784    
## as.factor(AcidIndex)9                         0.40441    
## as.factor(AcidIndex)10                        0.17979    
## as.factor(AcidIndex)11                        0.01403 *  
## as.factor(AcidIndex)12                        0.01412 *  
## as.factor(AcidIndex)13                        0.05291 .  
## as.factor(AcidIndex)14                        0.03125 *  
## as.factor(AcidIndex)15                        0.48327    
## as.factor(AcidIndex)16                        0.08152 .  
## as.factor(AcidIndex)17                        0.02899 *  
## as.factor(STARS)2                < 0.0000000000000002 ***
## as.factor(STARS)2.04175498092412 < 0.0000000000000002 ***
## as.factor(STARS)3                < 0.0000000000000002 ***
## as.factor(STARS)4                < 0.0000000000000002 ***
## PerVol                                        0.34493    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(40886.27) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 13561  on 12769  degrees of freedom
## AIC: 45558
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  40886 
##           Std. Err.:  34285 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -45504.06
# Response-scale predictions for MODEL 2 of this section. They must come from
# model4 (the model fitted and summarised above); predicting from model2 here
# would make predmodel4 / train2$pred4, and the histogram drawn from pred4,
# describe a different model than the table below.
predmodel4 <- predict(model4, type = "response")
train2$pred4 <- predmodel4

# Cross-tabulate observed TARGET against the fitted values rounded down.
table(true = train$TARGET, pred = floor(fitted(model4))) %>%
  kable() %>%
  kable_styling()
0 1 2 3 4 5 6 7
0 500 1663 385 166 17 3 0 0
1 83 103 49 9 0 0 0 0
2 94 349 426 212 10 0 0 0
3 58 467 945 885 251 5 0 0
4 7 280 569 1119 1049 152 1 0
5 3 104 141 403 897 443 23 0
6 0 32 16 55 264 346 51 1
7 0 8 1 4 16 85 27 1
8 0 2 0 0 0 7 8 0
# Side-by-side histograms (1x2 grid): observed TARGET next to the values
# stored in train2$pred4.
par(mfrow=c(1,2))
hist(train2$TARGET)
hist(train2$pred4)

#plots for Model 2 (model4)
# 2x2 grid so the plots drawn by plot() on the fitted model share one figure.
par(mfrow=c(2,2))
plot(model4)

Build Models Linear 2

#MODEL 1
# Linear model of TARGET on every other column of train (the `.` in the
# formula), followed by its coefficient summary.
model5 <- lm(TARGET ~ ., data=train)
summary(model5)
## 
## Call:
## lm(formula = TARGET ~ ., data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0189 -0.7380  0.3737  1.1294  4.6454 
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         4.3757840  0.5573755   7.851  0.00000000000000447 ***
## FixedAcidity       -0.0036799  0.0035701  -1.031             0.302683    
## VolatileAcidity    -0.1559722  0.0312290  -4.994  0.00000059778214213 ***
## CitricAcid          0.0551764  0.0239298   2.306             0.021140 *  
## ResidualSugar      -0.0001641  0.0005880  -0.279             0.780230    
## Chlorides          -0.1415407  0.0626819  -2.258             0.023957 *  
## FreeSulfurDioxide   0.0004751  0.0001478   3.214             0.001312 ** 
## TotalSulfurDioxide  0.0007186  0.0001376   5.222  0.00000017982823371 ***
## Density            -1.3781218  0.5464768  -2.522             0.011687 *  
## pH                 -0.0633858  0.0216939  -2.922             0.003486 ** 
## Sulphates          -0.0665696  0.0229855  -2.896             0.003784 ** 
## Alcohol             0.0210964  0.0041090   5.134  0.00000028748895271 ***
## LabelAppeal         0.6034374  0.0169723  35.554 < 0.0000000000000002 ***
## AcidIndex          -0.3300359  0.0112552 -29.323 < 0.0000000000000002 ***
## STARS               0.7178055  0.0195731  36.673 < 0.0000000000000002 ***
## BoundSulfurDioxide -0.0004583  0.0001280  -3.581             0.000343 ***
## PerVol             -0.1285625  0.1472273  -0.873             0.382557    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.638 on 12778 degrees of freedom
## Multiple R-squared:  0.2779, Adjusted R-squared:  0.277 
## F-statistic: 307.3 on 16 and 12778 DF,  p-value: < 0.00000000000000022
# 1x2 grid: residuals against fitted values, then observed TARGET (y-axis)
# against fitted values (x-axis) for the full linear model.
par(mfrow=c(1,2))
plot(model5$residuals ~ model5$fitted.values)
plot(model5$fitted.values,train$TARGET)

# 2x2 grid so the plots drawn by plot() on the lm fit share one figure.
par(mfrow=c(2,2))
plot(model5)

#extract variables that are significant and rerun model
# p-values of the full linear model, named by coefficient. Taking the whole
# column first keeps the names even if only one term turns out significant.
model5_pvalues <- summary(model5)$coefficients[, "Pr(>|t|)"]
model5_sig <- model5_pvalues[model5_pvalues <= 0.05]

# One row per significant term: its name (vars) and its p-value.
# Built directly as a tibble instead of via the deprecated dplyr::add_rownames().
sigvars <- tibble::tibble(vars = names(model5_sig), p_value = unname(model5_sig))

# Predictor names only: drop the intercept by name rather than by position, so
# the selection stays correct when the number of significant terms changes or
# the intercept itself is not significant.
colist <- dplyr::pull(sigvars, vars)
colist <- setdiff(colist, "(Intercept)")

# Every remaining term must be a real column of train; otherwise the column
# lookup below would silently produce NA indices.
stopifnot(all(colist %in% names(train)))

idx <- match(colist, names(train))
# drop = FALSE keeps a data frame even when a single predictor is selected.
trainmod2 <- cbind(train[, idx, drop = FALSE], train["TARGET"])

#MODEL 2
# Same linear model as model5, restricted to the significant predictors.
model6 <- lm(TARGET ~ ., data = trainmod2)

summary(model6)
## 
## Call:
## lm(formula = TARGET ~ ., data = trainmod2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.0101 -0.7355  0.3733  1.1267  4.6520 
## 
## Coefficients:
##                      Estimate Std. Error t value             Pr(>|t|)    
## (Intercept)         4.3497981  0.5567548   7.813  0.00000000000000603 ***
## VolatileAcidity    -0.1710061  0.0261125  -6.549  0.00000000006021103 ***
## CitricAcid          0.0554216  0.0239248   2.316             0.020547 *  
## Chlorides          -0.1420447  0.0626707  -2.267             0.023436 *  
## FreeSulfurDioxide   0.0004756  0.0001478   3.218             0.001294 ** 
## TotalSulfurDioxide  0.0007177  0.0001376   5.216  0.00000018534856613 ***
## Density            -1.3735422  0.5464180  -2.514             0.011959 *  
## pH                 -0.0638339  0.0216882  -2.943             0.003254 ** 
## Sulphates          -0.0670812  0.0229781  -2.919             0.003514 ** 
## Alcohol             0.0210671  0.0041083   5.128  0.00000029718809899 ***
## LabelAppeal         0.6036101  0.0169705  35.568 < 0.0000000000000002 ***
## AcidIndex          -0.3319226  0.0110520 -30.033 < 0.0000000000000002 ***
## STARS               0.7178463  0.0195712  36.679 < 0.0000000000000002 ***
## BoundSulfurDioxide -0.0004568  0.0001279  -3.570             0.000358 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.638 on 12781 degrees of freedom
## Multiple R-squared:  0.2778, Adjusted R-squared:  0.2771 
## F-statistic: 378.2 on 13 and 12781 DF,  p-value: < 0.00000000000000022
# 1x2 grid (same layout as the model5 version of these two plots): residuals
# against fitted values, then observed TARGET against fitted values. A 2x2 grid
# here would leave the lower half of the figure empty.
par(mfrow = c(1, 2))
plot(model6$residuals ~ model6$fitted.values)
plot(model6$fitted.values, train$TARGET)


# 2x2 grid so the plots drawn by plot() on the lm fit share one figure.
par(mfrow = c(2, 2))

plot(model6)

# Residuals vs. fitted for the reduced model (model6) next to the full model
# (model5), each with a horizontal reference line at zero.
par(mfrow = c(1, 2))
plot(model6$residuals ~ model6$fitted.values, main = "New Reduced Var Model")
abline(h = 0)
plot(model5$residuals ~ model5$fitted.values, main = "Original Model All Vars")
abline(h = 0)

Select Models

# Load the evaluation data, derive and clean its columns, then score it with
# model2 and tabulate the predicted case counts.
test <- read.csv(file = "data/wine-evaluation-data.csv")
dim(test)
## [1] 3335   16
#new variables
test$BoundSulfurDioxide <- test$TotalSulfurDioxide - test$FreeSulfurDioxide

# Remember which STARS ratings are missing before they are imputed; these rows
# are re-labelled further down, just before scoring.
stars_missing <- is.na(test$STARS)

# impute data for missing values
# use column mean for calculation
impute_cols <- c(
  "STARS", "Alcohol", "Sulphates", "pH", "TotalSulfurDioxide",
  "FreeSulfurDioxide", "BoundSulfurDioxide", "Chlorides", "ResidualSugar"
)
test[impute_cols] <- lapply(test[impute_cols], function(x) {
  x[is.na(x)] <- mean(x, na.rm = TRUE)
  x
})

#convert to abs for negative values
#converted to positive based upon literature
abs_cols <- c(
  "FixedAcidity", "VolatileAcidity", "CitricAcid", "ResidualSugar",
  "Chlorides", "FreeSulfurDioxide", "TotalSulfurDioxide",
  "BoundSulfurDioxide", "Sulphates", "Alcohol"
)
test[abs_cols] <- lapply(test[abs_cols], abs)

#new variables after abs to avoid nan and inf
test$PerVol <- test$VolatileAcidity / (test$FixedAcidity + test$VolatileAcidity)

#shift categorical labelappeal
test$LabelAppeal <- test$LabelAppeal + 2

# Snapshot of the prepared data with STARS as a factor (imputed rows still
# carry the evaluation-set mean at this point).
test2 <- test
test2$STARS <- as.factor(test2$STARS)

# Drop identifier columns, whichever of the two names is present.
test <- test[, !(colnames(test) %in% c("INDEX", "IN"))]

test$TARGET <- 0
# Give the imputed STARS rows the value 2.04175498092412, which is the extra
# STARS factor level appearing in the fitted models' coefficient tables
# (presumably the training-set mean used for imputation -- confirm). The rows
# are selected by the recorded NA mask rather than by a value-range test, so
# the re-labelling does not depend on where the evaluation-set mean falls.
test$STARS[stars_missing] <- 2.04175498092412

# NOTE(review): scoring uses model2, which is defined outside this section --
# confirm it is the model chosen in "Select Models".
test$TARGET <- predict(model2, newdata = test, type = "response")

# Round predictions down to whole cases and count them per value 0..8.
y_pred_num <- floor(test$TARGET)
y_pred <- factor(y_pred_num, levels = 0:8)
summary(y_pred)
##   0   1   2   3   4   5   6   7   8 
## 165 773 703 763 611 288  32   0   0
# Histograms of the predicted TARGET for the evaluation data and of the
# observed TARGET in train, drawn into a 2x2 grid.
# NOTE(review): only two panels are filled by the calls visible here -- confirm
# whether further plots follow or a 1x2 grid was intended.
par(mfrow=c(2,2))
hist(test$TARGET)
hist(train$TARGET)