rm(list = ls())
options(repos = "http://cran.rstudio.com/") 


packages <- c("psych",      # quick summary stats for data exploration,
              "stargazer",  # summary stats for sharing,
              "tidyverse",  # data manipulation like selecting variables,
              "corrplot",   # correlation plots
              "ggplot2",    # graphing
              "data.table", # reshape for graphing 
              "car",        # vif
              "dplyr",
              "tidyverse",
              "knitr",
              "reshape2",
              "glmnet",
              "conflicted",
              "xts", 
              "forecast",
              "qqplotr"
)  

for (i in 1:length(packages)) {
  if (!packages[i] %in% rownames(installed.packages())) {
    install.packages(packages[i]
                     , repos = "http://cran.rstudio.com/"
                     , dependencies = TRUE
    )
  }
  library(packages[i], character.only = TRUE)
}

## Warning: package 'psych' was built under R version 4.2.3

## 
## Please cite as:

##  Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.

##  R package version 5.2.3. https://CRAN.R-project.org/package=stargazer

## Warning: package 'tidyverse' was built under R version 4.2.3

## Warning: package 'ggplot2' was built under R version 4.2.3

## Warning: package 'tibble' was built under R version 4.2.3

## Warning: package 'purrr' was built under R version 4.2.3

## Warning: package 'dplyr' was built under R version 4.2.3

## Warning: package 'stringr' was built under R version 4.2.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.1     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%()   masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

## Warning: package 'corrplot' was built under R version 4.2.3

## corrplot 0.92 loaded

## Warning: package 'data.table' was built under R version 4.2.3

## 
## Attaching package: 'data.table'
## 
## The following objects are masked from 'package:lubridate':
## 
##     hour, isoweek, mday, minute, month, quarter, second, wday, week,
##     yday, year
## 
## The following objects are masked from 'package:dplyr':
## 
##     between, first, last
## 
## The following object is masked from 'package:purrr':
## 
##     transpose
## 
## Loading required package: carData

## Warning: package 'carData' was built under R version 4.2.3

## 
## Attaching package: 'car'
## 
## The following object is masked from 'package:dplyr':
## 
##     recode
## 
## The following object is masked from 'package:purrr':
## 
##     some
## 
## The following object is masked from 'package:psych':
## 
##     logit

## Warning: package 'knitr' was built under R version 4.2.3

## 
## Attaching package: 'reshape2'
## 
## The following objects are masked from 'package:data.table':
## 
##     dcast, melt
## 
## The following object is masked from 'package:tidyr':
## 
##     smiths

## Warning: package 'glmnet' was built under R version 4.2.3

## Loading required package: Matrix

## Warning: package 'Matrix' was built under R version 4.2.3

## 
## Attaching package: 'Matrix'
## 
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## 
## Loaded glmnet 4.1-7

## Warning: package 'xts' was built under R version 4.2.3

## Loading required package: zoo

## Warning: package 'zoo' was built under R version 4.2.3

## 
## Attaching package: 'zoo'
## 
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## 
## 
## ######################### Warning from 'xts' package ##########################
## #                                                                             #
## # The dplyr lag() function breaks how base R's lag() function is supposed to  #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or       #
## # source() into this session won't work correctly.                            #
## #                                                                             #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop           #
## # dplyr from breaking base R's lag() function.                                #
## #                                                                             #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning.  #
## #                                                                             #
## ###############################################################################

## Warning: package 'forecast' was built under R version 4.2.3

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Warning: package 'qqplotr' was built under R version 4.2.3

wine.training.data <-read.csv("C:/Users/91976/Downloads/wine-training-data.csv")

summary(wine.training.data)

##      INDEX           TARGET       FixedAcidity     VolatileAcidity  
##  Min.   :    1   Min.   :0.000   Min.   :-18.100   Min.   :-2.7900  
##  1st Qu.: 4038   1st Qu.:2.000   1st Qu.:  5.200   1st Qu.: 0.1300  
##  Median : 8110   Median :3.000   Median :  6.900   Median : 0.2800  
##  Mean   : 8070   Mean   :3.029   Mean   :  7.076   Mean   : 0.3241  
##  3rd Qu.:12106   3rd Qu.:4.000   3rd Qu.:  9.500   3rd Qu.: 0.6400  
##  Max.   :16129   Max.   :8.000   Max.   : 34.400   Max.   : 3.6800  
##                                                                     
##    CitricAcid      ResidualSugar        Chlorides       FreeSulfurDioxide
##  Min.   :-3.2400   Min.   :-127.800   Min.   :-1.1710   Min.   :-555.00  
##  1st Qu.: 0.0300   1st Qu.:  -2.000   1st Qu.:-0.0310   1st Qu.:   0.00  
##  Median : 0.3100   Median :   3.900   Median : 0.0460   Median :  30.00  
##  Mean   : 0.3084   Mean   :   5.419   Mean   : 0.0548   Mean   :  30.85  
##  3rd Qu.: 0.5800   3rd Qu.:  15.900   3rd Qu.: 0.1530   3rd Qu.:  70.00  
##  Max.   : 3.8600   Max.   : 141.150   Max.   : 1.3510   Max.   : 623.00  
##                    NA's   :616        NA's   :638       NA's   :647      
##  TotalSulfurDioxide    Density             pH          Sulphates      
##  Min.   :-823.0     Min.   :0.8881   Min.   :0.480   Min.   :-3.1300  
##  1st Qu.:  27.0     1st Qu.:0.9877   1st Qu.:2.960   1st Qu.: 0.2800  
##  Median : 123.0     Median :0.9945   Median :3.200   Median : 0.5000  
##  Mean   : 120.7     Mean   :0.9942   Mean   :3.208   Mean   : 0.5271  
##  3rd Qu.: 208.0     3rd Qu.:1.0005   3rd Qu.:3.470   3rd Qu.: 0.8600  
##  Max.   :1057.0     Max.   :1.0992   Max.   :6.130   Max.   : 4.2400  
##  NA's   :682                         NA's   :395     NA's   :1210     
##     Alcohol       LabelAppeal          AcidIndex          STARS      
##  Min.   :-4.70   Min.   :-2.000000   Min.   : 4.000   Min.   :1.000  
##  1st Qu.: 9.00   1st Qu.:-1.000000   1st Qu.: 7.000   1st Qu.:1.000  
##  Median :10.40   Median : 0.000000   Median : 8.000   Median :2.000  
##  Mean   :10.49   Mean   :-0.009066   Mean   : 7.773   Mean   :2.042  
##  3rd Qu.:12.40   3rd Qu.: 1.000000   3rd Qu.: 8.000   3rd Qu.:3.000  
##  Max.   :26.50   Max.   : 2.000000   Max.   :17.000   Max.   :4.000  
##  NA's   :653                                          NA's   :3359

Data Exploration

#Removing the index column
data_t_mod <- wine.training.data [,-1]

Summary Stats

library(stargazer)

stargazer(data_t_mod, type="text")

## 
## =============================================================
## Statistic            N     Mean   St. Dev.   Min       Max   
## -------------------------------------------------------------
## TARGET             12,795  3.029   1.926      0         8    
## FixedAcidity       12,795  7.076   6.318   -18.100   34.400  
## VolatileAcidity    12,795  0.324   0.784    -2.790    3.680  
## CitricAcid         12,795  0.308   0.862    -3.240    3.860  
## ResidualSugar      12,179  5.419   33.749  -127.800  141.150 
## Chlorides          12,157  0.055   0.318    -1.171    1.351  
## FreeSulfurDioxide  12,148 30.846  148.715  -555.000  623.000 
## TotalSulfurDioxide 12,113 120.714 231.913  -823.000 1,057.000
## Density            12,795  0.994   0.027    0.888     1.099  
## pH                 12,400  3.208   0.680    0.480     6.130  
## Sulphates          11,585  0.527   0.932    -3.130    4.240  
## Alcohol            12,142 10.489   3.728    -4.700   26.500  
## LabelAppeal        12,795 -0.009   0.891      -2        2    
## AcidIndex          12,795  7.773   1.324      4        17    
## STARS              9,436   2.042   0.903      1         4    
## -------------------------------------------------------------

Checking for class

sapply(X=data_t_mod,
       FUN= class)

##             TARGET       FixedAcidity    VolatileAcidity         CitricAcid 
##          "integer"          "numeric"          "numeric"          "numeric" 
##      ResidualSugar          Chlorides  FreeSulfurDioxide TotalSulfurDioxide 
##          "numeric"          "numeric"          "numeric"          "numeric" 
##            Density                 pH          Sulphates            Alcohol 
##          "numeric"          "numeric"          "numeric"          "numeric" 
##        LabelAppeal          AcidIndex              STARS 
##          "integer"          "integer"          "integer"

#Creating Boxplots

library(ggplot2)
library(magrittr)
library(moments)
library(qqplotr)

library(ggplot2)
library(moments)
library(gridExtra)

with(data_t_mod, c(summary(FixedAcidity), SD = sd(FixedAcidity), Skew = skewness(FixedAcidity), Kurt = kurtosis(FixedAcidity)))

##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
## -18.10000000   5.20000000   6.90000000   7.07571708   9.50000000  34.40000000 
##           SD         Skew         Kurt 
##   6.31764346  -0.02258861   4.67572951

hist <- ggplot(data_t_mod, aes(FixedAcidity)) +
  geom_histogram(fill = 'dodgerblue', binwidth = 4, color = 'darkgray') +
  theme_classic() +
  labs(title = 'Histogram of FixedAcidity') +
  theme(plot.title = element_text(hjust = 0.5))

qq_plot <- ggplot(data_t_mod, aes(sample = FixedAcidity)) +
  geom_qq(color = 'dodgerblue') +
  geom_qq_line(color = 'darkgray') +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles", title = "QQ Plot of FixedAcidity") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

box_plot <- ggplot(data_t_mod, aes(x = "", y = FixedAcidity)) +
  geom_boxplot(fill = 'dodgerblue', color = 'darkgray') +
  theme_classic() +
  labs(title = 'Boxplot of FixedAcidity', x = "") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()

box_TARGET <- ggplot(data_t_mod, aes(x = factor(TARGET), y = FixedAcidity)) +
  geom_boxplot(fill = 'dodgerblue', color = 'darkgray') +
  labs(x = 'TARGET', title = 'Boxplot of FixedAcidity by TARGET') +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol = 2)

#The box plot below shows that outliners exist in variables FixedAcidity, VolatileAcidity, CitricAcid, ResidualSugar, Chlorides, FreeSulfurDioxide, TotalSulfurDioxide, Density, pH, Sulphates, Alcohol, LabelAppeal andAcidIndex. We use scaled training set to draw the box plot to show the corresponding outliers by ratio

Histogram

library(ggpubr)

## Warning: package 'ggpubr' was built under R version 4.2.3

hist <- ggplot(data_t_mod, aes(FixedAcidity)) + geom_histogram(fill = 'orange', binwidth = 4, color = 'darkgray' ) + 
  theme_classic() + labs(title = 'Histogram of FixedAcidity') + theme(plot.title = element_text(hjust = 0.5))

Create a QQ plot with points and reference line

library(ggplot2)

data_t_mod <- data.frame(FixedAcidity = rnorm(100))
qq_plot <- ggplot(data_t_mod, aes(sample = FixedAcidity)) +
  geom_qq(color = 'orange') +
  geom_abline(color = 'darkgray') +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles", title = "QQ Plot of Fixed Acidity") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

print(qq_plot)

box_plot <- ggplot(data_t_mod, aes(x="", FixedAcidity)) + geom_boxplot(fill='orange', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of FixedAcidity', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

data_t_mod <- wine.training.data
head(data_t_mod[,-1])

##   TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1      3          3.2           1.160      -0.98          54.2    -0.567
## 2      3          4.5           0.160      -0.81          26.1    -0.425
## 3      5          7.1           2.640      -0.88          14.8     0.037
## 4      3          5.7           0.385       0.04          18.8    -0.425
## 5      4          8.0           0.330      -1.26           9.4        NA
## 6      0         11.3           0.320       0.59           2.2     0.556
##   FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                NA                268 0.99280 3.33     -0.59     9.9
## 2                15               -327 1.02792 3.38      0.70      NA
## 3               214                142 0.99518 3.12      0.48    22.0
## 4                22                115 0.99640 2.24      1.83     6.2
## 5              -167                108 0.99457 3.12      1.77    13.7
## 6               -37                 15 0.99940 3.20      1.29    15.4
##   LabelAppeal AcidIndex STARS
## 1           0         8     2
## 2          -1         7     3
## 3          -1         8     3
## 4          -1         6     1
## 5           0         9     2
## 6           0        11    NA

head(data_t_mod)

##   INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1     1      3          3.2           1.160      -0.98          54.2    -0.567
## 2     2      3          4.5           0.160      -0.81          26.1    -0.425
## 3     4      5          7.1           2.640      -0.88          14.8     0.037
## 4     5      3          5.7           0.385       0.04          18.8    -0.425
## 5     6      4          8.0           0.330      -1.26           9.4        NA
## 6     7      0         11.3           0.320       0.59           2.2     0.556
##   FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                NA                268 0.99280 3.33     -0.59     9.9
## 2                15               -327 1.02792 3.38      0.70      NA
## 3               214                142 0.99518 3.12      0.48    22.0
## 4                22                115 0.99640 2.24      1.83     6.2
## 5              -167                108 0.99457 3.12      1.77    13.7
## 6               -37                 15 0.99940 3.20      1.29    15.4
##   LabelAppeal AcidIndex STARS
## 1           0         8     2
## 2          -1         7     3
## 3          -1         8     3
## 4          -1         6     1
## 5           0         9     2
## 6           0        11    NA

box_TARGET <- ggplot(data_t_mod, aes(x=factor(TARGET), FixedAcidity)) + geom_boxplot(fill='orange', color='darkgrey') +
  labs(x='TARGET', title = 'Boxplot of FixedAcidity by TARGET') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

library(gridExtra)

grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)

Checking chloride content of wine

library(qqplotr)

with(data_t_mod, c(summary(Chlorides), SD=sd(Chlorides), Skew=skewness(Chlorides), Kurt=kurtosis(Chlorides)))

##         Min.      1st Qu.       Median         Mean      3rd Qu.         Max. 
##  -1.17100000  -0.03100000   0.04600000   0.05482249   0.15300000   1.35100000 
##         NA's           SD         Skew         Kurt 
## 638.00000000           NA           NA           NA

hist <- ggplot(data_t_mod, aes(Chlorides)) + geom_histogram(fill = 'maroon', binwidth = .2, color = 'darkgray' ) + 
  theme_classic() + labs(title = 'Histogram of Chlorides') + theme(plot.title = element_text(hjust = 0.5)) 

gg_plot <- ggplot(data_t_mod, aes(sample=Chlorides)) + stat_qq_point(color='maroon') + ggplot2::stat_qq_line(color='darkgray') +
  labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of Chlorides") + theme_classic() + 
  theme(plot.title = element_text(hjust = 0.5))

box_plot <- ggplot(data_t_mod, aes(x="", Chlorides)) + geom_boxplot(fill='maroon', color='darkgray')+ theme_classic() +
  labs(title = 'Boxplot of Chlorides', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()

box_TARGET <- ggplot(data_t_mod, aes(x=factor(TARGET), Chlorides)) + geom_boxplot(fill='maroon', color='darkgrey') +
  labs(x='TARGET', title = 'Boxplot of Chlorides by TARGET') + theme_classic() +
  theme(plot.title = element_text(hjust = 0.5)) 

grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)

## Warning: Removed 638 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 638 rows containing non-finite values (`stat_boxplot()`).
## Removed 638 rows containing non-finite values (`stat_boxplot()`).

##Checking Alcohol Content

library(ggplot2)
library(moments)


with(data_t_mod, c(summary(Alcohol), SD=sd(Alcohol), Skew=skewness(Alcohol), Kurt=kurtosis(Alcohol)))

##      Min.   1st Qu.    Median      Mean   3rd Qu.      Max.      NA's        SD 
##  -4.70000   9.00000  10.40000  10.48924  12.40000  26.50000 653.00000        NA 
##      Skew      Kurt 
##        NA        NA

hist <- ggplot(data_t_mod, aes(Alcohol)) + geom_histogram(fill = 'yellow', binwidth = 2, color = 'darkgray' ) + 
  theme_classic() + labs(title = 'Histogram of Alcohol') + theme(plot.title = element_text(hjust = 0.5)) 

qq_plot <- ggplot(data_t_mod, aes(sample = Alcohol)) +
  geom_qq(color = 'yellow') +
  geom_qq_line(color = 'darkgray') +
  labs(x = "Theoretical Quantiles", y = "Sample Quantiles", title = "QQ Plot of Alcohol") +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

box_plot <- ggplot(data_t_mod, aes(x = "", y = Alcohol)) +
  geom_boxplot(fill = 'yellow', color = 'darkgray') +
  theme_classic() +
  labs(title = 'Boxplot of Alcohol', x = "") +
  theme(plot.title = element_text(hjust = 0.5)) +
  coord_flip()


box_TARGET <- ggplot(data_t_mod, aes(x = factor(TARGET), y = Alcohol)) +
  geom_boxplot(fill = 'yellow', color = 'darkgray') +
  labs(x = 'TARGET', title = 'Boxplot of Alcohol by TARGET') +
  theme_classic() +
  theme(plot.title = element_text(hjust = 0.5))

grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol = 2)

## Warning: Removed 653 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 653 rows containing non-finite values (`stat_qq()`).

## Warning: Removed 653 rows containing non-finite values (`stat_qq_line()`).

## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).
## Removed 653 rows containing non-finite values (`stat_boxplot()`).

Creating density plot

data_t_mod %>%
  select_if(is.numeric) %>%
  keep(is.numeric) %>%                     # Keep only numeric columns
  gather() %>%                             # Convert to key-value pairs
  ggplot(aes(x=value)) +                   # Plot the values
  facet_wrap(~key, scales = "free") +    # In separate panels
  geom_density()

## Warning: Removed 8200 rows containing non-finite values (`stat_density()`).

#The scaled histogram and density plots show that variables AcidIndex is right skewed; AcidIndex, STARS, LabelAppeal and TARGET have multimodal distribution; while most others seem to be normally distrbuted due to the bell curve they display.

Checking for Correlation

drop_na(data_t_mod) %>%
  select_if(is.numeric) %>%
  cor() %>%
  #corrplot(method = "square", type = "upper", order = 'hclust', tl.col = "black", diag = FALSE, bg= 'white', col = colorRampPalette(c('deeppink4','white','steelblue1'))(100))
  corrplot.mixed(upper = 'pie', lower = 'number', order = 'hclust', tl.col = "black")

pairs.panels(data_t_mod[1:15])

PerformanceAnalytics::chart.Correlation(data_t_mod, histogram=TRUE, pch=19)

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

## Warning in par(usr): argument 1 does not name a graphical parameter

#The correlation matrix below shows that the response variable TARGET has strong positive relationship (>=0.6) with variables FixedAcidity,CitricAcid,ResidualSugar,Density,Alcohol.

Scatter plots against TARGET:

library(ggthemes)

## Warning: package 'ggthemes' was built under R version 4.2.3

data_t_mod %>%
  gather(-TARGET, key = "key", value = "ResponseVariables") %>%
  ggplot(aes(x = ResponseVariables, y = TARGET)) +
  geom_point(size = .5) +
  geom_smooth(method='lm',formula=y~x, color = 'dark grey')+
  facet_wrap(~ key, scales = "free")+
  ggthemes::theme_tufte()+
  ylab('Cases Bought')

## Warning: Removed 8200 rows containing non-finite values (`stat_smooth()`).

## Warning: Removed 8200 rows containing missing values (`geom_point()`).

#STARS and LableAppleal look like they have the greatest correlation.

Data Preparation

Checking for N.as

data_t_mod%>% 
  summarise(across(everything(),~ sum(is.na(.)))) %>%
  glimpse()

## Rows: 1
## Columns: 16
## $ INDEX              <int> 0
## $ TARGET             <int> 0
## $ FixedAcidity       <int> 0
## $ VolatileAcidity    <int> 0
## $ CitricAcid         <int> 0
## $ ResidualSugar      <int> 616
## $ Chlorides          <int> 638
## $ FreeSulfurDioxide  <int> 647
## $ TotalSulfurDioxide <int> 682
## $ Density            <int> 0
## $ pH                 <int> 395
## $ Sulphates          <int> 1210
## $ Alcohol            <int> 653
## $ LabelAppeal        <int> 0
## $ AcidIndex          <int> 0
## $ STARS              <int> 3359

library(naniar)

## Warning: package 'naniar' was built under R version 4.2.3

vis_miss(data_t_mod)

vis_miss(data_t_mod)

Splitting the data

require(mice)

## Loading required package: mice

## Warning: package 'mice' was built under R version 4.2.3

set.seed(999) 
sampl = caTools::sample.split(data_t_mod$TARGET, SplitRatio = .80)
wine_train1 <- subset(data_t_mod, sampl == TRUE)
wine_test1 <- subset(data_t_mod, sampl == FALSE)

wine_train2 <-  as.data.frame(tidyr::complete(mice(wine_train1, m=1, maxit = 5, seed = 42)))

## 
##  iter imp variable
##   1   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   2   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   3   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   4   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   5   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS

wine_test2 <- as.data.frame(tidyr::complete(mice(wine_test1, m=1, maxit = 5, seed = 42)))

## 
##  iter imp variable
##   1   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   2   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   3   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   4   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS
##   5   1  ResidualSugar  Chlorides  FreeSulfurDioxide  TotalSulfurDioxide  pH  Sulphates  Alcohol  STARS

#Given the low correlation between AcidIndex and TARGET it might not make a huge difference, however, we will log transform it to test.

wine_train2$AcidIndex <- log(wine_train2$AcidIndex)
wine_test2$AcidIndex <- log(wine_test2$AcidIndex)

Building Models

Poisson Model without imputations

require(ggplot2)
require(gridExtra)

wine_train1 <- wine_train1[,-1]

model1 = glm(TARGET ~  ., data=wine_train1, family=poisson)
summary(model1)

## 
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = wine_train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2128  -0.2757   0.0647   0.3766   1.6981  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.608e+00  2.796e-01   5.750 8.90e-09 ***
## FixedAcidity        6.705e-04  1.177e-03   0.570  0.56901    
## VolatileAcidity    -2.750e-02  9.283e-03  -2.963  0.00305 ** 
## CitricAcid         -3.835e-03  8.519e-03  -0.450  0.65259    
## ResidualSugar       1.828e-05  2.152e-04   0.085  0.93232    
## Chlorides          -3.764e-02  2.314e-02  -1.627  0.10377    
## FreeSulfurDioxide   5.671e-05  4.892e-05   1.159  0.24630    
## TotalSulfurDioxide  2.230e-05  3.177e-05   0.702  0.48274    
## Density            -4.025e-01  2.749e-01  -1.464  0.14326    
## pH                  2.307e-04  1.085e-02   0.021  0.98303    
## Sulphates          -5.984e-03  7.973e-03  -0.751  0.45293    
## Alcohol             3.262e-03  2.004e-03   1.628  0.10360    
## LabelAppeal         1.730e-01  8.858e-03  19.530  < 2e-16 ***
## AcidIndex          -4.967e-02  6.666e-03  -7.451 9.28e-14 ***
## STARS               1.929e-01  8.328e-03  23.160  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4720.5  on 5143  degrees of freedom
## Residual deviance: 3242.8  on 5129  degrees of freedom
##   (5093 observations deleted due to missingness)
## AIC: 18545
## 
## Number of Fisher Scoring iterations: 5

plot(model1)

grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)

## Warning: Removed 653 rows containing non-finite values (`stat_bin()`).

## Warning: Removed 653 rows containing non-finite values (`stat_qq()`).

## Warning: Removed 653 rows containing non-finite values (`stat_qq_line()`).

## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).
## Removed 653 rows containing non-finite values (`stat_boxplot()`).

Model 2: Poisson Model without imputations and only significant variables

model2 = glm(TARGET ~  .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol, data=wine_train1, family=poisson)
summary(model2)

## 
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar - 
##     Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density - 
##     pH - Sulphates - Alcohol, family = poisson, data = wine_train1)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1898  -0.2777   0.0622   0.3764   1.6086  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.251442   0.054724  22.868  < 2e-16 ***
## VolatileAcidity -0.027581   0.009278  -2.973  0.00295 ** 
## LabelAppeal      0.173177   0.008853  19.562  < 2e-16 ***
## AcidIndex       -0.050616   0.006553  -7.724 1.13e-14 ***
## STARS            0.194208   0.008292  23.421  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 4720.5  on 5143  degrees of freedom
## Residual deviance: 3253.1  on 5139  degrees of freedom
##   (5093 observations deleted due to missingness)
## AIC: 18535
## 
## Number of Fisher Scoring iterations: 5

plot(model2)

Model 3: Poisson Model with imputations

model3 = glm(TARGET ~  ., data=wine_train2, family=poisson)
summary(model3)

## 
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = wine_train2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8881  -0.6747   0.1302   0.6374   2.4015  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         2.248e+00  2.287e-01   9.830  < 2e-16 ***
## INDEX               8.312e-07  1.220e-06   0.681 0.495737    
## FixedAcidity        8.840e-05  9.192e-04   0.096 0.923380    
## VolatileAcidity    -4.304e-02  7.279e-03  -5.912 3.37e-09 ***
## CitricAcid          8.337e-03  6.572e-03   1.269 0.204606    
## ResidualSugar       1.243e-04  1.675e-04   0.742 0.458154    
## Chlorides          -6.422e-02  1.789e-02  -3.590 0.000330 ***
## FreeSulfurDioxide   1.339e-04  3.809e-05   3.515 0.000439 ***
## TotalSulfurDioxide  8.710e-05  2.460e-05   3.541 0.000398 ***
## Density            -2.978e-01  2.144e-01  -1.389 0.164915    
## pH                 -1.902e-02  8.413e-03  -2.261 0.023774 *  
## Sulphates          -1.234e-02  6.173e-03  -1.999 0.045630 *  
## Alcohol             2.379e-03  1.555e-03   1.530 0.126110    
## LabelAppeal         1.436e-01  6.770e-03  21.217  < 2e-16 ***
## AcidIndex          -7.520e-01  4.001e-02 -18.798  < 2e-16 ***
## STARS               3.430e-01  6.248e-03  54.900  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18291  on 10236  degrees of freedom
## Residual deviance: 12778  on 10221  degrees of freedom
## AIC: 38368
## 
## Number of Fisher Scoring iterations: 5

plot(model3)

Model 4: Poisson Model with imputations and only significant variables

model4 = glm(TARGET ~  .-FixedAcidity-CitricAcid-ResidualSugar-Density-Alcohol, data=wine_train2, family=poisson)

summary(model4)

## 
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar - 
##     Density - Alcohol, family = poisson, data = wine_train2)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8904  -0.6794   0.1272   0.6380   2.3916  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.984e+00  8.912e-02  22.257  < 2e-16 ***
## INDEX               8.686e-07  1.220e-06   0.712 0.476446    
## VolatileAcidity    -4.328e-02  7.278e-03  -5.946 2.74e-09 ***
## Chlorides          -6.556e-02  1.788e-02  -3.667 0.000245 ***
## FreeSulfurDioxide   1.320e-04  3.807e-05   3.468 0.000525 ***
## TotalSulfurDioxide  8.613e-05  2.457e-05   3.505 0.000457 ***
## pH                 -1.929e-02  8.410e-03  -2.293 0.021840 *  
## Sulphates          -1.227e-02  6.168e-03  -1.989 0.046736 *  
## LabelAppeal         1.437e-01  6.771e-03  21.216  < 2e-16 ***
## AcidIndex          -7.540e-01  3.936e-02 -19.154  < 2e-16 ***
## STARS               3.440e-01  6.231e-03  55.206  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 18291  on 10236  degrees of freedom
## Residual deviance: 12785  on 10226  degrees of freedom
## AIC: 38365
## 
## Number of Fisher Scoring iterations: 5

plot(model4)

Model II: Negative Binomial

Model 5: Negative Binomial without imputations

library(MASS)

## Warning: package 'MASS' was built under R version 4.2.3

model5 <- glm.nb(TARGET ~ ., data = wine_train1)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(model5)

## 
## Call:
## glm.nb(formula = TARGET ~ ., data = wine_train1, init.theta = 138898.9107, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2127  -0.2757   0.0647   0.3766   1.6981  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.608e+00  2.796e-01   5.750 8.91e-09 ***
## FixedAcidity        6.705e-04  1.177e-03   0.570  0.56900    
## VolatileAcidity    -2.750e-02  9.283e-03  -2.963  0.00305 ** 
## CitricAcid         -3.835e-03  8.519e-03  -0.450  0.65259    
## ResidualSugar       1.828e-05  2.152e-04   0.085  0.93231    
## Chlorides          -3.764e-02  2.314e-02  -1.627  0.10378    
## FreeSulfurDioxide   5.671e-05  4.892e-05   1.159  0.24630    
## TotalSulfurDioxide  2.230e-05  3.177e-05   0.702  0.48275    
## Density            -4.025e-01  2.750e-01  -1.464  0.14326    
## pH                  2.307e-04  1.085e-02   0.021  0.98303    
## Sulphates          -5.984e-03  7.973e-03  -0.751  0.45293    
## Alcohol             3.262e-03  2.004e-03   1.628  0.10360    
## LabelAppeal         1.730e-01  8.858e-03  19.529  < 2e-16 ***
## AcidIndex          -4.967e-02  6.666e-03  -7.451 9.28e-14 ***
## STARS               1.929e-01  8.328e-03  23.160  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(138898.9) family taken to be 1)
## 
##     Null deviance: 4720.4  on 5143  degrees of freedom
## Residual deviance: 3242.7  on 5129  degrees of freedom
##   (5093 observations deleted due to missingness)
## AIC: 18547
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  138899 
##           Std. Err.:  259921 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -18515.07

plot(model5)

Model 6 : Negative Binomial without imputations and only significant variables

library(MASS)


model6 <- glm.nb(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol, data = wine_train1)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(model6)

## 
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar - 
##     Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density - 
##     pH - Sulphates - Alcohol, data = wine_train1, init.theta = 138402.5261, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.1898  -0.2777   0.0622   0.3764   1.6086  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)      1.251443   0.054725  22.868  < 2e-16 ***
## VolatileAcidity -0.027581   0.009279  -2.973  0.00295 ** 
## LabelAppeal      0.173177   0.008853  19.562  < 2e-16 ***
## AcidIndex       -0.050616   0.006553  -7.724 1.13e-14 ***
## STARS            0.194209   0.008292  23.421  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(138402.5) family taken to be 1)
## 
##     Null deviance: 4720.4  on 5143  degrees of freedom
## Residual deviance: 3253.0  on 5139  degrees of freedom
##   (5093 observations deleted due to missingness)
## AIC: 18537
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  138403 
##           Std. Err.:  258834 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -18525.37

plot(model6)

## Model 7 : Negative Binomial with imputations

model7 <- glm.nb(TARGET ~ ., data = wine_train2)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(model7)

## 
## Call:
## glm.nb(formula = TARGET ~ ., data = wine_train2, init.theta = 49133.51129, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8880  -0.6747   0.1302   0.6374   2.4015  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         2.248e+00  2.287e-01   9.830  < 2e-16 ***
## INDEX               8.311e-07  1.220e-06   0.681 0.495762    
## FixedAcidity        8.842e-05  9.192e-04   0.096 0.923369    
## VolatileAcidity    -4.304e-02  7.280e-03  -5.912 3.38e-09 ***
## CitricAcid          8.337e-03  6.572e-03   1.269 0.204616    
## ResidualSugar       1.243e-04  1.676e-04   0.742 0.458148    
## Chlorides          -6.422e-02  1.789e-02  -3.590 0.000330 ***
## FreeSulfurDioxide   1.339e-04  3.809e-05   3.515 0.000440 ***
## TotalSulfurDioxide  8.710e-05  2.460e-05   3.541 0.000399 ***
## Density            -2.978e-01  2.144e-01  -1.389 0.164928    
## pH                 -1.902e-02  8.413e-03  -2.261 0.023775 *  
## Sulphates          -1.234e-02  6.173e-03  -1.999 0.045628 *  
## Alcohol             2.379e-03  1.555e-03   1.530 0.126130    
## LabelAppeal         1.436e-01  6.770e-03  21.216  < 2e-16 ***
## AcidIndex          -7.521e-01  4.001e-02 -18.798  < 2e-16 ***
## STARS               3.430e-01  6.248e-03  54.899  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(49133.51) family taken to be 1)
## 
##     Null deviance: 18290  on 10236  degrees of freedom
## Residual deviance: 12778  on 10221  degrees of freedom
## AIC: 38371
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  49134 
##           Std. Err.:  63329 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -38336.61

plot(model7)

Model 8 : Negative Binomial with imputations and only significant variables

model8 <- glm.nb(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Density-Alcohol, data = wine_train2)

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

summary(model8)

## 
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar - 
##     Density - Alcohol, data = wine_train2, init.theta = 49062.73041, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8903  -0.6794   0.1272   0.6380   2.3916  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.984e+00  8.913e-02  22.256  < 2e-16 ***
## INDEX               8.685e-07  1.220e-06   0.712 0.476472    
## VolatileAcidity    -4.328e-02  7.278e-03  -5.946 2.74e-09 ***
## Chlorides          -6.557e-02  1.788e-02  -3.667 0.000245 ***
## FreeSulfurDioxide   1.320e-04  3.807e-05   3.468 0.000525 ***
## TotalSulfurDioxide  8.613e-05  2.457e-05   3.505 0.000457 ***
## pH                 -1.929e-02  8.410e-03  -2.293 0.021841 *  
## Sulphates          -1.227e-02  6.168e-03  -1.989 0.046734 *  
## LabelAppeal         1.437e-01  6.771e-03  21.216  < 2e-16 ***
## AcidIndex          -7.540e-01  3.937e-02 -19.154  < 2e-16 ***
## STARS               3.440e-01  6.231e-03  55.205  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(49062.73) family taken to be 1)
## 
##     Null deviance: 18290  on 10236  degrees of freedom
## Residual deviance: 12784  on 10226  degrees of freedom
## AIC: 38367
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  49063 
##           Std. Err.:  63266 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -38343.11

plot(model8)

Model III: Linear Model

Model 9 : Linear Model with imputations

model9 <- lm(TARGET ~ ., data = wine_train2)

summary(model9)

## 
## Call:
## lm(formula = TARGET ~ ., data = wine_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2761 -1.0236  0.1638  1.0311  4.2527 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.704e+00  5.559e-01  10.261  < 2e-16 ***
## INDEX               1.755e-06  2.999e-06   0.585 0.558452    
## FixedAcidity        6.125e-04  2.244e-03   0.273 0.784855    
## VolatileAcidity    -1.236e-01  1.784e-02  -6.929 4.51e-12 ***
## CitricAcid          2.473e-02  1.623e-02   1.524 0.127555    
## ResidualSugar       3.463e-04  4.116e-04   0.841 0.400181    
## Chlorides          -2.005e-01  4.387e-02  -4.569 4.95e-06 ***
## FreeSulfurDioxide   3.669e-04  9.364e-05   3.919 8.97e-05 ***
## TotalSulfurDioxide  2.260e-04  5.996e-05   3.770 0.000164 ***
## Density            -7.213e-01  5.239e-01  -1.377 0.168644    
## pH                 -4.485e-02  2.060e-02  -2.177 0.029498 *  
## Sulphates          -3.526e-02  1.515e-02  -2.326 0.020013 *  
## Alcohol             1.054e-02  3.784e-03   2.785 0.005359 ** 
## LabelAppeal         4.400e-01  1.634e-02  26.932  < 2e-16 ***
## AcidIndex          -2.016e+00  9.211e-02 -21.887  < 2e-16 ***
## STARS               1.174e+00  1.658e-02  70.801  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.41 on 10221 degrees of freedom
## Multiple R-squared:  0.465,  Adjusted R-squared:  0.4642 
## F-statistic: 592.2 on 15 and 10221 DF,  p-value: < 2.2e-16

plot(model9)

Model 10 : Linear Model with imputations and only significant variables

#As we know the significant variables are FixedAcidity, CitricAcid and ResidualSugar, so using the same in the Linear Regression Model and applying the same of imputed training data.

model10 <- lm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar, data = wine_train2)

summary(model10)

## 
## Call:
## lm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar, 
##     data = wine_train2)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.2776 -1.0217  0.1651  1.0288  4.2686 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         5.688e+00  5.557e-01  10.236  < 2e-16 ***
## INDEX               1.780e-06  2.999e-06   0.593 0.552956    
## VolatileAcidity    -1.243e-01  1.783e-02  -6.971 3.33e-12 ***
## Chlorides          -2.018e-01  4.387e-02  -4.601 4.26e-06 ***
## FreeSulfurDioxide   3.686e-04  9.363e-05   3.937 8.30e-05 ***
## TotalSulfurDioxide  2.279e-04  5.994e-05   3.802 0.000144 ***
## Density            -7.229e-01  5.239e-01  -1.380 0.167688    
## pH                 -4.481e-02  2.060e-02  -2.175 0.029623 *  
## Sulphates          -3.574e-02  1.515e-02  -2.360 0.018311 *  
## Alcohol             1.057e-02  3.782e-03   2.796 0.005188 ** 
## LabelAppeal         4.399e-01  1.634e-02  26.929  < 2e-16 ***
## AcidIndex          -2.001e+00  9.047e-02 -22.122  < 2e-16 ***
## STARS               1.175e+00  1.658e-02  70.852  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.41 on 10224 degrees of freedom
## Multiple R-squared:  0.4648, Adjusted R-squared:  0.4642 
## F-statistic:   740 on 12 and 10224 DF,  p-value: < 2.2e-16

plot(model10)

Model 11 : Ordinal Logistic Regression

polrDF <- wine_train2
polrDF$TARGET <- as.factor(polrDF$TARGET)
model11 <- polr(TARGET ~ ., data = polrDF, Hess=TRUE)
summary(model11)

## Warning in sqrt(diag(vc)): NaNs produced

## Call:
## polr(formula = TARGET ~ ., data = polrDF, Hess = TRUE)
## 
## Coefficients:
##                         Value Std. Error   t value
## INDEX               2.267e-06        NaN       NaN
## FixedAcidity        1.573e-03  2.836e-03    0.5547
## VolatileAcidity    -1.527e-01  2.322e-02   -6.5747
## CitricAcid          2.440e-02  2.101e-02    1.1615
## ResidualSugar       2.614e-04  5.322e-04    0.4911
## Chlorides          -2.526e-01  6.045e-04 -417.7833
## FreeSulfurDioxide   4.674e-04  1.215e-04    3.8460
## TotalSulfurDioxide  2.348e-04  7.795e-05    3.0128
## Density            -1.147e+00  1.954e-03 -587.2133
## pH                 -2.790e-02  1.333e-02   -2.0925
## Sulphates          -2.635e-02  1.977e-02   -1.3330
## Alcohol             2.396e-02  4.571e-03    5.2406
## LabelAppeal         8.398e-01  2.338e-02   35.9220
## AcidIndex          -2.590e+00  4.202e-03 -616.3190
## STARS               1.474e+00  2.209e-02   66.7088
## 
## Intercepts:
##     Value      Std. Error t value   
## 0|1    -5.6148     0.0020 -2868.0237
## 1|2    -5.4771     0.0020 -2762.0888
## 2|3    -4.8713     0.0051  -958.1218
## 3|4    -3.5027     0.0263  -133.4228
## 4|5    -1.6534     0.0391   -42.2503
## 5|6     0.3167     0.0560     5.6557
## 6|7     2.5198     0.0558    45.1705
## 7|8     4.8597     0.0558    87.1480
## 
## Residual Deviance: 29979.83 
## AIC: 30025.83

Model 12 : Zero inflation

#Zero inflation understands that some Poisson distrobutions are dominated by many zeros. As such it corrects for this. This is one of the most promissing ones because as we saw in our data exploration, there were more zeros, and then normally distributed data after that.

options(repos = "http://cran.rstudio.com/") 


library(pscl)

## Warning: package 'pscl' was built under R version 4.2.3

## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

wine_train2 <- wine_train2 [,-1]

model12 <- zeroinfl(TARGET ~ . | STARS, data = wine_train2, dist = 'negbin')
summary(model12)

## 
## Call:
## zeroinfl(formula = TARGET ~ . | STARS, data = wine_train2, dist = "negbin")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.14654 -0.50114  0.06471  0.48537  2.09398 
## 
## Count model coefficients (negbin with log link):
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.770e+00  2.383e-01   7.427 1.11e-13 ***
## FixedAcidity        2.527e-04  9.455e-04   0.267 0.789258    
## VolatileAcidity    -1.858e-02  7.550e-03  -2.461 0.013839 *  
## CitricAcid          2.031e-03  6.710e-03   0.303 0.762105    
## ResidualSugar      -1.932e-05  1.720e-04  -0.112 0.910584    
## Chlorides          -3.326e-02  1.855e-02  -1.793 0.073026 .  
## FreeSulfurDioxide   4.256e-05  3.864e-05   1.102 0.270632    
## TotalSulfurDioxide -2.630e-06  2.464e-05  -0.107 0.914998    
## Density            -2.677e-01  2.220e-01  -1.206 0.227917    
## pH                  5.240e-04  8.735e-03   0.060 0.952162    
## Sulphates          -3.152e-03  6.400e-03  -0.492 0.622431    
## Alcohol             6.047e-03  1.589e-03   3.805 0.000142 ***
## LabelAppeal         2.258e-01  7.069e-03  31.937  < 2e-16 ***
## AcidIndex          -2.602e-01  4.478e-02  -5.810 6.24e-09 ***
## STARS               1.221e-01  6.981e-03  17.488  < 2e-16 ***
## Log(theta)          1.981e+01  1.382e+00  14.329  < 2e-16 ***
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   2.5836     0.1248   20.70   <2e-16 ***
## STARS        -2.9073     0.1126  -25.82   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1 
## 
## Theta = 400975735.8308 
## Number of iterations in BFGS optimization: 24 
## Log-likelihood: -1.717e+04 on 18 Df

scatterPreds <- predict(model12, wine_train2)
qplot(wine_train2$TARGET, scatterPreds, main = 'Predicted vs Actual') + ggthemes::theme_tufte()

## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

residPlot <- scatterPreds - wine_train2$TARGET
qplot(wine_train2$TARGET, residPlot, main = 'Residuals') + ggthemes::theme_tufte()

Selecting Models

aic1 <- model1$aic
aic2 <- model2$aic
aic3 <- model3$aic
aic4 <- model4$aic
aic5 <- model5$aic
aic6 <- model6$aic
aic7 <- model7$aic
aic8 <- model8$aic
aic9 <- model9$aic
aic10 <- model10$aic
aic11 <- model11$aic
aic12 <- model12$aic
mse1 <- mean((wine_train2$TARGET - predict(model1))^2)

## Warning in wine_train2$TARGET - predict(model1): longer object length is not a
## multiple of shorter object length

mse2 <- mean((wine_train2$TARGET - predict(model2))^2)

## Warning in wine_train2$TARGET - predict(model2): longer object length is not a
## multiple of shorter object length

mse3 <- mean((wine_train2$TARGET - predict(model3))^2)
mse4 <- mean((wine_train2$TARGET - predict(model4))^2)
mse5 <- mean((wine_train2$TARGET - predict(model5))^2)

## Warning in wine_train2$TARGET - predict(model5): longer object length is not a
## multiple of shorter object length

mse6 <- mean((wine_train2$TARGET - predict(model6))^2)

## Warning in wine_train2$TARGET - predict(model6): longer object length is not a
## multiple of shorter object length

mse7 <- mean((wine_train2$TARGET - predict(model7))^2)
mse8 <- mean((wine_train2$TARGET - predict(model8))^2)
mse9 <- mean((wine_train2$TARGET - predict(model9))^2)
mse10 <- mean((wine_train2$TARGET - predict(model10))^2)
mse11 <- mean((wine_train2$TARGET - predict(model11))^2)

## Warning in Ops.factor(wine_train2$TARGET, predict(model11)): '-' not meaningful
## for factors

mse12 <- mean((wine_train2$TARGET - predict(model12))^2)


compare_aic_mse <- matrix(c(mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, mse10, mse11, mse12,
                            aic1, aic2, aic3, aic4, aic5, aic6, aic7, aic8, aic9, aic10, aic11, aic12),nrow=12,ncol=2,byrow=TRUE)

## Warning in matrix(c(mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, :
## data length [20] is not a sub-multiple or multiple of the number of rows [12]

rownames(compare_aic_mse) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7","Model8","Model9","Model10","Model11","Model12")

colnames(compare_aic_mse) <- c("MSE","AIC")

library(kableExtra)

## Warning: package 'kableExtra' was built under R version 4.2.3

kable(compare_aic_mse)  %>% 
  kable_styling(full_width = T)

	MSE	AIC
Model1	6.929787	6.926722
Model2	6.843881	6.844564
Model3	6.929788	6.926723
Model4	6.843877	6.844560
Model5	1.985748	1.986345
Model6	NA	1.952898
Model7	18544.983192	18535.285631
Model8	38368.436016	38364.927425
Model9	18547.067808	18537.370350
Model10	38370.614365	38367.105614
Model11	6.929787	6.926722
Model12	6.843881	6.844564

Comparing Models

modelValidation <- function(mod){
  preds = predict(mod, wine_test2)
  diffMat = as.numeric(preds) - as.numeric(wine_test2$TARGET)
  diffMat = diffMat^2
  loss <- mean(diffMat)
  return(loss)
}


compare_models <- matrix(c(modelValidation(model1),modelValidation(model2),modelValidation(model3),modelValidation(model4),modelValidation(model5),modelValidation(model6),
                           modelValidation(model7),modelValidation(model8),modelValidation(model9),modelValidation(model10),modelValidation(model11),modelValidation(model12)),
                         nrow=12,ncol=1,byrow=TRUE)

rownames(compare_models) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7","Model8","Model9","Model10","Model11","Model12")

colnames(compare_models) <- c("Loss:")

compare_models <- as.data.frame(compare_models)

compare_models

##            Loss:
## Model1  5.477096
## Model2  5.461739
## Model3  6.839470
## Model4  6.840874
## Model5  5.477090
## Model6  5.461734
## Model7  6.839466
## Model8  6.840870
## Model9  2.043590
## Model10 2.043377
## Model11 3.680219
## Model12 2.034109

#A regular Poisson regression does not perform very well. #The same can be said for the Negative Binomial. #The linear model actually performs very well. #Very surprisinly, Ordinal Logistic Regression does not work as well as the linear model.

#Using sqaured loss to check accuracy of the model

#Based on this metric, Zero Poission Inflation is the most accurate. #From the above models, Model12 - Zero Poission Inflations has least loss as it uses less variables and is parsimonious. Also the R2 looks fine. The squared loss is also fine

#In terms of MSE and AIC, Model 5 is best followed by Model 4 and tied Model 2 and Model 12. #In terms of Loss, Model 12 is best followed closely by Model 9 and Model 10.

#Overall, we choose Zero Poission Inflation due to following : - least loss - good MSE score - good AIC score

Prediction

require(mice)
library(dplyr)
library(moments)             # Load the package


data_e_mod <- read.csv("C:/Users/91976/Downloads/wine-evaluation-data.csv")

wine_eval <- as.data.frame(complete(data_e_mod))
wine_eval$AcidIndex <- log(wine_eval$AcidIndex)
wine_eval$TARGET <- predict(model12, newdata=wine_eval)
write.csv(wine_eval,"Evaluation_Full_Data.csv", row.names=FALSE)

data_predicted_eval <- read.csv("Evaluation_Full_Data.csv")

data_predicted_eval <- na.omit(data_predicted_eval)  # Remove rows with missing values

DT::datatable(data_predicted_eval)

options(width=100)
round(with(data_predicted_eval, c(summary(TARGET), StdD=sd(TARGET), Skew=skewness(TARGET), Kurt=kurtosis(TARGET))),2)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    StdD    Skew    Kurt 
##    1.06    1.98    3.40    3.35    4.29    8.25    1.40    0.44    2.64

Assignment 3

Anuranjan Saxena

2023-05-17