R Markdown

Overview

A large wine manufacturer is studying the data in order to predict the number of wine cases ordered based upon the wine characteristics. If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust their wine offering to maximize sales.

Your objective is to build a count regression model to predict the number of cases of wine that will be sold given certain properties of the wine.

knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
library(dplyr)
library(reshape)
library(ggplot2)
library(purrr)
library(psych)
library(tidyr)
library(MASS)
library(rpart.plot)
library(gridExtra)
library(forecast)
library(fpp2)
library(fma)
library(kableExtra)
library(e1071)
library(mlbench)
library(DataExplorer)
library(timeDate)
library(caret)
library(GGally)
library(corrplot)
library(RColorBrewer)
library(tibble)
library(tidyr)
library(tidyverse)
library(dplyr)
library(reshape2)
library(mixtools)
library(tidymodels)
library(ggpmisc)
library(regclass)
library(skimr)

Load Data

Load the wine training data

# load data
data <- read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_5/wine-training-data.csv")
head(data,2)
##   ï..INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1        1      3          3.2            1.16      -0.98          54.2
## 2        2      3          4.5            0.16      -0.81          26.1
##   Chlorides FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1    -0.567                NA                268 0.99280 3.33     -0.59     9.9
## 2    -0.425                15               -327 1.02792 3.38      0.70      NA
##   LabelAppeal AcidIndex STARS
## 1           0         8     2
## 2          -1         7     3

Load the wine evaluation data {{r setup, include=TRUE, cache = FALSE}} # load data eval_data <- read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_5/wine-evaluation-data.csv") head(eval_data,2)

Describe the Data

instances = dim(data)[1]
features = dim(data)[2]
cat('Dataset for this assignment is', instances, 'instances and', features, 'features')
## Dataset for this assignment is 12795 instances and 16 features

Missing Values Count

map(data, ~sum(is.na(.))) %>% t()
##      ï..INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## [1,] 0        0      0            0               0          616          
##      Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH  Sulphates
## [1,] 638       647               682                0       395 1210     
##      Alcohol LabelAppeal AcidIndex STARS
## [1,] 653     0           0         3359

Describe data

describe(data)
##                    vars     n    mean      sd  median trimmed     mad     min
## ï..INDEX              1 12795 8069.98 4656.91 8110.00 8071.03 5977.84    1.00
## TARGET                2 12795    3.03    1.93    3.00    3.05    1.48    0.00
## FixedAcidity          3 12795    7.08    6.32    6.90    7.07    3.26  -18.10
## VolatileAcidity       4 12795    0.32    0.78    0.28    0.32    0.43   -2.79
## CitricAcid            5 12795    0.31    0.86    0.31    0.31    0.42   -3.24
## ResidualSugar         6 12179    5.42   33.75    3.90    5.58   15.72 -127.80
## Chlorides             7 12157    0.05    0.32    0.05    0.05    0.13   -1.17
## FreeSulfurDioxide     8 12148   30.85  148.71   30.00   30.93   56.34 -555.00
## TotalSulfurDioxide    9 12113  120.71  231.91  123.00  120.89  134.92 -823.00
## Density              10 12795    0.99    0.03    0.99    0.99    0.01    0.89
## pH                   11 12400    3.21    0.68    3.20    3.21    0.39    0.48
## Sulphates            12 11585    0.53    0.93    0.50    0.53    0.44   -3.13
## Alcohol              13 12142   10.49    3.73   10.40   10.50    2.37   -4.70
## LabelAppeal          14 12795   -0.01    0.89    0.00   -0.01    1.48   -2.00
## AcidIndex            15 12795    7.77    1.32    8.00    7.64    1.48    4.00
## STARS                16  9436    2.04    0.90    2.00    1.97    1.48    1.00
##                         max    range  skew kurtosis    se
## ï..INDEX           16129.00 16128.00  0.00    -1.20 41.17
## TARGET                 8.00     8.00 -0.33    -0.88  0.02
## FixedAcidity          34.40    52.50 -0.02     1.67  0.06
## VolatileAcidity        3.68     6.47  0.02     1.83  0.01
## CitricAcid             3.86     7.10 -0.05     1.84  0.01
## ResidualSugar        141.15   268.95 -0.05     1.88  0.31
## Chlorides              1.35     2.52  0.03     1.79  0.00
## FreeSulfurDioxide    623.00  1178.00  0.01     1.84  1.35
## TotalSulfurDioxide  1057.00  1880.00 -0.01     1.67  2.11
## Density                1.10     0.21 -0.02     1.90  0.00
## pH                     6.13     5.65  0.04     1.65  0.01
## Sulphates              4.24     7.37  0.01     1.75  0.01
## Alcohol               26.50    31.20 -0.03     1.54  0.03
## LabelAppeal            2.00     4.00  0.01    -0.26  0.01
## AcidIndex             17.00    13.00  1.65     5.19  0.01
## STARS                  4.00     3.00  0.45    -0.69  0.01

Boxplot the data

meltData <- melt(data)
boxplot(data=meltData, value~variable)

p <- ggplot(meltData, aes(factor(variable), value)) 
p + geom_boxplot() + facet_wrap(~variable, scale="free")

Data correlations

res_vector  <- vector()
colnames_vector <- colnames(data)
for(i in 1:ncol(data)) {
  
  res <- cor.test(data[ , i], data$TARGET, 
                    method = "pearson")
  
  res_round = round(res$estimate,4)
  
  res_vector <- c(res_vector, res_round)
}
corrdf <- data.frame(colnames_vector, res_vector)
corrdf_sort <- corrdf %>% arrange(desc(res_vector))
#Remove TARGET from correlation df
corrdf_sort = corrdf_sort[-c(1),]
corrdf_sort
##       colnames_vector res_vector
## 2               STARS     0.5588
## 3         LabelAppeal     0.3565
## 4             Alcohol     0.0621
## 5  TotalSulfurDioxide     0.0515
## 6   FreeSulfurDioxide     0.0438
## 7       ResidualSugar     0.0165
## 8          CitricAcid     0.0087
## 9            ï..INDEX     0.0013
## 10                 pH    -0.0094
## 11            Density    -0.0355
## 12          Chlorides    -0.0383
## 13          Sulphates    -0.0388
## 14       FixedAcidity    -0.0490
## 15    VolatileAcidity    -0.0888
## 16          AcidIndex    -0.2460
#Plot the correlations
ggplot(corrdf_sort, aes(x=reorder(colnames_vector,res_vector), y=res_vector)) +
  geom_bar(stat="identity") +
  theme(axis.text.x = element_text(angle = 90, size = 10))

Recommendations

  1. Address Missing Values: The features below have missing values. Recommendation to address this for each feature is…
  1. Residual Sugar (616): replace missing values with mean because normal dist, corr w/target is 0, & most target values have a similar mean.
  2. Chlorides (638): same as Residual Sugar
  3. Free Sulfur Dioxide (647): same as Residual Sugar
  4. Total Sulfur Dioxide (682): same as Residual Sugar (considered using median, but mean is 120 and median is 123, so not much difference)
  5. PH (395): same as Residual Sugar
  6. Sulphates (1210): same as Residual Sugar
  7. Alcohol (653): replace missing values with mean. Correlation is higher on this feature at around 12%
  8. Stars (3359): values are 1 to 4. replace missing values with a 0 to indicate no ranking provided.

  1. Apply Scalar: Max values are wide ranging across the features. For example, “Volatile Acidity’s” max value is 4 and “SulfurDioxide’s” max value is 1038. Recommend applying StandardScalar prior to modeling.
#Fill in missing values in training data
data_mv <- COPY<-data.frame(data)
data_mv = data_mv %>% replace_na(list(STARS = 0))
for(i in 1:ncol(data_mv)){
  data_mv[is.na(data_mv[,i]), i] <- mean(data_mv[,i], na.rm = TRUE)
}
#Fill in missing values in evaluation data
eval_data<-read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_5/wine-evaluation-data.csv")
eval_data_mv <- COPY<-data.frame(eval_data)
eval_data_mv = eval_data_mv %>% replace_na(list(STARS = 0))
for(i in 1:ncol(eval_data_mv)){
  eval_data_mv[is.na(eval_data_mv[,i]), i] <- mean(eval_data_mv[,i], na.rm = TRUE)
}
#Apply scalarization to training and evaluation data
data_mv_scaled = scale(data_mv)
eval_data_mv_scaled = scale(eval_data_mv, center=attr(data_mv_scaled, "scaled:center"), 
                              scale=attr(data_mv_scaled, "scaled:scale"))

Output training and evaluation csv’s with missing values replaced, and csv’s that are scaled

#Training
write.csv(data_mv, "wine-training-data-mv.csv")
write.csv(data_mv_scaled, "wine-training-data-mv-scaled.csv")
#Evaluation
write.csv(eval_data_mv, "wine-evaluation-data-mv.csv")
write.csv(eval_data_mv_scaled, "wine-evaluation-data-mv-scaled.csv")

Poisson model with data_mv

model1 = glm(TARGET ~  ., data=data_mv, family=poisson)
summary(model1)
## 
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = data_mv)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9721  -0.7208   0.0698   0.5794   3.2293  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.522e+00  1.958e-01   7.773 7.67e-15 ***
## ï..INDEX            4.027e-07  1.093e-06   0.368 0.712530    
## FixedAcidity       -3.053e-04  8.204e-04  -0.372 0.709791    
## VolatileAcidity    -3.342e-02  6.516e-03  -5.129 2.91e-07 ***
## CitricAcid          7.754e-03  5.892e-03   1.316 0.188210    
## ResidualSugar       5.654e-05  1.546e-04   0.366 0.714620    
## Chlorides          -4.143e-02  1.645e-02  -2.519 0.011779 *  
## FreeSulfurDioxide   1.253e-04  3.512e-05   3.567 0.000361 ***
## TotalSulfurDioxide  8.292e-05  2.275e-05   3.645 0.000268 ***
## Density            -2.816e-01  1.920e-01  -1.467 0.142448    
## pH                 -1.571e-02  7.638e-03  -2.057 0.039701 *  
## Sulphates          -1.267e-02  5.749e-03  -2.203 0.027584 *  
## Alcohol             2.200e-03  1.410e-03   1.561 0.118607    
## LabelAppeal         1.332e-01  6.064e-03  21.958  < 2e-16 ***
## AcidIndex          -8.705e-02  4.548e-03 -19.138  < 2e-16 ***
## STARS               3.113e-01  4.532e-03  68.696  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 14728  on 12779  degrees of freedom
## AIC: 46702
## 
## Number of Fisher Scoring iterations: 5
require(gridExtra)
require(ggplot2)
plot(model1)

Poisson model with significant variables with data_mv dataset.

model2 = glm(TARGET ~  .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol-LabelAppeal,AcidIndex,STARS,data = data_mv, family=poisson)
summary(model2)
## 
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar - 
##     Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density - 
##     pH - Sulphates - Alcohol - LabelAppeal, family = poisson, 
##     data = data_mv, weights = AcidIndex, subset = STARS)
## 
## Deviance Residuals: 
##       Min         1Q     Median         3Q        Max  
## 0.000e+00  5.162e-08  5.575e-08  5.960e-08  5.960e-08  
## 
## Coefficients: (1 not defined because of singularities)
##                  Estimate Std. Error z value Pr(>|z|)    
## (Intercept)     -16.55166    0.62754  -26.38   <2e-16 ***
## ï..INDEX          0.87519    0.02910   30.08   <2e-16 ***
## VolatileAcidity  -1.42888    0.05741  -24.89   <2e-16 ***
## AcidIndex         2.30407    0.08293   27.79   <2e-16 ***
## STARS                  NA         NA      NA       NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 1.4009e+04  on 9435  degrees of freedom
## Residual deviance: 1.5677e-11  on 9432  degrees of freedom
## AIC: 220163
## 
## Number of Fisher Scoring iterations: 4
plot(model2)

data_scale_train<-data.frame(data_mv_scaled)

Negative Binomial with data_mv dataset.

model3 <- glm.nb(TARGET ~ ., data = data_mv)

summary(model3)
## 
## Call:
## glm.nb(formula = TARGET ~ ., data = data_mv, init.theta = 48971.50111, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9720  -0.7207   0.0698   0.5793   3.2292  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.522e+00  1.958e-01   7.773 7.68e-15 ***
## ï..INDEX            4.027e-07  1.093e-06   0.368 0.712564    
## FixedAcidity       -3.053e-04  8.205e-04  -0.372 0.709793    
## VolatileAcidity    -3.342e-02  6.516e-03  -5.129 2.91e-07 ***
## CitricAcid          7.754e-03  5.893e-03   1.316 0.188223    
## ResidualSugar       5.655e-05  1.546e-04   0.366 0.714606    
## Chlorides          -4.143e-02  1.645e-02  -2.519 0.011780 *  
## FreeSulfurDioxide   1.253e-04  3.512e-05   3.567 0.000361 ***
## TotalSulfurDioxide  8.292e-05  2.275e-05   3.645 0.000268 ***
## Density            -2.816e-01  1.920e-01  -1.467 0.142455    
## pH                 -1.571e-02  7.638e-03  -2.057 0.039700 *  
## Sulphates          -1.267e-02  5.749e-03  -2.203 0.027585 *  
## Alcohol             2.200e-03  1.410e-03   1.561 0.118629    
## LabelAppeal         1.332e-01  6.065e-03  21.957  < 2e-16 ***
## AcidIndex          -8.705e-02  4.549e-03 -19.137  < 2e-16 ***
## STARS               3.113e-01  4.532e-03  68.694  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(48971.5) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 14728  on 12779  degrees of freedom
## AIC: 46704
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  48972 
##           Std. Err.:  50707 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -46670.37
plot(model3)

### Negative binomial model with data_mv dataset with significant values.

model4 <- glm.nb(TARGET~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + Alcohol +
                as.factor(LabelAppeal) +
                as.factor(AcidIndex) + 
                as.factor(STARS), data = data_mv)

summary(model4)
## 
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + 
##     TotalSulfurDioxide + Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) + 
##     as.factor(STARS), data = data_mv, init.theta = 40912.6004, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2463  -0.6517  -0.0030   0.4394   3.6953  
## 
## Coefficients:
##                            Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -3.408e-02  3.192e-01  -0.107  0.91497    
## VolatileAcidity          -2.990e-02  6.533e-03  -4.576 4.74e-06 ***
## FreeSulfurDioxide         8.883e-05  3.502e-05   2.537  0.01119 *  
## TotalSulfurDioxide        7.221e-05  2.274e-05   3.176  0.00149 ** 
## Alcohol                   4.417e-03  1.410e-03   3.134  0.00173 ** 
## as.factor(LabelAppeal)-1  2.400e-01  3.800e-02   6.316 2.68e-10 ***
## as.factor(LabelAppeal)0   4.302e-01  3.706e-02  11.608  < 2e-16 ***
## as.factor(LabelAppeal)1   5.633e-01  3.771e-02  14.937  < 2e-16 ***
## as.factor(LabelAppeal)2   6.983e-01  4.245e-02  16.452  < 2e-16 ***
## as.factor(AcidIndex)5    -1.537e-01  3.224e-01  -0.477  0.63346    
## as.factor(AcidIndex)6    -1.103e-01  3.169e-01  -0.348  0.72775    
## as.factor(AcidIndex)7    -1.434e-01  3.167e-01  -0.453  0.65056    
## as.factor(AcidIndex)8    -1.747e-01  3.167e-01  -0.552  0.58120    
## as.factor(AcidIndex)9    -2.851e-01  3.170e-01  -0.899  0.36850    
## as.factor(AcidIndex)10   -4.428e-01  3.181e-01  -1.392  0.16390    
## as.factor(AcidIndex)11   -8.058e-01  3.216e-01  -2.505  0.01223 *  
## as.factor(AcidIndex)12   -8.195e-01  3.273e-01  -2.504  0.01228 *  
## as.factor(AcidIndex)13   -6.554e-01  3.302e-01  -1.985  0.04716 *  
## as.factor(AcidIndex)14   -7.538e-01  3.428e-01  -2.199  0.02787 *  
## as.factor(AcidIndex)15   -2.993e-01  4.035e-01  -0.742  0.45829    
## as.factor(AcidIndex)16   -9.533e-01  5.480e-01  -1.740  0.08194 .  
## as.factor(AcidIndex)17   -1.204e+00  5.481e-01  -2.198  0.02798 *  
## as.factor(STARS)1         7.566e-01  1.957e-02  38.668  < 2e-16 ***
## as.factor(STARS)2         1.075e+00  1.826e-02  58.862  < 2e-16 ***
## as.factor(STARS)3         1.194e+00  1.923e-02  62.074  < 2e-16 ***
## as.factor(STARS)4         1.315e+00  2.433e-02  54.034  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(40912.6) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 13543  on 12769  degrees of freedom
## AIC: 45540
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  40913 
##           Std. Err.:  34296 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -45486.34

linear model with scaled data

model5 <- lm(TARGET ~ ., data = data_scale_train)

summary(model5)
## 
## Call:
## lm(formula = TARGET ~ ., data = data_scale_train)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.35962 -0.49243  0.03446  0.47059  3.10806 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)        -3.315e-16  6.077e-03   0.000 1.000000    
## ï..INDEX            2.426e-03  6.080e-03   0.399 0.689881    
## FixedAcidity       -8.174e-07  6.180e-03   0.000 0.999894    
## VolatileAcidity    -4.036e-02  6.097e-03  -6.621 3.72e-11 ***
## CitricAcid          9.320e-03  6.096e-03   1.529 0.126343    
## ResidualSugar       3.430e-03  6.083e-03   0.564 0.572851    
## Chlorides          -2.003e-02  6.086e-03  -3.291 0.000999 ***
## FreeSulfurDioxide   2.370e-02  6.088e-03   3.893 9.95e-05 ***
## TotalSulfurDioxide  2.649e-02  6.092e-03   4.349 1.38e-05 ***
## Density            -1.101e-02  6.088e-03  -1.809 0.070510 .  
## pH                 -1.198e-02  6.092e-03  -1.966 0.049318 *  
## Sulphates          -1.507e-02  6.086e-03  -2.477 0.013267 *  
## Alcohol             2.064e-02  6.097e-03   3.386 0.000711 ***
## LabelAppeal         2.001e-01  6.323e-03  31.643  < 2e-16 ***
## AcidIndex          -1.432e-01  6.332e-03 -22.613  < 2e-16 ***
## STARS               6.016e-01  6.440e-03  93.426  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6874 on 12779 degrees of freedom
## Multiple R-squared:  0.528,  Adjusted R-squared:  0.5275 
## F-statistic: 953.1 on 15 and 12779 DF,  p-value: < 2.2e-16
plot(model5)

### linear model with scaled data with significant variables.

model6 <- lm(TARGET~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + Alcohol +
                as.factor(LabelAppeal) +
                as.factor(AcidIndex) + 
                as.factor(STARS), data = data_scale_train)

summary(model6)
## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) + 
##     as.factor(STARS), data = data_scale_train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.5946 -0.4487  0.0204  0.4337  3.1649 
## 
## Coefficients:
##                                           Estimate Std. Error t value Pr(>|t|)
## (Intercept)                              -1.070741   0.391988  -2.732 0.006312
## VolatileAcidity                          -0.038595   0.006006  -6.426 1.35e-10
## FreeSulfurDioxide                         0.020122   0.005994   3.357 0.000791
## TotalSulfurDioxide                        0.025044   0.006000   4.174 3.02e-05
## Alcohol                                   0.026496   0.006010   4.408 1.05e-05
## as.factor(LabelAppeal)-1.11204793733397   0.190666   0.032584   5.852 4.99e-09
## as.factor(LabelAppeal)0.0101741115806247  0.433759   0.031771  13.653  < 2e-16
## as.factor(LabelAppeal)1.13239616049522    0.675266   0.033196  20.342  < 2e-16
## as.factor(LabelAppeal)2.25461820940981    0.979510   0.043717  22.406  < 2e-16
## as.factor(AcidIndex)-2.09431867252628    -0.160631   0.398565  -0.403 0.686937
## as.factor(AcidIndex)-1.33898965894075    -0.102980   0.391322  -0.263 0.792432
## as.factor(AcidIndex)-0.583660645355226   -0.154375   0.390955  -0.395 0.692947
## as.factor(AcidIndex)0.171668368230302    -0.209756   0.390974  -0.536 0.591625
## as.factor(AcidIndex)0.926997381815829    -0.367250   0.391228  -0.939 0.347897
## as.factor(AcidIndex)1.68232639540136     -0.528477   0.391854  -1.349 0.177471
## as.factor(AcidIndex)2.43765540898688     -0.775551   0.393055  -1.973 0.048502
## as.factor(AcidIndex)3.19298442257241     -0.783829   0.395301  -1.983 0.047404
## as.factor(AcidIndex)3.94831343615794     -0.787984   0.399159  -1.974 0.048391
## as.factor(AcidIndex)4.70364244974347     -0.712553   0.403105  -1.768 0.077142
## as.factor(AcidIndex)5.45897146332899     -0.321913   0.458248  -0.702 0.482388
## as.factor(AcidIndex)6.21430047691452     -0.886853   0.494284  -1.794 0.072802
## as.factor(AcidIndex)6.96962949050005     -0.985757   0.467183  -2.110 0.034877
## as.factor(STARS)-0.42623524866846         0.701610   0.017091  41.050  < 2e-16
## as.factor(STARS)0.416552574962037         1.239157   0.016619  74.563  < 2e-16
## as.factor(STARS)1.25934039859254          1.531710   0.019240  79.611  < 2e-16
## as.factor(STARS)2.10212822222303          1.888885   0.030710  61.507  < 2e-16
##                                             
## (Intercept)                              ** 
## VolatileAcidity                          ***
## FreeSulfurDioxide                        ***
## TotalSulfurDioxide                       ***
## Alcohol                                  ***
## as.factor(LabelAppeal)-1.11204793733397  ***
## as.factor(LabelAppeal)0.0101741115806247 ***
## as.factor(LabelAppeal)1.13239616049522   ***
## as.factor(LabelAppeal)2.25461820940981   ***
## as.factor(AcidIndex)-2.09431867252628       
## as.factor(AcidIndex)-1.33898965894075       
## as.factor(AcidIndex)-0.583660645355226      
## as.factor(AcidIndex)0.171668368230302       
## as.factor(AcidIndex)0.926997381815829       
## as.factor(AcidIndex)1.68232639540136        
## as.factor(AcidIndex)2.43765540898688     *  
## as.factor(AcidIndex)3.19298442257241     *  
## as.factor(AcidIndex)3.94831343615794     *  
## as.factor(AcidIndex)4.70364244974347     .  
## as.factor(AcidIndex)5.45897146332899        
## as.factor(AcidIndex)6.21430047691452     .  
## as.factor(AcidIndex)6.96962949050005     *  
## as.factor(STARS)-0.42623524866846        ***
## as.factor(STARS)0.416552574962037        ***
## as.factor(STARS)1.25934039859254         ***
## as.factor(STARS)2.10212822222303         ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6766 on 12769 degrees of freedom
## Multiple R-squared:  0.5432, Adjusted R-squared:  0.5423 
## F-statistic: 607.3 on 25 and 12769 DF,  p-value: < 2.2e-16
plot(model6)

### linear model without the scaled value(data_mv dataset).

model7 <- lm(TARGET ~ ., data = data_mv)

summary(model7)
## 
## Call:
## lm(formula = TARGET ~ ., data = data_mv)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5455 -0.9486  0.0664  0.9065  5.9873 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.976e+00  4.495e-01   8.845  < 2e-16 ***
## ï..INDEX            1.004e-06  2.515e-06   0.399 0.689881    
## FixedAcidity       -2.492e-07  1.885e-03   0.000 0.999894    
## VolatileAcidity    -9.917e-02  1.498e-02  -6.621 3.72e-11 ***
## CitricAcid          2.083e-02  1.362e-02   1.529 0.126343    
## ResidualSugar       2.007e-04  3.559e-04   0.564 0.572851    
## Chlorides          -1.243e-01  3.777e-02  -3.291 0.000999 ***
## FreeSulfurDioxide   3.151e-04  8.093e-05   3.893 9.95e-05 ***
## TotalSulfurDioxide  2.262e-04  5.201e-05   4.349 1.38e-05 ***
## Density            -7.993e-01  4.419e-01  -1.809 0.070510 .  
## pH                 -3.448e-02  1.754e-02  -1.966 0.049318 *  
## Sulphates          -3.274e-02  1.322e-02  -2.477 0.013267 *  
## Alcohol             1.095e-02  3.234e-03   3.386 0.000711 ***
## LabelAppeal         4.325e-01  1.367e-02  31.643  < 2e-16 ***
## AcidIndex          -2.083e-01  9.213e-03 -22.613  < 2e-16 ***
## STARS               9.768e-01  1.046e-02  93.426  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.324 on 12779 degrees of freedom
## Multiple R-squared:  0.528,  Adjusted R-squared:  0.5275 
## F-statistic: 953.1 on 15 and 12779 DF,  p-value: < 2.2e-16
plot(model7)

### Linear model with the data_mv dataset with significant variables.

model8 <- lm(TARGET~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + Alcohol +
                as.factor(LabelAppeal) +
                as.factor(AcidIndex) + 
                as.factor(STARS), data = data_mv)

summary(model8)
## 
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + 
##     Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) + 
##     as.factor(STARS), data = data_mv)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.9982 -0.8644  0.0394  0.8354  6.0968 
## 
## Coefficients:
##                            Estimate Std. Error t value Pr(>|t|)    
## (Intercept)               8.157e-01  7.560e-01   1.079 0.280664    
## VolatileAcidity          -9.483e-02  1.476e-02  -6.426 1.35e-10 ***
## FreeSulfurDioxide         2.675e-04  7.969e-05   3.357 0.000791 ***
## TotalSulfurDioxide        2.138e-04  5.123e-05   4.174 3.02e-05 ***
## Alcohol                   1.406e-02  3.188e-03   4.408 1.05e-05 ***
## as.factor(LabelAppeal)-1  3.673e-01  6.277e-02   5.852 4.99e-09 ***
## as.factor(LabelAppeal)0   8.356e-01  6.120e-02  13.653  < 2e-16 ***
## as.factor(LabelAppeal)1   1.301e+00  6.395e-02  20.342  < 2e-16 ***
## as.factor(LabelAppeal)2   1.887e+00  8.422e-02  22.406  < 2e-16 ***
## as.factor(AcidIndex)5    -3.094e-01  7.678e-01  -0.403 0.686937    
## as.factor(AcidIndex)6    -1.984e-01  7.538e-01  -0.263 0.792432    
## as.factor(AcidIndex)7    -2.974e-01  7.531e-01  -0.395 0.692947    
## as.factor(AcidIndex)8    -4.041e-01  7.532e-01  -0.536 0.591625    
## as.factor(AcidIndex)9    -7.075e-01  7.537e-01  -0.939 0.347897    
## as.factor(AcidIndex)10   -1.018e+00  7.549e-01  -1.349 0.177471    
## as.factor(AcidIndex)11   -1.494e+00  7.572e-01  -1.973 0.048502 *  
## as.factor(AcidIndex)12   -1.510e+00  7.615e-01  -1.983 0.047404 *  
## as.factor(AcidIndex)13   -1.518e+00  7.689e-01  -1.974 0.048391 *  
## as.factor(AcidIndex)14   -1.373e+00  7.765e-01  -1.768 0.077142 .  
## as.factor(AcidIndex)15   -6.201e-01  8.828e-01  -0.702 0.482388    
## as.factor(AcidIndex)16   -1.708e+00  9.522e-01  -1.794 0.072802 .  
## as.factor(AcidIndex)17   -1.899e+00  9.000e-01  -2.110 0.034877 *  
## as.factor(STARS)1         1.352e+00  3.292e-02  41.050  < 2e-16 ***
## as.factor(STARS)2         2.387e+00  3.201e-02  74.563  < 2e-16 ***
## as.factor(STARS)3         2.951e+00  3.706e-02  79.611  < 2e-16 ***
## as.factor(STARS)4         3.639e+00  5.916e-02  61.507  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.303 on 12769 degrees of freedom
## Multiple R-squared:  0.5432, Adjusted R-squared:  0.5423 
## F-statistic: 607.3 on 25 and 12769 DF,  p-value: < 2.2e-16
plot(model8)

### Ordinal logistic regression with scaled data

olr1 <- data_scale_train
olr1$TARGET <- as.factor(olr1$TARGET)
model9 <- polr(TARGET ~ ., data = olr1, Hess=TRUE)
summary(model9)
## Call:
## polr(formula = TARGET ~ ., data = olr1, Hess = TRUE)
## 
## Coefficients:
##                        Value Std. Error   t value
## ï..INDEX            0.009879    0.01637   0.60344
## FixedAcidity        0.009839    0.01664   0.59143
## VolatileAcidity    -0.105399    0.01649  -6.38999
## CitricAcid          0.022077    0.01642   1.34435
## ResidualSugar      -0.001255    0.01628  -0.07707
## Chlorides          -0.053700    0.01631  -3.29199
## FreeSulfurDioxide   0.058083    0.01641   3.53971
## TotalSulfurDioxide  0.056892    0.01639   3.47099
## Density            -0.032068    0.01651  -1.94258
## pH                 -0.012273    0.01645  -0.74611
## Sulphates          -0.030839    0.01648  -1.87138
## Alcohol             0.096465    0.01639   5.88544
## LabelAppeal         0.758628    0.01907  39.77675
## AcidIndex          -0.388535    0.01860 -20.89270
## STARS               1.540233    0.02183  70.54658
## 
## Intercepts:
##                                        Value    Std. Error t value 
## -1.57242723585788|-1.05331568225978     -2.2286   0.0291   -76.4926
## -1.05331568225978|-0.534204128661679    -2.0726   0.0285   -72.8472
## -0.534204128661679|-0.0150925750635791  -1.3893   0.0258   -53.9476
## -0.0150925750635791|0.504018978534521    0.1093   0.0229     4.7659
## 0.504018978534521|1.02313053213262       2.0323   0.0292    69.4889
## 1.02313053213262|1.54224208573072        4.0071   0.0458    87.4917
## 1.54224208573072|2.06135363932882        6.1891   0.0886    69.8252
## 2.06135363932882|2.58046519292692        8.5389   0.2462    34.6861
## 
## Residual Deviance: 36214.49 
## AIC: 36260.49

###Ordinal logistic without scaled data

olr2 <- data_mv
olr2$TARGET <- as.factor(olr2$TARGET)
model10 <- polr(TARGET ~ ., data = olr2, Hess=TRUE)
summary(model10)
## Call:
## polr(formula = TARGET ~ ., data = olr2, Hess = TRUE)
## 
## Coefficients:
##                         Value Std. Error    t value
## ï..INDEX            2.119e-06        NaN        NaN
## FixedAcidity        1.558e-03  2.633e-03    0.59167
## VolatileAcidity    -1.344e-01  2.101e-02   -6.39951
## CitricAcid          2.561e-02  1.904e-02    1.34519
## ResidualSugar      -3.822e-05  4.944e-04   -0.07729
## Chlorides          -1.730e-01  3.214e-04 -538.21394
## FreeSulfurDioxide   4.009e-04  1.133e-04    3.53957
## TotalSulfurDioxide  2.521e-04  7.266e-05    3.46998
## Density            -1.208e+00  1.431e-03 -844.16348
## pH                 -1.835e-02  1.951e-02   -0.94077
## Sulphates          -3.477e-02  1.856e-02   -1.87338
## Alcohol             2.656e-02  4.275e-03    6.21310
## LabelAppeal         8.513e-01  2.103e-02   40.47578
## AcidIndex          -2.935e-01  8.872e-03  -33.08037
## STARS               1.298e+00  1.715e-02   75.71084
## 
## Intercepts:
##     Value      Std. Error t value   
## 0|1    -3.5371     0.0014 -2439.9380
## 1|2    -3.3811     0.0015 -2255.5000
## 2|3    -2.6979     0.0038  -712.9369
## 3|4    -1.1993     0.0264   -45.4952
## 4|5     0.7237     0.0385    18.8065
## 5|6     2.6986     0.0536    50.3522
## 6|7     4.8806     0.0537    90.8238
## 7|8     7.2303     0.0537   134.5209
## 
## Residual Deviance: 36214.49 
## AIC: 36260.49

Zero inflation with data_mv dataset

library(pscl)
model11 <- zeroinfl(TARGET ~ . | STARS, data = data_mv, dist = 'negbin')
summary(model11)
## 
## Call:
## zeroinfl(formula = TARGET ~ . | STARS, data = data_mv, dist = "negbin")
## 
## Pearson residuals:
##      Min       1Q   Median       3Q      Max 
## -2.31701 -0.53688  0.01918  0.40829  2.89146 
## 
## Count model coefficients (negbin with log link):
##                      Estimate Std. Error z value Pr(>|z|)
## (Intercept)         1.543e+00         NA      NA       NA
## ï..INDEX            1.809e-07         NA      NA       NA
## FixedAcidity        3.013e-04         NA      NA       NA
## VolatileAcidity    -1.499e-02         NA      NA       NA
## CitricAcid          1.243e-03         NA      NA       NA
## ResidualSugar      -4.969e-05         NA      NA       NA
## Chlorides          -2.335e-02         NA      NA       NA
## FreeSulfurDioxide   3.718e-05         NA      NA       NA
## TotalSulfurDioxide -2.958e-06         NA      NA       NA
## Density            -2.732e-01         NA      NA       NA
## pH                  2.922e-03         NA      NA       NA
## Sulphates          -1.947e-03         NA      NA       NA
## Alcohol             6.542e-03         NA      NA       NA
## LabelAppeal         2.240e-01         NA      NA       NA
## AcidIndex          -3.130e-02         NA      NA       NA
## STARS               1.007e-01         NA      NA       NA
## Log(theta)          1.797e+01         NA      NA       NA
## 
## Zero-inflation model coefficients (binomial with logit link):
##             Estimate Std. Error z value Pr(>|z|)
## (Intercept)   0.3804         NA      NA       NA
## STARS        -2.2238         NA      NA       NA
## 
## Theta = 63997386.9979 
## Number of iterations in BFGS optimization: 23 
## Log-likelihood: -2.083e+04 on 19 Df
library(ggthemes)
scatterPreds <- predict(model11, data_mv)
qplot(data_mv$TARGET, scatterPreds, main = 'Predicted vs Actual') + ggthemes::theme_tufte()

residPlot <- scatterPreds - data_mv$TARGET
qplot(data_mv$TARGET, residPlot, main = 'Residuals') + ggthemes::theme_tufte()

The scaled dataset was transformed into absolute value dataset as Poisson and Negative Binomial were unable to work with negative data.

Absolute Value

absdata <- abs(data_scale_train)
model12 <- glm.nb(TARGET ~ ., data = absdata)
summary(model12)
## 
## Call:
## glm.nb(formula = TARGET ~ ., data = absdata, init.theta = 35848.33073, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8888  -0.9652  -0.8312   0.4713   1.5768  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.920318   0.041521 -22.165   <2e-16 ***
## ï..INDEX            0.012386   0.019653   0.630    0.529    
## FixedAcidity        0.001218   0.013445   0.091    0.928    
## VolatileAcidity     0.017113   0.013199   1.296    0.195    
## CitricAcid          0.002713   0.013125   0.207    0.836    
## ResidualSugar      -0.004542   0.013001  -0.349    0.727    
## Chlorides           0.001761   0.012668   0.139    0.889    
## FreeSulfurDioxide  -0.002295   0.012879  -0.178    0.859    
## TotalSulfurDioxide  0.003254   0.013308   0.245    0.807    
## Density             0.008868   0.012986   0.683    0.495    
## pH                  0.004886   0.013442   0.363    0.716    
## Sulphates           0.013197   0.012681   1.041    0.298    
## Alcohol            -0.007084   0.013916  -0.509    0.611    
## LabelAppeal         0.162795   0.013839  11.764   <2e-16 ***
## AcidIndex           0.120343   0.012366   9.732   <2e-16 ***
## STARS               0.488152   0.018843  25.907   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(35848.33) family taken to be 1)
## 
##     Null deviance: 9920.9  on 12794  degrees of freedom
## Residual deviance: 8970.4  on 12779  degrees of freedom
## AIC: 25738
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  35848 
##           Std. Err.:  45074 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -25703.86
modelsave1<-saveRDS(model12, "mymodel.rds")
plot(model12)

model13 = glm(TARGET ~  ., data=absdata, family=poisson)
summary(model13)
## 
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = absdata)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8550  -0.5038  -0.0667   0.4644   1.5768  
## 
## Coefficients:
##                     Estimate Std. Error z value Pr(>|z|)    
## (Intercept)        -0.920316   0.041520 -22.166   <2e-16 ***
## ï..INDEX            0.012386   0.019653   0.630    0.529    
## FixedAcidity        0.001218   0.013445   0.091    0.928    
## VolatileAcidity     0.017113   0.013199   1.296    0.195    
## CitricAcid          0.002713   0.013125   0.207    0.836    
## ResidualSugar      -0.004542   0.013001  -0.349    0.727    
## Chlorides           0.001761   0.012668   0.139    0.889    
## FreeSulfurDioxide  -0.002295   0.012879  -0.178    0.859    
## TotalSulfurDioxide  0.003254   0.013308   0.245    0.807    
## Density             0.008868   0.012986   0.683    0.495    
## pH                  0.004886   0.013442   0.363    0.716    
## Sulphates           0.013197   0.012680   1.041    0.298    
## Alcohol            -0.007084   0.013916  -0.509    0.611    
## LabelAppeal         0.162795   0.013839  11.764   <2e-16 ***
## AcidIndex           0.120342   0.012365   9.732   <2e-16 ***
## STARS               0.488151   0.018842  25.907   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 6665.5  on 12794  degrees of freedom
## Residual deviance: 5715.0  on 12779  degrees of freedom
## AIC: Inf
## 
## Number of Fisher Scoring iterations: 5
plot(model13)

modelsave2<-saveRDS(model13, "mymodel.rds")

As it takes a long time to load Poisson and Negative Binomial model with the scaled absolute value dataset, I am saving the model in a file:

saveRDS(model12, "model.rds")
my_model12 <- readRDS("model.rds")
saveRDS(model12, "model.rds")
my_model12 <- readRDS("model.rds")

Model Selection

aic1 <- model1$aic
aic2 <- model2$aic
aic3 <- model3$aic
aic4 <- model4$aic
aic5 <- model5$aic
aic6 <- model6$aic
aic7 <- model7$aic
aic8 <- model8$aic
aic9 <- model9$aic
aic10 <- model10$aic
aic11 <- model11$aic
aic12 <- model12$aic
aic13 <- model13$aic

mse1 <- mean((data_mv$TARGET - predict(model1))^2)
mse2 <- mean((data_mv$TARGET - predict(model2))^2)
mse3 <- mean((data_mv$TARGET - predict(model3))^2)
mse4 <- mean((data_mv$TARGET - predict(model4))^2)
mse5 <- mean((data_scale_train$TARGET - predict(model5))^2)
mse6 <- mean((data_scale_train$TARGET - predict(model6))^2)
mse7 <- mean((data_mv$TARGET - predict(model7))^2)
mse8 <- mean((data_mv$TARGET - predict(model8))^2)
mse9 <- mean((data_scale_train$TARGET - predict(model9))^2)
mse10 <- mean((data_mv$TARGET - predict(model10))^2)
mse11 <- mean((data_mv$TARGET - predict(model11))^2)
mse12 <- mean((absdata$TARGET - predict(model12))^2)
mse13 <- mean((absdata$TARGET - predict(model13))^2)


compare_aic_mse <- matrix(c(mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, mse10, mse11,mse12,mse13,aic1, aic2, aic3, aic4, aic5, aic6, aic7, aic8, aic9, aic10, aic11,aic12,aic13),nrow=13,ncol=2,byrow=TRUE)


rownames(compare_aic_mse) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7","Model8","Model9","Model10","Model11","Model12","Model13")
colnames(compare_aic_mse) <- c("MSE","AIC")
compare_models <- as.data.frame(compare_models)

kable(compare_aic_mse)  %>% 
  kable_styling(full_width = T)
MSE AIC
Model1 6.749972e+00 7.039608e+00
Model2 6.749968e+00 6.704008e+00
Model3 4.719362e-01 4.568016e-01
Model4 1.751305e+00 1.695142e+00
Model5 NA NA
Model6 1.717355e+00 1.417136e+00
Model7 1.417136e+00 4.670209e+04
Model8 2.201626e+05 4.670437e+04
Model9 4.554034e+04 2.573786e+04
Model10 Inf 6.749972e+00
Model11 7.039608e+00 6.749968e+00
Model12 6.704008e+00 4.719362e-01
Model13 4.568016e-01 1.751305e+00
eval_data_mv<-data.frame(eval_data_mv)

lower AIC MSE values indicate a better model.

Model 3,4,6,7,13 have a lower AIC and MSE value:Model 3 is a negative binomial model with data_mv dataset.Model 4 is a negative binomial model with only significant value. For model 4 I have used the data_mv dataset.

Model 6 is a linear model with significant values from the scaled dataset.

Model 7 is a linear model with data_mv dataset.

Model 13 is a poisson model with a dataset that is consist of absolute value of the scaled train datasets.

Test Data

Now lets see the output of the Models using test data.

modelValidation <- function(mod,eval_data_mv){
  preds = predict(mod, eval_data_mv$TARGET)
  diffMat = as.numeric(preds) - as.numeric(eval_data_mv$TARGET)
  diffMat = diffMat^2
  loss <- mean(diffMat)
  return(loss)
}
df1<-data.frame(modelValidation1(model1,data_mv))
df2<-data.frame(modelValidation1(model2,data_mv))
df3<-data.frame(modelValidation1(model3,data_mv))
df4<-data.frame(modelValidation1(model4,data_mv))
df5<-data.frame(modelValidation1(model5,data_scale_train))
df6<-data.frame(modelValidation1(model6,data_scale_train))
df7<-data.frame(modelValidation1(model7,data_mv))
df8<-data.frame(modelValidation1(model8,data_mv))
df9<-data.frame(modelValidation1(model9,data_scale_train))
df10<-data.frame(modelValidation1(model10,data_mv))
df11<-data.frame(modelValidation1(model11,data_mv))
df12<-data.frame(modelValidation1(model12,absdata))
df13<-data.frame(modelValidation1(model13,absdata))
library(kableExtra)
compare_model1 <- c(df1)
compare_model2 <- c(df2)
compare_model3 <- c(df3)
compare_model4 <- c(df4)
compare_model5 <- c(df5)
compare_model6 <- c(df6)
compare_model7 <- c(df7)
compare_model8 <- c(df8)
compare_model9 <- c(df9)
compare_model10 <- c(df10)
compare_model11 <- c(df11)
compare_model12 <- c(df12)
compare_model13 <- c(df13)


compare <- data.frame(compare_model1, compare_model2, compare_model3, compare_model4, compare_model5, compare_model6,compare_model7,compare_model8,compare_model9,compare_model10,compare_model11,compare_model12,compare_model13)
colnames(compare) <- c("Model1", "Model2", "Model3", "Model4", "Model5", "Model6", "Model7","Model8","Model9","Model10","Model11","Model12","Model13")

kable(compare)
Model1 Model2 Model3 Model4 Model5 Model6 Model7 Model8 Model9 Model10 Model11 Model12 Model13
6.749972 66462383 6.749968 6.704008 0.4719362 0.4568016 1.751305 1.695142 17.13263 3.136616 1.717355 1.417136 1.417136

We have used squared loss to validate the model. We will use the squared difference to select a model (MSE) from predictions on the training sets. As lower number is better, Model 6(linear model with scaled data),model 7(linear model with data_mv dataset),model 13 (Poisson model with scaled absolute value dataset) will be selected