library(caret)
library(dplyr)
library(psych)
library(corrplot)
library(tidyr)
library(ggplot2)
library(VIM)
library(ResourceSelection)
library(pROC)
library(tidyverse)
library(mice)
library(MASS)

Data Exploration

# Read the training data, then discard the first CSV column.
data <- read.csv(file = 'wine-training-data.csv')
data <- data[, -1]

# Print each column's type together with its first few values.
str(object = data)
## 'data.frame':    12795 obs. of  15 variables:
##  $ TARGET            : int  3 3 5 3 4 0 0 4 3 6 ...
##  $ FixedAcidity      : num  3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
##  $ VolatileAcidity   : num  1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
##  $ CitricAcid        : num  -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
##  $ ResidualSugar     : num  54.2 26.1 14.8 18.8 9.4 ...
##  $ Chlorides         : num  -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
##  $ FreeSulfurDioxide : num  NA 15 214 22 -167 -37 287 523 -213 62 ...
##  $ TotalSulfurDioxide: num  268 -327 142 115 108 15 156 551 NA 180 ...
##  $ Density           : num  0.993 1.028 0.995 0.996 0.995 ...
##  $ pH                : num  3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
##  $ Sulphates         : num  -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
##  $ Alcohol           : num  9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
##  $ LabelAppeal       : int  0 -1 -1 -1 0 0 0 1 0 0 ...
##  $ AcidIndex         : int  8 7 8 6 9 11 8 7 6 8 ...
##  $ STARS             : int  2 3 3 1 2 NA NA 3 NA 4 ...
# Per-column summary statistics (min, quartiles, mean, max and NA counts).
summary(data)
##      TARGET       FixedAcidity     VolatileAcidity     CitricAcid     
##  Min.   :0.000   Min.   :-18.100   Min.   :-2.7900   Min.   :-3.2400  
##  1st Qu.:2.000   1st Qu.:  5.200   1st Qu.: 0.1300   1st Qu.: 0.0300  
##  Median :3.000   Median :  6.900   Median : 0.2800   Median : 0.3100  
##  Mean   :3.029   Mean   :  7.076   Mean   : 0.3241   Mean   : 0.3084  
##  3rd Qu.:4.000   3rd Qu.:  9.500   3rd Qu.: 0.6400   3rd Qu.: 0.5800  
##  Max.   :8.000   Max.   : 34.400   Max.   : 3.6800   Max.   : 3.8600  
##                                                                       
##  ResidualSugar        Chlorides       FreeSulfurDioxide TotalSulfurDioxide
##  Min.   :-127.800   Min.   :-1.1710   Min.   :-555.00   Min.   :-823.0    
##  1st Qu.:  -2.000   1st Qu.:-0.0310   1st Qu.:   0.00   1st Qu.:  27.0    
##  Median :   3.900   Median : 0.0460   Median :  30.00   Median : 123.0    
##  Mean   :   5.419   Mean   : 0.0548   Mean   :  30.85   Mean   : 120.7    
##  3rd Qu.:  15.900   3rd Qu.: 0.1530   3rd Qu.:  70.00   3rd Qu.: 208.0    
##  Max.   : 141.150   Max.   : 1.3510   Max.   : 623.00   Max.   :1057.0    
##  NA's   :616        NA's   :638       NA's   :647       NA's   :682       
##     Density             pH          Sulphates          Alcohol     
##  Min.   :0.8881   Min.   :0.480   Min.   :-3.1300   Min.   :-4.70  
##  1st Qu.:0.9877   1st Qu.:2.960   1st Qu.: 0.2800   1st Qu.: 9.00  
##  Median :0.9945   Median :3.200   Median : 0.5000   Median :10.40  
##  Mean   :0.9942   Mean   :3.208   Mean   : 0.5271   Mean   :10.49  
##  3rd Qu.:1.0005   3rd Qu.:3.470   3rd Qu.: 0.8600   3rd Qu.:12.40  
##  Max.   :1.0992   Max.   :6.130   Max.   : 4.2400   Max.   :26.50  
##                   NA's   :395     NA's   :1210      NA's   :653    
##   LabelAppeal          AcidIndex          STARS      
##  Min.   :-2.000000   Min.   : 4.000   Min.   :1.000  
##  1st Qu.:-1.000000   1st Qu.: 7.000   1st Qu.:1.000  
##  Median : 0.000000   Median : 8.000   Median :2.000  
##  Mean   :-0.009066   Mean   : 7.773   Mean   :2.042  
##  3rd Qu.: 1.000000   3rd Qu.: 8.000   3rd Qu.:3.000  
##  Max.   : 2.000000   Max.   :17.000   Max.   :4.000  
##                                       NA's   :3359
# Box plots of every column except the first (TARGET). `stack()` reshapes the
# data to long form with columns `values` and `ind`; the y-axis is zoomed to
# the 0-100 range.
ggplot(
  data = stack(data[, -1]),
  mapping = aes(x = ind, y = values)
) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 100))
## Warning: Removed 8200 rows containing non-finite values (stat_boxplot).

# Histograms (20 bins) of every numeric column except the first (TARGET),
# one facet per variable with free axis scales.
# `pivot_longer()` replaces the superseded `gather()`; `names_to`/`values_to`
# keep the `key`/`value` column names used by the plot.
# `dplyr::select()` is namespaced because MASS is attached after dplyr and
# masks `select`.
ggplot(
  pivot_longer(
    dplyr::select(data[, -1], where(is.numeric)),
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free')
## Warning: Removed 8200 rows containing non-finite values (stat_bin).

# Correlation plot of every column except the first (TARGET); missing values
# are handled with the "na.or.complete" rule of `cor()`.
corrplot(
  corr = cor(x = data[, -1], use = "na.or.complete")
)

From the plots we can observe that some of the variables are approximately normally distributed.

Data Preparation

For the data preparation, we start by checking whether there are missing observations. We then impute the missing values using the mice package.

# Map of missing values in the raw training data.
Amelia::missmap(data)

# Impute missing values by predictive mean matching ('pmm'): 5 imputed data
# sets, 10 iterations, fixed seed. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
t <- mice(data, m = 5, maxit = 10, method = 'pmm', seed = 500,
          printFlag = FALSE)
# `mice::` is explicit because tidyr also exports a `complete()`.
# `action = 1` (the default) returns only the first of the 5 imputed sets.
imputed.data <- mice::complete(t, action = 1)

# Map of missing values after imputation.
Amelia::missmap(imputed.data)

Model

Poisson Regression

Model 1

# Model 1: Poisson regression of TARGET on all other columns, fitted on the
# raw (non-imputed) training data. The summary below reports that 6359
# observations were deleted due to missingness.
model1 <- glm(formula = TARGET ~ ., family = poisson, data = data)

# Coefficient table, deviances and AIC of model1.
summary(model1)
## 
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.2158  -0.2734   0.0616   0.3732   1.6830  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.593e+00  2.506e-01   6.359 2.03e-10 ***
## FixedAcidity        3.293e-04  1.053e-03   0.313  0.75447    
## VolatileAcidity    -2.560e-02  8.353e-03  -3.065  0.00218 ** 
## CitricAcid         -7.259e-04  7.575e-03  -0.096  0.92365    
## ResidualSugar      -6.141e-05  1.941e-04  -0.316  0.75165    
## Chlorides          -3.007e-02  2.056e-02  -1.463  0.14346    
## FreeSulfurDioxide   6.734e-05  4.404e-05   1.529  0.12620    
## TotalSulfurDioxide  2.081e-05  2.855e-05   0.729  0.46618    
## Density            -3.725e-01  2.462e-01  -1.513  0.13026    
## pH                 -4.661e-03  9.598e-03  -0.486  0.62722    
## Sulphates          -5.164e-03  7.051e-03  -0.732  0.46398    
## Alcohol             3.948e-03  1.771e-03   2.229  0.02579 *  
## LabelAppeal         1.771e-01  7.954e-03  22.271  < 2e-16 ***
## AcidIndex          -4.870e-02  5.903e-03  -8.251  < 2e-16 ***
## STARS               1.871e-01  7.487e-03  24.993  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 5844.1  on 6435  degrees of freedom
## Residual deviance: 4009.1  on 6421  degrees of freedom
##   (6359 observations deleted due to missingness)
## AIC: 23172
## 
## Number of Fisher Scoring iterations: 5

Model 2

# Model 2: Poisson regression of TARGET on all other columns, fitted on the
# mice-imputed training data.
model2 <- glm(formula = TARGET ~ ., family = poisson, data = imputed.data)


# Coefficient table, deviances and AIC of model2.
summary(model2)
## 
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = imputed.data)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0334  -0.6898   0.1251   0.6348   2.6374  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.486e+00  1.957e-01   7.595 3.07e-14 ***
## FixedAcidity       -5.016e-04  8.199e-04  -0.612 0.540647    
## VolatileAcidity    -3.932e-02  6.511e-03  -6.039 1.55e-09 ***
## CitricAcid          1.143e-02  5.888e-03   1.941 0.052265 .  
## ResidualSugar       6.788e-05  1.502e-04   0.452 0.651337    
## Chlorides          -4.740e-02  1.605e-02  -2.954 0.003137 ** 
## FreeSulfurDioxide   1.420e-04  3.421e-05   4.151 3.30e-05 ***
## TotalSulfurDioxide  8.325e-05  2.213e-05   3.762 0.000168 ***
## Density            -2.933e-01  1.922e-01  -1.526 0.126941    
## pH                 -1.953e-02  7.515e-03  -2.599 0.009349 ** 
## Sulphates          -1.433e-02  5.482e-03  -2.614 0.008953 ** 
## Alcohol             2.421e-03  1.373e-03   1.763 0.077942 .  
## LabelAppeal         1.433e-01  6.087e-03  23.537  < 2e-16 ***
## AcidIndex          -9.723e-02  4.518e-03 -21.521  < 2e-16 ***
## STARS               3.381e-01  5.612e-03  60.248  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for poisson family taken to be 1)
## 
##     Null deviance: 22861  on 12794  degrees of freedom
## Residual deviance: 15998  on 12780  degrees of freedom
## AIC: 47970
## 
## Number of Fisher Scoring iterations: 5

Negative Binomial Regression

Model 3

# Model 3: negative binomial regression (log link) of TARGET on all other
# columns of the imputed data. `init.theta` supplies the starting value for
# theta; the fit below warns that the theta iteration limit was reached.
model3 <- glm.nb(formula = TARGET ~ ., data = imputed.data,init.theta = 60123.4587, link = log)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Coefficient table, deviances, AIC and estimated theta of model3.
summary(model3)
## 
## Call:
## glm.nb(formula = TARGET ~ ., data = imputed.data, init.theta = 48668.52345, 
##     link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0334  -0.6898   0.1251   0.6348   2.6374  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.486e+00  1.957e-01   7.595 3.08e-14 ***
## FixedAcidity       -5.016e-04  8.199e-04  -0.612 0.540656    
## VolatileAcidity    -3.932e-02  6.511e-03  -6.039 1.55e-09 ***
## CitricAcid          1.143e-02  5.888e-03   1.941 0.052272 .  
## ResidualSugar       6.788e-05  1.502e-04   0.452 0.651327    
## Chlorides          -4.740e-02  1.605e-02  -2.954 0.003138 ** 
## FreeSulfurDioxide   1.420e-04  3.421e-05   4.151 3.31e-05 ***
## TotalSulfurDioxide  8.325e-05  2.213e-05   3.762 0.000169 ***
## Density            -2.933e-01  1.922e-01  -1.526 0.126948    
## pH                 -1.953e-02  7.515e-03  -2.599 0.009349 ** 
## Sulphates          -1.433e-02  5.482e-03  -2.614 0.008954 ** 
## Alcohol             2.421e-03  1.373e-03   1.763 0.077957 .  
## LabelAppeal         1.433e-01  6.087e-03  23.536  < 2e-16 ***
## AcidIndex          -9.723e-02  4.518e-03 -21.520  < 2e-16 ***
## STARS               3.381e-01  5.612e-03  60.247  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(48668.52) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 15998  on 12780  degrees of freedom
## AIC: 47973
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  48669 
##           Std. Err.:  56093 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -47940.58

Model 4

# Model 4: negative binomial regression (log link) on a reduced set of seven
# predictors, fitted on the imputed data. The `+` right after `~` is a unary
# plus; the coefficient table below lists exactly the seven named predictors.
model4 <- glm.nb(formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS, data = imputed.data,     init.theta = 60123.4587, link = log)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached

## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
# Coefficient table, deviances, AIC and estimated theta of model4.
summary(model4)
## 
## Call:
## glm.nb(formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS, data = imputed.data, 
##     init.theta = 48617.87372, link = log)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -3.0441  -0.6879   0.1229   0.6333   2.6756  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         1.148e+00  3.787e-02  30.324  < 2e-16 ***
## VolatileAcidity    -4.000e-02  6.509e-03  -6.146 7.93e-10 ***
## Chlorides          -4.771e-02  1.603e-02  -2.976 0.002923 ** 
## FreeSulfurDioxide   1.395e-04  3.420e-05   4.080 4.51e-05 ***
## TotalSulfurDioxide  8.270e-05  2.211e-05   3.741 0.000183 ***
## LabelAppeal         1.428e-01  6.085e-03  23.466  < 2e-16 ***
## AcidIndex          -9.723e-02  4.441e-03 -21.893  < 2e-16 ***
## STARS               3.397e-01  5.593e-03  60.733  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for Negative Binomial(48617.87) family taken to be 1)
## 
##     Null deviance: 22860  on 12794  degrees of freedom
## Residual deviance: 16021  on 12787  degrees of freedom
## AIC: 47982
## 
## Number of Fisher Scoring iterations: 1
## 
## 
##               Theta:  48618 
##           Std. Err.:  56088 
## Warning while fitting theta: iteration limit reached 
## 
##  2 x log-likelihood:  -47964.31

Linear regression

Model 5

# Model 5: ordinary linear regression of TARGET on all other columns of the
# imputed data.
model5 <- lm(formula = TARGET ~ ., data = imputed.data)

# Coefficient table, residual standard error and R-squared of model5.
summary(model5)
## 
## Call:
## lm(formula = TARGET ~ ., data = imputed.data)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.496 -1.016  0.168  1.032  4.393 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         3.670e+00  4.800e-01   7.646 2.23e-14 ***
## FixedAcidity       -8.296e-04  2.014e-03  -0.412 0.680484    
## VolatileAcidity    -1.182e-01  1.601e-02  -7.383 1.64e-13 ***
## CitricAcid          3.140e-02  1.456e-02   2.157 0.031034 *  
## ResidualSugar       2.263e-04  3.698e-04   0.612 0.540605    
## Chlorides          -1.484e-01  3.939e-02  -3.767 0.000166 ***
## FreeSulfurDioxide   3.849e-04  8.423e-05   4.569 4.94e-06 ***
## TotalSulfurDioxide  2.249e-04  5.416e-05   4.154 3.30e-05 ***
## Density            -8.600e-01  4.723e-01  -1.821 0.068638 .  
## pH                 -4.998e-02  1.847e-02  -2.706 0.006814 ** 
## Sulphates          -3.830e-02  1.347e-02  -2.844 0.004464 ** 
## Alcohol             1.051e-02  3.361e-03   3.126 0.001775 ** 
## LabelAppeal         4.361e-01  1.474e-02  29.587  < 2e-16 ***
## AcidIndex          -2.407e-01  9.815e-03 -24.528  < 2e-16 ***
## STARS               1.160e+00  1.493e-02  77.687  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.415 on 12780 degrees of freedom
## Multiple R-squared:  0.4607, Adjusted R-squared:  0.4601 
## F-statistic: 779.9 on 14 and 12780 DF,  p-value: < 2.2e-16

Model 6

# Model 6: linear regression on the same reduced set of seven predictors as
# model4, fitted on the imputed data.
model6 <- lm(formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,data= imputed.data)
 
# Coefficient table, residual standard error and R-squared of model6.
summary(model6)
## 
## Call:
## lm(formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide + 
##     TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS, data = imputed.data)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -4.5035 -1.0269  0.1687  1.0318  4.4570 
## 
## Coefficients:
##                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)         2.742e+00  8.541e-02  32.100  < 2e-16 ***
## VolatileAcidity    -1.197e-01  1.601e-02  -7.474 8.31e-14 ***
## Chlorides          -1.502e-01  3.941e-02  -3.810  0.00014 ***
## FreeSulfurDioxide   3.761e-04  8.428e-05   4.462 8.19e-06 ***
## TotalSulfurDioxide  2.241e-04  5.418e-05   4.136 3.56e-05 ***
## LabelAppeal         4.350e-01  1.475e-02  29.484  < 2e-16 ***
## AcidIndex          -2.408e-01  9.619e-03 -25.036  < 2e-16 ***
## STARS               1.165e+00  1.490e-02  78.157  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.417 on 12787 degrees of freedom
## Multiple R-squared:  0.4593, Adjusted R-squared:  0.459 
## F-statistic:  1552 on 7 and 12787 DF,  p-value: < 2.2e-16

Model Selection

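Model selection is based on the AIC results and a comparison with the other models: model 1 has the lowest AIC (23172). Note, however, that model 1 was fitted on only the 6436 complete cases (6359 observations were deleted due to missingness), whereas the other models were fitted on all 12795 imputed observations, so its AIC is not directly comparable with theirs.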

# Evaluation data

# Read the evaluation set and drop its first CSV column.
data.test <- read.csv('wine-evaluation-data.csv')[,-1]


# Predicted mean TARGET from model1, on the response scale.
# NOTE(review): rows with a missing predictor get an NA prediction (see the
# head() output below, e.g. rows where STARS is NA) - consider imputing the
# evaluation predictors before predicting.
data.test$target_pro <- predict(model1, data.test, type='response')

# First 10 rows, including the new `target_pro` column.
head(data.test,10)
##    TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1      NA          5.4          -0.860       0.27         -10.7     0.092
## 2      NA         12.4           0.385      -0.76         -19.7     1.169
## 3      NA          7.2           1.750       0.17         -33.0     0.065
## 4      NA          6.2           0.100       1.80           1.0    -0.179
## 5      NA         11.4           0.210       0.28           1.2     0.038
## 6      NA         17.6           0.040      -1.15           1.4     0.535
## 7      NA         15.5           0.530      -0.53           4.6     1.263
## 8      NA         15.9           1.190       1.14          31.9    -0.299
## 9      NA         11.6           0.320       0.55         -50.9     0.076
## 10     NA          3.8           0.220       0.31          -7.7     0.039
##    FreeSulfurDioxide TotalSulfurDioxide Density   pH Sulphates Alcohol
## 1                 23                398 0.98527 5.02      0.64   12.30
## 2                -37                 68 0.99048 3.37      1.09   16.00
## 3                  9                 76 1.04641 4.61      0.68    8.55
## 4                104                 89 0.98877 3.20      2.11   12.30
## 5                 70                 53 1.02899 2.54     -0.07    4.80
## 6               -250                140 0.95028 3.06     -0.02   11.40
## 7                 10                 17 1.00020 3.07      0.75    8.50
## 8                115                381 1.03416 2.99      0.31   11.40
## 9                 35                 83 1.00020 3.32      2.18   -0.50
## 10                40                129 0.90610 4.72     -0.64   10.90
##    LabelAppeal AcidIndex STARS target_pro
## 1           -1         6    NA         NA
## 2            0         6     2   3.699006
## 3            0         8     1   2.636868
## 4           -1         8     1   2.412248
## 5            0        10    NA         NA
## 6            1         8     4   5.940416
## 7            0        12     3   3.214993
## 8            1         7    NA         NA
## 9            0        12    NA         NA
## 10           0         7    NA         NA
# Save the evaluation data with the prediction column, without row names.
write.csv(data.test, "test_prediction.csv", row.names = FALSE)

Appendix

library(caret)
library(dplyr)
library(psych)
library(corrplot)
library(tidyr)
library(ggplot2)
library(VIM)
library(ResourceSelection)
library(pROC)
library(tidyverse)
library(mice)
library(MASS)







# Read the training data, then discard the first CSV column.
data <- read.csv(file = 'wine-training-data.csv')
data <- data[, -1]

# Column types with sample values, followed by per-column summary statistics.
str(object = data)
summary(object = data)


# Box plots of every column except the first (TARGET); the y-axis is zoomed
# to the 0-100 range.
ggplot(stack(data[, -1]), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 100))

# Histograms (20 bins) of every numeric column except the first, one facet
# per variable with free axis scales.
# `pivot_longer()` replaces the superseded `gather()`; `names_to`/`values_to`
# keep the `key`/`value` column names used by the plot.
# `dplyr::select()` is namespaced because MASS is attached after dplyr and
# masks `select`.
ggplot(
  pivot_longer(
    dplyr::select(data[, -1], where(is.numeric)),
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free')

# Correlation plot of every column except the first; missing values are
# handled with the "na.or.complete" rule of `cor()`.
corrplot(cor(data[, -1], use = "na.or.complete"))




# Map of missing values in the raw training data.
Amelia::missmap(data)

# Impute missing values by predictive mean matching ('pmm'): 5 imputed data
# sets, 10 iterations, fixed seed. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
t <- mice(data, m = 5, maxit = 10, method = 'pmm', seed = 500,
          printFlag = FALSE)
# `mice::` is explicit because tidyr also exports a `complete()`.
# `action = 1` (the default) returns only the first of the 5 imputed sets.
imputed.data <- mice::complete(t, action = 1)

# Map of missing values after imputation.
Amelia::missmap(imputed.data)

# Model 1: Poisson regression of TARGET on all other columns, raw data.
model1 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = data
)
summary(model1)

# Model 2: the same Poisson specification, fitted on the imputed data.
model2 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = imputed.data
)
summary(model2)




# Model 3: negative binomial regression (log link) of TARGET on all other
# columns of the imputed data; `init.theta` is the starting value for theta.
model3 <- glm.nb(
  formula = TARGET ~ .,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model3)

# Model 4: negative binomial regression on a reduced set of seven predictors.
# The unary `+` after `~` is kept so the recorded call stays as it was.
model4 <- glm.nb(
  formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model4)




# Model 5: linear regression of TARGET on all other columns, imputed data.
model5 <- lm(
  formula = TARGET ~ .,
  data = imputed.data
)
summary(model5)

# Model 6: linear regression on the reduced set of seven predictors.
# The unary `+` after `~` is kept so the recorded call stays as it was.
model6 <- lm(
  formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data
)
summary(model6)

# Evaluation data

# Read the evaluation set and drop its first CSV column.
data.test <- read.csv('wine-evaluation-data.csv')[,-1]

# `predict()` returns NA for every row that has a missing predictor, so the
# predictors are imputed first, with the same mice settings as the training
# data. TARGET is left out: it is the quantity being predicted.
eval_predictors <- data.test[, setdiff(names(data.test), "TARGET"),
                             drop = FALSE]
eval_imputed <- mice::complete(
  mice(eval_predictors, m = 5, maxit = 10, method = 'pmm', seed = 500,
       printFlag = FALSE),
  action = 1
)

# Predicted mean TARGET from model1, on the response scale. Observed values
# are untouched by the imputation, so complete rows get the same prediction
# as before; only the former NA predictions are filled in.
data.test$target_pro <- predict(model1, newdata = eval_imputed,
                                type = 'response')

# First 10 rows, including the new `target_pro` column.
head(data.test, 10)

# Save the original evaluation columns plus the predictions, no row names.
write.csv(data.test, "test_prediction.csv", row.names = FALSE)




library(caret)
library(dplyr)
library(psych)
library(corrplot)
library(tidyr)
library(ggplot2)
library(VIM)
library(ResourceSelection)
library(pROC)
library(tidyverse)
library(mice)
library(MASS)

# Training data: read the CSV, then drop its first column.
data <- read.csv(file = 'wine-training-data.csv')
data <- data[, -1]

# Inspect column types and per-column summary statistics.
str(object = data)
summary(object = data)
# Data exploration

# Box plots of every column except the first (TARGET); the y-axis is zoomed
# to the 0-100 range.
ggplot(stack(data[, -1]), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 100))

# Histograms (20 bins) of every numeric column except the first, one facet
# per variable with free axis scales.
# `pivot_longer()` replaces the superseded `gather()`; `names_to`/`values_to`
# keep the `key`/`value` column names used by the plot.
# `dplyr::select()` is namespaced because MASS is attached after dplyr and
# masks `select`.
ggplot(
  pivot_longer(
    dplyr::select(data[, -1], where(is.numeric)),
    cols = everything(),
    names_to = "key",
    values_to = "value"
  ),
  aes(value)
) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = 'free')

# Correlation plot of every column except the first; missing values are
# handled with the "na.or.complete" rule of `cor()`.
corrplot(cor(data[, -1], use = "na.or.complete"))

# Preparation

# Map of missing values in the raw training data.
Amelia::missmap(data)

# Impute missing values by predictive mean matching ('pmm'): 5 imputed data
# sets, 10 iterations, fixed seed. `FALSE` is spelled out because `F` is an
# ordinary variable that can be reassigned.
t <- mice(data, m = 5, maxit = 10, method = 'pmm', seed = 500,
          printFlag = FALSE)
# `mice::` is explicit because tidyr also exports a `complete()`.
# `action = 1` (the default) returns only the first of the 5 imputed sets.
imputed.data <- mice::complete(t, action = 1)

# Map of missing values after imputation.
Amelia::missmap(imputed.data)

# Models

# Model 1: Poisson regression of TARGET on all other columns, raw data.
model1 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = data
)
summary(model1)

# Model 2: the same Poisson specification, fitted on the imputed data.
model2 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = imputed.data
)
summary(model2)

# Model 3: negative binomial regression (log link) of TARGET on all other
# columns of the imputed data; `init.theta` is the starting value for theta.
model3 <- glm.nb(
  formula = TARGET ~ .,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model3)

# Model 4: negative binomial regression on a reduced set of seven predictors.
# The unary `+` after `~` is kept so the recorded call stays as it was.
model4 <- glm.nb(
  formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model4)

# Model 5: linear regression of TARGET on all other columns, imputed data.
model5 <- lm(
  formula = TARGET ~ .,
  data = imputed.data
)
summary(model5)

# Model 6: linear regression on the reduced set of seven predictors.
# The unary `+` after `~` is kept so the recorded call stays as it was.
model6 <- lm(
  formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data
)
summary(model6)

# Testing dataset

# Read the evaluation set and drop its first CSV column.
data.test <- read.csv('wine-evaluation-data.csv')[,-1]

# `predict()` returns NA for every row that has a missing predictor, so the
# predictors are imputed first, with the same mice settings as the training
# data. TARGET is left out: it is the quantity being predicted.
eval_predictors <- data.test[, setdiff(names(data.test), "TARGET"),
                             drop = FALSE]
eval_imputed <- mice::complete(
  mice(eval_predictors, m = 5, maxit = 10, method = 'pmm', seed = 500,
       printFlag = FALSE),
  action = 1
)

# Predicted mean TARGET from model1, on the response scale. Observed values
# are untouched by the imputation, so complete rows get the same prediction
# as before; only the former NA predictions are filled in.
data.test$target_pro <- predict(model1, newdata = eval_imputed,
                                type = 'response')

# First 10 rows, including the new `target_pro` column.
head(data.test, 10)

# Save the original evaluation columns plus the predictions, no row names.
write.csv(data.test, "test_prediction.csv", row.names = FALSE)