library(caret)
library(dplyr)
library(psych)
library(corrplot)
library(tidyr)
library(ggplot2)
library(VIM)
library(ResourceSelection)
library(pROC)
library(tidyverse)
library(mice)
library(MASS)
data <- read.csv('wine-training-data.csv')[,-1]
str(data)## 'data.frame': 12795 obs. of 15 variables:
## $ TARGET : int 3 3 5 3 4 0 0 4 3 6 ...
## $ FixedAcidity : num 3.2 4.5 7.1 5.7 8 11.3 7.7 6.5 14.8 5.5 ...
## $ VolatileAcidity : num 1.16 0.16 2.64 0.385 0.33 0.32 0.29 -1.22 0.27 -0.22 ...
## $ CitricAcid : num -0.98 -0.81 -0.88 0.04 -1.26 0.59 -0.4 0.34 1.05 0.39 ...
## $ ResidualSugar : num 54.2 26.1 14.8 18.8 9.4 ...
## $ Chlorides : num -0.567 -0.425 0.037 -0.425 NA 0.556 0.06 0.04 -0.007 -0.277 ...
## $ FreeSulfurDioxide : num NA 15 214 22 -167 -37 287 523 -213 62 ...
## $ TotalSulfurDioxide: num 268 -327 142 115 108 15 156 551 NA 180 ...
## $ Density : num 0.993 1.028 0.995 0.996 0.995 ...
## $ pH : num 3.33 3.38 3.12 2.24 3.12 3.2 3.49 3.2 4.93 3.09 ...
## $ Sulphates : num -0.59 0.7 0.48 1.83 1.77 1.29 1.21 NA 0.26 0.75 ...
## $ Alcohol : num 9.9 NA 22 6.2 13.7 15.4 10.3 11.6 15 12.6 ...
## $ LabelAppeal : int 0 -1 -1 -1 0 0 0 1 0 0 ...
## $ AcidIndex : int 8 7 8 6 9 11 8 7 6 8 ...
## $ STARS : int 2 3 3 1 2 NA NA 3 NA 4 ...
summary(data)## TARGET FixedAcidity VolatileAcidity CitricAcid
## Min. :0.000 Min. :-18.100 Min. :-2.7900 Min. :-3.2400
## 1st Qu.:2.000 1st Qu.: 5.200 1st Qu.: 0.1300 1st Qu.: 0.0300
## Median :3.000 Median : 6.900 Median : 0.2800 Median : 0.3100
## Mean :3.029 Mean : 7.076 Mean : 0.3241 Mean : 0.3084
## 3rd Qu.:4.000 3rd Qu.: 9.500 3rd Qu.: 0.6400 3rd Qu.: 0.5800
## Max. :8.000 Max. : 34.400 Max. : 3.6800 Max. : 3.8600
##
## ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide
## Min. :-127.800 Min. :-1.1710 Min. :-555.00 Min. :-823.0
## 1st Qu.: -2.000 1st Qu.:-0.0310 1st Qu.: 0.00 1st Qu.: 27.0
## Median : 3.900 Median : 0.0460 Median : 30.00 Median : 123.0
## Mean : 5.419 Mean : 0.0548 Mean : 30.85 Mean : 120.7
## 3rd Qu.: 15.900 3rd Qu.: 0.1530 3rd Qu.: 70.00 3rd Qu.: 208.0
## Max. : 141.150 Max. : 1.3510 Max. : 623.00 Max. :1057.0
## NA's :616 NA's :638 NA's :647 NA's :682
## Density pH Sulphates Alcohol
## Min. :0.8881 Min. :0.480 Min. :-3.1300 Min. :-4.70
## 1st Qu.:0.9877 1st Qu.:2.960 1st Qu.: 0.2800 1st Qu.: 9.00
## Median :0.9945 Median :3.200 Median : 0.5000 Median :10.40
## Mean :0.9942 Mean :3.208 Mean : 0.5271 Mean :10.49
## 3rd Qu.:1.0005 3rd Qu.:3.470 3rd Qu.: 0.8600 3rd Qu.:12.40
## Max. :1.0992 Max. :6.130 Max. : 4.2400 Max. :26.50
## NA's :395 NA's :1210 NA's :653
## LabelAppeal AcidIndex STARS
## Min. :-2.000000 Min. : 4.000 Min. :1.000
## 1st Qu.:-1.000000 1st Qu.: 7.000 1st Qu.:1.000
## Median : 0.000000 Median : 8.000 Median :2.000
## Mean :-0.009066 Mean : 7.773 Mean :2.042
## 3rd Qu.: 1.000000 3rd Qu.: 8.000 3rd Qu.:3.000
## Max. : 2.000000 Max. :17.000 Max. :4.000
## NA's :3359
ggplot(stack(data[,-1]), aes(x = ind, y = values)) +
geom_boxplot() +
coord_cartesian(ylim = c(0, 100))## Warning: Removed 8200 rows containing non-finite values (stat_boxplot).
ggplot(gather(select_if(data[,-1],is.numeric)), aes(value)) +
geom_histogram(bins = 20) +
facet_wrap(~key, scales = 'free')## Warning: Removed 8200 rows containing non-finite values (stat_bin).
corrplot(cor(data[,-1], use = "na.or.complete"))
From the plots we can observe that some of the variables are approximately normally distributed.
For the data preparation, we start by checking whether there are missing observations. We then impute the missing values using the mice package.
Amelia::missmap(data)
t <- mice(data,m=5, maxit = 10, method = 'pmm', seed = 500,printFlag = F)
imputed.data <- complete(t)
Amelia::missmap(imputed.data )
model1 <- glm(formula = TARGET ~ ., family = poisson, data = data)
summary(model1)##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2158 -0.2734 0.0616 0.3732 1.6830
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.593e+00 2.506e-01 6.359 2.03e-10 ***
## FixedAcidity 3.293e-04 1.053e-03 0.313 0.75447
## VolatileAcidity -2.560e-02 8.353e-03 -3.065 0.00218 **
## CitricAcid -7.259e-04 7.575e-03 -0.096 0.92365
## ResidualSugar -6.141e-05 1.941e-04 -0.316 0.75165
## Chlorides -3.007e-02 2.056e-02 -1.463 0.14346
## FreeSulfurDioxide 6.734e-05 4.404e-05 1.529 0.12620
## TotalSulfurDioxide 2.081e-05 2.855e-05 0.729 0.46618
## Density -3.725e-01 2.462e-01 -1.513 0.13026
## pH -4.661e-03 9.598e-03 -0.486 0.62722
## Sulphates -5.164e-03 7.051e-03 -0.732 0.46398
## Alcohol 3.948e-03 1.771e-03 2.229 0.02579 *
## LabelAppeal 1.771e-01 7.954e-03 22.271 < 2e-16 ***
## AcidIndex -4.870e-02 5.903e-03 -8.251 < 2e-16 ***
## STARS 1.871e-01 7.487e-03 24.993 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 5844.1 on 6435 degrees of freedom
## Residual deviance: 4009.1 on 6421 degrees of freedom
## (6359 observations deleted due to missingness)
## AIC: 23172
##
## Number of Fisher Scoring iterations: 5
model2 <- glm(formula = TARGET ~ ., family = poisson, data = imputed.data)
summary(model2)##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = imputed.data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0334 -0.6898 0.1251 0.6348 2.6374
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.486e+00 1.957e-01 7.595 3.07e-14 ***
## FixedAcidity -5.016e-04 8.199e-04 -0.612 0.540647
## VolatileAcidity -3.932e-02 6.511e-03 -6.039 1.55e-09 ***
## CitricAcid 1.143e-02 5.888e-03 1.941 0.052265 .
## ResidualSugar 6.788e-05 1.502e-04 0.452 0.651337
## Chlorides -4.740e-02 1.605e-02 -2.954 0.003137 **
## FreeSulfurDioxide 1.420e-04 3.421e-05 4.151 3.30e-05 ***
## TotalSulfurDioxide 8.325e-05 2.213e-05 3.762 0.000168 ***
## Density -2.933e-01 1.922e-01 -1.526 0.126941
## pH -1.953e-02 7.515e-03 -2.599 0.009349 **
## Sulphates -1.433e-02 5.482e-03 -2.614 0.008953 **
## Alcohol 2.421e-03 1.373e-03 1.763 0.077942 .
## LabelAppeal 1.433e-01 6.087e-03 23.537 < 2e-16 ***
## AcidIndex -9.723e-02 4.518e-03 -21.521 < 2e-16 ***
## STARS 3.381e-01 5.612e-03 60.248 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 22861 on 12794 degrees of freedom
## Residual deviance: 15998 on 12780 degrees of freedom
## AIC: 47970
##
## Number of Fisher Scoring iterations: 5
model3 <- glm.nb(formula = TARGET ~ ., data = imputed.data,init.theta = 60123.4587, link = log)## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model3)##
## Call:
## glm.nb(formula = TARGET ~ ., data = imputed.data, init.theta = 48668.52345,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0334 -0.6898 0.1251 0.6348 2.6374
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.486e+00 1.957e-01 7.595 3.08e-14 ***
## FixedAcidity -5.016e-04 8.199e-04 -0.612 0.540656
## VolatileAcidity -3.932e-02 6.511e-03 -6.039 1.55e-09 ***
## CitricAcid 1.143e-02 5.888e-03 1.941 0.052272 .
## ResidualSugar 6.788e-05 1.502e-04 0.452 0.651327
## Chlorides -4.740e-02 1.605e-02 -2.954 0.003138 **
## FreeSulfurDioxide 1.420e-04 3.421e-05 4.151 3.31e-05 ***
## TotalSulfurDioxide 8.325e-05 2.213e-05 3.762 0.000169 ***
## Density -2.933e-01 1.922e-01 -1.526 0.126948
## pH -1.953e-02 7.515e-03 -2.599 0.009349 **
## Sulphates -1.433e-02 5.482e-03 -2.614 0.008954 **
## Alcohol 2.421e-03 1.373e-03 1.763 0.077957 .
## LabelAppeal 1.433e-01 6.087e-03 23.536 < 2e-16 ***
## AcidIndex -9.723e-02 4.518e-03 -21.520 < 2e-16 ***
## STARS 3.381e-01 5.612e-03 60.247 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(48668.52) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 15998 on 12780 degrees of freedom
## AIC: 47973
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 48669
## Std. Err.: 56093
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -47940.58
model4 <- glm.nb(formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS, data = imputed.data, init.theta = 60123.4587, link = log)## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model4)##
## Call:
## glm.nb(formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide +
## TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS, data = imputed.data,
## init.theta = 48617.87372, link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.0441 -0.6879 0.1229 0.6333 2.6756
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.148e+00 3.787e-02 30.324 < 2e-16 ***
## VolatileAcidity -4.000e-02 6.509e-03 -6.146 7.93e-10 ***
## Chlorides -4.771e-02 1.603e-02 -2.976 0.002923 **
## FreeSulfurDioxide 1.395e-04 3.420e-05 4.080 4.51e-05 ***
## TotalSulfurDioxide 8.270e-05 2.211e-05 3.741 0.000183 ***
## LabelAppeal 1.428e-01 6.085e-03 23.466 < 2e-16 ***
## AcidIndex -9.723e-02 4.441e-03 -21.893 < 2e-16 ***
## STARS 3.397e-01 5.593e-03 60.733 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(48617.87) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 16021 on 12787 degrees of freedom
## AIC: 47982
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 48618
## Std. Err.: 56088
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -47964.31
model5 <- lm(formula = TARGET ~ ., data = imputed.data)
summary(model5)##
## Call:
## lm(formula = TARGET ~ ., data = imputed.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.496 -1.016 0.168 1.032 4.393
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.670e+00 4.800e-01 7.646 2.23e-14 ***
## FixedAcidity -8.296e-04 2.014e-03 -0.412 0.680484
## VolatileAcidity -1.182e-01 1.601e-02 -7.383 1.64e-13 ***
## CitricAcid 3.140e-02 1.456e-02 2.157 0.031034 *
## ResidualSugar 2.263e-04 3.698e-04 0.612 0.540605
## Chlorides -1.484e-01 3.939e-02 -3.767 0.000166 ***
## FreeSulfurDioxide 3.849e-04 8.423e-05 4.569 4.94e-06 ***
## TotalSulfurDioxide 2.249e-04 5.416e-05 4.154 3.30e-05 ***
## Density -8.600e-01 4.723e-01 -1.821 0.068638 .
## pH -4.998e-02 1.847e-02 -2.706 0.006814 **
## Sulphates -3.830e-02 1.347e-02 -2.844 0.004464 **
## Alcohol 1.051e-02 3.361e-03 3.126 0.001775 **
## LabelAppeal 4.361e-01 1.474e-02 29.587 < 2e-16 ***
## AcidIndex -2.407e-01 9.815e-03 -24.528 < 2e-16 ***
## STARS 1.160e+00 1.493e-02 77.687 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.415 on 12780 degrees of freedom
## Multiple R-squared: 0.4607, Adjusted R-squared: 0.4601
## F-statistic: 779.9 on 14 and 12780 DF, p-value: < 2.2e-16
model6 <- lm(formula = TARGET ~ + VolatileAcidity + Chlorides + FreeSulfurDioxide + TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,data= imputed.data)
summary(model6)##
## Call:
## lm(formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide +
## TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS, data = imputed.data)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5035 -1.0269 0.1687 1.0318 4.4570
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.742e+00 8.541e-02 32.100 < 2e-16 ***
## VolatileAcidity -1.197e-01 1.601e-02 -7.474 8.31e-14 ***
## Chlorides -1.502e-01 3.941e-02 -3.810 0.00014 ***
## FreeSulfurDioxide 3.761e-04 8.428e-05 4.462 8.19e-06 ***
## TotalSulfurDioxide 2.241e-04 5.418e-05 4.136 3.56e-05 ***
## LabelAppeal 4.350e-01 1.475e-02 29.484 < 2e-16 ***
## AcidIndex -2.408e-01 9.619e-03 -25.036 < 2e-16 ***
## STARS 1.165e+00 1.490e-02 78.157 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.417 on 12787 degrees of freedom
## Multiple R-squared: 0.4593, Adjusted R-squared: 0.459
## F-statistic: 1552 on 7 and 12787 DF, p-value: < 2.2e-16
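Model selection is based on the AIC results: model 1 has the lowest AIC (23172) in comparison with the other models. Note, however, that model 1 was fit on only the 6,436 complete cases (6,359 observations were deleted due to missingness), whereas models 2-6 were fit on all 12,795 imputed observations, so its AIC is not directly comparable with theirs.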
# evaluation data
data.test <- read.csv('wine-evaluation-data.csv')[,-1]
data.test$target_pro <- predict(model1, data.test, type='response')
head(data.test,10)## TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1 NA 5.4 -0.860 0.27 -10.7 0.092
## 2 NA 12.4 0.385 -0.76 -19.7 1.169
## 3 NA 7.2 1.750 0.17 -33.0 0.065
## 4 NA 6.2 0.100 1.80 1.0 -0.179
## 5 NA 11.4 0.210 0.28 1.2 0.038
## 6 NA 17.6 0.040 -1.15 1.4 0.535
## 7 NA 15.5 0.530 -0.53 4.6 1.263
## 8 NA 15.9 1.190 1.14 31.9 -0.299
## 9 NA 11.6 0.320 0.55 -50.9 0.076
## 10 NA 3.8 0.220 0.31 -7.7 0.039
## FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol
## 1 23 398 0.98527 5.02 0.64 12.30
## 2 -37 68 0.99048 3.37 1.09 16.00
## 3 9 76 1.04641 4.61 0.68 8.55
## 4 104 89 0.98877 3.20 2.11 12.30
## 5 70 53 1.02899 2.54 -0.07 4.80
## 6 -250 140 0.95028 3.06 -0.02 11.40
## 7 10 17 1.00020 3.07 0.75 8.50
## 8 115 381 1.03416 2.99 0.31 11.40
## 9 35 83 1.00020 3.32 2.18 -0.50
## 10 40 129 0.90610 4.72 -0.64 10.90
## LabelAppeal AcidIndex STARS target_pro
## 1 -1 6 NA NA
## 2 0 6 2 3.699006
## 3 0 8 1 2.636868
## 4 -1 8 1 2.412248
## 5 0 10 NA NA
## 6 1 8 4 5.940416
## 7 0 12 3 3.214993
## 8 1 7 NA NA
## 9 0 12 NA NA
## 10 0 7 NA NA
write.csv(data.test, "test_prediction.csv", row.names = FALSE)
library(caret)
library(dplyr)
library(psych)
library(corrplot)
library(tidyr)
library(ggplot2)
library(VIM)
library(ResourceSelection)
library(pROC)
library(tidyverse)
library(mice)
library(MASS)
# Load the training set, dropping the first CSV column by position
# (presumably a row identifier -- TODO confirm against the file).
data <- read.csv("wine-training-data.csv")[, -1]
str(data)
summary(data)

# With the first CSV column gone, column 1 of `data` is TARGET, so dropping it
# leaves only the predictors for the exploratory plots below.
predictors <- data[, -1]

# Box plot of every predictor on one shared axis. coord_cartesian() zooms the
# y axis to 0-100 without removing the observations outside that window.
ggplot(stack(predictors), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 100))

# One histogram per numeric predictor. pivot_longer() replaces the superseded
# gather(); names_to/values_to reproduce gather()'s default `key`/`value`
# columns. select() is namespace-qualified because MASS is attached after
# dplyr and masks it.
numeric_long <- pivot_longer(
  dplyr::select(predictors, where(is.numeric)),
  cols = everything(),
  names_to = "key",
  values_to = "value"
)
ggplot(numeric_long, aes(value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = "free")

# Pairwise correlations between predictors, computed on complete rows only.
corrplot(cor(predictors, use = "na.or.complete"))
# Visualise the missingness pattern before imputation.
Amelia::missmap(data)

# Multiple imputation by predictive mean matching: 5 imputed data sets,
# 10 iterations, fixed seed so the run is reproducible. `FALSE` is spelled out
# because `F` is an ordinary variable that can be reassigned.
# NOTE(review): the name `t` shadows base::t(); it is kept so that any later
# reference to `t` keeps working.
t <- mice(
  data,
  m = 5,
  maxit = 10,
  method = "pmm",
  seed = 500,
  printFlag = FALSE
)

# complete() is exported by both tidyr and mice, so a bare call depends on the
# order the packages were attached; qualify it. With no `action` argument it
# returns the first of the imputed data sets.
imputed.data <- mice::complete(t)

# Confirm that no missing cells remain after imputation.
Amelia::missmap(imputed.data)
# Model 1: Poisson regression on the raw data with every predictor. Rows with
# any missing value are dropped by glm()'s default NA handling.
model1 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = data
)
summary(model1)

# Model 2: the same Poisson specification, fit on the imputed data.
model2 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = imputed.data
)
summary(model2)

# Model 3: negative binomial with every predictor, fit on the imputed data.
# init.theta is a hard-coded starting value for the dispersion parameter.
model3 <- glm.nb(
  formula = TARGET ~ .,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model3)

# Model 4: negative binomial on a reduced predictor set. The leading unary `+`
# in the formula is kept so the stored call is unchanged.
model4 <- glm.nb(
  formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model4)

# Model 5: ordinary least squares with every predictor, on the imputed data.
model5 <- lm(
  formula = TARGET ~ .,
  data = imputed.data
)
summary(model5)

# Model 6: ordinary least squares on the same reduced predictor set as model 4.
model6 <- lm(
  formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data
)
summary(model6)
# Evaluation data: score the hold-out file and write the predictions.
data.test <- read.csv("wine-evaluation-data.csv")[, -1]

# predict() returns NA for every row that has a missing predictor, so the
# evaluation predictors are imputed first with the same mice settings used for
# the training data. TARGET is left out: it is the unknown response and is not
# needed to compute predictions.
# NOTE(review): the evaluation set is imputed on its own here; confirm that is
# acceptable versus reusing the training imputation model.
eval_predictors <- data.test[, setdiff(names(data.test), "TARGET"), drop = FALSE]
eval_imputation <- mice(
  eval_predictors,
  m = 5,
  maxit = 10,
  method = "pmm",
  seed = 500,
  printFlag = FALSE
)
imputed.test <- mice::complete(eval_imputation)

# Score with model2, the Poisson model fit on the imputed training data, so
# the model is applied to the same kind of input it was estimated on. model1
# was fit on complete cases only; its AIC comes from a smaller sample and
# cannot be compared with the models fit on all rows.
data.test$target_pro <- predict(model2, newdata = imputed.test, type = "response")

if (anyNA(data.test$target_pro)) {
  warning(
    "Some evaluation rows still have no prediction after imputation.",
    call. = FALSE
  )
}

head(data.test, 10)

# The written file keeps the original (un-imputed) columns plus `target_pro`.
write.csv(data.test, "test_prediction.csv", row.names = FALSE)
library(caret)
library(dplyr)
library(psych)
library(corrplot)
library(tidyr)
library(ggplot2)
library(VIM)
library(ResourceSelection)
library(pROC)
library(tidyverse)
library(mice)
library(MASS)
# Load the training set, dropping the first CSV column by position
# (presumably a row identifier -- TODO confirm against the file).
data <- read.csv("wine-training-data.csv")[, -1]
str(data)
summary(data)

# Data exploration
# With the first CSV column gone, column 1 of `data` is TARGET, so dropping it
# leaves only the predictors for the exploratory plots below.
predictors <- data[, -1]

# Box plot of every predictor on one shared axis. coord_cartesian() zooms the
# y axis to 0-100 without removing the observations outside that window.
ggplot(stack(predictors), aes(x = ind, y = values)) +
  geom_boxplot() +
  coord_cartesian(ylim = c(0, 100))

# One histogram per numeric predictor. pivot_longer() replaces the superseded
# gather(); names_to/values_to reproduce gather()'s default `key`/`value`
# columns. select() is namespace-qualified because MASS is attached after
# dplyr and masks it.
numeric_long <- pivot_longer(
  dplyr::select(predictors, where(is.numeric)),
  cols = everything(),
  names_to = "key",
  values_to = "value"
)
ggplot(numeric_long, aes(value)) +
  geom_histogram(bins = 20) +
  facet_wrap(~key, scales = "free")

# Pairwise correlations between predictors, computed on complete rows only.
corrplot(cor(predictors, use = "na.or.complete"))
# Data preparation
# Visualise the missingness pattern before imputation.
Amelia::missmap(data)

# Multiple imputation by predictive mean matching: 5 imputed data sets,
# 10 iterations, fixed seed so the run is reproducible. `FALSE` is spelled out
# because `F` is an ordinary variable that can be reassigned.
# NOTE(review): the name `t` shadows base::t(); it is kept so that any later
# reference to `t` keeps working.
t <- mice(
  data,
  m = 5,
  maxit = 10,
  method = "pmm",
  seed = 500,
  printFlag = FALSE
)

# complete() is exported by both tidyr and mice, so a bare call depends on the
# order the packages were attached; qualify it. With no `action` argument it
# returns the first of the imputed data sets.
imputed.data <- mice::complete(t)

# Confirm that no missing cells remain after imputation.
Amelia::missmap(imputed.data)
# Models
# Model 1: Poisson regression on the raw data with every predictor. Rows with
# any missing value are dropped by glm()'s default NA handling.
model1 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = data
)
summary(model1)

# Model 2: the same Poisson specification, fit on the imputed data.
model2 <- glm(
  formula = TARGET ~ .,
  family = poisson,
  data = imputed.data
)
summary(model2)

# Model 3: negative binomial with every predictor, fit on the imputed data.
# init.theta is a hard-coded starting value for the dispersion parameter.
model3 <- glm.nb(
  formula = TARGET ~ .,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model3)

# Model 4: negative binomial on a reduced predictor set. The leading unary `+`
# in the formula is kept so the stored call is unchanged.
model4 <- glm.nb(
  formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data,
  init.theta = 60123.4587,
  link = log
)
summary(model4)

# Model 5: ordinary least squares with every predictor, on the imputed data.
model5 <- lm(
  formula = TARGET ~ .,
  data = imputed.data
)
summary(model5)

# Model 6: ordinary least squares on the same reduced predictor set as model 4.
model6 <- lm(
  formula = TARGET ~ +VolatileAcidity + Chlorides + FreeSulfurDioxide +
    TotalSulfurDioxide + LabelAppeal + AcidIndex + STARS,
  data = imputed.data
)
summary(model6)
# Testing dataset: score the hold-out file and write the predictions.
data.test <- read.csv("wine-evaluation-data.csv")[, -1]

# predict() returns NA for every row that has a missing predictor, so the
# evaluation predictors are imputed first with the same mice settings used for
# the training data. TARGET is left out: it is the unknown response and is not
# needed to compute predictions.
# NOTE(review): the evaluation set is imputed on its own here; confirm that is
# acceptable versus reusing the training imputation model.
eval_predictors <- data.test[, setdiff(names(data.test), "TARGET"), drop = FALSE]
eval_imputation <- mice(
  eval_predictors,
  m = 5,
  maxit = 10,
  method = "pmm",
  seed = 500,
  printFlag = FALSE
)
imputed.test <- mice::complete(eval_imputation)

# Score with model2, the Poisson model fit on the imputed training data, so
# the model is applied to the same kind of input it was estimated on. model1
# was fit on complete cases only; its AIC comes from a smaller sample and
# cannot be compared with the models fit on all rows.
data.test$target_pro <- predict(model2, newdata = imputed.test, type = "response")

if (anyNA(data.test$target_pro)) {
  warning(
    "Some evaluation rows still have no prediction after imputation.",
    call. = FALSE
  )
}

head(data.test, 10)

# The written file keeps the original (un-imputed) columns plus `target_pro`.
write.csv(data.test, "test_prediction.csv", row.names = FALSE)