A large wine manufacturer is studying the data in order to predict the number of wine cases ordered based upon the wine characteristics. If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust their wine offering to maximize sales.
Your objective is to build a count regression model to predict the number of cases of wine that will be sold given certain properties of the wine.
knitr::opts_chunk$set(echo = TRUE)
knitr::opts_chunk$set(message = FALSE)
knitr::opts_chunk$set(warning = FALSE)
library(dplyr)
library(reshape)
library(ggplot2)
library(purrr)
library(psych)
library(tidyr)
library(MASS)
library(rpart.plot)
library(gridExtra)
library(forecast)
library(fpp2)
library(fma)
library(kableExtra)
library(e1071)
library(mlbench)
library(DataExplorer)
library(timeDate)
library(caret)
library(GGally)
library(corrplot)
library(RColorBrewer)
library(tibble)
library(tidyr)
library(tidyverse)
library(dplyr)
library(reshape2)
library(mixtools)
library(tidymodels)
library(ggpmisc)
library(regclass)
library(skimr)
Load the wine training data
# load data
data <- read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_5/wine-training-data.csv")
head(data,2)
## ï..INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1 1 3 3.2 1.16 -0.98 54.2
## 2 2 3 4.5 0.16 -0.81 26.1
## Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol
## 1 -0.567 NA 268 0.99280 3.33 -0.59 9.9
## 2 -0.425 15 -327 1.02792 3.38 0.70 NA
## LabelAppeal AcidIndex STARS
## 1 0 8 2
## 2 -1 7 3
Load the wine evaluation data {{r setup, include=TRUE, cache = FALSE}} # load data eval_data <- read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_5/wine-evaluation-data.csv") head(eval_data,2)
instances = dim(data)[1]
features = dim(data)[2]
cat('Dataset for this assignment is', instances, 'instances and', features, 'features')
## Dataset for this assignment is 12795 instances and 16 features
map(data, ~sum(is.na(.))) %>% t()
## ï..INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## [1,] 0 0 0 0 0 616
## Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates
## [1,] 638 647 682 0 395 1210
## Alcohol LabelAppeal AcidIndex STARS
## [1,] 653 0 0 3359
describe(data)
## vars n mean sd median trimmed mad min
## ï..INDEX 1 12795 8069.98 4656.91 8110.00 8071.03 5977.84 1.00
## TARGET 2 12795 3.03 1.93 3.00 3.05 1.48 0.00
## FixedAcidity 3 12795 7.08 6.32 6.90 7.07 3.26 -18.10
## VolatileAcidity 4 12795 0.32 0.78 0.28 0.32 0.43 -2.79
## CitricAcid 5 12795 0.31 0.86 0.31 0.31 0.42 -3.24
## ResidualSugar 6 12179 5.42 33.75 3.90 5.58 15.72 -127.80
## Chlorides 7 12157 0.05 0.32 0.05 0.05 0.13 -1.17
## FreeSulfurDioxide 8 12148 30.85 148.71 30.00 30.93 56.34 -555.00
## TotalSulfurDioxide 9 12113 120.71 231.91 123.00 120.89 134.92 -823.00
## Density 10 12795 0.99 0.03 0.99 0.99 0.01 0.89
## pH 11 12400 3.21 0.68 3.20 3.21 0.39 0.48
## Sulphates 12 11585 0.53 0.93 0.50 0.53 0.44 -3.13
## Alcohol 13 12142 10.49 3.73 10.40 10.50 2.37 -4.70
## LabelAppeal 14 12795 -0.01 0.89 0.00 -0.01 1.48 -2.00
## AcidIndex 15 12795 7.77 1.32 8.00 7.64 1.48 4.00
## STARS 16 9436 2.04 0.90 2.00 1.97 1.48 1.00
## max range skew kurtosis se
## ï..INDEX 16129.00 16128.00 0.00 -1.20 41.17
## TARGET 8.00 8.00 -0.33 -0.88 0.02
## FixedAcidity 34.40 52.50 -0.02 1.67 0.06
## VolatileAcidity 3.68 6.47 0.02 1.83 0.01
## CitricAcid 3.86 7.10 -0.05 1.84 0.01
## ResidualSugar 141.15 268.95 -0.05 1.88 0.31
## Chlorides 1.35 2.52 0.03 1.79 0.00
## FreeSulfurDioxide 623.00 1178.00 0.01 1.84 1.35
## TotalSulfurDioxide 1057.00 1880.00 -0.01 1.67 2.11
## Density 1.10 0.21 -0.02 1.90 0.00
## pH 6.13 5.65 0.04 1.65 0.01
## Sulphates 4.24 7.37 0.01 1.75 0.01
## Alcohol 26.50 31.20 -0.03 1.54 0.03
## LabelAppeal 2.00 4.00 0.01 -0.26 0.01
## AcidIndex 17.00 13.00 1.65 5.19 0.01
## STARS 4.00 3.00 0.45 -0.69 0.01
meltData <- melt(data)
boxplot(data=meltData, value~variable)
p <- ggplot(meltData, aes(factor(variable), value))
p + geom_boxplot() + facet_wrap(~variable, scale="free")
res_vector <- vector()
colnames_vector <- colnames(data)
for(i in 1:ncol(data)) {
res <- cor.test(data[ , i], data$TARGET,
method = "pearson")
res_round = round(res$estimate,4)
res_vector <- c(res_vector, res_round)
}
corrdf <- data.frame(colnames_vector, res_vector)
corrdf_sort <- corrdf %>% arrange(desc(res_vector))
#Remove TARGET from correlation df
corrdf_sort = corrdf_sort[-c(1),]
corrdf_sort
## colnames_vector res_vector
## 2 STARS 0.5588
## 3 LabelAppeal 0.3565
## 4 Alcohol 0.0621
## 5 TotalSulfurDioxide 0.0515
## 6 FreeSulfurDioxide 0.0438
## 7 ResidualSugar 0.0165
## 8 CitricAcid 0.0087
## 9 ï..INDEX 0.0013
## 10 pH -0.0094
## 11 Density -0.0355
## 12 Chlorides -0.0383
## 13 Sulphates -0.0388
## 14 FixedAcidity -0.0490
## 15 VolatileAcidity -0.0888
## 16 AcidIndex -0.2460
#Plot the correlations
ggplot(corrdf_sort, aes(x=reorder(colnames_vector,res_vector), y=res_vector)) +
geom_bar(stat="identity") +
theme(axis.text.x = element_text(angle = 90, size = 10))
#Fill in missing values in training data
data_mv <- COPY<-data.frame(data)
data_mv = data_mv %>% replace_na(list(STARS = 0))
for(i in 1:ncol(data_mv)){
data_mv[is.na(data_mv[,i]), i] <- mean(data_mv[,i], na.rm = TRUE)
}
#Fill in missing values in evaluation data
eval_data<-read.csv("https://raw.githubusercontent.com/akarimhammoud/Data_621/main/Assignment_5/wine-evaluation-data.csv")
eval_data_mv <- COPY<-data.frame(eval_data)
eval_data_mv = eval_data_mv %>% replace_na(list(STARS = 0))
for(i in 1:ncol(eval_data_mv)){
eval_data_mv[is.na(eval_data_mv[,i]), i] <- mean(eval_data_mv[,i], na.rm = TRUE)
}
#Apply scalarization to training and evaluation data
data_mv_scaled = scale(data_mv)
eval_data_mv_scaled = scale(eval_data_mv, center=attr(data_mv_scaled, "scaled:center"),
scale=attr(data_mv_scaled, "scaled:scale"))
Output training and evaluation csv’s with missing values replaced, and csv’s that are scaled
#Training
write.csv(data_mv, "wine-training-data-mv.csv")
write.csv(data_mv_scaled, "wine-training-data-mv-scaled.csv")
#Evaluation
write.csv(eval_data_mv, "wine-evaluation-data-mv.csv")
write.csv(eval_data_mv_scaled, "wine-evaluation-data-mv-scaled.csv")
model1 = glm(TARGET ~ ., data=data_mv, family=poisson)
summary(model1)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = data_mv)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9721 -0.7208 0.0698 0.5794 3.2293
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.522e+00 1.958e-01 7.773 7.67e-15 ***
## ï..INDEX 4.027e-07 1.093e-06 0.368 0.712530
## FixedAcidity -3.053e-04 8.204e-04 -0.372 0.709791
## VolatileAcidity -3.342e-02 6.516e-03 -5.129 2.91e-07 ***
## CitricAcid 7.754e-03 5.892e-03 1.316 0.188210
## ResidualSugar 5.654e-05 1.546e-04 0.366 0.714620
## Chlorides -4.143e-02 1.645e-02 -2.519 0.011779 *
## FreeSulfurDioxide 1.253e-04 3.512e-05 3.567 0.000361 ***
## TotalSulfurDioxide 8.292e-05 2.275e-05 3.645 0.000268 ***
## Density -2.816e-01 1.920e-01 -1.467 0.142448
## pH -1.571e-02 7.638e-03 -2.057 0.039701 *
## Sulphates -1.267e-02 5.749e-03 -2.203 0.027584 *
## Alcohol 2.200e-03 1.410e-03 1.561 0.118607
## LabelAppeal 1.332e-01 6.064e-03 21.958 < 2e-16 ***
## AcidIndex -8.705e-02 4.548e-03 -19.138 < 2e-16 ***
## STARS 3.113e-01 4.532e-03 68.696 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 22861 on 12794 degrees of freedom
## Residual deviance: 14728 on 12779 degrees of freedom
## AIC: 46702
##
## Number of Fisher Scoring iterations: 5
require(gridExtra)
require(ggplot2)
plot(model1)
model2 = glm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol-LabelAppeal,AcidIndex,STARS,data = data_mv, family=poisson)
summary(model2)
##
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density -
## pH - Sulphates - Alcohol - LabelAppeal, family = poisson,
## data = data_mv, weights = AcidIndex, subset = STARS)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## 0.000e+00 5.162e-08 5.575e-08 5.960e-08 5.960e-08
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -16.55166 0.62754 -26.38 <2e-16 ***
## ï..INDEX 0.87519 0.02910 30.08 <2e-16 ***
## VolatileAcidity -1.42888 0.05741 -24.89 <2e-16 ***
## AcidIndex 2.30407 0.08293 27.79 <2e-16 ***
## STARS NA NA NA NA
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 1.4009e+04 on 9435 degrees of freedom
## Residual deviance: 1.5677e-11 on 9432 degrees of freedom
## AIC: 220163
##
## Number of Fisher Scoring iterations: 4
plot(model2)
data_scale_train<-data.frame(data_mv_scaled)
model3 <- glm.nb(TARGET ~ ., data = data_mv)
summary(model3)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = data_mv, init.theta = 48971.50111,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9720 -0.7207 0.0698 0.5793 3.2292
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.522e+00 1.958e-01 7.773 7.68e-15 ***
## ï..INDEX 4.027e-07 1.093e-06 0.368 0.712564
## FixedAcidity -3.053e-04 8.205e-04 -0.372 0.709793
## VolatileAcidity -3.342e-02 6.516e-03 -5.129 2.91e-07 ***
## CitricAcid 7.754e-03 5.893e-03 1.316 0.188223
## ResidualSugar 5.655e-05 1.546e-04 0.366 0.714606
## Chlorides -4.143e-02 1.645e-02 -2.519 0.011780 *
## FreeSulfurDioxide 1.253e-04 3.512e-05 3.567 0.000361 ***
## TotalSulfurDioxide 8.292e-05 2.275e-05 3.645 0.000268 ***
## Density -2.816e-01 1.920e-01 -1.467 0.142455
## pH -1.571e-02 7.638e-03 -2.057 0.039700 *
## Sulphates -1.267e-02 5.749e-03 -2.203 0.027585 *
## Alcohol 2.200e-03 1.410e-03 1.561 0.118629
## LabelAppeal 1.332e-01 6.065e-03 21.957 < 2e-16 ***
## AcidIndex -8.705e-02 4.549e-03 -19.137 < 2e-16 ***
## STARS 3.113e-01 4.532e-03 68.694 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(48971.5) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 14728 on 12779 degrees of freedom
## AIC: 46704
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 48972
## Std. Err.: 50707
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -46670.37
plot(model3)
### Negative binomial model with data_mv dataset with significant values.
model4 <- glm.nb(TARGET~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + Alcohol +
as.factor(LabelAppeal) +
as.factor(AcidIndex) +
as.factor(STARS), data = data_mv)
summary(model4)
##
## Call:
## glm.nb(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide +
## TotalSulfurDioxide + Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) +
## as.factor(STARS), data = data_mv, init.theta = 40912.6004,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2463 -0.6517 -0.0030 0.4394 3.6953
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.408e-02 3.192e-01 -0.107 0.91497
## VolatileAcidity -2.990e-02 6.533e-03 -4.576 4.74e-06 ***
## FreeSulfurDioxide 8.883e-05 3.502e-05 2.537 0.01119 *
## TotalSulfurDioxide 7.221e-05 2.274e-05 3.176 0.00149 **
## Alcohol 4.417e-03 1.410e-03 3.134 0.00173 **
## as.factor(LabelAppeal)-1 2.400e-01 3.800e-02 6.316 2.68e-10 ***
## as.factor(LabelAppeal)0 4.302e-01 3.706e-02 11.608 < 2e-16 ***
## as.factor(LabelAppeal)1 5.633e-01 3.771e-02 14.937 < 2e-16 ***
## as.factor(LabelAppeal)2 6.983e-01 4.245e-02 16.452 < 2e-16 ***
## as.factor(AcidIndex)5 -1.537e-01 3.224e-01 -0.477 0.63346
## as.factor(AcidIndex)6 -1.103e-01 3.169e-01 -0.348 0.72775
## as.factor(AcidIndex)7 -1.434e-01 3.167e-01 -0.453 0.65056
## as.factor(AcidIndex)8 -1.747e-01 3.167e-01 -0.552 0.58120
## as.factor(AcidIndex)9 -2.851e-01 3.170e-01 -0.899 0.36850
## as.factor(AcidIndex)10 -4.428e-01 3.181e-01 -1.392 0.16390
## as.factor(AcidIndex)11 -8.058e-01 3.216e-01 -2.505 0.01223 *
## as.factor(AcidIndex)12 -8.195e-01 3.273e-01 -2.504 0.01228 *
## as.factor(AcidIndex)13 -6.554e-01 3.302e-01 -1.985 0.04716 *
## as.factor(AcidIndex)14 -7.538e-01 3.428e-01 -2.199 0.02787 *
## as.factor(AcidIndex)15 -2.993e-01 4.035e-01 -0.742 0.45829
## as.factor(AcidIndex)16 -9.533e-01 5.480e-01 -1.740 0.08194 .
## as.factor(AcidIndex)17 -1.204e+00 5.481e-01 -2.198 0.02798 *
## as.factor(STARS)1 7.566e-01 1.957e-02 38.668 < 2e-16 ***
## as.factor(STARS)2 1.075e+00 1.826e-02 58.862 < 2e-16 ***
## as.factor(STARS)3 1.194e+00 1.923e-02 62.074 < 2e-16 ***
## as.factor(STARS)4 1.315e+00 2.433e-02 54.034 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(40912.6) family taken to be 1)
##
## Null deviance: 22860 on 12794 degrees of freedom
## Residual deviance: 13543 on 12769 degrees of freedom
## AIC: 45540
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 40913
## Std. Err.: 34296
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -45486.34
model5 <- lm(TARGET ~ ., data = data_scale_train)
summary(model5)
##
## Call:
## lm(formula = TARGET ~ ., data = data_scale_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.35962 -0.49243 0.03446 0.47059 3.10806
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -3.315e-16 6.077e-03 0.000 1.000000
## ï..INDEX 2.426e-03 6.080e-03 0.399 0.689881
## FixedAcidity -8.174e-07 6.180e-03 0.000 0.999894
## VolatileAcidity -4.036e-02 6.097e-03 -6.621 3.72e-11 ***
## CitricAcid 9.320e-03 6.096e-03 1.529 0.126343
## ResidualSugar 3.430e-03 6.083e-03 0.564 0.572851
## Chlorides -2.003e-02 6.086e-03 -3.291 0.000999 ***
## FreeSulfurDioxide 2.370e-02 6.088e-03 3.893 9.95e-05 ***
## TotalSulfurDioxide 2.649e-02 6.092e-03 4.349 1.38e-05 ***
## Density -1.101e-02 6.088e-03 -1.809 0.070510 .
## pH -1.198e-02 6.092e-03 -1.966 0.049318 *
## Sulphates -1.507e-02 6.086e-03 -2.477 0.013267 *
## Alcohol 2.064e-02 6.097e-03 3.386 0.000711 ***
## LabelAppeal 2.001e-01 6.323e-03 31.643 < 2e-16 ***
## AcidIndex -1.432e-01 6.332e-03 -22.613 < 2e-16 ***
## STARS 6.016e-01 6.440e-03 93.426 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6874 on 12779 degrees of freedom
## Multiple R-squared: 0.528, Adjusted R-squared: 0.5275
## F-statistic: 953.1 on 15 and 12779 DF, p-value: < 2.2e-16
plot(model5)
### linear model with scaled data with significant variables.
model6 <- lm(TARGET~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + Alcohol +
as.factor(LabelAppeal) +
as.factor(AcidIndex) +
as.factor(STARS), data = data_scale_train)
summary(model6)
##
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide +
## Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) +
## as.factor(STARS), data = data_scale_train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.5946 -0.4487 0.0204 0.4337 3.1649
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.070741 0.391988 -2.732 0.006312
## VolatileAcidity -0.038595 0.006006 -6.426 1.35e-10
## FreeSulfurDioxide 0.020122 0.005994 3.357 0.000791
## TotalSulfurDioxide 0.025044 0.006000 4.174 3.02e-05
## Alcohol 0.026496 0.006010 4.408 1.05e-05
## as.factor(LabelAppeal)-1.11204793733397 0.190666 0.032584 5.852 4.99e-09
## as.factor(LabelAppeal)0.0101741115806247 0.433759 0.031771 13.653 < 2e-16
## as.factor(LabelAppeal)1.13239616049522 0.675266 0.033196 20.342 < 2e-16
## as.factor(LabelAppeal)2.25461820940981 0.979510 0.043717 22.406 < 2e-16
## as.factor(AcidIndex)-2.09431867252628 -0.160631 0.398565 -0.403 0.686937
## as.factor(AcidIndex)-1.33898965894075 -0.102980 0.391322 -0.263 0.792432
## as.factor(AcidIndex)-0.583660645355226 -0.154375 0.390955 -0.395 0.692947
## as.factor(AcidIndex)0.171668368230302 -0.209756 0.390974 -0.536 0.591625
## as.factor(AcidIndex)0.926997381815829 -0.367250 0.391228 -0.939 0.347897
## as.factor(AcidIndex)1.68232639540136 -0.528477 0.391854 -1.349 0.177471
## as.factor(AcidIndex)2.43765540898688 -0.775551 0.393055 -1.973 0.048502
## as.factor(AcidIndex)3.19298442257241 -0.783829 0.395301 -1.983 0.047404
## as.factor(AcidIndex)3.94831343615794 -0.787984 0.399159 -1.974 0.048391
## as.factor(AcidIndex)4.70364244974347 -0.712553 0.403105 -1.768 0.077142
## as.factor(AcidIndex)5.45897146332899 -0.321913 0.458248 -0.702 0.482388
## as.factor(AcidIndex)6.21430047691452 -0.886853 0.494284 -1.794 0.072802
## as.factor(AcidIndex)6.96962949050005 -0.985757 0.467183 -2.110 0.034877
## as.factor(STARS)-0.42623524866846 0.701610 0.017091 41.050 < 2e-16
## as.factor(STARS)0.416552574962037 1.239157 0.016619 74.563 < 2e-16
## as.factor(STARS)1.25934039859254 1.531710 0.019240 79.611 < 2e-16
## as.factor(STARS)2.10212822222303 1.888885 0.030710 61.507 < 2e-16
##
## (Intercept) **
## VolatileAcidity ***
## FreeSulfurDioxide ***
## TotalSulfurDioxide ***
## Alcohol ***
## as.factor(LabelAppeal)-1.11204793733397 ***
## as.factor(LabelAppeal)0.0101741115806247 ***
## as.factor(LabelAppeal)1.13239616049522 ***
## as.factor(LabelAppeal)2.25461820940981 ***
## as.factor(AcidIndex)-2.09431867252628
## as.factor(AcidIndex)-1.33898965894075
## as.factor(AcidIndex)-0.583660645355226
## as.factor(AcidIndex)0.171668368230302
## as.factor(AcidIndex)0.926997381815829
## as.factor(AcidIndex)1.68232639540136
## as.factor(AcidIndex)2.43765540898688 *
## as.factor(AcidIndex)3.19298442257241 *
## as.factor(AcidIndex)3.94831343615794 *
## as.factor(AcidIndex)4.70364244974347 .
## as.factor(AcidIndex)5.45897146332899
## as.factor(AcidIndex)6.21430047691452 .
## as.factor(AcidIndex)6.96962949050005 *
## as.factor(STARS)-0.42623524866846 ***
## as.factor(STARS)0.416552574962037 ***
## as.factor(STARS)1.25934039859254 ***
## as.factor(STARS)2.10212822222303 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6766 on 12769 degrees of freedom
## Multiple R-squared: 0.5432, Adjusted R-squared: 0.5423
## F-statistic: 607.3 on 25 and 12769 DF, p-value: < 2.2e-16
plot(model6)
### linear model without the scaled value(data_mv dataset).
model7 <- lm(TARGET ~ ., data = data_mv)
summary(model7)
##
## Call:
## lm(formula = TARGET ~ ., data = data_mv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.5455 -0.9486 0.0664 0.9065 5.9873
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.976e+00 4.495e-01 8.845 < 2e-16 ***
## ï..INDEX 1.004e-06 2.515e-06 0.399 0.689881
## FixedAcidity -2.492e-07 1.885e-03 0.000 0.999894
## VolatileAcidity -9.917e-02 1.498e-02 -6.621 3.72e-11 ***
## CitricAcid 2.083e-02 1.362e-02 1.529 0.126343
## ResidualSugar 2.007e-04 3.559e-04 0.564 0.572851
## Chlorides -1.243e-01 3.777e-02 -3.291 0.000999 ***
## FreeSulfurDioxide 3.151e-04 8.093e-05 3.893 9.95e-05 ***
## TotalSulfurDioxide 2.262e-04 5.201e-05 4.349 1.38e-05 ***
## Density -7.993e-01 4.419e-01 -1.809 0.070510 .
## pH -3.448e-02 1.754e-02 -1.966 0.049318 *
## Sulphates -3.274e-02 1.322e-02 -2.477 0.013267 *
## Alcohol 1.095e-02 3.234e-03 3.386 0.000711 ***
## LabelAppeal 4.325e-01 1.367e-02 31.643 < 2e-16 ***
## AcidIndex -2.083e-01 9.213e-03 -22.613 < 2e-16 ***
## STARS 9.768e-01 1.046e-02 93.426 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.324 on 12779 degrees of freedom
## Multiple R-squared: 0.528, Adjusted R-squared: 0.5275
## F-statistic: 953.1 on 15 and 12779 DF, p-value: < 2.2e-16
plot(model7)
### Linear model with the data_mv dataset with significant variables.
model8 <- lm(TARGET~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide + Alcohol +
as.factor(LabelAppeal) +
as.factor(AcidIndex) +
as.factor(STARS), data = data_mv)
summary(model8)
##
## Call:
## lm(formula = TARGET ~ VolatileAcidity + FreeSulfurDioxide + TotalSulfurDioxide +
## Alcohol + as.factor(LabelAppeal) + as.factor(AcidIndex) +
## as.factor(STARS), data = data_mv)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.9982 -0.8644 0.0394 0.8354 6.0968
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.157e-01 7.560e-01 1.079 0.280664
## VolatileAcidity -9.483e-02 1.476e-02 -6.426 1.35e-10 ***
## FreeSulfurDioxide 2.675e-04 7.969e-05 3.357 0.000791 ***
## TotalSulfurDioxide 2.138e-04 5.123e-05 4.174 3.02e-05 ***
## Alcohol 1.406e-02 3.188e-03 4.408 1.05e-05 ***
## as.factor(LabelAppeal)-1 3.673e-01 6.277e-02 5.852 4.99e-09 ***
## as.factor(LabelAppeal)0 8.356e-01 6.120e-02 13.653 < 2e-16 ***
## as.factor(LabelAppeal)1 1.301e+00 6.395e-02 20.342 < 2e-16 ***
## as.factor(LabelAppeal)2 1.887e+00 8.422e-02 22.406 < 2e-16 ***
## as.factor(AcidIndex)5 -3.094e-01 7.678e-01 -0.403 0.686937
## as.factor(AcidIndex)6 -1.984e-01 7.538e-01 -0.263 0.792432
## as.factor(AcidIndex)7 -2.974e-01 7.531e-01 -0.395 0.692947
## as.factor(AcidIndex)8 -4.041e-01 7.532e-01 -0.536 0.591625
## as.factor(AcidIndex)9 -7.075e-01 7.537e-01 -0.939 0.347897
## as.factor(AcidIndex)10 -1.018e+00 7.549e-01 -1.349 0.177471
## as.factor(AcidIndex)11 -1.494e+00 7.572e-01 -1.973 0.048502 *
## as.factor(AcidIndex)12 -1.510e+00 7.615e-01 -1.983 0.047404 *
## as.factor(AcidIndex)13 -1.518e+00 7.689e-01 -1.974 0.048391 *
## as.factor(AcidIndex)14 -1.373e+00 7.765e-01 -1.768 0.077142 .
## as.factor(AcidIndex)15 -6.201e-01 8.828e-01 -0.702 0.482388
## as.factor(AcidIndex)16 -1.708e+00 9.522e-01 -1.794 0.072802 .
## as.factor(AcidIndex)17 -1.899e+00 9.000e-01 -2.110 0.034877 *
## as.factor(STARS)1 1.352e+00 3.292e-02 41.050 < 2e-16 ***
## as.factor(STARS)2 2.387e+00 3.201e-02 74.563 < 2e-16 ***
## as.factor(STARS)3 2.951e+00 3.706e-02 79.611 < 2e-16 ***
## as.factor(STARS)4 3.639e+00 5.916e-02 61.507 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.303 on 12769 degrees of freedom
## Multiple R-squared: 0.5432, Adjusted R-squared: 0.5423
## F-statistic: 607.3 on 25 and 12769 DF, p-value: < 2.2e-16
plot(model8)
### Ordinal logistic regression with scaled data
olr1 <- data_scale_train
olr1$TARGET <- as.factor(olr1$TARGET)
model9 <- polr(TARGET ~ ., data = olr1, Hess=TRUE)
summary(model9)
## Call:
## polr(formula = TARGET ~ ., data = olr1, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## ï..INDEX 0.009879 0.01637 0.60344
## FixedAcidity 0.009839 0.01664 0.59143
## VolatileAcidity -0.105399 0.01649 -6.38999
## CitricAcid 0.022077 0.01642 1.34435
## ResidualSugar -0.001255 0.01628 -0.07707
## Chlorides -0.053700 0.01631 -3.29199
## FreeSulfurDioxide 0.058083 0.01641 3.53971
## TotalSulfurDioxide 0.056892 0.01639 3.47099
## Density -0.032068 0.01651 -1.94258
## pH -0.012273 0.01645 -0.74611
## Sulphates -0.030839 0.01648 -1.87138
## Alcohol 0.096465 0.01639 5.88544
## LabelAppeal 0.758628 0.01907 39.77675
## AcidIndex -0.388535 0.01860 -20.89270
## STARS 1.540233 0.02183 70.54658
##
## Intercepts:
## Value Std. Error t value
## -1.57242723585788|-1.05331568225978 -2.2286 0.0291 -76.4926
## -1.05331568225978|-0.534204128661679 -2.0726 0.0285 -72.8472
## -0.534204128661679|-0.0150925750635791 -1.3893 0.0258 -53.9476
## -0.0150925750635791|0.504018978534521 0.1093 0.0229 4.7659
## 0.504018978534521|1.02313053213262 2.0323 0.0292 69.4889
## 1.02313053213262|1.54224208573072 4.0071 0.0458 87.4917
## 1.54224208573072|2.06135363932882 6.1891 0.0886 69.8252
## 2.06135363932882|2.58046519292692 8.5389 0.2462 34.6861
##
## Residual Deviance: 36214.49
## AIC: 36260.49
###Ordinal logistic without scaled data
olr2 <- data_mv
olr2$TARGET <- as.factor(olr2$TARGET)
model10 <- polr(TARGET ~ ., data = olr2, Hess=TRUE)
summary(model10)
## Call:
## polr(formula = TARGET ~ ., data = olr2, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## ï..INDEX 2.119e-06 NaN NaN
## FixedAcidity 1.558e-03 2.633e-03 0.59167
## VolatileAcidity -1.344e-01 2.101e-02 -6.39951
## CitricAcid 2.561e-02 1.904e-02 1.34519
## ResidualSugar -3.822e-05 4.944e-04 -0.07729
## Chlorides -1.730e-01 3.214e-04 -538.21394
## FreeSulfurDioxide 4.009e-04 1.133e-04 3.53957
## TotalSulfurDioxide 2.521e-04 7.266e-05 3.46998
## Density -1.208e+00 1.431e-03 -844.16348
## pH -1.835e-02 1.951e-02 -0.94077
## Sulphates -3.477e-02 1.856e-02 -1.87338
## Alcohol 2.656e-02 4.275e-03 6.21310
## LabelAppeal 8.513e-01 2.103e-02 40.47578
## AcidIndex -2.935e-01 8.872e-03 -33.08037
## STARS 1.298e+00 1.715e-02 75.71084
##
## Intercepts:
## Value Std. Error t value
## 0|1 -3.5371 0.0014 -2439.9380
## 1|2 -3.3811 0.0015 -2255.5000
## 2|3 -2.6979 0.0038 -712.9369
## 3|4 -1.1993 0.0264 -45.4952
## 4|5 0.7237 0.0385 18.8065
## 5|6 2.6986 0.0536 50.3522
## 6|7 4.8806 0.0537 90.8238
## 7|8 7.2303 0.0537 134.5209
##
## Residual Deviance: 36214.49
## AIC: 36260.49
library(pscl)
model11 <- zeroinfl(TARGET ~ . | STARS, data = data_mv, dist = 'negbin')
summary(model11)
##
## Call:
## zeroinfl(formula = TARGET ~ . | STARS, data = data_mv, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.31701 -0.53688 0.01918 0.40829 2.89146
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.543e+00 NA NA NA
## ï..INDEX 1.809e-07 NA NA NA
## FixedAcidity 3.013e-04 NA NA NA
## VolatileAcidity -1.499e-02 NA NA NA
## CitricAcid 1.243e-03 NA NA NA
## ResidualSugar -4.969e-05 NA NA NA
## Chlorides -2.335e-02 NA NA NA
## FreeSulfurDioxide 3.718e-05 NA NA NA
## TotalSulfurDioxide -2.958e-06 NA NA NA
## Density -2.732e-01 NA NA NA
## pH 2.922e-03 NA NA NA
## Sulphates -1.947e-03 NA NA NA
## Alcohol 6.542e-03 NA NA NA
## LabelAppeal 2.240e-01 NA NA NA
## AcidIndex -3.130e-02 NA NA NA
## STARS 1.007e-01 NA NA NA
## Log(theta) 1.797e+01 NA NA NA
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.3804 NA NA NA
## STARS -2.2238 NA NA NA
##
## Theta = 63997386.9979
## Number of iterations in BFGS optimization: 23
## Log-likelihood: -2.083e+04 on 19 Df
library(ggthemes)
scatterPreds <- predict(model11, data_mv)
qplot(data_mv$TARGET, scatterPreds, main = 'Predicted vs Actual') + ggthemes::theme_tufte()
residPlot <- scatterPreds - data_mv$TARGET
qplot(data_mv$TARGET, residPlot, main = 'Residuals') + ggthemes::theme_tufte()
absdata <- abs(data_scale_train)
model12 <- glm.nb(TARGET ~ ., data = absdata)
summary(model12)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = absdata, init.theta = 35848.33073,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8888 -0.9652 -0.8312 0.4713 1.5768
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.920318 0.041521 -22.165 <2e-16 ***
## ï..INDEX 0.012386 0.019653 0.630 0.529
## FixedAcidity 0.001218 0.013445 0.091 0.928
## VolatileAcidity 0.017113 0.013199 1.296 0.195
## CitricAcid 0.002713 0.013125 0.207 0.836
## ResidualSugar -0.004542 0.013001 -0.349 0.727
## Chlorides 0.001761 0.012668 0.139 0.889
## FreeSulfurDioxide -0.002295 0.012879 -0.178 0.859
## TotalSulfurDioxide 0.003254 0.013308 0.245 0.807
## Density 0.008868 0.012986 0.683 0.495
## pH 0.004886 0.013442 0.363 0.716
## Sulphates 0.013197 0.012681 1.041 0.298
## Alcohol -0.007084 0.013916 -0.509 0.611
## LabelAppeal 0.162795 0.013839 11.764 <2e-16 ***
## AcidIndex 0.120343 0.012366 9.732 <2e-16 ***
## STARS 0.488152 0.018843 25.907 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(35848.33) family taken to be 1)
##
## Null deviance: 9920.9 on 12794 degrees of freedom
## Residual deviance: 8970.4 on 12779 degrees of freedom
## AIC: 25738
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 35848
## Std. Err.: 45074
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -25703.86
modelsave1<-saveRDS(model12, "mymodel.rds")
plot(model12)
model13 = glm(TARGET ~ ., data=absdata, family=poisson)
summary(model13)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = absdata)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8550 -0.5038 -0.0667 0.4644 1.5768
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.920316 0.041520 -22.166 <2e-16 ***
## ï..INDEX 0.012386 0.019653 0.630 0.529
## FixedAcidity 0.001218 0.013445 0.091 0.928
## VolatileAcidity 0.017113 0.013199 1.296 0.195
## CitricAcid 0.002713 0.013125 0.207 0.836
## ResidualSugar -0.004542 0.013001 -0.349 0.727
## Chlorides 0.001761 0.012668 0.139 0.889
## FreeSulfurDioxide -0.002295 0.012879 -0.178 0.859
## TotalSulfurDioxide 0.003254 0.013308 0.245 0.807
## Density 0.008868 0.012986 0.683 0.495
## pH 0.004886 0.013442 0.363 0.716
## Sulphates 0.013197 0.012680 1.041 0.298
## Alcohol -0.007084 0.013916 -0.509 0.611
## LabelAppeal 0.162795 0.013839 11.764 <2e-16 ***
## AcidIndex 0.120342 0.012365 9.732 <2e-16 ***
## STARS 0.488151 0.018842 25.907 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 6665.5 on 12794 degrees of freedom
## Residual deviance: 5715.0 on 12779 degrees of freedom
## AIC: Inf
##
## Number of Fisher Scoring iterations: 5
plot(model13)
modelsave2<-saveRDS(model13, "mymodel.rds")
saveRDS(model12, "model.rds")
my_model12 <- readRDS("model.rds")
saveRDS(model12, "model.rds")
my_model12 <- readRDS("model.rds")
aic1 <- model1$aic
aic2 <- model2$aic
aic3 <- model3$aic
aic4 <- model4$aic
aic5 <- model5$aic
aic6 <- model6$aic
aic7 <- model7$aic
aic8 <- model8$aic
aic9 <- model9$aic
aic10 <- model10$aic
aic11 <- model11$aic
aic12 <- model12$aic
aic13 <- model13$aic
mse1 <- mean((data_mv$TARGET - predict(model1))^2)
mse2 <- mean((data_mv$TARGET - predict(model2))^2)
mse3 <- mean((data_mv$TARGET - predict(model3))^2)
mse4 <- mean((data_mv$TARGET - predict(model4))^2)
mse5 <- mean((data_scale_train$TARGET - predict(model5))^2)
mse6 <- mean((data_scale_train$TARGET - predict(model6))^2)
mse7 <- mean((data_mv$TARGET - predict(model7))^2)
mse8 <- mean((data_mv$TARGET - predict(model8))^2)
mse9 <- mean((data_scale_train$TARGET - predict(model9))^2)
mse10 <- mean((data_mv$TARGET - predict(model10))^2)
mse11 <- mean((data_mv$TARGET - predict(model11))^2)
mse12 <- mean((absdata$TARGET - predict(model12))^2)
mse13 <- mean((absdata$TARGET - predict(model13))^2)
compare_aic_mse <- matrix(c(mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, mse10, mse11,mse12,mse13,aic1, aic2, aic3, aic4, aic5, aic6, aic7, aic8, aic9, aic10, aic11,aic12,aic13),nrow=13,ncol=2,byrow=TRUE)
rownames(compare_aic_mse) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7","Model8","Model9","Model10","Model11","Model12","Model13")
colnames(compare_aic_mse) <- c("MSE","AIC")
compare_models <- as.data.frame(compare_models)
kable(compare_aic_mse) %>%
kable_styling(full_width = T)
| MSE | AIC | |
|---|---|---|
| Model1 | 6.749972e+00 | 7.039608e+00 |
| Model2 | 6.749968e+00 | 6.704008e+00 |
| Model3 | 4.719362e-01 | 4.568016e-01 |
| Model4 | 1.751305e+00 | 1.695142e+00 |
| Model5 | NA | NA |
| Model6 | 1.717355e+00 | 1.417136e+00 |
| Model7 | 1.417136e+00 | 4.670209e+04 |
| Model8 | 2.201626e+05 | 4.670437e+04 |
| Model9 | 4.554034e+04 | 2.573786e+04 |
| Model10 | Inf | 6.749972e+00 |
| Model11 | 7.039608e+00 | 6.749968e+00 |
| Model12 | 6.704008e+00 | 4.719362e-01 |
| Model13 | 4.568016e-01 | 1.751305e+00 |
eval_data_mv<-data.frame(eval_data_mv)
modelValidation <- function(mod,eval_data_mv){
preds = predict(mod, eval_data_mv$TARGET)
diffMat = as.numeric(preds) - as.numeric(eval_data_mv$TARGET)
diffMat = diffMat^2
loss <- mean(diffMat)
return(loss)
}
df1<-data.frame(modelValidation1(model1,data_mv))
df2<-data.frame(modelValidation1(model2,data_mv))
df3<-data.frame(modelValidation1(model3,data_mv))
df4<-data.frame(modelValidation1(model4,data_mv))
df5<-data.frame(modelValidation1(model5,data_scale_train))
df6<-data.frame(modelValidation1(model6,data_scale_train))
df7<-data.frame(modelValidation1(model7,data_mv))
df8<-data.frame(modelValidation1(model8,data_mv))
df9<-data.frame(modelValidation1(model9,data_scale_train))
df10<-data.frame(modelValidation1(model10,data_mv))
df11<-data.frame(modelValidation1(model11,data_mv))
df12<-data.frame(modelValidation1(model12,absdata))
df13<-data.frame(modelValidation1(model13,absdata))
library(kableExtra)
compare_model1 <- c(df1)
compare_model2 <- c(df2)
compare_model3 <- c(df3)
compare_model4 <- c(df4)
compare_model5 <- c(df5)
compare_model6 <- c(df6)
compare_model7 <- c(df7)
compare_model8 <- c(df8)
compare_model9 <- c(df9)
compare_model10 <- c(df10)
compare_model11 <- c(df11)
compare_model12 <- c(df12)
compare_model13 <- c(df13)
compare <- data.frame(compare_model1, compare_model2, compare_model3, compare_model4, compare_model5, compare_model6,compare_model7,compare_model8,compare_model9,compare_model10,compare_model11,compare_model12,compare_model13)
colnames(compare) <- c("Model1", "Model2", "Model3", "Model4", "Model5", "Model6", "Model7","Model8","Model9","Model10","Model11","Model12","Model13")
kable(compare)
| Model1 | Model2 | Model3 | Model4 | Model5 | Model6 | Model7 | Model8 | Model9 | Model10 | Model11 | Model12 | Model13 |
|---|---|---|---|---|---|---|---|---|---|---|---|---|
| 6.749972 | 66462383 | 6.749968 | 6.704008 | 0.4719362 | 0.4568016 | 1.751305 | 1.695142 | 17.13263 | 3.136616 | 1.717355 | 1.417136 | 1.417136 |