Link to the project in RPubs: http://rpubs.com/ofomicheva86/387855
#required packages
library(MASS)
library(corrplot)
library(PerformanceAnalytics)
library(GGally)
library(RColorBrewer)
library(VIM)
library(dplyr)
library(mice)
library(pROC)
library(caret)
library(pscl)
library(ResourceSelection)
library(stringr)
library(vcd)
library(rcompanion)
library(lmtest)
#read training data set
data <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/wine-training-data.csv",
stringsAsFactors=T, header=T)
#read testing data set
data_testing <- read.csv(file=
"https://raw.githubusercontent.com/olga0503/DATA-621/master/wine-evaluation-data.csv",
stringsAsFactors=T, header=T)
#display first six entries
head(data)
## INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1 1 3 3.2 1.160 -0.98 54.2
## 2 2 3 4.5 0.160 -0.81 26.1
## 3 4 5 7.1 2.640 -0.88 14.8
## 4 5 3 5.7 0.385 0.04 18.8
## 5 6 4 8.0 0.330 -1.26 9.4
## 6 7 0 11.3 0.320 0.59 2.2
## Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates
## 1 -0.567 NA 268 0.99280 3.33 -0.59
## 2 -0.425 15 -327 1.02792 3.38 0.70
## 3 0.037 214 142 0.99518 3.12 0.48
## 4 -0.425 22 115 0.99640 2.24 1.83
## 5 NA -167 108 0.99457 3.12 1.77
## 6 0.556 -37 15 0.99940 3.20 1.29
## Alcohol LabelAppeal AcidIndex STARS
## 1 9.9 0 8 2
## 2 NA -1 7 3
## 3 22.0 -1 8 3
## 4 6.2 -1 6 1
## 5 13.7 0 9 2
## 6 15.4 0 11 NA
head(data_testing)
## IN TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1 3 NA 5.4 -0.860 0.27 -10.7
## 2 9 NA 12.4 0.385 -0.76 -19.7
## 3 10 NA 7.2 1.750 0.17 -33.0
## 4 18 NA 6.2 0.100 1.80 1.0
## 5 21 NA 11.4 0.210 0.28 1.2
## 6 30 NA 17.6 0.040 -1.15 1.4
## Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates
## 1 0.092 23 398 0.98527 5.02 0.64
## 2 1.169 -37 68 0.99048 3.37 1.09
## 3 0.065 9 76 1.04641 4.61 0.68
## 4 -0.179 104 89 0.98877 3.20 2.11
## 5 0.038 70 53 1.02899 2.54 -0.07
## 6 0.535 -250 140 0.95028 3.06 -0.02
## Alcohol LabelAppeal AcidIndex STARS
## 1 12.30 -1 6 NA
## 2 16.00 0 6 2
## 3 8.55 0 8 1
## 4 12.30 -1 8 1
## 5 4.80 0 10 NA
## 6 11.40 1 8 4
#find dimentions
dim(data)
## [1] 12795 16
#build function that counts missing values
count_nas <- function(data){
variable_name_column <- c()
number_missing_column <- c()
for (i in 2:ncol(data)){
variable_name <- colnames(data[i])
number_missing <- sum(is.na(data[i]))
variable_name_column <- c(variable_name_column,variable_name)
number_missing_column <- c(number_missing_column,number_missing)
}
missing_table <- data.frame(variable_name_column,number_missing_column)
missing_table <- missing_table %>% mutate(percentage=round(number_missing_column*100/nrow(data),2)) %>% arrange(desc(percentage))
missing_table
}
#build function that counts negative values
count_neg <- function(data){
variable_name_column <- c()
number_negative_column <- c()
for (i in 3:ncol(data)){
neg_count <- 0
variable_name <- colnames(data[i])
for (j in 1:nrow(data)){
if(is.numeric(data[j,i]) && !is.na(data[j,i]) && data[j,i] < 0) {
neg_count <- neg_count + 1
}
}
number_negative_column <- c(number_negative_column,neg_count)
variable_name_column <- c(variable_name_column,variable_name)
}
negative_table <- data.frame(variable_name_column,number_negative_column)
negative_table <- negative_table %>% mutate(percentage=round(number_negative_column*100/nrow(data),2)) %>% arrange(desc(percentage))
negative_table
}
#chart for missing values
aggr(data[-1], prop = T, numbers = T, cex.axis=.5, cex.numbers = 0.1,
ylab=c("Proportion of missingness","Missingness Pattern"),
labels=names(data[-1]))
#count missing values
count_nas(data[3:length(data)])
## variable_name_column number_missing_column percentage
## 1 STARS 3359 26.25
## 2 Sulphates 1210 9.46
## 3 TotalSulfurDioxide 682 5.33
## 4 Alcohol 653 5.10
## 5 FreeSulfurDioxide 647 5.06
## 6 Chlorides 638 4.99
## 7 ResidualSugar 616 4.81
## 8 pH 395 3.09
## 9 VolatileAcidity 0 0.00
## 10 CitricAcid 0 0.00
## 11 Density 0 0.00
## 12 LabelAppeal 0 0.00
## 13 AcidIndex 0 0.00
count_nas(data_testing[3:length(data_testing)])
## variable_name_column number_missing_column percentage
## 1 STARS 841 25.22
## 2 Sulphates 310 9.30
## 3 Alcohol 185 5.55
## 4 ResidualSugar 168 5.04
## 5 TotalSulfurDioxide 157 4.71
## 6 FreeSulfurDioxide 152 4.56
## 7 Chlorides 138 4.14
## 8 pH 104 3.12
## 9 VolatileAcidity 0 0.00
## 10 CitricAcid 0 0.00
## 11 Density 0 0.00
## 12 LabelAppeal 0 0.00
## 13 AcidIndex 0 0.00
Replace missing values of STARS with “NONE”.
data <- data %>% mutate(STARS = as.factor(ifelse(is.na(STARS),"NONE",STARS)))
data_testing <- data_testing %>% mutate(STARS = as.factor(ifelse(is.na(STARS),"NONE",STARS)))
Replace negative values with their absolute values.
#count negative values
count_neg(data)
## variable_name_column number_negative_column percentage
## 1 LabelAppeal 3640 28.45
## 2 Chlorides 3197 24.99
## 3 ResidualSugar 3136 24.51
## 4 FreeSulfurDioxide 3036 23.73
## 5 CitricAcid 2966 23.18
## 6 VolatileAcidity 2827 22.09
## 7 TotalSulfurDioxide 2504 19.57
## 8 Sulphates 2361 18.45
## 9 FixedAcidity 1621 12.67
## 10 Alcohol 118 0.92
## 11 Density 0 0.00
## 12 pH 0 0.00
## 13 AcidIndex 0 0.00
## 14 STARS 0 0.00
count_neg(data_testing)
## variable_name_column number_negative_column percentage
## 1 LabelAppeal 924 27.71
## 2 ResidualSugar 828 24.83
## 3 CitricAcid 804 24.11
## 4 VolatileAcidity 788 23.63
## 5 Chlorides 776 23.27
## 6 FreeSulfurDioxide 774 23.21
## 7 TotalSulfurDioxide 639 19.16
## 8 Sulphates 594 17.81
## 9 FixedAcidity 439 13.16
## 10 Alcohol 25 0.75
## 11 Density 0 0.00
## 12 pH 0 0.00
## 13 AcidIndex 0 0.00
## 14 STARS 0 0.00
#exclude the variables 'STARS' and 'LabelAppeal'
stars_labelappeal <- data %>% select(STARS,LabelAppeal)
stars_labelappeal_testing <- data_testing %>% select(STARS,LabelAppeal)
data <- data %>% select(-STARS,-LabelAppeal)
data_testing <- data_testing %>% select(-STARS,-LabelAppeal,-TARGET)
#replace negative values witj their absolute values
data <- abs(data)
data_testing <- abs(data_testing)
#merge
data <- data.frame(data,stars_labelappeal)
data_testing <- data.frame(data_testing,stars_labelappeal_testing)
#confirm no negatives
count_neg(data)
## variable_name_column number_negative_column percentage
## 1 LabelAppeal 3640 28.45
## 2 FixedAcidity 0 0.00
## 3 VolatileAcidity 0 0.00
## 4 CitricAcid 0 0.00
## 5 ResidualSugar 0 0.00
## 6 Chlorides 0 0.00
## 7 FreeSulfurDioxide 0 0.00
## 8 TotalSulfurDioxide 0 0.00
## 9 Density 0 0.00
## 10 pH 0 0.00
## 11 Sulphates 0 0.00
## 12 Alcohol 0 0.00
## 13 AcidIndex 0 0.00
## 14 STARS 0 0.00
count_neg(data_testing)
## variable_name_column number_negative_column percentage
## 1 LabelAppeal 924 27.71
## 2 VolatileAcidity 0 0.00
## 3 CitricAcid 0 0.00
## 4 ResidualSugar 0 0.00
## 5 Chlorides 0 0.00
## 6 FreeSulfurDioxide 0 0.00
## 7 TotalSulfurDioxide 0 0.00
## 8 Density 0 0.00
## 9 pH 0 0.00
## 10 Sulphates 0 0.00
## 11 Alcohol 0 0.00
## 12 AcidIndex 0 0.00
## 13 STARS 0 0.00
Apply multiple imputation.
#apply multiple imputation for training data set
#exclude variable'INDEX' and 'TARGET'
exclude <- c('INDEX','TARGET')
index_target <- data[1:2]
include <- setdiff(names(data), exclude)
data_include <- data[include]
#imputation with mean
imp.data <- mice(data_include, m=9, method='pmm', printFlag=FALSE)
#merge imputed values with data frame
data <- complete(imp.data)
data <- data.frame(index_target,data)
head(data)
## INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar
## 1 1 3 3.2 1.160 0.98 54.2
## 2 2 3 4.5 0.160 0.81 26.1
## 3 4 5 7.1 2.640 0.88 14.8
## 4 5 3 5.7 0.385 0.04 18.8
## 5 6 4 8.0 0.330 1.26 9.4
## 6 7 0 11.3 0.320 0.59 2.2
## Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates
## 1 0.567 222 268 0.99280 3.33 0.59
## 2 0.425 15 327 1.02792 3.38 0.70
## 3 0.037 214 142 0.99518 3.12 0.48
## 4 0.425 22 115 0.99640 2.24 1.83
## 5 0.256 167 108 0.99457 3.12 1.77
## 6 0.556 37 15 0.99940 3.20 1.29
## Alcohol AcidIndex STARS LabelAppeal
## 1 9.9 8 2 0
## 2 11.7 7 3 -1
## 3 22.0 8 3 -1
## 4 6.2 6 1 -1
## 5 13.7 9 2 0
## 6 15.4 11 NONE 0
#confirm no NAs
count_nas(data)
## variable_name_column number_missing_column percentage
## 1 TARGET 0 0
## 2 FixedAcidity 0 0
## 3 VolatileAcidity 0 0
## 4 CitricAcid 0 0
## 5 ResidualSugar 0 0
## 6 Chlorides 0 0
## 7 FreeSulfurDioxide 0 0
## 8 TotalSulfurDioxide 0 0
## 9 Density 0 0
## 10 pH 0 0
## 11 Sulphates 0 0
## 12 Alcohol 0 0
## 13 AcidIndex 0 0
## 14 STARS 0 0
## 15 LabelAppeal 0 0
#apply multiple imputation for testing data set
exclude_testing <- c('IN','TARGET')
index_target_testing <- data_testing[1:2]
include_testing <- setdiff(names(data_testing), exclude_testing)
data_include_testing <- data_testing[include_testing]
#imputation with mean
imp.data_testing <- mice(data_include_testing, m=9, method='pmm', printFlag=FALSE)
#merge imputed values with data frame
data_testing <- complete(imp.data_testing)
data_testing <- data.frame(index_target_testing,data_testing)
head(data_testing)
## IN FixedAcidity FixedAcidity.1 VolatileAcidity CitricAcid ResidualSugar
## 1 3 5.4 5.4 0.860 0.27 10.7
## 2 9 12.4 12.4 0.385 0.76 19.7
## 3 10 7.2 7.2 1.750 0.17 33.0
## 4 18 6.2 6.2 0.100 1.80 1.0
## 5 21 11.4 11.4 0.210 0.28 1.2
## 6 30 17.6 17.6 0.040 1.15 1.4
## Chlorides FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates
## 1 0.092 23 398 0.98527 5.02 0.64
## 2 1.169 37 68 0.99048 3.37 1.09
## 3 0.065 9 76 1.04641 4.61 0.68
## 4 0.179 104 89 0.98877 3.20 2.11
## 5 0.038 70 53 1.02899 2.54 0.07
## 6 0.535 250 140 0.95028 3.06 0.02
## Alcohol AcidIndex STARS LabelAppeal
## 1 12.30 6 NONE -1
## 2 16.00 6 2 0
## 3 8.55 8 1 0
## 4 12.30 8 1 -1
## 5 4.80 10 NONE 0
## 6 11.40 8 4 1
#confirm no NAs
count_nas(data_testing)
## variable_name_column number_missing_column percentage
## 1 FixedAcidity 0 0
## 2 FixedAcidity.1 0 0
## 3 VolatileAcidity 0 0
## 4 CitricAcid 0 0
## 5 ResidualSugar 0 0
## 6 Chlorides 0 0
## 7 FreeSulfurDioxide 0 0
## 8 TotalSulfurDioxide 0 0
## 9 Density 0 0
## 10 pH 0 0
## 11 Sulphates 0 0
## 12 Alcohol 0 0
## 13 AcidIndex 0 0
## 14 STARS 0 0
## 15 LabelAppeal 0 0
#create separate boxplots for each numeric variable
par(mfrow=c(1,5))
for(i in 3:ncol(data)) {
if (is.numeric(data[,i])=="TRUE") {
boxplot(data[,i], main=names(data)[i])
}
}
#create mosaic plot for 'STARS'
count <- table(data$TARGET, data$STARS)
mosaicplot(count, main = "Distribution of 'STARS'",
xlab = "TARGET",
ylab = "STARS",
las = 1,
border = "black",
shade = TRUE
)
The following assumptions must be verified for linear regression.
par(mfrow=c(2,2))
#colnames <- dimnames(data)[[2]]
for (i in 1:ncol(data)) {
if (is.double(data[,i]) == "TRUE"){
plot(data$TARGET ~ data[,i],main=names(data)[i], xlab=names(data)[i])
reg_line <- lm(data$TARGET ~ data[,i])
abline(reg_line,col="red")
}
}
# histograms, density lines and normal probability plots
for (i in 1:ncol(data)) {
if (is.double(data[,i]) == "TRUE"){
par(mfrow=c(1,2))
plotNormalHistogram(data[,i],main=names(data)[i])
qqnorm(data[,i],main=names(data)[i])
qqline(data[,i])
}}
c. Multicollinearity assumption.
#correlation between variables
corrplot(cor(data[3:ncol(data)] %>% select_if(is.numeric)), type = "upper", method = "number", tl.cex = 0.8, tl.col="black",number.cex = .5)
model = glm(TARGET ~., data = data)
#Breush Pagan Test
bptest(model)
##
## studentized Breusch-Pagan test
##
## data: model
## BP = 1874.1, df = 18, p-value < 2.2e-16
Transforming data using Box-Cox approach.
#resolve problem with non-negative values
for (i in 2:ncol(data)){
for (j in 1:nrow(data)){
if (is.double(data[j,i])==TRUE && data[j,i] == 0) {
data[j,i]=data[j,i]+0.01}
}
}
#data transformation
for (i in 2:ncol(data)) {
if (is.double(data[,i])==TRUE){
Box = boxcox(data[,i] ~ 1, # Transform Turbidity as a single vector
lambda = seq(-10,10,1) # Try values -10 to 10 by 0.1
)
Cox = data.frame(Box$x, Box$y) # Create a data frame with the results
Cox2 = Cox[with(Cox, order(-Cox$Box.y)),] # Order the new data frame by decreasing y
#Cox2[1,] # Display the lambda with the greatest
# log likelihood
lambda = Cox2[1, "Box.x"] # Extract that lambda
data[,i] = (data[,i] ^ lambda - 1)/lambda # Transform the original data
#plotNormalHistogram(data[,i],main=names(data)[i])
#qqnorm(data[,i],main=names(data)[i])
#qqline(data[,i])
}
}
#build glm model using stepwise approach
linear_model.null = glm(TARGET ~ 1, data = data)
linear_model.full = glm(TARGET ~ ., data = data)
step(linear_model.null,
scope = list(upper=linear_model.full),
direction = "both",
data = data)
## Start: AIC=53091.37
## TARGET ~ 1
##
## Df Deviance AIC
## + STARS 4 24729 44754
## + LabelAppeal 1 41443 51354
## + AcidIndex 1 44603 52294
## + VolatileAcidity 1 47141 53003
## + TotalSulfurDioxide 1 47268 53037
## + Alcohol 1 47299 53045
## + FreeSulfurDioxide 1 47326 53053
## + Chlorides 1 47356 53061
## + FixedAcidity 1 47366 53063
## + Sulphates 1 47398 53072
## + CitricAcid 1 47410 53075
## + Density 1 47417 53077
## + ResidualSugar 1 47458 53088
## <none> 47477 53091
## + pH 1 47473 53092
## + INDEX 1 47477 53093
##
## Step: AIC=44753.56
## TARGET ~ STARS
##
## Df Deviance AIC
## + LabelAppeal 1 22900 43772
## + AcidIndex 1 23983 44363
## + VolatileAcidity 1 24627 44703
## + TotalSulfurDioxide 1 24663 44721
## + FreeSulfurDioxide 1 24690 44735
## + Chlorides 1 24695 44738
## + Alcohol 1 24704 44743
## + CitricAcid 1 24708 44745
## + Sulphates 1 24711 44746
## + FixedAcidity 1 24713 44747
## + Density 1 24715 44748
## + ResidualSugar 1 24723 44753
## <none> 24729 44754
## + INDEX 1 24726 44754
## + pH 1 24729 44756
## - STARS 4 47477 53091
##
## Step: AIC=43772.25
## TARGET ~ STARS + LabelAppeal
##
## Df Deviance AIC
## + AcidIndex 1 21990 43255
## + VolatileAcidity 1 22801 43719
## + TotalSulfurDioxide 1 22815 43727
## + FreeSulfurDioxide 1 22864 43755
## + Chlorides 1 22866 43755
## + Alcohol 1 22867 43756
## + FixedAcidity 1 22881 43764
## + Sulphates 1 22881 43764
## + CitricAcid 1 22885 43766
## + Density 1 22887 43767
## + ResidualSugar 1 22893 43770
## <none> 22900 43772
## + INDEX 1 22899 43774
## + pH 1 22899 43774
## - LabelAppeal 1 24729 44754
## - STARS 4 41443 51354
##
## Step: AIC=43255.27
## TARGET ~ STARS + LabelAppeal + AcidIndex
##
## Df Deviance AIC
## + VolatileAcidity 1 21915 43214
## + TotalSulfurDioxide 1 21938 43227
## + CitricAcid 1 21954 43237
## + Alcohol 1 21964 43243
## + FreeSulfurDioxide 1 21970 43246
## + Chlorides 1 21973 43248
## + Sulphates 1 21978 43251
## + pH 1 21983 43254
## + Density 1 21983 43254
## + ResidualSugar 1 21985 43255
## <none> 21990 43255
## + INDEX 1 21989 43257
## + FixedAcidity 1 21989 43257
## - AcidIndex 1 22900 43772
## - LabelAppeal 1 23983 44363
## - STARS 4 38357 50366
##
## Step: AIC=43213.93
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity
##
## Df Deviance AIC
## + TotalSulfurDioxide 1 21869 43189
## + CitricAcid 1 21882 43197
## + Alcohol 1 21888 43200
## + FreeSulfurDioxide 1 21897 43205
## + Chlorides 1 21900 43207
## + Sulphates 1 21905 43210
## + Density 1 21908 43212
## + pH 1 21910 43213
## + ResidualSugar 1 21911 43213
## <none> 21915 43214
## + INDEX 1 21915 43216
## + FixedAcidity 1 21915 43216
## - VolatileAcidity 1 21990 43255
## - AcidIndex 1 22801 43719
## - LabelAppeal 1 23903 44323
## - STARS 4 38156 50301
##
## Step: AIC=43188.68
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide
##
## Df Deviance AIC
## + CitricAcid 1 21837 43172
## + Alcohol 1 21839 43173
## + FreeSulfurDioxide 1 21851 43181
## + Chlorides 1 21853 43182
## + Sulphates 1 21858 43185
## + Density 1 21861 43186
## + pH 1 21863 43187
## + ResidualSugar 1 21865 43188
## <none> 21869 43189
## + FixedAcidity 1 21868 43190
## + INDEX 1 21868 43190
## - TotalSulfurDioxide 1 21915 43214
## - VolatileAcidity 1 21938 43227
## - AcidIndex 1 22724 43678
## - LabelAppeal 1 23868 44306
## - STARS 4 38034 50262
##
## Step: AIC=43172.22
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid
##
## Df Deviance AIC
## + Alcohol 1 21807 43157
## + FreeSulfurDioxide 1 21821 43165
## + Chlorides 1 21822 43166
## + Sulphates 1 21826 43168
## + Density 1 21830 43170
## + pH 1 21831 43171
## + ResidualSugar 1 21833 43172
## <none> 21837 43172
## + FixedAcidity 1 21836 43174
## + INDEX 1 21837 43174
## - CitricAcid 1 21869 43189
## - TotalSulfurDioxide 1 21882 43197
## - VolatileAcidity 1 21904 43209
## - AcidIndex 1 22711 43673
## - LabelAppeal 1 23831 44288
## - STARS 4 37943 50233
##
## Step: AIC=43156.65
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol
##
## Df Deviance AIC
## + FreeSulfurDioxide 1 21790 43149
## + Chlorides 1 21793 43150
## + Sulphates 1 21797 43152
## + Density 1 21800 43155
## + pH 1 21802 43156
## + ResidualSugar 1 21803 43156
## <none> 21807 43157
## + FixedAcidity 1 21806 43158
## + INDEX 1 21807 43158
## - Alcohol 1 21837 43172
## - CitricAcid 1 21839 43173
## - TotalSulfurDioxide 1 21855 43183
## - VolatileAcidity 1 21876 43195
## - AcidIndex 1 22672 43652
## - LabelAppeal 1 23808 44278
## - STARS 4 37800 50187
##
## Step: AIC=43148.61
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide
##
## Df Deviance AIC
## + Chlorides 1 21776 43142
## + Sulphates 1 21780 43145
## + Density 1 21783 43147
## + pH 1 21785 43148
## + ResidualSugar 1 21786 43148
## <none> 21790 43149
## + FixedAcidity 1 21789 43150
## + INDEX 1 21790 43150
## - FreeSulfurDioxide 1 21807 43157
## - Alcohol 1 21821 43165
## - CitricAcid 1 21821 43165
## - TotalSulfurDioxide 1 21836 43174
## - VolatileAcidity 1 21857 43186
## - AcidIndex 1 22641 43637
## - LabelAppeal 1 23787 44269
## - STARS 4 37743 50170
##
## Step: AIC=43142.23
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides
##
## Df Deviance AIC
## + Sulphates 1 21766 43139
## + Density 1 21769 43140
## + pH 1 21771 43141
## + ResidualSugar 1 21771 43142
## <none> 21776 43142
## + FixedAcidity 1 21775 43144
## + INDEX 1 21775 43144
## - Chlorides 1 21790 43149
## - FreeSulfurDioxide 1 21793 43150
## - Alcohol 1 21806 43158
## - CitricAcid 1 21806 43158
## - TotalSulfurDioxide 1 21822 43167
## - VolatileAcidity 1 21842 43179
## - AcidIndex 1 22612 43622
## - LabelAppeal 1 23772 44262
## - STARS 4 37701 50157
##
## Step: AIC=43138.54
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates
##
## Df Deviance AIC
## + Density 1 21760 43137
## + pH 1 21761 43138
## + ResidualSugar 1 21762 43138
## <none> 21766 43139
## + FixedAcidity 1 21765 43140
## + INDEX 1 21766 43140
## - Sulphates 1 21776 43142
## - Chlorides 1 21780 43145
## - FreeSulfurDioxide 1 21783 43146
## - Alcohol 1 21797 43155
## - CitricAcid 1 21797 43155
## - TotalSulfurDioxide 1 21811 43163
## - VolatileAcidity 1 21832 43175
## - AcidIndex 1 22596 43616
## - LabelAppeal 1 23762 44259
## - STARS 4 37665 50147
##
## Step: AIC=43136.85
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + Density
##
## Df Deviance AIC
## + pH 1 21755 43136
## + ResidualSugar 1 21756 43136
## <none> 21760 43137
## + FixedAcidity 1 21759 43139
## - Density 1 21766 43139
## + INDEX 1 21759 43139
## - Sulphates 1 21769 43140
## - Chlorides 1 21773 43143
## - FreeSulfurDioxide 1 21776 43145
## - Alcohol 1 21790 43153
## - CitricAcid 1 21790 43153
## - TotalSulfurDioxide 1 21806 43162
## - VolatileAcidity 1 21826 43174
## - AcidIndex 1 22583 43610
## - LabelAppeal 1 23754 44257
## - STARS 4 37642 50141
##
## Step: AIC=43136.22
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + Density + pH
##
## Df Deviance AIC
## + ResidualSugar 1 21751 43136
## <none> 21755 43136
## - pH 1 21760 43137
## - Density 1 21761 43138
## + FixedAcidity 1 21755 43138
## + INDEX 1 21755 43138
## - Sulphates 1 21764 43140
## - Chlorides 1 21768 43142
## - FreeSulfurDioxide 1 21772 43144
## - Alcohol 1 21785 43152
## - CitricAcid 1 21786 43152
## - TotalSulfurDioxide 1 21801 43161
## - VolatileAcidity 1 21821 43173
## - AcidIndex 1 22583 43612
## - LabelAppeal 1 23752 44258
## - STARS 4 37617 50135
##
## Step: AIC=43135.8
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + Density + pH + ResidualSugar
##
## Df Deviance AIC
## <none> 21751 43136
## - ResidualSugar 1 21755 43136
## - pH 1 21756 43136
## + FixedAcidity 1 21751 43137
## - Density 1 21757 43137
## + INDEX 1 21751 43138
## - Sulphates 1 21760 43139
## - Chlorides 1 21764 43142
## - FreeSulfurDioxide 1 21768 43144
## - Alcohol 1 21782 43152
## - CitricAcid 1 21782 43152
## - TotalSulfurDioxide 1 21796 43160
## - VolatileAcidity 1 21817 43172
## - AcidIndex 1 22577 43611
## - LabelAppeal 1 23748 44258
## - STARS 4 37605 50133
##
## Call: glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + Density + pH + ResidualSugar, data = data)
##
## Coefficients:
## (Intercept) STARS2 STARS3
## 3.742207 1.024585 1.597538
## STARS4 STARS5 LabelAppeal
## 2.293295 -1.361599 0.466437
## AcidIndex VolatileAcidity TotalSulfurDioxide
## -0.197985 -0.102806 0.015316
## CitricAcid Alcohol FreeSulfurDioxide
## 0.065283 0.039848 0.008899
## Chlorides Sulphates Density
## -0.034437 -0.038933 -0.837728
## pH ResidualSugar
## -0.024235 0.006284
##
## Degrees of Freedom: 12794 Total (i.e. Null); 12778 Residual
## Null Deviance: 47480
## Residual Deviance: 21750 AIC: 43140
#build lm model using stepwise approach
linear_model.null = glm.nb(TARGET ~ 1, data = data)
linear_model.full = glm.nb(TARGET ~ ., data = data)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
step(linear_model.null,
scope = list(upper=linear_model.full),
direction = "both",
data = data)
## Start: AIC=54331.58
## TARGET ~ 1
##
## Df Deviance AIC
## + STARS 4 12176 48386
## + LabelAppeal 1 16706 52911
## + AcidIndex 1 17361 53566
## + VolatileAcidity 1 18050 54255
## + TotalSulfurDioxide 1 18079 54283
## + Alcohol 1 18087 54292
## + FreeSulfurDioxide 1 18093 54298
## + Chlorides 1 18101 54305
## + FixedAcidity 1 18103 54307
## + Sulphates 1 18110 54315
## + CitricAcid 1 18113 54318
## + Density 1 18115 54319
## + ResidualSugar 1 18124 54329
## <none> 18129 54332
## + pH 1 18128 54333
## + INDEX 1 18129 54334
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=46643.61
## TARGET ~ STARS
##
## Df Deviance AIC
## + LabelAppeal 1 14060 46015
## + AcidIndex 1 14391 46346
## + VolatileAcidity 1 14657 46612
## + TotalSulfurDioxide 1 14668 46623
## + FreeSulfurDioxide 1 14677 46632
## + Chlorides 1 14680 46635
## + Alcohol 1 14682 46637
## + CitricAcid 1 14683 46638
## + Sulphates 1 14684 46640
## + FixedAcidity 1 14685 46640
## + Density 1 14686 46641
## <none> 14690 46644
## + ResidualSugar 1 14689 46644
## + INDEX 1 14690 46645
## + pH 1 14690 46646
## - STARS 4 22860 54805
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=46014.56
## TARGET ~ STARS + LabelAppeal
##
## Df Deviance AIC
## + AcidIndex 1 13716 45673
## + VolatileAcidity 1 14029 45986
## + TotalSulfurDioxide 1 14033 45990
## + Chlorides 1 14048 46005
## + Alcohol 1 14048 46005
## + FreeSulfurDioxide 1 14048 46005
## + FixedAcidity 1 14053 46010
## + Sulphates 1 14053 46010
## + CitricAcid 1 14055 46012
## + Density 1 14055 46012
## + ResidualSugar 1 14056 46014
## <none> 14060 46015
## + INDEX 1 14059 46016
## + pH 1 14059 46016
## - LabelAppeal 1 14691 46644
## - STARS 4 20867 52814
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45672.53
## TARGET ~ STARS + LabelAppeal + AcidIndex
##
## Df Deviance AIC
## + VolatileAcidity 1 13692 45651
## + TotalSulfurDioxide 1 13698 45657
## + CitricAcid 1 13704 45663
## + Alcohol 1 13709 45668
## + FreeSulfurDioxide 1 13709 45668
## + Chlorides 1 13710 45669
## + Sulphates 1 13711 45670
## + pH 1 13713 45672
## + ResidualSugar 1 13713 45672
## + Density 1 13714 45673
## <none> 13716 45673
## + INDEX 1 13715 45674
## + FixedAcidity 1 13716 45675
## - AcidIndex 1 14060 46015
## - LabelAppeal 1 14391 46346
## - STARS 4 19767 51716
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45650.9
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity
##
## Df Deviance AIC
## + TotalSulfurDioxide 1 13676 45637
## + CitricAcid 1 13682 45642
## + Alcohol 1 13684 45645
## + FreeSulfurDioxide 1 13686 45647
## + Chlorides 1 13686 45647
## + Sulphates 1 13688 45649
## + ResidualSugar 1 13689 45650
## + pH 1 13689 45650
## + Density 1 13690 45651
## <none> 13692 45651
## + INDEX 1 13692 45653
## + FixedAcidity 1 13692 45653
## - VolatileAcidity 1 13716 45673
## - AcidIndex 1 14029 45986
## - LabelAppeal 1 14364 46321
## - STARS 4 19697 51648
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45637.24
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide
##
## Df Deviance AIC
## + CitricAcid 1 13666 45629
## + Alcohol 1 13668 45631
## + FreeSulfurDioxide 1 13670 45633
## + Chlorides 1 13670 45633
## + Sulphates 1 13672 45635
## + pH 1 13674 45637
## + Density 1 13674 45637
## + ResidualSugar 1 13674 45637
## <none> 13676 45637
## + INDEX 1 13676 45639
## + FixedAcidity 1 13676 45639
## - TotalSulfurDioxide 1 13692 45651
## - VolatileAcidity 1 13698 45657
## - AcidIndex 1 14005 45964
## - LabelAppeal 1 14351 46310
## - STARS 4 19655 51608
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45629.15
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid
##
## Df Deviance AIC
## + Alcohol 1 13658 45623
## + FreeSulfurDioxide 1 13660 45625
## + Chlorides 1 13661 45626
## + Sulphates 1 13662 45627
## + pH 1 13664 45628
## + Density 1 13664 45629
## + ResidualSugar 1 13664 45629
## <none> 13666 45629
## + INDEX 1 13666 45631
## + FixedAcidity 1 13666 45631
## - CitricAcid 1 13676 45637
## - TotalSulfurDioxide 1 13682 45642
## - VolatileAcidity 1 13687 45648
## - AcidIndex 1 14001 45962
## - LabelAppeal 1 14338 46299
## - STARS 4 19624 51579
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45622.72
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol
##
## Df Deviance AIC
## + FreeSulfurDioxide 1 13652 45619
## + Chlorides 1 13652 45619
## + Sulphates 1 13654 45620
## + ResidualSugar 1 13655 45622
## + pH 1 13655 45622
## + Density 1 13655 45622
## <none> 13658 45623
## + INDEX 1 13658 45625
## + FixedAcidity 1 13658 45625
## - Alcohol 1 13666 45629
## - CitricAcid 1 13668 45631
## - TotalSulfurDioxide 1 13674 45637
## - VolatileAcidity 1 13680 45642
## - AcidIndex 1 13988 45951
## - LabelAppeal 1 14332 46295
## - STARS 4 19580 51537
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45618.71
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide
##
## Df Deviance AIC
## + Chlorides 1 13646 45615
## + Sulphates 1 13648 45617
## + ResidualSugar 1 13649 45618
## + Density 1 13649 45618
## + pH 1 13649 45618
## <none> 13652 45619
## + INDEX 1 13652 45621
## + FixedAcidity 1 13652 45621
## - FreeSulfurDioxide 1 13658 45623
## - Alcohol 1 13660 45625
## - CitricAcid 1 13662 45627
## - TotalSulfurDioxide 1 13667 45632
## - VolatileAcidity 1 13673 45638
## - AcidIndex 1 13978 45943
## - LabelAppeal 1 14324 46289
## - STARS 4 19560 51519
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45615.44
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides
##
## Df Deviance AIC
## + Sulphates 1 13642 45613
## + ResidualSugar 1 13644 45615
## + Density 1 13644 45615
## + pH 1 13644 45615
## <none> 13646 45615
## + INDEX 1 13646 45617
## + FixedAcidity 1 13646 45617
## - Chlorides 1 13652 45619
## - FreeSulfurDioxide 1 13652 45619
## - Alcohol 1 13655 45622
## - CitricAcid 1 13656 45623
## - TotalSulfurDioxide 1 13662 45629
## - VolatileAcidity 1 13668 45635
## - AcidIndex 1 13968 45935
## - LabelAppeal 1 14319 46286
## - STARS 4 19546 51507
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45613.42
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates
##
## Df Deviance AIC
## + ResidualSugar 1 13640 45613
## + Density 1 13640 45613
## + pH 1 13640 45613
## <none> 13642 45613
## + INDEX 1 13642 45615
## + FixedAcidity 1 13642 45615
## - Sulphates 1 13646 45615
## - Chlorides 1 13648 45617
## - FreeSulfurDioxide 1 13648 45617
## - Alcohol 1 13651 45620
## - CitricAcid 1 13652 45622
## - TotalSulfurDioxide 1 13658 45627
## - VolatileAcidity 1 13663 45632
## - AcidIndex 1 13962 45931
## - LabelAppeal 1 14315 46284
## - STARS 4 19533 51496
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45613.01
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar
##
## Df Deviance AIC
## + Density 1 13638 45613
## + pH 1 13638 45613
## <none> 13640 45613
## - ResidualSugar 1 13642 45613
## - Sulphates 1 13644 45615
## + INDEX 1 13640 45615
## + FixedAcidity 1 13640 45615
## - Chlorides 1 13645 45616
## - FreeSulfurDioxide 1 13646 45617
## - Alcohol 1 13649 45620
## - CitricAcid 1 13650 45621
## - TotalSulfurDioxide 1 13655 45626
## - VolatileAcidity 1 13661 45632
## - AcidIndex 1 13959 45930
## - LabelAppeal 1 14314 46285
## - STARS 4 19529 51494
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45612.67
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar + Density
##
## Df Deviance AIC
## + pH 1 13635 45612
## <none> 13638 45613
## - Density 1 13640 45613
## - ResidualSugar 1 13640 45613
## - Sulphates 1 13642 45615
## + INDEX 1 13638 45615
## + FixedAcidity 1 13638 45615
## - Chlorides 1 13643 45616
## - FreeSulfurDioxide 1 13644 45617
## - Alcohol 1 13646 45619
## - CitricAcid 1 13648 45621
## - TotalSulfurDioxide 1 13653 45626
## - VolatileAcidity 1 13659 45632
## - AcidIndex 1 13954 45927
## - LabelAppeal 1 14311 46284
## - STARS 4 19521 51488
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
##
## Step: AIC=45612.36
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar + Density + pH
##
## Df Deviance AIC
## <none> 13635 45612
## - pH 1 13638 45613
## - Density 1 13638 45613
## - ResidualSugar 1 13638 45613
## - Sulphates 1 13639 45614
## + INDEX 1 13635 45614
## + FixedAcidity 1 13635 45614
## - Chlorides 1 13640 45615
## - FreeSulfurDioxide 1 13641 45616
## - Alcohol 1 13644 45619
## - CitricAcid 1 13645 45620
## - TotalSulfurDioxide 1 13651 45626
## - VolatileAcidity 1 13656 45631
## - AcidIndex 1 13954 45929
## - LabelAppeal 1 14310 46285
## - STARS 4 19511 51480
##
## Call: glm.nb(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar + Density + pH, data = data,
## init.theta = 40828.34765, link = log)
##
## Coefficients:
## (Intercept) STARS2 STARS3
## 1.428340 0.319746 0.439234
## STARS4 STARS5 LabelAppeal
## 0.556300 -0.766483 0.159204
## AcidIndex VolatileAcidity TotalSulfurDioxide
## -0.079397 -0.033542 0.005287
## CitricAcid Alcohol FreeSulfurDioxide
## 0.021508 0.012191 0.003073
## Chlorides Sulphates ResidualSugar
## -0.012047 -0.014395 0.002797
## Density pH
## -0.293281 -0.010107
##
## Degrees of Freedom: 12794 Total (i.e. Null); 12778 Residual
## Null Deviance: 22860
## Residual Deviance: 13640 AIC: 45610
#build lm model using stepwise approach
linear_model.null = glm(TARGET ~ 1, family = "poisson",data = data)
linear_model.full = glm(TARGET ~ ., family = "poisson",data = data)
step(linear_model.null,
scope = list(upper=linear_model.full),
direction = "both",
data = data)
## Start: AIC=54804.92
## TARGET ~ 1
##
## Df Deviance AIC
## + STARS 4 14691 46643
## + LabelAppeal 1 20868 52814
## + AcidIndex 1 21825 53771
## + VolatileAcidity 1 22749 54695
## + TotalSulfurDioxide 1 22792 54738
## + Alcohol 1 22801 54747
## + FreeSulfurDioxide 1 22811 54757
## + Chlorides 1 22821 54767
## + FixedAcidity 1 22824 54770
## + Sulphates 1 22835 54781
## + CitricAcid 1 22839 54785
## + Density 1 22841 54787
## + ResidualSugar 1 22854 54800
## <none> 22861 54805
## + pH 1 22860 54805
## + INDEX 1 22861 54807
##
## Step: AIC=46643.21
## TARGET ~ STARS
##
## Df Deviance AIC
## + LabelAppeal 1 14060 46014
## + AcidIndex 1 14392 46346
## + VolatileAcidity 1 14657 46611
## + TotalSulfurDioxide 1 14668 46622
## + FreeSulfurDioxide 1 14678 46632
## + Chlorides 1 14680 46634
## + Alcohol 1 14683 46637
## + CitricAcid 1 14684 46638
## + Sulphates 1 14685 46639
## + FixedAcidity 1 14686 46640
## + Density 1 14687 46641
## <none> 14691 46643
## + ResidualSugar 1 14689 46643
## + INDEX 1 14690 46644
## + pH 1 14691 46645
## - STARS 4 22861 54805
##
## Step: AIC=46014.13
## TARGET ~ STARS + LabelAppeal
##
## Df Deviance AIC
## + AcidIndex 1 13716 45672
## + VolatileAcidity 1 14030 45986
## + TotalSulfurDioxide 1 14034 45990
## + Chlorides 1 14049 46005
## + Alcohol 1 14049 46005
## + FreeSulfurDioxide 1 14049 46005
## + FixedAcidity 1 14054 46009
## + Sulphates 1 14054 46010
## + CitricAcid 1 14055 46011
## + Density 1 14055 46011
## + ResidualSugar 1 14057 46013
## <none> 14060 46014
## + INDEX 1 14060 46016
## + pH 1 14060 46016
## - LabelAppeal 1 14691 46643
## - STARS 4 20868 52814
##
## Step: AIC=45672.11
## TARGET ~ STARS + LabelAppeal + AcidIndex
##
## Df Deviance AIC
## + VolatileAcidity 1 13692 45650
## + TotalSulfurDioxide 1 13699 45657
## + CitricAcid 1 13705 45663
## + Alcohol 1 13709 45667
## + FreeSulfurDioxide 1 13709 45667
## + Chlorides 1 13710 45668
## + Sulphates 1 13712 45670
## + pH 1 13713 45671
## + ResidualSugar 1 13713 45671
## + Density 1 13714 45672
## <none> 13716 45672
## + INDEX 1 13716 45674
## + FixedAcidity 1 13716 45674
## - AcidIndex 1 14060 46014
## - LabelAppeal 1 14392 46346
## - STARS 4 19768 51716
##
## Step: AIC=45650.48
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity
##
## Df Deviance AIC
## + TotalSulfurDioxide 1 13677 45637
## + CitricAcid 1 13682 45642
## + Alcohol 1 13685 45645
## + FreeSulfurDioxide 1 13686 45646
## + Chlorides 1 13687 45647
## + Sulphates 1 13688 45648
## + ResidualSugar 1 13690 45650
## + pH 1 13690 45650
## + Density 1 13690 45650
## <none> 13692 45650
## + INDEX 1 13692 45652
## + FixedAcidity 1 13692 45652
## - VolatileAcidity 1 13716 45672
## - AcidIndex 1 14030 45986
## - LabelAppeal 1 14365 46321
## - STARS 4 19698 51648
##
## Step: AIC=45636.82
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide
##
## Df Deviance AIC
## + CitricAcid 1 13667 45629
## + Alcohol 1 13668 45631
## + FreeSulfurDioxide 1 13671 45633
## + Chlorides 1 13671 45633
## + Sulphates 1 13673 45635
## + pH 1 13674 45636
## + Density 1 13674 45636
## + ResidualSugar 1 13674 45636
## <none> 13677 45637
## + INDEX 1 13677 45639
## + FixedAcidity 1 13677 45639
## - TotalSulfurDioxide 1 13692 45650
## - VolatileAcidity 1 13699 45657
## - AcidIndex 1 14006 45964
## - LabelAppeal 1 14351 46309
## - STARS 4 19656 51608
##
## Step: AIC=45628.73
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid
##
## Df Deviance AIC
## + Alcohol 1 13658 45622
## + FreeSulfurDioxide 1 13661 45625
## + Chlorides 1 13661 45625
## + Sulphates 1 13662 45626
## + pH 1 13664 45628
## + Density 1 13664 45628
## + ResidualSugar 1 13664 45628
## <none> 13667 45629
## + INDEX 1 13667 45631
## + FixedAcidity 1 13667 45631
## - CitricAcid 1 13677 45637
## - TotalSulfurDioxide 1 13682 45642
## - VolatileAcidity 1 13688 45648
## - AcidIndex 1 14002 45962
## - LabelAppeal 1 14339 46299
## - STARS 4 19625 51579
##
## Step: AIC=45622.3
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol
##
## Df Deviance AIC
## + FreeSulfurDioxide 1 13652 45618
## + Chlorides 1 13653 45619
## + Sulphates 1 13654 45620
## + ResidualSugar 1 13656 45622
## + pH 1 13656 45622
## + Density 1 13656 45622
## <none> 13658 45622
## + INDEX 1 13658 45624
## + FixedAcidity 1 13658 45624
## - Alcohol 1 13667 45629
## - CitricAcid 1 13668 45631
## - TotalSulfurDioxide 1 13674 45636
## - VolatileAcidity 1 13680 45642
## - AcidIndex 1 13988 45950
## - LabelAppeal 1 14333 46295
## - STARS 4 19581 51537
##
## Step: AIC=45618.29
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide
##
## Df Deviance AIC
## + Chlorides 1 13647 45615
## + Sulphates 1 13648 45616
## + ResidualSugar 1 13650 45618
## + Density 1 13650 45618
## + pH 1 13650 45618
## <none> 13652 45618
## + INDEX 1 13652 45620
## + FixedAcidity 1 13652 45620
## - FreeSulfurDioxide 1 13658 45622
## - Alcohol 1 13661 45625
## - CitricAcid 1 13662 45626
## - TotalSulfurDioxide 1 13668 45632
## - VolatileAcidity 1 13674 45638
## - AcidIndex 1 13979 45943
## - LabelAppeal 1 14325 46289
## - STARS 4 19560 51519
##
## Step: AIC=45615.02
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides
##
## Df Deviance AIC
## + Sulphates 1 13643 45613
## + ResidualSugar 1 13644 45614
## + Density 1 13645 45615
## + pH 1 13645 45615
## <none> 13647 45615
## + INDEX 1 13647 45617
## + FixedAcidity 1 13647 45617
## - Chlorides 1 13652 45618
## - FreeSulfurDioxide 1 13653 45619
## - Alcohol 1 13656 45622
## - CitricAcid 1 13657 45623
## - TotalSulfurDioxide 1 13663 45629
## - VolatileAcidity 1 13668 45634
## - AcidIndex 1 13968 45934
## - LabelAppeal 1 14319 46285
## - STARS 4 19547 51507
##
## Step: AIC=45613.01
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates
##
## Df Deviance AIC
## + ResidualSugar 1 13641 45613
## + Density 1 13641 45613
## + pH 1 13641 45613
## <none> 13643 45613
## + INDEX 1 13643 45615
## + FixedAcidity 1 13643 45615
## - Sulphates 1 13647 45615
## - Chlorides 1 13648 45616
## - FreeSulfurDioxide 1 13649 45617
## - Alcohol 1 13651 45619
## - CitricAcid 1 13653 45621
## - TotalSulfurDioxide 1 13658 45626
## - VolatileAcidity 1 13664 45632
## - AcidIndex 1 13962 45931
## - LabelAppeal 1 14316 46284
## - STARS 4 19534 51496
##
## Step: AIC=45612.59
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar
##
## Df Deviance AIC
## + Density 1 13638 45612
## + pH 1 13638 45612
## <none> 13641 45613
## - ResidualSugar 1 13643 45613
## - Sulphates 1 13644 45614
## + INDEX 1 13640 45615
## + FixedAcidity 1 13641 45615
## - Chlorides 1 13646 45616
## - FreeSulfurDioxide 1 13646 45616
## - Alcohol 1 13649 45619
## - CitricAcid 1 13650 45621
## - TotalSulfurDioxide 1 13656 45626
## - VolatileAcidity 1 13662 45632
## - AcidIndex 1 13960 45930
## - LabelAppeal 1 14314 46284
## - STARS 4 19530 51494
##
## Step: AIC=45612.26
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar + Density
##
## Df Deviance AIC
## + pH 1 13636 45612
## <none> 13638 45612
## - Density 1 13641 45613
## - ResidualSugar 1 13641 45613
## - Sulphates 1 13642 45614
## + INDEX 1 13638 45614
## + FixedAcidity 1 13638 45614
## - Chlorides 1 13643 45615
## - FreeSulfurDioxide 1 13644 45616
## - Alcohol 1 13647 45619
## - CitricAcid 1 13648 45620
## - TotalSulfurDioxide 1 13654 45626
## - VolatileAcidity 1 13660 45631
## - AcidIndex 1 13954 45927
## - LabelAppeal 1 14312 46284
## - STARS 4 19522 51488
##
## Step: AIC=45611.94
## TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar + Density + pH
##
## Df Deviance AIC
## <none> 13636 45612
## - pH 1 13638 45612
## - Density 1 13638 45612
## - ResidualSugar 1 13638 45612
## - Sulphates 1 13640 45614
## + INDEX 1 13636 45614
## + FixedAcidity 1 13636 45614
## - Chlorides 1 13641 45615
## - FreeSulfurDioxide 1 13642 45616
## - Alcohol 1 13644 45618
## - CitricAcid 1 13646 45620
## - TotalSulfurDioxide 1 13651 45625
## - VolatileAcidity 1 13657 45631
## - AcidIndex 1 13954 45928
## - LabelAppeal 1 14311 46285
## - STARS 4 19512 51480
##
## Call: glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + FreeSulfurDioxide +
## Chlorides + Sulphates + ResidualSugar + Density + pH, family = "poisson",
## data = data)
##
## Coefficients:
## (Intercept) STARS2 STARS3
## 1.428325 0.319746 0.439234
## STARS4 STARS5 LabelAppeal
## 0.556298 -0.766485 0.159205
## AcidIndex VolatileAcidity TotalSulfurDioxide
## -0.079394 -0.033540 0.005287
## CitricAcid Alcohol FreeSulfurDioxide
## 0.021507 0.012192 0.003073
## Chlorides Sulphates ResidualSugar
## -0.012046 -0.014394 0.002797
## Density pH
## -0.293275 -0.010107
##
## Degrees of Freedom: 12794 Total (i.e. Null); 12778 Residual
## Null Deviance: 22860
## Residual Deviance: 13640 AIC: 45610
Select optimal model.
#optimal model
final_model <- glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide +
Sulphates + Density + pH, family="gaussian",data = data)
summary(final_model)
##
## Call:
## glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide +
## Sulphates + Density + pH, family = "gaussian", data = data)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.7293 -0.8604 0.0184 0.8488 6.2032
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 3.770114 0.112361 33.554 < 2e-16 ***
## STARS2 1.024907 0.032559 31.478 < 2e-16 ***
## STARS3 1.598058 0.037628 42.469 < 2e-16 ***
## STARS4 2.293681 0.059599 38.485 < 2e-16 ***
## STARS5 -1.361671 0.032894 -41.396 < 2e-16 ***
## LabelAppeal 0.466342 0.013618 34.245 < 2e-16 ***
## AcidIndex -0.198204 0.008986 -22.056 < 2e-16 ***
## VolatileAcidity -0.102730 0.016552 -6.206 5.59e-10 ***
## TotalSulfurDioxide 0.015431 0.002966 5.202 2.00e-07 ***
## CitricAcid 0.065437 0.015394 4.251 2.15e-05 ***
## Alcohol 0.039553 0.009412 4.202 2.66e-05 ***
## Chlorides -0.034384 0.012359 -2.782 0.00541 **
## FreeSulfurDioxide 0.008893 0.002848 3.122 0.00180 **
## Sulphates -0.039273 0.016831 -2.333 0.01965 *
## Density -0.833203 0.436183 -1.910 0.05613 .
## pH -0.024296 0.014984 -1.621 0.10495
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for gaussian family taken to be 1.70242)
##
## Null deviance: 47477 on 12794 degrees of freedom
## Residual deviance: 21755 on 12779 degrees of freedom
## AIC: 43136
##
## Number of Fisher Scoring iterations: 2
Perform residual analysis.
#linearity
plot(final_model$residuals ~ data$TARGET,
main="Residuals vs TARGET")
abline(h = 0, lty = 3) # adds a horizontal dashed line at y = 0
#normal residuals
par(mfrow=c(1,2))
hist(final_model$residuals, probability=TRUE,col="gray", border="white", main="Distribution of residuals")
d <- density(final_model$residuals)
lines(d, col="red")
#normal probability plot
qqnorm(final_model$residuals)
qqline(final_model$residuals)
#constant variability
plot(final_model)
Run likelihood ratio test.
#alternative model
alternative_model <- glm(formula = TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide, data = data)
#Likelihood Ratio Test
anova(final_model, alternative_model, test ="Chisq")
## Analysis of Deviance Table
##
## Model 1: TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide +
## Sulphates + Density + pH
## Model 2: TARGET ~ STARS + LabelAppeal + AcidIndex + VolatileAcidity +
## TotalSulfurDioxide + CitricAcid + Alcohol + Chlorides + FreeSulfurDioxide
## Resid. Df Resid. Dev Df Deviance Pr(>Chi)
## 1 12779 21755
## 2 12782 21776 -3 -20.428 0.007386 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Analize deviance.
#residual deviance test
p_value = 1 - pchisq(final_model$deviance,final_model$df.residual)
p_value
## [1] 0
Hosmer-Lemeshow Test.
hoslem.test(data$TARGET, fitted(final_model))
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: data$TARGET, fitted(final_model)
## X-squared = 354.74, df = 8, p-value < 2.2e-16
Calculate sales for testing data set.
#create dummy varibles for training dataset
data$STARS_2 <- ifelse(data$STARS == "2",1,0)
data$STARS_3 <- ifelse(data$STARS == "3",1,0)
data$STARS_4 <- ifelse(data$STARS == "4",1,0)
data$STARS_5 <- ifelse(data$STARS == "5",1,0)
#create dummy varibles for testing dataset
data_testing$STARS_2 <- ifelse(data_testing$STARS == "2",1,0)
data_testing$STARS_3 <- ifelse(data_testing$STARS == "3",1,0)
data_testing$STARS_4 <- ifelse(data_testing$STARS == "4",1,0)
data_testing$STARS_5 <- ifelse(data_testing$STARS == "5",1,0)
#create the variable 'TARGET_pred' for testimng and training data sets
data$TARGET_pred <- c()
data_testing$TARGET_pred <- c()
#calculate 'TARGET_pred'
data$TARGET_pred <- round(3.778249 + 1.025474*data$STARS_2 + 1.598549*data$STARS_3 + 2.293281*data$STARS_4 - 1.361213*data$STARS_5 + 0.465884*data$LabelAppeal - 0.197816*data$AcidIndex - 0.103156*data$VolatileAcidity + 0.014510*data$TotalSulfurDioxide + 0.065114*data$CitricAcid + 0.037178*data$Alcohol - 0.039618*data$Chlorides + 0.009917*data$FreeSulfurDioxide - 0.036880*data$Sulphates,0)
data_testing$TARGET_pred <- round(3.778249 + 1.025474*data_testing$STARS_2 + 1.598549*data_testing$STARS_3 + 2.293281*data_testing$STARS_4 - 1.361213*data_testing$STARS_5 + 0.465884*data_testing$LabelAppeal - 0.197816*data_testing$AcidIndex - 0.103156*data_testing$VolatileAcidity + 0.014510*data_testing$TotalSulfurDioxide + 0.065114*data_testing$CitricAcid + 0.037178*data_testing$Alcohol - 0.039618*data_testing$Chlorides + 0.009917*data_testing$FreeSulfurDioxide - 0.036880*data_testing$Sulphates,0)
#plot predicted vs actual
plot(predict(final_model),data$TARGET_pred,
xlab="Predicted", ylab="Actual",
main="TARGET. Actual vs Predicted")
abline(a=0,b=1,col="red")
#sagnificance of difference
ks.test(data$TARGET,data$TARGET_pred)
## Warning in ks.test(data$TARGET, data$TARGET_pred): p-value will be
## approximate in the presence of ties
##
## Two-sample Kolmogorov-Smirnov test
##
## data: data$TARGET and data$TARGET_pred
## D = 0.22415, p-value < 2.2e-16
## alternative hypothesis: two-sided
#export testing data file with predicted sales
write.table(data_testing, file = "/Users/olga/downloads/wine-evaluation-data.csv",append = FALSE)