rm(list = ls())
options(repos = "http://cran.rstudio.com/")
packages <- c("psych", # quick summary stats for data exploration,
"stargazer", # summary stats for sharing,
"tidyverse", # data manipulation like selecting variables,
"corrplot", # correlation plots
"ggplot2", # graphing
"data.table", # reshape for graphing
"car", # vif
"dplyr",
"tidyverse",
"knitr",
"reshape2",
"glmnet",
"conflicted",
"xts",
"forecast",
"qqplotr"
)
for (i in 1:length(packages)) {
if (!packages[i] %in% rownames(installed.packages())) {
install.packages(packages[i]
, repos = "http://cran.rstudio.com/"
, dependencies = TRUE
)
}
library(packages[i], character.only = TRUE)
}
## Warning: package 'psych' was built under R version 4.2.3
##
## Please cite as:
## Hlavac, Marek (2022). stargazer: Well-Formatted Regression and Summary Statistics Tables.
## R package version 5.2.3. https://CRAN.R-project.org/package=stargazer
## Warning: package 'tidyverse' was built under R version 4.2.3
## Warning: package 'ggplot2' was built under R version 4.2.3
## Warning: package 'tibble' was built under R version 4.2.3
## Warning: package 'purrr' was built under R version 4.2.3
## Warning: package 'dplyr' was built under R version 4.2.3
## Warning: package 'stringr' was built under R version 4.2.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.1 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ ggplot2::%+%() masks psych::%+%()
## ✖ ggplot2::alpha() masks psych::alpha()
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Warning: package 'corrplot' was built under R version 4.2.3
## corrplot 0.92 loaded
## Warning: package 'data.table' was built under R version 4.2.3
##
## Attaching package: 'data.table'
##
## The following objects are masked from 'package:lubridate':
##
## hour, isoweek, mday, minute, month, quarter, second, wday, week,
## yday, year
##
## The following objects are masked from 'package:dplyr':
##
## between, first, last
##
## The following object is masked from 'package:purrr':
##
## transpose
##
## Loading required package: carData
## Warning: package 'carData' was built under R version 4.2.3
##
## Attaching package: 'car'
##
## The following object is masked from 'package:dplyr':
##
## recode
##
## The following object is masked from 'package:purrr':
##
## some
##
## The following object is masked from 'package:psych':
##
## logit
## Warning: package 'knitr' was built under R version 4.2.3
##
## Attaching package: 'reshape2'
##
## The following objects are masked from 'package:data.table':
##
## dcast, melt
##
## The following object is masked from 'package:tidyr':
##
## smiths
## Warning: package 'glmnet' was built under R version 4.2.3
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 4.2.3
##
## Attaching package: 'Matrix'
##
## The following objects are masked from 'package:tidyr':
##
## expand, pack, unpack
##
## Loaded glmnet 4.1-7
## Warning: package 'xts' was built under R version 4.2.3
## Loading required package: zoo
## Warning: package 'zoo' was built under R version 4.2.3
##
## Attaching package: 'zoo'
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
##
##
## ######################### Warning from 'xts' package ##########################
## # #
## # The dplyr lag() function breaks how base R's lag() function is supposed to #
## # work, which breaks lag(my_xts). Calls to lag(my_xts) that you type or #
## # source() into this session won't work correctly. #
## # #
## # Use stats::lag() to make sure you're not using dplyr::lag(), or you can add #
## # conflictRules('dplyr', exclude = 'lag') to your .Rprofile to stop #
## # dplyr from breaking base R's lag() function. #
## # #
## # Code in packages is not affected. It's protected by R's namespace mechanism #
## # Set `options(xts.warn_dplyr_breaks_lag = FALSE)` to suppress this warning. #
## # #
## ###############################################################################
## Warning: package 'forecast' was built under R version 4.2.3
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Warning: package 'qqplotr' was built under R version 4.2.3
wine.training.data <-read.csv("C:/Users/91976/Downloads/wine-training-data.csv")
summary(wine.training.data)
## INDEX TARGET FixedAcidity VolatileAcidity
## Min. : 1 Min. :0.000 Min. :-18.100 Min. :-2.7900
## 1st Qu.: 4038 1st Qu.:2.000 1st Qu.: 5.200 1st Qu.: 0.1300
## Median : 8110 Median :3.000 Median : 6.900 Median : 0.2800
## Mean : 8070 Mean :3.029 Mean : 7.076 Mean : 0.3241
## 3rd Qu.:12106 3rd Qu.:4.000 3rd Qu.: 9.500 3rd Qu.: 0.6400
## Max. :16129 Max. :8.000 Max. : 34.400 Max. : 3.6800
##
## CitricAcid ResidualSugar Chlorides FreeSulfurDioxide
## Min. :-3.2400 Min. :-127.800 Min. :-1.1710 Min. :-555.00
## 1st Qu.: 0.0300 1st Qu.: -2.000 1st Qu.:-0.0310 1st Qu.: 0.00
## Median : 0.3100 Median : 3.900 Median : 0.0460 Median : 30.00
## Mean : 0.3084 Mean : 5.419 Mean : 0.0548 Mean : 30.85
## 3rd Qu.: 0.5800 3rd Qu.: 15.900 3rd Qu.: 0.1530 3rd Qu.: 70.00
## Max. : 3.8600 Max. : 141.150 Max. : 1.3510 Max. : 623.00
## NA's :616 NA's :638 NA's :647
## TotalSulfurDioxide Density pH Sulphates
## Min. :-823.0 Min. :0.8881 Min. :0.480 Min. :-3.1300
## 1st Qu.: 27.0 1st Qu.:0.9877 1st Qu.:2.960 1st Qu.: 0.2800
## Median : 123.0 Median :0.9945 Median :3.200 Median : 0.5000
## Mean : 120.7 Mean :0.9942 Mean :3.208 Mean : 0.5271
## 3rd Qu.: 208.0 3rd Qu.:1.0005 3rd Qu.:3.470 3rd Qu.: 0.8600
## Max. :1057.0 Max. :1.0992 Max. :6.130 Max. : 4.2400
## NA's :682 NA's :395 NA's :1210
## Alcohol LabelAppeal AcidIndex STARS
## Min. :-4.70 Min. :-2.000000 Min. : 4.000 Min. :1.000
## 1st Qu.: 9.00 1st Qu.:-1.000000 1st Qu.: 7.000 1st Qu.:1.000
## Median :10.40 Median : 0.000000 Median : 8.000 Median :2.000
## Mean :10.49 Mean :-0.009066 Mean : 7.773 Mean :2.042
## 3rd Qu.:12.40 3rd Qu.: 1.000000 3rd Qu.: 8.000 3rd Qu.:3.000
## Max. :26.50 Max. : 2.000000 Max. :17.000 Max. :4.000
## NA's :653 NA's :3359
#Removing the index column
data_t_mod <- wine.training.data [,-1]
library(stargazer)
stargazer(data_t_mod, type="text")
##
## =============================================================
## Statistic N Mean St. Dev. Min Max
## -------------------------------------------------------------
## TARGET 12,795 3.029 1.926 0 8
## FixedAcidity 12,795 7.076 6.318 -18.100 34.400
## VolatileAcidity 12,795 0.324 0.784 -2.790 3.680
## CitricAcid 12,795 0.308 0.862 -3.240 3.860
## ResidualSugar 12,179 5.419 33.749 -127.800 141.150
## Chlorides 12,157 0.055 0.318 -1.171 1.351
## FreeSulfurDioxide 12,148 30.846 148.715 -555.000 623.000
## TotalSulfurDioxide 12,113 120.714 231.913 -823.000 1,057.000
## Density 12,795 0.994 0.027 0.888 1.099
## pH 12,400 3.208 0.680 0.480 6.130
## Sulphates 11,585 0.527 0.932 -3.130 4.240
## Alcohol 12,142 10.489 3.728 -4.700 26.500
## LabelAppeal 12,795 -0.009 0.891 -2 2
## AcidIndex 12,795 7.773 1.324 4 17
## STARS 9,436 2.042 0.903 1 4
## -------------------------------------------------------------
sapply(X=data_t_mod,
FUN= class)
## TARGET FixedAcidity VolatileAcidity CitricAcid
## "integer" "numeric" "numeric" "numeric"
## ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide
## "numeric" "numeric" "numeric" "numeric"
## Density pH Sulphates Alcohol
## "numeric" "numeric" "numeric" "numeric"
## LabelAppeal AcidIndex STARS
## "integer" "integer" "integer"
#Creating Boxplots
library(ggplot2)
library(magrittr)
library(moments)
library(qqplotr)
library(ggplot2)
library(moments)
library(gridExtra)
with(data_t_mod, c(summary(FixedAcidity), SD = sd(FixedAcidity), Skew = skewness(FixedAcidity), Kurt = kurtosis(FixedAcidity)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -18.10000000 5.20000000 6.90000000 7.07571708 9.50000000 34.40000000
## SD Skew Kurt
## 6.31764346 -0.02258861 4.67572951
hist <- ggplot(data_t_mod, aes(FixedAcidity)) +
geom_histogram(fill = 'dodgerblue', binwidth = 4, color = 'darkgray') +
theme_classic() +
labs(title = 'Histogram of FixedAcidity') +
theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(data_t_mod, aes(sample = FixedAcidity)) +
geom_qq(color = 'dodgerblue') +
geom_qq_line(color = 'darkgray') +
labs(x = "Theoretical Quantiles", y = "Sample Quantiles", title = "QQ Plot of FixedAcidity") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(data_t_mod, aes(x = "", y = FixedAcidity)) +
geom_boxplot(fill = 'dodgerblue', color = 'darkgray') +
theme_classic() +
labs(title = 'Boxplot of FixedAcidity', x = "") +
theme(plot.title = element_text(hjust = 0.5)) +
coord_flip()
box_TARGET <- ggplot(data_t_mod, aes(x = factor(TARGET), y = FixedAcidity)) +
geom_boxplot(fill = 'dodgerblue', color = 'darkgray') +
labs(x = 'TARGET', title = 'Boxplot of FixedAcidity by TARGET') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol = 2)
#The box plot below shows that outliners exist in variables FixedAcidity, VolatileAcidity, CitricAcid, ResidualSugar, Chlorides, FreeSulfurDioxide, TotalSulfurDioxide, Density, pH, Sulphates, Alcohol, LabelAppeal andAcidIndex. We use scaled training set to draw the box plot to show the corresponding outliers by ratio
library(ggpubr)
## Warning: package 'ggpubr' was built under R version 4.2.3
hist <- ggplot(data_t_mod, aes(FixedAcidity)) + geom_histogram(fill = 'orange', binwidth = 4, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of FixedAcidity') + theme(plot.title = element_text(hjust = 0.5))
library(ggplot2)
data_t_mod <- data.frame(FixedAcidity = rnorm(100))
qq_plot <- ggplot(data_t_mod, aes(sample = FixedAcidity)) +
geom_qq(color = 'orange') +
geom_abline(color = 'darkgray') +
labs(x = "Theoretical Quantiles", y = "Sample Quantiles", title = "QQ Plot of Fixed Acidity") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
print(qq_plot)
box_plot <- ggplot(data_t_mod, aes(x="", FixedAcidity)) + geom_boxplot(fill='orange', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of FixedAcidity', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
data_t_mod <- wine.training.data
head(data_t_mod[,-1])
## TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1 3 3.2 1.160 -0.98 54.2 -0.567
## 2 3 4.5 0.160 -0.81 26.1 -0.425
## 3 5 7.1 2.640 -0.88 14.8 0.037
## 4 3 5.7 0.385 0.04 18.8 -0.425
## 5 4 8.0 0.330 -1.26 9.4 NA
## 6 0 11.3 0.320 0.59 2.2 0.556
## FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol
## 1 NA 268 0.99280 3.33 -0.59 9.9
## 2 15 -327 1.02792 3.38 0.70 NA
## 3 214 142 0.99518 3.12 0.48 22.0
## 4 22 115 0.99640 2.24 1.83 6.2
## 5 -167 108 0.99457 3.12 1.77 13.7
## 6 -37 15 0.99940 3.20 1.29 15.4
## LabelAppeal AcidIndex STARS
## 1 0 8 2
## 2 -1 7 3
## 3 -1 8 3
## 4 -1 6 1
## 5 0 9 2
## 6 0 11 NA
head(data_t_mod)
## INDEX TARGET FixedAcidity VolatileAcidity CitricAcid ResidualSugar Chlorides
## 1 1 3 3.2 1.160 -0.98 54.2 -0.567
## 2 2 3 4.5 0.160 -0.81 26.1 -0.425
## 3 4 5 7.1 2.640 -0.88 14.8 0.037
## 4 5 3 5.7 0.385 0.04 18.8 -0.425
## 5 6 4 8.0 0.330 -1.26 9.4 NA
## 6 7 0 11.3 0.320 0.59 2.2 0.556
## FreeSulfurDioxide TotalSulfurDioxide Density pH Sulphates Alcohol
## 1 NA 268 0.99280 3.33 -0.59 9.9
## 2 15 -327 1.02792 3.38 0.70 NA
## 3 214 142 0.99518 3.12 0.48 22.0
## 4 22 115 0.99640 2.24 1.83 6.2
## 5 -167 108 0.99457 3.12 1.77 13.7
## 6 -37 15 0.99940 3.20 1.29 15.4
## LabelAppeal AcidIndex STARS
## 1 0 8 2
## 2 -1 7 3
## 3 -1 8 3
## 4 -1 6 1
## 5 0 9 2
## 6 0 11 NA
box_TARGET <- ggplot(data_t_mod, aes(x=factor(TARGET), FixedAcidity)) + geom_boxplot(fill='orange', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of FixedAcidity by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
library(gridExtra)
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
library(qqplotr)
with(data_t_mod, c(summary(Chlorides), SD=sd(Chlorides), Skew=skewness(Chlorides), Kurt=kurtosis(Chlorides)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -1.17100000 -0.03100000 0.04600000 0.05482249 0.15300000 1.35100000
## NA's SD Skew Kurt
## 638.00000000 NA NA NA
hist <- ggplot(data_t_mod, aes(Chlorides)) + geom_histogram(fill = 'maroon', binwidth = .2, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of Chlorides') + theme(plot.title = element_text(hjust = 0.5))
gg_plot <- ggplot(data_t_mod, aes(sample=Chlorides)) + stat_qq_point(color='maroon') + ggplot2::stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of Chlorides") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(data_t_mod, aes(x="", Chlorides)) + geom_boxplot(fill='maroon', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of Chlorides', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(data_t_mod, aes(x=factor(TARGET), Chlorides)) + geom_boxplot(fill='maroon', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of Chlorides by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 638 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 638 rows containing non-finite values (`stat_boxplot()`).
## Removed 638 rows containing non-finite values (`stat_boxplot()`).
##Checking Alcohol Content
library(ggplot2)
library(moments)
with(data_t_mod, c(summary(Alcohol), SD=sd(Alcohol), Skew=skewness(Alcohol), Kurt=kurtosis(Alcohol)))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's SD
## -4.70000 9.00000 10.40000 10.48924 12.40000 26.50000 653.00000 NA
## Skew Kurt
## NA NA
hist <- ggplot(data_t_mod, aes(Alcohol)) + geom_histogram(fill = 'yellow', binwidth = 2, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of Alcohol') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(data_t_mod, aes(sample = Alcohol)) +
geom_qq(color = 'yellow') +
geom_qq_line(color = 'darkgray') +
labs(x = "Theoretical Quantiles", y = "Sample Quantiles", title = "QQ Plot of Alcohol") +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(data_t_mod, aes(x = "", y = Alcohol)) +
geom_boxplot(fill = 'yellow', color = 'darkgray') +
theme_classic() +
labs(title = 'Boxplot of Alcohol', x = "") +
theme(plot.title = element_text(hjust = 0.5)) +
coord_flip()
box_TARGET <- ggplot(data_t_mod, aes(x = factor(TARGET), y = Alcohol)) +
geom_boxplot(fill = 'yellow', color = 'darkgray') +
labs(x = 'TARGET', title = 'Boxplot of Alcohol by TARGET') +
theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol = 2)
## Warning: Removed 653 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 653 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 653 rows containing non-finite values (`stat_qq_line()`).
## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).
## Removed 653 rows containing non-finite values (`stat_boxplot()`).
data_t_mod %>%
select_if(is.numeric) %>%
keep(is.numeric) %>% # Keep only numeric columns
gather() %>% # Convert to key-value pairs
ggplot(aes(x=value)) + # Plot the values
facet_wrap(~key, scales = "free") + # In separate panels
geom_density()
## Warning: Removed 8200 rows containing non-finite values (`stat_density()`).
#The scaled histogram and density plots show that variables AcidIndex is right skewed; AcidIndex, STARS, LabelAppeal and TARGET have multimodal distribution; while most others seem to be normally distrbuted due to the bell curve they display.
drop_na(data_t_mod) %>%
select_if(is.numeric) %>%
cor() %>%
#corrplot(method = "square", type = "upper", order = 'hclust', tl.col = "black", diag = FALSE, bg= 'white', col = colorRampPalette(c('deeppink4','white','steelblue1'))(100))
corrplot.mixed(upper = 'pie', lower = 'number', order = 'hclust', tl.col = "black")
pairs.panels(data_t_mod[1:15])
PerformanceAnalytics::chart.Correlation(data_t_mod, histogram=TRUE, pch=19)
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
## Warning in par(usr): argument 1 does not name a graphical parameter
#The correlation matrix below shows that the response variable TARGET has strong positive relationship (>=0.6) with variables FixedAcidity,CitricAcid,ResidualSugar,Density,Alcohol.
library(ggthemes)
## Warning: package 'ggthemes' was built under R version 4.2.3
data_t_mod %>%
gather(-TARGET, key = "key", value = "ResponseVariables") %>%
ggplot(aes(x = ResponseVariables, y = TARGET)) +
geom_point(size = .5) +
geom_smooth(method='lm',formula=y~x, color = 'dark grey')+
facet_wrap(~ key, scales = "free")+
ggthemes::theme_tufte()+
ylab('Cases Bought')
## Warning: Removed 8200 rows containing non-finite values (`stat_smooth()`).
## Warning: Removed 8200 rows containing missing values (`geom_point()`).
#STARS and LableAppleal look like they have the greatest correlation.
data_t_mod%>%
summarise(across(everything(),~ sum(is.na(.)))) %>%
glimpse()
## Rows: 1
## Columns: 16
## $ INDEX <int> 0
## $ TARGET <int> 0
## $ FixedAcidity <int> 0
## $ VolatileAcidity <int> 0
## $ CitricAcid <int> 0
## $ ResidualSugar <int> 616
## $ Chlorides <int> 638
## $ FreeSulfurDioxide <int> 647
## $ TotalSulfurDioxide <int> 682
## $ Density <int> 0
## $ pH <int> 395
## $ Sulphates <int> 1210
## $ Alcohol <int> 653
## $ LabelAppeal <int> 0
## $ AcidIndex <int> 0
## $ STARS <int> 3359
library(naniar)
## Warning: package 'naniar' was built under R version 4.2.3
vis_miss(data_t_mod)
vis_miss(data_t_mod)
require(mice)
## Loading required package: mice
## Warning: package 'mice' was built under R version 4.2.3
set.seed(999)
sampl = caTools::sample.split(data_t_mod$TARGET, SplitRatio = .80)
wine_train1 <- subset(data_t_mod, sampl == TRUE)
wine_test1 <- subset(data_t_mod, sampl == FALSE)
wine_train2 <- as.data.frame(tidyr::complete(mice(wine_train1, m=1, maxit = 5, seed = 42)))
##
## iter imp variable
## 1 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 2 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 3 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 4 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 5 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
wine_test2 <- as.data.frame(tidyr::complete(mice(wine_test1, m=1, maxit = 5, seed = 42)))
##
## iter imp variable
## 1 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 2 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 3 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 4 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 5 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
#Given the low correlation between AcidIndex and TARGET it might not make a huge difference, however, we will log transform it to test.
wine_train2$AcidIndex <- log(wine_train2$AcidIndex)
wine_test2$AcidIndex <- log(wine_test2$AcidIndex)
require(ggplot2)
require(gridExtra)
wine_train1 <- wine_train1[,-1]
model1 = glm(TARGET ~ ., data=wine_train1, family=poisson)
summary(model1)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = wine_train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2128 -0.2757 0.0647 0.3766 1.6981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.608e+00 2.796e-01 5.750 8.90e-09 ***
## FixedAcidity 6.705e-04 1.177e-03 0.570 0.56901
## VolatileAcidity -2.750e-02 9.283e-03 -2.963 0.00305 **
## CitricAcid -3.835e-03 8.519e-03 -0.450 0.65259
## ResidualSugar 1.828e-05 2.152e-04 0.085 0.93232
## Chlorides -3.764e-02 2.314e-02 -1.627 0.10377
## FreeSulfurDioxide 5.671e-05 4.892e-05 1.159 0.24630
## TotalSulfurDioxide 2.230e-05 3.177e-05 0.702 0.48274
## Density -4.025e-01 2.749e-01 -1.464 0.14326
## pH 2.307e-04 1.085e-02 0.021 0.98303
## Sulphates -5.984e-03 7.973e-03 -0.751 0.45293
## Alcohol 3.262e-03 2.004e-03 1.628 0.10360
## LabelAppeal 1.730e-01 8.858e-03 19.530 < 2e-16 ***
## AcidIndex -4.967e-02 6.666e-03 -7.451 9.28e-14 ***
## STARS 1.929e-01 8.328e-03 23.160 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4720.5 on 5143 degrees of freedom
## Residual deviance: 3242.8 on 5129 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18545
##
## Number of Fisher Scoring iterations: 5
plot(model1)
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 653 rows containing non-finite values (`stat_bin()`).
## Warning: Removed 653 rows containing non-finite values (`stat_qq()`).
## Warning: Removed 653 rows containing non-finite values (`stat_qq_line()`).
## Warning: Removed 653 rows containing non-finite values (`stat_boxplot()`).
## Removed 653 rows containing non-finite values (`stat_boxplot()`).
model2 = glm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol, data=wine_train1, family=poisson)
summary(model2)
##
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density -
## pH - Sulphates - Alcohol, family = poisson, data = wine_train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1898 -0.2777 0.0622 0.3764 1.6086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.251442 0.054724 22.868 < 2e-16 ***
## VolatileAcidity -0.027581 0.009278 -2.973 0.00295 **
## LabelAppeal 0.173177 0.008853 19.562 < 2e-16 ***
## AcidIndex -0.050616 0.006553 -7.724 1.13e-14 ***
## STARS 0.194208 0.008292 23.421 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4720.5 on 5143 degrees of freedom
## Residual deviance: 3253.1 on 5139 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18535
##
## Number of Fisher Scoring iterations: 5
plot(model2)
model3 = glm(TARGET ~ ., data=wine_train2, family=poisson)
summary(model3)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = wine_train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8881 -0.6747 0.1302 0.6374 2.4015
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.248e+00 2.287e-01 9.830 < 2e-16 ***
## INDEX 8.312e-07 1.220e-06 0.681 0.495737
## FixedAcidity 8.840e-05 9.192e-04 0.096 0.923380
## VolatileAcidity -4.304e-02 7.279e-03 -5.912 3.37e-09 ***
## CitricAcid 8.337e-03 6.572e-03 1.269 0.204606
## ResidualSugar 1.243e-04 1.675e-04 0.742 0.458154
## Chlorides -6.422e-02 1.789e-02 -3.590 0.000330 ***
## FreeSulfurDioxide 1.339e-04 3.809e-05 3.515 0.000439 ***
## TotalSulfurDioxide 8.710e-05 2.460e-05 3.541 0.000398 ***
## Density -2.978e-01 2.144e-01 -1.389 0.164915
## pH -1.902e-02 8.413e-03 -2.261 0.023774 *
## Sulphates -1.234e-02 6.173e-03 -1.999 0.045630 *
## Alcohol 2.379e-03 1.555e-03 1.530 0.126110
## LabelAppeal 1.436e-01 6.770e-03 21.217 < 2e-16 ***
## AcidIndex -7.520e-01 4.001e-02 -18.798 < 2e-16 ***
## STARS 3.430e-01 6.248e-03 54.900 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18291 on 10236 degrees of freedom
## Residual deviance: 12778 on 10221 degrees of freedom
## AIC: 38368
##
## Number of Fisher Scoring iterations: 5
plot(model3)
model4 = glm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Density-Alcohol, data=wine_train2, family=poisson)
summary(model4)
##
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Density - Alcohol, family = poisson, data = wine_train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8904 -0.6794 0.1272 0.6380 2.3916
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.984e+00 8.912e-02 22.257 < 2e-16 ***
## INDEX 8.686e-07 1.220e-06 0.712 0.476446
## VolatileAcidity -4.328e-02 7.278e-03 -5.946 2.74e-09 ***
## Chlorides -6.556e-02 1.788e-02 -3.667 0.000245 ***
## FreeSulfurDioxide 1.320e-04 3.807e-05 3.468 0.000525 ***
## TotalSulfurDioxide 8.613e-05 2.457e-05 3.505 0.000457 ***
## pH -1.929e-02 8.410e-03 -2.293 0.021840 *
## Sulphates -1.227e-02 6.168e-03 -1.989 0.046736 *
## LabelAppeal 1.437e-01 6.771e-03 21.216 < 2e-16 ***
## AcidIndex -7.540e-01 3.936e-02 -19.154 < 2e-16 ***
## STARS 3.440e-01 6.231e-03 55.206 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18291 on 10236 degrees of freedom
## Residual deviance: 12785 on 10226 degrees of freedom
## AIC: 38365
##
## Number of Fisher Scoring iterations: 5
plot(model4)
library(MASS)
## Warning: package 'MASS' was built under R version 4.2.3
model5 <- glm.nb(TARGET ~ ., data = wine_train1)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model5)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = wine_train1, init.theta = 138898.9107,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2127 -0.2757 0.0647 0.3766 1.6981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.608e+00 2.796e-01 5.750 8.91e-09 ***
## FixedAcidity 6.705e-04 1.177e-03 0.570 0.56900
## VolatileAcidity -2.750e-02 9.283e-03 -2.963 0.00305 **
## CitricAcid -3.835e-03 8.519e-03 -0.450 0.65259
## ResidualSugar 1.828e-05 2.152e-04 0.085 0.93231
## Chlorides -3.764e-02 2.314e-02 -1.627 0.10378
## FreeSulfurDioxide 5.671e-05 4.892e-05 1.159 0.24630
## TotalSulfurDioxide 2.230e-05 3.177e-05 0.702 0.48275
## Density -4.025e-01 2.750e-01 -1.464 0.14326
## pH 2.307e-04 1.085e-02 0.021 0.98303
## Sulphates -5.984e-03 7.973e-03 -0.751 0.45293
## Alcohol 3.262e-03 2.004e-03 1.628 0.10360
## LabelAppeal 1.730e-01 8.858e-03 19.529 < 2e-16 ***
## AcidIndex -4.967e-02 6.666e-03 -7.451 9.28e-14 ***
## STARS 1.929e-01 8.328e-03 23.160 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(138898.9) family taken to be 1)
##
## Null deviance: 4720.4 on 5143 degrees of freedom
## Residual deviance: 3242.7 on 5129 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18547
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 138899
## Std. Err.: 259921
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18515.07
plot(model5)
library(MASS)
model6 <- glm.nb(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol, data = wine_train1)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model6)
##
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density -
## pH - Sulphates - Alcohol, data = wine_train1, init.theta = 138402.5261,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1898 -0.2777 0.0622 0.3764 1.6086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.251443 0.054725 22.868 < 2e-16 ***
## VolatileAcidity -0.027581 0.009279 -2.973 0.00295 **
## LabelAppeal 0.173177 0.008853 19.562 < 2e-16 ***
## AcidIndex -0.050616 0.006553 -7.724 1.13e-14 ***
## STARS 0.194209 0.008292 23.421 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(138402.5) family taken to be 1)
##
## Null deviance: 4720.4 on 5143 degrees of freedom
## Residual deviance: 3253.0 on 5139 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18537
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 138403
## Std. Err.: 258834
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18525.37
plot(model6)
## Model 7 : Negative Binomial with imputations
model7 <- glm.nb(TARGET ~ ., data = wine_train2)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model7)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = wine_train2, init.theta = 49133.51129,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8880 -0.6747 0.1302 0.6374 2.4015
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.248e+00 2.287e-01 9.830 < 2e-16 ***
## INDEX 8.311e-07 1.220e-06 0.681 0.495762
## FixedAcidity 8.842e-05 9.192e-04 0.096 0.923369
## VolatileAcidity -4.304e-02 7.280e-03 -5.912 3.38e-09 ***
## CitricAcid 8.337e-03 6.572e-03 1.269 0.204616
## ResidualSugar 1.243e-04 1.676e-04 0.742 0.458148
## Chlorides -6.422e-02 1.789e-02 -3.590 0.000330 ***
## FreeSulfurDioxide 1.339e-04 3.809e-05 3.515 0.000440 ***
## TotalSulfurDioxide 8.710e-05 2.460e-05 3.541 0.000399 ***
## Density -2.978e-01 2.144e-01 -1.389 0.164928
## pH -1.902e-02 8.413e-03 -2.261 0.023775 *
## Sulphates -1.234e-02 6.173e-03 -1.999 0.045628 *
## Alcohol 2.379e-03 1.555e-03 1.530 0.126130
## LabelAppeal 1.436e-01 6.770e-03 21.216 < 2e-16 ***
## AcidIndex -7.521e-01 4.001e-02 -18.798 < 2e-16 ***
## STARS 3.430e-01 6.248e-03 54.899 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(49133.51) family taken to be 1)
##
## Null deviance: 18290 on 10236 degrees of freedom
## Residual deviance: 12778 on 10221 degrees of freedom
## AIC: 38371
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 49134
## Std. Err.: 63329
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -38336.61
plot(model7)
model8 <- glm.nb(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Density-Alcohol, data = wine_train2)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace =
## control$trace > : iteration limit reached
summary(model8)
##
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Density - Alcohol, data = wine_train2, init.theta = 49062.73041,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8903 -0.6794 0.1272 0.6380 2.3916
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.984e+00 8.913e-02 22.256 < 2e-16 ***
## INDEX 8.685e-07 1.220e-06 0.712 0.476472
## VolatileAcidity -4.328e-02 7.278e-03 -5.946 2.74e-09 ***
## Chlorides -6.557e-02 1.788e-02 -3.667 0.000245 ***
## FreeSulfurDioxide 1.320e-04 3.807e-05 3.468 0.000525 ***
## TotalSulfurDioxide 8.613e-05 2.457e-05 3.505 0.000457 ***
## pH -1.929e-02 8.410e-03 -2.293 0.021841 *
## Sulphates -1.227e-02 6.168e-03 -1.989 0.046734 *
## LabelAppeal 1.437e-01 6.771e-03 21.216 < 2e-16 ***
## AcidIndex -7.540e-01 3.937e-02 -19.154 < 2e-16 ***
## STARS 3.440e-01 6.231e-03 55.205 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(49062.73) family taken to be 1)
##
## Null deviance: 18290 on 10236 degrees of freedom
## Residual deviance: 12784 on 10226 degrees of freedom
## AIC: 38367
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 49063
## Std. Err.: 63266
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -38343.11
plot(model8)
model9 <- lm(TARGET ~ ., data = wine_train2)
summary(model9)
##
## Call:
## lm(formula = TARGET ~ ., data = wine_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2761 -1.0236 0.1638 1.0311 4.2527
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.704e+00 5.559e-01 10.261 < 2e-16 ***
## INDEX 1.755e-06 2.999e-06 0.585 0.558452
## FixedAcidity 6.125e-04 2.244e-03 0.273 0.784855
## VolatileAcidity -1.236e-01 1.784e-02 -6.929 4.51e-12 ***
## CitricAcid 2.473e-02 1.623e-02 1.524 0.127555
## ResidualSugar 3.463e-04 4.116e-04 0.841 0.400181
## Chlorides -2.005e-01 4.387e-02 -4.569 4.95e-06 ***
## FreeSulfurDioxide 3.669e-04 9.364e-05 3.919 8.97e-05 ***
## TotalSulfurDioxide 2.260e-04 5.996e-05 3.770 0.000164 ***
## Density -7.213e-01 5.239e-01 -1.377 0.168644
## pH -4.485e-02 2.060e-02 -2.177 0.029498 *
## Sulphates -3.526e-02 1.515e-02 -2.326 0.020013 *
## Alcohol 1.054e-02 3.784e-03 2.785 0.005359 **
## LabelAppeal 4.400e-01 1.634e-02 26.932 < 2e-16 ***
## AcidIndex -2.016e+00 9.211e-02 -21.887 < 2e-16 ***
## STARS 1.174e+00 1.658e-02 70.801 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 10221 degrees of freedom
## Multiple R-squared: 0.465, Adjusted R-squared: 0.4642
## F-statistic: 592.2 on 15 and 10221 DF, p-value: < 2.2e-16
plot(model9)
#As we know the significant variables are FixedAcidity, CitricAcid and ResidualSugar, so using the same in the Linear Regression Model and applying the same of imputed training data.
model10 <- lm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar, data = wine_train2)
summary(model10)
##
## Call:
## lm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar,
## data = wine_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.2776 -1.0217 0.1651 1.0288 4.2686
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.688e+00 5.557e-01 10.236 < 2e-16 ***
## INDEX 1.780e-06 2.999e-06 0.593 0.552956
## VolatileAcidity -1.243e-01 1.783e-02 -6.971 3.33e-12 ***
## Chlorides -2.018e-01 4.387e-02 -4.601 4.26e-06 ***
## FreeSulfurDioxide 3.686e-04 9.363e-05 3.937 8.30e-05 ***
## TotalSulfurDioxide 2.279e-04 5.994e-05 3.802 0.000144 ***
## Density -7.229e-01 5.239e-01 -1.380 0.167688
## pH -4.481e-02 2.060e-02 -2.175 0.029623 *
## Sulphates -3.574e-02 1.515e-02 -2.360 0.018311 *
## Alcohol 1.057e-02 3.782e-03 2.796 0.005188 **
## LabelAppeal 4.399e-01 1.634e-02 26.929 < 2e-16 ***
## AcidIndex -2.001e+00 9.047e-02 -22.122 < 2e-16 ***
## STARS 1.175e+00 1.658e-02 70.852 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.41 on 10224 degrees of freedom
## Multiple R-squared: 0.4648, Adjusted R-squared: 0.4642
## F-statistic: 740 on 12 and 10224 DF, p-value: < 2.2e-16
plot(model10)
polrDF <- wine_train2
polrDF$TARGET <- as.factor(polrDF$TARGET)
model11 <- polr(TARGET ~ ., data = polrDF, Hess=TRUE)
summary(model11)
## Warning in sqrt(diag(vc)): NaNs produced
## Call:
## polr(formula = TARGET ~ ., data = polrDF, Hess = TRUE)
##
## Coefficients:
## Value Std. Error t value
## INDEX 2.267e-06 NaN NaN
## FixedAcidity 1.573e-03 2.836e-03 0.5547
## VolatileAcidity -1.527e-01 2.322e-02 -6.5747
## CitricAcid 2.440e-02 2.101e-02 1.1615
## ResidualSugar 2.614e-04 5.322e-04 0.4911
## Chlorides -2.526e-01 6.045e-04 -417.7833
## FreeSulfurDioxide 4.674e-04 1.215e-04 3.8460
## TotalSulfurDioxide 2.348e-04 7.795e-05 3.0128
## Density -1.147e+00 1.954e-03 -587.2133
## pH -2.790e-02 1.333e-02 -2.0925
## Sulphates -2.635e-02 1.977e-02 -1.3330
## Alcohol 2.396e-02 4.571e-03 5.2406
## LabelAppeal 8.398e-01 2.338e-02 35.9220
## AcidIndex -2.590e+00 4.202e-03 -616.3190
## STARS 1.474e+00 2.209e-02 66.7088
##
## Intercepts:
## Value Std. Error t value
## 0|1 -5.6148 0.0020 -2868.0237
## 1|2 -5.4771 0.0020 -2762.0888
## 2|3 -4.8713 0.0051 -958.1218
## 3|4 -3.5027 0.0263 -133.4228
## 4|5 -1.6534 0.0391 -42.2503
## 5|6 0.3167 0.0560 5.6557
## 6|7 2.5198 0.0558 45.1705
## 7|8 4.8597 0.0558 87.1480
##
## Residual Deviance: 29979.83
## AIC: 30025.83
#Zero inflation understands that some Poisson distrobutions are dominated by many zeros. As such it corrects for this. This is one of the most promissing ones because as we saw in our data exploration, there were more zeros, and then normally distributed data after that.
options(repos = "http://cran.rstudio.com/")
library(pscl)
## Warning: package 'pscl' was built under R version 4.2.3
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
wine_train2 <- wine_train2 [,-1]
model12 <- zeroinfl(TARGET ~ . | STARS, data = wine_train2, dist = 'negbin')
summary(model12)
##
## Call:
## zeroinfl(formula = TARGET ~ . | STARS, data = wine_train2, dist = "negbin")
##
## Pearson residuals:
## Min 1Q Median 3Q Max
## -2.14654 -0.50114 0.06471 0.48537 2.09398
##
## Count model coefficients (negbin with log link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.770e+00 2.383e-01 7.427 1.11e-13 ***
## FixedAcidity 2.527e-04 9.455e-04 0.267 0.789258
## VolatileAcidity -1.858e-02 7.550e-03 -2.461 0.013839 *
## CitricAcid 2.031e-03 6.710e-03 0.303 0.762105
## ResidualSugar -1.932e-05 1.720e-04 -0.112 0.910584
## Chlorides -3.326e-02 1.855e-02 -1.793 0.073026 .
## FreeSulfurDioxide 4.256e-05 3.864e-05 1.102 0.270632
## TotalSulfurDioxide -2.630e-06 2.464e-05 -0.107 0.914998
## Density -2.677e-01 2.220e-01 -1.206 0.227917
## pH 5.240e-04 8.735e-03 0.060 0.952162
## Sulphates -3.152e-03 6.400e-03 -0.492 0.622431
## Alcohol 6.047e-03 1.589e-03 3.805 0.000142 ***
## LabelAppeal 2.258e-01 7.069e-03 31.937 < 2e-16 ***
## AcidIndex -2.602e-01 4.478e-02 -5.810 6.24e-09 ***
## STARS 1.221e-01 6.981e-03 17.488 < 2e-16 ***
## Log(theta) 1.981e+01 1.382e+00 14.329 < 2e-16 ***
##
## Zero-inflation model coefficients (binomial with logit link):
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.5836 0.1248 20.70 <2e-16 ***
## STARS -2.9073 0.1126 -25.82 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Theta = 400975735.8308
## Number of iterations in BFGS optimization: 24
## Log-likelihood: -1.717e+04 on 18 Df
scatterPreds <- predict(model12, wine_train2)
qplot(wine_train2$TARGET, scatterPreds, main = 'Predicted vs Actual') + ggthemes::theme_tufte()
## Warning: `qplot()` was deprecated in ggplot2 3.4.0.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
residPlot <- scatterPreds - wine_train2$TARGET
qplot(wine_train2$TARGET, residPlot, main = 'Residuals') + ggthemes::theme_tufte()
aic1 <- model1$aic
aic2 <- model2$aic
aic3 <- model3$aic
aic4 <- model4$aic
aic5 <- model5$aic
aic6 <- model6$aic
aic7 <- model7$aic
aic8 <- model8$aic
aic9 <- model9$aic
aic10 <- model10$aic
aic11 <- model11$aic
aic12 <- model12$aic
mse1 <- mean((wine_train2$TARGET - predict(model1))^2)
## Warning in wine_train2$TARGET - predict(model1): longer object length is not a
## multiple of shorter object length
mse2 <- mean((wine_train2$TARGET - predict(model2))^2)
## Warning in wine_train2$TARGET - predict(model2): longer object length is not a
## multiple of shorter object length
mse3 <- mean((wine_train2$TARGET - predict(model3))^2)
mse4 <- mean((wine_train2$TARGET - predict(model4))^2)
mse5 <- mean((wine_train2$TARGET - predict(model5))^2)
## Warning in wine_train2$TARGET - predict(model5): longer object length is not a
## multiple of shorter object length
mse6 <- mean((wine_train2$TARGET - predict(model6))^2)
## Warning in wine_train2$TARGET - predict(model6): longer object length is not a
## multiple of shorter object length
mse7 <- mean((wine_train2$TARGET - predict(model7))^2)
mse8 <- mean((wine_train2$TARGET - predict(model8))^2)
mse9 <- mean((wine_train2$TARGET - predict(model9))^2)
mse10 <- mean((wine_train2$TARGET - predict(model10))^2)
mse11 <- mean((wine_train2$TARGET - predict(model11))^2)
## Warning in Ops.factor(wine_train2$TARGET, predict(model11)): '-' not meaningful
## for factors
mse12 <- mean((wine_train2$TARGET - predict(model12))^2)
compare_aic_mse <- matrix(c(mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, mse10, mse11, mse12,
aic1, aic2, aic3, aic4, aic5, aic6, aic7, aic8, aic9, aic10, aic11, aic12),nrow=12,ncol=2,byrow=TRUE)
## Warning in matrix(c(mse1, mse2, mse3, mse4, mse5, mse6, mse7, mse8, mse9, :
## data length [20] is not a sub-multiple or multiple of the number of rows [12]
rownames(compare_aic_mse) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7","Model8","Model9","Model10","Model11","Model12")
colnames(compare_aic_mse) <- c("MSE","AIC")
library(kableExtra)
## Warning: package 'kableExtra' was built under R version 4.2.3
kable(compare_aic_mse) %>%
kable_styling(full_width = T)
| MSE | AIC | |
|---|---|---|
| Model1 | 6.929787 | 6.926722 |
| Model2 | 6.843881 | 6.844564 |
| Model3 | 6.929788 | 6.926723 |
| Model4 | 6.843877 | 6.844560 |
| Model5 | 1.985748 | 1.986345 |
| Model6 | NA | 1.952898 |
| Model7 | 18544.983192 | 18535.285631 |
| Model8 | 38368.436016 | 38364.927425 |
| Model9 | 18547.067808 | 18537.370350 |
| Model10 | 38370.614365 | 38367.105614 |
| Model11 | 6.929787 | 6.926722 |
| Model12 | 6.843881 | 6.844564 |
modelValidation <- function(mod){
preds = predict(mod, wine_test2)
diffMat = as.numeric(preds) - as.numeric(wine_test2$TARGET)
diffMat = diffMat^2
loss <- mean(diffMat)
return(loss)
}
compare_models <- matrix(c(modelValidation(model1),modelValidation(model2),modelValidation(model3),modelValidation(model4),modelValidation(model5),modelValidation(model6),
modelValidation(model7),modelValidation(model8),modelValidation(model9),modelValidation(model10),modelValidation(model11),modelValidation(model12)),
nrow=12,ncol=1,byrow=TRUE)
rownames(compare_models) <- c("Model1","Model2","Model3","Model4","Model5","Model6","Model7","Model8","Model9","Model10","Model11","Model12")
colnames(compare_models) <- c("Loss:")
compare_models <- as.data.frame(compare_models)
compare_models
## Loss:
## Model1 5.477096
## Model2 5.461739
## Model3 6.839470
## Model4 6.840874
## Model5 5.477090
## Model6 5.461734
## Model7 6.839466
## Model8 6.840870
## Model9 2.043590
## Model10 2.043377
## Model11 3.680219
## Model12 2.034109
#A regular Poisson regression does not perform very well. #The same can be said for the Negative Binomial. #The linear model actually performs very well. #Very surprisinly, Ordinal Logistic Regression does not work as well as the linear model.
#Using sqaured loss to check accuracy of the model
#Based on this metric, Zero Poission Inflation is the most accurate. #From the above models, Model12 - Zero Poission Inflations has least loss as it uses less variables and is parsimonious. Also the R2 looks fine. The squared loss is also fine
#In terms of MSE and AIC, Model 5 is best followed by Model 4 and tied Model 2 and Model 12. #In terms of Loss, Model 12 is best followed closely by Model 9 and Model 10.
#Overall, we choose Zero Poission Inflation due to following : - least loss - good MSE score - good AIC score
require(mice)
library(dplyr)
library(moments) # Load the package
data_e_mod <- read.csv("C:/Users/91976/Downloads/wine-evaluation-data.csv")
wine_eval <- as.data.frame(complete(data_e_mod))
wine_eval$AcidIndex <- log(wine_eval$AcidIndex)
wine_eval$TARGET <- predict(model12, newdata=wine_eval)
write.csv(wine_eval,"Evaluation_Full_Data.csv", row.names=FALSE)
data_predicted_eval <- read.csv("Evaluation_Full_Data.csv")
data_predicted_eval <- na.omit(data_predicted_eval) # Remove rows with missing values
DT::datatable(data_predicted_eval)
options(width=100)
round(with(data_predicted_eval, c(summary(TARGET), StdD=sd(TARGET), Skew=skewness(TARGET), Kurt=kurtosis(TARGET))),2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. StdD Skew Kurt
## 1.06 1.98 3.40 3.35 4.29 8.25 1.40 0.44 2.64