In this homework assignment, you will explore, analyze and model a data set containing information on approximately 12,000 commercially available wines. The variables are mostly related to the chemical properties of the wine being sold. The response variable is the number of sample cases of wine that were purchased by wine distribution companies after sampling a wine. These cases would be used to provide tasting samples to restaurants and wine stores around the United States. The more sample cases purchased, the more likely is a wine to be sold at a high end restaurant. A large wine manufacturer is studying the data in order to predict the number of wine cases ordered based upon the wine characteristics. If the wine manufacturer can predict the number of cases, then that manufacturer will be able to adjust their wine offering to maximize sales. Your objective is to build a count regression model to predict the number of cases of wine that will be sold given certain properties of the wine. HINT: Sometimes, the fact that a variable is missing is actually predictive of the target. You can only use the variables given to you (or variables that you derive from the variables provided).
library(readr)
library(kableExtra)
library(tidyverse)
## -- Attaching packages -------------------------------------------------------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.5
## v tibble 1.4.2 v dplyr 0.7.5
## v tidyr 0.8.1 v stringr 1.3.1
## v ggplot2 2.2.1 v forcats 0.3.0
## -- Conflicts ----------------------------------------------------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(knitr)
library(psych)
##
## Attaching package: 'psych'
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(usdm)
## Loading required package: sp
## Loading required package: raster
##
## Attaching package: 'raster'
## The following object is masked from 'package:dplyr':
##
## select
## The following object is masked from 'package:tidyr':
##
## extract
library(mice)
## Loading required package: lattice
##
## Attaching package: 'mice'
## The following object is masked from 'package:tidyr':
##
## complete
## The following objects are masked from 'package:base':
##
## cbind, rbind
library(ggiraph)
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
library(reshape2)
##
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
##
## smiths
library(corrgram)
library(caTools)
library(caret)
##
## Attaching package: 'caret'
## The following object is masked from 'package:purrr':
##
## lift
library(ROCR)
## Loading required package: gplots
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
library(reshape2)
library(Amelia)
## Loading required package: Rcpp
## ##
## ## Amelia II: Multiple Imputation
## ## (Version 1.7.5, built: 2018-05-07)
## ## Copyright (C) 2005-2018 James Honaker, Gary King and Matthew Blackwell
## ## Refer to http://gking.harvard.edu/amelia/ for more information
## ##
library(qqplotr)
library(moments)
library(car)
## Warning: package 'car' was built under R version 3.5.1
## Loading required package: carData
##
## Attaching package: 'car'
## The following object is masked from 'package:usdm':
##
## vif
## The following object is masked from 'package:psych':
##
## logit
## The following object is masked from 'package:dplyr':
##
## recode
## The following object is masked from 'package:purrr':
##
## some
library(MASS)
## Warning: package 'MASS' was built under R version 3.5.1
##
## Attaching package: 'MASS'
## The following objects are masked from 'package:raster':
##
## area, select
## The following object is masked from 'package:dplyr':
##
## select
library(geoR)
## Warning: package 'geoR' was built under R version 3.5.1
## --------------------------------------------------------------
## Analysis of Geostatistical Data
## For an Introduction to geoR go to http://www.leg.ufpr.br/geoR
## geoR version 1.7-5.2 (built on 2016-05-02) is now loaded
## --------------------------------------------------------------
library(pander)
## Warning: package 'pander' was built under R version 3.5.1
wine_train <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-5/master/wine_training_data.csv", stringsAsFactors = FALSE)
do_factors <- function(wine_instance){
wine_instance <- within(wine_instance, {
LabelAppeal <- factor(LabelAppeal)
AcidIndex <- factor(AcidIndex)
STARS <- factor(STARS)
})
return (wine_instance)
}
summary(wine_train)
## ï..INDEX TARGET FixedAcidity VolatileAcidity
## Min. : 1 Min. :0.000 Min. :-18.100 Min. :-2.7900
## 1st Qu.: 4038 1st Qu.:2.000 1st Qu.: 5.200 1st Qu.: 0.1300
## Median : 8110 Median :3.000 Median : 6.900 Median : 0.2800
## Mean : 8070 Mean :3.029 Mean : 7.076 Mean : 0.3241
## 3rd Qu.:12106 3rd Qu.:4.000 3rd Qu.: 9.500 3rd Qu.: 0.6400
## Max. :16129 Max. :8.000 Max. : 34.400 Max. : 3.6800
##
## CitricAcid ResidualSugar Chlorides FreeSulfurDioxide
## Min. :-3.2400 Min. :-127.800 Min. :-1.1710 Min. :-555.00
## 1st Qu.: 0.0300 1st Qu.: -2.000 1st Qu.:-0.0310 1st Qu.: 0.00
## Median : 0.3100 Median : 3.900 Median : 0.0460 Median : 30.00
## Mean : 0.3084 Mean : 5.419 Mean : 0.0548 Mean : 30.85
## 3rd Qu.: 0.5800 3rd Qu.: 15.900 3rd Qu.: 0.1530 3rd Qu.: 70.00
## Max. : 3.8600 Max. : 141.150 Max. : 1.3510 Max. : 623.00
## NA's :616 NA's :638 NA's :647
## TotalSulfurDioxide Density pH Sulphates
## Min. :-823.0 Min. :0.8881 Min. :0.480 Min. :-3.1300
## 1st Qu.: 27.0 1st Qu.:0.9877 1st Qu.:2.960 1st Qu.: 0.2800
## Median : 123.0 Median :0.9945 Median :3.200 Median : 0.5000
## Mean : 120.7 Mean :0.9942 Mean :3.208 Mean : 0.5271
## 3rd Qu.: 208.0 3rd Qu.:1.0005 3rd Qu.:3.470 3rd Qu.: 0.8600
## Max. :1057.0 Max. :1.0992 Max. :6.130 Max. : 4.2400
## NA's :682 NA's :395 NA's :1210
## Alcohol LabelAppeal AcidIndex STARS
## Min. :-4.70 Min. :-2.000000 Min. : 4.000 Min. :1.000
## 1st Qu.: 9.00 1st Qu.:-1.000000 1st Qu.: 7.000 1st Qu.:1.000
## Median :10.40 Median : 0.000000 Median : 8.000 Median :2.000
## Mean :10.49 Mean :-0.009066 Mean : 7.773 Mean :2.042
## 3rd Qu.:12.40 3rd Qu.: 1.000000 3rd Qu.: 8.000 3rd Qu.:3.000
## Max. :26.50 Max. : 2.000000 Max. :17.000 Max. :4.000
## NA's :653 NA's :3359
Removing the Index column:
wine_train <- wine_train[,-c(1)]
There are 12795 observations and 16 variables. Each wine has 14 potential predictor variables, and 1 response variable. The response variable is “TARGET”, which is the number of cases purchased.
Visual Exploration:
Let’s dig into our available variables.
AcidIndex - Proprietary method of testing total acidity of wine by using a weighted average.
with(wine_train, c(summary(AcidIndex), SD=sd(AcidIndex), Skew=skewness(AcidIndex), Kurt=kurtosis(AcidIndex)))
## Min. 1st Qu. Median Mean 3rd Qu. Max. SD
## 4.000000 7.000000 8.000000 7.772724 8.000000 17.000000 1.323926
## Skew Kurt
## 1.648689 8.191373
hist <- ggplot(wine_train, aes(AcidIndex)) + geom_histogram(fill = 'dodgerblue', binwidth = 2, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of AcidIndex') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=AcidIndex)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of AcidIndex") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", AcidIndex)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of AcidIndex', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), AcidIndex)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of AcidIndex by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
Alcohol - This variable tells us about the Alcohol content.
with(wine_train, c(summary(Alcohol), SD=sd(Alcohol), Skew=skewness(Alcohol), Kurt=kurtosis(Alcohol)))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -4.70000 9.00000 10.40000 10.48924 12.40000 26.50000 653.00000
## SD Skew Kurt
## NA NA NA
hist <- ggplot(wine_train, aes(Alcohol)) + geom_histogram(fill = 'dodgerblue', binwidth = 2, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of Alcohol') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=Alcohol)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of Alcohol") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", Alcohol)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of Alcohol', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), Alcohol)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of Alcohol by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 653 rows containing non-finite values (stat_bin).
## Warning: Removed 653 rows containing non-finite values (stat_boxplot).
## Warning: Removed 653 rows containing non-finite values (stat_boxplot).
Chlorides - This variable tells us about the Chloride content of wine.
with(wine_train, c(summary(Chlorides), SD=sd(Chlorides), Skew=skewness(Chlorides), Kurt=kurtosis(Chlorides)))
## Min. 1st Qu. Median Mean 3rd Qu.
## -1.17100000 -0.03100000 0.04600000 0.05482249 0.15300000
## Max. NA's SD Skew Kurt
## 1.35100000 638.00000000 NA NA NA
hist <- ggplot(wine_train, aes(Chlorides)) + geom_histogram(fill = 'dodgerblue', binwidth = .2, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of Chlorides') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=Chlorides)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of Chlorides") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", Chlorides)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of Chlorides', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), Chlorides)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of Chlorides by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 638 rows containing non-finite values (stat_bin).
## Warning: Removed 638 rows containing non-finite values (stat_boxplot).
## Warning: Removed 638 rows containing non-finite values (stat_boxplot).
CitricAcid - This variable tells us about the Citric Acid Content of wine.
with(wine_train, c(summary(CitricAcid), SD=sd(CitricAcid), Skew=skewness(CitricAcid), Kurt=kurtosis(CitricAcid)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -3.24000000 0.03000000 0.31000000 0.30841266 0.58000000 3.86000000
## SD Skew Kurt
## 0.86207979 -0.05031294 4.83869638
hist <- ggplot(wine_train, aes(CitricAcid)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of CitricAcid') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=CitricAcid)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of CitricAcid") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", CitricAcid)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of CitricAcid', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), CitricAcid)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of CitricAcid by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
Density - This variable tells us about the Density of wine.
with(wine_train, c(summary(Density), SD=sd(Density), Skew=skewness(Density), Kurt=kurtosis(Density)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.88809000 0.98772000 0.99449000 0.99420272 1.00051500 1.09924000
## SD Skew Kurt
## 0.02653765 -0.01869596 4.90072521
hist <- ggplot(wine_train, aes(Density)) + geom_histogram(fill = 'dodgerblue', binwidth = .05, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of Density') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=Density)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of Density") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", Density)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of Density', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), Density)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of Density by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
FixedAcidity - This variable tells us about the FixedAcidity of wine.
with(wine_train, c(summary(FixedAcidity), SD=sd(FixedAcidity), Skew=skewness(FixedAcidity), Kurt=kurtosis(FixedAcidity)))
## Min. 1st Qu. Median Mean 3rd Qu.
## -18.10000000 5.20000000 6.90000000 7.07571708 9.50000000
## Max. SD Skew Kurt
## 34.40000000 6.31764346 -0.02258861 4.67572951
hist <- ggplot(wine_train, aes(FixedAcidity)) + geom_histogram(fill = 'dodgerblue', binwidth = 4, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of FixedAcidity') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=FixedAcidity)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of FixedAcidity") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", FixedAcidity)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of FixedAcidity', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), FixedAcidity)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of FixedAcidity by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
FreeSulfurDioxide - This variable tells us about the Sulfur Dioxide content of wine.
with(wine_train, c(summary(FreeSulfurDioxide), SD=sd(FreeSulfurDioxide), Skew=skewness(FreeSulfurDioxide), Kurt=kurtosis(FreeSulfurDioxide)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -555.00000 0.00000 30.00000 30.84557 70.00000 623.00000
## NA's SD Skew Kurt
## 647.00000 NA NA NA
hist <- ggplot(wine_train, aes(FreeSulfurDioxide)) + geom_histogram(fill = 'dodgerblue', binwidth = 50, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of FreeSulfurDioxide') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=FreeSulfurDioxide)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of FreeSulfurDioxide") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", FreeSulfurDioxide)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of FreeSulfurDioxide', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), FreeSulfurDioxide)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of FreeSulfurDioxide by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 647 rows containing non-finite values (stat_bin).
## Warning: Removed 647 rows containing non-finite values (stat_boxplot).
## Warning: Removed 647 rows containing non-finite values (stat_boxplot).
LabelAppeal - Marketing Score indicating the appeal of label design for consumers. High numbers suggest customers like the label design. Negative numbers suggest customers don’t like the design. Many consumers purchase based on the visual appeal of the wine label design. Higher numbers suggest better sales.
with(wine_train, c(summary(LabelAppeal), SD=sd(LabelAppeal), Skew=skewness(LabelAppeal), Kurt=kurtosis(LabelAppeal)))
## Min. 1st Qu. Median Mean 3rd Qu.
## -2.000000000 -1.000000000 0.000000000 -0.009066041 1.000000000
## Max. SD Skew Kurt
## 2.000000000 0.891089247 0.008430445 2.738136433
hist <- ggplot(wine_train, aes(LabelAppeal)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of LabelAppeal') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=LabelAppeal)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of LabelAppeal") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", LabelAppeal)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of LabelAppeal', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), LabelAppeal)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of LabelAppeal by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
ResidualSugar - This variable tells us about the ResidualSugar of wine.
with(wine_train, c(summary(ResidualSugar), SD=sd(ResidualSugar), Skew=skewness(ResidualSugar), Kurt=kurtosis(ResidualSugar)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -127.800000 -2.000000 3.900000 5.418733 15.900000 141.150000
## NA's SD Skew Kurt
## 616.000000 NA NA NA
hist <- ggplot(wine_train, aes(ResidualSugar)) + geom_histogram(fill = 'dodgerblue', binwidth = 20, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of ResidualSugar') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=ResidualSugar)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of ResidualSugar") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", ResidualSugar)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of ResidualSugar', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), ResidualSugar)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of ResidualSugar by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 616 rows containing non-finite values (stat_bin).
## Warning: Removed 616 rows containing non-finite values (stat_boxplot).
## Warning: Removed 616 rows containing non-finite values (stat_boxplot).
STARS - Wine rating by a team of experts. 4 Stars = Excellent, 1 Star = Poor. A high number of stars suggests high sales.
with(wine_train, c(summary(STARS), SD=sd(STARS), Skew=skewness(STARS), Kurt=kurtosis(STARS)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.000000 1.000000 2.000000 2.041755 3.000000 4.000000
## NA's SD Skew Kurt
## 3359.000000 NA NA NA
hist <- ggplot(wine_train, aes(STARS)) + geom_histogram(fill = 'dodgerblue', binwidth = 1, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of STARS') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=STARS)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of STARS") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", STARS)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of STARS', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), STARS)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of STARS by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 3359 rows containing non-finite values (stat_bin).
## Warning: Removed 3359 rows containing non-finite values (stat_boxplot).
## Warning: Removed 3359 rows containing non-finite values (stat_boxplot).
Sulphates - This variable tells us about the Sulphates content of wine.
with(wine_train, c(summary(Sulphates), SD=sd(Sulphates), Skew=skewness(Sulphates), Kurt=kurtosis(Sulphates)))
## Min. 1st Qu. Median Mean 3rd Qu.
## -3.1300000 0.2800000 0.5000000 0.5271118 0.8600000
## Max. NA's SD Skew Kurt
## 4.2400000 1210.0000000 NA NA NA
hist <- ggplot(wine_train, aes(Sulphates)) + geom_histogram(fill = 'dodgerblue', binwidth = .5, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of Sulphates') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=Sulphates)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of Sulphates") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", Sulphates)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of Sulphates', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), Sulphates)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of Sulphates by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 1210 rows containing non-finite values (stat_bin).
## Warning: Removed 1210 rows containing non-finite values (stat_boxplot).
## Warning: Removed 1210 rows containing non-finite values (stat_boxplot).
TotalSulfurDioxide - This variable tells us about the Total Sulfur Dioxide of Wine.
with(wine_train, c(summary(TotalSulfurDioxide), SD=sd(TotalSulfurDioxide), Skew=skewness(TotalSulfurDioxide), Kurt=kurtosis(TotalSulfurDioxide)))
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## -823.0000 27.0000 123.0000 120.7142 208.0000 1057.0000 682.0000
## SD Skew Kurt
## NA NA NA
hist <- ggplot(wine_train, aes(TotalSulfurDioxide)) + geom_histogram(fill = 'dodgerblue', binwidth = 200, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of TotalSulfurDioxide') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=TotalSulfurDioxide)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of TotalSulfurDioxide") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", TotalSulfurDioxide)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of TotalSulfurDioxide', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), TotalSulfurDioxide)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of TotalSulfurDioxide by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 682 rows containing non-finite values (stat_bin).
## Warning: Removed 682 rows containing non-finite values (stat_boxplot).
## Warning: Removed 682 rows containing non-finite values (stat_boxplot).
VolatileAcidity - This variable tells us about the VolatileAcidity content of Wine.
with(wine_train, c(summary(VolatileAcidity), SD=sd(VolatileAcidity), Skew=skewness(VolatileAcidity), Kurt=kurtosis(VolatileAcidity)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## -2.79000000 0.13000000 0.28000000 0.32410395 0.64000000 3.68000000
## SD Skew Kurt
## 0.78401424 0.02038235 4.83296606
hist <- ggplot(wine_train, aes(VolatileAcidity)) + geom_histogram(fill = 'dodgerblue', binwidth = .5, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of VolatileAcidity') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=VolatileAcidity)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of VolatileAcidity") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", VolatileAcidity)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of VolatileAcidity', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), VolatileAcidity)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of VolatileAcidity by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
pH - This variable tells us about the pH of Wine.
with(wine_train, c(summary(pH), SD=sd(pH), Skew=skewness(pH), Kurt=kurtosis(pH)))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.480000 2.960000 3.200000 3.207628 3.470000 6.130000
## NA's SD Skew Kurt
## 395.000000 NA NA NA
hist <- ggplot(wine_train, aes(pH)) + geom_histogram(fill = 'dodgerblue', binwidth = .5, color = 'darkgray' ) +
theme_classic() + labs(title = 'Histogram of pH') + theme(plot.title = element_text(hjust = 0.5))
qq_plot <- ggplot(wine_train, aes(sample=pH)) + stat_qq_point(color='dodgerblue') + stat_qq_line(color='darkgray') +
labs(x="Thoretical Quantiles", y="Sample Quantiles", title = "QQ Plot of pH") + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
box_plot <- ggplot(wine_train, aes(x="", pH)) + geom_boxplot(fill='dodgerblue', color='darkgray')+ theme_classic() +
labs(title = 'Boxplot of pH', x="") + theme(plot.title = element_text(hjust = 0.5)) + coord_flip()
box_TARGET <- ggplot(wine_train, aes(x=factor(TARGET), pH)) + geom_boxplot(fill='dodgerblue', color='darkgrey') +
labs(x='TARGET', title = 'Boxplot of pH by TARGET') + theme_classic() +
theme(plot.title = element_text(hjust = 0.5))
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 395 rows containing non-finite values (stat_bin).
## Warning: Removed 395 rows containing non-finite values (stat_boxplot).
## Warning: Removed 395 rows containing non-finite values (stat_boxplot).
Now lets see NA’s for all the variables other than STARS.STARS has NAs that is more than 10%.
Non_NAs <- sapply(wine_train, function(y) sum(length(which(!is.na(y)))))
NAs <- sapply(wine_train, function(y) sum(length(which(is.na(y)))))
NA_Percent <- NAs / (NAs + Non_NAs)
NA_SUMMARY <- data.frame(Non_NAs,NAs,NA_Percent)
missmap(wine_train, main = "Missing Values")
kable(NA_SUMMARY)
| Non_NAs | NAs | NA_Percent | |
|---|---|---|---|
| TARGET | 12795 | 0 | 0.0000000 |
| FixedAcidity | 12795 | 0 | 0.0000000 |
| VolatileAcidity | 12795 | 0 | 0.0000000 |
| CitricAcid | 12795 | 0 | 0.0000000 |
| ResidualSugar | 12179 | 616 | 0.0481438 |
| Chlorides | 12157 | 638 | 0.0498632 |
| FreeSulfurDioxide | 12148 | 647 | 0.0505666 |
| TotalSulfurDioxide | 12113 | 682 | 0.0533021 |
| Density | 12795 | 0 | 0.0000000 |
| pH | 12400 | 395 | 0.0308714 |
| Sulphates | 11585 | 1210 | 0.0945682 |
| Alcohol | 12142 | 653 | 0.0510356 |
| LabelAppeal | 12795 | 0 | 0.0000000 |
| AcidIndex | 12795 | 0 | 0.0000000 |
| STARS | 9436 | 3359 | 0.2625244 |
Finding correlations: The correlation plot below shows how variables in the dataset are related to each other. Looking at the plot, we dont see much correlations.
names(wine_train)
## [1] "TARGET" "FixedAcidity" "VolatileAcidity"
## [4] "CitricAcid" "ResidualSugar" "Chlorides"
## [7] "FreeSulfurDioxide" "TotalSulfurDioxide" "Density"
## [10] "pH" "Sulphates" "Alcohol"
## [13] "LabelAppeal" "AcidIndex" "STARS"
cor(drop_na(wine_train))
## TARGET FixedAcidity VolatileAcidity
## TARGET 1.0000000000 -0.012538100 -0.0759978765
## FixedAcidity -0.0125380998 1.000000000 0.0190109733
## VolatileAcidity -0.0759978765 0.019010973 1.0000000000
## CitricAcid 0.0023450490 0.014000376 -0.0234315631
## ResidualSugar 0.0035195999 -0.015429391 0.0015279517
## Chlorides -0.0304301331 -0.006104447 0.0148489225
## FreeSulfurDioxide 0.0226398054 0.015438463 -0.0114408079
## TotalSulfurDioxide 0.0216020726 -0.023323485 -0.0007434083
## Density -0.0475989086 0.011574241 0.0130977690
## pH 0.0002198557 -0.004553886 0.0072030364
## Sulphates -0.0212203783 0.042229181 0.0015161001
## Alcohol 0.0737771084 -0.013085026 0.0002603082
## LabelAppeal 0.4979464796 0.011375965 -0.0202419713
## AcidIndex -0.1676430648 0.154167846 0.0250529742
## STARS 0.5546857223 -0.004937345 -0.0402432388
## CitricAcid ResidualSugar Chlorides
## TARGET 0.0023450490 0.003519600 -0.0304301331
## FixedAcidity 0.0140003760 -0.015429391 -0.0061044471
## VolatileAcidity -0.0234315631 0.001527952 0.0148489225
## CitricAcid 1.0000000000 -0.009843146 -0.0335608661
## ResidualSugar -0.0098431456 1.000000000 0.0041215692
## Chlorides -0.0335608661 0.004121569 1.0000000000
## FreeSulfurDioxide 0.0121132485 0.021959113 -0.0204924876
## TotalSulfurDioxide -0.0099174506 0.017030939 0.0004188605
## Density -0.0169919691 -0.007120841 0.0206724860
## pH -0.0007581304 0.017563769 -0.0179702278
## Sulphates -0.0144237270 -0.002705775 0.0026187777
## Alcohol 0.0169864284 -0.018943324 -0.0228849573
## LabelAppeal 0.0153315666 -0.004579308 -0.0063870237
## AcidIndex 0.0545838104 -0.020301890 -0.0017134096
## STARS 0.0071401699 0.019665541 -0.0063242568
## FreeSulfurDioxide TotalSulfurDioxide Density
## TARGET 0.022639805 0.0216020726 -0.047598909
## FixedAcidity 0.015438463 -0.0233234848 0.011574241
## VolatileAcidity -0.011440808 -0.0007434083 0.013097769
## CitricAcid 0.012113248 -0.0099174506 -0.016991969
## ResidualSugar 0.021959113 0.0170309394 -0.007120841
## Chlorides -0.020492488 0.0004188605 0.020672486
## FreeSulfurDioxide 1.000000000 0.0134616726 -0.008663509
## TotalSulfurDioxide 0.013461673 1.0000000000 0.023167955
## Density -0.008663509 0.0231679548 1.000000000
## pH -0.002008516 -0.0034227601 -0.002019229
## Sulphates 0.026829029 0.0025040509 -0.010609294
## Alcohol -0.023867458 -0.0168515467 -0.006128355
## LabelAppeal 0.014960087 -0.0027237419 -0.018094403
## AcidIndex -0.014733717 -0.0221292631 0.047778830
## STARS -0.015390398 0.0220949002 -0.028492455
## pH Sulphates Alcohol LabelAppeal
## TARGET 0.0002198557 -0.021220378 0.0737771084 0.4979464796
## FixedAcidity -0.0045538857 0.042229181 -0.0130850260 0.0113759650
## VolatileAcidity 0.0072030364 0.001516100 0.0002603082 -0.0202419713
## CitricAcid -0.0007581304 -0.014423727 0.0169864284 0.0153315666
## ResidualSugar 0.0175637691 -0.002705775 -0.0189433242 -0.0045793083
## Chlorides -0.0179702278 0.002618778 -0.0228849573 -0.0063870237
## FreeSulfurDioxide -0.0020085157 0.026829029 -0.0238674577 0.0149600871
## TotalSulfurDioxide -0.0034227601 0.002504051 -0.0168515467 -0.0027237419
## Density -0.0020192285 -0.010609294 -0.0061283546 -0.0180944026
## pH 1.0000000000 0.010449255 -0.0122034469 0.0002181758
## Sulphates 0.0104492547 1.000000000 0.0108443299 0.0037686996
## Alcohol -0.0122034469 0.010844330 1.0000000000 -0.0006449123
## LabelAppeal 0.0002181758 0.003768700 -0.0006449123 1.0000000000
## AcidIndex -0.0537128921 0.031071782 -0.0558919056 0.0103009840
## STARS -0.0044002985 -0.023135130 0.0648544864 0.3188970216
## AcidIndex STARS
## TARGET -0.16764306 0.554685722
## FixedAcidity 0.15416785 -0.004937345
## VolatileAcidity 0.02505297 -0.040243239
## CitricAcid 0.05458381 0.007140170
## ResidualSugar -0.02030189 0.019665541
## Chlorides -0.00171341 -0.006324257
## FreeSulfurDioxide -0.01473372 -0.015390398
## TotalSulfurDioxide -0.02212926 0.022094900
## Density 0.04777883 -0.028492455
## pH -0.05371289 -0.004400299
## Sulphates 0.03107178 -0.023135130
## Alcohol -0.05589191 0.064854486
## LabelAppeal 0.01030098 0.318897022
## AcidIndex 1.00000000 -0.095482582
## STARS -0.09548258 1.000000000
pairs.panels(wine_train[1:15])
Now we will see the TARGET Variable.
TARGET - Number of Cases Purchased
options(width=100)
round(with(wine_train, c(summary(TARGET), StdD=sd(TARGET), Skew=skewness(TARGET), Kurt=kurtosis(TARGET))),2)
## Min. 1st Qu. Median Mean 3rd Qu. Max. StdD Skew Kurt
## 0.00 2.00 3.00 3.03 4.00 8.00 1.93 -0.33 2.12
Lets first split the data into training and test.
set.seed(999)
sampl = sample.split(wine_train$TARGET, SplitRatio = .80)
wine_train1 <- subset(wine_train, sampl == TRUE)
wine_test1 <- subset(wine_train, sampl == FALSE)
We will now use the mice package to impute missing values.
wine_train2 <- mice(wine_train1, m=1, maxit = 5, seed = 42)
##
## iter imp variable
## 1 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 2 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 3 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 4 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 5 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
wine_train2 <- complete(wine_train2)
wine_train2 <- as.data.frame(wine_train2)
wine_test2 <- test <- mice(wine_test1, m=1, maxit = 5, seed = 42)
##
## iter imp variable
## 1 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 2 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 3 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 4 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 5 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
wine_test2 <- complete(wine_test2)
wine_test2 <- as.data.frame(wine_test2)
There is very low correlation between AcidIndex and TARGET, lets do log transformation on AcidIndex.
wine_train2$AcidIndex <- log(wine_train2$AcidIndex)
wine_test2$AcidIndex <- log(wine_test2$AcidIndex)
model1 = glm(TARGET ~ ., data=wine_train1, family=poisson)
summary(model1)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = wine_train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2128 -0.2757 0.0647 0.3766 1.6981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.608e+00 2.796e-01 5.750 8.90e-09 ***
## FixedAcidity 6.705e-04 1.177e-03 0.570 0.56901
## VolatileAcidity -2.750e-02 9.283e-03 -2.963 0.00305 **
## CitricAcid -3.835e-03 8.519e-03 -0.450 0.65259
## ResidualSugar 1.828e-05 2.152e-04 0.085 0.93232
## Chlorides -3.764e-02 2.314e-02 -1.627 0.10377
## FreeSulfurDioxide 5.671e-05 4.892e-05 1.159 0.24630
## TotalSulfurDioxide 2.230e-05 3.177e-05 0.702 0.48274
## Density -4.025e-01 2.749e-01 -1.464 0.14326
## pH 2.307e-04 1.085e-02 0.021 0.98303
## Sulphates -5.984e-03 7.973e-03 -0.751 0.45293
## Alcohol 3.262e-03 2.004e-03 1.628 0.10360
## LabelAppeal 1.730e-01 8.858e-03 19.530 < 2e-16 ***
## AcidIndex -4.967e-02 6.666e-03 -7.451 9.28e-14 ***
## STARS 1.929e-01 8.328e-03 23.160 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4720.5 on 5143 degrees of freedom
## Residual deviance: 3242.8 on 5129 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18545
##
## Number of Fisher Scoring iterations: 5
grid.arrange(hist, qq_plot, box_plot, box_TARGET, ncol=2)
## Warning: Removed 395 rows containing non-finite values (stat_bin).
## Warning: Removed 395 rows containing non-finite values (stat_boxplot).
## Warning: Removed 395 rows containing non-finite values (stat_boxplot).
model2 = glm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol, data=wine_train1, family=poisson)
summary(model2)
##
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density -
## pH - Sulphates - Alcohol, family = poisson, data = wine_train1)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1898 -0.2777 0.0622 0.3764 1.6086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.251442 0.054724 22.868 < 2e-16 ***
## VolatileAcidity -0.027581 0.009278 -2.973 0.00295 **
## LabelAppeal 0.173177 0.008853 19.562 < 2e-16 ***
## AcidIndex -0.050616 0.006553 -7.724 1.13e-14 ***
## STARS 0.194208 0.008292 23.421 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 4720.5 on 5143 degrees of freedom
## Residual deviance: 3253.1 on 5139 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18535
##
## Number of Fisher Scoring iterations: 5
plot(model2)
model3 = glm(TARGET ~ ., data=wine_train2, family=poisson)
summary(model3)
##
## Call:
## glm(formula = TARGET ~ ., family = poisson, data = wine_train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1516 -0.6809 0.1304 0.6390 2.4033
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.382e+00 2.277e-01 10.463 < 2e-16 ***
## FixedAcidity -1.332e-04 9.197e-04 -0.145 0.88487
## VolatileAcidity -4.351e-02 7.275e-03 -5.982 2.21e-09 ***
## CitricAcid 8.883e-03 6.576e-03 1.351 0.17679
## ResidualSugar 1.508e-04 1.675e-04 0.900 0.36797
## Chlorides -6.506e-02 1.791e-02 -3.633 0.00028 ***
## FreeSulfurDioxide 1.143e-04 3.804e-05 3.005 0.00266 **
## TotalSulfurDioxide 8.709e-05 2.446e-05 3.560 0.00037 ***
## Density -4.047e-01 2.141e-01 -1.890 0.05876 .
## pH -1.788e-02 8.407e-03 -2.126 0.03347 *
## Sulphates -1.327e-02 6.163e-03 -2.153 0.03129 *
## Alcohol 2.690e-03 1.546e-03 1.740 0.08187 .
## LabelAppeal 1.432e-01 6.783e-03 21.107 < 2e-16 ***
## AcidIndex -7.622e-01 4.005e-02 -19.029 < 2e-16 ***
## STARS 3.401e-01 6.252e-03 54.395 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18291 on 10236 degrees of freedom
## Residual deviance: 12830 on 10222 degrees of freedom
## AIC: 38418
##
## Number of Fisher Scoring iterations: 5
plot(model3)
model4 = glm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Density-Alcohol, data=wine_train2, family=poisson)
summary(model4)
##
## Call:
## glm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Density - Alcohol, family = poisson, data = wine_train2)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1405 -0.6852 0.1288 0.6412 2.4039
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.019e+00 8.848e-02 22.820 < 2e-16 ***
## VolatileAcidity -4.388e-02 7.273e-03 -6.033 1.61e-09 ***
## Chlorides -6.711e-02 1.790e-02 -3.750 0.000177 ***
## FreeSulfurDioxide 1.119e-04 3.802e-05 2.943 0.003256 **
## TotalSulfurDioxide 8.560e-05 2.442e-05 3.505 0.000457 ***
## pH -1.818e-02 8.404e-03 -2.164 0.030488 *
## Sulphates -1.327e-02 6.157e-03 -2.155 0.031143 *
## LabelAppeal 1.433e-01 6.783e-03 21.120 < 2e-16 ***
## AcidIndex -7.665e-01 3.941e-02 -19.448 < 2e-16 ***
## STARS 3.410e-01 6.237e-03 54.673 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for poisson family taken to be 1)
##
## Null deviance: 18291 on 10236 degrees of freedom
## Residual deviance: 12839 on 10227 degrees of freedom
## AIC: 38417
##
## Number of Fisher Scoring iterations: 5
plot(model4)
model5 <- glm.nb(TARGET ~ ., data = wine_train1)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
summary(model5)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = wine_train1, init.theta = 138898.9107,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.2127 -0.2757 0.0647 0.3766 1.6981
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.608e+00 2.796e-01 5.750 8.91e-09 ***
## FixedAcidity 6.705e-04 1.177e-03 0.570 0.56900
## VolatileAcidity -2.750e-02 9.283e-03 -2.963 0.00305 **
## CitricAcid -3.835e-03 8.519e-03 -0.450 0.65259
## ResidualSugar 1.828e-05 2.152e-04 0.085 0.93231
## Chlorides -3.764e-02 2.314e-02 -1.627 0.10378
## FreeSulfurDioxide 5.671e-05 4.892e-05 1.159 0.24630
## TotalSulfurDioxide 2.230e-05 3.177e-05 0.702 0.48275
## Density -4.025e-01 2.750e-01 -1.464 0.14326
## pH 2.307e-04 1.085e-02 0.021 0.98303
## Sulphates -5.984e-03 7.973e-03 -0.751 0.45293
## Alcohol 3.262e-03 2.004e-03 1.628 0.10360
## LabelAppeal 1.730e-01 8.858e-03 19.529 < 2e-16 ***
## AcidIndex -4.967e-02 6.666e-03 -7.451 9.28e-14 ***
## STARS 1.929e-01 8.328e-03 23.160 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(138898.9) family taken to be 1)
##
## Null deviance: 4720.4 on 5143 degrees of freedom
## Residual deviance: 3242.7 on 5129 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18547
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 138899
## Std. Err.: 259921
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18515.07
plot(model5)
model6 <- glm.nb(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Chlorides-FreeSulfurDioxide-TotalSulfurDioxide-Density-pH-Sulphates-Alcohol, data = wine_train1)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
summary(model6)
##
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Chlorides - FreeSulfurDioxide - TotalSulfurDioxide - Density -
## pH - Sulphates - Alcohol, data = wine_train1, init.theta = 138402.5261,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1898 -0.2777 0.0622 0.3764 1.6086
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1.251443 0.054725 22.868 < 2e-16 ***
## VolatileAcidity -0.027581 0.009279 -2.973 0.00295 **
## LabelAppeal 0.173177 0.008853 19.562 < 2e-16 ***
## AcidIndex -0.050616 0.006553 -7.724 1.13e-14 ***
## STARS 0.194209 0.008292 23.421 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(138402.5) family taken to be 1)
##
## Null deviance: 4720.4 on 5143 degrees of freedom
## Residual deviance: 3253.0 on 5139 degrees of freedom
## (5093 observations deleted due to missingness)
## AIC: 18537
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 138403
## Std. Err.: 258834
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -18525.37
plot(model6)
model7 <- glm.nb(TARGET ~ ., data = wine_train2)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
summary(model7)
##
## Call:
## glm.nb(formula = TARGET ~ ., data = wine_train2, init.theta = 48897.24324,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1515 -0.6808 0.1304 0.6390 2.4032
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.382e+00 2.277e-01 10.463 < 2e-16 ***
## FixedAcidity -1.332e-04 9.197e-04 -0.145 0.884879
## VolatileAcidity -4.351e-02 7.275e-03 -5.981 2.21e-09 ***
## CitricAcid 8.883e-03 6.577e-03 1.351 0.176804
## ResidualSugar 1.508e-04 1.675e-04 0.900 0.367960
## Chlorides -6.506e-02 1.791e-02 -3.633 0.000280 ***
## FreeSulfurDioxide 1.143e-04 3.804e-05 3.005 0.002657 **
## TotalSulfurDioxide 8.709e-05 2.446e-05 3.560 0.000371 ***
## Density -4.047e-01 2.141e-01 -1.890 0.058762 .
## pH -1.788e-02 8.407e-03 -2.126 0.033466 *
## Sulphates -1.327e-02 6.164e-03 -2.153 0.031286 *
## Alcohol 2.690e-03 1.546e-03 1.740 0.081887 .
## LabelAppeal 1.432e-01 6.783e-03 21.106 < 2e-16 ***
## AcidIndex -7.622e-01 4.005e-02 -19.029 < 2e-16 ***
## STARS 3.401e-01 6.252e-03 54.393 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(48897.24) family taken to be 1)
##
## Null deviance: 18290 on 10236 degrees of freedom
## Residual deviance: 12830 on 10222 degrees of freedom
## AIC: 38420
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 48897
## Std. Err.: 63448
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -38388.3
plot(model7)
model8 <- glm.nb(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar-Density-Alcohol, data = wine_train2)
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
## Warning in theta.ml(Y, mu, sum(w), w, limit = control$maxit, trace = control$trace > : iteration
## limit reached
summary(model8)
##
## Call:
## glm.nb(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar -
## Density - Alcohol, data = wine_train2, init.theta = 48805.90033,
## link = log)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.1405 -0.6852 0.1288 0.6412 2.4038
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 2.019e+00 8.849e-02 22.820 < 2e-16 ***
## VolatileAcidity -4.388e-02 7.273e-03 -6.033 1.61e-09 ***
## Chlorides -6.711e-02 1.790e-02 -3.750 0.000177 ***
## FreeSulfurDioxide 1.119e-04 3.802e-05 2.942 0.003257 **
## TotalSulfurDioxide 8.561e-05 2.443e-05 3.505 0.000457 ***
## pH -1.818e-02 8.404e-03 -2.164 0.030489 *
## Sulphates -1.327e-02 6.157e-03 -2.155 0.031144 *
## LabelAppeal 1.433e-01 6.783e-03 21.119 < 2e-16 ***
## AcidIndex -7.665e-01 3.941e-02 -19.447 < 2e-16 ***
## STARS 3.410e-01 6.237e-03 54.671 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for Negative Binomial(48805.9) family taken to be 1)
##
## Null deviance: 18290 on 10236 degrees of freedom
## Residual deviance: 12839 on 10227 degrees of freedom
## AIC: 38420
##
## Number of Fisher Scoring iterations: 1
##
##
## Theta: 48806
## Std. Err.: 63368
## Warning while fitting theta: iteration limit reached
##
## 2 x log-likelihood: -38397.65
plot(model8)
model9 <- lm(TARGET ~ ., data = wine_train2)
summary(model9)
##
## Call:
## lm(formula = TARGET ~ ., data = wine_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.6944 -1.0191 0.1692 1.0335 4.2502
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.149e+00 5.564e-01 11.052 < 2e-16 ***
## FixedAcidity -1.428e-04 2.255e-03 -0.063 0.94952
## VolatileAcidity -1.265e-01 1.792e-02 -7.056 1.82e-12 ***
## CitricAcid 2.771e-02 1.630e-02 1.699 0.08927 .
## ResidualSugar 4.479e-04 4.138e-04 1.083 0.27904
## Chlorides -1.956e-01 4.398e-02 -4.448 8.77e-06 ***
## FreeSulfurDioxide 2.930e-04 9.398e-05 3.117 0.00183 **
## TotalSulfurDioxide 2.365e-04 6.006e-05 3.938 8.28e-05 ***
## Density -1.099e+00 5.263e-01 -2.088 0.03678 *
## pH -4.064e-02 2.071e-02 -1.962 0.04978 *
## Sulphates -3.621e-02 1.519e-02 -2.384 0.01713 *
## Alcohol 1.131e-02 3.782e-03 2.991 0.00279 **
## LabelAppeal 4.379e-01 1.644e-02 26.633 < 2e-16 ***
## AcidIndex -2.041e+00 9.250e-02 -22.067 < 2e-16 ***
## STARS 1.162e+00 1.665e-02 69.754 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.417 on 10222 degrees of freedom
## Multiple R-squared: 0.4598, Adjusted R-squared: 0.4591
## F-statistic: 621.5 on 14 and 10222 DF, p-value: < 2.2e-16
plot(model9)
model10 <- lm(TARGET ~ .-FixedAcidity-CitricAcid-ResidualSugar, data = wine_train2)
summary(model10)
##
## Call:
## lm(formula = TARGET ~ . - FixedAcidity - CitricAcid - ResidualSugar,
## data = wine_train2)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.7075 -1.0195 0.1718 1.0343 4.2907
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.139e+00 5.563e-01 11.036 < 2e-16 ***
## VolatileAcidity -1.273e-01 1.792e-02 -7.104 1.30e-12 ***
## Chlorides -1.970e-01 4.397e-02 -4.479 7.57e-06 ***
## FreeSulfurDioxide 2.939e-04 9.397e-05 3.128 0.00177 **
## TotalSulfurDioxide 2.389e-04 6.003e-05 3.980 6.94e-05 ***
## Density -1.101e+00 5.263e-01 -2.093 0.03638 *
## pH -4.059e-02 2.071e-02 -1.960 0.05008 .
## Sulphates -3.699e-02 1.517e-02 -2.437 0.01481 *
## Alcohol 1.136e-02 3.781e-03 3.005 0.00266 **
## LabelAppeal 4.379e-01 1.644e-02 26.632 < 2e-16 ***
## AcidIndex -2.031e+00 9.085e-02 -22.351 < 2e-16 ***
## STARS 1.162e+00 1.665e-02 69.794 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.417 on 10225 degrees of freedom
## Multiple R-squared: 0.4596, Adjusted R-squared: 0.459
## F-statistic: 790.6 on 11 and 10225 DF, p-value: < 2.2e-16
plot(model10)
Now lets see the output of the Models using test data:
We will use the squared loss to validate the model.
modelValidation <- function(mod, test){
preds = predict(mod, test)
diffMat = as.numeric(preds) - as.numeric(test$TARGET)
diffMat = diffMat^2
loss <- mean(diffMat)
return(loss)
}
Poisson model with imputations.
modelValidation(model3, wine_test2)
## [1] 6.852209
Poisson model with imputations and only significant variables.
modelValidation(model4, wine_test2)
## [1] 6.854547
Negative Binomial with imputations:.
modelValidation(model7, wine_test2)
## [1] 6.852205
Negative Binomial with imputations and only significant variables.
modelValidation(model8, wine_test2)
## [1] 6.854543
Linear Model with imputations.
modelValidation(model9, wine_test2)
## [1] 2.029061
Linear Model with imputations and only significant variables.
modelValidation(model10, wine_test2)
## [1] 2.030002
From the above models, i would like to go with Model10 - Linear Model with imputations and only significant variables as it uses less variables and is parsimonious. Also the R2 looks fine. The squared loss is also fine.
We will use the same method to impute and use log transformation for AcidIndex.
wine_eval <- read.csv("https://raw.githubusercontent.com/Riteshlohiya/Data621-Assignment-5/master/wine_evaluation_data.csv", stringsAsFactors = FALSE)
do_factors <- function(wine_instance){
wine_instance <- within(wine_instance, {
LabelAppeal <- factor(LabelAppeal)
AcidIndex <- factor(AcidIndex)
STARS <- factor(STARS)
})
return (wine_instance)
}
summary(wine_eval)
## IN TARGET FixedAcidity VolatileAcidity CitricAcid
## Min. : 3 Mode:logical Min. :-18.200 Min. :-2.8300 Min. :-3.1200
## 1st Qu.: 4018 NA's:3335 1st Qu.: 5.200 1st Qu.: 0.0800 1st Qu.: 0.0000
## Median : 7906 Median : 6.900 Median : 0.2800 Median : 0.3100
## Mean : 8048 Mean : 6.864 Mean : 0.3103 Mean : 0.3124
## 3rd Qu.:12061 3rd Qu.: 9.000 3rd Qu.: 0.6300 3rd Qu.: 0.6050
## Max. :16130 Max. : 33.500 Max. : 3.6100 Max. : 3.7600
##
## ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide Density
## Min. :-128.300 Min. :-1.15000 Min. :-563.00 Min. :-769.00 Min. :0.8898
## 1st Qu.: -2.600 1st Qu.: 0.01600 1st Qu.: 3.00 1st Qu.: 27.25 1st Qu.:0.9883
## Median : 3.600 Median : 0.04700 Median : 30.00 Median : 124.00 Median :0.9946
## Mean : 5.319 Mean : 0.06143 Mean : 34.95 Mean : 123.41 Mean :0.9947
## 3rd Qu.: 17.200 3rd Qu.: 0.17100 3rd Qu.: 79.25 3rd Qu.: 210.00 3rd Qu.:1.0005
## Max. : 145.400 Max. : 1.26300 Max. : 617.00 Max. :1004.00 Max. :1.0998
## NA's :168 NA's :138 NA's :152 NA's :157
## pH Sulphates Alcohol LabelAppeal AcidIndex
## Min. :0.600 Min. :-3.0700 Min. :-4.20 Min. :-2.00000 Min. : 5.000
## 1st Qu.:2.980 1st Qu.: 0.3300 1st Qu.: 9.00 1st Qu.:-1.00000 1st Qu.: 7.000
## Median :3.210 Median : 0.5000 Median :10.40 Median : 0.00000 Median : 8.000
## Mean :3.237 Mean : 0.5346 Mean :10.58 Mean : 0.01349 Mean : 7.748
## 3rd Qu.:3.490 3rd Qu.: 0.8200 3rd Qu.:12.50 3rd Qu.: 1.00000 3rd Qu.: 8.000
## Max. :6.210 Max. : 4.1800 Max. :25.60 Max. : 2.00000 Max. :17.000
## NA's :104 NA's :310 NA's :185
## STARS
## Min. :1.00
## 1st Qu.:1.00
## Median :2.00
## Mean :2.04
## 3rd Qu.:3.00
## Max. :4.00
## NA's :841
wine_eval <- mice(wine_eval, m=1, maxit = 5, seed = 42)
##
## iter imp variable
## 1 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 2 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 3 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 4 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## 5 1 ResidualSugar Chlorides FreeSulfurDioxide TotalSulfurDioxide pH Sulphates Alcohol STARS
## Warning: Number of logged events: 1
wine_eval <- complete(wine_eval)
wine_eval <- as.data.frame(wine_eval)
wine_eval$AcidIndex <- log(wine_eval$AcidIndex)
wine_eval$TARGET1 <- predict(model10, newdata=wine_eval)
write.csv(wine_eval,"Evaluation_Full_Data.csv", row.names=FALSE)