# 1. Packages ---- Phase 1 -------
library(knitr)
library(mlr)
## Loading required package: ParamHelpers
library(GGally)
## Loading required package: ggplot2
library(cowplot)
##
## Attaching package: 'cowplot'
## The following object is masked from 'package:ggplot2':
##
## ggsave
library(car)
## Loading required package: carData
library(readr)
library(corrr)
## Loading required package: dplyr
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:car':
##
## recode
## The following object is masked from 'package:GGally':
##
## nasa
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(tidyverse)
## -- Attaching packages ---------------------------------- tidyverse 1.2.1 --
## v tibble 1.4.2 v purrr 0.2.4
## v tidyr 0.8.1 v stringr 1.3.1
## v tibble 1.4.2 v forcats 0.3.0
## -- Conflicts ------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x cowplot::ggsave() masks ggplot2::ggsave()
## x dplyr::lag() masks stats::lag()
## x dplyr::recode() masks car::recode()
## x purrr::some() masks car::some()
library(aplpack)
## Loading required package: tcltk
library(cat)
library(stringr)
# Show the source code of every chunk in the knitted report.
knitr::opts_chunk$set(echo = TRUE)
This dataset is available from the UCI Machine Learning Repository.
Definition of physicochemical: of or pertaining to both physical and chemical properties, changes, and reactions; of or according to physical chemistry.
Content (the name of each column): for more information, read [Cortez et al., 2009]. Input variables (based on physicochemical tests): 1 - fixed acidity; 2 - volatile acidity; 3 - citric acid; 4 - residual sugar; 5 - chlorides; 6 - free sulfur dioxide; 7 - total sulfur dioxide; 8 - density; 9 - pH; 10 - sulphates; 11 - alcohol. Output variable (based on sensory data): 12 - quality (score between 0 and 10).
#KEEP
# Load the wine measurements from disk, then print summary statistics for
# every column.
Wine <- utils::read.csv(file = "Wine.csv")
summary(object = Wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 4.60 Min. :0.1200 Min. :0.000 Min. : 0.900
## 1st Qu.: 7.10 1st Qu.:0.3900 1st Qu.:0.090 1st Qu.: 1.900
## Median : 7.90 Median :0.5200 Median :0.260 Median : 2.200
## Mean : 8.32 Mean :0.5278 Mean :0.271 Mean : 2.539
## 3rd Qu.: 9.20 3rd Qu.:0.6400 3rd Qu.:0.420 3rd Qu.: 2.600
## Max. :15.90 Max. :1.5800 Max. :1.000 Max. :15.500
## chlorides free.sulfur.dioxide total.sulfur.dioxide
## Min. :0.01200 Min. : 1.00 Min. : 6.00
## 1st Qu.:0.07000 1st Qu.: 7.00 1st Qu.: 22.00
## Median :0.07900 Median :14.00 Median : 38.00
## Mean :0.08747 Mean :15.87 Mean : 46.47
## 3rd Qu.:0.09000 3rd Qu.:21.00 3rd Qu.: 62.00
## Max. :0.61100 Max. :72.00 Max. :289.00
## density pH sulphates alcohol
## Min. :0.9901 Min. :2.740 Min. :0.3300 Min. : 8.40
## 1st Qu.:0.9956 1st Qu.:3.210 1st Qu.:0.5500 1st Qu.: 9.50
## Median :0.9968 Median :3.310 Median :0.6200 Median :10.20
## Mean :0.9967 Mean :3.311 Mean :0.6581 Mean :10.42
## 3rd Qu.:0.9978 3rd Qu.:3.400 3rd Qu.:0.7300 3rd Qu.:11.10
## Max. :1.0037 Max. :4.010 Max. :2.0000 Max. :14.90
## quality
## Min. :3.000
## 1st Qu.:5.000
## Median :6.000
## Mean :5.636
## 3rd Qu.:6.000
## Max. :8.000
# NOTE: heat.colors max is (12) & col=rainbow(20) & colors() -> is all 657 colours
# Exploratory overview: summary panels, short column aliases, and horizontal
# boxplots of the columns (with and without the two sulfur-dioxide columns).
# NOTE(review): attach() is kept only because later code refers to bare column
# names (e.g. residual.sugar). Prefer explicit Wine$<column> in new code.
attach(Wine)
plotsummary(Wine, design = "chessboard") # Interestingly this shows possible outliers
## NULL
# and approximated boxplots

## Short aliases for the columns used in the following plots
quality <- Wine$quality
alcohol <- Wine$alcohol
fixed <- Wine$fixed.acidity
total <- Wine$total.sulfur.dioxide
free <- Wine$free.sulfur.dioxide
volatile <- Wine$volatile.acidity

# Widen the left margin so the column names fit beside the horizontal boxes.
par(mar = c(5, 10, 4, 2) + .1)
boxplot(Wine, horizontal = TRUE, las = 1, outline = FALSE,
        col = heat.colors(12), cex.axis = 0.8)

## With both (Total and Free) Dioxide's removed
# Drop the two sulfur-dioxide columns by name and keep every row, so the subset
# does not depend on a fixed row count or on the column order of the file.
RemoveDioxide <- Wine[
  ,
  !(names(Wine) %in% c("free.sulfur.dioxide", "total.sulfur.dioxide")),
  drop = FALSE
]
par(mar = c(5, 10, 4, 2) + .1)
boxplot(RemoveDioxide, horizontal = TRUE, las = 1, outline = FALSE,
        col = heat.colors(12), cex.axis = 0.8)
## The three I think are worth looking closer at with relation to Quality
#par(mar=c(5,4,4,2)+.1) #Reset Margins to default
#Grouped quality, fixed acidity , and alcohol
# Grouped quality, fixed acidity, alcohol and residual sugar.
# Attach Wine only if it is not already on the search path: attaching it a
# second time stacks a duplicate copy and triggers masking messages.
# residual.sugar below is resolved through this attached copy.
if (!("Wine" %in% search())) {
  attach(Wine)
}
par(mar = c(5, 10, 4, 2) + .1)
# One colour per box: four groups are drawn, so request four colours.
boxplot(quality, fixed, alcohol, residual.sugar,
        horizontal = TRUE, las = 1,
        col = heat.colors(4), cex.axis = 0.8,
        names = c("Quality", "Fixed", "Alcohol", "Residual Sugar"))
#KEEP
par(mar = c(5, 4, 4, 2) + .1) # Reset margins to default
par(mfrow = c(2, 2))          # 2 x 2 grid of histograms
hist(quality, xlim = c(1, 10), ylim = c(1, 800), col = 3)
hist(alcohol, xlim = c(1, 20), ylim = c(1, 500), col = 5)
hist(fixed, xlim = c(1, 18), ylim = c(1, 600), col = 7)
# residual.sugar reaches 15.5 in the summary above, so the axis must extend
# past 12 or the upper tail is drawn outside the plot region.
hist(residual.sugar, xlim = c(-1, 16), ylim = c(1, 900), col = 14)
# What we see is a possible relationship between quality and alcohol, and possibly between
# fixed acidity and alcohol.
# I would like to see the fixed histogram with a line overlay for comparison.
# Combined lines overlay: kernel density estimates of four columns on a single
# panel. Columns are read straight from Wine so the plot does not depend on
# aliases or attached names defined elsewhere.
par(mfrow = c(1, 1))
plot(density(Wine$alcohol),
     main = "Density Plot of Alcohol and Fixed Similarities",
     lwd = 2, col = "darkblue", xlim = c(4, 16))
lines(density(Wine$fixed.acidity), lwd = 2, col = "red")
# Quality gets its own colour; it previously shared green with residual sugar
# and had no legend entry.
lines(density(Wine$quality), lwd = 2, col = "orange")
lines(density(Wine$residual.sugar), lwd = 2, col = "green")
colours <- c("Blue = Alcohol", "Red = Fixed", "Orange = Quality",
             "Green = Residual Sugar")
# Legend colours are spelled out in the same order as the curves above, so each
# key matches its line (numeric 1:3 would start at black, not dark blue).
legend(x = 11, y = 0.45, legend = colours,
       col = c("darkblue", "red", "orange", "green"), pch = 16)
# par(mfrow=c(3, 1)) #this has been kept for CHECKING only
# qqPlot(fixed)
# qqPlot(alcohol)
# qqPlot(residual.sugar)
# Definition:
# In statistics, a perfect negative correlation is represented by the value -1.00,
# while a 0.00 indicates no correlation and a +1.00 indicates a perfect positive correlation.
#install.packages("corrr") # This will correlate() all your data frame
#library(corrr)
# Correlate all columns of Wine, then keep only each variable's correlation
# with quality.
focus(correlate(Wine), quality) # testing
## # A tibble: 11 x 2
## rowname quality
## <chr> <dbl>
## 1 fixed.acidity 0.124
## 2 volatile.acidity -0.391
## 3 citric.acid 0.226
## 4 residual.sugar 0.0137
## 5 chlorides -0.129
## 6 free.sulfur.dioxide -0.0507
## 7 total.sulfur.dioxide -0.185
## 8 density -0.175
## 9 pH -0.0577
## 10 sulphates 0.251
## 11 alcohol 0.476
# Correlation WRT quality
# rowname quality
# fixed.acidity 0.12405165
# volatile.acidity -0.39055778
# citric.acid 0.22637251
# residual.sugar 0.01373164
# chlorides -0.12890656
# free.sulfur.dioxide -0.05065606
# total.sulfur.dioxide -0.18510029
# density -0.17491923
# pH -0.05773139
# sulphates 0.25139708
# alcohol 0.47616632 <- includes this last missing value (on diagram below)
# Pick one observation uniformly at random from all rows of Wine and copy its
# values into WQ (observed quality) and W1..W11 (the eleven predictors, in the
# same order as the terms of the regression formula below).
# For a repeatable check, assign r2 a fixed row number instead.
r2 <- sample.int(nrow(Wine), 1L)
#r2 # r2 check value
picked <- Wine[r2, ]

WQ <- picked$quality
#WQ # Check values
W1 <- picked$fixed.acidity
#W1 # Check values
W2 <- picked$volatile.acidity
#W2 # Check values
W3 <- picked$citric.acid
W4 <- picked$residual.sugar
W5 <- picked$chlorides
W6 <- picked$free.sulfur.dioxide
W7 <- picked$total.sulfur.dioxide
W8 <- picked$density
# W9 used to hold quality (already stored in WQ), which pushed pH, sulphates and
# alcohol one slot to the right of where the prediction formula reads them.
W9 <- picked$pH
W10 <- picked$sulphates
W11 <- picked$alcohol
# Legacy alias: alcohol used to live in W12; kept so older references resolve.
W12 <- W11
# library(corrr)
# Wine %>% correlate() %>% focus(quality)
# Multiple Linear Regression
# Ordinary least-squares fit of quality on all eleven physicochemical columns.
# Because data=Wine is supplied, the names in the formula are looked up in
# Wine's columns before any same-named variable in the workspace.
fit <- lm(quality ~ fixed.acidity + volatile.acidity + citric.acid + residual.sugar +
chlorides + free.sulfur.dioxide + total.sulfur.dioxide + density +
pH + sulphates + alcohol , data=Wine)
# Print the coefficient table, residual summary, R-squared and overall F-test.
summary(fit) # show results
##
## Call:
## lm(formula = quality ~ fixed.acidity + volatile.acidity + citric.acid +
## residual.sugar + chlorides + free.sulfur.dioxide + total.sulfur.dioxide +
## density + pH + sulphates + alcohol, data = Wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.68911 -0.36652 -0.04699 0.45202 2.02498
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.197e+01 2.119e+01 1.036 0.3002
## fixed.acidity 2.499e-02 2.595e-02 0.963 0.3357
## volatile.acidity -1.084e+00 1.211e-01 -8.948 < 2e-16 ***
## citric.acid -1.826e-01 1.472e-01 -1.240 0.2150
## residual.sugar 1.633e-02 1.500e-02 1.089 0.2765
## chlorides -1.874e+00 4.193e-01 -4.470 8.37e-06 ***
## free.sulfur.dioxide 4.361e-03 2.171e-03 2.009 0.0447 *
## total.sulfur.dioxide -3.265e-03 7.287e-04 -4.480 8.00e-06 ***
## density -1.788e+01 2.163e+01 -0.827 0.4086
## pH -4.137e-01 1.916e-01 -2.159 0.0310 *
## sulphates 9.163e-01 1.143e-01 8.014 2.13e-15 ***
## alcohol 2.762e-01 2.648e-02 10.429 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.648 on 1587 degrees of freedom
## Multiple R-squared: 0.3606, Adjusted R-squared: 0.3561
## F-statistic: 81.35 on 11 and 1587 DF, p-value: < 2.2e-16
# Copy the fitted coefficients into standalone variables, looked up by term
# name from a single coef() call (no repeated summary(fit) recomputation).
# NOTE(review): these variables reuse the Wine column names (and `density`), so
# from here on a bare name such as `alcohol` is a coefficient, not the data
# column. They are kept because code elsewhere may read them.
fit_coefs <- coef(fit)
Intercept <- fit_coefs[["(Intercept)"]]
fixed.acidity <- fit_coefs[["fixed.acidity"]]
volatile.acidity <- fit_coefs[["volatile.acidity"]]
citric.acid <- fit_coefs[["citric.acid"]]
residual.sugar <- fit_coefs[["residual.sugar"]]
chlorides <- fit_coefs[["chlorides"]]
free.sulfur.dioxide <- fit_coefs[["free.sulfur.dioxide"]]
total.sulfur.dioxide <- fit_coefs[["total.sulfur.dioxide"]]
density <- fit_coefs[["density"]]
pH <- fit_coefs[["pH"]]
sulphates <- fit_coefs[["sulphates"]]
alcohol <- fit_coefs[["alcohol"]]
#Intercept # Used to check for correct values
#fixed.acidity
#volatile.acidity

# Predicted quality for the sampled row r2, rounded to a whole score.
# predict() is used instead of a hand-written sum so that every coefficient is
# paired with its own column and no term can be counted twice.
Qual <- round(unname(predict(fit, newdata = Wine[r2, , drop = FALSE])), 0)
Qual # Used to check for correct values , Qual is the Regression model
WQ   # Observed quality of the same row (output varies with the sampled row)

# Report how the prediction compares with the observed score. Exactly one of
# the three conditions holds; the other two contribute a single space.
Equal <- paste0(
  if (WQ > Qual) " Regression Value LESS than Data Value " else " ",
  if (WQ < Qual) " Regression Value GREATER than Data Value " else " ",
  if (WQ == Qual) " Regression Value = Data Value" else " "
)
cat(Equal)
# diagnostic plots
# Regression diagnostics drawn on one page.
par(mfrow = c(1, 2))
# 2 x 2 panel grid filled column by column (optional: 4 graphs per page).
layout(matrix(1:4, nrow = 2, ncol = 2))
plot(fit)
# Further output that gives more insight into the model:
# 95% confidence intervals for the model parameters.
confint(object = fit, level = 0.95)
## 2.5 % 97.5 %
## (Intercept) -1.960710e+01 63.537517845
## fixed.acidity -2.590639e-02 0.075887499
## volatile.acidity -1.321126e+00 -0.846054953
## citric.acid -4.712441e-01 0.106116245
## residual.sugar -1.309474e-02 0.045757280
## chlorides -2.696632e+00 -1.051817956
## free.sulfur.dioxide 1.024314e-04 0.008620235
## total.sulfur.dioxide -4.693951e-03 -0.001835208
## density -6.031362e+01 24.551294539
## pH -7.894637e-01 -0.037842600
## sulphates 6.920661e-01 1.140602768
## alcohol 2.242512e-01 0.328144192
# Sequential analysis-of-variance table for the single fitted model: each term
# is assessed after the terms listed before it, so the rows depend on the order
# of the terms in the formula.
anova(fit) # anova table
## Analysis of Variance Table
##
## Response: quality
## Df Sum Sq Mean Sq F value Pr(>F)
## fixed.acidity 1 16.04 16.038 38.1924 8.132e-10 ***
## volatile.acidity 1 143.57 143.573 341.9062 < 2.2e-16 ***
## citric.acid 1 0.02 0.024 0.0581 0.809535
## residual.sugar 1 0.16 0.158 0.3764 0.539600
## chlorides 1 13.06 13.062 31.1057 2.868e-08 ***
## free.sulfur.dioxide 1 2.97 2.974 7.0828 0.007861 **
## total.sulfur.dioxide 1 30.09 30.093 71.6631 < 2.2e-16 ***
## density 1 61.31 61.310 146.0054 < 2.2e-16 ***
## pH 1 7.15 7.154 17.0358 3.859e-05 ***
## sulphates 1 55.70 55.697 132.6366 < 2.2e-16 ***
## alcohol 1 45.67 45.672 108.7643 < 2.2e-16 ***
## Residuals 1587 666.41 0.420
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Estimated variance-covariance matrix of the coefficient estimates; the square
# roots of its diagonal are the standard errors shown by summary(fit).
vcov(fit) # covariance matrix for model parameters
## (Intercept) fixed.acidity volatile.acidity
## (Intercept) 4.492100e+02 4.253152e-01 3.303414e-01
## fixed.acidity 4.253152e-01 6.733247e-04 -1.562945e-04
## volatile.acidity 3.303414e-01 -1.562945e-04 1.466552e-02
## citric.acid 3.235398e-02 -1.272831e-03 9.543695e-03
## residual.sugar 1.893780e-01 1.677641e-04 4.364326e-05
## chlorides 7.606468e-01 2.538893e-03 -1.340534e-02
## free.sulfur.dioxide -3.813607e-03 -6.205116e-06 4.347546e-05
## total.sulfur.dioxide 1.103784e-03 4.381638e-06 -1.889492e-05
## density -4.583128e+02 -4.409095e-01 -3.364363e-01
## pH 2.234170e+00 3.572150e-03 -6.183079e-04
## sulphates 5.983606e-01 4.511465e-04 2.899835e-03
## alcohol -4.258940e-01 -3.734381e-04 -2.530845e-04
## citric.acid residual.sugar chlorides
## (Intercept) 3.235398e-02 1.893780e-01 7.606468e-01
## fixed.acidity -1.272831e-03 1.677641e-04 2.538893e-03
## volatile.acidity 9.543695e-03 4.364326e-05 -1.340534e-02
## citric.acid 2.166083e-02 -1.179444e-04 -1.627310e-02
## residual.sugar -1.179444e-04 2.250629e-04 6.822396e-05
## chlorides -1.627310e-02 6.822396e-05 1.757984e-01
## free.sulfur.dioxide 5.192585e-05 -4.120402e-06 -5.328066e-05
## total.sulfur.dioxide -2.841887e-05 -3.968613e-07 4.283090e-05
## density -2.737179e-02 -1.931659e-01 -8.508338e-01
## pH 9.654568e-04 9.458076e-04 1.856139e-02
## sulphates -4.468331e-04 3.443137e-04 -1.680336e-02
## alcohol -5.825858e-04 -1.994800e-04 1.025390e-03
## free.sulfur.dioxide total.sulfur.dioxide
## (Intercept) -3.813607e-03 1.103784e-03
## fixed.acidity -6.205116e-06 4.381638e-06
## volatile.acidity 4.347546e-05 -1.889492e-05
## citric.acid 5.192585e-05 -2.841887e-05
## residual.sugar -4.120402e-06 -3.968613e-07
## chlorides -5.328066e-05 4.283090e-05
## free.sulfur.dioxide 4.714508e-06 -1.048949e-06
## total.sulfur.dioxide -1.048949e-06 5.310452e-07
## density 4.013057e-03 -1.241979e-03
## pH -5.726739e-05 2.707286e-05
## sulphates -1.339587e-05 -2.754751e-06
## alcohol 1.503270e-06 1.610225e-06
## density pH sulphates
## (Intercept) -4.583128e+02 2.234170e+00 5.983606e-01
## fixed.acidity -4.409095e-01 3.572150e-03 4.511465e-04
## volatile.acidity -3.364363e-01 -6.183079e-04 2.899835e-03
## citric.acid -2.737179e-02 9.654568e-04 -4.468331e-04
## residual.sugar -1.931659e-01 9.458076e-04 3.443137e-04
## chlorides -8.508338e-01 1.856139e-02 -1.680336e-02
## free.sulfur.dioxide 4.013057e-03 -5.726739e-05 -1.339587e-05
## total.sulfur.dioxide -1.241979e-03 2.707286e-05 -2.754751e-06
## density 4.679910e+02 -2.371814e+00 -6.134005e-01
## pH -2.371814e+00 3.670955e-02 2.817249e-03
## sulphates -6.134005e-01 2.817249e-03 1.307306e-02
## alcohol 4.330216e-01 -2.638799e-03 -8.744756e-04
## alcohol
## (Intercept) -4.258940e-01
## fixed.acidity -3.734381e-04
## volatile.acidity -2.530845e-04
## citric.acid -5.825858e-04
## residual.sugar -1.994800e-04
## chlorides 1.025390e-03
## free.sulfur.dioxide 1.503270e-06
## total.sulfur.dioxide 1.610225e-06
## density 4.330216e-01
## pH -2.638799e-03
## sulphates -8.744756e-04
## alcohol 7.013803e-04
## 2. Data Processing ---- Setting the scene for further analysis ---- Phase 2 --------
## 2.1. Preliminaries ----
# Re-read the source file for the second phase.
Wine <- read.csv("Wine.csv")

# Wine.csv starts with a header row. Reading it with header = FALSE and no skip
# turned that header into a data row and forced every column to character, so
# the header line is skipped for both reads and the names are set explicitly
# after binding.
train <- read.csv("Wine.csv", stringsAsFactors = FALSE, skip = 1, header = FALSE)
test <- read.csv("Wine.csv", stringsAsFactors = FALSE, skip = 1, header = FALSE)

# NOTE(review): train and test come from the same file, so `adult` contains
# every observation twice. Confirm whether a real train/test split (or just
# `Wine`) is intended before modelling on it. The name `adult` is kept because
# later code may refer to it.
adult <- rbind(train, test)
names(adult) <- c("fixed.acidity", "volatile.acidity", "citric.acid",
                  "residual.sugar", "chlorides", "free.sulfur.dioxide",
                  "total.sulfur.dioxide", "density", "pH", "sulphates",
                  "alcohol", "quality")

# 2.2 Data Cleaning ----
# The previously recorded output of the two calls below showed all-character
# columns caused by the header row; it has been removed because it no longer
# matches. Re-knit to regenerate it.
summarizeColumns(adult)
str(adult)

# View() opens a data viewer, so only call it in an interactive session.
if (interactive()) {
  View(adult)
}
#View(test)
#View(Wine)
#View(test)
#View(Wine)
## Free onwards