# First load necessary libraries and read in the data set
library(readxl)
library(car)
## Loading required package: carData
library(tidyverse)
## Warning: package 'ggplot2' was built under R version 4.4.3
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(corrplot)
## corrplot 0.95 loaded
library(olsrr)
## Warning: package 'olsrr' was built under R version 4.4.3
##
## Attaching package: 'olsrr'
##
## The following object is masked from 'package:datasets':
##
## rivers
library(dplyr)
library(GGally)
## Warning: package 'GGally' was built under R version 4.4.3
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
# Then read the data
wine <- read_excel("/Users/doris/OneDrive/Documents/data-table-B11.XLS")
# Next, check for missing values in the dataset
colSums(is.na(wine))
## Clarity Aroma Body Flavor Oakiness Quality Region
## 0 0 0 0 0 0 0
The wine dataset has no missing values
# Take a look at the structure of the wine dataset
str(wine)
## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
## $ Clarity : num [1:38] 1 1 1 1 1 1 1 1 1 1 ...
## $ Aroma : num [1:38] 3.3 4.4 3.9 3.9 5.6 4.6 4.8 5.3 4.3 4.3 ...
## $ Body : num [1:38] 2.8 4.9 5.3 2.6 5.1 4.7 4.8 4.5 4.3 3.9 ...
## $ Flavor : num [1:38] 3.1 3.5 4.8 3.1 5.5 5 4.8 4.3 3.9 4.7 ...
## $ Oakiness: num [1:38] 4.1 3.9 4.7 3.6 5.1 4.1 3.3 5.2 2.9 3.9 ...
## $ Quality : num [1:38] 9.8 12.6 11.9 11.1 13.3 12.8 12.8 12 13.6 13.9 ...
## $ Region : num [1:38] 1 1 1 1 1 1 1 1 3 1 ...
#Looking at the structure we can see that the wine dataset has 38
# observations.
# All correctly classified as numeric except for Region mis classified as
#numeric it should be categorical, thus we need to convert this variable.
# Apart from this everything looks good.
# Since Quality is the response variable am reordering the data so
# it appears as the first variable
wine <- wine[, c("Quality", setdiff(names(wine), "Quality"))]
# Let's take a look at the summary statistics
summary(wine)
## Quality Clarity Aroma Body
## Min. : 7.90 Min. :0.5000 Min. :3.300 Min. :2.600
## 1st Qu.:11.15 1st Qu.:0.8250 1st Qu.:4.125 1st Qu.:4.150
## Median :12.45 Median :1.0000 Median :4.650 Median :4.750
## Mean :12.44 Mean :0.9237 Mean :4.847 Mean :4.684
## 3rd Qu.:13.75 3rd Qu.:1.0000 3rd Qu.:5.450 3rd Qu.:5.375
## Max. :16.10 Max. :1.0000 Max. :7.700 Max. :6.600
## Flavor Oakiness Region
## Min. :2.900 Min. :2.900 Min. :1.000
## 1st Qu.:4.225 1st Qu.:3.700 1st Qu.:1.000
## Median :4.800 Median :4.100 Median :2.000
## Mean :4.768 Mean :4.255 Mean :1.868
## 3rd Qu.:5.500 3rd Qu.:4.775 3rd Qu.:3.000
## Max. :7.000 Max. :6.000 Max. :3.000
Exploratory Data Analysis (EDA)
# First we use the Pair wise scatter plot to visually identify
# pontential relationships patterns in our wine dataset
dev.new(width = 1000, height = 1000, unit = "px")
pairs(wine, pch = 19)
From the plot above its clear to see that Flavor has the strongest
positive linear relationship with quality followed by Aroma and Body.
Oakiness happens to have a negative linear relationship with Quality.
But notice that Clarity doesn’t seem to have a linear relationship. This
is not to say it does not have a relationship at all, it just might not
be linearily related. Its difficult to identify the trend For Region
since its has levels
# Next I use a loess smoother to see if the patterns becomes more clear
# Visualize data
wine_long <- wine %>%
pivot_longer(cols = -c(Quality,Region), names_to = "Variable", values_to = "Value")
ggplot(wine_long, aes(x = Value, y = Quality)) +
geom_point(color = "blue", alpha = 0.5) +
geom_smooth(method = "loess", color = "red", se = TRUE) +
facet_wrap(~ Variable, scales = "free_x") +
labs(title = "LOESS Smoothing of Quality vs Other Variables",
x = "Variable Value",
y = "Quality") +
theme_minimal()
## `geom_smooth()` using formula = 'y ~ x'
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1.0025
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.1025
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0
## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.01
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 1.0025
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.1025
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0
## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 0.01
# The first thing you notice looking at this plot is that Region is not
present, this because loess smoother works well with continous variables
and not categorical variables. Looking at the plots its clear that
# From the above plots we can visually notice the following
# Aroma and Flavor: There is a clear positive relationship with Quality,
# as the red LOESS curve shows an upward trend.
# Body: Similar to Aroma and Flavor, there is an increasing trend
# indicating higher Body values are associated with higher Quality.
# Clarity: The relationship is mostly flat, suggesting no significant
# effect of Clarity on Quality.
# Oakiness: Shows a complex relationship with an initial increase in
# Quality follwed by a slight decrease at higher Oakiness values
# Wider confidence interval at extremes weak negative relationship with Quality
# Confidence Intervals: The shaded grey regions represent the 95%
# confidence intervals, which widen when there is uncertainty in the
# relationship, so the narrower the better.
Correlation matrix
wine_numeric <- wine[, !names(wine) %in% "Region"]
correlation_matrix <- cor(wine)
print(correlation_matrix)
## Quality Clarity Aroma Body Flavor Oakiness
## Quality 1.00000000 0.02844131 0.7073243 0.5487022 0.79004713 -0.04704047
## Clarity 0.02844131 1.00000000 0.0619021 -0.3083783 -0.08515993 0.18321471
## Aroma 0.70732432 0.06190210 1.0000000 0.5489102 0.73656121 0.20164445
## Body 0.54870219 -0.30837826 0.5489102 1.0000000 0.64665917 0.15210591
## Flavor 0.79004713 -0.08515993 0.7365612 0.6466592 1.00000000 0.17976051
## Oakiness -0.04704047 0.18321471 0.2016444 0.1521059 0.17976051 1.00000000
## Region 0.50704891 -0.02032013 0.6201006 0.4354403 0.50591814 -0.05956862
## Region
## Quality 0.50704891
## Clarity -0.02032013
## Aroma 0.62010056
## Body 0.43544030
## Flavor 0.50591814
## Oakiness -0.05956862
## Region 1.00000000
ggpairs(wine_numeric)

# Interpretations
# Aroma-Quality: Strong positive correlation (0.707)
# Body-Quality: Moderate positive correlation (0.549)
# Flavor-Quality: Strong positive correlation (0.790)
# Oakiness-Quality: No significant correlation (-0.047)
# Clarity-Quality: Very weak positive correlation (0.028)
# Key strong correlations:
# Flavor has the strongest relationship with Quality
# Aroma is the second most correlated with Quality
# Body also shows a moderate positive correlation with Quality
# F-test Hypothesis
#The F-test evaluates whether the overall model is statistically significant.
# Null Hypothesis (H₀): All regression coefficients are simultaneously equal to zero.
#β₁ = β₂ = β₃ = β₄ = β₅ = β₆ = 0
#(Where β₁, β₂, etc. correspond to the coefficients for Clarity, Aroma,
# Body, Flavor, Oakiness, and Region)
#Alternative Hypothesis (H₁): At least one regression coefficient is not equal to zero.
#t-test Hypotheses
#The t-tests evaluate the significance of each individual predictor in the model.
#For each predictor (Clarity, Aroma, Body, Flavor, Oakiness, and Region variables):
#Null Hypothesis (H₀): The regression coefficient for the predictor equals zero (βᵢ = 0).
#Alternative Hypothesis (H₁): The regression coefficient for the predictor is not equal to zero (βᵢ ≠ 0).
#These t-tests help determine which specific predictors have a
#statistically significant relationship with wine Quality when controlling
#for the other variables in the model.
# We are considering the significance level to be 0.05
# Transforming the Region to the correct format
wine$Region = as.factor(wine$Region)
# Verify the change
str(wine)
## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
## $ Quality : num [1:38] 9.8 12.6 11.9 11.1 13.3 12.8 12.8 12 13.6 13.9 ...
## $ Clarity : num [1:38] 1 1 1 1 1 1 1 1 1 1 ...
## $ Aroma : num [1:38] 3.3 4.4 3.9 3.9 5.6 4.6 4.8 5.3 4.3 4.3 ...
## $ Body : num [1:38] 2.8 4.9 5.3 2.6 5.1 4.7 4.8 4.5 4.3 3.9 ...
## $ Flavor : num [1:38] 3.1 3.5 4.8 3.1 5.5 5 4.8 4.3 3.9 4.7 ...
## $ Oakiness: num [1:38] 4.1 3.9 4.7 3.6 5.1 4.1 3.3 5.2 2.9 3.9 ...
## $ Region : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 3 1 ...
Fit Multiple Linear Regression Model
# Fitting a multiple linear regression model using all predictors
model <- lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data = wine)
# Summary of the model
summary(model)
##
## Call:
## lm(formula = Quality ~ Clarity + Aroma + Body + Flavor + Oakiness +
## Region, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.80824 -0.58413 -0.02081 0.48627 1.70909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.81437 1.96944 3.968 0.000417 ***
## Clarity 0.01705 1.45627 0.012 0.990736
## Aroma 0.08901 0.25250 0.353 0.726908
## Body 0.07967 0.26772 0.298 0.768062
## Flavor 1.11723 0.24026 4.650 6.25e-05 ***
## Oakiness -0.34644 0.23301 -1.487 0.147503
## Region2 -1.51285 0.39227 -3.857 0.000565 ***
## Region3 0.97259 0.51017 1.906 0.066218 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.9154 on 30 degrees of freedom
## Multiple R-squared: 0.8376, Adjusted R-squared: 0.7997
## F-statistic: 22.1 on 7 and 30 DF, p-value: 3.295e-10
Full F-test
# Full F-test
summary(aov(model))
## Df Sum Sq Mean Sq F value Pr(>F)
## Clarity 1 0.13 0.13 0.149 0.701824
## Aroma 1 77.35 77.35 92.306 1.15e-10 ***
## Body 1 6.41 6.41 7.654 0.009603 **
## Flavor 1 19.05 19.05 22.732 4.48e-05 ***
## Oakiness 1 8.60 8.60 10.260 0.003213 **
## Region 2 18.11 9.05 10.804 0.000292 ***
## Residuals 30 25.14 0.84
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Partial F-Test
#Partial F-Test
#First Run full model
full_model=lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data=wine)
# Run a model with the excluded variables
exc_model=lm(Quality ~ Flavor + Region, data=wine)
#Run ANOVA of the two
anova(exc_model, full_model)
## Analysis of Variance Table
##
## Model 1: Quality ~ Flavor + Region
## Model 2: Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 34 27.213
## 2 30 25.140 4 2.073 0.6184 0.6528
# Check multicollinearity using VIF
vif(model)
## GVIF Df GVIF^(1/(2*Df))
## Clarity 1.439151 1 1.199646
## Aroma 3.298392 1 1.816147
## Body 2.149329 1 1.466059
## Flavor 2.693412 1 1.641162
## Oakiness 1.305766 1 1.142701
## Region 2.676874 2 1.279107
# VIF measures how much the variance of a regression coefficient is
# inflated due to multicollinearity.
# VIF values > 5 indicate problematic multicollinearity
# VIF values > 10 suggest severe multicollinearity
# All VIF values are below 5, which means:
# There's some correlation between predictors
# But not enough to cause serious multicollinearity issues
# The model's coefficient estimates should be relatively stable
model2 = lm(Quality ~ Flavor+Region, data = wine)
summary(model2)
##
## Call:
## lm(formula = Quality ~ Flavor + Region, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.97630 -0.58844 0.02184 0.51572 1.94232
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.0943 0.7912 8.967 1.76e-10 ***
## Flavor 1.1155 0.1738 6.417 2.49e-07 ***
## Region2 -1.5335 0.3688 -4.158 0.000205 ***
## Region3 1.2234 0.4003 3.056 0.004346 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8946 on 34 degrees of freedom
## Multiple R-squared: 0.8242, Adjusted R-squared: 0.8087
## F-statistic: 53.13 on 3 and 34 DF, p-value: 6.358e-13
# Interaction of the reduced model
interact = lm(Quality ~ Flavor*Region, data=wine)
summary(interact)
##
## Call:
## lm(formula = Quality ~ Flavor * Region, data = wine)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.94964 -0.58463 0.04393 0.49607 1.97295
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.7311 1.1292 5.961 1.22e-06 ***
## Flavor 1.1985 0.2532 4.733 4.31e-05 ***
## Region2 -2.8942 2.1183 -1.366 0.181
## Region3 3.3833 2.0153 1.679 0.103
## Flavor:Region2 0.3108 0.4766 0.652 0.519
## Flavor:Region3 -0.4029 0.3878 -1.039 0.307
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.8914 on 32 degrees of freedom
## Multiple R-squared: 0.8357, Adjusted R-squared: 0.8101
## F-statistic: 32.56 on 5 and 32 DF, p-value: 1.179e-11
Model Diagnostics
# Generating diagnostic plots to investigate if model assumptions are being met
par(mfrow = c(2, 2))# Set up a 2x2 grid of plots
plot(model, which = 1:4) # Generate diagnostic plots for the regression model

# Calculate Cook's Distance
cooks_dist <- cooks.distance(model)
# Identify influential points
influential_points <- which(cooks_dist > 0.05)
# Plot Cook's Distance for only influential points
plot(influential_points, cooks_dist[influential_points], type = "h",
main = "Influential Observations Based on Cook's Distance",
xlab = "Observation Index", ylab = "Cook's Distance", col = "blue", lwd = 1.5)
# Add threshold reference line
abline(h = 0.05, col = "red", lty = 2)
# Add point labels for clarity
text(influential_points, cooks_dist[influential_points],
labels = influential_points, pos = 3, cex = 0.8)
# Add legend
legend("topright", legend = "Threshold: Cook's Distance = 0.05",
col = "red", lty = 2, bty = "n")

# Fit the initial model
model <- lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data=wine)
# Calculate Cook's Distance
cooks_dist <- cooks.distance(model)
# Identify influential points (Cook's Distance > 0.05)
influential_points <- which(cooks_dist > 0.05)
# Remove influential observations
wine_cleaned <- wine[-influential_points, ]
# Fit the model without influential observations
model_cleaned <- lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data = wine_cleaned)
# Summary of the cleaned model
summary(model_cleaned)
##
## Call:
## lm(formula = Quality ~ Clarity + Aroma + Body + Flavor + Oakiness +
## Region, data = wine_cleaned)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.03020 -0.37083 -0.03834 0.40386 1.44420
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 6.33674 1.73723 3.648 0.001217 **
## Clarity 1.36297 1.34614 1.013 0.320996
## Aroma 0.15472 0.19694 0.786 0.439488
## Body 0.03905 0.23013 0.170 0.866639
## Flavor 1.06426 0.20174 5.275 1.83e-05 ***
## Oakiness -0.27269 0.18700 -1.458 0.157229
## Region2 -1.48321 0.32329 -4.588 0.000108 ***
## Region3 1.16825 0.39330 2.970 0.006483 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.6718 on 25 degrees of freedom
## Multiple R-squared: 0.9069, Adjusted R-squared: 0.8809
## F-statistic: 34.8 on 7 and 25 DF, p-value: 2.371e-11
Automatic Model Selections
# forward selection
model.forward<-ols_step_forward_p(model, penter = 0.10, details = F) # penter: threshold p-value for enter
model.forward
##
##
## Stepwise Summary
## ------------------------------------------------------------------------
## Step Variable AIC SBC SBIC R2 Adj. R2
## ------------------------------------------------------------------------
## 0 Base Model 165.209 168.484 54.520 0.00000 0.00000
## 1 Flavor 130.021 134.934 19.962 0.62417 0.61373
## 2 Region 105.152 113.340 -3.725 0.82419 0.80868
## 3 Oakiness 104.444 114.270 -3.474 0.83628 0.81644
## ------------------------------------------------------------------------
##
## Final Model Output
## ------------------
##
## Model Summary
## ---------------------------------------------------------------
## R 0.914 RMSE 0.817
## R-Squared 0.836 MSE 0.667
## Adj. R-Squared 0.816 Coef. Var 7.046
## Pred R-Squared 0.786 AIC 104.444
## MAE 0.654 SBC 114.270
## ---------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
## AIC: Akaike Information Criteria
## SBC: Schwarz Bayesian Criteria
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 129.447 4 32.362 42.141 0.0000
## Residual 25.342 33 0.768
## Total 154.788 37
## -------------------------------------------------------------------
##
## Parameter Estimates
## ----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ----------------------------------------------------------------------------------------
## (Intercept) 8.121 1.016 7.990 0.000 6.053 10.189
## Flavor 1.192 0.177 0.599 6.727 0.000 0.832 1.553
## Region2 -1.515 0.361 -0.319 -4.193 0.000 -2.251 -0.780
## Region3 1.094 0.401 0.252 2.728 0.010 0.278 1.909
## Oakiness -0.318 0.204 -0.115 -1.561 0.128 -0.733 0.097
## ----------------------------------------------------------------------------------------
# backward selection
model.backward<-ols_step_backward_p(model, prem = 0.10, details = F) # prem: threshold p-value for removal
model.backward
##
##
## Stepwise Summary
## ------------------------------------------------------------------------
## Step Variable AIC SBC SBIC R2 Adj. R2
## ------------------------------------------------------------------------
## 0 Full Model 110.141 124.879 3.893 0.83758 0.79969
## 1 Clarity 108.141 121.242 1.359 0.83758 0.80615
## 2 Body 106.271 117.734 -1.082 0.83702 0.81156
## 3 Aroma 104.444 114.270 -3.474 0.83628 0.81644
## ------------------------------------------------------------------------
##
## Final Model Output
## ------------------
##
## Model Summary
## ---------------------------------------------------------------
## R 0.914 RMSE 0.817
## R-Squared 0.836 MSE 0.667
## Adj. R-Squared 0.816 Coef. Var 7.046
## Pred R-Squared 0.786 AIC 104.444
## MAE 0.654 SBC 114.270
## ---------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
## AIC: Akaike Information Criteria
## SBC: Schwarz Bayesian Criteria
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 129.447 4 32.362 42.141 0.0000
## Residual 25.342 33 0.768
## Total 154.788 37
## -------------------------------------------------------------------
##
## Parameter Estimates
## ----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ----------------------------------------------------------------------------------------
## (Intercept) 8.121 1.016 7.990 0.000 6.053 10.189
## Flavor 1.192 0.177 0.599 6.727 0.000 0.832 1.553
## Oakiness -0.318 0.204 -0.115 -1.561 0.128 -0.733 0.097
## Region2 -1.515 0.361 -0.319 -4.193 0.000 -2.251 -0.780
## Region3 1.094 0.401 0.252 2.728 0.010 0.278 1.909
## ----------------------------------------------------------------------------------------
# stepwise selection
model.stepwise<-ols_step_both_p(model, pent = 0.10, prem = 0.10, details = FALSE)
model.stepwise
##
##
## Stepwise Summary
## ------------------------------------------------------------------------
## Step Variable AIC SBC SBIC R2 Adj. R2
## ------------------------------------------------------------------------
## 0 Base Model 165.209 168.484 54.520 0.00000 0.00000
## 1 Flavor (+) 130.021 134.934 19.962 0.62417 0.61373
## 2 Region (+) 105.152 113.340 -3.725 0.82419 0.80868
## ------------------------------------------------------------------------
##
## Final Model Output
## ------------------
##
## Model Summary
## ---------------------------------------------------------------
## R 0.908 RMSE 0.846
## R-Squared 0.824 MSE 0.716
## Adj. R-Squared 0.809 Coef. Var 7.193
## Pred R-Squared 0.780 AIC 105.152
## MAE 0.676 SBC 113.340
## ---------------------------------------------------------------
## RMSE: Root Mean Square Error
## MSE: Mean Square Error
## MAE: Mean Absolute Error
## AIC: Akaike Information Criteria
## SBC: Schwarz Bayesian Criteria
##
## ANOVA
## -------------------------------------------------------------------
## Sum of
## Squares DF Mean Square F Sig.
## -------------------------------------------------------------------
## Regression 127.575 3 42.525 53.131 0.0000
## Residual 27.213 34 0.800
## Total 154.788 37
## -------------------------------------------------------------------
##
## Parameter Estimates
## ----------------------------------------------------------------------------------------
## model Beta Std. Error Std. Beta t Sig lower upper
## ----------------------------------------------------------------------------------------
## (Intercept) 7.094 0.791 8.967 0.000 5.486 8.702
## Flavor 1.116 0.174 0.561 6.417 0.000 0.762 1.469
## Region2 -1.533 0.369 -0.323 -4.158 0.000 -2.283 -0.784
## Region3 1.223 0.400 0.282 3.056 0.004 0.410 2.037
## ----------------------------------------------------------------------------------------
Best subset selection (AIC, SBC, adjusted R-squre, C(p) etc.)
# 2. best subset selection (AIC, SBC, adjusted R-square, C(p) etc.)
model.best.subset<-ols_step_best_subset(model) # takes some of time 5-10 min. Be patient :)
model.best.subset
## Best Subsets Regression
## --------------------------------------------------------
## Model Index Predictors
## --------------------------------------------------------
## 1 Flavor
## 2 Flavor Region
## 3 Flavor Oakiness Region
## 4 Aroma Flavor Oakiness Region
## 5 Aroma Body Flavor Oakiness Region
## 6 Clarity Aroma Body Flavor Oakiness Region
## --------------------------------------------------------
##
## Subsets Regression Summary
## --------------------------------------------------------------------------------------------------------------------------------
## Adj. Pred
## Model R-Square R-Square R-Square C(p) AIC SBIC SBC MSEP FPE HSP APC
## --------------------------------------------------------------------------------------------------------------------------------
## 1 0.6242 0.6137 0.5868 35.4190 130.0214 19.9619 134.9341 61.4102 1.7010 0.0462 0.4176
## 2 0.8242 0.8087 0.7804 2.4737 105.1516 -3.7246 113.3395 29.5721 0.8636 0.0235 0.2059
## 3 0.8363 0.8164 0.7863 2.2407 104.4444 -3.4739 114.2699 28.3731 0.8488 0.0233 0.2022
## 4 0.8370 0.8116 0.7771 4.1033 106.2714 -1.0824 117.7345 29.1269 0.8921 0.0246 0.2124
## 5 0.8376 0.8061 0.7583 6.0001 108.1409 1.3593 121.2416 29.9634 0.9390 0.0262 0.2233
## 6 0.8376 0.7997 0.7213 8.0000 110.1408 3.8925 124.8790 30.9621 0.9924 0.0279 0.2358
## --------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria
## SBIC: Sawa's Bayesian Information Criteria
## SBC: Schwarz Bayesian Criteria
## MSEP: Estimated error of prediction, assuming multivariate normality
## FPE: Final Prediction Error
## HSP: Hocking's Sp
## APC: Amemiya Prediction Criteria