# First load necessary libraries and read in the data set
library(readxl)
library(car)

## Loading required package: carData

library(tidyverse)

## Warning: package 'ggplot2' was built under R version 4.4.3

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(corrplot)

## corrplot 0.95 loaded

library(olsrr)

## Warning: package 'olsrr' was built under R version 4.4.3

## 
## Attaching package: 'olsrr'
## 
## The following object is masked from 'package:datasets':
## 
##     rivers

library(dplyr)
library(GGally)

## Warning: package 'GGally' was built under R version 4.4.3

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

#  Then read the data
wine <- read_excel("/Users/doris/OneDrive/Documents/data-table-B11.XLS")

# Next, check for missing values in the  dataset
colSums(is.na(wine))

##  Clarity    Aroma     Body   Flavor Oakiness  Quality   Region 
##        0        0        0        0        0        0        0

The wine dataset has no missing values

# Take a look at the structure of the wine dataset
str(wine)

## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Clarity : num [1:38] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Aroma   : num [1:38] 3.3 4.4 3.9 3.9 5.6 4.6 4.8 5.3 4.3 4.3 ...
##  $ Body    : num [1:38] 2.8 4.9 5.3 2.6 5.1 4.7 4.8 4.5 4.3 3.9 ...
##  $ Flavor  : num [1:38] 3.1 3.5 4.8 3.1 5.5 5 4.8 4.3 3.9 4.7 ...
##  $ Oakiness: num [1:38] 4.1 3.9 4.7 3.6 5.1 4.1 3.3 5.2 2.9 3.9 ...
##  $ Quality : num [1:38] 9.8 12.6 11.9 11.1 13.3 12.8 12.8 12 13.6 13.9 ...
##  $ Region  : num [1:38] 1 1 1 1 1 1 1 1 3 1 ...

#Looking at the structure we can see that the wine dataset has 38 
# observations. 
# All correctly classified as numeric except for Region mis classified as
#numeric it should be categorical, thus we need to convert this variable.
# Apart from this everything looks good.

# Since Quality is the response variable am reordering the data so
# it appears as the first variable
wine <- wine[, c("Quality", setdiff(names(wine), "Quality"))]

# Let's take a look at the summary statistics
summary(wine)

##     Quality         Clarity           Aroma            Body      
##  Min.   : 7.90   Min.   :0.5000   Min.   :3.300   Min.   :2.600  
##  1st Qu.:11.15   1st Qu.:0.8250   1st Qu.:4.125   1st Qu.:4.150  
##  Median :12.45   Median :1.0000   Median :4.650   Median :4.750  
##  Mean   :12.44   Mean   :0.9237   Mean   :4.847   Mean   :4.684  
##  3rd Qu.:13.75   3rd Qu.:1.0000   3rd Qu.:5.450   3rd Qu.:5.375  
##  Max.   :16.10   Max.   :1.0000   Max.   :7.700   Max.   :6.600  
##      Flavor         Oakiness         Region     
##  Min.   :2.900   Min.   :2.900   Min.   :1.000  
##  1st Qu.:4.225   1st Qu.:3.700   1st Qu.:1.000  
##  Median :4.800   Median :4.100   Median :2.000  
##  Mean   :4.768   Mean   :4.255   Mean   :1.868  
##  3rd Qu.:5.500   3rd Qu.:4.775   3rd Qu.:3.000  
##  Max.   :7.000   Max.   :6.000   Max.   :3.000

Exploratory Data Analysis (EDA)

# First we use the Pair wise scatter plot to visually identify
# pontential relationships patterns in our wine dataset

dev.new(width = 1000, height = 1000, unit = "px")
pairs(wine, pch = 19)

From the plot above its clear to see that Flavor has the strongest positive linear relationship with quality followed by Aroma and Body. Oakiness happens to have a negative linear relationship with Quality. But notice that Clarity doesn’t seem to have a linear relationship. This is not to say it does not have a relationship at all, it just might not be linearily related. Its difficult to identify the trend For Region since its has levels

# Next I use a loess smoother to see if the patterns becomes more clear

# Visualize data
wine_long <- wine %>%
  pivot_longer(cols = -c(Quality,Region), names_to = "Variable", values_to = "Value")

ggplot(wine_long, aes(x = Value, y = Quality)) +
  geom_point(color = "blue", alpha = 0.5) +
  geom_smooth(method = "loess", color = "red", se = TRUE) +
  facet_wrap(~ Variable, scales = "free_x") +
  labs(title = "LOESS Smoothing of Quality vs Other Variables",
       x = "Variable Value",
       y = "Quality") +
  theme_minimal()

## `geom_smooth()` using formula = 'y ~ x'

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : pseudoinverse used at 1.0025

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : neighborhood radius 0.1025

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : reciprocal condition number 0

## Warning in simpleLoess(y, x, w, span, degree = degree, parametric = parametric,
## : There are other near singularities as well. 0.01

## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : pseudoinverse used at
## 1.0025

## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : neighborhood radius
## 0.1025

## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : reciprocal condition
## number 0

## Warning in predLoess(object$y, object$x, newx = if (is.null(newdata)) object$x
## else if (is.data.frame(newdata))
## as.matrix(model.frame(delete.response(terms(object)), : There are other near
## singularities as well. 0.01

# The first thing you notice looking at this plot is that Region is not present, this because loess smoother works well with continous variables and not categorical variables. Looking at the plots its clear that

# From the above plots we can visually notice the following
# Aroma and Flavor: There is a clear positive relationship with Quality,
# as the red LOESS curve shows an upward trend.

# Body: Similar to Aroma and Flavor, there is an increasing trend
# indicating higher Body values are associated with higher Quality.

# Clarity: The relationship is mostly flat, suggesting no significant
# effect of Clarity on Quality.

# Oakiness: Shows a complex relationship with an initial increase in
# Quality follwed by a slight decrease at higher Oakiness values
# Wider confidence interval at extremes weak negative relationship with Quality

# Confidence Intervals: The shaded grey regions represent the 95%
# confidence intervals, which widen when there is uncertainty in the
# relationship, so the narrower the better.

Correlation matrix

wine_numeric <- wine[, !names(wine) %in% "Region"]
correlation_matrix <- cor(wine)
print(correlation_matrix)

##              Quality     Clarity     Aroma       Body      Flavor    Oakiness
## Quality   1.00000000  0.02844131 0.7073243  0.5487022  0.79004713 -0.04704047
## Clarity   0.02844131  1.00000000 0.0619021 -0.3083783 -0.08515993  0.18321471
## Aroma     0.70732432  0.06190210 1.0000000  0.5489102  0.73656121  0.20164445
## Body      0.54870219 -0.30837826 0.5489102  1.0000000  0.64665917  0.15210591
## Flavor    0.79004713 -0.08515993 0.7365612  0.6466592  1.00000000  0.17976051
## Oakiness -0.04704047  0.18321471 0.2016444  0.1521059  0.17976051  1.00000000
## Region    0.50704891 -0.02032013 0.6201006  0.4354403  0.50591814 -0.05956862
##               Region
## Quality   0.50704891
## Clarity  -0.02032013
## Aroma     0.62010056
## Body      0.43544030
## Flavor    0.50591814
## Oakiness -0.05956862
## Region    1.00000000

ggpairs(wine_numeric)

# Interpretations
# Aroma-Quality: Strong positive correlation (0.707)
# Body-Quality: Moderate positive correlation (0.549)
# Flavor-Quality: Strong positive correlation (0.790)
# Oakiness-Quality: No significant correlation (-0.047)
# Clarity-Quality: Very weak positive correlation (0.028)

# Key strong correlations:

# Flavor has the strongest relationship with Quality
# Aroma is the second most correlated with Quality
# Body also shows a moderate positive correlation with Quality

# F-test Hypothesis
#The F-test evaluates whether the overall model is statistically significant.
# Null Hypothesis (H₀): All regression coefficients are simultaneously equal to zero.
   #β₁ = β₂ = β₃ = β₄ = β₅ = β₆ = 0
#(Where β₁, β₂, etc. correspond to the coefficients for Clarity, Aroma,
# Body, Flavor, Oakiness, and Region)
#Alternative Hypothesis (H₁): At least one regression coefficient is not equal to zero.

#t-test Hypotheses
#The t-tests evaluate the significance of each individual predictor in the model.
#For each predictor (Clarity, Aroma, Body, Flavor, Oakiness, and Region variables):
#Null Hypothesis (H₀): The regression coefficient for the predictor equals zero (βᵢ = 0).
#Alternative Hypothesis (H₁): The regression coefficient for the predictor is not equal to zero (βᵢ ≠ 0).

#These t-tests help determine which specific predictors have a
#statistically significant relationship with wine Quality when controlling
#for the other variables in the model.

# We are considering the significance level to be 0.05

# Transforming the Region to the correct format
wine$Region = as.factor(wine$Region)

# Verify the change
str(wine)

## tibble [38 × 7] (S3: tbl_df/tbl/data.frame)
##  $ Quality : num [1:38] 9.8 12.6 11.9 11.1 13.3 12.8 12.8 12 13.6 13.9 ...
##  $ Clarity : num [1:38] 1 1 1 1 1 1 1 1 1 1 ...
##  $ Aroma   : num [1:38] 3.3 4.4 3.9 3.9 5.6 4.6 4.8 5.3 4.3 4.3 ...
##  $ Body    : num [1:38] 2.8 4.9 5.3 2.6 5.1 4.7 4.8 4.5 4.3 3.9 ...
##  $ Flavor  : num [1:38] 3.1 3.5 4.8 3.1 5.5 5 4.8 4.3 3.9 4.7 ...
##  $ Oakiness: num [1:38] 4.1 3.9 4.7 3.6 5.1 4.1 3.3 5.2 2.9 3.9 ...
##  $ Region  : Factor w/ 3 levels "1","2","3": 1 1 1 1 1 1 1 1 3 1 ...

Fit Multiple Linear Regression Model

#  Fitting a multiple linear regression model using all predictors
model <- lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data = wine)

# Summary of the model
summary(model)

## 
## Call:
## lm(formula = Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + 
##     Region, data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.80824 -0.58413 -0.02081  0.48627  1.70909 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.81437    1.96944   3.968 0.000417 ***
## Clarity      0.01705    1.45627   0.012 0.990736    
## Aroma        0.08901    0.25250   0.353 0.726908    
## Body         0.07967    0.26772   0.298 0.768062    
## Flavor       1.11723    0.24026   4.650 6.25e-05 ***
## Oakiness    -0.34644    0.23301  -1.487 0.147503    
## Region2     -1.51285    0.39227  -3.857 0.000565 ***
## Region3      0.97259    0.51017   1.906 0.066218 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.9154 on 30 degrees of freedom
## Multiple R-squared:  0.8376, Adjusted R-squared:  0.7997 
## F-statistic:  22.1 on 7 and 30 DF,  p-value: 3.295e-10

Full F-test

# Full F-test
summary(aov(model))

##             Df Sum Sq Mean Sq F value   Pr(>F)    
## Clarity      1   0.13    0.13   0.149 0.701824    
## Aroma        1  77.35   77.35  92.306 1.15e-10 ***
## Body         1   6.41    6.41   7.654 0.009603 ** 
## Flavor       1  19.05   19.05  22.732 4.48e-05 ***
## Oakiness     1   8.60    8.60  10.260 0.003213 ** 
## Region       2  18.11    9.05  10.804 0.000292 ***
## Residuals   30  25.14    0.84                     
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Partial F-Test

#Partial F-Test
#First Run full model
 full_model=lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data=wine)
# Run a model with the excluded variables
exc_model=lm(Quality ~  Flavor + Region, data=wine)
#Run ANOVA of the two
anova(exc_model, full_model)

## Analysis of Variance Table
## 
## Model 1: Quality ~ Flavor + Region
## Model 2: Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region
##   Res.Df    RSS Df Sum of Sq      F Pr(>F)
## 1     34 27.213                           
## 2     30 25.140  4     2.073 0.6184 0.6528

# Check multicollinearity using VIF
vif(model)

##              GVIF Df GVIF^(1/(2*Df))
## Clarity  1.439151  1        1.199646
## Aroma    3.298392  1        1.816147
## Body     2.149329  1        1.466059
## Flavor   2.693412  1        1.641162
## Oakiness 1.305766  1        1.142701
## Region   2.676874  2        1.279107

# VIF measures how much the variance of a regression coefficient is
# inflated due to multicollinearity.

# VIF values > 5 indicate problematic multicollinearity
# VIF values > 10 suggest severe multicollinearity
# All VIF values are below 5, which means:
# There's some correlation between predictors
# But not enough to cause serious multicollinearity issues
# The model's coefficient estimates should be relatively stable

model2 = lm(Quality ~ Flavor+Region, data = wine)
summary(model2)

## 
## Call:
## lm(formula = Quality ~ Flavor + Region, data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.97630 -0.58844  0.02184  0.51572  1.94232 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   7.0943     0.7912   8.967 1.76e-10 ***
## Flavor        1.1155     0.1738   6.417 2.49e-07 ***
## Region2      -1.5335     0.3688  -4.158 0.000205 ***
## Region3       1.2234     0.4003   3.056 0.004346 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8946 on 34 degrees of freedom
## Multiple R-squared:  0.8242, Adjusted R-squared:  0.8087 
## F-statistic: 53.13 on 3 and 34 DF,  p-value: 6.358e-13

# Interaction of the reduced model
interact = lm(Quality ~ Flavor*Region, data=wine)
summary(interact)

## 
## Call:
## lm(formula = Quality ~ Flavor * Region, data = wine)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.94964 -0.58463  0.04393  0.49607  1.97295 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      6.7311     1.1292   5.961 1.22e-06 ***
## Flavor           1.1985     0.2532   4.733 4.31e-05 ***
## Region2         -2.8942     2.1183  -1.366    0.181    
## Region3          3.3833     2.0153   1.679    0.103    
## Flavor:Region2   0.3108     0.4766   0.652    0.519    
## Flavor:Region3  -0.4029     0.3878  -1.039    0.307    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.8914 on 32 degrees of freedom
## Multiple R-squared:  0.8357, Adjusted R-squared:  0.8101 
## F-statistic: 32.56 on 5 and 32 DF,  p-value: 1.179e-11

Model Diagnostics

# Generating diagnostic plots to investigate if model assumptions are being met 

par(mfrow = c(2, 2))# Set up a 2x2 grid of plots
plot(model, which = 1:4) # Generate diagnostic plots for the  regression model

# Calculate Cook's Distance
cooks_dist <- cooks.distance(model)

# Identify influential points
influential_points <- which(cooks_dist > 0.05)

# Plot Cook's Distance for only influential points
plot(influential_points, cooks_dist[influential_points], type = "h", 
     main = "Influential Observations Based on Cook's Distance",
     xlab = "Observation Index", ylab = "Cook's Distance", col = "blue", lwd = 1.5)

# Add threshold reference line
abline(h = 0.05, col = "red", lty = 2)

# Add point labels for clarity
text(influential_points, cooks_dist[influential_points], 
     labels = influential_points, pos = 3, cex = 0.8)

# Add legend
legend("topright", legend = "Threshold: Cook's Distance = 0.05",
       col = "red", lty = 2, bty = "n")

# Fit the initial model
model <- lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data=wine)

# Calculate Cook's Distance
cooks_dist <- cooks.distance(model)

# Identify influential points (Cook's Distance > 0.05)
influential_points <- which(cooks_dist > 0.05)

# Remove influential observations
wine_cleaned <- wine[-influential_points, ]

# Fit the model without influential observations
model_cleaned <- lm(Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + Region, data = wine_cleaned)

# Summary of the cleaned model
summary(model_cleaned)

## 
## Call:
## lm(formula = Quality ~ Clarity + Aroma + Body + Flavor + Oakiness + 
##     Region, data = wine_cleaned)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.03020 -0.37083 -0.03834  0.40386  1.44420 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  6.33674    1.73723   3.648 0.001217 ** 
## Clarity      1.36297    1.34614   1.013 0.320996    
## Aroma        0.15472    0.19694   0.786 0.439488    
## Body         0.03905    0.23013   0.170 0.866639    
## Flavor       1.06426    0.20174   5.275 1.83e-05 ***
## Oakiness    -0.27269    0.18700  -1.458 0.157229    
## Region2     -1.48321    0.32329  -4.588 0.000108 ***
## Region3      1.16825    0.39330   2.970 0.006483 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.6718 on 25 degrees of freedom
## Multiple R-squared:  0.9069, Adjusted R-squared:  0.8809 
## F-statistic:  34.8 on 7 and 25 DF,  p-value: 2.371e-11

Automatic Model Selections

# forward selection
model.forward<-ols_step_forward_p(model, penter = 0.10, details = F) # penter: threshold p-value for enter
model.forward

## 
## 
##                              Stepwise Summary                             
## ------------------------------------------------------------------------
## Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
## ------------------------------------------------------------------------
##  0      Base Model    165.209    168.484    54.520    0.00000    0.00000 
##  1      Flavor        130.021    134.934    19.962    0.62417    0.61373 
##  2      Region        105.152    113.340    -3.725    0.82419    0.80868 
##  3      Oakiness      104.444    114.270    -3.474    0.83628    0.81644 
## ------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                          
## ---------------------------------------------------------------
## R                       0.914       RMSE                 0.817 
## R-Squared               0.836       MSE                  0.667 
## Adj. R-Squared          0.816       Coef. Var            7.046 
## Pred R-Squared          0.786       AIC                104.444 
## MAE                     0.654       SBC                114.270 
## ---------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares        DF    Mean Square      F         Sig. 
## -------------------------------------------------------------------
## Regression    129.447         4         32.362    42.141    0.0000 
## Residual       25.342        33          0.768                     
## Total         154.788        37                                    
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)     8.121         1.016                  7.990    0.000     6.053    10.189 
##      Flavor     1.192         0.177        0.599     6.727    0.000     0.832     1.553 
##     Region2    -1.515         0.361       -0.319    -4.193    0.000    -2.251    -0.780 
##     Region3     1.094         0.401        0.252     2.728    0.010     0.278     1.909 
##    Oakiness    -0.318         0.204       -0.115    -1.561    0.128    -0.733     0.097 
## ----------------------------------------------------------------------------------------

# backward selection
model.backward<-ols_step_backward_p(model, prem = 0.10, details = F) # prem: threshold p-value for removal
model.backward

## 
## 
##                              Stepwise Summary                             
## ------------------------------------------------------------------------
## Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
## ------------------------------------------------------------------------
##  0      Full Model    110.141    124.879     3.893    0.83758    0.79969 
##  1      Clarity       108.141    121.242     1.359    0.83758    0.80615 
##  2      Body          106.271    117.734    -1.082    0.83702    0.81156 
##  3      Aroma         104.444    114.270    -3.474    0.83628    0.81644 
## ------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                          
## ---------------------------------------------------------------
## R                       0.914       RMSE                 0.817 
## R-Squared               0.836       MSE                  0.667 
## Adj. R-Squared          0.816       Coef. Var            7.046 
## Pred R-Squared          0.786       AIC                104.444 
## MAE                     0.654       SBC                114.270 
## ---------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares        DF    Mean Square      F         Sig. 
## -------------------------------------------------------------------
## Regression    129.447         4         32.362    42.141    0.0000 
## Residual       25.342        33          0.768                     
## Total         154.788        37                                    
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)     8.121         1.016                  7.990    0.000     6.053    10.189 
##      Flavor     1.192         0.177        0.599     6.727    0.000     0.832     1.553 
##    Oakiness    -0.318         0.204       -0.115    -1.561    0.128    -0.733     0.097 
##     Region2    -1.515         0.361       -0.319    -4.193    0.000    -2.251    -0.780 
##     Region3     1.094         0.401        0.252     2.728    0.010     0.278     1.909 
## ----------------------------------------------------------------------------------------

# stepwise selection
model.stepwise<-ols_step_both_p(model, pent = 0.10, prem = 0.10, details = FALSE)
model.stepwise

## 
## 
##                              Stepwise Summary                             
## ------------------------------------------------------------------------
## Step    Variable        AIC        SBC       SBIC       R2       Adj. R2 
## ------------------------------------------------------------------------
##  0      Base Model    165.209    168.484    54.520    0.00000    0.00000 
##  1      Flavor (+)    130.021    134.934    19.962    0.62417    0.61373 
##  2      Region (+)    105.152    113.340    -3.725    0.82419    0.80868 
## ------------------------------------------------------------------------
## 
## Final Model Output 
## ------------------
## 
##                          Model Summary                          
## ---------------------------------------------------------------
## R                       0.908       RMSE                 0.846 
## R-Squared               0.824       MSE                  0.716 
## Adj. R-Squared          0.809       Coef. Var            7.193 
## Pred R-Squared          0.780       AIC                105.152 
## MAE                     0.676       SBC                113.340 
## ---------------------------------------------------------------
##  RMSE: Root Mean Square Error 
##  MSE: Mean Square Error 
##  MAE: Mean Absolute Error 
##  AIC: Akaike Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
## 
##                                ANOVA                                
## -------------------------------------------------------------------
##                Sum of                                              
##               Squares        DF    Mean Square      F         Sig. 
## -------------------------------------------------------------------
## Regression    127.575         3         42.525    53.131    0.0000 
## Residual       27.213        34          0.800                     
## Total         154.788        37                                    
## -------------------------------------------------------------------
## 
##                                   Parameter Estimates                                    
## ----------------------------------------------------------------------------------------
##       model      Beta    Std. Error    Std. Beta      t        Sig      lower     upper 
## ----------------------------------------------------------------------------------------
## (Intercept)     7.094         0.791                  8.967    0.000     5.486     8.702 
##      Flavor     1.116         0.174        0.561     6.417    0.000     0.762     1.469 
##     Region2    -1.533         0.369       -0.323    -4.158    0.000    -2.283    -0.784 
##     Region3     1.223         0.400        0.282     3.056    0.004     0.410     2.037 
## ----------------------------------------------------------------------------------------

Best subset selection (AIC, SBC, adjusted R-squre, C(p) etc.)

# 2. best subset selection (AIC, SBC, adjusted R-square, C(p) etc.)
model.best.subset<-ols_step_best_subset(model) # takes some of time 5-10 min. Be patient :)
model.best.subset

##                 Best Subsets Regression                 
## --------------------------------------------------------
## Model Index    Predictors
## --------------------------------------------------------
##      1         Flavor                                    
##      2         Flavor Region                             
##      3         Flavor Oakiness Region                    
##      4         Aroma Flavor Oakiness Region              
##      5         Aroma Body Flavor Oakiness Region         
##      6         Clarity Aroma Body Flavor Oakiness Region 
## --------------------------------------------------------
## 
##                                                    Subsets Regression Summary                                                   
## --------------------------------------------------------------------------------------------------------------------------------
##                        Adj.        Pred                                                                                          
## Model    R-Square    R-Square    R-Square     C(p)        AIC        SBIC        SBC        MSEP       FPE       HSP       APC  
## --------------------------------------------------------------------------------------------------------------------------------
##   1        0.6242      0.6137      0.5868    35.4190    130.0214    19.9619    134.9341    61.4102    1.7010    0.0462    0.4176 
##   2        0.8242      0.8087      0.7804     2.4737    105.1516    -3.7246    113.3395    29.5721    0.8636    0.0235    0.2059 
##   3        0.8363      0.8164      0.7863     2.2407    104.4444    -3.4739    114.2699    28.3731    0.8488    0.0233    0.2022 
##   4        0.8370      0.8116      0.7771     4.1033    106.2714    -1.0824    117.7345    29.1269    0.8921    0.0246    0.2124 
##   5        0.8376      0.8061      0.7583     6.0001    108.1409     1.3593    121.2416    29.9634    0.9390    0.0262    0.2233 
##   6        0.8376      0.7997      0.7213     8.0000    110.1408     3.8925    124.8790    30.9621    0.9924    0.0279    0.2358 
## --------------------------------------------------------------------------------------------------------------------------------
## AIC: Akaike Information Criteria 
##  SBIC: Sawa's Bayesian Information Criteria 
##  SBC: Schwarz Bayesian Criteria 
##  MSEP: Estimated error of prediction, assuming multivariate normality 
##  FPE: Final Prediction Error 
##  HSP: Hocking's Sp 
##  APC: Amemiya Prediction Criteria

Wine Quality Statistical Analysis

Doris Mbitazi Asongafac

2025-03-27