############################################################
# Assignment 6: Conduct Hypothesis
# Course: DDS-8521 Statistical Modeling
# Dataset: UCI Wine Quality (Vinho Verde)
############################################################
# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.1 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(boot)
set.seed(123)
# Read datasets
red <- read.csv("winequality-red.csv", sep = ";")
white <- read.csv("winequality-white.csv", sep = ";")
# Add wine type
red$type <- "Red"
white$type <- "White"
# Combine datasets
wine <- bind_rows(red, white)
# Initial exploration
head(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## 4 11.2 0.28 0.56 1.9 0.075
## 5 7.4 0.70 0.00 1.9 0.076
## 6 7.4 0.66 0.00 1.8 0.075
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## 4 17 60 0.9980 3.16 0.58 9.8
## 5 11 34 0.9978 3.51 0.56 9.4
## 6 13 40 0.9978 3.51 0.56 9.4
## quality type
## 1 5 Red
## 2 5 Red
## 3 5 Red
## 4 6 Red
## 5 5 Red
## 6 5 Red
summary(wine)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9970
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :9.000
## type
## Length:6497
## Class :character
## Mode :character
##
##
##
# Create quality group
wine <- wine %>%
mutate(quality_group = ifelse(quality >= 7, "High", "Low"))
# Exploratory Data Analysis
ggplot(wine, aes(x = quality)) +
geom_bar() +
facet_wrap(~type) +
labs(title = "Distribution of Wine Quality Ratings")

ggplot(wine, aes(x = alcohol, fill = quality_group)) +
geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
labs(title = "Alcohol Content by Wine Quality Group")

# Hypothesis:
# H0: Mean alcohol_high - alcohol_low = 0
# H1: Mean alcohol_high - alcohol_low > 0
# Bootstrap function
boot_diff_mean <- function(data, indices) {
d <- data[indices, ]
mean(d$alcohol[d$quality_group == "High"]) -
mean(d$alcohol[d$quality_group == "Low"])
}
# Perform bootstrap
boot_results <- boot(
data = wine,
statistic = boot_diff_mean,
R = 2000
)
# View bootstrap results
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = wine, statistic = boot_diff_mean, R = 2000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 1.171898 -9.521963e-05 0.03706913
# Bootstrap confidence interval
boot_ci <- boot.ci(boot_results, type = c("perc", "bca"))
boot_ci
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_results, type = c("perc", "bca"))
##
## Intervals :
## Level Percentile BCa
## 95% ( 1.096, 1.246 ) ( 1.097, 1.248 )
## Calculations and Intervals on Original Scale
# Visualization of bootstrap distribution
boot_df <- data.frame(diff_means = boot_results$t)
ggplot(boot_df, aes(x = diff_means)) +
geom_histogram(bins = 40, fill = "steelblue", alpha = 0.7) +
geom_vline(xintercept = 0, linetype = "dashed") +
labs(title = "Bootstrap Distribution of Mean Alcohol Difference",
x = "Mean Difference (High - Low Quality)")
