############################################################
# Assignment 6: Conduct Hypothesis
# Course: DDS-8521 Statistical Modeling
# Dataset: UCI Wine Quality (Vinho Verde)
############################################################

# Load required libraries
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.1     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(boot)

set.seed(123)

# Read datasets
red <- read.csv("winequality-red.csv", sep = ";")
white <- read.csv("winequality-white.csv", sep = ";")

# Add wine type
red$type <- "Red"
white$type <- "White"

# Combine datasets
wine <- bind_rows(red, white)

# Initial exploration
head(wine)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
## 4          11.2             0.28        0.56            1.9     0.075
## 5           7.4             0.70        0.00            1.9     0.076
## 6           7.4             0.66        0.00            1.8     0.075
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
## 4                  17                   60  0.9980 3.16      0.58     9.8
## 5                  11                   34  0.9978 3.51      0.56     9.4
## 6                  13                   40  0.9978 3.51      0.56     9.4
##   quality type
## 1       5  Red
## 2       5  Red
## 3       5  Red
## 4       6  Red
## 5       5  Red
## 6       5  Red
summary(wine)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :0.3397   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9949  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:0.9970  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :10.30   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :10.49   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :9.000  
##      type          
##  Length:6497       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
# Create quality group
wine <- wine %>%
  mutate(quality_group = ifelse(quality >= 7, "High", "Low"))

# Exploratory Data Analysis
ggplot(wine, aes(x = quality)) +
  geom_bar() +
  facet_wrap(~type) +
  labs(title = "Distribution of Wine Quality Ratings")

ggplot(wine, aes(x = alcohol, fill = quality_group)) +
  geom_histogram(bins = 30, alpha = 0.6, position = "identity") +
  labs(title = "Alcohol Content by Wine Quality Group")

# Hypothesis:
# H0: Mean alcohol_high - alcohol_low = 0
# H1: Mean alcohol_high - alcohol_low > 0

# Bootstrap function
boot_diff_mean <- function(data, indices) {
  d <- data[indices, ]
  mean(d$alcohol[d$quality_group == "High"]) -
    mean(d$alcohol[d$quality_group == "Low"])
}

# Perform bootstrap
boot_results <- boot(
  data = wine,
  statistic = boot_diff_mean,
  R = 2000
)

# View bootstrap results
boot_results
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = wine, statistic = boot_diff_mean, R = 2000)
## 
## 
## Bootstrap Statistics :
##     original        bias    std. error
## t1* 1.171898 -9.521963e-05  0.03706913
# Bootstrap confidence interval
boot_ci <- boot.ci(boot_results, type = c("perc", "bca"))
boot_ci
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 2000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_results, type = c("perc", "bca"))
## 
## Intervals : 
## Level     Percentile            BCa          
## 95%   ( 1.096,  1.246 )   ( 1.097,  1.248 )  
## Calculations and Intervals on Original Scale
# Visualization of bootstrap distribution
boot_df <- data.frame(diff_means = boot_results$t)

ggplot(boot_df, aes(x = diff_means)) +
  geom_histogram(bins = 40, fill = "steelblue", alpha = 0.7) +
  geom_vline(xintercept = 0, linetype = "dashed") +
  labs(title = "Bootstrap Distribution of Mean Alcohol Difference",
       x = "Mean Difference (High - Low Quality)")