Hypothesis Testing using Bootstrap, Resampling, and Confidence Intervals


Step 1: Acquire and Load the Data


##########################################################
# Step 1. Load the data
##########################################################
# set current directory
setwd("/Users/whinton/src/rstudio/tim8521")

# Read the file as a csv file (separator is a semicolon)
red_wine <- read.csv("winequality-red.csv", header = TRUE, sep= ";",stringsAsFactors = TRUE)
white_wine <- read.csv("winequality-white.csv", header = TRUE, sep= ";",stringsAsFactors = TRUE)

# Add wine type labels
red_wine$wine_type <- "Red"
white_wine$wine_type <- "White"

# Combine both datasets
wine <- rbind(red_wine, white_wine)
wine_data <- wine

Step 2: Exploration of Data, Correlation Analysis, Hypothesis Formation


###########################################################
# Exploration of the Data and Correlation Analysis  
###########################################################

# Examine the combined dataframe
head(wine_data, 3)
##   fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1           7.4             0.70        0.00            1.9     0.076
## 2           7.8             0.88        0.00            2.6     0.098
## 3           7.8             0.76        0.04            2.3     0.092
##   free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 1                  11                   34  0.9978 3.51      0.56     9.4
## 2                  25                   67  0.9968 3.20      0.68     9.8
## 3                  15                   54  0.9970 3.26      0.65     9.8
##   quality wine_type
## 1       5       Red
## 2       5       Red
## 3       5       Red
tail(wine_data, 3)
##      fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 6495           6.5             0.24        0.19            1.2     0.041
## 6496           5.5             0.29        0.30            1.1     0.022
## 6497           6.0             0.21        0.38            0.8     0.020
##      free.sulfur.dioxide total.sulfur.dioxide density   pH sulphates alcohol
## 6495                  30                  111 0.99254 2.99      0.46     9.4
## 6496                  20                  110 0.98869 3.34      0.38    12.8
## 6497                  22                   98 0.98941 3.26      0.32    11.8
##      quality wine_type
## 6495       6     White
## 6496       7     White
## 6497       6     White
summary(wine_data)
##  fixed.acidity    volatile.acidity  citric.acid     residual.sugar  
##  Min.   : 3.800   Min.   :0.0800   Min.   :0.0000   Min.   : 0.600  
##  1st Qu.: 6.400   1st Qu.:0.2300   1st Qu.:0.2500   1st Qu.: 1.800  
##  Median : 7.000   Median :0.2900   Median :0.3100   Median : 3.000  
##  Mean   : 7.215   Mean   :0.3397   Mean   :0.3186   Mean   : 5.443  
##  3rd Qu.: 7.700   3rd Qu.:0.4000   3rd Qu.:0.3900   3rd Qu.: 8.100  
##  Max.   :15.900   Max.   :1.5800   Max.   :1.6600   Max.   :65.800  
##    chlorides       free.sulfur.dioxide total.sulfur.dioxide    density      
##  Min.   :0.00900   Min.   :  1.00      Min.   :  6.0        Min.   :0.9871  
##  1st Qu.:0.03800   1st Qu.: 17.00      1st Qu.: 77.0        1st Qu.:0.9923  
##  Median :0.04700   Median : 29.00      Median :118.0        Median :0.9949  
##  Mean   :0.05603   Mean   : 30.53      Mean   :115.7        Mean   :0.9947  
##  3rd Qu.:0.06500   3rd Qu.: 41.00      3rd Qu.:156.0        3rd Qu.:0.9970  
##  Max.   :0.61100   Max.   :289.00      Max.   :440.0        Max.   :1.0390  
##        pH          sulphates         alcohol         quality     
##  Min.   :2.720   Min.   :0.2200   Min.   : 8.00   Min.   :3.000  
##  1st Qu.:3.110   1st Qu.:0.4300   1st Qu.: 9.50   1st Qu.:5.000  
##  Median :3.210   Median :0.5100   Median :10.30   Median :6.000  
##  Mean   :3.219   Mean   :0.5313   Mean   :10.49   Mean   :5.818  
##  3rd Qu.:3.320   3rd Qu.:0.6000   3rd Qu.:11.30   3rd Qu.:6.000  
##  Max.   :4.010   Max.   :2.0000   Max.   :14.90   Max.   :9.000  
##   wine_type        
##  Length:6497       
##  Class :character  
##  Mode  :character  
##                    
##                    
## 
str(wine_data)
## 'data.frame':    6497 obs. of  13 variables:
##  $ fixed.acidity       : num  7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
##  $ volatile.acidity    : num  0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
##  $ citric.acid         : num  0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
##  $ residual.sugar      : num  1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
##  $ chlorides           : num  0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
##  $ free.sulfur.dioxide : num  11 25 15 17 11 13 15 15 9 17 ...
##  $ total.sulfur.dioxide: num  34 67 54 60 34 40 59 21 18 102 ...
##  $ density             : num  0.998 0.997 0.997 0.998 0.998 ...
##  $ pH                  : num  3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
##  $ sulphates           : num  0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
##  $ alcohol             : num  9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
##  $ quality             : int  5 5 5 6 5 5 5 7 7 5 ...
##  $ wine_type           : chr  "Red" "Red" "Red" "Red" ...
#dplyr::glimpse(wine_data)

# Check for missing values
colSums(is.na(wine_data))
##        fixed.acidity     volatile.acidity          citric.acid 
##                    0                    0                    0 
##       residual.sugar            chlorides  free.sulfur.dioxide 
##                    0                    0                    0 
## total.sulfur.dioxide              density                   pH 
##                    0                    0                    0 
##            sulphates              alcohol              quality 
##                    0                    0                    0 
##            wine_type 
##                    0
missmap(wine_data)

# check quality histogram
histogram(red_wine$quality, plot = TRUE)

histogram(white_wine$quality,  plot = TRUE)

histogram(wine_data$quality)

histogram(wine_data$alcohol)



##########################################################
# Explore Relationships and Correlation Analysis
##########################################################
# Convert quality to a factor for categorical visualization
# wine_data$quality <- as.factor(wine_data$quality)

# List of top physicochemical properties
physicochemical_vars <- c("alcohol","volatile.acidity","sulphates","citric.acid","density","quality")
if (FALSE) {
physicochemical_vars <- c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar", 
                          "chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density", 
                          "pH", "sulphates", "alcohol", "quality")
}

# Compute correlation matrix
correlation_matrix <- cor(wine_data[, physicochemical_vars], use = "complete.obs")

# Display the correlation heatmap
corrplot(correlation_matrix, method = "color", type = "upper", 
         tl.col = "black", tl.srt = 45, title = "Correlation Heatmap of Key Physicochemical Properties",
         mar = c(0, 0, 2, 0))

##########################################################
#. Hypothesis based on Data and Correlation Analysis
#  (alcohol and quality are highly and positively correlated)
#
# Hypothesis: The average alcohol content of high-quality 
# wine (rating ≥ 7) is significantly higher than that of 
# lower-quality wine (rating < 7).
#   Null Hypothesis(H0):
#   The mean alcohol content of high-quality and 
#   low-quality wines is the same.
# Alternative Hypothesis(H1):
#   The mean alcohol content of high-quality wines is 
#   significantly higher than that of lower-quality wines. 
##########################################################
var = "alcohol"
print(ggplot(wine_data, aes_string(x = var, y = "quality", color = "wine_type")) +
        geom_jitter(alpha = 0.3, width = 0.2) +  # Adds jitter for better visualization
        geom_smooth(method = "lm", se = TRUE, linetype = "dashed", color = "black") +
        labs(title = paste("Scatter Plot of", var, "vs. Quality"),
             x = var, y = "Quality") +
        theme_minimal() +
        theme(legend.position = "bottom"))
## `geom_smooth()` using formula = 'y ~ x'

# Create a correlation plot matrix (pairwise scatter plots)
ggpairs(wine_data, columns = which(names(wine_data) %in% physicochemical_vars), 
      aes(color = wine_type), title = "Pairwise Scatter Plots of Physicochemical Properties")


Step 4: Perform Bootstrapping for Hypothesis Testing


##########################################################
# Perform Non-Parmetrich Bootstrapping 
# (1000 resamples with replacement) 
# to estimate the difference in mean alcohol content 
# between high-quality and low-quality wines
#
# Calculate the 95% confidence interval (CI) for the 
# difference in mean alcohol content between high-quality 
# and low-quality wines.
##########################################################


# Define groups based on quality
high_quality <- wine_data %>% filter(quality >= 7) %>% select(alcohol)
low_quality <- wine_data %>% filter(quality < 7) %>% select(alcohol)

# Define function for bootstrap resampling
bootstrap_diff <- function(data, indices) {
  sample_data <- data[indices, ]
  mean_high <- mean(sample_data$alcohol[sample_data$quality >= 7])
  mean_low <- mean(sample_data$alcohol[sample_data$quality < 7])
  return(mean_high - mean_low)
}

# Prepare data for bootstrapping
wine_data$quality_group <- ifelse(wine_data$quality >= 7, "High Quality", "Low Quality")

# Perform bootstrap resampling (1000 iterations)
set.seed(123)  # For reproducibility
boot_results <- boot(data = wine_data, statistic = bootstrap_diff, R = 1000)
boot_results
## 
## ORDINARY NONPARAMETRIC BOOTSTRAP
## 
## 
## Call:
## boot(data = wine_data, statistic = bootstrap_diff, R = 1000)
## 
## 
## Bootstrap Statistics :
##     original       bias    std. error
## t1* 1.171898 0.0004479133  0.03825358

Step 5: Compute Confidence Intervals


##########################################################
#  Compute Confidence Intervals
##########################################################

# Compute 95% Confidence Interval using the percentile method
boot_ci <- boot.ci(boot_results, type = "perc")
boot_ci
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
## 
## CALL : 
## boot.ci(boot.out = boot_results, type = "perc")
## 
## Intervals : 
## Level     Percentile     
## 95%   ( 1.095,  1.246 )  
## Calculations and Intervals on Original Scale
boot_df <- data.frame(Diff_Mean_Alcohol = boot_results$t)

# Print results

# Histogram of bootstrap distribution
ggplot(boot_df, aes(x = Diff_Mean_Alcohol)) +
  geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
  geom_density(color = "red", size = 1) +
  geom_vline(xintercept = mean(boot_results$t), linetype = "dashed", color = "black", size = 1) +
  labs(title = "Bootstrap Distribution of Alcohol Content Difference",
       x = "Difference in Mean Alcohol Content",
       y = "Density") +
  theme_minimal()

# Boxplot comparison of alcohol content between wine quality groups
ggplot(wine_data, aes(x = quality_group, y = alcohol, fill = quality_group)) +
  geom_boxplot(alpha = 0.7) +
  labs(title = "Alcohol Content by Wine Quality Group",
       x = "Wine Quality Group",
       y = "Alcohol Content") +
  theme_minimal() +
  scale_fill_manual(values = c("red", "blue"))


Step 6: Compute p-value


##########################################################
# Calculate p-value (proportion of bootstrap resamples 
# where the difference is <= 0)
##########################################################
p_value <- mean(boot_results$t <= 0)

# Print results
print(paste("P-Value:", p_value))
## [1] "P-Value: 0"
# Interpretation
if (p_value < 0.05) {
  print("The p-value is statistically significant. We reject the null hypothesis (H₀) and conclude that alcohol content significantly impacts wine quality.")
} else {
  print("The p-value is NOT statistically significant. We fail to reject the null hypothesis (H₀), meaning no strong evidence supports an effect of alcohol content on wine quality.")
}
## [1] "The p-value is statistically significant. We reject the null hypothesis (H₀) and conclude that alcohol content significantly impacts wine quality."

This study performed by Will Hinton