##########################################################
# Step 1. Load the data
##########################################################
# set current directory
setwd("/Users/whinton/src/rstudio/tim8521")
# Read the file as a csv file (separator is a semicolon)
red_wine <- read.csv("winequality-red.csv", header = TRUE, sep= ";",stringsAsFactors = TRUE)
white_wine <- read.csv("winequality-white.csv", header = TRUE, sep= ";",stringsAsFactors = TRUE)
# Add wine type labels
red_wine$wine_type <- "Red"
white_wine$wine_type <- "White"
# Combine both datasets
wine <- rbind(red_wine, white_wine)
wine_data <- wine
###########################################################
# Exploration of the Data and Correlation Analysis
###########################################################
# Examine the combined dataframe
head(wine_data, 3)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 1 7.4 0.70 0.00 1.9 0.076
## 2 7.8 0.88 0.00 2.6 0.098
## 3 7.8 0.76 0.04 2.3 0.092
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 1 11 34 0.9978 3.51 0.56 9.4
## 2 25 67 0.9968 3.20 0.68 9.8
## 3 15 54 0.9970 3.26 0.65 9.8
## quality wine_type
## 1 5 Red
## 2 5 Red
## 3 5 Red
tail(wine_data, 3)
## fixed.acidity volatile.acidity citric.acid residual.sugar chlorides
## 6495 6.5 0.24 0.19 1.2 0.041
## 6496 5.5 0.29 0.30 1.1 0.022
## 6497 6.0 0.21 0.38 0.8 0.020
## free.sulfur.dioxide total.sulfur.dioxide density pH sulphates alcohol
## 6495 30 111 0.99254 2.99 0.46 9.4
## 6496 20 110 0.98869 3.34 0.38 12.8
## 6497 22 98 0.98941 3.26 0.32 11.8
## quality wine_type
## 6495 6 White
## 6496 7 White
## 6497 6 White
summary(wine_data)
## fixed.acidity volatile.acidity citric.acid residual.sugar
## Min. : 3.800 Min. :0.0800 Min. :0.0000 Min. : 0.600
## 1st Qu.: 6.400 1st Qu.:0.2300 1st Qu.:0.2500 1st Qu.: 1.800
## Median : 7.000 Median :0.2900 Median :0.3100 Median : 3.000
## Mean : 7.215 Mean :0.3397 Mean :0.3186 Mean : 5.443
## 3rd Qu.: 7.700 3rd Qu.:0.4000 3rd Qu.:0.3900 3rd Qu.: 8.100
## Max. :15.900 Max. :1.5800 Max. :1.6600 Max. :65.800
## chlorides free.sulfur.dioxide total.sulfur.dioxide density
## Min. :0.00900 Min. : 1.00 Min. : 6.0 Min. :0.9871
## 1st Qu.:0.03800 1st Qu.: 17.00 1st Qu.: 77.0 1st Qu.:0.9923
## Median :0.04700 Median : 29.00 Median :118.0 Median :0.9949
## Mean :0.05603 Mean : 30.53 Mean :115.7 Mean :0.9947
## 3rd Qu.:0.06500 3rd Qu.: 41.00 3rd Qu.:156.0 3rd Qu.:0.9970
## Max. :0.61100 Max. :289.00 Max. :440.0 Max. :1.0390
## pH sulphates alcohol quality
## Min. :2.720 Min. :0.2200 Min. : 8.00 Min. :3.000
## 1st Qu.:3.110 1st Qu.:0.4300 1st Qu.: 9.50 1st Qu.:5.000
## Median :3.210 Median :0.5100 Median :10.30 Median :6.000
## Mean :3.219 Mean :0.5313 Mean :10.49 Mean :5.818
## 3rd Qu.:3.320 3rd Qu.:0.6000 3rd Qu.:11.30 3rd Qu.:6.000
## Max. :4.010 Max. :2.0000 Max. :14.90 Max. :9.000
## wine_type
## Length:6497
## Class :character
## Mode :character
##
##
##
str(wine_data)
## 'data.frame': 6497 obs. of 13 variables:
## $ fixed.acidity : num 7.4 7.8 7.8 11.2 7.4 7.4 7.9 7.3 7.8 7.5 ...
## $ volatile.acidity : num 0.7 0.88 0.76 0.28 0.7 0.66 0.6 0.65 0.58 0.5 ...
## $ citric.acid : num 0 0 0.04 0.56 0 0 0.06 0 0.02 0.36 ...
## $ residual.sugar : num 1.9 2.6 2.3 1.9 1.9 1.8 1.6 1.2 2 6.1 ...
## $ chlorides : num 0.076 0.098 0.092 0.075 0.076 0.075 0.069 0.065 0.073 0.071 ...
## $ free.sulfur.dioxide : num 11 25 15 17 11 13 15 15 9 17 ...
## $ total.sulfur.dioxide: num 34 67 54 60 34 40 59 21 18 102 ...
## $ density : num 0.998 0.997 0.997 0.998 0.998 ...
## $ pH : num 3.51 3.2 3.26 3.16 3.51 3.51 3.3 3.39 3.36 3.35 ...
## $ sulphates : num 0.56 0.68 0.65 0.58 0.56 0.56 0.46 0.47 0.57 0.8 ...
## $ alcohol : num 9.4 9.8 9.8 9.8 9.4 9.4 9.4 10 9.5 10.5 ...
## $ quality : int 5 5 5 6 5 5 5 7 7 5 ...
## $ wine_type : chr "Red" "Red" "Red" "Red" ...
#dplyr::glimpse(wine_data)
# Check for missing values
colSums(is.na(wine_data))
## fixed.acidity volatile.acidity citric.acid
## 0 0 0
## residual.sugar chlorides free.sulfur.dioxide
## 0 0 0
## total.sulfur.dioxide density pH
## 0 0 0
## sulphates alcohol quality
## 0 0 0
## wine_type
## 0
missmap(wine_data)
# check quality histogram
histogram(red_wine$quality, plot = TRUE)
histogram(white_wine$quality, plot = TRUE)
histogram(wine_data$quality)
histogram(wine_data$alcohol)
##########################################################
# Explore Relationships and Correlation Analysis
##########################################################
# Convert quality to a factor for categorical visualization
# wine_data$quality <- as.factor(wine_data$quality)
# List of top physicochemical properties
physicochemical_vars <- c("alcohol","volatile.acidity","sulphates","citric.acid","density","quality")
if (FALSE) {
physicochemical_vars <- c("fixed.acidity", "volatile.acidity", "citric.acid", "residual.sugar",
"chlorides", "free.sulfur.dioxide", "total.sulfur.dioxide", "density",
"pH", "sulphates", "alcohol", "quality")
}
# Compute correlation matrix
correlation_matrix <- cor(wine_data[, physicochemical_vars], use = "complete.obs")
# Display the correlation heatmap
corrplot(correlation_matrix, method = "color", type = "upper",
tl.col = "black", tl.srt = 45, title = "Correlation Heatmap of Key Physicochemical Properties",
mar = c(0, 0, 2, 0))
##########################################################
#. Hypothesis based on Data and Correlation Analysis
# (alcohol and quality are highly and positively correlated)
#
# Hypothesis: The average alcohol content of high-quality
# wine (rating ≥ 7) is significantly higher than that of
# lower-quality wine (rating < 7).
# Null Hypothesis(H0):
# The mean alcohol content of high-quality and
# low-quality wines is the same.
# Alternative Hypothesis(H1):
# The mean alcohol content of high-quality wines is
# significantly higher than that of lower-quality wines.
##########################################################
var = "alcohol"
print(ggplot(wine_data, aes_string(x = var, y = "quality", color = "wine_type")) +
geom_jitter(alpha = 0.3, width = 0.2) + # Adds jitter for better visualization
geom_smooth(method = "lm", se = TRUE, linetype = "dashed", color = "black") +
labs(title = paste("Scatter Plot of", var, "vs. Quality"),
x = var, y = "Quality") +
theme_minimal() +
theme(legend.position = "bottom"))
## `geom_smooth()` using formula = 'y ~ x'
# Create a correlation plot matrix (pairwise scatter plots)
ggpairs(wine_data, columns = which(names(wine_data) %in% physicochemical_vars),
aes(color = wine_type), title = "Pairwise Scatter Plots of Physicochemical Properties")
##########################################################
# Perform Non-Parmetrich Bootstrapping
# (1000 resamples with replacement)
# to estimate the difference in mean alcohol content
# between high-quality and low-quality wines
#
# Calculate the 95% confidence interval (CI) for the
# difference in mean alcohol content between high-quality
# and low-quality wines.
##########################################################
# Define groups based on quality
high_quality <- wine_data %>% filter(quality >= 7) %>% select(alcohol)
low_quality <- wine_data %>% filter(quality < 7) %>% select(alcohol)
# Define function for bootstrap resampling
bootstrap_diff <- function(data, indices) {
sample_data <- data[indices, ]
mean_high <- mean(sample_data$alcohol[sample_data$quality >= 7])
mean_low <- mean(sample_data$alcohol[sample_data$quality < 7])
return(mean_high - mean_low)
}
# Prepare data for bootstrapping
wine_data$quality_group <- ifelse(wine_data$quality >= 7, "High Quality", "Low Quality")
# Perform bootstrap resampling (1000 iterations)
set.seed(123) # For reproducibility
boot_results <- boot(data = wine_data, statistic = bootstrap_diff, R = 1000)
boot_results
##
## ORDINARY NONPARAMETRIC BOOTSTRAP
##
##
## Call:
## boot(data = wine_data, statistic = bootstrap_diff, R = 1000)
##
##
## Bootstrap Statistics :
## original bias std. error
## t1* 1.171898 0.0004479133 0.03825358
##########################################################
# Compute Confidence Intervals
##########################################################
# Compute 95% Confidence Interval using the percentile method
boot_ci <- boot.ci(boot_results, type = "perc")
boot_ci
## BOOTSTRAP CONFIDENCE INTERVAL CALCULATIONS
## Based on 1000 bootstrap replicates
##
## CALL :
## boot.ci(boot.out = boot_results, type = "perc")
##
## Intervals :
## Level Percentile
## 95% ( 1.095, 1.246 )
## Calculations and Intervals on Original Scale
boot_df <- data.frame(Diff_Mean_Alcohol = boot_results$t)
# Print results
# Histogram of bootstrap distribution
ggplot(boot_df, aes(x = Diff_Mean_Alcohol)) +
geom_histogram(aes(y = ..density..), bins = 30, fill = "blue", alpha = 0.5) +
geom_density(color = "red", size = 1) +
geom_vline(xintercept = mean(boot_results$t), linetype = "dashed", color = "black", size = 1) +
labs(title = "Bootstrap Distribution of Alcohol Content Difference",
x = "Difference in Mean Alcohol Content",
y = "Density") +
theme_minimal()
# Boxplot comparison of alcohol content between wine quality groups
ggplot(wine_data, aes(x = quality_group, y = alcohol, fill = quality_group)) +
geom_boxplot(alpha = 0.7) +
labs(title = "Alcohol Content by Wine Quality Group",
x = "Wine Quality Group",
y = "Alcohol Content") +
theme_minimal() +
scale_fill_manual(values = c("red", "blue"))
##########################################################
# Calculate p-value (proportion of bootstrap resamples
# where the difference is <= 0)
##########################################################
p_value <- mean(boot_results$t <= 0)
# Print results
print(paste("P-Value:", p_value))
## [1] "P-Value: 0"
# Interpretation
if (p_value < 0.05) {
print("The p-value is statistically significant. We reject the null hypothesis (H₀) and conclude that alcohol content significantly impacts wine quality.")
} else {
print("The p-value is NOT statistically significant. We fail to reject the null hypothesis (H₀), meaning no strong evidence supports an effect of alcohol content on wine quality.")
}
## [1] "The p-value is statistically significant. We reject the null hypothesis (H₀) and conclude that alcohol content significantly impacts wine quality."
This study performed by Will Hinton