# Min, quartiles, median, mean and max of residual sugar
summary(redwine[["residual.sugar"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.539 2.600 15.500
# Split the wines into two groups at the median residual sugar:
# values at or below the median go to "A", values above it go to "B"
median_sugar <- median(redwine$residual.sugar)
redwine <- mutate(
  redwine,
  Group = ifelse(residual.sugar > median_sugar, "B", "A")
)
# Group sizes
table(redwine$Group)
##
## A B
## 883 716
*Note that the groups do not have equal numbers of observations because every observation equal to the median value is placed in Group A.
\[ \mu_A = \mu_B \]
Where:
- \(\mu_A\) is the population mean
of density for Group A (residual sugar ≤
median).
- \(\mu_B\) is the population mean
of density for Group B (residual sugar >
median).
# Mean density per group; used to position and label the mean annotations
mean_a <- mean(redwine$density[redwine$Group == "A"])
mean_b <- mean(redwine$density[redwine$Group == "B"])

# Boxplot of density for Group A vs Group B with each group's mean overlaid
ggplot(redwine, aes(x = Group, y = density, fill = Group)) +
  geom_boxplot(width = 0.5, alpha = 0.7) +
  # Mean points
  stat_summary(
    fun = mean, geom = "point", size = 4, color = "darkred", shape = 18
  ) +
  # Annotations: numeric mean labels placed beside each box
  annotate(
    "text", x = 0.7, y = mean_a + 0.003,
    label = paste("Mean (A):\n", format(mean_a, digits = 5)),
    color = "black", size = 3.5
  ) +
  annotate(
    "text", x = 2.3, y = mean_b + 0.002,
    label = paste("Mean(B):\n", format(mean_b, digits = 5)),
    color = "black", size = 3.5
  ) +
  # Group descriptions just above the highest observation.
  # Text layers take `fontface`, not `face`; `face` is ignored with a
  # warning, so the labels would not be drawn bold.
  annotate(
    "text", x = 1, y = max(redwine$density) + 0.001,
    label = "Residual Sugar ≤ Median (A)",
    color = "black", fontface = "bold", hjust = 0.5
  ) +
  annotate(
    "text", x = 2, y = max(redwine$density) + 0.001,
    label = "Residual Sugar > Median (B)",
    color = "black", fontface = "bold", hjust = 0.5
  ) +
  labs(
    title = "Density Distribution by Group (A vs B)",
    x = "Group", y = "Density"
  ) +
  scale_fill_manual(values = c("A" = "#E63946", "B" = "#457B9D")) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
## Warning in annotate("text", x = 1, y = max(redwine$density) + 0.001, label =
## "Residual Sugar ≤ Median (A)", : Ignoring unknown parameters: `face`
## Warning in annotate("text", x = 2, y = max(redwine$density) + 0.001, label =
## "Residual Sugar > Median (B)", : Ignoring unknown parameters: `face`
# Jittered individual observations per group with the group mean drawn as a
# horizontal crossbar
ggplot(redwine, aes(x = Group, y = density, color = Group)) +
  geom_jitter(width = 0.2, alpha = 0.4, size = 1.5) +
  # Mean lines. Line thickness is set with `linewidth`; using `size` for
  # lines is deprecated since ggplot2 3.4.0 and raises a warning.
  stat_summary(
    fun = mean, geom = "crossbar", width = 0.5,
    color = "black", linewidth = 0.8
  ) +
  # Annotations: numeric mean labels placed beside each group
  annotate(
    "text", x = 0.7, y = mean_a + 0.003,
    label = paste("Mean (A):\n", format(mean_a, digits = 5)),
    color = "black", size = 3.5
  ) +
  annotate(
    "text", x = 2.3, y = mean_b + 0.002,
    label = paste("Mean(B):\n", format(mean_b, digits = 5)),
    color = "black", size = 3.5
  ) +
  labs(
    title = "Density Distribution by Group (A vs B) - Individual Points",
    x = "Group", y = "Density"
  ) +
  scale_color_manual(values = c("A" = "#E63946", "B" = "#457B9D")) +
  theme_minimal() +
  theme(
    legend.position = "none",
    plot.title = element_text(hjust = 0.5, face = "bold")
  )
## Warning: Using `size` aesthetic for lines was deprecated in ggplot2 3.4.0.
## ℹ Please use `linewidth` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
These graphs do not provide strong enough evidence to reject the null hypothesis that the mean density is equal between Groups A and B (separated by median residual sugar). Further testing and analysis are required, but the visualization suggests that there may be a difference to explore.
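We will perform an independent two-sample t-test. We assume the data are approximately normally distributed. Because `t.test()` defaults to Welch's version of the test (as the output below shows), homogeneity of variance between the two groups is not required.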
# Two-sample t-test of density between the two groups.
# var.equal = FALSE is the default and selects Welch's test.
t_test_result <- t.test(density ~ Group, data = redwine, var.equal = FALSE)
t_test_result
##
## Welch Two Sample t-test
##
## data: density by Group
## t = -14.697, df = 1365.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
## -0.001513022 -0.001156687
## sample estimates:
## mean in group A mean in group B
## 0.9961490 0.9974838
The p-value of our Two-Sample t-test is \(1.6548158 \times 10^{-45}\).
Since the p-value was extremely close to 0—significantly smaller than 0.05—we reject the null hypothesis that there is no true difference in means between Group A and Group B. There is a significant difference between density in these groups separated by median residual sugar.
No, the t-test only compares the means of density between two groups based on residual sugar. It does not measure the direct association between residual sugar and density. In order to investigate a true association, we must analyze the correlation. (See question 2).
# Min, quartiles, median, mean and max of residual sugar
summary(redwine[["residual.sugar"]])
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.539 2.600 15.500
# Cut points at the 25th, 50th and 75th percentiles of residual sugar
quantiles <- quantile(redwine$residual.sugar, probs = c(0.25, 0.5, 0.75))

# Assign each wine to one of four bands (A lowest, D highest).
# case_when() uses the first condition that matches, so each band only
# needs its upper bound.
redwine <- mutate(
  redwine,
  Group = case_when(
    residual.sugar <= quantiles[1] ~ "A",
    residual.sugar <= quantiles[2] ~ "B",
    residual.sugar <= quantiles[3] ~ "C",
    residual.sugar > quantiles[3] ~ "D"
  )
)

# Group sizes
table(redwine$Group)
##
## A B C D
## 464 419 361 355
*Note that the four quartile-based groups do not have equal numbers of observations.
\[ \mu_A = \mu_B = \mu_C = \mu_D \]
Where:
\(\mu_A\), \(\mu_B\), \(\mu_C\), and \(\mu_D\) are the population means of
density for Groups A, B, C, and D
respectively.
* The null hypothesis assumes that the means of density in the four
groups are equal.
# Mean density within each of the four groups; used for the labels below
mean_a <- with(redwine, mean(density[Group == "A"]))
mean_b <- with(redwine, mean(density[Group == "B"]))
mean_c <- with(redwine, mean(density[Group == "C"]))
mean_d <- with(redwine, mean(density[Group == "D"]))

# Boxplot of density by group, with each group's mean shown as a white
# diamond and its value printed above the box
ggplot(redwine, aes(Group, density, fill = Group)) +
  geom_boxplot(width = 0.5, alpha = 0.8) +
  stat_summary(
    fun = mean, geom = "point",
    shape = 23, size = 4, color = "black", fill = "white"
  ) +
  annotate(
    "text", x = 1, y = 1.003, size = 3, color = "black",
    label = paste("Mean (A):\n", format(mean_a, digits = 5))
  ) +
  annotate(
    "text", x = 2, y = 1.003, size = 3, color = "black",
    label = paste("Mean (B):\n", format(mean_b, digits = 5))
  ) +
  annotate(
    "text", x = 3, y = 1.003, size = 3, color = "black",
    label = paste("Mean (C):\n", format(mean_c, digits = 5))
  ) +
  annotate(
    "text", x = 4, y = 1.003, size = 3, color = "black",
    label = paste("Mean (D):\n", format(mean_d, digits = 5))
  ) +
  scale_fill_manual(
    values = c("A" = "#2A9D8F", "B" = "#E76F51", "C" = "#F4A261", "D" = "#264653")
  ) +
  labs(
    title = "Density Distribution by Group (A vs B vs C vs D)",
    x = "Group",
    y = "Density"
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "none"
  )
# Overlaid kernel density curves of wine density, one per group, with a
# dashed vertical line at each group's mean in the matching group colour
ggplot(redwine, aes(x = density, fill = Group)) +
  geom_density(alpha = 0.5) +
  # Line thickness is set with `linewidth`; `size` for lines is deprecated
  # since ggplot2 3.4.0 and raises a warning.
  geom_vline(
    xintercept = c(mean_a, mean_b, mean_c, mean_d),
    linetype = "dashed",
    color = c("#2A9D8F", "#E76F51", "#F4A261", "#264653"),
    linewidth = 0.8
  ) +
  labs(
    title = "Density Distribution Curves by Group",
    x = "Density", y = "Probability Density"
  ) +
  scale_fill_manual(
    values = c("A" = "#2A9D8F", "B" = "#E76F51", "C" = "#F4A261", "D" = "#264653")
  ) +
  theme_minimal() +
  theme(
    plot.title = element_text(hjust = 0.5, face = "bold"),
    legend.position = "right"
  )
The visualizations show different distributions for each group, with varying mean densities. Based on the graphs, we can see that the groups’ means are likely not equal, suggesting that the hypothesis of equal population means across the groups may not be correct. However, further analysis is required to reject or fail to reject the hypothesis.
Since we have many groups, we will perform a One-Way Analysis of Variance (ANOVA) test to compare the means of density across the four groups.
# One-way ANOVA: model density as a function of the Group variable
anova_result <- aov(density ~ Group, data = redwine)
# Print the ANOVA table (degrees of freedom, sums of squares, F value, p-value)
summary(anova_result)
## Df Sum Sq Mean Sq F value Pr(>F)
## Group 3 0.000996 0.0003321 112.8 <2e-16 ***
## Residuals 1595 0.004696 0.0000029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
The p-value from our ANOVA test is 3.065683e-66.
Since the p-value is significantly smaller than 0.05, we reject the null hypothesis that the mean density across the four groups is equal. There is a significant difference in density between the groups defined by residual sugar quantiles.
No, the ANOVA test only tests for differences in the means between groups. It does not directly measure the association between residual sugar and density. To measure association, we need to perform a correlation analysis.
# Correlation coefficient between residual sugar and density
# (cor() defaults to Pearson)
cor_result <- cor(x = redwine$residual.sugar, y = redwine$density)
cor_result
## [1] 0.3552834
# Significance test for the correlation; only the p-value is reported here
cor_test_result <- cor.test(x = redwine$residual.sugar, y = redwine$density)
cor_test_result[["p.value"]]
## [1] 9.013042e-49
The correlation coefficient shows a weak positive correlation (r = 0.35528) between residual sugar and density. The correlation is statistically significant (p-value = 9.013042e-49), indicating that this relationship is highly unlikely to have occurred by random chance.
In Question 1, we divided the data into two groups based on the median residual sugar value. In Question 2, we have divided the data into four groups, and the ANOVA suggests significant differences in density among these groups. Increasing the number of groups may help better capture the variability in the data, but it does not necessarily reveal a stronger association. When exploring associations between variables, correlation analysis is essential to draw definitive conclusions. Dividing the data into 10 groups could be useful in identifying finer patterns, but it could also result in overfitting or noise, especially if the sample size per group becomes too small. The best approach depends on the size of the data and the specific questions we want to answer.
# Flag wines with quality of 7 or higher as excellent (1), otherwise 0
redwine$excellent <- as.numeric(redwine$quality >= 7)

# Cross-tabulate group membership against the excellent flag
contingency_table <- table(redwine$Group, redwine$excellent)
contingency_table
##
## 0 1
## A 411 53
## B 367 52
## C 308 53
## D 296 59
*Note that while we do not have equal numbers of observations in each quantile Group or excellence rating, we have fairly evenly distributed groupings.
# Number of label shuffles used to build the permutation distribution
n_permutations <- 10000

# Chi-square statistic for the Group x excellent contingency table.
# `data` must have `Group` and `excellent` columns; the value returned is
# the `statistic` element of the chisq.test() result.
calc_chi_square_stat <- function(data) {
  chisq.test(table(data$Group, data$excellent))$statistic
}
# Chi-square statistic on the data as observed
observed_stat <- calc_chi_square_stat(redwine)

# Permutation test: shuffle the excellent labels across wines, recompute the
# statistic each time, and collect the values. Seeded for reproducibility.
set.seed(123)
permutation_stats <- replicate(n_permutations, {
  shuffled <- redwine
  shuffled$excellent <- sample(shuffled$excellent)
  calc_chi_square_stat(shuffled)
})

# p-value: share of shuffled statistics at least as large as the observed one
permutation_p_value <- mean(permutation_stats >= observed_stat)
permutation_p_value
## [1] 0.1367
The p-values in the permutation test (0.1367) and the Chi-square test (0.1386) are both greater than 0.05, meaning the differences observed in the data are consistent with random chance. In order to get a better analysis and test our hypothesis, we might consider using a larger sample size to test the relationship between residual sugar and excellence of wine.
Based on the results of both the Chi-square test and the permutation test, we can draw conclusions:
_Both of the p-values are greater than 0.05, so there is no evidence to suggest that residual sugar is significantly associated with the excellence of wine. We fail to reject the null hypothesis. This suggests that residual sugar levels, when categorized into quartiles, do not have a statistically significant relationship with whether a wine is rated as excellent (quality ≥ 7). Other factors such as alcohol content, acidity, or sulfur dioxide levels may be more important contributors to wine excellence._