# Min, quartiles, median, mean and max of the `residual sugar` column.
summary_rs <- summary(red_wine[["residual sugar"]])
print(summary_rs)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.900 1.900 2.200 2.539 2.600 15.500
# Median of `residual sugar`; it is the cut point for the two-group split.
median_rs <- median(red_wine[["residual sugar"]])
print(median_rs)
## [1] 2.2
# Label each wine by its `residual sugar` relative to the median:
# "A" for values at or below median_rs, "B" for values above it.
red_wine <- red_wine %>%
  mutate(group = ifelse(`residual sugar` > median_rs, "B", "A"))
print(red_wine[["group"]])
## [1] "A" "B" "B" "A" "A" "A" "A" "A" "A" "B" "A" "B" "A" "A" "B" "B" "A" "A"
## [19] "B" "A" "A" "B" "A" "B" "B" "A" "A" "A" "A" "A" "B" "B" "B" "B" "A" "B"
## [37] "B" "A" "A" "B" "B" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B"
## [55] "B" "B" "B" "B" "B" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A"
## [73] "A" "A" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [91] "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [109] "B" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A"
## [127] "A" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [145] "A" "A" "A" "A" "B" "B" "A" "B" "A" "A" "B" "B" "B" "B" "A" "A" "A" "A"
## [163] "A" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "B"
## [181] "B" "A" "B" "A" "A" "A" "A" "B" "A" "A" "B" "A" "B" "A" "A" "A" "B" "A"
## [199] "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "A" "B" "A" "A" "B" "B"
## [217] "A" "A" "A" "B" "A" "A" "A" "A" "A" "B" "A" "B" "B" "B" "A" "A" "B" "B"
## [235] "B" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A"
## [253] "B" "A" "A" "B" "A" "A" "A" "B" "A" "A" "A" "A" "B" "A" "B" "B" "B" "B"
## [271] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B"
## [289] "B" "B" "B" "A" "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "A" "A" "B"
## [307] "A" "B" "B" "A" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
## [325] "B" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "A" "B" "B" "B" "B" "B"
## [343] "A" "A" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "B"
## [361] "A" "B" "B" "A" "B" "B" "B" "B" "B" "B" "B" "A" "A" "A" "B" "B" "B" "B"
## [379] "B" "B" "A" "B" "A" "A" "A" "B" "A" "A" "A" "B" "A" "B" "A" "A" "B" "B"
## [397] "B" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "B" "B" "B" "B" "A" "B"
## [415] "B" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "B" "A" "A" "A" "B" "A" "A"
## [433] "B" "B" "B" "B" "A" "B" "B" "A" "A" "A" "B" "B" "A" "A" "B" "A" "A" "B"
## [451] "B" "A" "A" "B" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A" "A" "B" "A" "B"
## [469] "A" "A" "B" "B" "B" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "B" "A"
## [487] "A" "A" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "A"
## [505] "A" "B" "A" "B" "A" "B" "B" "A" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A"
## [523] "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "B" "A" "B" "A" "B" "A" "B" "B"
## [541] "B" "B" "A" "B" "A" "B" "A" "B" "B" "A" "A" "B" "B" "A" "B" "B" "B" "B"
## [559] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "B"
## [577] "B" "B" "B" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "B" "B" "A" "B" "B"
## [595] "B" "B" "A" "A" "A" "B" "A" "A" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B"
## [613] "B" "A" "A" "B" "B" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A" "A" "B" "B"
## [631] "B" "B" "B" "B" "A" "A" "B" "B" "A" "A" "B" "B" "B" "B" "B" "B" "B" "A"
## [649] "B" "B" "A" "B" "B" "B" "B" "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A"
## [667] "A" "A" "B" "A" "B" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "B" "B"
## [685] "B" "B" "A" "B" "A" "A" "B" "B" "A" "B" "B" "A" "A" "A" "B" "A" "B" "A"
## [703] "A" "B" "A" "B" "A" "B" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B"
## [721] "A" "B" "B" "A" "B" "B" "B" "A" "A" "B" "B" "B" "A" "B" "A" "A" "A" "B"
## [739] "B" "B" "B" "B" "A" "B" "B" "A" "B" "B" "A" "A" "B" "B" "B" "B" "A" "A"
## [757] "A" "A" "A" "B" "B" "A" "A" "A" "B" "B" "B" "B" "B" "A" "B" "B" "B" "A"
## [775] "A" "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "B" "B" "A" "A" "B" "B" "B"
## [793] "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "A" "B"
## [811] "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "A" "A" "A" "A" "B" "B" "B" "B"
## [829] "B" "A" "B" "A" "A" "A" "A" "A" "B" "B" "A" "A" "B" "B" "B" "A" "A" "A"
## [847] "A" "A" "A" "A" "A" "A" "B" "A" "A" "B" "A" "B" "A" "A" "B" "A" "B" "B"
## [865] "B" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "B" "B" "A" "A"
## [883] "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "B" "B" "B" "B"
## [901] "B" "B" "B" "B" "B" "B" "B" "A" "B" "A" "B" "B" "B" "B" "A" "A" "A" "B"
## [919] "A" "A" "B" "A" "A" "B" "B" "A" "B" "A" "B" "B" "A" "A" "A" "A" "A" "B"
## [937] "B" "A" "B" "A" "A" "B" "B" "B" "B" "A" "B" "A" "A" "A" "A" "A" "A" "A"
## [955] "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "B" "A" "A"
## [973] "B" "A" "B" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [991] "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1009] "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "B"
## [1027] "A" "A" "B" "A" "A" "A" "B" "B" "B" "B" "B" "A" "B" "A" "A" "A" "A" "B"
## [1045] "B" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A" "B" "B" "B" "B" "A" "A"
## [1063] "A" "B" "A" "B" "B" "A" "A" "B" "A" "B" "A" "B" "B" "A" "A" "B" "B" "B"
## [1081] "A" "B" "A" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A"
## [1099] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "B" "B"
## [1117] "B" "B" "A" "B" "B" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "A" "B"
## [1135] "A" "A" "A" "A" "B" "A" "A" "B" "B" "A" "B" "B" "A" "B" "A" "A" "B" "B"
## [1153] "A" "A" "A" "A" "A" "A" "B" "A" "B" "A" "B" "A" "A" "A" "A" "B" "B" "B"
## [1171] "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "B" "A" "B" "A" "A" "A" "B" "A"
## [1189] "A" "A" "B" "B" "B" "B" "A" "A" "B" "A" "A" "B" "A" "A" "A" "B" "A" "A"
## [1207] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1225] "B" "B" "B" "A" "A" "A" "B" "B" "A" "A" "A" "B" "A" "A" "B" "B" "A" "B"
## [1243] "A" "B" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "B" "A" "B" "B" "B"
## [1261] "A" "A" "B" "A" "A" "B" "B" "B" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A"
## [1279] "B" "A" "A" "A" "A" "B" "B" "A" "B" "B" "B" "B" "A" "A" "B" "A" "A" "B"
## [1297] "B" "A" "A" "A" "A" "A" "A" "A" "B" "A" "B" "B" "B" "A" "A" "B" "A" "B"
## [1315] "B" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B"
## [1333] "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A" "A" "B"
## [1351] "A" "B" "A" "A" "B" "A" "A" "B" "B" "A" "B" "B" "A" "A" "A" "A" "A" "A"
## [1369] "A" "A" "A" "B" "A" "B" "A" "A" "A" "B" "A" "B" "B" "A" "A" "A" "A" "B"
## [1387] "A" "A" "B" "A" "B" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1405] "B" "A" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "B" "A" "A" "A" "A"
## [1423] "A" "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "A"
## [1441] "A" "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A"
## [1459] "A" "A" "B" "A" "B" "B" "A" "A" "A" "B" "A" "A" "A" "B" "B" "A" "B" "A"
## [1477] "B" "A" "B" "B" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1495] "A" "A" "A" "B" "B" "B" "A" "B" "B" "B" "B" "A" "A" "B" "A" "A" "A" "A"
## [1513] "A" "A" "B" "B" "B" "A" "A" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A"
## [1531] "A" "B" "A" "A" "B" "A" "A" "A" "B" "A" "B" "A" "A" "A" "B" "A" "A" "A"
## [1549] "A" "A" "B" "B" "B" "A" "B" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A" "A"
## [1567] "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A" "B" "A" "A" "B" "A" "B" "A"
## [1585] "B" "B" "B" "A" "B" "B" "A" "A" "B" "A" "A" "A" "B" "A" "B"
1a. Null Hypothesis: There is no difference between the population means
of density in groups A and B
Deducing from the box plots, there is a difference between
population means of density of the two groups A and B
1c. What test are you going to use?
As the comparison is between two groups, a two-sample t-test (Welch's,
the default of t.test) is used
# Two-sample t-test of mean density between groups A and B.
# t.test() does not assume equal variances by default (Welch test).
t_density <- t.test(density ~ group, data = red_wine)
print(t_density)
##
## Welch Two Sample t-test
##
## data: density by group
## t = -14.697, df = 1365.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
## -0.001513022 -0.001156687
## sample estimates:
## mean in group A mean in group B
## 0.9961490 0.9974838
1d. What is the p-value?
The p-value of the t-test (t_density) is less than 2.2e-16
1e. What is your conclusion?
As the p-value of the t-test (t_density) is below 2.2e-16, which is less
than 0.05, we reject the null hypothesis that there is no difference
between the population means of density in groups A and B.
1f. Does your conclusion imply that there is an association between
“density” and “residual.sugar”?
No, the conclusion of the t-test only suggests whether there is a
statistical difference in the means of “density” between the two groups
and does not imply any association. To understand the relationship, a
regression analysis needs to be conducted.
2. Produce summary statistics of “residual.sugar” and use its 1st,
2nd, and 3rd quantiles to divide the data into four groups A, B, C, and
D. We want to test if “density” in the four groups has the same
population mean.
2a. Null hypothesis - The population mean of “density” is the same in
all four groups A, B, C and D.
# Minimum, quartiles and maximum of `residual sugar`; these five values
# serve as the bin edges for the four-group split.
quantiles <- quantile(
  red_wine[["residual sugar"]],
  probs = seq(0, 1, by = 0.25)
)
print(quantiles)
## 0% 25% 50% 75% 100%
## 0.9 1.9 2.2 2.6 15.5
# Bin `residual sugar` into four quartile-based groups labelled "1" to "4".
# include.lowest = TRUE keeps the minimum value inside the first bin.
red_wine <- red_wine %>%
  mutate(
    group_1 = cut(
      `residual sugar`,
      breaks = quantiles,
      labels = as.character(1:4),
      include.lowest = TRUE
    )
  )
2c. What test are you going to use?
# One-way ANOVA of density across the four residual-sugar groups.
anova_result <- aov(density ~ group_1, data = red_wine)
print(summary(anova_result))
## Df Sum Sq Mean Sq F value Pr(>F)
## group_1 3 0.000996 0.0003321 112.8 <2e-16 ***
## Residuals 1595 0.004696 0.0000029
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As there are multiple groups for comparison, an ANOVA test is used for
analysis
2d. What is the p-value?
2e. What is your conclusion?
As the p-value of the ANOVA is below 2e-16, which is less than 0.05, we
reject the null hypothesis that there is no difference between the
population means of density in groups A, B, C and D.
2f. Does your conclusion imply that there is an association between
“density” and “residual.sugar”? Compare your result here with that in
Question 1. Do you think increasing the number of groups help identify
the association? Would you consider dividing the data into 10 groups so
as to help the discovery of the association? Why?
As the null hypothesis is rejected, it suggests an association. Both
the t-test (comparison of two groups) and the ANOVA test (comparison
of four groups) suggest a significant association between “density” and
“residual sugar”. Increasing the number of groups further may help
identify finer patterns in the data. However, over-segmenting the data
leads to smaller sample sizes in each group and less statistical
power.
3. Create a 2 by 4 contingency table using the categories A, B, C, D
of “residual.sugar” and the binary variable “excellent” you created in
Part B. Note that you have two factors: the categorical levels of
“residual.sugar” (A, B, C and D) and an indicator of excellent wines
(yes or no).
Null Hypothesis - There is no association between residual sugar and
the excellence of the wine
# Flag excellent wines (quality of 7 or more) as 1 and all others as 0.
red_wine <- red_wine %>%
  mutate(excellent = ifelse(quality >= 7, 1, 0))
# Cross-tabulate counts: rows are `excellent` (0/1), columns are the
# residual-sugar groups stored in group_1.
contingency_table <- table(red_wine$excellent, red_wine$group_1)
contingency_table
# 3a. Chi-squared test of independence between excellence and sugar group;
# this is the test whose p-value the write-up reports.
chisq.test(contingency_table)
As the p-value of the Chi-squared test is 0.1386 (> 0.05), we fail to
reject the null hypothesis, i.e. there is no evidence of an association
between residual sugar and the excellence of wines
3b. Use the permutation test to do the same and compare the result
to that in (a);
# Permutation test of independence between `excellent` and `group_1`.
# Shuffling `excellent` breaks any link with the sugar groups, so the
# chi-squared statistics of the shuffled tables approximate the null
# distribution of the observed statistic.
set.seed(123) # fixed seed so the reported p-value is reproducible
observed_chi_sq <- chisq.test(contingency_table)$statistic
num_perm <- 1000
permuted_chi_sq_stats <- numeric(num_perm)
for (i in seq_len(num_perm)) {
  shuffle_excellent <- sample(red_wine$excellent)
  shuffle_contingency_table <- table(shuffle_excellent, red_wine$group_1)
  permuted_chi_sq_stats[i] <- chisq.test(shuffle_contingency_table)$statistic
}
# Share of permuted statistics at least as large as the observed one.
p_value <- mean(permuted_chi_sq_stats >= observed_chi_sq)
# Display results
observed_chi_sq
## X-squared
## 5.499973
# Proportion of permuted chi-squared statistics >= the observed statistic.
p_value
## [1] 0.14
3c. Can you conclude that “residual.sugar” is a significant factor
contributing to the excellence of wine? Why?
The p-value obtained from the permutation test is about 0.14 (> 0.05).
The p-values of both the chi-squared test and the permutation test are
greater than 0.05, so we fail to reject the null hypothesis, i.e. there
is no evidence of an association between residual sugar and the
excellence of wines