# Five-number summary (plus mean) of the residual sugar column
residual_sugar <- red_wine[["residual sugar"]]
summary_rs <- summary(residual_sugar)
print(summary_rs)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.900   2.200   2.539   2.600  15.500
# Median residual sugar; used below as the cut point between groups A and B
median_rs <- median(red_wine[["residual sugar"]])
print(median_rs)
## [1] 2.2
# Split the wines at the median of `residual sugar`: values above the median
# go to group "B", everything else (at or below the median) goes to group "A".
red_wine <- mutate(
  red_wine,
  group = ifelse(`residual sugar` > median_rs, "B", "A")
)
red_wine$group
##    [1] "A" "B" "B" "A" "A" "A" "A" "A" "A" "B" "A" "B" "A" "A" "B" "B" "A" "A"
##   [19] "B" "A" "A" "B" "A" "B" "B" "A" "A" "A" "A" "A" "B" "B" "B" "B" "A" "B"
##   [37] "B" "A" "A" "B" "B" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B"
##   [55] "B" "B" "B" "B" "B" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A"
##   [73] "A" "A" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
##   [91] "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A"
##  [109] "B" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A"
##  [127] "A" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
##  [145] "A" "A" "A" "A" "B" "B" "A" "B" "A" "A" "B" "B" "B" "B" "A" "A" "A" "A"
##  [163] "A" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "B"
##  [181] "B" "A" "B" "A" "A" "A" "A" "B" "A" "A" "B" "A" "B" "A" "A" "A" "B" "A"
##  [199] "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "A" "B" "A" "A" "B" "B"
##  [217] "A" "A" "A" "B" "A" "A" "A" "A" "A" "B" "A" "B" "B" "B" "A" "A" "B" "B"
##  [235] "B" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A"
##  [253] "B" "A" "A" "B" "A" "A" "A" "B" "A" "A" "A" "A" "B" "A" "B" "B" "B" "B"
##  [271] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B"
##  [289] "B" "B" "B" "A" "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "A" "A" "B"
##  [307] "A" "B" "B" "A" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B"
##  [325] "B" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "A" "B" "B" "B" "B" "B"
##  [343] "A" "A" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "B"
##  [361] "A" "B" "B" "A" "B" "B" "B" "B" "B" "B" "B" "A" "A" "A" "B" "B" "B" "B"
##  [379] "B" "B" "A" "B" "A" "A" "A" "B" "A" "A" "A" "B" "A" "B" "A" "A" "B" "B"
##  [397] "B" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "B" "B" "B" "B" "A" "B"
##  [415] "B" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "B" "A" "A" "A" "B" "A" "A"
##  [433] "B" "B" "B" "B" "A" "B" "B" "A" "A" "A" "B" "B" "A" "A" "B" "A" "A" "B"
##  [451] "B" "A" "A" "B" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A" "A" "B" "A" "B"
##  [469] "A" "A" "B" "B" "B" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "B" "A"
##  [487] "A" "A" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "A"
##  [505] "A" "B" "A" "B" "A" "B" "B" "A" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A"
##  [523] "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "B" "A" "B" "A" "B" "A" "B" "B"
##  [541] "B" "B" "A" "B" "A" "B" "A" "B" "B" "A" "A" "B" "B" "A" "B" "B" "B" "B"
##  [559] "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "B"
##  [577] "B" "B" "B" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "B" "B" "A" "B" "B"
##  [595] "B" "B" "A" "A" "A" "B" "A" "A" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B"
##  [613] "B" "A" "A" "B" "B" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A" "A" "B" "B"
##  [631] "B" "B" "B" "B" "A" "A" "B" "B" "A" "A" "B" "B" "B" "B" "B" "B" "B" "A"
##  [649] "B" "B" "A" "B" "B" "B" "B" "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A"
##  [667] "A" "A" "B" "A" "B" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "B" "B"
##  [685] "B" "B" "A" "B" "A" "A" "B" "B" "A" "B" "B" "A" "A" "A" "B" "A" "B" "A"
##  [703] "A" "B" "A" "B" "A" "B" "B" "A" "B" "B" "B" "B" "B" "B" "B" "B" "A" "B"
##  [721] "A" "B" "B" "A" "B" "B" "B" "A" "A" "B" "B" "B" "A" "B" "A" "A" "A" "B"
##  [739] "B" "B" "B" "B" "A" "B" "B" "A" "B" "B" "A" "A" "B" "B" "B" "B" "A" "A"
##  [757] "A" "A" "A" "B" "B" "A" "A" "A" "B" "B" "B" "B" "B" "A" "B" "B" "B" "A"
##  [775] "A" "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "B" "B" "A" "A" "B" "B" "B"
##  [793] "B" "B" "B" "B" "B" "A" "B" "B" "B" "B" "A" "B" "B" "B" "A" "B" "A" "B"
##  [811] "B" "B" "B" "A" "B" "B" "B" "A" "B" "B" "A" "A" "A" "A" "B" "B" "B" "B"
##  [829] "B" "A" "B" "A" "A" "A" "A" "A" "B" "B" "A" "A" "B" "B" "B" "A" "A" "A"
##  [847] "A" "A" "A" "A" "A" "A" "B" "A" "A" "B" "A" "B" "A" "A" "B" "A" "B" "B"
##  [865] "B" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "B" "B" "A" "A"
##  [883] "B" "B" "B" "B" "B" "B" "B" "B" "A" "B" "A" "B" "A" "B" "B" "B" "B" "B"
##  [901] "B" "B" "B" "B" "B" "B" "B" "A" "B" "A" "B" "B" "B" "B" "A" "A" "A" "B"
##  [919] "A" "A" "B" "A" "A" "B" "B" "A" "B" "A" "B" "B" "A" "A" "A" "A" "A" "B"
##  [937] "B" "A" "B" "A" "A" "B" "B" "B" "B" "A" "B" "A" "A" "A" "A" "A" "A" "A"
##  [955] "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "B" "A" "A"
##  [973] "B" "A" "B" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
##  [991] "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1009] "B" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "B"
## [1027] "A" "A" "B" "A" "A" "A" "B" "B" "B" "B" "B" "A" "B" "A" "A" "A" "A" "B"
## [1045] "B" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A" "B" "B" "B" "B" "A" "A"
## [1063] "A" "B" "A" "B" "B" "A" "A" "B" "A" "B" "A" "B" "B" "A" "A" "B" "B" "B"
## [1081] "A" "B" "A" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A"
## [1099] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "B" "B"
## [1117] "B" "B" "A" "B" "B" "A" "A" "A" "A" "A" "A" "B" "B" "B" "A" "A" "A" "B"
## [1135] "A" "A" "A" "A" "B" "A" "A" "B" "B" "A" "B" "B" "A" "B" "A" "A" "B" "B"
## [1153] "A" "A" "A" "A" "A" "A" "B" "A" "B" "A" "B" "A" "A" "A" "A" "B" "B" "B"
## [1171] "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "B" "A" "B" "A" "A" "A" "B" "A"
## [1189] "A" "A" "B" "B" "B" "B" "A" "A" "B" "A" "A" "B" "A" "A" "A" "B" "A" "A"
## [1207] "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1225] "B" "B" "B" "A" "A" "A" "B" "B" "A" "A" "A" "B" "A" "A" "B" "B" "A" "B"
## [1243] "A" "B" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "B" "A" "B" "B" "B"
## [1261] "A" "A" "B" "A" "A" "B" "B" "B" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A"
## [1279] "B" "A" "A" "A" "A" "B" "B" "A" "B" "B" "B" "B" "A" "A" "B" "A" "A" "B"
## [1297] "B" "A" "A" "A" "A" "A" "A" "A" "B" "A" "B" "B" "B" "A" "A" "B" "A" "B"
## [1315] "B" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "B"
## [1333] "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A" "A" "B"
## [1351] "A" "B" "A" "A" "B" "A" "A" "B" "B" "A" "B" "B" "A" "A" "A" "A" "A" "A"
## [1369] "A" "A" "A" "B" "A" "B" "A" "A" "A" "B" "A" "B" "B" "A" "A" "A" "A" "B"
## [1387] "A" "A" "B" "A" "B" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1405] "B" "A" "B" "A" "A" "A" "A" "B" "B" "A" "A" "A" "A" "B" "A" "A" "A" "A"
## [1423] "A" "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A" "B" "B" "A" "B" "A" "A"
## [1441] "A" "B" "A" "A" "A" "B" "A" "A" "A" "A" "A" "B" "A" "A" "A" "A" "A" "A"
## [1459] "A" "A" "B" "A" "B" "B" "A" "A" "A" "B" "A" "A" "A" "B" "B" "A" "B" "A"
## [1477] "B" "A" "B" "B" "A" "B" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A" "A"
## [1495] "A" "A" "A" "B" "B" "B" "A" "B" "B" "B" "B" "A" "A" "B" "A" "A" "A" "A"
## [1513] "A" "A" "B" "B" "B" "A" "A" "B" "A" "A" "B" "A" "A" "A" "A" "A" "A" "A"
## [1531] "A" "B" "A" "A" "B" "A" "A" "A" "B" "A" "B" "A" "A" "A" "B" "A" "A" "A"
## [1549] "A" "A" "B" "B" "B" "A" "B" "A" "A" "B" "B" "A" "A" "A" "A" "A" "A" "A"
## [1567] "A" "A" "A" "A" "A" "A" "A" "B" "B" "A" "A" "B" "A" "A" "B" "A" "B" "A"
## [1585] "B" "B" "B" "A" "B" "B" "A" "A" "B" "A" "A" "A" "B" "A" "B"
1a. Null Hypothesis: There is no difference between the population means of density in groups A and B


1b. Use visualization tools to inspect the hypothesis. Do you think the hypothesis is right or not?
# Side-by-side box plots of density for the two residual-sugar groups
graphics::boxplot(
  density ~ group,
  data = red_wine
)

Judging from the box plots, there appears to be a difference between the population means of density of the two groups A and B


1c. What test are you going to use?
As the comparison is between two groups, a two-sample t-test is used
# Two-sample t-test comparing mean density between groups A and B
t_density <- stats::t.test(
  density ~ group,
  data = red_wine
)
print(t_density)
## 
##  Welch Two Sample t-test
## 
## data:  density by group
## t = -14.697, df = 1365.2, p-value < 2.2e-16
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -0.001513022 -0.001156687
## sample estimates:
## mean in group A mean in group B 
##       0.9961490       0.9974838
1d. What is the p-value?
The p-value of the t-test (t_density) is less than 2.2e-16


1e. What is your conclusion?
As the p-value of the t-test is less than 2.2e-16, which is below 0.05, we reject the null hypothesis that there is no difference between the population means of the density of groups A and B.


1f. Does your conclusion imply that there is an association between “density” and “residual.sugar”?
No, the conclusion of the t-test only suggests whether there is a statistical difference in the means of “density” between the two groups and does not by itself imply an association. To understand the relationship, a regression analysis needs to be conducted.


2. Produce summary statistics of “residual.sugar” and use its 1st, 2nd, and 3rd quantiles to divide the data into four groups A, B, C, and D. We want to test if “density” in the four groups has the same population mean.


2a. Null hypothesis - The population mean of “density” is the same in all four groups A, B, C and D.
# Minimum, quartiles and maximum of residual sugar; used as break points below
quartile_probs <- seq(0, 1, by = 0.25)
quantiles <- quantile(red_wine[["residual sugar"]], probs = quartile_probs)
print(quantiles)
##   0%  25%  50%  75% 100% 
##  0.9  1.9  2.2  2.6 15.5
# Assign each wine to one of four residual-sugar groups bounded by the
# quartiles in `quantiles`. The groups are labelled A (lowest sugar) to
# D (highest sugar) so that they match the group names used in the questions
# and in the written answers.
# include.lowest = TRUE keeps the minimum value inside the first group instead
# of turning it into NA.
red_wine <- red_wine %>%
  mutate(
    group_1 = cut(
      `residual sugar`,
      breaks = quantiles,
      labels = c("A", "B", "C", "D"),
      include.lowest = TRUE
    )
  )


2b. Use visualization tools to inspect the hypothesis. Do you think the hypothesis is right or not?
# Box plots of density for each of the four residual-sugar groups
ggplot(data = red_wine, mapping = aes(group_1, density)) +
  geom_boxplot()

Judging from the differences in central tendency (medians) in the box plots, there appears to be a difference between the population means of the groups A, B, C and D


2c. What test are you going to use?
# One-way ANOVA of density across the four residual-sugar groups
anova_result <- aov(
  formula = density ~ group_1,
  data = red_wine
)
summary(anova_result)
##               Df   Sum Sq   Mean Sq F value Pr(>F)    
## group_1        3 0.000996 0.0003321   112.8 <2e-16 ***
## Residuals   1595 0.004696 0.0000029                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
As there are more than two groups to compare, a one-way ANOVA test is used for the analysis


2d. What is the p-value?
The p-value is less than 2e-16


2e. What is your conclusion?
As the p-value of the ANOVA is less than 2e-16, which is below 0.05, we reject the null hypothesis that there is no difference between the population means of the density of the groups A, B, C and D.


2f. Does your conclusion imply that there is an association between “density” and “residual.sugar”? Compare your result here with that in Question 1. Do you think increasing the number of groups help identify the association? Would you consider dividing the data into 10 groups so as to help the discovery of the association? Why?
As the null hypothesis is rejected, it suggests an association. Both the t-test (comparison of two groups) and the ANOVA test (comparison of four groups) suggest a significant association between “density” and “residual sugar”. Increasing the number of groups further may help identify finer patterns in the data. However, over-segmenting the data leads to smaller sample sizes in each group and less statistical power.


3. Create a 2 by 4 contingency table using the categories A, B, C, D of “residual.sugar” and the binary variable “excellent” you created in Part B. Note that you have two factors: the categorical levels of “residual.sugar” (A, B, C and D) and an indicator of excellent wines (yes or no).
Null Hypothesis - There is no association (correlation) between residual sugar and the excellence of the wine
# Flag excellent wines: 1 when quality is 7 or higher, 0 otherwise
red_wine <- mutate(
  red_wine,
  excellent = ifelse(quality < 7, 0, 1)
)

# 2 x 4 table of counts: rows are the excellent flag, columns are the
# residual-sugar groups
contingency_table <- table(red_wine[["excellent"]], red_wine[["group_1"]])


3a. Use the Chi-square test to test if these two factors are correlated or not;
# Pearson chi-squared test of independence on the 2 x 4 contingency table
chi_sq_result <- stats::chisq.test(contingency_table)
print(chi_sq_result)
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 5.5, df = 3, p-value = 0.1386
As the p-value of the Chi-squared test is 0.1386 (> 0.05), we fail to reject the null hypothesis, i.e. there is no evidence of a correlation between residual sugar and the excellence of wines


3b. Use the permutation test to do the same and compare the result to that in (a);
# Permutation test of independence between `excellent` and `group_1`.
# Shuffling `excellent` breaks any link with the sugar group while leaving the
# counts of each variable unchanged, so the chi-squared statistics of the
# shuffled tables approximate the distribution of the statistic under the
# null hypothesis.
set.seed(123)  # fix the RNG so the Monte Carlo p-value is reproducible
num_perm <- 1000

# Statistic of the table that was actually observed
observed_chi_sq <- chisq.test(contingency_table)$statistic

# One chi-squared statistic per random shuffle
permuted_chi_sq_stats <- vapply(
  seq_len(num_perm),
  function(i) {
    shuffle_excellent <- sample(red_wine$excellent)
    shuffle_contingency_table <- table(shuffle_excellent, red_wine$group_1)
    unname(chisq.test(shuffle_contingency_table)$statistic)
  },
  numeric(1)
)

# Share of shuffled statistics at least as large as the observed one
p_value <- mean(permuted_chi_sq_stats >= observed_chi_sq)

# Show the observed chi-squared statistic and the permutation p-value
print(observed_chi_sq)
## X-squared 
##  5.499973
print(p_value)
## [1] 0.14


3c. Can you conclude that “residual.sugar” is a significant factor contributing to the excellence of wine? Why?
The p-value obtained from the permutation test is approximately 0.14 (> 0.05). The p-values of both the chi-squared test and the permutation test are greater than 0.05, so we fail to reject the null hypothesis, i.e. there is no evidence of a correlation between residual sugar and the excellence of wines, and we cannot conclude that “residual.sugar” is a significant factor.