Question 1

Produce summary statistics of “residual.sugar” and use its median to divide the data into two groups, A and B. We want to test whether the “distribution” in Group A and Group B has the same population mean.

# Cut point for the two-group split: the median of residual sugar.
# The column is read through `wine$` so this code does not depend on `wine`
# having been attach()ed (or on a same-named global vector existing).
medianresidualsugar <- median(wine$`residual sugar`)
summary(wine$`residual sugar`) #Produce summary statistics of “residual.sugar”
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.900   2.200   2.539   2.600  15.500
# Group "A": residual sugar at or below the median; group "B": above it.
wine$group <- if_else(wine$`residual sugar` <= medianresidualsugar, "A", "B")
# Number of observations in each group.
table(wine[["group"]])
## 
##   A   B 
## 883 716
# Summary statistics of residual sugar, computed separately within each group.
tapply(X = wine[["residual sugar"]], INDEX = wine[["group"]], FUN = summary)
## $A
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.800   1.900   1.894   2.100   2.200 
## 
## $B
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.250   2.400   2.600   3.334   3.400  15.500
# Overlaid, semi-transparent density curves of residual sugar, one per group.
ggplot(
  data = wine,
  mapping = aes(x = `residual sugar`, fill = group)
) +
  geom_density(alpha = 0.7) +
  labs(title = "Density Distribution of Residual Sugar by Group") +
  theme_grey() + # the default theme, stated explicitly
  theme(legend.position = "right")

# Boxplots of residual sugar, one box per group.
ggplot(
  data = wine,
  mapping = aes(x = group, y = `residual sugar`, fill = group)
) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Residual Sugar by Group",
    x = "Group",
    y = "Residual Sugar"
  )

# Two-sided Welch t-test (variances not assumed equal) comparing the mean of
# `density` between the groups in `wine$group`.
q1ttest <- t.test(
  density ~ group,
  data = wine,
  alternative = "two.sided",
  var.equal = FALSE
)
print(q1ttest)
## 
##  Welch Two Sample t-test
## 
## data:  density by group
## t = -14.697, df = 1365.2, p-value < 0.00000000000000022
## alternative hypothesis: true difference in means between group A and group B is not equal to 0
## 95 percent confidence interval:
##  -0.001513022 -0.001156687
## sample estimates:
## mean in group A mean in group B 
##       0.9961490       0.9974838

Question 2

Produce summary statistics of “residual.sugar” and use its 1st, 2nd, and 3rd quartiles to divide the data into four groups: A, B, C, and D. We want to test whether the “distribution” in the four groups has the same population mean.

# Quartile cut points (25th, 50th and 75th percentiles) of residual sugar.
q1 <- quantile(wine[["residual sugar"]], probs = 0.25)
q2 <- quantile(wine[["residual sugar"]], probs = 0.50)
q3 <- quantile(wine[["residual sugar"]], probs = 0.75)
# Bin residual sugar into four labelled groups at the quartiles. The result is
# assigned to `wine$group`, replacing any existing column of that name.
wine$group <- cut(
  x = wine[["residual sugar"]],
  breaks = c(-Inf, q1, q2, q3, Inf),
  labels = c("A", "B", "C", "D")
)

# Number of observations in each of the four groups.
table(wine[["group"]])
## 
##   A   B   C   D 
## 464 419 361 355
# Summary statistics of residual sugar, computed separately within each group.
tapply(X = wine[["residual sugar"]], INDEX = wine[["group"]], FUN = summary)
## $A
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.900   1.600   1.800   1.714   1.900   1.900 
## 
## $B
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.000   2.000   2.100   2.094   2.200   2.200 
## 
## $C
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.250   2.300   2.400   2.437   2.500   2.600 
## 
## $D
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   2.650   2.825   3.400   4.246   4.675  15.500
# Overlaid, semi-transparent density curves of residual sugar, one per group.
ggplot(
  data = wine,
  mapping = aes(x = `residual sugar`, fill = group)
) +
  geom_density(alpha = 0.7) +
  labs(title = "Density Distribution of Residual Sugar by Group") +
  theme_grey() + # the default theme, stated explicitly
  theme(legend.position = "right")

# Boxplots of residual sugar, one box per group.
ggplot(
  data = wine,
  mapping = aes(x = group, y = `residual sugar`, fill = group)
) +
  geom_boxplot() +
  labs(
    title = "Boxplot of Residual Sugar by Group",
    x = "Group",
    y = "Residual Sugar"
  )

# One-way ANOVA of `residual sugar` across the levels of `group`.
# NOTE(review): if `group` is the quartile split of `residual sugar` itself,
# the response and the grouping variable are the same measurement, so a
# significant F is expected by construction. Confirm whether a different
# response (e.g. `density`, as in the Question 1 t-test) was intended.
anova_result <- aov(formula = `residual sugar` ~ group, data = wine)

print(summary(anova_result))
##               Df Sum Sq Mean Sq F value              Pr(>F)    
## group          3   1437   479.0   439.2 <0.0000000000000002 ***
## Residuals   1595   1740     1.1                                
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Question 3

# Add an `excellent` column: "yes" when quality is 7 or higher, otherwise "no".
wine <- mutate(wine, excellent = ifelse(quality >= 7, "yes", "no"))

# Cross-tabulate group (rows) against the excellent flag (columns).
contingency_table <- table(wine[["group"]], wine[["excellent"]])
contingency_table
##    
##      no yes
##   A 411  53
##   B 367  52
##   C 308  53
##   D 296  59
# Pearson chi-squared test on the group-by-excellent contingency table.
chi_square_result <- chisq.test(x = contingency_table)
# Printing the result shows the test statistic, degrees of freedom and p-value.
chi_square_result
## 
##  Pearson's Chi-squared test
## 
## data:  contingency_table
## X-squared = 5.5, df = 3, p-value = 0.1386
# Same chi-squared test, with the p-value obtained by simulation
# (2000 replicates) instead of the chi-squared reference distribution.
# NOTE(review): no seed is set in this block; unless one is set earlier in the
# file, the simulated p-value will differ slightly from run to run.
permutation_result <- chisq.test(
  x = contingency_table,
  simulate.p.value = TRUE,
  B = 2000
)
permutation_result
## 
##  Pearson's Chi-squared test with simulated p-value (based on 2000
##  replicates)
## 
## data:  contingency_table
## X-squared = 5.5, df = NA, p-value = 0.1409