#Research Question: What is the effect of different brewing techniques on the amount of crème. #H0: Method 1 = Method 2 = Method 3 #H1: At least one pair of methods differ from one another.

espresso <- read.csv("C:/Users/apoor/Downloads/EspressoData (2).csv")
str(espresso)
## 'data.frame':    27 obs. of  2 variables:
##  $ cereme    : num  36.6 39.6 37.7 36 38.5 ...
##  $ brewmethod: int  1 1 1 1 1 1 1 1 1 2 ...
summary(espresso)
##      cereme        brewmethod
##  Min.   :21.02   Min.   :1   
##  1st Qu.:35.66   1st Qu.:1   
##  Median :38.52   Median :2   
##  Mean   :44.47   Mean   :2   
##  3rd Qu.:55.23   3rd Qu.:3   
##  Max.   :73.19   Max.   :3

#Check if the data is normally distributed

plot(density(espresso$cereme))

qqnorm(espresso$cereme)

#Check if there is any skewness in the data

library("moments")
agostino.test(espresso$cereme)
## 
##  D'Agostino skewness test
## 
## data:  espresso$cereme
## skew = 0.54679, z = 1.32787, p-value = 0.1842
## alternative hypothesis: data have a skewness

#Since the p value is > 0.05, we reject the alternative hypothesis.

#Normality test

shapiro.test(espresso$cereme)
## 
##  Shapiro-Wilk normality test
## 
## data:  espresso$cereme
## W = 0.92201, p-value = 0.04414

#Scatterplot of residuals

eruption.lm = lm(cereme ~ brewmethod, espresso) 
eruption.res = resid(eruption.lm)
plot(espresso$cereme, eruption.res, ylab="Residuals", xlab="brewmethod", main="Title") 
abline(0, 0)

bartlett.test(espresso$cereme, espresso$brewmethod)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  espresso$cereme and espresso$brewmethod
## Bartlett's K-squared = 0.96331, df = 2, p-value = 0.6178
tapply(espresso$cereme, espresso$brewmethod, var)
##         1         2         3 
##  53.29088 102.02220  59.30182

#We need to factorize as there are categorical variables in the data set.

m = (aov(cereme ~ factor(brewmethod), data = espresso))
summary(m)
##                    Df Sum Sq Mean Sq F value  Pr(>F)    
## factor(brewmethod)  2   4065  2032.6   28.41 4.7e-07 ***
## Residuals          24   1717    71.5                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

#We see that there is high corelation between the brewing method and the amount of cereme. So we will now seehow each brewing technique relates to each other using the pairwise t-test.

library(pgirmess)
pairwise.t.test(espresso$cereme, espresso$brewmethod, p.adjust.method = "bonferroni", paired = FALSE)
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  espresso$cereme and espresso$brewmethod 
## 
##   1       2      
## 2 5.2e-07 -      
## 3 0.24    4.4e-05
## 
## P value adjustment method: bonferroni

#Based on the values here, we can see technique 1 and 3 are similar, while 2 is not.

TukeyHSD(m)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = cereme ~ factor(brewmethod), data = espresso)
## 
## $`factor(brewmethod)`
##      diff        lwr       upr     p adj
## 2-1  28.9  18.942931  38.85707 0.0000005
## 3-1   7.3  -2.657069  17.25707 0.1811000
## 3-2 -21.6 -31.557069 -11.64293 0.0000419
aggregate(espresso[, 1], list(espresso$brewmethod), mean)
##   Group.1    x
## 1       1 32.4
## 2       2 61.3
## 3       3 39.7

Here we can see that method 1 and 3 have a similar mean, but are significantly different from method 2 mean.

#Summary: After running the analysis, we can say that the Espresso Data holds all the assumptions for ANOVA. We saw that the second brewing method produces the highest cereme, hence we failed to reject HO as the methods differ from each other.