#Importing the data

espresso <- read.csv("EspressoData.csv")
head(espresso)
##   cereme brewmethod
## 1  36.64          1
## 2  39.65          1
## 3  37.74          1
## 4  35.96          1
## 5  38.52          1
## 6  21.02          1
# Research Question:
#Is there a significant difference amongst the 3 different brew methods in terms of the amount of cereme they generate?

#HYPOTHESIS:
#H0: There is NO significant difference amongst the 3 different brew methods in terms of the amount of cereme they generate
#Ha: There IS a significant difference amongst the 3 different brew methods in terms of the amount of cereme they generate

#Overall Distribution

plot(density(espresso$cereme))

qqnorm(espresso$cereme)
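
#A reference line makes the Q-Q plot easier to judge; as a small optional addition (drawn on the qqnorm plot above, not part of the original analysis):
qqline(espresso$cereme)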

#D’Agostino Skewness Test:

library(moments)
agostino.test(espresso$cereme)
## 
##  D'Agostino skewness test
## 
## data:  espresso$cereme
## skew = 0.54679, z = 1.32787, p-value = 0.1842
## alternative hypothesis: data have a skewness

#Shapiro-Wilk Normality Test

shapiro.test(espresso$cereme)
## 
##  Shapiro-Wilk normality test
## 
## data:  espresso$cereme
## W = 0.92201, p-value = 0.04414

#The D'Agostino test finds no significant skewness (p = 0.18). The Shapiro-Wilk p-value (0.044) is just below 0.05, but with only 27 observations and ANOVA being fairly robust to mild departures from normality, the data are treated as approximately normal and we proceed to our next assumption
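
#Strictly, ANOVA's normality assumption applies within each group (or to the model residuals); a quick per-group check, sketched here but not run as part of the original analysis:
by(espresso$cereme, espresso$brewmethod, shapiro.test)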

#Independence of Observations:

espresso.lm = lm(cereme ~ brewmethod, data = espresso)
espresso.res = resid(espresso.lm)
plot(espresso$brewmethod, espresso.res, xlab = "Brew Methods", ylab = "Residuals", main = "Cereme Quality")
abline(0, 0)

#The residuals above show no obvious pattern across brew methods, so the independence assumption looks reasonably met; we now proceed with our next assumption
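
#Independence is mainly a question of study design here (each shot is a separate observation), but if the rows were collected in a meaningful order, a Durbin-Watson test on the residuals is one formal check. This is only a sketch and assumes the lmtest package is installed; it was not part of the original analysis.
library(lmtest)
dwtest(espresso.lm)   # a statistic near 2 suggests no serial correlation in the residuals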

#Variance Equality

###Bartlett Test

bartlett.test(espresso$cereme, espresso$brewmethod)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  espresso$cereme and espresso$brewmethod
## Bartlett's K-squared = 0.96331, df = 2, p-value = 0.6178
tapply(espresso$cereme,espresso$brewmethod, var)
##         1         2         3 
##  53.29088 102.02220  59.30182
#We need the ratio (largest variance)/(smallest variance) to be < 3 for the homogeneity-of-variance assumption to hold well enough for ANOVA testing
#102.02/53.29 = 1.91, which is < 3, so the data are okay for ANOVA testing
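
#The same ratio can be computed directly rather than by hand; a minimal sketch (vars is an illustrative name, not part of the original script):
vars <- tapply(espresso$cereme, espresso$brewmethod, var)
max(vars) / min(vars)   # ~1.91, below the rule-of-thumb cutoff of 3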

#ANOVA TESTING

aov_model <- aov(cereme ~ factor(brewmethod), data = espresso)
summary(aov_model)
##                    Df Sum Sq Mean Sq F value  Pr(>F)    
## factor(brewmethod)  2   4065  2032.6   28.41 4.7e-07 ***
## Residuals          24   1717    71.5                    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
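
#An overall effect size (eta squared = SS_between / SS_total) can be read straight off the ANOVA table above; a quick sketch, not part of the original analysis (ss is an illustrative name):
ss <- summary(aov_model)[[1]][["Sum Sq"]]
ss[1] / sum(ss)   # ~0.70, i.e. about 70% of the variance in cereme is associated with brew method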

#POST-HOC TEST

library(pgirmess)

###BONFERRONI TEST

pairwise.t.test(espresso$cereme, espresso$brewmethod, paired = FALSE, p.adjust.method = "bonferroni" )
## 
##  Pairwise comparisons using t tests with pooled SD 
## 
## data:  espresso$cereme and espresso$brewmethod 
## 
##   1       2      
## 2 5.2e-07 -      
## 3 0.24    4.4e-05
## 
## P value adjustment method: bonferroni
#Here we can see that METHOD 1 and METHOD 2 have a significant difference,
#and METHOD 2 and METHOD 3 also have a significant difference

###KRUSKAL-WALLIS

kruskalmc(cereme ~factor(brewmethod), data = espresso)
## Multiple comparison test after Kruskal-Wallis 
## p.value: 0.05 
## Comparisons
##       obs.dif critical.dif difference
## 1-2 14.666667     8.957452       TRUE
## 1-3  3.666667     8.957452      FALSE
## 2-3 11.000000     8.957452       TRUE
#Here we see that METHOD 1 and METHOD 2 have TRUE in the difference column, showing a significant difference
#Similarly, METHOD 2 and METHOD 3 also have TRUE in the difference column, showing a significant difference
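
#kruskalmc() is the multiple-comparison follow-up to the Kruskal-Wallis test; the omnibus test itself is not shown above. As a sketch, it could be run with base R as:
kruskal.test(cereme ~ factor(brewmethod), data = espresso)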

###TUKEY TEST

TukeyHSD(aov_model)
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = cereme ~ factor(brewmethod), data = espresso)
## 
## $`factor(brewmethod)`
##      diff        lwr       upr     p adj
## 2-1  28.9  18.942931  38.85707 0.0000005
## 3-1   7.3  -2.657069  17.25707 0.1811000
## 3-2 -21.6 -31.557069 -11.64293 0.0000419
#Here we can see that METHOD 2 and METHOD 1 have a p-value < 0.001, showing that they are significantly different
#Similarly, METHOD 3 and METHOD 2 also have a p-value < 0.001, showing a significant difference
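
#The Tukey confidence intervals can also be inspected visually; a small optional addition (intervals that do not cross zero correspond to significant pairs):
plot(TukeyHSD(aov_model))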

#EFFECT SIZE

library(pastecs)
library(compute.es)
by(espresso$cereme, espresso$brewmethod, stat.desc)
## espresso$brewmethod: 1
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    9.0000000    0.0000000    0.0000000   21.0200000   39.6500000   18.6300000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  291.6000000   35.9600000   32.4000000    2.4333533    5.6113228   53.2908750 
##      std.dev     coef.var 
##    7.3000599    0.2253105 
## ------------------------------------------------------------ 
## espresso$brewmethod: 2
##      nbr.val     nbr.null       nbr.na          min          max        range 
##    9.0000000    0.0000000    0.0000000   46.6800000   73.1900000   26.5100000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##  551.7000000   62.5300000   61.3000000    3.3668680    7.7640115  102.0222000 
##      std.dev     coef.var 
##   10.1006039    0.1647733 
## ------------------------------------------------------------ 
## espresso$brewmethod: 3
##      nbr.val     nbr.null       nbr.na          min          max        range 
##     9.000000     0.000000     0.000000    32.680000    56.190000    23.510000 
##          sum       median         mean      SE.mean CI.mean.0.95          var 
##   357.300000    37.120000    39.700000     2.566923     5.919334    59.301825 
##      std.dev     coef.var 
##     7.700768     0.193974
#METHOD 1: n = 9, M = 32.4, std.dev = 7.3
#METHOD 2: n = 9, M = 61.3, std.dev = 10.1
#METHOD 3: n = 9, M = 39.7, std.dev = 7.7
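
#The means, SDs and ns passed to mes() below are typed in by hand from the stat.desc output above; they could equally be pulled out programmatically. A sketch only (m, s and n are illustrative names, not part of the original script):
m <- tapply(espresso$cereme, espresso$brewmethod, mean)
s <- tapply(espresso$cereme, espresso$brewmethod, sd)
n <- tapply(espresso$cereme, espresso$brewmethod, length)
mes(m[2], m[1], s[2], s[1], n[2], n[1])   # same comparison as mes1-2 below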

###mes1-2

mes(61.3,32.4,10.1,7.3,9,9)
## Mean Differences ES: 
##  
##  d [ 95 %CI] = 3.28 [ 1.86 , 4.69 ] 
##   var(d) = 0.52 
##   p-value(d) = 0 
##   U3(d) = 99.95 % 
##   CLES(d) = 98.98 % 
##   Cliff's Delta = 0.98 
##  
##  g [ 95 %CI] = 3.12 [ 1.78 , 4.47 ] 
##   var(g) = 0.47 
##   p-value(g) = 0 
##   U3(g) = 99.91 % 
##   CLES(g) = 98.64 % 
##  
##  Correlation ES: 
##  
##  r [ 95 %CI] = 0.87 [ 0.67 , 0.95 ] 
##   var(r) = 0 
##   p-value(r) = 0 
##  
##  z [ 95 %CI] = 1.32 [ 0.81 , 1.83 ] 
##   var(z) = 0.07 
##   p-value(z) = 0 
##  
##  Odds Ratio ES: 
##  
##  OR [ 95 %CI] = 383.22 [ 29.45 , 4987.18 ] 
##   p-value(OR) = 0 
##  
##  Log OR [ 95 %CI] = 5.95 [ 3.38 , 8.51 ] 
##   var(lOR) = 1.71 
##   p-value(Log OR) = 0 
##  
##  Other: 
##  
##  NNT = 1.26 
##  Total N = 18

###mes2-3

mes(61.3,39.7,10.1,7.7,9,9)
## Mean Differences ES: 
##  
##  d [ 95 %CI] = 2.41 [ 1.19 , 3.62 ] 
##   var(d) = 0.38 
##   p-value(d) = 0 
##   U3(d) = 99.19 % 
##   CLES(d) = 95.55 % 
##   Cliff's Delta = 0.91 
##  
##  g [ 95 %CI] = 2.29 [ 1.14 , 3.45 ] 
##   var(g) = 0.35 
##   p-value(g) = 0 
##   U3(g) = 98.9 % 
##   CLES(g) = 94.74 % 
##  
##  Correlation ES: 
##  
##  r [ 95 %CI] = 0.79 [ 0.51 , 0.92 ] 
##   var(r) = 0.01 
##   p-value(r) = 0 
##  
##  z [ 95 %CI] = 1.06 [ 0.56 , 1.57 ] 
##   var(z) = 0.07 
##   p-value(z) = 0 
##  
##  Odds Ratio ES: 
##  
##  OR [ 95 %CI] = 78.46 [ 8.69 , 707.96 ] 
##   p-value(OR) = 0 
##  
##  Log OR [ 95 %CI] = 4.36 [ 2.16 , 6.56 ] 
##   var(lOR) = 1.26 
##   p-value(Log OR) = 0 
##  
##  Other: 
##  
##  NNT = 1.35 
##  Total N = 18

###mes1-3

mes(39.7,32.4,7.7,7.3,9,9)
## Mean Differences ES: 
##  
##  d [ 95 %CI] = 0.97 [ 0 , 1.95 ] 
##   var(d) = 0.25 
##   p-value(d) = 0.07 
##   U3(d) = 83.47 % 
##   CLES(d) = 75.43 % 
##   Cliff's Delta = 0.51 
##  
##  g [ 95 %CI] = 0.93 [ 0 , 1.86 ] 
##   var(g) = 0.23 
##   p-value(g) = 0.07 
##   U3(g) = 82.29 % 
##   CLES(g) = 74.38 % 
##  
##  Correlation ES: 
##  
##  r [ 95 %CI] = 0.46 [ -0.01 , 0.76 ] 
##   var(r) = 0.03 
##   p-value(r) = 0.07 
##  
##  z [ 95 %CI] = 0.5 [ -0.01 , 1 ] 
##   var(z) = 0.07 
##   p-value(z) = 0.07 
##  
##  Odds Ratio ES: 
##  
##  OR [ 95 %CI] = 5.84 [ 0.99 , 34.36 ] 
##   p-value(OR) = 0.07 
##  
##  Log OR [ 95 %CI] = 1.76 [ -0.01 , 3.54 ] 
##   var(lOR) = 0.82 
##   p-value(Log OR) = 0.07 
##  
##  Other: 
##  
##  NNT = 2.84 
##  Total N = 18
#Cohen's d = 0.97
#Hedges' g = 0.93
#r = 0.46
#Since Cohen's d (0.97) is above the conventional 0.8 benchmark, the effect for METHOD 1 vs METHOD 3 is large, even though this mean difference itself is not statistically significant (p = 0.07)


#SUMMARY:
#From the above we can conclude that brew methods 1 and 2 differ significantly, and brew methods 2 and 3 also differ significantly: their p-values in the Tukey test are below 0.001, both pairs show TRUE in the difference column of the Kruskal-Wallis multiple comparisons, and the Bonferroni-adjusted pairwise t-tests agree.
#Checking the effect sizes, all three comparisons (1-2, 2-3, 1-3) have Cohen's d above the conventional 0.8 benchmark, indicating large effects, although the 1-3 difference is not statistically significant.