# ANOVA = Analysis of Variance
# Variance = SD*SD
# Variance = sum((x - mean(x))^2)/(n-1)
# variance  = MSS = mean sum of squares
# SD = sqrt(variance)

# Variance in dependent variable
# dependent variable (y) = yield
# Treatments = X1, X2
# Design: CRD, RCBD, Factorial (>1 factor)
# Split plot design: one factor is difficult to manage
# Replication: number of repeating the same treatment 
# Randomization: unbiased
# Local control: keeping the same practice except the treatment
# Control group is compared with treatment
# Statistical analysis

# H0: average survival duration with breast cancer = survival with colon cancer = survival with ovary cancer

# Assumption: normality, equal variance, more importantly residual will have normal distribution

# Residual = error, if not normally distributed, it indicates that error is not random, it contains variance, i.e. variances have not been attributed to the independent variables properly.

# data
library(car)
## Loading required package: carData
library(sjstats)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(agricolae)
library(readxl)

cancer = read_excel('datasets.xlsx', sheet = 'anova', range = 'B11:D45')
glimpse(cancer)
## Rows: 34
## Columns: 3
## $ sr.      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Organ    <chr> "Colon", "Breast", "Colon", "Ovary", "Breast", "Colon", "Brea…
## $ Survival <dbl> 776, 719, 180, 201, 40, 455, 1166, 1235, 20, 89, 727, 1804, 2…
# for anova, independent variables must factor (categorical)

cancer$Organ = as.factor(cancer$Organ)

glimpse(cancer)
## Rows: 34
## Columns: 3
## $ sr.      <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Organ    <fct> Colon, Breast, Colon, Ovary, Breast, Colon, Breast, Breast, C…
## $ Survival <dbl> 776, 719, 180, 201, 40, 455, 1166, 1235, 20, 89, 727, 1804, 2…
# anova model
mod = aov(Survival ~ Organ, data = cancer)
summary(mod)
##             Df   Sum Sq Mean Sq F value Pr(>F)  
## Organ        2  5899086 2949543   3.762 0.0345 *
## Residuals   31 24304330  784011                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, H0: rejected, at least one mean is different from others

# Assumption
# Normality
boxplot(Survival ~ Organ, data = cancer)

qqPlot(Survival ~ Organ, data = cancer)

# Equal variance
leveneTest(Survival ~ Organ, data = cancer)
## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value  Pr(>F)  
## group  2   2.937 0.06791 .
##       31                  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p > 0.05, H0 not rejected, variances are equal for all groups

# Residual analysis, 4 figures in 1 plot
par(mfrow = c(2,2))
plot(mod)

par(mfrow = c(1,1))

# Recall the anova model
mod
## Call:
##    aov(formula = Survival ~ Organ, data = cancer)
## 
## Terms:
##                    Organ Residuals
## Sum of Squares   5899086  24304330
## Deg. of Freedom        2        31
## 
## Residual standard error: 885.4438
## Estimated effects may be unbalanced
summary(mod)
##             Df   Sum Sq Mean Sq F value Pr(>F)  
## Organ        2  5899086 2949543   3.762 0.0345 *
## Residuals   31 24304330  784011                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model CV and Response variable CV
# CV (%) = (SD/mean)*100

# Model CV = ((RMSE)/mean)*100
# RMSE = sqrt(MSE) = Residual Standard Error
# Response variable CV = (SD(y)/mean)*100

mean(cancer$Survival)
## [1] 836.3824
sd(cancer$Survival)
## [1] 956.6896
# SD is larger than mean, so CV will be > 100%

cv_response = (sd(cancer$Survival)/mean(cancer$Survival))*100
cv_model = (sqrt(784011)/mean(cancer$Survival))*100
cv_response
## [1] 114.3842
cv_model
## [1] 105.8659
# Model CV indicates the goodness of the model
# Model CV > 10%, good fit
# Model CV > 20, acceptable fit
# Model CV > 30, poor fit

# Variance reduced by the model = cv_response - cv_model = 8.5%
cv_response - cv_model
## [1] 8.518313
# 8.5% of variance has been reduced by the model

# Another explanation
# eta squared = Sum of Square of Dependent variable / Total Sum of Squares
# eta squared = (5899086)/(5899086+24304330) = 0.1953
# 19.53% of variance has been explained by the independent variables

# Post hoc test: will tell us which independent variable level is different from other
# H0: Breast cancer = Colon cancer = Ovary cancer
# H1: At least one mean is different from others
# But which means: Answer will be provided by post hoc test
# Post hoc test: Tukey HSD, lsd (least significant difference test), lsd with Bonferroni correction, DMRT
# For large sample: Tukey HSD is preferred (HSD = Honest Significant Difference). Tukey controls FWER (Family Wise Error Rate) i.e. keeps the overall alpha level at 0.05
# For small sample: DMRT is preferred (Duncan's Multiple Range Test)

# Bonferroni correction
agricolae::LSD.test(mod, 'Organ', alpha = 0.05, console = TRUE, p.adj = 'bonferroni')
## 
## Study: mod ~ "Organ"
## 
## LSD t Test for Survival 
## P value adjustment method: bonferroni 
## 
## Mean Square Error:  784010.7 
## 
## Organ,  means and individual ( 95 %) CI
## 
##         Survival       std  r       se       LCL       UCL Min  Max    Q25  Q50
## Breast 1395.9091 1238.9667 11 266.9713 851.41745 1940.4007  24 3808 723.00 1166
## Colon   457.4118  427.1686 17 214.7517  19.42287  895.4007  20 1843 189.00  372
## Ovary   884.3333 1098.5788  6 361.4809 147.08817 1621.5785  89 2970 239.75  406
##           Q75
## Breast 1692.5
## Colon   519.0
## Ovary  1039.5
## 
## Alpha: 0.05 ; DF Error: 31
## Critical Value of t: 2.530926 
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Treatments with the same letter are not significantly different.
## 
##         Survival groups
## Breast 1395.9091      a
## Ovary   884.3333     ab
## Colon   457.4118      b
# H0: Breast cancer = Ovary cancer 
# H0: Ovary cancer = Colon cancer
# H0: Breast cancer != Colon cancer

# DMRT
agricolae::duncan.test(mod, 'Organ', alpha = 0.05, console = TRUE)
## 
## Study: mod ~ "Organ"
## 
## Duncan's new multiple range test
## for Survival 
## 
## Mean Square Error:  784010.7 
## 
## Organ,  means
## 
##         Survival       std  r       se Min  Max    Q25  Q50    Q75
## Breast 1395.9091 1238.9667 11 266.9713  24 3808 723.00 1166 1692.5
## Colon   457.4118  427.1686 17 214.7517  20 1843 189.00  372  519.0
## Ovary   884.3333 1098.5788  6 361.4809  89 2970 239.75  406 1039.5
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Means with the same letter are not significantly different.
## 
##         Survival groups
## Breast 1395.9091      a
## Ovary   884.3333     ab
## Colon   457.4118      b
# Identical to Bonferroni correction

# Tukey HSD
post_hoc = HSD.test(mod, 'Organ', alpha = 0.05, console = TRUE)
## 
## Study: mod ~ "Organ"
## 
## HSD Test for Survival 
## 
## Mean Square Error:  784010.7 
## 
## Organ,  means
## 
##         Survival       std  r       se Min  Max    Q25  Q50    Q75
## Breast 1395.9091 1238.9667 11 266.9713  24 3808 723.00 1166 1692.5
## Colon   457.4118  427.1686 17 214.7517  20 1843 189.00  372  519.0
## Ovary   884.3333 1098.5788  6 361.4809  89 2970 239.75  406 1039.5
## 
## Alpha: 0.05 ; DF Error: 31 
## Critical Value of Studentized Range: 3.480647 
## 
## Groups according to probability of means differences and alpha level( 0.05 )
## 
## Treatments with the same letter are not significantly different.
## 
##         Survival groups
## Breast 1395.9091      a
## Ovary   884.3333     ab
## Colon   457.4118      b
# Identical to Bonferroni correction