# ANOVA = Analysis of Variance
# Variance = SD*SD
# Variance = sum((x - mean(x))^2)/(n-1)
# variance = MSS = mean sum of squares
# SD = sqrt(variance)
# Variance in dependent variable
# dependent variable (y) = yield
# Treatments = X1, X2
# Design: CRD, RCBD, Factorial (>1 factor)
# Split plot design: one factor is difficult to manage
# Replication: number of repeating the same treatment
# Randomization: unbiased
# Local control: keeping the same practice except the treatment
# Control group is compared with treatment
# Statistical analysis
# H0: average survival duration with breast cancer = survival with colon cancer = survival with ovary cancer
# Assumption: normality, equal variance, more importantly residual will have normal distribution
# Residual = error, if not normally distributed, it indicates that error is not random, it contains variance, i.e. variances have not been attributed to the independent variables properly.
# data
library(car)
## Loading required package: carData
library(sjstats)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some() masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(agricolae)
library(readxl)
cancer = read_excel('datasets.xlsx', sheet = 'anova', range = 'B11:D45')
glimpse(cancer)
## Rows: 34
## Columns: 3
## $ sr. <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Organ <chr> "Colon", "Breast", "Colon", "Ovary", "Breast", "Colon", "Brea…
## $ Survival <dbl> 776, 719, 180, 201, 40, 455, 1166, 1235, 20, 89, 727, 1804, 2…
# for anova, independent variables must factor (categorical)
cancer$Organ = as.factor(cancer$Organ)
glimpse(cancer)
## Rows: 34
## Columns: 3
## $ sr. <dbl> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18…
## $ Organ <fct> Colon, Breast, Colon, Ovary, Breast, Colon, Breast, Breast, C…
## $ Survival <dbl> 776, 719, 180, 201, 40, 455, 1166, 1235, 20, 89, 727, 1804, 2…
# anova model
mod = aov(Survival ~ Organ, data = cancer)
summary(mod)
## Df Sum Sq Mean Sq F value Pr(>F)
## Organ 2 5899086 2949543 3.762 0.0345 *
## Residuals 31 24304330 784011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p < 0.05, H0: rejected, at least one mean is different from others
# Assumption
# Normality
boxplot(Survival ~ Organ, data = cancer)

qqPlot(Survival ~ Organ, data = cancer)

# Equal variance
leveneTest(Survival ~ Organ, data = cancer)
## Levene's Test for Homogeneity of Variance (center = median)
## Df F value Pr(>F)
## group 2 2.937 0.06791 .
## 31
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# p > 0.05, H0 not rejected, variances are equal for all groups
# Residual analysis, 4 figures in 1 plot
par(mfrow = c(2,2))
plot(mod)

par(mfrow = c(1,1))
# Recall the anova model
mod
## Call:
## aov(formula = Survival ~ Organ, data = cancer)
##
## Terms:
## Organ Residuals
## Sum of Squares 5899086 24304330
## Deg. of Freedom 2 31
##
## Residual standard error: 885.4438
## Estimated effects may be unbalanced
summary(mod)
## Df Sum Sq Mean Sq F value Pr(>F)
## Organ 2 5899086 2949543 3.762 0.0345 *
## Residuals 31 24304330 784011
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Model CV and Response variable CV
# CV (%) = (SD/mean)*100
# Model CV = ((RMSE)/mean)*100
# RMSE = sqrt(MSE) = Residual Standard Error
# Response variable CV = (SD(y)/mean)*100
mean(cancer$Survival)
## [1] 836.3824
sd(cancer$Survival)
## [1] 956.6896
# SD is larger than mean, so CV will be > 100%
cv_response = (sd(cancer$Survival)/mean(cancer$Survival))*100
cv_model = (sqrt(784011)/mean(cancer$Survival))*100
cv_response
## [1] 114.3842
cv_model
## [1] 105.8659
# Model CV indicates the goodness of the model
# Model CV > 10%, good fit
# Model CV > 20, acceptable fit
# Model CV > 30, poor fit
# Variance reduced by the model = cv_response - cv_model = 8.5%
cv_response - cv_model
## [1] 8.518313
# 8.5% of variance has been reduced by the model
# Another explanation
# eta squared = Sum of Square of Dependent variable / Total Sum of Squares
# eta squared = (5899086)/(5899086+24304330) = 0.1953
# 19.53% of variance has been explained by the independent variables
# Post hoc test: will tell us which independent variable level is different from other
# H0: Breast cancer = Colon cancer = Ovary cancer
# H1: At least one mean is different from others
# But which means: Answer will be provided by post hoc test
# Post hoc test: Tukey HSD, lsd (least significant difference test), lsd with Bonferroni correction, DMRT
# For large sample: Tukey HSD is preferred (HSD = Honest Significant Difference). Tukey controls FWER (Family Wise Error Rate) i.e. keeps the overall alpha level at 0.05
# For small sample: DMRT is preferred (Duncan's Multiple Range Test)
# Bonferroni correction
agricolae::LSD.test(mod, 'Organ', alpha = 0.05, console = TRUE, p.adj = 'bonferroni')
##
## Study: mod ~ "Organ"
##
## LSD t Test for Survival
## P value adjustment method: bonferroni
##
## Mean Square Error: 784010.7
##
## Organ, means and individual ( 95 %) CI
##
## Survival std r se LCL UCL Min Max Q25 Q50
## Breast 1395.9091 1238.9667 11 266.9713 851.41745 1940.4007 24 3808 723.00 1166
## Colon 457.4118 427.1686 17 214.7517 19.42287 895.4007 20 1843 189.00 372
## Ovary 884.3333 1098.5788 6 361.4809 147.08817 1621.5785 89 2970 239.75 406
## Q75
## Breast 1692.5
## Colon 519.0
## Ovary 1039.5
##
## Alpha: 0.05 ; DF Error: 31
## Critical Value of t: 2.530926
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Treatments with the same letter are not significantly different.
##
## Survival groups
## Breast 1395.9091 a
## Ovary 884.3333 ab
## Colon 457.4118 b
# H0: Breast cancer = Ovary cancer
# H0: Ovary cancer = Colon cancer
# H0: Breast cancer != Colon cancer
# DMRT
agricolae::duncan.test(mod, 'Organ', alpha = 0.05, console = TRUE)
##
## Study: mod ~ "Organ"
##
## Duncan's new multiple range test
## for Survival
##
## Mean Square Error: 784010.7
##
## Organ, means
##
## Survival std r se Min Max Q25 Q50 Q75
## Breast 1395.9091 1238.9667 11 266.9713 24 3808 723.00 1166 1692.5
## Colon 457.4118 427.1686 17 214.7517 20 1843 189.00 372 519.0
## Ovary 884.3333 1098.5788 6 361.4809 89 2970 239.75 406 1039.5
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Means with the same letter are not significantly different.
##
## Survival groups
## Breast 1395.9091 a
## Ovary 884.3333 ab
## Colon 457.4118 b
# Identical to Bonferroni correction
# Tukey HSD
post_hoc = HSD.test(mod, 'Organ', alpha = 0.05, console = TRUE)
##
## Study: mod ~ "Organ"
##
## HSD Test for Survival
##
## Mean Square Error: 784010.7
##
## Organ, means
##
## Survival std r se Min Max Q25 Q50 Q75
## Breast 1395.9091 1238.9667 11 266.9713 24 3808 723.00 1166 1692.5
## Colon 457.4118 427.1686 17 214.7517 20 1843 189.00 372 519.0
## Ovary 884.3333 1098.5788 6 361.4809 89 2970 239.75 406 1039.5
##
## Alpha: 0.05 ; DF Error: 31
## Critical Value of Studentized Range: 3.480647
##
## Groups according to probability of means differences and alpha level( 0.05 )
##
## Treatments with the same letter are not significantly different.
##
## Survival groups
## Breast 1395.9091 a
## Ovary 884.3333 ab
## Colon 457.4118 b
# Identical to Bonferroni correction