# set working directory where both files are saved
setwd("C:/Users/sdecl/Downloads/ass10")
getwd()
## [1] "C:/Users/sdecl/Downloads/ass10"
# read csv
dat <- read.csv("KN_Data.csv", header = FALSE)
dat
## V1 V2 V3 V4 V5 V6 V7
## 1 Method 1 0.34 0.12 1.23 0.70 1.75 0.12
## 2 Method 2 0.91 2.94 2.14 2.36 2.86 4.55
## 3 Method 3 6.31 8.37 9.75 6.09 9.82 7.24
## 4 Method 4 17.15 11.82 10.97 17.20 14.35 16.82
We test whether the four methods give the same mean result, using the one-way ANOVA model, where \(\mu\) is the overall mean and \(\tau_i\) is the effect of method \(i\):
\[ y_{ij} = \mu + \tau_i + \varepsilon_{ij}, \quad \varepsilon_{ij} \sim N(0,\sigma^2) \]
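In terms of the treatment effects \(\tau_i\), the hypotheses are
\[ H_0: \tau_1 = \tau_2 = \tau_3 = \tau_4 = 0 \qquad \text{vs.} \qquad H_1: \tau_i \neq 0 \ \text{for at least one } i. \]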
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# reshape to long format: one row per observation
dat <- pivot_longer(dat, c(V2, V3, V4, V5, V6, V7))
dat <- dat[, -2]
# the factor column is named "Judge" here, but it holds the method labels
colnames(dat) <- c("Judge", "Proportion")
# drop any missing values and make sure the response is numeric
dat <- filter(dat, !is.na(Proportion))
dat$Proportion <- as.numeric(dat$Proportion)
head(dat)
## # A tibble: 6 × 2
## Judge Proportion
## <chr> <dbl>
## 1 Method 1 0.34
## 2 Method 1 0.12
## 3 Method 1 1.23
## 4 Method 1 0.7
## 5 Method 1 1.75
## 6 Method 1 0.12
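As a quick sanity check (not shown in the original output), the observations per method can be tallied; with a complete data set each method should contribute six values.
# count observations per method (sketch; output not shown)
table(dat$Judge)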
# boxplot of the data
boxplot(Proportion ~ Judge, data = dat,
xlab = "Judge", ylab = "Proportion",
main = "Boxplot of Observations")
Comment:
The boxplots show clear differences in location between the four methods
(Method 1 lowest, Method 4 highest), suggesting real differences in the
means. The spread also appears to increase with the level of the
response, which is worth checking in the residual plots.
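To back up this visual impression, a small dplyr sketch (not part of the original output) tabulates the per-method means and standard deviations:
# per-method mean and standard deviation (sketch; output not shown)
dat %>%
  group_by(Judge) %>%
  summarise(mean = mean(Proportion), sd = sd(Proportion), n = n())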
# one-way ANOVA on raw data
fit_raw <- aov(Proportion ~ Judge, data = dat)
summary(fit_raw)
## Df Sum Sq Mean Sq F value Pr(>F)
## Judge 3 708.7 236.2 76.29 4e-11 ***
## Residuals 20 61.9 3.1
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
Comment:
From the ANOVA table, the p-value is about 4e-11, far below 0.05, so I
reject \(H_0\) and conclude that at least one method mean differs from
the others.
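The overall F-test only says that some means differ. A natural follow-up, not part of the original analysis, is Tukey's HSD on the fitted aov object to see which pairs of methods differ:
# pairwise comparisons with a 5% family-wise error rate (sketch; output not shown)
TukeyHSD(fit_raw, conf.level = 0.95)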
# residual checks
res <- residuals(fit_raw)
fitv <- fitted(fit_raw)
par(mfrow=c(1,2))
plot(fitv, res, xlab="fitted", ylab="residual",
main="Residuals vs Fitted"); abline(h=0,lty=2)
qqnorm(res); qqline(res)
par(mfrow=c(1,1))
Comment:
The QQ plot is reasonably close to a straight line, but the residuals
vs fitted plot suggests the spread increases with the fitted values
(the methods with larger means also show larger variability in the raw
data), so the constant-variance assumption is doubtful. This motivates
the Box–Cox transformation below.
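Before transforming, the equal-variance assumption can also be checked formally. As a sketch (not in the original analysis), Bartlett's test compares the group variances; it is itself sensitive to non-normality, so treat it only as a rough guide.
# Bartlett's test of homogeneity of variances across methods (sketch; output not shown)
bartlett.test(Proportion ~ Judge, data = dat)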
# Box–Cox plot
library(MASS)
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
# passing 'main' to boxcox() only triggers a warning, so add the title separately
boxcox(lm(Proportion ~ Judge, data = dat))
title(main = "Box–Cox for Proportion ~ Judge")
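The \(\lambda\) maximising the profile log-likelihood can also be read off numerically. This sketch (not in the original) recomputes the profile without plotting; the exact value depends on the default grid of \(\lambda\) values.
# recompute the Box–Cox profile and pick the lambda with the highest log-likelihood (sketch)
bc <- boxcox(lm(Proportion ~ Judge, data = dat), plotit = FALSE)
bc$x[which.max(bc$y)]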
# choose lambda from the Box–Cox plot; lambda = 0 corresponds to the log transformation
lambda <- 0 # change if the plot suggests a different value (1 = no transformation)
# transform and re-run ANOVA
gy <- if (abs(lambda) < 1e-8) log(dat$Proportion) else
(dat$Proportion^lambda - 1)/lambda
dat$gy <- gy
fit_bc <- aov(gy ~ Judge, data = dat)
summary(fit_bc)
## Df Sum Sq Mean Sq F value Pr(>F)
## Judge 3 42.50 14.168 33.44 5.49e-08 ***
## Residuals 20 8.47 0.424
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# residual plots for transformed data
res_bc <- residuals(fit_bc)
fitv_bc <- fitted(fit_bc)
par(mfrow=c(1,2))
plot(fitv_bc, res_bc, xlab="fitted", ylab="residual",
main="Residuals vs Fitted (transformed)"); abline(h=0,lty=2)
qqnorm(res_bc); qqline(res_bc)
par(mfrow=c(1,1))
Comment:
After the log transformation, the ANOVA still gives a very small
p-value (≈ 5.49e-08), so the conclusion is unchanged: at least one
group mean is different.
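To check how much the transformation evened out the spread, a short sketch (not in the original output) compares the per-method standard deviations on the raw and log scales:
# per-method standard deviations before and after the log transformation (sketch; output not shown)
tapply(dat$Proportion, dat$Judge, sd)
tapply(dat$gy, dat$Judge, sd)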
# Kruskal–Wallis test
kruskal.test(Proportion ~ Judge, data = dat)
##
## Kruskal-Wallis rank sum test
##
## data: Proportion by Judge
## Kruskal-Wallis chi-squared = 21.156, df = 3, p-value = 9.771e-05
Comment:
The Kruskal–Wallis test, which does not rely on the normality
assumption, also gives a very small p-value (≈ 9.8e-05), so it agrees
with the ANOVA.
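If a non-parametric follow-up is wanted, one option (not part of the original analysis) is pairwise Wilcoxon rank-sum tests with a multiplicity correction; with only six observations per method, exact p-values may not be available if there are ties.
# pairwise Wilcoxon tests with Holm-adjusted p-values (sketch; output not shown)
pairwise.wilcox.test(dat$Proportion, dat$Judge, p.adjust.method = "holm")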
Overall:
There is strong evidence that not all four flood estimation methods have
the same mean discharge.