Kruskal

\[ H_0: \theta_1 = \theta_2 = \cdots = \theta_K \] \[ H_a: \theta_j = \theta_{j'}, j \neq j' \]

############################################################

# Objetivo:
#  1) Explicar con código ANOVA de una vía y sus supuestos
#  2) Si fallan supuestos, aplicar prueba de Kruskal–Wallis
#  3) Hacer comparaciones múltiples:
#     - Paramétricas (Tukey HSD)
#     - No paramétricas (Dunn con ajuste)
#  4) Graficar (boxplots, medias±IC, diagnósticos)
############################################################

#-----------------------------------------------------------
# 0. Paquetes (descomenta para instalar si hace falta)
#-----------------------------------------------------------
# install.packages(c("ggplot2","dplyr","car","FSA","rcompanion","ggpubr"))
# Nota: 'FSA' incluye dunnTest(); 'rcompanion' ayuda a generar letras de grupos

library(ggplot2)
library(dplyr)

## 
## Adjuntando el paquete: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

library(car)          # leveneTest

## Warning: package 'car' was built under R version 4.4.3

## Cargando paquete requerido: carData

## 
## Adjuntando el paquete: 'car'

## The following object is masked from 'package:dplyr':
## 
##     recode

library(FSA)          # dunnTest

## Warning: package 'FSA' was built under R version 4.4.3

## Registered S3 methods overwritten by 'FSA':
##   method       from
##   confint.boot car 
##   hist.boot    car

## ## FSA v0.10.0. See citation('FSA') if used in publication.
## ## Run fishR() for related website and fishR('IFAR') for related book.

## 
## Adjuntando el paquete: 'FSA'

## The following object is masked from 'package:car':
## 
##     bootCase

library(rcompanion)   # cldList para letras de comparación múltiple

## Warning: package 'rcompanion' was built under R version 4.4.3

library(ggpubr)       # utilidades de gráfico (IC, anotaciones, etc.)

## Warning: package 'ggpubr' was built under R version 4.4.3

#-----------------------------------------------------------
# 1. Datos de ejemplo
#   Usaremos PlantGrowth (incluido en R): 3 grupos (ctrl, trt1, trt2)
#   Respuesta: weight
#-----------------------------------------------------------
data("PlantGrowth")
df <- PlantGrowth %>% 
  rename(grupo = group, y = weight)

# Vista rápida
head(df)

##      y grupo
## 1 4.17  ctrl
## 2 5.58  ctrl
## 3 5.18  ctrl
## 4 6.11  ctrl
## 5 4.50  ctrl
## 6 4.61  ctrl

summary(df)

##        y          grupo   
##  Min.   :3.590   ctrl:10  
##  1st Qu.:4.550   trt1:10  
##  Median :5.155   trt2:10  
##  Mean   :5.073            
##  3rd Qu.:5.530            
##  Max.   :6.310

#-----------------------------------------------------------
# 2. Visualización inicial: boxplot + puntos
#-----------------------------------------------------------
p0 <- ggplot(df, aes(x = grupo, y = y)) +
  geom_boxplot(width = 0.6, outlier.shape = NA) +
  geom_jitter(width = 0.08, alpha = 0.7) +
  labs(title = "Distribución por grupo", x = "Grupo", y = "Respuesta (y)") +
  theme_minimal(base_size = 13)
print(p0)

#-----------------------------------------------------------
# 3. ANOVA de una vía
#   Supuestos: (i) normalidad de residuos, (ii) homogeneidad de varianzas
#-----------------------------------------------------------
m.aov <- aov(y ~ grupo, data = df)
summary(m.aov)

##             Df Sum Sq Mean Sq F value Pr(>F)  
## grupo        2  3.766  1.8832   4.846 0.0159 *
## Residuals   27 10.492  0.3886                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# Residuos y supuestos
resid_aov <- residuals(m.aov)
fit_aov   <- fitted(m.aov)

# 3.1 Normalidad de residuos (Shapiro–Wilk)
shapiro.test(resid_aov)

## 
##  Shapiro-Wilk normality test
## 
## data:  resid_aov
## W = 0.96607, p-value = 0.4379

# 3.2 Homogeneidad de varianzas (Levene)
leveneTest(y ~ grupo, data = df)

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.1192 0.3412
##       27

# Gráficos de diagnóstico ANOVA
par(mfrow = c(1,2))
plot(fit_aov, resid_aov,
     xlab = "Ajustados", ylab = "Residuos",
     main = "Residuos vs Ajustados"); abline(h=0,lty=2)
qqnorm(resid_aov, main = "QQ-plot de residuos"); qqline(resid_aov, lty=2)

par(mfrow = c(1,1))

#-----------------------------------------------------------
# 4. Comparaciones múltiples paramétricas (si ANOVA es significativo y supuestos OK)
#   Tukey HSD
#-----------------------------------------------------------
# Tabla desde TukeyHSD
#tuk_tbl <- as.data.frame(tuk$grupo)

# Estandarizar nombres de columnas que vienen con espacio o con punto
# (según la versión de R/locale puede ser "p adj" o "p.adj")
#if ("p adj" %in% names(tuk_tbl)) {
#  names(tuk_tbl)[names(tuk_tbl) == "p adj"] <- "p.value"
#} else if ("p.adj" %in% names(tuk_tbl)) {
#  names(tuk_tbl)[names(tuk_tbl) == "p.adj"] <- "p.value"
#}

# Crear columna Comparison con el formato "A-B" a partir de los rownames
#tuk_tbl$Comparison <- rownames(tuk_tbl)

# Generar letras de grupos (Tukey, alfa=0.05)
#tuk_letters <- rcompanion::cldList(p.value ~ Comparison,
#                                   data = tuk_tbl,
#                                   threshold = 0.05)
#tuk_letters

# Letras de grupos para Tukey (facilita anotar gráficos)
# Pasamos p-valores ajustados a un formato que cldList pueda usar
#tuk_tbl <- as.data.frame(tuk$grupo)
#tuk_tbl$comparacion <- rownames(tuk_tbl)
# cldList espera columnas: Comparison, p.value
#tuk_letters <- cldList(p.value ~ comparacion, data = tuk_tbl, threshold = 0.05)
#tuk_letters

#-----------------------------------------------------------
# 5. Estimación de medias e IC al 95% por grupo (paramétrico)
#-----------------------------------------------------------
sum_stats <- df %>%
  group_by(grupo) %>%
  summarize(
    n = n(),
    media = mean(y),
    sd = sd(y),
    se = sd / sqrt(n),
    IC_inf = media - qt(0.975, df=n-1)*se,
    IC_sup = media + qt(0.975, df=n-1)*se,
    .groups = "drop"
  )
sum_stats

## # A tibble: 3 × 7
##   grupo     n media    sd    se IC_inf IC_sup
##   <fct> <int> <dbl> <dbl> <dbl>  <dbl>  <dbl>
## 1 ctrl     10  5.03 0.583 0.184   4.61   5.45
## 2 trt1     10  4.66 0.794 0.251   4.09   5.23
## 3 trt2     10  5.53 0.443 0.140   5.21   5.84

p1 <- ggplot(sum_stats, aes(grupo, media)) +
  geom_point(size = 3) +
  geom_errorbar(aes(ymin = IC_inf, ymax = IC_sup), width = 0.12) +
  labs(title = "Medias por grupo ± IC (95%)", x = "Grupo", y = "Media estimada") +
  theme_minimal(base_size = 13)
print(p1)

#-----------------------------------------------------------
# 6. Kruskal–Wallis (alternativa no paramétrica)
#   Úsala cuando ANOVA no es adecuado (violación de supuestos) o como contraste robusto
#-----------------------------------------------------------
kw <- kruskal.test(y ~ grupo, data = df)
kw

## 
##  Kruskal-Wallis rank sum test
## 
## data:  y by grupo
## Kruskal-Wallis chi-squared = 7.9882, df = 2, p-value = 0.01842

#-----------------------------------------------------------
# 7. Comparaciones múltiples no paramétricas
#   Dunn con ajuste (Benjamini-Hochberg por defecto; puedes usar "bonferroni")
#-----------------------------------------------------------
dunn <- dunnTest(y ~ grupo, data = df, method = "bh")
dunn

## Dunn (1964) Kruskal-Wallis multiple comparison

##   p-values adjusted with the Benjamini-Hochberg method.

##    Comparison         Z    P.unadj      P.adj
## 1 ctrl - trt1  1.117725 0.26368427 0.26368427
## 2 ctrl - trt2 -1.689290 0.09116394 0.13674592
## 3 trt1 - trt2 -2.807015 0.00500029 0.01500087

# Letras de grupos a partir de Dunn (para resumir diferencias)
# 'dunn$res' tiene columnas: Comparison y P.adj
dunn_letters <- cldList(P.adj ~ Comparison, data = dunn$res, threshold = 0.05)
dunn_letters

##   Group Letter MonoLetter
## 1  ctrl     ab         ab
## 2  trt1      a         a 
## 3  trt2      b          b

#-----------------------------------------------------------
# 8. Gráfico con anotaciones de significancia (opcional)
#   Aquí usamos pareo por pares con Wilcoxon para dibujar p-values en el plot.
#   Nota: Es visual; el análisis oficial de no-paramétricas lo reportamos con Dunn.
#-----------------------------------------------------------
comparisons <- list(c("ctrl","trt1"), c("ctrl","trt2"), c("trt1","trt2"))

p2 <- ggplot(df, aes(grupo, y)) +
  geom_boxplot(width = 0.6, outlier.shape = NA) +
  geom_jitter(width = 0.08, alpha = 0.7) +
  stat_compare_means(method = "kruskal.test", label.y = max(df$y) + 0.3) +  # global KW
  stat_compare_means(comparisons = comparisons, method = "wilcox.test",
                     p.adjust.method = "BH", label = "p.signif") +
  labs(title = "Kruskal–Wallis (global) + Wilcoxon pareado (visual)",
       x = "Grupo", y = "Respuesta (y)") +
  theme_minimal(base_size = 13)
print(p2)

## Warning in wilcox.test.default(c(4.17, 5.58, 5.18, 6.11, 4.5, 4.61, 5.17, :
## cannot compute exact p-value with ties

#-----------------------------------------------------------
# 9. RESUMEN (impresión al final)
#-----------------------------------------------------------
cat("\n================= RESUMEN =================\n")

## 
## ================= RESUMEN =================

cat("ANOVA:\n"); print(summary(m.aov))

## ANOVA:

##             Df Sum Sq Mean Sq F value Pr(>F)  
## grupo        2  3.766  1.8832   4.846 0.0159 *
## Residuals   27 10.492  0.3886                 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

cat("\nSupuesto normalidad (Shapiro): p-valor =", shapiro.test(resid_aov)$p.value, "\n")

## 
## Supuesto normalidad (Shapiro): p-valor = 0.4378986

cat("Supuesto homogeneidad (Levene):\n"); print(leveneTest(y ~ grupo, data = df))

## Supuesto homogeneidad (Levene):

## Levene's Test for Homogeneity of Variance (center = median)
##       Df F value Pr(>F)
## group  2  1.1192 0.3412
##       27

#cat("\nTukey HSD (paramétrico):\n"); print(tuk)
#cat("\nLetras (Tukey, alfa=0.05):\n"); print(tuk_letters)

cat("\nKruskal–Wallis:\n"); print(kw)

## 
## Kruskal–Wallis:

## 
##  Kruskal-Wallis rank sum test
## 
## data:  y by grupo
## Kruskal-Wallis chi-squared = 7.9882, df = 2, p-value = 0.01842

cat("\nDunn (no paramétrico, BH):\n"); print(dunn)

## 
## Dunn (no paramétrico, BH):

## Dunn (1964) Kruskal-Wallis multiple comparison
##   p-values adjusted with the Benjamini-Hochberg method.

##    Comparison         Z    P.unadj      P.adj
## 1 ctrl - trt1  1.117725 0.26368427 0.26368427
## 2 ctrl - trt2 -1.689290 0.09116394 0.13674592
## 3 trt1 - trt2 -2.807015 0.00500029 0.01500087

cat("\nLetras (Dunn, alfa=0.05):\n"); print(dunn_letters)

## 
## Letras (Dunn, alfa=0.05):

##   Group Letter MonoLetter
## 1  ctrl     ab         ab
## 2  trt1      a         a 
## 3  trt2      b          b

# Nota de uso:
# - Reporta el ANOVA si los supuestos se cumplen; si no, reporta KW y las comparaciones Dunn.
# - Las “letras” resumen qué grupos difieren (grupos con la misma letra no difieren a alfa=0.05).

Kruskal

Jhonier Rangel