#-------------------------------------------------------
# Pruebas de hipótesis con t-test y Wilcoxon en R
# Ejemplos 100% reproducibles con datasets base de R
#-------------------------------------------------------

suppressPackageStartupMessages({
  library(nortest)   # Lilliefors
})

#-----------------------------------------
# Representación gráfica de ideas de hipótesis
# (desplazamientos de distribución)
#-----------------------------------------

#--------------------------
# X ~ Normal(15, 3)
#-------------------------
par(mfrow=c(1,1))
mu <- 15
sigma <- 3
t  <- seq(mu - 4*sigma, mu + 4*sigma, length=1000)
plot(t, pnorm(t, mu, sigma), col=2, type="l",
     ylab="Función de distribución",
     main=expression(X %~% Normal(15,3)))
points(15, pnorm(13, mu, sigma), pch=16, col=4)  # F_Y(15)=F_X(13)
points(13, pnorm(13, mu, sigma), pch=16, col=2)
legend(12, 0.30,  bty="n",
       legend=c(expression(F),
                expression(F == F)),
       pch=16, col=c(2,4), cex=0.9)

#------------------------------
# X ~ Exponencial (lambda = 3)
#------------------------------
par(mfrow=c(1,2))
lambda <- 3
t <- seq(0, 2, length=1000)
plot(t, dexp(t, lambda), col=2, type="l",
     ylab="Densidad", main=expression(X %~% Exp(lambda==3)))
points(0.3, dexp(0.3, lambda), pch=16, col=2)
points(0.4, dexp(0.3, lambda), pch=16, col=4)
legend(0.65, 2.5,  bty="n",
       legend=c(expression(f[X](0.3)),
                expression(f[Y](0.4) == f[X](0.3))),
       pch=16, col=c(2,4), cex=0.9)

t <- seq(0, 2, length=1000)
plot(t, pexp(t, lambda), ylim=c(0,1), col=2, type="l",
     ylab="Distribución", main=expression(X %~% Exp(lambda==3)))
points(0.3, pexp(0.3, lambda), pch=16, col=2)
points(0.4, pexp(0.3, lambda), pch=16, col=4)
legend(1.05, 0.70,  bty="n",
       legend=c(expression(F[X](0.3)),
                expression(F[Y](0.4) == F[X](0.3))),
       pch=16, col=c(2,4), cex=0.9)

#--------------------------------------
# X ~ Weibull(alpha=2, beta=3)
#--------------------------------------
par(mfrow=c(1,1))
alpha <- 2
beta  <- 3
x <- seq(0, 10, length=200)
plot(x, dweibull(x, shape=alpha, scale=beta), type="l",
     ylab="Densidad",
     main=expression(X %~% Weibull(alpha==2, beta==3)))

#------------------------------------------------
# EJEMPLO REPRODUCIBLE (SIN ARCHIVOS): iris
# Dos grupos: setosa vs versicolor, variable: Sepal.Length
#------------------------------------------------
dat <- subset(iris, Species %in% c("setosa","versicolor"))
dat$Species <- droplevels(dat$Species)

# Vectores por grupo
x1 <- dat$Sepal.Length[dat$Species=="setosa"]
x2 <- dat$Sepal.Length[dat$Species=="versicolor"]

#-----------------------------
# t de dos muestras (igual var)
#-----------------------------
tt_eq <- t.test(x1, x2, alternative="two.sided",
                mu=0, conf.level=0.95, var.equal=TRUE)
tt_eq
## 
##  Two Sample t-test
## 
## data:  x1 and x2
## t = -10.521, df = 98, p-value < 2.2e-16
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -1.1054165 -0.7545835
## sample estimates:
## mean of x mean of y 
##     5.006     5.936
# Comprobaciones de supuestos
lillie.test(x1)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x1
## D = 0.11486, p-value = 0.09693
lillie.test(x2)
## 
##  Lilliefors (Kolmogorov-Smirnov) normality test
## 
## data:  x2
## D = 0.096241, p-value = 0.2942
var.test(Sepal.Length ~ Species, data=dat)
## 
##  F test to compare two variances
## 
## data:  Sepal.Length by Species
## F = 0.46634, num df = 49, denom df = 49, p-value = 0.008657
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.2646385 0.8217841
## sample estimates:
## ratio of variances 
##          0.4663429
bartlett.test(Sepal.Length ~ Species, data=dat)
## 
##  Bartlett test of homogeneity of variances
## 
## data:  Sepal.Length by Species
## Bartlett's K-squared = 6.8917, df = 1, p-value = 0.00866
#-------------------------------
# Wilcoxon (Mann-Whitney)
#-------------------------------
wilcox.test(Sepal.Length ~ Species, data=dat,
            alternative="two.sided", conf.level=0.95)
## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  Sepal.Length by Species
## W = 168.5, p-value = 8.346e-14
## alternative hypothesis: true location shift is not equal to 0
# Mann-Whitney "a mano" (rango-suma)
Respuesta <- dat$Sepal.Length
Trat <- dat$Species  # factor con 2 niveles
rango <- rank(Respuesta)
W1 <- sum(rango[Trat==levels(Trat)[1]])  # setosa
W2 <- sum(rango[Trat==levels(Trat)[2]])  # versicolor
n  <- sum(Trat==levels(Trat)[1])
m  <- sum(Trat==levels(Trat)[2])
U1 <- W1 - (n*(n+1))/2
U2 <- W2 - (m*(m+1))/2
cat("Rangos-suma: W1=", W1, " W2=", W2, " | U1=", U1, " U2=", U2, "\n")
## Rangos-suma: W1= 1443.5  W2= 3606.5  | U1= 168.5  U2= 2331.5