library(BSDA)
## Warning: package 'BSDA' was built under R version 4.5.1
## Cargando paquete requerido: lattice
## 
## Adjuntando el paquete: 'BSDA'
## The following object is masked from 'package:datasets':
## 
##     Orange
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.1
# leer los datos
D = read.csv("DatosEquipo7_filtrado.csv")

D$p34_2_bin = ifelse(D$p34_2 %in% c(1), "Sí", "No")
D$p34_3_bin = ifelse(D$p34_3 %in% c(1), "Sí", "No")


va1 = subset(D, p34_2_bin == "Sí")
va2 = subset(D, p34_3_bin == "Sí")

decirsi = rbind(data.frame(grupo = "Sí_p34_2", p4 = va1$p4, p26 = va1$p26), data.frame(grupo = "Sí_p34_3", p4 = va2$p4, p26 = va2$p26))


# grafica de caja para minutos de lectura
boxplot(p26 ~ grupo, data = decirsi, main = "Minutos de lectura (p26): Sí_p34_2 vs Sí_p34_3", xlab = " ", ylab = "Minutos de lectura continua", col = c("blue", "green"))

# grafica de caja para  Libros leidos por año
boxplot(p4 ~ grupo, data = decirsi, main = "Libros leídos (p4): Sí_p34_2 vs Sí_p34_3", xlab = " ", ylab = "Número de libros leídos", col = c("blue", "green"))

boxplot(p26 ~ grupo, data = decirsi, main = "Comparación p26 con medias", xlab = " ", ylab = "Minutos de lectura", col = c("blue", "green")) 
points(tapply(decirsi$p26, decirsi$grupo, mean, na.rm = TRUE),pch = 19, col = "red")  # puntos rojos = medias

boxplot(p4 ~ grupo, data = decirsi, main = "Comparación p4 con medias", xlab = " ", ylab = "Libros leídos por año", col = c("blue", "green"))
points(tapply(decirsi$p4, decirsi$grupo, mean, na.rm = TRUE), pch = 19, col = "red")  # puntos rojos = medias

# Para p4 (libros leídos al año)
ic_p4 = z.test(va1$p4, va2$p4, sigma.x=sd(va1$p4), sigma.y = sd(va2$p4), conf.level = 0.96)
ic_p4
## 
##  Two-sample z-Test
## 
## data:  va1$p4 and va2$p4
## z = -0.96899, p-value = 0.3326
## alternative hypothesis: true difference in means is not equal to 0
## 96 percent confidence interval:
##  -0.6121911  0.2196949
## sample estimates:
## mean of x mean of y 
##  1.763211  1.959459
# Para p26 (minutos continuos de lectura)
ic_p26 = z.test(va1$p26, va2$p26, sigma.x=sd(va1$p26), sigma.y = sd(va2$p26), conf.level = 0.96)
ic_p26
## 
##  Two-sample z-Test
## 
## data:  va1$p26 and va2$p26
## z = -1.4107, p-value = 0.1583
## alternative hypothesis: true difference in means is not equal to 0
## 96 percent confidence interval:
##  -6.097674  1.131732
## sample estimates:
## mean of x mean of y 
##  34.63415  37.11712
cat("Intervalo de confianza 96% para la diferencia de medias: \n")
## Intervalo de confianza 96% para la diferencia de medias:
cat("\nVariable p4 (libros leídos al año): \n")
## 
## Variable p4 (libros leídos al año):
cat("Diferencia de medias =",round(ic_p4$estimate[1] - ic_p4$estimate[2], 2), "\nIC 96% = [",round(ic_p4$conf.int[1], 2), ", ",round(ic_p4$conf.int[2], 2), "]\n")
## Diferencia de medias = -0.2 
## IC 96% = [ -0.61 ,  0.22 ]
cat("\nVariable p26 (minutos continuos de lectura): \n")
## 
## Variable p26 (minutos continuos de lectura):
cat("Diferencia de medias =", round(ic_p26$estimate[1] - ic_p26$estimate[2], 2), "\nIC 96% = [", round(ic_p26$conf.int[1], 2), ", ", round(ic_p26$conf.int[2], 2), "]\n")
## Diferencia de medias = -2.48 
## IC 96% = [ -6.1 ,  1.13 ]
LI_p26 <- ic_p26$conf.int[1]
LS_p26 <- ic_p26$conf.int[2]

plot(0, ylim = c(0, 2), xlim = c(LI_p26 - 1, LS_p26 + 1), yaxt = "n", ylab = "",
     main = "Intervalo de confianza para la diferencia de medias p26")
axis(2, at = 1, labels = "Intervalo")
arrows(LI_p26, 1, LS_p26, 1, angle = 90, code = 3, length = 0.1, lwd = 2, col = "blue")

abline(v = ic_p26$estimate[1] - ic_p26$estimate[2], col = "red", lwd = 2)

LI_p4 <- ic_p4$conf.int[1]
LS_p4 <- ic_p4$conf.int[2]

plot(0, ylim = c(0, 2), xlim = c(LI_p4 - 1, LS_p4 + 1), yaxt = "n", ylab = "",
     main = "Intervalo de confianza para la diferencia de medias p4")
axis(2, at = 1, labels = "Intervalo")
arrows(LI_p4, 1, LS_p4, 1, angle = 90, code = 3, length = 0.1, lwd = 2, col = "blue")

abline(v = ic_p4$estimate[1] - ic_p4$estimate[2], col = "red", lwd = 2)

Normalidad

shapiro.test(va1$p4)
## 
##  Shapiro-Wilk normality test
## 
## data:  va1$p4
## W = 0.4352, p-value < 2.2e-16
shapiro.test(va2$p26)
## 
##  Shapiro-Wilk normality test
## 
## data:  va2$p26
## W = 0.80049, p-value < 2.2e-16

Se rechaza la hipótesis nula de que la las variables tienen distribución normal.

Varianzas

var.test(va1$p4,va1$p26)
## 
##  F test to compare two variances
## 
## data:  va1$p4 and va1$p26
## F = 0.012163, num df = 983, denom df = 983, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.0107331 0.0137843
## sample estimates:
## ratio of variances 
##          0.0121634
var.test(va2$p4,va2$p26)
## 
##  F test to compare two variances
## 
## data:  va2$p4 and va2$p26
## F = 0.013916, num df = 665, denom df = 665, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.01195228 0.01620304
## sample estimates:
## ratio of variances 
##         0.01391629
var.test(D$p4,D$p26)
## 
##  F test to compare two variances
## 
## data:  D$p4 and D$p26
## F = 0.0099502, num df = 2015, denom df = 2015, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.009117967 0.010858488
## sample estimates:
## ratio of variances 
##        0.009950243
var.test(D$p4,D$p26)
## 
##  F test to compare two variances
## 
## data:  D$p4 and D$p26
## F = 0.0099502, num df = 2015, denom df = 2015, p-value < 2.2e-16
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
##  0.009117967 0.010858488
## sample estimates:
## ratio of variances 
##        0.009950243

Frecuencias esperadas para chi cuadrada

tabla <- table(D$p34_2, D$p5)
chi <- chisq.test(tabla, correct = FALSE)
## Warning in chisq.test(tabla, correct = FALSE): Chi-squared approximation may be
## incorrect
# Ver frecuencias esperadas
chi$expected
##    
##             0          1          2          3          4         5         6
##   0  28.13938  2.0282738  2.0049603   4.802579   7.506944  2.238095 0.2797619
##   1 589.13095 42.4642857 41.9761905 100.547619 157.166667 46.857143 5.8571429
##   2 575.95933 41.5148810 41.0376984  98.299603 153.652778 45.809524 5.7261905
##   3  13.77034  0.9925595  0.9811508   2.350198   3.673611  1.095238 0.1369048
# Verificar condición
if (all(chi$expected >= 5)) {
  print("Usar chi-cuadrado")
} else {
  print("Usar Fisher")
}
## [1] "Usar Fisher"
D$p34_2_fusion <- factor(D$p34_2_bin)

D$p34_3_fusion <- factor(D$p34_3_bin)

D$p5_fusion <- as.character(D$p5)

D$p5_fusion[D$p5 %in% c(1, 2)] <- "Trabajo"
D$p5_fusion[D$p5 %in% c(3, 4)] <- "Entretenimiento"
D$p5_fusion[D$p5 %in% c(0, 6)] <- "Otro"
D$p5_fusion[D$p5 == 5] <- "Religión"

D$p5_fusion <- factor(D$p5_fusion)

tabla_p34_2 <- table(D$p34_2_fusion, D$p5_fusion)
chi_p34_2 <- chisq.test(tabla_p34_2, correct = FALSE)

# Ver frecuencias esperadas
chi_p34_2$expected
##     
##      Entretenimiento     Otro Religión  Trabajo
##   No        270.2857 624.0119 49.14286 88.55952
##   Sí        257.7143 594.9881 46.85714 84.44048
# Verificar condición
if (all(chi_p34_2$expected >= 5)) {
  print("Usar chi-cuadrado")
} else {
  print("Usar Fisher")
}
## [1] "Usar chi-cuadrado"
tabla_p34_3 <- table(D$p34_3_fusion, D$p5_fusion)
chi_p34_3 <- chisq.test(tabla_p34_3, correct = FALSE)

# Ver frecuencias esperadas
chi_p34_3$expected
##     
##      Entretenimiento     Otro Religión   Trabajo
##   No        353.5714 816.2946 64.28571 115.84821
##   Sí        174.4286 402.7054 31.71429  57.15179
# Verificar condición
if (all(chi_p34_3$expected >= 5)) {
  print("Usar chi-cuadrado")
} else {
  print("Usar Fisher")
}
## [1] "Usar chi-cuadrado"

Fusionando las categorías podemos usar chi-cuadrado.

chi_p34_3
## 
##  Pearson's Chi-squared test
## 
## data:  tabla_p34_3
## X-squared = 131.24, df = 3, p-value < 2.2e-16
chi_p34_2
## 
##  Pearson's Chi-squared test
## 
## data:  tabla_p34_2
## X-squared = 140.63, df = 3, p-value < 2.2e-16
# Crear la tabla de contingencia
empleo <- matrix(c(273, 27,
                  261, 39,
                  356, 44),
                nrow = 3,
                byrow = TRUE,
                dimnames = list(
                  Vuelo = c("Delta", "United", "US Airways"),
                  Variable = c("Demorado", "Tiempo")
                ))

# Prueba de independencia
resultado <- chisq.test(empleo)

# Mostrar el estadístico redondeado
round(resultado$statistic, 2)
## X-squared 
##      2.45
# Valor p
resultado$p.value
## [1] 0.2935402
resultado
## 
##  Pearson's Chi-squared test
## 
## data:  empleo
## X-squared = 2.4515, df = 2, p-value = 0.2935
valor_critico <- qchisq(p = 0.99, df = 2)
round(valor_critico, 2)
## [1] 9.21
z_ci <- function(x, conf.level = 0.95) {
  n <- length(x)
  mean_x <- mean(x, na.rm = TRUE)
  sd_x <- sd(x, na.rm = TRUE)
  alpha <- 1 - conf.level
  z <- qnorm(1 - alpha/2)
  error <- z * sd_x / sqrt(n)
  ci <- c(mean_x - error, mean_x + error)
  return(list(mean = mean_x, ci = ci))
  
}
  
# Nivel de confianza
conf.level <- 0.96

# Subgrupos
grupo_2 <- subset(D, p34_2_bin == "Sí")
grupo_3 <- subset(D, p34_3_bin == "Sí")

# Intervalos de confianza individuales
ic_p4_g2 <- z_ci(grupo_2$p4, conf.level)
ic_p4_g3 <- z_ci(grupo_3$p4, conf.level)
ic_p26_g2 <- z_ci(grupo_2$p26, conf.level)
ic_p26_g3 <- z_ci(grupo_3$p26, conf.level)

z_diff_ci <- function(x1, x2, conf.level = 0.95) {
  n1 <- length(x1)
  n2 <- length(x2)
  m1 <- mean(x1, na.rm = TRUE)
  m2 <- mean(x2, na.rm = TRUE)
  sd1 <- sd(x1, na.rm = TRUE)
  sd2 <- sd(x2, na.rm = TRUE)
  alpha <- 1 - conf.level
  z <- qnorm(1 - alpha/2)
  se_diff <- sqrt((sd1^2 / n1) + (sd2^2 / n2))
  diff <- m1 - m2
  error <- z * se_diff
  ci <- c(diff - error, diff + error)
  return(list(diff = diff, ci = ci))
}

# Diferencias de medias
ic_diff_p4 <- z_diff_ci(grupo_2$p4, grupo_3$p4, conf.level)
ic_diff_p26 <- z_diff_ci(grupo_2$p26, grupo_3$p26, conf.level)

cat("Media de p4 en grupo p34_2 = Sí:\n")
## Media de p4 en grupo p34_2 = Sí:
cat("Media =", ic_p4_g2$mean, "\n Intervalo de confianza 96% = [", ic_p4_g2$ci[1], ",", ic_p4_g2$ci[2], "]\n")
## Media = 1.763211 
##  Intervalo de confianza 96% = [ 1.51578 , 2.010642 ]
cat("Media de p4 en grupo p34_3 = Sí:\n")
## Media de p4 en grupo p34_3 = Sí:
cat("Media =", ic_p4_g3$mean, "\n Intervalo de confianza 96% = [", ic_p4_g3$ci[1], ",", ic_p4_g3$ci[2], "]\n")
## Media = 1.959459 
##  Intervalo de confianza 96% = [ 1.625115 , 2.293804 ]
cat("Media de p26 en grupo p34_2 = Sí:\n")
## Media de p26 en grupo p34_2 = Sí:
cat("Media =", ic_p26_g2$mean, "\n Intervalo de confianza 96% = [", ic_p26_g2$ci[1], ",", ic_p26_g2$ci[2], "]\n")
## Media = 34.63415 
##  Intervalo de confianza 96% = [ 32.39064 , 36.87765 ]
cat("Media de p26 en grupo p34_3 = Sí:\n")
## Media de p26 en grupo p34_3 = Sí:
cat("Media =", ic_p26_g3$mean, "\n Intervalo de confianza 96% = [", ic_p26_g3$ci[1], ",", ic_p26_g3$ci[2], "]\n")
## Media = 37.11712 
##  Intervalo de confianza 96% = [ 34.2829 , 39.95133 ]
cat("Diferencia de medias de p4 entre grupos:\n")
## Diferencia de medias de p4 entre grupos:
cat("Diferencia =", ic_diff_p4$diff, "\n Intervalo de confianza 96% = [", ic_diff_p4$ci[1], ",", ic_diff_p4$ci[2], "]\n")
## Diferencia = -0.1962481 
##  Intervalo de confianza 96% = [ -0.6121911 , 0.2196949 ]
cat("Diferencia de medias de p26 entre grupos:\n")
## Diferencia de medias de p26 entre grupos:
cat("Diferencia =", ic_diff_p26$diff, "\n Intervalo de confianza 96% = [", ic_diff_p26$ci[1], ",", ic_diff_p26$ci[2], "]\n")
## Diferencia = -2.482971 
##  Intervalo de confianza 96% = [ -6.097674 , 1.131732 ]
# Crear dataframe con los resultados
df_ic <- data.frame(
  Variable = c("p4 (p34_2)", "p4 (p34_3)", "p26 (p34_2)", "p26 (p34_3)",
               "Diferencia p4", "Diferencia p26"),
  Media = c(ic_p4_g2$mean, ic_p4_g3$mean, ic_p26_g2$mean, ic_p26_g3$mean,
            ic_diff_p4$diff, ic_diff_p26$diff),
  IC_inf = c(ic_p4_g2$ci[1], ic_p4_g3$ci[1], ic_p26_g2$ci[1], ic_p26_g3$ci[1],
             ic_diff_p4$ci[1], ic_diff_p26$ci[1]),
  IC_sup = c(ic_p4_g2$ci[2], ic_p4_g3$ci[2], ic_p26_g2$ci[2], ic_p26_g3$ci[2],
             ic_diff_p4$ci[2], ic_diff_p26$ci[2])
)

ggplot(df_ic, aes(y = Variable, x = Media)) +
  geom_point(size = 3, color = "darkblue") +
  geom_errorbarh(aes(xmin = IC_inf, xmax = IC_sup), height = 0.2, color = "steelblue") +
  geom_vline(xintercept = 0, linetype = "dashed", color = "gray50") +
  labs(title = "Intervalos de confianza al 96%",
       x = "Media estimada (con IC)",
       y = "") +
  theme_minimal(base_size = 14)
## Warning: `geom_errobarh()` was deprecated in ggplot2 4.0.0.
## ℹ Please use the `orientation` argument of `geom_errorbar()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `height` was translated to `width`.