El objetivo es estimar funciones discriminantes y logísticos para determinación del sexo
library(pacman)
## Warning: package 'pacman' was built under R version 4.5.2
library(haven)
## Warning: package 'haven' was built under R version 4.5.2
## El siguiente archivo esta en formato SPSS, y lo abrimos mediante la libreria haven
p_load(haven,dplyr,ggplot2,MASS)
p_load(tinytex)
Hombro <- read_sav("Datos hombro.sav")
Hombro$sexoN <- factor(Hombro$sexoN,
levels = c(2, 1),
labels = c("Mujer", "Hombre"))
table (Hombro$sexoN) ## Frecuencias de sexo
##
## Mujer Hombre
## 30 50
Resumen por sexo de la información de Longitud Máxima de la Clavicula Derecha
res_lmcd <- Hombro %>%
group_by(sexoN) %>%
summarise(
n = sum(!is.na(LMCD)),
media = mean(LMCD, na.rm = TRUE),
sd = sd(LMCD, na.rm = TRUE)
) %>%
mutate(across(c(media, sd), ~round(.x, 2)))
res_lmcd
## # A tibble: 2 × 4
## sexoN n media sd
## <fct> <int> <dbl> <dbl>
## 1 Mujer 25 131. 7.56
## 2 Hombre 35 151. 10.2
ggplot(Hombro, aes(x = LMCD, fill = sexoN)) +
geom_density(alpha = 0.5) +
labs(
title = "Gráfica 1. Longitud máxima de la clavicula derecha por sexo",
x = "Longitud máxima de la clavicula derecha (mm)",
y = "Densidad"
) +
theme_minimal()
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_density()`).
ggplot(Hombro, aes(x = sexoN, y = LMCD, fill = sexoN)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Longitud máxima de la clavicula derecha",
x = "Sexo",
y = " "
) +
theme_minimal()
## Warning: Removed 20 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
# C12CD
Resumen por sexo de la información de la Circunferencia en punto medio de la Clavicula Derecha
res_c12cd <- Hombro %>%
group_by(sexoN) %>%
summarise(
n = sum(!is.na(C12CD)),
media = mean(C12CD, na.rm = TRUE),
sd = sd(C12CD, na.rm = TRUE)
) %>%
mutate(across(c(media, sd), ~round(.x, 2)))
res_c12cd
## # A tibble: 2 × 4
## sexoN n media sd
## <fct> <int> <dbl> <dbl>
## 1 Mujer 29 30.3 2.83
## 2 Hombre 46 35.9 2.33
ggplot(Hombro, aes(x = C12CD, fill = sexoN)) +
geom_density(alpha = 0.5) +
labs(
title = "Gráfica 1. Circunferencia de punto medio de la clavicula derecha por sexo",
x = "Circunferencia de punto medio de la clavicula derecha (mm)",
y = "Densidad"
) +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_density()`).
ggplot(Hombro, aes(x = sexoN, y = C12CD, fill = sexoN)) +
geom_boxplot(alpha = 0.7) +
labs(
title = "Circunferencia del punto medio de la clavicula derecha",
x = "Sexo",
y = " "
) +
theme_minimal()
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_boxplot()`).
p_load(effsize)
#### **LMCD**
cohen.d(Hombro$LMCD, Hombro$sexoN,na.rm = TRUE)
##
## Cohen's d
##
## d estimate: -2.148866 (large)
## 95 percent confidence interval:
## lower upper
## -2.803803 -1.493929
cohen.d(Hombro$C12CD, Hombro$sexoN,na.rm = TRUE)
##
## Cohen's d
##
## d estimate: -2.231244 (large)
## 95 percent confidence interval:
## lower upper
## -2.827185 -1.635303
t.test(LMCD ~ sexoN, data = Hombro, var.equal = TRUE)
##
## Two Sample t-test
##
## data: LMCD by sexoN
## t = -8.2061, df = 58, p-value = 2.767e-11
## alternative hypothesis: true difference in means between group Mujer and group Hombre is not equal to 0
## 95 percent confidence interval:
## -24.53489 -14.91249
## sample estimates:
## mean in group Mujer mean in group Hombre
## 131.3026 151.0263
t.test(LMCD ~ sexoN, data = Hombro, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: LMCD by sexoN
## t = -8.6158, df = 57.876, p-value = 5.838e-12
## alternative hypothesis: true difference in means between group Mujer and group Hombre is not equal to 0
## 95 percent confidence interval:
## -24.30633 -15.14104
## sample estimates:
## mean in group Mujer mean in group Hombre
## 131.3026 151.0263
## la alternativa no paramétrica
wilcox.test(LMCD ~ sexoN, data = Hombro)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: LMCD by sexoN
## W = 43.5, p-value = 3.584e-09
## alternative hypothesis: true location shift is not equal to 0
t.test(C12CD ~ sexoN, data = Hombro, var.equal = TRUE)
##
## Two Sample t-test
##
## data: C12CD by sexoN
## t = -9.4101, df = 73, p-value = 3.183e-14
## alternative hypothesis: true difference in means between group Mujer and group Hombre is not equal to 0
## 95 percent confidence interval:
## -6.849137 -4.454997
## sample estimates:
## mean in group Mujer mean in group Hombre
## 30.28626 35.93832
t.test(LMCD ~ sexoN, data = Hombro, var.equal = FALSE)
##
## Welch Two Sample t-test
##
## data: LMCD by sexoN
## t = -8.6158, df = 57.876, p-value = 5.838e-12
## alternative hypothesis: true difference in means between group Mujer and group Hombre is not equal to 0
## 95 percent confidence interval:
## -24.30633 -15.14104
## sample estimates:
## mean in group Mujer mean in group Hombre
## 131.3026 151.0263
## la alternativa no paramétrica
wilcox.test(C12CD ~ sexoN, data = Hombro)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: C12CD by sexoN
## W = 76, p-value = 1.154e-10
## alternative hypothesis: true location shift is not equal to 0
by(Hombro$LMCD, Hombro$sexoN, shapiro.test)
## Hombro$sexoN: Mujer
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.97687, p-value = 0.8169
##
## ------------------------------------------------------------
## Hombro$sexoN: Hombre
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.94899, p-value = 0.1055
var.test(LMCD ~ sexoN, data = Hombro)
##
## F test to compare two variances
##
## data: LMCD by sexoN
## F = 0.55358, num df = 24, denom df = 34, p-value = 0.1344
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.266830 1.206637
## sample estimates:
## ratio of variances
## 0.5535798
by(Hombro$C12CD, Hombro$sexoN, shapiro.test)
## Hombro$sexoN: Mujer
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95394, p-value = 0.2313
##
## ------------------------------------------------------------
## Hombro$sexoN: Hombre
##
## Shapiro-Wilk normality test
##
## data: dd[x, ]
## W = 0.95329, p-value = 0.06285
var.test(C12CD ~ sexoN, data = Hombro)
##
## F test to compare two variances
##
## data: C12CD by sexoN
## F = 1.4724, num df = 28, denom df = 45, p-value = 0.2424
## alternative hypothesis: true ratio of variances is not equal to 1
## 95 percent confidence interval:
## 0.7673317 2.9822806
## sample estimates:
## ratio of variances
## 1.47242
Hombro_sinNA <- na.omit(Hombro[, c("sexoN", "LMCD","C12CD")])
lda1 <- lda(sexoN ~ LMCD, data = Hombro_sinNA)
a <- coef(lda1) ## El factor que multiplica a LMHD
pred0 <- predict(lda1,
newdata = data.frame(LMCD = 0))
b <- pred0$x # valor de D cuando LMCD=0 es b
pred <- predict(lda1) ## Predicción de sexo para todos los valores de medias de la función discriminante por grupo
centroide_H <- mean(pred$x[Hombro_sinNA$sexoN == "Hombre"])
centroide_M <- mean(pred$x[Hombro_sinNA$sexoN == "Mujer"])
# Punto de corte (promedio de centroides, priors iguales)
cutoff <- mean(c(centroide_H, centroide_M))
cat("Función discriminante: D(x) = ", round(a, 4), " * LMCD + ", round(b, 4), "\n")
## Función discriminante: D(x) = 0.1089 * LMCD + -15.5587
cat("Punto de corte=", round(cutoff, 4), "Si D>",round(cutoff, 4), "es Hombre")
## Punto de corte= -0.1791 Si D> -0.1791 es Hombre
cat("Función discriminante: D(x) = ", round(a, 4), " * C12CD + ", round(b, 4), "\n")
## Función discriminante: D(x) = 0.1089 * C12CD + -15.5587
cat("Punto de corte=", round(cutoff, 4), "Si D>",round(cutoff, 4), "es Hombre")
## Punto de corte= -0.1791 Si D> -0.1791 es Hombre
tabla_clas <- table(Observado = Hombro_sinNA$sexoN,
Predicho = pred$class)
tabla_clas
## Predicho
## Observado Mujer Hombre
## Mujer 22 3
## Hombre 3 32
prop_clas <- sum(diag(tabla_clas)) / sum(tabla_clas)*100
cat("El porcentaje de clasificación correcta es ",round(prop_clas,1),"%")
## El porcentaje de clasificación correcta es 90 %
Hombro_sinNA <- na.omit(Hombro[, c("sexoN", "LMCD","C12CD")])
lda2 <- lda(sexoN ~ LMCD+C12CD, data = Hombro_sinNA)
##Ajustar Discriminante Lineal
# Obtener los coeficientes a y b
a1 <- coef(lda2)[1]
a2 <- coef(lda2)[2]
lda2
## Call:
## lda(sexoN ~ LMCD + C12CD, data = Hombro_sinNA)
##
## Prior probabilities of groups:
## Mujer Hombre
## 0.4166667 0.5833333
##
## Group means:
## LMCD C12CD
## Mujer 131.3026 30.31206
## Hombre 151.0263 35.77151
##
## Coefficients of linear discriminants:
## LD1
## LMCD 0.0727066
## C12CD 0.2559868
# Predicción de valores mediante modelo
pred2 <- predict(lda2)
table(Real = Hombro_sinNA$sexoN, Predicho = pred2$class)
## Predicho
## Real Mujer Hombre
## Mujer 24 1
## Hombre 1 34
mean(pred2$class == Hombro_sinNA$sexoN)*100
## [1] 96.66667
b <- predict(lda2,
newdata = data.frame(LMCD = 0, C12CD = 0))$x
b
## LD1
## 1 -18.95782
centroide_H <- mean(pred2$x[Hombro_sinNA$sexoN == "Hombre"])
centroide_M <- mean(pred2$x[Hombro_sinNA$sexoN == "Mujer"])
cutoff <- mean(c(centroide_H, centroide_M))
cutoff
## [1] -0.2359658
cat("Función discriminante: D(x) = ", round(a1, 4), " * LMCD + ", round(a2, 4),"*C12CD +",round(b, 4), "\n")
## Función discriminante: D(x) = 0.0727 * LMCD + 0.256 *C12CD + -18.9578
cat("Punto de corte=", round(cutoff, 4), "Si D>",round(cutoff, 4), "es Hombre")
## Punto de corte= -0.236 Si D> -0.236 es Hombre
ggplot(Hombro_sinNA, aes(x = LMCD, y = C12CD, color = sexoN)) +
geom_point(size = 3) +
theme_minimal() +
labs(title = "Discriminante lineal: LMCD vs C12CD")
a1 <- lda2$scaling["LMCD", 1]
a2 <- lda2$scaling["C12CD", 1]
b <- as.numeric(predict(lda2, newdata = data.frame(LMCD = 0, C12CD = 0))$x) - cutoff
ggplot(Hombro_sinNA, aes(x = LMCD, y = C12CD, color = sexoN)) +
geom_point(size = 3) +
geom_abline(slope = -(a1/a2), intercept = -b/a2, color = "black", linetype = "dashed") +
theme_minimal() +
labs(title = "Frontera de clasificación LDA")
LD1 <- pred2$x[, 1]
ggplot(data.frame(LD1, sexoN = Hombro_sinNA$sexoN),
aes(x = LD1, fill = sexoN)) +
geom_density(alpha = 0.4) +
geom_vline(xintercept = cutoff, linetype = "dashed") +
scale_x_continuous(limits = c(-5, 5)) + # <-- escala de -5 a 5
theme_minimal() +
labs(title = "Distribución sobre la función discriminante",
x = "LD1 (puntuación discriminante)",
y = "Densidad")
Hombro_sinNA <- na.omit(Hombro[, c("sexoN", "LMCD","C12CD")])
modelo1 <- glm(sexoN ~ LMCD + C12CD,
data = Hombro_sinNA,
family = binomial(link = "logit"))
summary(modelo1)
##
## Call:
## glm(formula = sexoN ~ LMCD + C12CD, family = binomial(link = "logit"),
## data = Hombro_sinNA)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -69.0885 22.9591 -3.009 0.00262 **
## LMCD 0.3067 0.1117 2.747 0.00602 **
## C12CD 0.8022 0.3405 2.356 0.01847 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 81.503 on 59 degrees of freedom
## Residual deviance: 18.731 on 57 degrees of freedom
## AIC: 24.731
##
## Number of Fisher Scoring iterations: 8
Hombro_sinNA$prob_Hombre <- predict(modelo1, type = "response")
Hombro_sinNA$predicho <- ifelse(Hombro_sinNA$prob_Hombre >= 0.5, "Hombre", "Mujer")
table(Real = Hombro_sinNA$sexoN, Predicho = Hombro_sinNA$predicho)
## Predicho
## Real Hombre Mujer
## Mujer 1 24
## Hombre 34 1
mean(Hombro_sinNA$sexoN == Hombro_sinNA$predicho) * 100
## [1] 96.66667
ggplot(Hombro_sinNA, aes(x = prob_Hombre, fill = sexoN)) +
geom_density(alpha = 0.4) +
geom_vline(xintercept = 0.5, linetype = "dashed") +
theme_minimal() +
labs(title = "Probabilidades predichas de ser Hombre",
x = "P(Hombre)", y = "Densidad")
coef(modelo1)
## (Intercept) LMCD C12CD
## -69.0884686 0.3067252 0.8022468
cat("Ecuación logística:\nlogit(p) = ",
round(coef(modelo1)[1], 4), " + ",
round(coef(modelo1)[2], 4), "*LMCD + ",
round(coef(modelo1)[3], 4), "*C12CD\n")
## Ecuación logística:
## logit(p) = -69.0885 + 0.3067 *LMCD + 0.8022 *C12CD