summary(cars)
## speed dist
## Min. : 4.0 Min. : 2.00
## 1st Qu.:12.0 1st Qu.: 26.00
## Median :15.0 Median : 36.00
## Mean :15.4 Mean : 42.98
## 3rd Qu.:19.0 3rd Qu.: 56.00
## Max. :25.0 Max. :120.00
### El objetivo es aplicar modelos de regresión lineal para
### estimar la estatura a partir de longitud (es) de las extremidades
## Defino mi directorio de trabajo
setwd("~/mis proyectos")
##Abriendo paquete pacman
library(pacman)
## El archivo esta en formato SPSS, lo abrimos mediante la libreria haven
p_load(haven,dplyr,ggplot2,tinytex,tidyr,GGally,purrr,labelled)
Cholula <- read_sav("Cholula.sav")
# 1. Seleccionar variables
vars <- Cholula %>% dplyr::select(X11, X14:X21)
# 2. Extraer etiquetas SPSS
etiquetas_raw <- map_chr(names(vars), ~{
lab <- attr(vars[[.x]], "label")
if (is.null(lab)) .x else lab
})
etiquetas <- make.unique(etiquetas_raw, sep = "_")
# 3. Renombrar
df <- vars %>% rename_with(~etiquetas)
# 4. Función para generar gráfica por variable
graficar_cor <- function(y_var){
datos <- df %>% dplyr::select(`Estatura total`, all_of(y_var))
prueba <- cor.test(datos[[1]], datos[[2]], use = "pairwise.complete.obs")
r <- round(prueba$estimate, 3)
p <- formatC(prueba$p.value, digits = 3, format = "e")
ggplot(datos, aes(x = `Estatura total`, y = .data[[y_var]])) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = FALSE) +
labs(
title = paste0("Estatura vs ", y_var),
subtitle = paste0("r = ", r, " | p = ", p),
x = "Estatura total",
y = y_var
) +
theme_minimal(base_size = 14)
}
# 5. Generar todas las gráficas
graficas <- map(names(df)[-1], graficar_cor)
graficas
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

##
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

##
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

##
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

##
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'

##
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'

##
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

##
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Ajustaré el modelo en función de la longitud de pierna
## Estatura total (X11)∼Longitud de la pierna (X20)
datos <- Cholula %>% dplyr::select(X11, X20)
# Extraer etiquetas SPSS
lab_X11 <- attr(datos$X11, "label")
lab_X20 <- attr(datos$X20, "label")
## Aquí estoy ajustando el modelo de mínimos cuadrados o clásico
modelo <- lm(X11 ~ X20, data = datos)
summary(modelo)
##
## Call:
## lm(formula = X11 ~ X20, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -153.19 -20.60 -0.71 20.99 105.58
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 890.0304 29.3368 30.34 <2e-16 ***
## X20 2.0589 0.0836 24.63 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 35.46 on 335 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.6442, Adjusted R-squared: 0.6432
## F-statistic: 606.6 on 1 and 335 DF, p-value: < 2.2e-16
coef <- summary(modelo)$coefficients
round(coef, 4)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 890.0304 29.3368 30.3384 0
## X20 2.0589 0.0836 24.6292 0
op <- par() # guarda la configuración actual del panel de gráfica
par(mar = c(4, 4, 2, 1)) # reduce márgenes
par(mfrow = c(2, 2))
plot(modelo)

par(op) # regresa a la configuración original
## Warning in par(op): el parámetro del gráfico "cin" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "cra" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "csi" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "cxy" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "din" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "page" no puede ser especificado
## Graficando modelo de regresión
ggplot(datos, aes(x = X20, y = X11)) +
geom_point(alpha = 0.6) +
geom_smooth(method = "lm", se = TRUE, color = "blue") +
labs(
title = paste0(lab_X11, " en función de ", lab_X20),
x = lab_X20,
y = lab_X11
) +
theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Se va a realizar el diagnóstico del modelo
# Calcular valores diagnósticos
# Extraer solo datos usados por el modelo
datos_modelo <- model.frame(modelo)
# Diagnósticos
diagnosticos <- data.frame(
estatura = datos_modelo$X11,
pierna = datos_modelo$X20,
residuales = residuals(modelo),
residuales_est = rstandard(modelo),
leverage = hatvalues(modelo),
cooks = cooks.distance(modelo)
)
# Identificar posibles outliers por residuales estandarizados ---
outliers_residuales <- diagnosticos[abs(diagnosticos$residuales_est) > 2, ]
# Identificar puntos con alto leverage ---
# Regla: leverage > 2p/n (p = nº de parámetros del modelo)
p <- length(coefficients(modelo))
n <- nrow(datos)
umbral_leverage <- 2 * p / n
outliers_leverage <- diagnosticos[diagnosticos$leverage > umbral_leverage, ]
# 6. Identificar puntos influyentes por distancia de Cook ---
# Regla común: Cook > 4/n
umbral_cook <- 4 / n
outliers_cook <- diagnosticos[diagnosticos$cooks > umbral_cook, ]
# Mostrar resultados ---
cat("---- OUTLIERS POR RESIDUALES ESTANDARIZADOS ----\n")
## ---- OUTLIERS POR RESIDUALES ESTANDARIZADOS ----
print(outliers_residuales)
## estatura pierna residuales residuales_est leverage cooks
## 51 1697 348 90.46313 2.554882 0.002993508 0.009799277
## 58 1711 362 75.63816 2.136997 0.003745237 0.008583940
## 152 1493 328 -72.35833 -2.046340 0.005698703 0.012000061
## 201 1497 330 -72.47619 -2.049188 0.005228114 0.011034560
## 234 1740 363 102.57923 2.898360 0.003882294 0.016370137
## 242 1702 356 78.99172 2.231085 0.003156308 0.007880516
## 254 1778 380 105.57748 2.989128 0.007912859 0.035632205
## 269 1793 404 71.16324 2.026214 0.019071639 0.039910866
## 278 1788 400 74.39894 2.115859 0.016767243 0.038172347
## 304 1682 433 -99.54564 -2.866705 0.041097025 0.176105067
## 305 1498 356 -125.00828 -3.530802 0.003156308 0.019736454
## 306 1546 393 -153.18857 -4.348622 0.013162477 0.126114603
## 307 1590 389 -100.95286 -2.863155 0.011347140 0.047043799
## 314 1518 342 -76.18331 -2.151960 0.003338237 0.007755461
cat("\n---- PUNTOS CON ALTO LEVERAGE ----\n")
##
## ---- PUNTOS CON ALTO LEVERAGE ----
print(outliers_leverage)
## estatura pierna residuales residuales_est leverage cooks
## 4 1691 400 -22.6010560 -0.642759833 0.01676724 3.522677e-03
## 73 1438 298 -65.5905338 -1.866612172 0.01809273 3.210050e-02
## 89 1534 309 7.7612726 0.220235665 0.01238674 3.041693e-04
## 132 1501 300 -6.7083872 -0.190800557 0.01695524 3.139496e-04
## 144 1696 392 -1.1296425 -0.032059952 0.01269197 6.606510e-06
## 147 1533 309 6.7612726 0.191859434 0.01238674 2.308375e-04
## 166 1521 302 9.1737595 0.260775965 0.01586221 5.480409e-04
## 170 1697 400 -16.6010560 -0.472123603 0.01676724 1.900583e-03
## 217 1674 414 -68.4260297 -1.954804137 0.02561068 5.021865e-02
## 224 1742 393 42.8114308 1.215304432 0.01316248 9.849907e-03
## 233 1643 396 -62.3653493 -1.771716429 0.01464069 2.331982e-02
## 264 1522 309 -4.2387274 -0.120279106 0.01238674 9.072363e-05
## 265 1539 304 23.0559061 0.655045046 0.01481365 3.225937e-03
## 268 1736 415 -8.4849564 -0.242488378 0.02632571 7.949106e-04
## 269 1793 404 71.1632372 2.026213556 0.01907164 3.991087e-02
## 273 1836 428 64.7489967 1.860309684 0.03663265 6.579863e-02
## 277 1717 409 -15.1313962 -0.431521707 0.02220222 2.114086e-03
## 278 1788 400 74.3989440 2.115859218 0.01676724 3.817235e-02
## 281 1739 407 10.9864571 0.313109322 0.02091664 1.047211e-03
## 301 1524 308 -0.1798007 -0.005103262 0.01284989 1.695048e-07
## 304 1682 433 -99.5456368 -2.866705386 0.04109703 1.761051e-01
## 306 1546 393 -153.1885692 -4.348622404 0.01316248 1.261146e-01
## 315 1735 420 -19.7795898 -0.566362797 0.03006762 4.971838e-03
## 316 1696 402 -21.7189094 -0.618027369 0.01789721 3.480277e-03
## 318 1777 403 57.2221639 1.628780705 0.01847887 2.497301e-02
## 329 1667 410 -67.1903229 -1.916800317 0.02286168 4.298094e-02
## 337 1720 395 16.6935774 0.474121076 0.01413684 1.611700e-03
## 339 1774 427 4.8079234 0.138075328 0.03577312 3.536549e-04
cat("\n---- PUNTOS INFLUYENTES (COOK'S DISTANCE) ----\n")
##
## ---- PUNTOS INFLUYENTES (COOK'S DISTANCE) ----
print(outliers_cook)
## estatura pierna residuales residuales_est leverage cooks
## 9 1734 381 59.51855 1.685386 0.008249986 0.01181462
## 30 1740 385 57.28284 1.623273 0.009709643 0.01291795
## 73 1438 298 -65.59053 -1.866612 0.018092729 0.03210050
## 152 1493 328 -72.35833 -2.046340 0.005698703 0.01200006
## 217 1674 414 -68.42603 -1.954804 0.025610677 0.05021865
## 233 1643 396 -62.36535 -1.771716 0.014640687 0.02331982
## 234 1740 363 102.57923 2.898360 0.003882294 0.01637014
## 250 1741 381 66.51855 1.883605 0.008249986 0.01475709
## 254 1778 380 105.57748 2.989128 0.007912859 0.03563221
## 269 1793 404 71.16324 2.026214 0.019071639 0.03991087
## 273 1836 428 64.74900 1.860310 0.036632646 0.06579863
## 278 1788 400 74.39894 2.115859 0.016767243 0.03817235
## 290 1483 318 -61.76907 -1.749527 0.008718547 0.01346042
## 304 1682 433 -99.54564 -2.866705 0.041097025 0.17610507
## 305 1498 356 -125.00828 -3.530802 0.003156308 0.01973645
## 306 1546 393 -153.18857 -4.348622 0.013162477 0.12611460
## 307 1590 389 -100.95286 -2.863155 0.011347140 0.04704380
## 318 1777 403 57.22216 1.628781 0.018478867 0.02497301
## 329 1667 410 -67.19032 -1.916800 0.022861682 0.04298094
# Ordenar por Cook’s distance (más influyentes primero) ---
diagnosticos_ordenados <- diagnosticos[order(-diagnosticos$cooks), ]
head(diagnosticos_ordenados, 10)
## estatura pierna residuales residuales_est leverage cooks
## 304 1682 433 -99.54564 -2.866705 0.041097025 0.17610507
## 306 1546 393 -153.18857 -4.348622 0.013162477 0.12611460
## 273 1836 428 64.74900 1.860310 0.036632646 0.06579863
## 217 1674 414 -68.42603 -1.954804 0.025610677 0.05021865
## 307 1590 389 -100.95286 -2.863155 0.011347140 0.04704380
## 329 1667 410 -67.19032 -1.916800 0.022861682 0.04298094
## 269 1793 404 71.16324 2.026214 0.019071639 0.03991087
## 278 1788 400 74.39894 2.115859 0.016767243 0.03817235
## 254 1778 380 105.57748 2.989128 0.007912859 0.03563221
## 73 1438 298 -65.59053 -1.866612 0.018092729 0.03210050
### Ajustar el modelo robusto
library(MASS)
##
## Adjuntando el paquete: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
modelo_robusto <- rlm(X11 ~ X20, data = datos)
summary(modelo_robusto)
##
## Call: rlm(formula = X11 ~ X20, data = datos)
## Residuals:
## Min 1Q Median 3Q Max
## -154.5658 -21.3706 -0.2032 20.7085 104.5225
##
## Coefficients:
## Value Std. Error t value
## (Intercept) 881.6641 28.8052 30.6078
## X20 2.0837 0.0821 25.3857
##
## Residual standard error: 31.22 on 335 degrees of freedom
## (2 observations deleted due to missingness)
## Comparación de los modelos
coef(modelo)
## (Intercept) X20
## 890.030381 2.058927
coef(modelo_robusto)
## (Intercept) X20
## 881.664136 2.083719
library(ggplot2)
# Datos del modelo robusto
coef_rob <- coef(modelo_robusto)
# Datos del modelo OLS
coef_ols <- coef(modelo)
ggplot(datos, aes(x = X20, y = X11)) +
geom_point(alpha = 0.7) +
geom_smooth(method = "lm", se = FALSE, color = "gray40", linetype = "dashed") +
geom_smooth(method = MASS::rlm, se = FALSE, color = "red") +
labs(
title = "Modelo robusto (RLM) vs modelo lineal (OLS)",
x = "Longitud de la pierna",
y = "Estatura"
) +
theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Función de pseudo-R² robusto
pseudoR2 <- function(modelo) {
y <- modelo$fitted + modelo$residuals
mad_y <- mad(y) # variabilidad total
mad_res <- mad(modelo$residuals) # variabilidad residual
1 - (mad_res^2 / mad_y^2)
}
pseudoR2(modelo_robusto)
## [1] 0.6897088
## Vamos a comparar los dos modelos el clásico y el robusto
# --- Coeficientes y SE de OLS ---
ols_sum <- summary(modelo)
ols_tab <- data.frame(
Modelo = "OLS",
Intercepto = ols_sum$coefficients[1,1],
SE_Intercepto = ols_sum$coefficients[1,2],
Pendiente = ols_sum$coefficients[2,1],
SE_Pendiente = ols_sum$coefficients[2,2]
)
# --- Coeficientes y SE de RLM ---
rlm_sum <- summary(modelo_robusto)
rlm_tab <- data.frame(
Modelo = "RLM",
Intercepto = rlm_sum$coefficients[1,1],
SE_Intercepto = rlm_sum$coefficients[1,2],
Pendiente = rlm_sum$coefficients[2,1],
SE_Pendiente = rlm_sum$coefficients[2,2]
)
R2_ols <- summary(modelo)$r.squared
R2_rlm <- pseudoR2(modelo_robusto)
AIC_rlm <- AIC(lm(modelo_robusto$wresid ~ modelo_robusto$fitted))
AIC_ols <- AIC(modelo)
comparacion <- data.frame(
Modelo = c("OLS", "RLM"),
Intercepto = c(ols_tab$Intercepto, rlm_tab$Intercepto),
SE_Intercepto = c(ols_tab$SE_Intercepto, rlm_tab$SE_Intercepto),
Pendiente = c(ols_tab$Pendiente, rlm_tab$Pendiente),
SE_Pendiente = c(ols_tab$SE_Pendiente, rlm_tab$SE_Pendiente),
R2 = c(R2_ols, R2_rlm),
AIC = c(AIC_ols, AIC_rlm)
)
comparacion
## Modelo Intercepto SE_Intercepto Pendiente SE_Pendiente R2 AIC
## 1 OLS 890.0304 29.33679 2.058927 0.08359709 0.6442210 3365.484
## 2 RLM 881.6641 28.80519 2.083719 0.08208228 0.6897088 3365.484
anova(modelo)
## Analysis of Variance Table
##
## Response: X11
## Df Sum Sq Mean Sq F value Pr(>F)
## X20 1 762787 762787 606.6 < 2.2e-16 ***
## Residuals 335 421258 1257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Vamos a ajustar el modelo sin valores atípicos o influyentes
resid_est <- rstandard(modelo)
lev <- hatvalues(modelo)
cook <- cooks.distance(modelo)
umbral_cook <- 4 / n
umbral_lev <- 2 * p / n
indices_fuera <- which(abs(resid_est) > 2 |
lev > umbral_lev |
cook > umbral_cook)
length(indices_fuera) # cuántos eliminarás
## [1] 42
datos_limpios <- datos[-indices_fuera, ]
modelo_limpio <- lm(X11 ~ X20, data = datos_limpios)
summary(modelo_limpio)
##
## Call:
## lm(formula = X11 ~ X20, data = datos_limpios)
##
## Residuals:
## Min 1Q Median 3Q Max
## -157.881 -21.346 -0.251 20.377 102.054
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 859.40461 31.46175 27.32 <2e-16 ***
## X20 2.14879 0.08985 23.92 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 34.27 on 293 degrees of freedom
## (2 observations deleted due to missingness)
## Multiple R-squared: 0.6613, Adjusted R-squared: 0.6601
## F-statistic: 572 on 1 and 293 DF, p-value: < 2.2e-16
## Comparando los modelos
summary(modelo)$r.squared # R² original
## [1] 0.644221
summary(modelo_limpio)$r.squared # R² sin atípicos
## [1] 0.6612676
anova(modelo)
## Analysis of Variance Table
##
## Response: X11
## Df Sum Sq Mean Sq F value Pr(>F)
## X20 1 762787 762787 606.6 < 2.2e-16 ***
## Residuals 335 421258 1257
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sqrt(1257)
## [1] 35.4542