summary(cars)
##      speed           dist       
##  Min.   : 4.0   Min.   :  2.00  
##  1st Qu.:12.0   1st Qu.: 26.00  
##  Median :15.0   Median : 36.00  
##  Mean   :15.4   Mean   : 42.98  
##  3rd Qu.:19.0   3rd Qu.: 56.00  
##  Max.   :25.0   Max.   :120.00
### El objetivo es aplicar modelos de regresión lineal para 
### estimar la estatura a partir de longitud (es) de las extremidades 

## Defino mi directorio de trabajo
setwd("~/mis proyectos")
##Abriendo paquete pacman
library(pacman)
## El archivo esta en formato SPSS, lo abrimos mediante la libreria haven
p_load(haven,dplyr,ggplot2,tinytex,tidyr,GGally,purrr,labelled)
Cholula <- read_sav("Cholula.sav")

# 1. Seleccionar variables
vars <- Cholula %>% dplyr::select(X11, X14:X21)

# 2. Extraer etiquetas SPSS
etiquetas_raw <- map_chr(names(vars), ~{
  lab <- attr(vars[[.x]], "label")
  if (is.null(lab)) .x else lab
})

etiquetas <- make.unique(etiquetas_raw, sep = "_")

# 3. Renombrar
df <- vars %>% rename_with(~etiquetas)

# 4. Función para generar gráfica por variable
graficar_cor <- function(y_var){
  
  datos <- df %>% dplyr::select(`Estatura total`, all_of(y_var))
  
  prueba <- cor.test(datos[[1]], datos[[2]], use = "pairwise.complete.obs")
  
  r <- round(prueba$estimate, 3)
  p <- formatC(prueba$p.value, digits = 3, format = "e")
  
  ggplot(datos, aes(x = `Estatura total`, y = .data[[y_var]])) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(
      title = paste0("Estatura vs ", y_var),
      subtitle = paste0("r = ", r, " | p = ", p),
      x = "Estatura total",
      y = y_var
    ) +
    theme_minimal(base_size = 14)
}

# 5. Generar todas las gráficas
graficas <- map(names(df)[-1], graficar_cor)

graficas 
## [[1]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[2]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[3]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[4]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 5 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[5]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[6]]
## `geom_smooth()` using formula = 'y ~ x'

## 
## [[7]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[8]]
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Ajustaré el modelo en función de la longitud de pierna

## Estatura total (X11)∼Longitud de la pierna (X20)

datos <- Cholula %>% dplyr::select(X11, X20)
# Extraer etiquetas SPSS
lab_X11 <- attr(datos$X11, "label")
lab_X20 <- attr(datos$X20, "label")
## Aquí estoy ajustando el modelo de mínimos cuadrados o clásico
modelo <- lm(X11 ~ X20, data = datos)
summary(modelo)
## 
## Call:
## lm(formula = X11 ~ X20, data = datos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -153.19  -20.60   -0.71   20.99  105.58 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 890.0304    29.3368   30.34   <2e-16 ***
## X20           2.0589     0.0836   24.63   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 35.46 on 335 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.6442, Adjusted R-squared:  0.6432 
## F-statistic: 606.6 on 1 and 335 DF,  p-value: < 2.2e-16
coef <- summary(modelo)$coefficients
round(coef, 4)
##             Estimate Std. Error t value Pr(>|t|)
## (Intercept) 890.0304    29.3368 30.3384        0
## X20           2.0589     0.0836 24.6292        0
op <- par()   # guarda la configuración actual del panel de gráfica
par(mar = c(4, 4, 2, 1))  # reduce márgenes
par(mfrow = c(2, 2))
plot(modelo)

par(op) # regresa a la configuración original 
## Warning in par(op): el parámetro del gráfico "cin" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "cra" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "csi" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "cxy" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "din" no puede ser especificado
## Warning in par(op): el parámetro del gráfico "page" no puede ser especificado
## Graficando modelo de regresión 

ggplot(datos, aes(x = X20, y = X11)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = paste0(lab_X11, " en función de ", lab_X20),
    x = lab_X20,
    y = lab_X11
  ) +
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Se va a realizar el diagnóstico del modelo
# Calcular valores diagnósticos 

# Extraer solo datos usados por el modelo
datos_modelo <- model.frame(modelo)

# Diagnósticos
diagnosticos <- data.frame(
  estatura = datos_modelo$X11,
  pierna = datos_modelo$X20,
  residuales = residuals(modelo),
  residuales_est = rstandard(modelo),
  leverage = hatvalues(modelo),
  cooks = cooks.distance(modelo)
)
#  Identificar posibles outliers por residuales estandarizados ---
outliers_residuales <- diagnosticos[abs(diagnosticos$residuales_est) > 2, ]

# Identificar puntos con alto leverage ---
# Regla: leverage > 2p/n  (p = nº de parámetros del modelo)
p <- length(coefficients(modelo))
n <- nrow(datos)
umbral_leverage <- 2 * p / n

outliers_leverage <- diagnosticos[diagnosticos$leverage > umbral_leverage, ]

# 6. Identificar puntos influyentes por distancia de Cook ---
# Regla común: Cook > 4/n
umbral_cook <- 4 / n
outliers_cook <- diagnosticos[diagnosticos$cooks > umbral_cook, ]

#  Mostrar resultados ---
cat("---- OUTLIERS POR RESIDUALES ESTANDARIZADOS ----\n")
## ---- OUTLIERS POR RESIDUALES ESTANDARIZADOS ----
print(outliers_residuales)
##     estatura pierna residuales residuales_est    leverage       cooks
## 51      1697    348   90.46313       2.554882 0.002993508 0.009799277
## 58      1711    362   75.63816       2.136997 0.003745237 0.008583940
## 152     1493    328  -72.35833      -2.046340 0.005698703 0.012000061
## 201     1497    330  -72.47619      -2.049188 0.005228114 0.011034560
## 234     1740    363  102.57923       2.898360 0.003882294 0.016370137
## 242     1702    356   78.99172       2.231085 0.003156308 0.007880516
## 254     1778    380  105.57748       2.989128 0.007912859 0.035632205
## 269     1793    404   71.16324       2.026214 0.019071639 0.039910866
## 278     1788    400   74.39894       2.115859 0.016767243 0.038172347
## 304     1682    433  -99.54564      -2.866705 0.041097025 0.176105067
## 305     1498    356 -125.00828      -3.530802 0.003156308 0.019736454
## 306     1546    393 -153.18857      -4.348622 0.013162477 0.126114603
## 307     1590    389 -100.95286      -2.863155 0.011347140 0.047043799
## 314     1518    342  -76.18331      -2.151960 0.003338237 0.007755461
cat("\n---- PUNTOS CON ALTO LEVERAGE ----\n")
## 
## ---- PUNTOS CON ALTO LEVERAGE ----
print(outliers_leverage)
##     estatura pierna   residuales residuales_est   leverage        cooks
## 4       1691    400  -22.6010560   -0.642759833 0.01676724 3.522677e-03
## 73      1438    298  -65.5905338   -1.866612172 0.01809273 3.210050e-02
## 89      1534    309    7.7612726    0.220235665 0.01238674 3.041693e-04
## 132     1501    300   -6.7083872   -0.190800557 0.01695524 3.139496e-04
## 144     1696    392   -1.1296425   -0.032059952 0.01269197 6.606510e-06
## 147     1533    309    6.7612726    0.191859434 0.01238674 2.308375e-04
## 166     1521    302    9.1737595    0.260775965 0.01586221 5.480409e-04
## 170     1697    400  -16.6010560   -0.472123603 0.01676724 1.900583e-03
## 217     1674    414  -68.4260297   -1.954804137 0.02561068 5.021865e-02
## 224     1742    393   42.8114308    1.215304432 0.01316248 9.849907e-03
## 233     1643    396  -62.3653493   -1.771716429 0.01464069 2.331982e-02
## 264     1522    309   -4.2387274   -0.120279106 0.01238674 9.072363e-05
## 265     1539    304   23.0559061    0.655045046 0.01481365 3.225937e-03
## 268     1736    415   -8.4849564   -0.242488378 0.02632571 7.949106e-04
## 269     1793    404   71.1632372    2.026213556 0.01907164 3.991087e-02
## 273     1836    428   64.7489967    1.860309684 0.03663265 6.579863e-02
## 277     1717    409  -15.1313962   -0.431521707 0.02220222 2.114086e-03
## 278     1788    400   74.3989440    2.115859218 0.01676724 3.817235e-02
## 281     1739    407   10.9864571    0.313109322 0.02091664 1.047211e-03
## 301     1524    308   -0.1798007   -0.005103262 0.01284989 1.695048e-07
## 304     1682    433  -99.5456368   -2.866705386 0.04109703 1.761051e-01
## 306     1546    393 -153.1885692   -4.348622404 0.01316248 1.261146e-01
## 315     1735    420  -19.7795898   -0.566362797 0.03006762 4.971838e-03
## 316     1696    402  -21.7189094   -0.618027369 0.01789721 3.480277e-03
## 318     1777    403   57.2221639    1.628780705 0.01847887 2.497301e-02
## 329     1667    410  -67.1903229   -1.916800317 0.02286168 4.298094e-02
## 337     1720    395   16.6935774    0.474121076 0.01413684 1.611700e-03
## 339     1774    427    4.8079234    0.138075328 0.03577312 3.536549e-04
cat("\n---- PUNTOS INFLUYENTES (COOK'S DISTANCE) ----\n")
## 
## ---- PUNTOS INFLUYENTES (COOK'S DISTANCE) ----
print(outliers_cook)
##     estatura pierna residuales residuales_est    leverage      cooks
## 9       1734    381   59.51855       1.685386 0.008249986 0.01181462
## 30      1740    385   57.28284       1.623273 0.009709643 0.01291795
## 73      1438    298  -65.59053      -1.866612 0.018092729 0.03210050
## 152     1493    328  -72.35833      -2.046340 0.005698703 0.01200006
## 217     1674    414  -68.42603      -1.954804 0.025610677 0.05021865
## 233     1643    396  -62.36535      -1.771716 0.014640687 0.02331982
## 234     1740    363  102.57923       2.898360 0.003882294 0.01637014
## 250     1741    381   66.51855       1.883605 0.008249986 0.01475709
## 254     1778    380  105.57748       2.989128 0.007912859 0.03563221
## 269     1793    404   71.16324       2.026214 0.019071639 0.03991087
## 273     1836    428   64.74900       1.860310 0.036632646 0.06579863
## 278     1788    400   74.39894       2.115859 0.016767243 0.03817235
## 290     1483    318  -61.76907      -1.749527 0.008718547 0.01346042
## 304     1682    433  -99.54564      -2.866705 0.041097025 0.17610507
## 305     1498    356 -125.00828      -3.530802 0.003156308 0.01973645
## 306     1546    393 -153.18857      -4.348622 0.013162477 0.12611460
## 307     1590    389 -100.95286      -2.863155 0.011347140 0.04704380
## 318     1777    403   57.22216       1.628781 0.018478867 0.02497301
## 329     1667    410  -67.19032      -1.916800 0.022861682 0.04298094
# Ordenar por Cook’s distance (más influyentes primero) ---
diagnosticos_ordenados <- diagnosticos[order(-diagnosticos$cooks), ]
head(diagnosticos_ordenados, 10)
##     estatura pierna residuales residuales_est    leverage      cooks
## 304     1682    433  -99.54564      -2.866705 0.041097025 0.17610507
## 306     1546    393 -153.18857      -4.348622 0.013162477 0.12611460
## 273     1836    428   64.74900       1.860310 0.036632646 0.06579863
## 217     1674    414  -68.42603      -1.954804 0.025610677 0.05021865
## 307     1590    389 -100.95286      -2.863155 0.011347140 0.04704380
## 329     1667    410  -67.19032      -1.916800 0.022861682 0.04298094
## 269     1793    404   71.16324       2.026214 0.019071639 0.03991087
## 278     1788    400   74.39894       2.115859 0.016767243 0.03817235
## 254     1778    380  105.57748       2.989128 0.007912859 0.03563221
## 73      1438    298  -65.59053      -1.866612 0.018092729 0.03210050
### Ajustar el modelo robusto
library(MASS)
## 
## Adjuntando el paquete: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
modelo_robusto <- rlm(X11 ~ X20, data = datos)
summary(modelo_robusto)
## 
## Call: rlm(formula = X11 ~ X20, data = datos)
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -154.5658  -21.3706   -0.2032   20.7085  104.5225 
## 
## Coefficients:
##             Value    Std. Error t value 
## (Intercept) 881.6641  28.8052    30.6078
## X20           2.0837   0.0821    25.3857
## 
## Residual standard error: 31.22 on 335 degrees of freedom
##   (2 observations deleted due to missingness)
## Comparación de los modelos
coef(modelo)
## (Intercept)         X20 
##  890.030381    2.058927
coef(modelo_robusto)
## (Intercept)         X20 
##  881.664136    2.083719
library(ggplot2)

# Datos del modelo robusto
coef_rob <- coef(modelo_robusto)

# Datos del modelo OLS
coef_ols <- coef(modelo)

ggplot(datos, aes(x = X20, y = X11)) +
  geom_point(alpha = 0.7) +
  
  geom_smooth(method = "lm", se = FALSE, color = "gray40", linetype = "dashed") +
  
  geom_smooth(method = MASS::rlm, se = FALSE, color = "red") +
  
  labs(
    title = "Modelo robusto (RLM) vs modelo lineal (OLS)",
    x = "Longitud de la pierna",
    y = "Estatura"
  ) +
  
  theme_minimal(base_size = 14)
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).
## `geom_smooth()` using formula = 'y ~ x'
## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Función de pseudo-R² robusto
pseudoR2 <- function(modelo) {
  y <- modelo$fitted + modelo$residuals
  mad_y  <- mad(y)                 # variabilidad total
  mad_res <- mad(modelo$residuals) # variabilidad residual
  1 - (mad_res^2 / mad_y^2)
}

pseudoR2(modelo_robusto)
## [1] 0.6897088
## Vamos a comparar los dos modelos el clásico y el robusto

# --- Coeficientes y SE de OLS ---
ols_sum <- summary(modelo)
ols_tab <- data.frame(
  Modelo = "OLS",
  Intercepto = ols_sum$coefficients[1,1],
  SE_Intercepto = ols_sum$coefficients[1,2],
  Pendiente = ols_sum$coefficients[2,1],
  SE_Pendiente = ols_sum$coefficients[2,2]
)

# --- Coeficientes y SE de RLM ---
rlm_sum <- summary(modelo_robusto)
rlm_tab <- data.frame(
  Modelo = "RLM",
  Intercepto = rlm_sum$coefficients[1,1],
  SE_Intercepto = rlm_sum$coefficients[1,2],
  Pendiente = rlm_sum$coefficients[2,1],
  SE_Pendiente = rlm_sum$coefficients[2,2]
)

R2_ols <- summary(modelo)$r.squared
R2_rlm <- pseudoR2(modelo_robusto)

AIC_rlm <- AIC(lm(modelo_robusto$wresid ~ modelo_robusto$fitted))
AIC_ols <- AIC(modelo)

comparacion <- data.frame(
  Modelo = c("OLS", "RLM"),
  Intercepto = c(ols_tab$Intercepto, rlm_tab$Intercepto),
  SE_Intercepto = c(ols_tab$SE_Intercepto, rlm_tab$SE_Intercepto),
  Pendiente = c(ols_tab$Pendiente, rlm_tab$Pendiente),
  SE_Pendiente = c(ols_tab$SE_Pendiente, rlm_tab$SE_Pendiente),
  R2 = c(R2_ols, R2_rlm),
  AIC = c(AIC_ols, AIC_rlm)
)

comparacion
##   Modelo Intercepto SE_Intercepto Pendiente SE_Pendiente        R2      AIC
## 1    OLS   890.0304      29.33679  2.058927   0.08359709 0.6442210 3365.484
## 2    RLM   881.6641      28.80519  2.083719   0.08208228 0.6897088 3365.484
anova(modelo)
## Analysis of Variance Table
## 
## Response: X11
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## X20         1 762787  762787   606.6 < 2.2e-16 ***
## Residuals 335 421258    1257                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## Vamos a ajustar el modelo sin valores atípicos o influyentes

resid_est <- rstandard(modelo)
lev <- hatvalues(modelo)
cook <- cooks.distance(modelo)

umbral_cook <- 4 / n
umbral_lev <- 2 * p / n

indices_fuera <- which(abs(resid_est) > 2 |
                         lev > umbral_lev |
                         cook > umbral_cook)

length(indices_fuera)   # cuántos eliminarás
## [1] 42
datos_limpios <- datos[-indices_fuera, ]

modelo_limpio <- lm(X11 ~ X20, data = datos_limpios)
summary(modelo_limpio)
## 
## Call:
## lm(formula = X11 ~ X20, data = datos_limpios)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -157.881  -21.346   -0.251   20.377  102.054 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 859.40461   31.46175   27.32   <2e-16 ***
## X20           2.14879    0.08985   23.92   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 34.27 on 293 degrees of freedom
##   (2 observations deleted due to missingness)
## Multiple R-squared:  0.6613, Adjusted R-squared:  0.6601 
## F-statistic:   572 on 1 and 293 DF,  p-value: < 2.2e-16
## Comparando los modelos
summary(modelo)$r.squared          # R² original
## [1] 0.644221
summary(modelo_limpio)$r.squared   # R² sin atípicos
## [1] 0.6612676
anova(modelo)
## Analysis of Variance Table
## 
## Response: X11
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## X20         1 762787  762787   606.6 < 2.2e-16 ***
## Residuals 335 421258    1257                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
sqrt(1257)
## [1] 35.4542