TAREA 2

El propósito general del código es estimar la estatura total a partir de longitudes de las extremidades usando regresión lineal, analizar correlaciones, diagnosticar el modelo, aplicar un modelo robusto y comparar diferentes ajustes.

### El objetivo es aplicar modelos de regresión lineal para 
### estimar la estatura a partir de longitud (es) de las extremidades 

### El proposito general del codigo es estimar la estatura total a partir de las extremidades usando regresion lineal, analizar correlaciones, diagnosticar el modelo, aplicar un modelo robusto y comparar diferetnes ajustes. 

##PREPARAMOS EL ENTORNO Y CARGAMOS DATOS  
## Defino mi directorio de trabajo
setwd("~/Documents/Estadistica")
##Abriendo paquete pacman
library(pacman)

### SE CARGA EL ARCHIVO SPSS Cholula.sav mediante read_sav
## El archivo esta en formato SPSS, lo abrimos mediante la libreria haven
p_load(haven,dplyr,ggplot2,tinytex,tidyr,GGally,purrr,labelled)
Cholula <- read_sav("Cholula.sav")

### SE SELECCIONA LAS VARIABLES Y SE EXTRAEN LAS ETIQUETAS 

### SE SELECCIONAN LAS VARIABLES X11 Y X14 A X21 DEL CONJUNTO ORIGINAL 
# 1. Seleccionar variables
vars <- Cholula %>% dplyr::select(X11, X14:X21)

### SE EXTRAEN SUS ETIQUETAS SPSS (labels) PARA USARLAR COMO NOMBRES DESCRIPTIVOS Y SI HAY ETIQUETAS DESCRITIVAS SE VUELVEN UNICAS CON make.unique() 
# 2. Extraer etiquetas SPSS
etiquetas_raw <- map_chr(names(vars), ~{
  lab <- attr(vars[[.x]], "label")
  if (is.null(lab)) .x else lab
})

etiquetas <- make.unique(etiquetas_raw, sep = "_")

### EL DATA FRAME ES RENOMBRADO CON ESTAS ETIQUETAS PARA TRABAJAR CON NOMBRES MÁS CLAROS
# 3. Renombrar
df <- vars %>% rename_with(~etiquetas)


### SE DEFINE UNA FUNCION graficar_cor() que tamara la estatura total como vairable dependiente, toma una longitud corporal como variable predictora, calcula la correlacion de Pearson entre ambas, reporta el valor r y p_value en la grafia y genera un scatterplot con linea de regresion (modelo lineal simple)
# 4. Función para generar gráfica por variable
graficar_cor <- function(y_var){
  
  datos <- df %>% dplyr::select(`Estatura total`, all_of(y_var))
  
  prueba <- cor.test(datos[[1]], datos[[2]], use = "pairwise.complete.obs")
  
  r <- round(prueba$estimate, 3)
  p <- formatC(prueba$p.value, digits = 3, format = "e")
  
  ggplot(datos, aes(x = `Estatura total`, y = .data[[y_var]])) +
    geom_point(alpha = 0.6) +
    geom_smooth(method = "lm", se = FALSE) +
    labs(
      title = paste0("Estatura vs ", y_var),
      subtitle = paste0("r = ", r, " | p = ", p),
      x = "Estatura total",
      y = y_var
    ) +
    theme_minimal(base_size = 14)
}

### SE GENERAN GRAFICAS PARA TODAS LAS LONGITUDES USANDO map()
# 5. Generar todas las gráficas

graficas <- map(names(df)[-1], graficar_cor)

graficas

## [[1]]

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[2]]

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 3 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 3 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[3]]

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[4]]

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 5 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[5]]

## `geom_smooth()` using formula = 'y ~ x'

## 
## [[6]]

## `geom_smooth()` using formula = 'y ~ x'

## 
## [[7]]

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

## 
## [[8]]

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 2 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 2 rows containing missing values or values outside the scale range
## (`geom_point()`).

### SE SELECCIONA EL SUBCONJUNTO DE DATOS Y SE AJUSTA EL MODELO 
# Ajustaré el modelo en función de la longitud del antebrazo
## Estatura total (X11)∼Longitud del antebrazo (X16)

datos <- Cholula %>% dplyr::select(X11, X16)
# Extraer etiquetas SPSS
lab_X11 <- attr(datos$X11, "label")
lab_X16 <- attr(datos$X16, "label")
## Aquí estoy ajustando el modelo de mínimos cuadrados o clásico
modelo <- lm(X11 ~ X16, data = datos)
summary(modelo)

## 
## Call:
## lm(formula = X11 ~ X16, data = datos)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -124.690  -34.053   -3.468   28.275  140.109 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1031.8514    41.3060   24.98   <2e-16 ***
## X16            2.6143     0.1858   14.07   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 46.63 on 332 degrees of freedom
##   (5 observations deleted due to missingness)
## Multiple R-squared:  0.3735, Adjusted R-squared:  0.3716 
## F-statistic: 197.9 on 1 and 332 DF,  p-value: < 2.2e-16

coef <- summary(modelo)$coefficients
round(coef, 4)

##              Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1031.8514    41.3060 24.9807        0
## X16            2.6143     0.1858 14.0676        0

op <- par()   # guarda la configuración actual del panel de gráfica
par(mar = c(4, 4, 2, 1))  # reduce márgenes
par(mfrow = c(2, 2))
plot(modelo)

par(op) # regresa a la configuración original

## Warning in par(op): graphical parameter "cin" cannot be set

## Warning in par(op): graphical parameter "cra" cannot be set

## Warning in par(op): graphical parameter "csi" cannot be set

## Warning in par(op): graphical parameter "cxy" cannot be set

## Warning in par(op): graphical parameter "din" cannot be set

## Warning in par(op): graphical parameter "page" cannot be set

## Graficando modelo de regresión 

ggplot(datos, aes(x = X16, y = X11)) +
  geom_point(alpha = 0.6) +
  geom_smooth(method = "lm", se = TRUE, color = "blue") +
  labs(
    title = paste0(lab_X11, " en función de ", lab_X16),
    x = lab_X16,
    y = lab_X11
  ) +
  theme_minimal(base_size = 14)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

### Se va a realizar el diagnóstico del modelo
# Calcular valores diagnósticos 

# Extraer solo datos usados por el modelo
datos_modelo <- model.frame(modelo)

# Diagnósticos
diagnosticos <- data.frame(
  estatura = datos_modelo$X11,
  antebrazo = datos_modelo$X16,
  residuales = residuals(modelo),
  residuales_est = rstandard(modelo),
  leverage = hatvalues(modelo),
  cooks = cooks.distance(modelo)
)
#  Identificar posibles outliers por residuales estandarizados ---
outliers_residuales <- diagnosticos[abs(diagnosticos$residuales_est) > 2, ]

# Identificar puntos con alto leverage ---
# Regla: leverage > 2p/n  (p = nº de parámetros del modelo)
p <- length(coefficients(modelo))
n <- nrow(datos)
umbral_leverage <- 2 * p / n

outliers_leverage <- diagnosticos[diagnosticos$leverage > umbral_leverage, ]

# 6. Identificar puntos influyentes por distancia de Cook ---
# Regla común: Cook > 4/n
umbral_cook <- 4 / n
outliers_cook <- diagnosticos[diagnosticos$cooks > umbral_cook, ]

#  Mostrar resultados ---
cat("---- OUTLIERS POR RESIDUALES ESTANDARIZADOS ----\n")

## ---- OUTLIERS POR RESIDUALES ESTANDARIZADOS ----

print(outliers_residuales)

##     estatura antebrazo residuales residuales_est    leverage       cooks
## 2       1701       216  104.45375       2.243869 0.003535902 0.008933127
## 30      1740       232  101.62451       2.184294 0.004632958 0.011103687
## 51      1697       210  116.13972       2.497018 0.005220856 0.016361700
## 137     1620       272 -122.94861      -2.695011 0.042950018 0.162974574
## 160     1539       232  -99.37549      -2.135954 0.004632958 0.010617662
## 254     1778       244  108.25257       2.333988 0.010791913 0.029715155
## 268     1736       228  108.08182       2.321877 0.003596385 0.009729249
## 269     1793       253   99.72362       2.158429 0.018412721 0.043695290
## 273     1836       254  140.10929       3.034096 0.019418291 0.091149809
## 278     1788       250  102.56660       2.216774 0.015586585 0.038903290
## 281     1739       229  108.46749       2.330410 0.003807884 0.010379469
## 290     1483       215 -110.93192      -2.383273 0.003737321 0.010653791
## 305     1498       226 -124.68953      -2.678213 0.003268676 0.011761265
## 339     1774       240  114.70988       2.470016 0.008230722 0.025316097

cat("\n---- PUNTOS CON ALTO LEVERAGE ----\n")

## 
## ---- PUNTOS CON ALTO LEVERAGE ----

print(outliers_leverage)

##     estatura antebrazo  residuales residuales_est   leverage        cooks
## 1       1715       250   29.566602     0.63902357 0.01558659 3.232788e-03
## 36      1611       196   66.740315     1.44100596 0.01359922 1.431404e-02
## 42      1579       190   50.426283     1.09181202 0.01909572 1.160313e-02
## 52      1535       196   -9.259685    -0.19992806 0.01359922 2.755358e-04
## 67      1529       198  -20.488341    -0.44201520 0.01202116 1.188620e-03
## 71      1633       198   83.511659     1.80167941 0.01202116 1.974802e-02
## 75      1523       182   15.340907     0.33370869 0.02820310 1.615945e-03
## 80      1487       191  -44.188045    -0.95625878 0.01810023 8.428256e-03
## 103     1530       198  -19.488341    -0.42044121 0.01202116 1.075422e-03
## 137     1620       272 -122.948615    -2.69501111 0.04295002 1.629746e-01
## 165     1615       247  -62.590414    -1.35102588 0.01304632 1.206392e-02
## 198     1687       246   12.023914     0.25943552 0.01226309 4.178182e-04
## 225     1676       251  -12.047726    -0.26050823 0.01649687 5.691656e-04
## 249     1578       192   44.197627     0.95599711 0.01713650 7.967316e-03
## 269     1793       253   99.723618     2.15842912 0.01841272 4.369529e-02
## 273     1836       254  140.109289     3.03409563 0.01941829 9.114981e-02
## 277     1717       247   39.409586     0.85066333 0.01304632 4.782738e-03
## 278     1788       250  102.566602     2.21677407 0.01558659 3.890329e-02
## 280     1642       264  -80.033991    -1.74367818 0.03122096 4.899189e-02
## 285     1585       196   40.740315     0.87963379 0.01359922 5.333771e-03
## 306     1546       195    4.354643     0.09406203 0.01443589 6.479739e-05
## 315     1735       249   52.180930     1.12728461 0.01470807 9.484792e-03
## 318     1777       250   91.566602     1.97903084 0.01558659 3.100620e-02
## 322     1635       248  -45.204742    -0.97615600 0.01386131 6.696914e-03
## 324     1647       256  -54.119367    -1.17322688 0.02152472 1.513985e-02
## 334     1647       262  -69.805335    -1.51878118 0.02860632 3.396465e-02
## 337     1720       255   21.494961     0.46572424 0.02045563 2.264729e-03

cat("\n---- PUNTOS INFLUYENTES (COOK'S DISTANCE) ----\n")

## 
## ---- PUNTOS INFLUYENTES (COOK'S DISTANCE) ----

print(outliers_cook)

##     estatura antebrazo residuales residuales_est    leverage      cooks
## 36      1611       196   66.74031       1.441006 0.013599218 0.01431404
## 51      1697       210  116.13972       2.497018 0.005220856 0.01636170
## 71      1633       198   83.51166       1.801679 0.012021155 0.01974802
## 137     1620       272 -122.94861      -2.695011 0.042950018 0.16297457
## 165     1615       247  -62.59041      -1.351026 0.013046316 0.01206392
## 224     1742       245   69.63824       1.501987 0.011511618 0.01313612
## 234     1740       241   78.09555       1.682112 0.008823375 0.01259400
## 250     1741       241   79.09555       1.703652 0.008823375 0.01291859
## 254     1778       244  108.25257       2.333988 0.010791913 0.02971515
## 269     1793       253   99.72362       2.158429 0.018412721 0.04369529
## 273     1836       254  140.10929       3.034096 0.019418291 0.09114981
## 278     1788       250  102.56660       2.216774 0.015586585 0.03890329
## 280     1642       264  -80.03399      -1.743678 0.031220958 0.04899189
## 318     1777       250   91.56660       1.979031 0.015586585 0.03100620
## 324     1647       256  -54.11937      -1.173227 0.021524722 0.01513985
## 334     1647       262  -69.80533      -1.518781 0.028606322 0.03396465
## 339     1774       240  114.70988       2.470016 0.008230722 0.02531610

# Ordenar por Cook’s distance (más influyentes primero) ---
diagnosticos_ordenados <- diagnosticos[order(-diagnosticos$cooks), ]
head(diagnosticos_ordenados, 10)

##     estatura antebrazo residuales residuales_est    leverage      cooks
## 137     1620       272 -122.94861      -2.695011 0.042950018 0.16297457
## 273     1836       254  140.10929       3.034096 0.019418291 0.09114981
## 280     1642       264  -80.03399      -1.743678 0.031220958 0.04899189
## 269     1793       253   99.72362       2.158429 0.018412721 0.04369529
## 278     1788       250  102.56660       2.216774 0.015586585 0.03890329
## 334     1647       262  -69.80533      -1.518781 0.028606322 0.03396465
## 318     1777       250   91.56660       1.979031 0.015586585 0.03100620
## 254     1778       244  108.25257       2.333988 0.010791913 0.02971515
## 339     1774       240  114.70988       2.470016 0.008230722 0.02531610
## 71      1633       198   83.51166       1.801679 0.012021155 0.01974802

### Ajustar el modelo robusto
library(MASS)

## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select

modelo_robusto <- rlm(X11 ~ X16, data = datos)
summary(modelo_robusto)

## 
## Call: rlm(formula = X11 ~ X16, data = datos)
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -122.89  -32.84   -2.69   30.31  144.42 
## 
## Coefficients:
##             Value     Std. Error t value  
## (Intercept) 1050.2888   42.6575    24.6214
## X16            2.5248    0.1919    13.1553
## 
## Residual standard error: 47.05 on 332 degrees of freedom
##   (5 observations deleted due to missingness)

## Comparación de los modelos
coef(modelo)

## (Intercept)         X16 
## 1031.851392    2.614328

coef(modelo_robusto)

## (Intercept)         X16 
## 1050.288823    2.524774

library(ggplot2)

# Datos del modelo robusto
coef_rob <- coef(modelo_robusto)

# Datos del modelo OLS
coef_ols <- coef(modelo)

ggplot(datos, aes(x = X16, y = X11)) +
  geom_point(alpha = 0.7) +
  
  geom_smooth(method = "lm", se = FALSE, color = "gray40", linetype = "dashed") +
  
  geom_smooth(method = MASS::rlm, se = FALSE, color = "red") +
  
  labs(
    title = "Modelo robusto (RLM) vs modelo lineal (OLS)",
    x = "Longitud del antebrazo",
    y = "Estatura"
  ) +
  
  theme_minimal(base_size = 14)

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 5 rows containing non-finite outside the scale range
## (`stat_smooth()`).

## `geom_smooth()` using formula = 'y ~ x'

## Warning: Removed 5 rows containing non-finite outside the scale range (`stat_smooth()`).
## Removed 5 rows containing missing values or values outside the scale range
## (`geom_point()`).

# Función de pseudo-R² robusto
pseudoR2 <- function(modelo) {
  y <- modelo$fitted + modelo$residuals
  mad_y  <- mad(y)                 # variabilidad total
  mad_res <- mad(modelo$residuals) # variabilidad residual
  1 - (mad_res^2 / mad_y^2)
}

pseudoR2(modelo_robusto)

## [1] 0.3676314

## Vamos a comparar los dos modelos el clásico y el robusto

# --- Coeficientes y SE de OLS ---
ols_sum <- summary(modelo)
ols_tab <- data.frame(
  Modelo = "OLS",
  Intercepto = ols_sum$coefficients[1,1],
  SE_Intercepto = ols_sum$coefficients[1,2],
  Pendiente = ols_sum$coefficients[2,1],
  SE_Pendiente = ols_sum$coefficients[2,2]
)

# --- Coeficientes y SE de RLM ---
rlm_sum <- summary(modelo_robusto)
rlm_tab <- data.frame(
  Modelo = "RLM",
  Intercepto = rlm_sum$coefficients[1,1],
  SE_Intercepto = rlm_sum$coefficients[1,2],
  Pendiente = rlm_sum$coefficients[2,1],
  SE_Pendiente = rlm_sum$coefficients[2,2]
)

R2_ols <- summary(modelo)$r.squared
R2_rlm <- pseudoR2(modelo_robusto)

AIC_rlm <- AIC(lm(modelo_robusto$wresid ~ modelo_robusto$fitted))
AIC_ols <- AIC(modelo)

comparacion <- data.frame(
  Modelo = c("OLS", "RLM"),
  Intercepto = c(ols_tab$Intercepto, rlm_tab$Intercepto),
  SE_Intercepto = c(ols_tab$SE_Intercepto, rlm_tab$SE_Intercepto),
  Pendiente = c(ols_tab$Pendiente, rlm_tab$Pendiente),
  SE_Pendiente = c(ols_tab$SE_Pendiente, rlm_tab$SE_Pendiente),
  R2 = c(R2_ols, R2_rlm),
  AIC = c(AIC_ols, AIC_rlm)
)

comparacion

##   Modelo Intercepto SE_Intercepto Pendiente SE_Pendiente        R2      AIC
## 1    OLS   1031.851      41.30602  2.614328    0.1858406 0.3734629 3518.511
## 2    RLM   1050.289      42.65754  2.524774    0.1919213 0.3676314 3518.511

anova(modelo)

## Analysis of Variance Table
## 
## Response: X11
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## X16         1 430358  430358   197.9 < 2.2e-16 ***
## Residuals 332 721987    2175                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## Vamos a ajustar el modelo sin valores atípicos o influyentes

resid_est <- rstandard(modelo)
lev <- hatvalues(modelo)
cook <- cooks.distance(modelo)

umbral_cook <- 4 / n
umbral_lev <- 2 * p / n

indices_fuera <- which(abs(resid_est) > 2 |
                         lev > umbral_lev |
                         cook > umbral_cook)

length(indices_fuera)   # cuántos eliminarás

## [1] 40

datos_limpios <- datos[-indices_fuera, ]

modelo_limpio <- lm(X11 ~ X16, data = datos_limpios)
summary(modelo_limpio)

## 
## Call:
## lm(formula = X11 ~ X16, data = datos_limpios)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -128.515  -34.168   -2.181   28.263  113.707 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 998.6253    45.3355   22.03   <2e-16 ***
## X16           2.7569     0.2037   13.53   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 45.18 on 293 degrees of freedom
##   (4 observations deleted due to missingness)
## Multiple R-squared:  0.3846, Adjusted R-squared:  0.3825 
## F-statistic: 183.1 on 1 and 293 DF,  p-value: < 2.2e-16

## Comparando los modelos
summary(modelo)$r.squared          # R² original

## [1] 0.3734629

summary(modelo_limpio)$r.squared   # R² sin atípicos

## [1] 0.3846148

anova(modelo)

## Analysis of Variance Table
## 
## Response: X11
##            Df Sum Sq Mean Sq F value    Pr(>F)    
## X16         1 430358  430358   197.9 < 2.2e-16 ***
## Residuals 332 721987    2175                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

sqrt(1257)

## [1] 35.4542

TAREA 2

Sandra Pedraza

2025-12-08