1 Libraries

library(tidyverse)   # Data manipulation and visualization
library(readxl)      # Read Excel (.xlsx) files
library(janitor)     # Quick clean-up of column names
library(skimr)       # Descriptive summaries
library(ggpubr)      # Quantile-quantile plots
library(qqplotr)     # Quantile-quantile plots
library(corrplot)    # Correlation plots
library(yardstick)   # Error metrics: R-squared and RMSE
library(car)         # VIF: Variance Inflation Factor
library(rsample)     # Resampling - cross-validation
library(parsnip)     # Model specification
library(workflows)   # Bundling model and formula into a workflow
library(tune)        # Fitting models with resampling

2 Example data

datos <- read_excel("datos-sorgo.xlsx", skip = 1) %>% 
  clean_names() %>% 
  slice(1:23) %>% 
  select(-fieldname) %>% 
  relocate(yield_t_ha, everything())
datos %>% head()

3 Exploratory analysis

3.1 Descriptive summary

datos %>% 
  skim()
Data summary
Name                     Piped data
Number of rows           23
Number of columns        25
Column type frequency:
  numeric                25
Group variables          None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
yield_t_ha 0 1 10.82 3.54 3.97 9.21 10.52 13.19 17.47 ▃▂▇▂▃
d120_f 0 1 0.20 0.06 0.09 0.16 0.18 0.25 0.31 ▃▆▇▁▅
d135_f 0 1 0.20 0.08 0.06 0.14 0.20 0.26 0.35 ▅▇▇▇▂
d150_f 0 1 0.27 0.12 0.10 0.20 0.25 0.32 0.53 ▃▇▅▁▂
d165_f 0 1 0.46 0.14 0.15 0.37 0.48 0.56 0.67 ▂▂▇▇▇
d180_f 0 1 0.67 0.15 0.27 0.67 0.71 0.75 0.81 ▁▁▁▃▇
d195_f 0 1 0.76 0.12 0.30 0.74 0.79 0.82 0.90 ▁▁▁▆▇
d210_f 0 1 0.78 0.13 0.25 0.76 0.80 0.84 0.91 ▁▁▁▃▇
d225_f 0 1 0.72 0.15 0.24 0.66 0.73 0.83 0.90 ▁▁▃▇▇
d240_f 0 1 0.58 0.24 0.09 0.46 0.63 0.78 0.88 ▃▁▅▃▇
d164_176f 0 1 0.54 0.15 0.17 0.49 0.59 0.62 0.74 ▂▂▂▇▅
d184_199f 0 1 0.74 0.13 0.29 0.72 0.78 0.81 0.87 ▁▁▁▅▇
d202_215f 0 1 0.78 0.13 0.26 0.77 0.81 0.83 0.90 ▁▁▁▃▇
d225_239f 0 1 0.67 0.18 0.23 0.54 0.70 0.81 0.92 ▂▁▆▃▇
d249_264f 0 1 0.38 0.27 0.05 0.13 0.31 0.61 0.87 ▇▅▁▃▃
d164_176c 0 1 423.50 102.14 201.65 358.40 422.18 478.92 647.42 ▂▅▇▅▃
d184_199c 0 1 558.19 81.57 381.18 534.35 546.88 576.08 724.02 ▁▂▇▂▂
d202_215c 0 1 547.44 138.39 367.37 438.45 519.90 621.68 864.40 ▇▇▃▃▂
d225_239c 0 1 484.39 168.80 204.84 395.40 455.57 533.76 875.09 ▃▇▅▂▂
d249_264c 0 1 444.14 80.54 262.00 439.22 439.22 439.22 716.07 ▁▇▁▁▁
d164_176n 0 1 0.79 0.09 0.47 0.79 0.81 0.84 0.89 ▁▁▁▃▇
d184_199n 0 1 0.83 0.08 0.56 0.84 0.86 0.87 0.89 ▁▁▁▁▇
d202_215n 0 1 0.85 0.02 0.81 0.83 0.85 0.88 0.90 ▅▇▇▅▃
d225_239n 0 1 0.80 0.08 0.62 0.76 0.82 0.86 0.90 ▂▃▃▇▇
d249_264n 0 1 0.81 0.03 0.73 0.81 0.81 0.81 0.89 ▁▁▇▁▁

3.2 Distributions

datos %>% 
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "valor") %>% 
  ggplot(aes(x = valor)) +
  facet_wrap(~variable, scales = "free") +
  geom_density()

3.3 Quantile-quantile plot

datos %>% 
  pivot_longer(cols = everything(),
               names_to = "variable",
               values_to = "valor") %>% 
  ggplot(aes(sample = valor)) +
  facet_wrap(~variable, scales = "free") +
  stat_qq_band() +
  stat_qq_line() +
  stat_qq_point()

3.4 Correlations

datos %>% 
  cor(method = "spearman") %>% 
  corrplot()

  • An improved version of the plot:
datos %>% 
  cor(method = "spearman") %>% 
  corrplot(diag = FALSE,
           type = "lower",
           method = "pie",
           tl.col = "black",
           tl.srt = 35)

3.5 Scatterplots

datos %>% 
  pivot_longer(cols = -yield_t_ha,
               names_to = "variable",
               values_to = "valor") %>% 
  ggplot(aes(x = valor, y = yield_t_ha)) +
  facet_wrap(~variable, scales = "free") +
  geom_point() +
  geom_smooth(method = "lm", se = TRUE)

4 Multiple linear regression

\[Y_i = \beta_0 + \beta_1 X_{1i} + \beta_2 X_{2i} + \dots + \beta_{24} X_{24i} + \varepsilon_i\]
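
In matrix form the ordinary least-squares estimator is

\[\hat{\boldsymbol{\beta}} = (\mathbf{X}^{\top}\mathbf{X})^{-1}\mathbf{X}^{\top}\mathbf{y}\]

which can only be computed when there are more observations than estimated coefficients. Here we have 23 observations and 24 candidate predictors plus the intercept, which is exactly what goes wrong in the next section.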

5 Overparameterized model

modelo0 <- lm(yield_t_ha ~ ., data = datos)
modelo0 %>% 
  summary()
## 
## Call:
## lm(formula = yield_t_ha ~ ., data = datos)
## 
## Residuals:
## ALL 23 residuals are 0: no residual degrees of freedom!
## 
## Coefficients: (2 not defined because of singularities)
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.855e+02        NaN     NaN      NaN
## d120_f       1.871e+02        NaN     NaN      NaN
## d135_f      -7.367e+01        NaN     NaN      NaN
## d150_f      -1.785e+02        NaN     NaN      NaN
## d165_f       2.370e+02        NaN     NaN      NaN
## d180_f      -3.853e+02        NaN     NaN      NaN
## d195_f      -3.948e+02        NaN     NaN      NaN
## d210_f       6.477e+02        NaN     NaN      NaN
## d225_f       1.986e+02        NaN     NaN      NaN
## d240_f       1.135e+02        NaN     NaN      NaN
## d164_176f   -9.973e+01        NaN     NaN      NaN
## d184_199f    9.005e+02        NaN     NaN      NaN
## d202_215f   -1.018e+03        NaN     NaN      NaN
## d225_239f   -1.467e+02        NaN     NaN      NaN
## d249_264f    3.490e+01        NaN     NaN      NaN
## d164_176c    2.398e-01        NaN     NaN      NaN
## d184_199c   -4.600e-02        NaN     NaN      NaN
## d202_215c   -1.401e-01        NaN     NaN      NaN
## d225_239c    2.784e-02        NaN     NaN      NaN
## d249_264c    1.038e-01        NaN     NaN      NaN
## d164_176n   -2.174e+02        NaN     NaN      NaN
## d184_199n   -5.926e+01        NaN     NaN      NaN
## d202_215n    4.568e+02        NaN     NaN      NaN
## d225_239n           NA         NA      NA       NA
## d249_264n           NA         NA      NA       NA
## 
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:    NaN 
## F-statistic:   NaN on 22 and 0 DF,  p-value: NA
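
This failure is expected: with 23 rows and 24 candidate predictors plus the intercept, the model specifies more coefficients than observations, so the fit is saturated and no residual degrees of freedom remain. A quick check (a sketch reusing the datos object):

c(n_obs = nrow(datos), n_predictors = ncol(datos) - 1)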

6 Another model

  • As a first pass, we could require a minimum absolute correlation with the response for a variable to be considered in our model.
corr_predictoras <-
  datos %>%
  pivot_longer(cols = -yield_t_ha,
               names_to = "variable",
               values_to = "valor") %>%
  group_by(variable) %>%
  summarise(correlacion = cor(yield_t_ha, valor, method = "spearman")) %>%
  ungroup() %>%
  arrange(desc(abs(correlacion))) %>%
  filter(abs(correlacion) > 0.10)

corr_predictoras
  • We keep only these 17 variables for the full model.
variables_predictoras <- corr_predictoras$variable

datos_modelos <- datos %>% 
  select(yield_t_ha, all_of(variables_predictoras))

modelo1 <- lm(yield_t_ha ~ ., data = datos_modelos)
modelo1 %>% 
  summary()
## 
## Call:
## lm(formula = yield_t_ha ~ ., data = datos_modelos)
## 
## Residuals:
##        1        2        3        4        5        6        7        8 
## -0.90729 -0.23264 -1.53765  0.59465  0.86602 -0.40578  0.46581  0.36603 
##        9       10       11       12       13       14       15       16 
## -0.23752 -1.20546  3.38436 -0.88901 -3.05385 -1.36101  1.50675  0.67418 
##       17       18       19       20       21       22       23 
##  1.47817 -0.54850  0.09765 -3.20575  4.69336 -0.03990 -0.50262 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)
## (Intercept)  2.120e+01  1.294e+02   0.164    0.876
## d135_f       3.818e+01  3.240e+01   1.178    0.292
## d210_f      -2.814e+02  2.334e+02  -1.205    0.282
## d164_176c   -8.917e-03  2.218e-02  -0.402    0.704
## d202_215f    3.118e+02  2.306e+02   1.352    0.234
## d249_264c    3.961e-02  6.309e-02   0.628    0.558
## d249_264n   -4.830e+01  1.839e+02  -0.263    0.803
## d225_f      -1.586e+01  2.595e+01  -0.611    0.568
## d164_176n    1.125e+01  2.483e+01   0.453    0.669
## d180_f      -5.504e+00  4.226e+01  -0.130    0.901
## d184_199c    5.756e-03  2.026e-02   0.284    0.788
## d165_f       3.150e+01  6.253e+01   0.504    0.636
## d225_239f   -9.131e-01  1.341e+01  -0.068    0.948
## d202_215n    2.829e+01  5.973e+01   0.474    0.656
## d164_176f   -3.472e+01  9.345e+01  -0.372    0.725
## d184_199n   -2.624e+01  1.482e+01  -1.771    0.137
## d225_239n   -3.554e+00  1.884e+01  -0.189    0.858
## d120_f      -3.964e+01  4.494e+01  -0.882    0.418
## 
## Residual standard error: 3.676 on 5 degrees of freedom
## Multiple R-squared:  0.7542, Adjusted R-squared:  -0.0814 
## F-statistic: 0.9026 on 17 and 5 DF,  p-value: 0.6074

7 Predictor selection

# step() starts from modelo1 (the full model); with direction = "forward" and no
# scope there are no new terms to add, so modelo_forward is identical to modelo1.
modelo_forward <- step(modelo1, direction = "forward", trace = 0)
modelo_backward <- step(modelo1, direction = "backward", trace = 0)
modelo_both <- step(modelo1, direction = "both", trace = 0)

7.1 Which model is best?

7.1.1 RMSE

yield_real <- datos$yield_t_ha

predichos_mod1 <- modelo1$fitted.values
predichos_mod_forw <- modelo_forward$fitted.values
predichos_mod_back <- modelo_backward$fitted.values
predichos_mod_both <- modelo_both$fitted.values

rmse_vec(truth = yield_real, estimate = predichos_mod1)
## [1] 1.714032
rmse_vec(truth = yield_real, estimate = predichos_mod_forw)
## [1] 1.714032
rmse_vec(truth = yield_real, estimate = predichos_mod_back)
## [1] 1.889987
rmse_vec(truth = yield_real, estimate = predichos_mod_both)
## [1] 1.889987

7.1.2 AIC

AIC(modelo1, modelo_backward, modelo_forward, modelo_both)

7.1.3 BIC

BIC(modelo1, modelo_backward, modelo_forward, modelo_both)
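
Both criteria trade goodness of fit against model complexity and, in both cases, lower values are better. Writing k for the number of estimated parameters, n for the number of observations, and \(\hat{L}\) for the maximized likelihood:

\[AIC = 2k - 2\ln(\hat{L}), \qquad BIC = k\ln(n) - 2\ln(\hat{L})\]

Because ln(23) > 2, BIC penalizes extra parameters more heavily than AIC for these data.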

7.2 Final model

modelo_both %>% summary()
## 
## Call:
## lm(formula = yield_t_ha ~ d135_f + d210_f + d202_215f + d249_264c + 
##     d225_f + d165_f + d164_176f + d184_199n + d120_f, data = datos_modelos)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -3.7288 -0.9446  0.0777  0.4549  4.9969 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)   
## (Intercept)  1.712e+01  9.433e+00   1.815  0.09273 . 
## d135_f       3.369e+01  1.316e+01   2.561  0.02370 * 
## d210_f      -3.021e+02  1.365e+02  -2.214  0.04531 * 
## d202_215f    3.292e+02  1.353e+02   2.432  0.03022 * 
## d249_264c    2.788e-02  8.905e-03   3.131  0.00795 **
## d225_f      -1.606e+01  1.199e+01  -1.339  0.20357   
## d165_f       2.977e+01  1.938e+01   1.536  0.14859   
## d164_176f   -3.789e+01  2.040e+01  -1.857  0.08607 . 
## d184_199n   -2.781e+01  8.481e+00  -3.279  0.00598 **
## d120_f      -2.550e+01  1.749e+01  -1.458  0.16857   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.514 on 13 degrees of freedom
## Multiple R-squared:  0.7012, Adjusted R-squared:  0.4943 
## F-statistic: 3.389 on 9 and 13 DF,  p-value: 0.02296

7.3 Residuals

  • How do the predicted values compare with the observed ones? First we extract the fitted values and the residuals:
yield_pred <- modelo_both$fitted.values
residuales <- residuals(modelo_both)
residuales %>% head()
##            1            2            3            4            5            6 
##  0.077654740  0.156722686 -1.292019238 -0.392211413  0.586282167  0.003167574

7.4 Predicted vs. observed

df_real_estimado <- bind_cols(real = yield_real, estimado = yield_pred)

df_real_estimado %>% 
  ggplot(aes(x = real, y = estimado)) +
  geom_point()
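
A 1:1 reference line makes it easier to see where the model over- or under-predicts; a possible addition (not part of the original figure):

df_real_estimado %>% 
  ggplot(aes(x = real, y = estimado)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed", color = "red")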

7.5 Normality of residuals

ggqqplot(residuales)

  • We can also plot the density of the residuals:
residuales %>% 
  enframe(value = "residual") %>% 
  ggplot(aes(x = residual)) +
  geom_density()
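
A formal test can complement these visual checks; as a sketch, the Shapiro-Wilk test from base R applied to the same residuals (the null hypothesis is normality):

shapiro.test(residuales)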

7.6 Homoscedasticity

df_real_estimado %>% 
  mutate(residual = residuales) %>% 
  ggplot(aes(x = estimado, y = residual)) +
  geom_point() +
  geom_hline(yintercept = 0, color = "red") 
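
As a numeric complement to the plot, one option is the score test for non-constant error variance implemented in car (already loaded); a sketch, where the null hypothesis is constant variance:

ncvTest(modelo_both)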

8 Multicollinearity
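
The variance inflation factor of predictor j measures how much the variance of its estimated coefficient is inflated by its linear relationship with the other predictors, where R_j^2 is the R-squared obtained by regressing that predictor on all the remaining ones; values above roughly 5-10 are commonly taken as a warning sign:

\[VIF_j = \frac{1}{1 - R_j^2}\]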

8.1 Model 1

modelo1 %>% 
  vif()
##      d135_f      d210_f   d164_176c   d202_215f   d249_264c   d249_264n 
##    9.622325 1476.635044    8.357920 1392.188280   42.030927   49.498059 
##      d225_f   d164_176n      d180_f   d184_199c      d165_f   d225_239f 
##   24.934002    8.680651   64.426426    4.445548  123.013064   10.007183 
##   d202_215n   d164_176f   d184_199n   d225_239n      d120_f 
##    3.295720  308.953460    2.174322    3.462451   12.413472

8.2 The “both” model

modelo_both %>% 
  vif()
##      d135_f      d210_f   d202_215f   d249_264c      d225_f      d165_f 
##    3.392059 1079.187361 1025.834367    1.790717   11.394063   25.281662 
##   d164_176f   d184_199n      d120_f 
##   31.493645    1.522807    4.021150

9 One more model…

modelo_final <- lm(yield_t_ha ~ d135_f + d249_264c + d184_199n + d120_f,
                   data = datos)

modelo_final %>% 
  summary()
## 
## Call:
## lm(formula = yield_t_ha ~ d135_f + d249_264c + d184_199n + d120_f, 
##     data = datos)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -4.661 -2.015  0.131  1.851  5.871 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)  
## (Intercept)  19.051115   9.338904   2.040   0.0563 .
## d135_f       37.469871  13.958867   2.684   0.0151 *
## d249_264c     0.010916   0.008431   1.295   0.2118  
## d184_199n   -15.412163   8.669814  -1.778   0.0924 .
## d120_f      -37.442870  17.427953  -2.148   0.0455 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.063 on 18 degrees of freedom
## Multiple R-squared:  0.3859, Adjusted R-squared:  0.2495 
## F-statistic: 2.828 on 4 and 18 DF,  p-value: 0.05554
  • RMSE:
predichos_modfin <- modelo_final$fitted.values

rmse_vec(truth = yield_real, estimate = predichos_modfin)
## [1] 2.70933
  • VIF:
modelo_final %>% 
  vif()
##    d135_f d249_264c d184_199n    d120_f 
##  2.573108  1.081566  1.072275  2.690228
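
To weigh the loss of fit against the gain in stability, a quick side-by-side comparison can help (a sketch reusing objects created above):

# Compare in-sample RMSE and AIC of the stepwise model and the reduced model
tibble(
  modelo = c("modelo_both", "modelo_final"),
  rmse   = c(rmse_vec(yield_real, modelo_both$fitted.values),
             rmse_vec(yield_real, predichos_modfin)),
  aic    = c(AIC(modelo_both), AIC(modelo_final))
)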

10 Resampling

10.1 Initial split

set.seed(2023)
particion <- initial_split(data = datos_modelos, prop = 0.8, strata = "yield_t_ha")
train <- training(particion)
test <- testing(particion)

10.2 Cross-validation

set.seed(2023)
k_fold <- vfold_cv(data = train, v = 5, strata = "yield_t_ha")

10.3 Modeling workflow

  • First we declare the model specification:
modelo_lm <- linear_reg()
  • Next we take the formula of our model; in this case we use the model obtained with the stepwise (“both”) procedure.
formula_modelo <- modelo_both$call %>% as.formula()
  • Then we build the workflow and fit it with resampling:
flujo_lm <- workflow() %>% 
  add_model(modelo_lm) %>% 
  add_formula(formula_modelo)

set.seed(2023)
ajuste_lm <- flujo_lm %>% 
  fit_resamples(k_fold)
  • Results of the resampled fit:
ajuste_lm %>% 
  collect_metrics()
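
A natural next step, shown here only as a sketch (it is not part of the original analysis), is to fit the workflow on the full training set and evaluate it once on the held-out test set:

ajuste_final <- flujo_lm %>% 
  last_fit(particion)   # fit on the training data, evaluate on the test data

ajuste_final %>% 
  collect_metrics()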