library(tidyverse) # Manipulación y visualización de datos
library(readxl) # Leer archivos de excel (.xlsx)
library(janitor) # Edición rápida de nombres de columnas
library(skimr) # Resumen descriptivo
library(ggpubr) # Gráfico cuantil-cuantil
library(qqplotr) # Gráfico cuantil-cuantil
library(corrplot) # Gráfico de correlaciones
library(yardstick) # Métricas de error: R-cuadrado y para el RMSE
library(car) # VIF: Factor Inflacionario de Varianza
library(rsample) # Remuestreo - validación cruzada
library(parsnip) # Ajuste de modelos con remuestreo
library(workflows) # Ajuste de modelos con remuestreo
library(tune) # Ajuste de modelos con remuestreo
<- read_excel("datos-sorgo.xlsx", skip = 1) %>%
datos clean_names() %>%
slice(1:23) %>%
select(-fieldname) %>%
relocate(yield_t_ha, everything())
%>% head() datos
%>%
datos skim()
Name | Piped data |
Number of rows | 23 |
Number of columns | 25 |
_______________________ | |
Column type frequency: | |
numeric | 25 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
---|---|---|---|---|---|---|---|---|---|---|
yield_t_ha | 0 | 1 | 10.82 | 3.54 | 3.97 | 9.21 | 10.52 | 13.19 | 17.47 | ▃▂▇▂▃ |
d120_f | 0 | 1 | 0.20 | 0.06 | 0.09 | 0.16 | 0.18 | 0.25 | 0.31 | ▃▆▇▁▅ |
d135_f | 0 | 1 | 0.20 | 0.08 | 0.06 | 0.14 | 0.20 | 0.26 | 0.35 | ▅▇▇▇▂ |
d150_f | 0 | 1 | 0.27 | 0.12 | 0.10 | 0.20 | 0.25 | 0.32 | 0.53 | ▃▇▅▁▂ |
d165_f | 0 | 1 | 0.46 | 0.14 | 0.15 | 0.37 | 0.48 | 0.56 | 0.67 | ▂▂▇▇▇ |
d180_f | 0 | 1 | 0.67 | 0.15 | 0.27 | 0.67 | 0.71 | 0.75 | 0.81 | ▁▁▁▃▇ |
d195_f | 0 | 1 | 0.76 | 0.12 | 0.30 | 0.74 | 0.79 | 0.82 | 0.90 | ▁▁▁▆▇ |
d210_f | 0 | 1 | 0.78 | 0.13 | 0.25 | 0.76 | 0.80 | 0.84 | 0.91 | ▁▁▁▃▇ |
d225_f | 0 | 1 | 0.72 | 0.15 | 0.24 | 0.66 | 0.73 | 0.83 | 0.90 | ▁▁▃▇▇ |
d240_f | 0 | 1 | 0.58 | 0.24 | 0.09 | 0.46 | 0.63 | 0.78 | 0.88 | ▃▁▅▃▇ |
d164_176f | 0 | 1 | 0.54 | 0.15 | 0.17 | 0.49 | 0.59 | 0.62 | 0.74 | ▂▂▂▇▅ |
d184_199f | 0 | 1 | 0.74 | 0.13 | 0.29 | 0.72 | 0.78 | 0.81 | 0.87 | ▁▁▁▅▇ |
d202_215f | 0 | 1 | 0.78 | 0.13 | 0.26 | 0.77 | 0.81 | 0.83 | 0.90 | ▁▁▁▃▇ |
d225_239f | 0 | 1 | 0.67 | 0.18 | 0.23 | 0.54 | 0.70 | 0.81 | 0.92 | ▂▁▆▃▇ |
d249_264f | 0 | 1 | 0.38 | 0.27 | 0.05 | 0.13 | 0.31 | 0.61 | 0.87 | ▇▅▁▃▃ |
d164_176c | 0 | 1 | 423.50 | 102.14 | 201.65 | 358.40 | 422.18 | 478.92 | 647.42 | ▂▅▇▅▃ |
d184_199c | 0 | 1 | 558.19 | 81.57 | 381.18 | 534.35 | 546.88 | 576.08 | 724.02 | ▁▂▇▂▂ |
d202_215c | 0 | 1 | 547.44 | 138.39 | 367.37 | 438.45 | 519.90 | 621.68 | 864.40 | ▇▇▃▃▂ |
d225_239c | 0 | 1 | 484.39 | 168.80 | 204.84 | 395.40 | 455.57 | 533.76 | 875.09 | ▃▇▅▂▂ |
d249_264c | 0 | 1 | 444.14 | 80.54 | 262.00 | 439.22 | 439.22 | 439.22 | 716.07 | ▁▇▁▁▁ |
d164_176n | 0 | 1 | 0.79 | 0.09 | 0.47 | 0.79 | 0.81 | 0.84 | 0.89 | ▁▁▁▃▇ |
d184_199n | 0 | 1 | 0.83 | 0.08 | 0.56 | 0.84 | 0.86 | 0.87 | 0.89 | ▁▁▁▁▇ |
d202_215n | 0 | 1 | 0.85 | 0.02 | 0.81 | 0.83 | 0.85 | 0.88 | 0.90 | ▅▇▇▅▃ |
d225_239n | 0 | 1 | 0.80 | 0.08 | 0.62 | 0.76 | 0.82 | 0.86 | 0.90 | ▂▃▃▇▇ |
d249_264n | 0 | 1 | 0.81 | 0.03 | 0.73 | 0.81 | 0.81 | 0.81 | 0.89 | ▁▁▇▁▁ |
%>%
datos pivot_longer(cols = everything(),
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()
%>%
datos pivot_longer(cols = everything(),
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()
%>%
datos cor(method = "spearman") %>%
corrplot()
%>%
datos cor(method = "spearman") %>%
corrplot(diag = FALSE,
type = "lower",
method = "pie",
tl.col = "black",
tl.srt = 35)
%>%
datos pivot_longer(cols = -yield_t_ha,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor, y = yield_t_ha)) +
facet_wrap(~variable, scales = "free") +
geom_point() +
geom_smooth(method = "lm", se = TRUE)
\[Y_i = \beta_0\ +\ \beta_1X_1 + \beta_{2}X_2 + ... + \beta_{24}X_{24}\]
<- lm(yield_t_ha ~ ., data = datos)
modelo0 %>%
modelo0 summary()
##
## Call:
## lm(formula = yield_t_ha ~ ., data = datos)
##
## Residuals:
## ALL 23 residuals are 0: no residual degrees of freedom!
##
## Coefficients: (2 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.855e+02 NaN NaN NaN
## d120_f 1.871e+02 NaN NaN NaN
## d135_f -7.367e+01 NaN NaN NaN
## d150_f -1.785e+02 NaN NaN NaN
## d165_f 2.370e+02 NaN NaN NaN
## d180_f -3.853e+02 NaN NaN NaN
## d195_f -3.948e+02 NaN NaN NaN
## d210_f 6.477e+02 NaN NaN NaN
## d225_f 1.986e+02 NaN NaN NaN
## d240_f 1.135e+02 NaN NaN NaN
## d164_176f -9.973e+01 NaN NaN NaN
## d184_199f 9.005e+02 NaN NaN NaN
## d202_215f -1.018e+03 NaN NaN NaN
## d225_239f -1.467e+02 NaN NaN NaN
## d249_264f 3.490e+01 NaN NaN NaN
## d164_176c 2.398e-01 NaN NaN NaN
## d184_199c -4.600e-02 NaN NaN NaN
## d202_215c -1.401e-01 NaN NaN NaN
## d225_239c 2.784e-02 NaN NaN NaN
## d249_264c 1.038e-01 NaN NaN NaN
## d164_176n -2.174e+02 NaN NaN NaN
## d184_199n -5.926e+01 NaN NaN NaN
## d202_215n 4.568e+02 NaN NaN NaN
## d225_239n NA NA NA NA
## d249_264n NA NA NA NA
##
## Residual standard error: NaN on 0 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: NaN
## F-statistic: NaN on 22 and 0 DF, p-value: NA
<-
corr_predictoras %>%
datos pivot_longer(cols = -yield_t_ha,
names_to = "variable",
values_to = "valor") %>%
group_by(variable) %>%
summarise(correlacion = cor(yield_t_ha, valor, method = "spearman")) %>%
ungroup() %>%
arrange(desc(abs(correlacion))) %>%
filter(abs(correlacion) > 0.10)
corr_predictoras
<- corr_predictoras$variable
variables_predictoras
<- datos %>%
datos_modelos select(yield_t_ha, variables_predictoras)
<- lm(yield_t_ha ~ ., data = datos_modelos)
modelo1 %>%
modelo1 summary()
##
## Call:
## lm(formula = yield_t_ha ~ ., data = datos_modelos)
##
## Residuals:
## 1 2 3 4 5 6 7 8
## -0.90729 -0.23264 -1.53765 0.59465 0.86602 -0.40578 0.46581 0.36603
## 9 10 11 12 13 14 15 16
## -0.23752 -1.20546 3.38436 -0.88901 -3.05385 -1.36101 1.50675 0.67418
## 17 18 19 20 21 22 23
## 1.47817 -0.54850 0.09765 -3.20575 4.69336 -0.03990 -0.50262
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.120e+01 1.294e+02 0.164 0.876
## d135_f 3.818e+01 3.240e+01 1.178 0.292
## d210_f -2.814e+02 2.334e+02 -1.205 0.282
## d164_176c -8.917e-03 2.218e-02 -0.402 0.704
## d202_215f 3.118e+02 2.306e+02 1.352 0.234
## d249_264c 3.961e-02 6.309e-02 0.628 0.558
## d249_264n -4.830e+01 1.839e+02 -0.263 0.803
## d225_f -1.586e+01 2.595e+01 -0.611 0.568
## d164_176n 1.125e+01 2.483e+01 0.453 0.669
## d180_f -5.504e+00 4.226e+01 -0.130 0.901
## d184_199c 5.756e-03 2.026e-02 0.284 0.788
## d165_f 3.150e+01 6.253e+01 0.504 0.636
## d225_239f -9.131e-01 1.341e+01 -0.068 0.948
## d202_215n 2.829e+01 5.973e+01 0.474 0.656
## d164_176f -3.472e+01 9.345e+01 -0.372 0.725
## d184_199n -2.624e+01 1.482e+01 -1.771 0.137
## d225_239n -3.554e+00 1.884e+01 -0.189 0.858
## d120_f -3.964e+01 4.494e+01 -0.882 0.418
##
## Residual standard error: 3.676 on 5 degrees of freedom
## Multiple R-squared: 0.7542, Adjusted R-squared: -0.0814
## F-statistic: 0.9026 on 17 and 5 DF, p-value: 0.6074
<- step(modelo1, direction = "forward", trace = 0)
modelo_forward <- step(modelo1, direction = "backward", trace = 0)
modelo_backward <- step(modelo1, direction = "both", trace = 0) modelo_both
<- datos$yield_t_ha
yield_real
<- modelo1$fitted.values
predichos_mod1 <- modelo_forward$fitted.values
predichos_mod_forw <- modelo_backward$fitted.values
predichos_mod_back <- modelo_both$fitted.values
predichos_mod_both
rmse_vec(truth = yield_real, estimate = predichos_mod1)
## [1] 1.714032
rmse_vec(truth = yield_real, estimate = predichos_mod_forw)
## [1] 1.714032
rmse_vec(truth = yield_real, estimate = predichos_mod_back)
## [1] 1.889987
rmse_vec(truth = yield_real, estimate = predichos_mod_both)
## [1] 1.889987
AIC(modelo1, modelo_backward, modelo_forward, modelo_both)
BIC(modelo1, modelo_backward, modelo_forward, modelo_both)
%>% summary() modelo_both
##
## Call:
## lm(formula = yield_t_ha ~ d135_f + d210_f + d202_215f + d249_264c +
## d225_f + d165_f + d164_176f + d184_199n + d120_f, data = datos_modelos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.7288 -0.9446 0.0777 0.4549 4.9969
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.712e+01 9.433e+00 1.815 0.09273 .
## d135_f 3.369e+01 1.316e+01 2.561 0.02370 *
## d210_f -3.021e+02 1.365e+02 -2.214 0.04531 *
## d202_215f 3.292e+02 1.353e+02 2.432 0.03022 *
## d249_264c 2.788e-02 8.905e-03 3.131 0.00795 **
## d225_f -1.606e+01 1.199e+01 -1.339 0.20357
## d165_f 2.977e+01 1.938e+01 1.536 0.14859
## d164_176f -3.789e+01 2.040e+01 -1.857 0.08607 .
## d184_199n -2.781e+01 8.481e+00 -3.279 0.00598 **
## d120_f -2.550e+01 1.749e+01 -1.458 0.16857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.514 on 13 degrees of freedom
## Multiple R-squared: 0.7012, Adjusted R-squared: 0.4943
## F-statistic: 3.389 on 9 and 13 DF, p-value: 0.02296
<- modelo_both$fitted.values
yield_pred <- residuals(modelo_both)
residuales %>% head() residuales
## 1 2 3 4 5 6
## 0.077654740 0.156722686 -1.292019238 -0.392211413 0.586282167 0.003167574
<- bind_cols(real = yield_real, estimado = yield_pred)
df_real_estimado
%>%
df_real_estimado ggplot(aes(x = real, y = estimado)) +
geom_point()
ggqqplot(residuales)
%>%
residuales enframe(value = "residual") %>%
ggplot(aes(x = residual)) +
geom_density()
%>%
df_real_estimado mutate(residual = residuales) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
%>%
modelo1 vif()
## d135_f d210_f d164_176c d202_215f d249_264c d249_264n
## 9.622325 1476.635044 8.357920 1392.188280 42.030927 49.498059
## d225_f d164_176n d180_f d184_199c d165_f d225_239f
## 24.934002 8.680651 64.426426 4.445548 123.013064 10.007183
## d202_215n d164_176f d184_199n d225_239n d120_f
## 3.295720 308.953460 2.174322 3.462451 12.413472
%>%
modelo_both vif()
## d135_f d210_f d202_215f d249_264c d225_f d165_f
## 3.392059 1079.187361 1025.834367 1.790717 11.394063 25.281662
## d164_176f d184_199n d120_f
## 31.493645 1.522807 4.021150
<- lm(yield_t_ha ~ d135_f + d249_264c + d184_199n + d120_f,
modelo_final data = datos)
%>%
modelo_final summary()
##
## Call:
## lm(formula = yield_t_ha ~ d135_f + d249_264c + d184_199n + d120_f,
## data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -4.661 -2.015 0.131 1.851 5.871
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 19.051115 9.338904 2.040 0.0563 .
## d135_f 37.469871 13.958867 2.684 0.0151 *
## d249_264c 0.010916 0.008431 1.295 0.2118
## d184_199n -15.412163 8.669814 -1.778 0.0924 .
## d120_f -37.442870 17.427953 -2.148 0.0455 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.063 on 18 degrees of freedom
## Multiple R-squared: 0.3859, Adjusted R-squared: 0.2495
## F-statistic: 2.828 on 4 and 18 DF, p-value: 0.05554
<- modelo_final$fitted.values
predichos_modfin
rmse_vec(truth = yield_real, estimate = predichos_modfin)
## [1] 2.70933
%>%
modelo_final vif()
## d135_f d249_264c d184_199n d120_f
## 2.573108 1.081566 1.072275 2.690228
set.seed(2023)
<- initial_split(data = datos_modelos, prop = 0.8, strata = "yield_t_ha")
particion <- training(particion)
train <- testing(particion) test
set.seed(2023)
<- vfold_cv(data = train, v = 5, strata = "yield_t_ha") k_fold
<- linear_reg() modelo_lm
<- modelo_both$call %>% as.formula() formula_modelo
<- workflow() %>%
flujo_lm add_model(modelo_lm) %>%
add_formula(formula_modelo)
set.seed(2023)
<- flujo_lm %>%
ajuste_lm fit_resamples(k_fold)
%>%
ajuste_lm collect_metrics()