library(tidyverse) # Manipulación y visualización de datos
library(agridat) # Base de datos de ejemplo
library(ggpubr) # Gráfico cuantil-cuantil
library(yardstick) # Métricas de error: R-cuadrado y para el RMSE
library(plotly) # Gráficos interactivos
library(rsample) # Remuestreo - Validación cruzada
library(ggeffects) # Gráficos de estimaciones de modelos
<- hernandez.nitrogen
datos %>% head() datos
%>%
datos group_by(site) %>%
summarise(promedio = mean(yield))
%>%
datos ggplot(aes(x = site, y = yield)) +
geom_boxplot()
%>%
datos ggplot(aes(x = nitro, y = yield, color = site)) +
geom_point() +
geom_smooth(method = "lm", se = FALSE)
%>%
datos ggplot(aes(x = nitro, y = yield)) +
facet_wrap(~site, scales = "free") +
geom_point() +
geom_smooth(method = "lm")
\[Y_i = \beta_0\ +\ \beta_1X_1 + \beta_{2}X_2\]
<- lm(yield ~ nitro, data = datos)
modelo1 <- lm(yield ~ site, data = datos)
modelo2 <- lm(yield ~ nitro + site, data = datos)
modelo3 summary(modelo3)
##
## Call:
## lm(formula = yield ~ nitro + site, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -3.4665 -0.9888 0.0308 0.9561 3.2666
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.163309 0.306519 23.370 < 2e-16 ***
## nitro 0.024889 0.001649 15.095 < 2e-16 ***
## siteS2 1.812934 0.364226 4.977 2.00e-06 ***
## siteS3 2.072701 0.364226 5.691 8.00e-08 ***
## siteS4 1.075991 0.368415 2.921 0.00412 **
## siteS5 3.030779 0.380109 7.973 6.92e-13 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.363 on 130 degrees of freedom
## Multiple R-squared: 0.6867, Adjusted R-squared: 0.6747
## F-statistic: 57 on 5 and 130 DF, p-value: < 2.2e-16
<- model.matrix(yield ~ nitro + site, data = datos)
mtx_modelo %>% head() mtx_modelo
## (Intercept) nitro siteS2 siteS3 siteS4 siteS5
## 1 1 0.0 0 0 0 0
## 2 1 0.0 0 0 0 0
## 3 1 0.0 0 0 0 0
## 4 1 0.0 0 0 0 0
## 5 1 33.6 0 0 0 0
## 6 1 33.6 0 0 0 0
La ecucación del modelo de regresión lineal múltiple con variables categóricas puede ser representado de la siguiente manera:
\[ Y_i = 7.163309 +\ (0.024889 \times Nitro_i) + (1.812934 \times SiteS2_i) + \\ (2.072701 \times SiteS3_i) + (1.075991 \times SiteS4_i) + (3.030779 \times SiteS5_i) \]
7.163309 + (0.024889 * 30) + (1.812934 * 0 ) + (2.072701 * 0) + (1.075991 * 0) + (3.030779 * 1)
## [1] 10.94076
predict(modelo3, newdata = data.frame(nitro = 30, site = "S5"))
## 1
## 10.94075
<- datos$yield
yield_real <- modelo3$fitted.values
yield_pred <- residuals(modelo3)
residuales %>% head() residuales
## 1 2 3 4 5 6
## -3.4664585 -3.2851785 -2.7746085 -2.5239185 -1.1008243 0.7603857
<- bind_cols(real = yield_real, estimado = yield_pred)
df_real_estimado
%>%
df_real_estimado ggplot(aes(x = real, y = estimado)) +
geom_point()
ggqqplot(residuales)
%>%
residuales enframe(value = "residual") %>%
ggplot(aes(x = residual)) +
geom_density()
%>%
df_real_estimado mutate(residual = residuales) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
<- modelo1$fitted.values
predichos_mod1 <- modelo2$fitted.values
predichos_mod2 <- modelo3$fitted.values
predichos_mod3
rmse_vec(truth = yield_real, estimate = predichos_mod1)
## [1] 1.662047
rmse_vec(truth = yield_real, estimate = predichos_mod2)
## [1] 2.210637
rmse_vec(truth = yield_real, estimate = predichos_mod3)
## [1] 1.332407
%>%
modelo1 ggpredict(terms = c("nitro")) %>%
plot()
%>%
modelo2 ggpredict(terms = c("site")) %>%
plot()
%>%
modelo3 ggpredict(terms = c("nitro", "site")) %>%
plot(facet = FALSE)
\[Y_i = \beta_0\ +\ \beta_1X_i + \beta_{2}X_i^2 + \beta_{3}X_i^3 + ... + \beta_{n}X_i^p\]
# Nota: los resultados son equivalentes
<- lm(yield ~ nitro + I(nitro^2) + site, data = datos)
modelo4
.2 <- lm(yield ~ nitro + nitro2 + site,
modelo4data = datos %>% mutate(nitro2 = nitro^2))
.3 <- lm(yield ~ poly(nitro, degree = 2, raw = TRUE) + site,
modelo4data = datos)
# Nota: los resultados son equivalentes
%>%
modelo4 summary()
##
## Call:
## lm(formula = yield ~ nitro + I(nitro^2) + site, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24723 -0.69886 0.00713 0.76629 2.53071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9440772 0.2698833 22.025 < 2e-16 ***
## nitro 0.0607110 0.0040100 15.140 < 2e-16 ***
## I(nitro^2) -0.0001630 0.0000173 -9.421 2.34e-16 ***
## siteS2 1.8129336 0.2814192 6.442 2.14e-09 ***
## siteS3 2.0727014 0.2814192 7.365 1.86e-11 ***
## siteS4 1.7325330 0.2930622 5.912 2.85e-08 ***
## siteS5 2.9273633 0.2938964 9.961 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.053 on 129 degrees of freedom
## Multiple R-squared: 0.8144, Adjusted R-squared: 0.8058
## F-statistic: 94.36 on 6 and 129 DF, p-value: < 2.2e-16
.2 %>%
modelo4summary()
##
## Call:
## lm(formula = yield ~ nitro + nitro2 + site, data = datos %>%
## mutate(nitro2 = nitro^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24723 -0.69886 0.00713 0.76629 2.53071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9440772 0.2698833 22.025 < 2e-16 ***
## nitro 0.0607110 0.0040100 15.140 < 2e-16 ***
## nitro2 -0.0001630 0.0000173 -9.421 2.34e-16 ***
## siteS2 1.8129336 0.2814192 6.442 2.14e-09 ***
## siteS3 2.0727014 0.2814192 7.365 1.86e-11 ***
## siteS4 1.7325330 0.2930622 5.912 2.85e-08 ***
## siteS5 2.9273633 0.2938964 9.961 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.053 on 129 degrees of freedom
## Multiple R-squared: 0.8144, Adjusted R-squared: 0.8058
## F-statistic: 94.36 on 6 and 129 DF, p-value: < 2.2e-16
.3 %>%
modelo4summary()
##
## Call:
## lm(formula = yield ~ poly(nitro, degree = 2, raw = TRUE) + site,
## data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.24723 -0.69886 0.00713 0.76629 2.53071
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 5.9440772 0.2698833 22.025 < 2e-16 ***
## poly(nitro, degree = 2, raw = TRUE)1 0.0607110 0.0040100 15.140 < 2e-16 ***
## poly(nitro, degree = 2, raw = TRUE)2 -0.0001630 0.0000173 -9.421 2.34e-16 ***
## siteS2 1.8129336 0.2814192 6.442 2.14e-09 ***
## siteS3 2.0727014 0.2814192 7.365 1.86e-11 ***
## siteS4 1.7325330 0.2930622 5.912 2.85e-08 ***
## siteS5 2.9273633 0.2938964 9.961 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.053 on 129 degrees of freedom
## Multiple R-squared: 0.8144, Adjusted R-squared: 0.8058
## F-statistic: 94.36 on 6 and 129 DF, p-value: < 2.2e-16
%>%
modelo4 ggpredict(terms = c("nitro", "site")) %>%
plot(facet = FALSE)
<- lm(yield ~ nitro + I(nitro^2) + + I(nitro^3) + site, data = datos) modelo5
<- modelo4$fitted.values
predichos_mod4 <- modelo5$fitted.values
predichos_mod5
rmse_vec(truth = yield_real, estimate = predichos_mod1)
## [1] 1.662047
rmse_vec(truth = yield_real, estimate = predichos_mod2)
## [1] 2.210637
rmse_vec(truth = yield_real, estimate = predichos_mod3)
## [1] 1.332407
rmse_vec(truth = yield_real, estimate = predichos_mod4)
## [1] 1.025518
rmse_vec(truth = yield_real, estimate = predichos_mod5)
## [1] 0.9871166
<- modelo5$fitted.values
yield_pred_poli <- residuals(modelo5)
residuales_poli %>% head() residuales_poli
## 1 2 3 4 5 6
## -1.9817800 -1.8005000 -1.2899300 -1.0392400 -1.1295138 0.7316962
<- bind_cols(real = yield_real, estimado = yield_pred_poli)
df_real_estimado_poli
%>%
df_real_estimado_poli ggplot(aes(x = real, y = estimado)) +
geom_point()
ggqqplot(residuales_poli)
%>%
residuales_poli enframe(value = "residual") %>%
ggplot(aes(x = residual)) +
geom_density()
%>%
df_real_estimado_poli mutate(residual = residuales) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
%>%
modelo5 ggpredict(terms = c("nitro", "site")) %>%
plot(facet = FALSE)