library(tidyverse) # Manipulación y visualización de datos
library(agridat) # Base de datos de ejemplo
library(ggpubr) # Gráfico cuantil-cuantil
library(yardstick) # Métricas de error: R-cuadrado y para el RMSE
library(plotly) # Gráficos interactivos
library(rsample)
<- hernandez.nitrogen
datos %>% head() datos
ggqqplot(datos$yield)
%>%
datos ggplot(aes(x = yield)) +
geom_density()
\[\rho_{(X,Y)} = \frac{Cov_{(X,Y)}}{\sigma_X\times\sigma_Y} = \frac{\sum_{i=1}^{n}(X_i-\mu_X)(Y_i-\mu_Y)}{\sigma_X\times\sigma_Y}\]
<- cov(x = datos$nitro, y = datos$yield)
covarianza <- sd(datos$nitro)
desv_nitro <- sd(datos$yield)
desv_yield
/ (desv_nitro * desv_yield) covarianza
## [1] 0.7159447
cor(x = datos$nitro, y = datos$yield, method = "pearson")
## [1] 0.7159447
%>%
datos ggplot(aes(x = nitro, y = yield)) +
geom_point()
\[Y_i = \beta_0\ +\ \beta_1X\] \[\hat{Y_i} = \hat{\beta_0}\ + \hat{\beta_1}X_i\ + \hat{\epsilon_i}\]
\[Y_i = E(Y|X_i)\ +\ \epsilon_i\\\]
\[Y_i = E(Y|X_i)\ +\ \epsilon_i\\ Y_i = \beta_0 +\ \beta_1X_i\ +\ \epsilon_i\]
<- lm(yield ~ nitro, data = datos)
modelo summary(modelo)
##
## Call:
## lm(formula = yield ~ nitro, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1752 -0.8803 0.1086 1.1148 3.4049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.872032 0.251752 35.24 <2e-16 ***
## nitro 0.023434 0.001974 11.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.674 on 134 degrees of freedom
## Multiple R-squared: 0.5126, Adjusted R-squared: 0.5089
## F-statistic: 140.9 on 1 and 134 DF, p-value: < 2.2e-16
\[Y_i = 8.87203185 + 0.02343431 \times Dosis\]
%>% coef() modelo
## (Intercept) nitro
## 8.87203185 0.02343431
%>% confint(level = 0.95) modelo
## 2.5 % 97.5 %
## (Intercept) 8.37411051 9.36995320
## nitro 0.01952984 0.02733877
<- data.frame(nitro = c(122.4, 12.5, 33.8))
nuevos_datos predict(modelo, newdata = nuevos_datos)
## 1 2 3
## 11.740391 9.164961 9.664111
predict(modelo, newdata = nuevos_datos, interval = "confidence")
## fit lwr upr
## 1 11.740391 11.448177 12.032605
## 2 9.164961 8.706284 9.623637
## 3 9.664111 9.267389 10.060834
\[\epsilon_i\ \overset{\text{i.i.d.}}\sim\ N(\mu = 0,\ \sigma)\]
<- datos$yield
yield_real <- modelo$fitted.values
yield_pred <- yield_real - yield_pred
residuales %>% head() residuales
## 1 2 3 4 5 6
## -5.1751819 -4.9939019 -4.4833319 -4.2326419 -2.7606746 -0.8994646
residuals(modelo) %>% head()
## 1 2 3 4 5 6
## -5.1751819 -4.9939019 -4.4833319 -4.2326419 -2.7606746 -0.8994646
<- bind_cols(real = yield_real, estimado = yield_pred)
df_real_estimado %>% head() df_real_estimado
%>%
df_real_estimado ggplot(aes(x = real, y = estimado)) +
geom_point()
ggqqplot(residuales)
%>%
residuales enframe(value = "residual") %>%
ggplot(aes(x = residual)) +
geom_density()
%>% mean() residuales
## [1] 1.232164e-17
%>%
df_real_estimado mutate(residual = residuales) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
\[ RMSE = \sqrt{\frac{\sum_{i=1}^{n}(y_i - \hat{y_i})^2}{n}} \]
Donde \(y_i\) es el valor real y \(\hat{y_i}\) es el valor predicho o estimado por el modelo.
sqrt(sum((yield_real - yield_pred)^2) / length(yield_real))
## [1] 1.662047
rmse_vec(truth = yield_real, estimate = yield_pred)
## [1] 1.662047
\[ R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y})^2}{\sum_{i = 1}^{n} (y_i - \bar{y})^2} \]
1 - ( sum((yield_real - yield_pred)^2) / sum((yield_real - mean(yield_real))^2) )
## [1] 0.5125768
rsq_vec(truth = yield_real, estimate = yield_pred)
## [1] 0.5125768
cor(x = datos$nitro, y = datos$yield, method = "pearson")^2
## [1] 0.5125768
%>%
datos ggplot(aes(x = nitro, y = yield)) +
geom_point() +
geom_smooth(method = "lm")
%>%
datos ggplot(aes(x = nitro, y = log(yield))) +
geom_point() +
geom_smooth(method = "lm")
<- lm(log(yield) ~ nitro, data = datos)
modelo_log %>% summary() modelo_log
##
## Call:
## lm(formula = log(yield) ~ nitro, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84331 -0.07030 0.03508 0.11883 0.28701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1507960 0.0283609 75.84 <2e-16 ***
## nitro 0.0023708 0.0002224 10.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1886 on 134 degrees of freedom
## Multiple R-squared: 0.4589, Adjusted R-squared: 0.4549
## F-statistic: 113.6 on 1 and 134 DF, p-value: < 2.2e-16
exp(2.1507960) # B0
## [1] 8.591695
exp(0.0023708) # B1
## [1] 1.002374
<- modelo_log$fitted.values %>% exp()
yield_pred_log <- yield_real
yield_real_log
rmse_vec(truth = yield_real_log, estimate = yield_pred_log)
## [1] 1.78757
rsq_vec(truth = yield_real_log, estimate = yield_pred_log)
## [1] 0.4610944
<- bind_cols(real = yield_real_log, estimado = yield_pred_log)
df_real_estimado_log %>% head() df_real_estimado_log
%>%
df_real_estimado_log ggplot(aes(x = real, y = estimado)) +
geom_point()
ggqqplot(residuals(modelo_log))
%>%
df_real_estimado_log mutate(residual = residuals(modelo_log)) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")
<- seq(from = 0, to = 300, length.out = 100)
dosis_nitrogeno %>% head() dosis_nitrogeno
## [1] 0.000000 3.030303 6.060606 9.090909 12.121212 15.151515
<- data.frame(nitro = dosis_nitrogeno)
nuevas_dosis %>% head() nuevas_dosis
<- predict(modelo, newdata = nuevas_dosis)
pred_original <- predict(modelo_log, newdata = nuevas_dosis) %>% exp() pred_logaritmos
ggplotly(
%>%
nuevas_dosis mutate(pred_orig = pred_original,
pred_log = pred_logaritmos) %>%
pivot_longer(cols = -nitro, names_to = "modelo", values_to = "pred_kg") %>%
ggplot(aes(x = nitro, y = pred_kg, color = modelo)) +
geom_line()
)
\[ (X^TX)^{-1}X^TY \]
<- cbind(1, datos$nitro)
matriz_x %>% head() matriz_x
## [,1] [,2]
## [1,] 1 0.0
## [2,] 1 0.0
## [3,] 1 0.0
## [4,] 1 0.0
## [5,] 1 33.6
## [6,] 1 33.6
model.matrix(yield ~ nitro, data = datos) %>% head()
## (Intercept) nitro
## 1 1 0.0
## 2 1 0.0
## 3 1 0.0
## 4 1 0.0
## 5 1 33.6
## 6 1 33.6
<- datos$yield
vector_y %>% head() vector_y
## [1] 3.69685 3.87813 4.38870 4.63939 6.89875 8.75996
<- matriz_x %>% t()
x_transpuesta %>% head() x_transpuesta
## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 1 1 1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 0 0 0 0 33.6 33.6 33.6 33.6 67.2 67.2 67.2 67.2 100.8 100.8
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1 1 1 1 1.0 1.0
## [2,] 100.8 100.8 134.4 134.4 134.4 134.4 168 168 168 168 201.6 201.6
## [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38]
## [1,] 1.0 1.0 1 1 1 1 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 201.6 201.6 0 0 0 0 33.6 33.6 33.6 33.6 67.2 67.2
## [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1 1
## [2,] 67.2 67.2 100.8 100.8 100.8 100.8 134.4 134.4 134.4 134.4 168 168
## [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
## [1,] 1 1 1.0 1.0 1.0 1.0 1 1 1 1 1.0 1.0
## [2,] 168 168 201.6 201.6 201.6 201.6 0 0 0 0 33.6 33.6
## [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 33.6 33.6 67.2 67.2 67.2 67.2 100.8 100.8 100.8 100.8 134.4 134.4
## [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86]
## [1,] 1.0 1.0 1 1 1 1 1.0 1.0 1.0 1.0 1 1
## [2,] 134.4 134.4 168 168 168 168 201.6 201.6 201.6 201.6 0 0
## [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98]
## [1,] 1 1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 0 0 44.8 44.8 44.8 44.8 89.6 89.6 89.6 89.6 134.4 134.4
## [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1 1 1 1
## [2,] 134.4 134.4 179.2 179.2 179.2 179.2 224 224 224 224
## [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117] [,118]
## [1,] 1.0 1.0 1.0 1.0 1 1 1 1 1.0 1.0
## [2,] 268.8 268.8 268.8 268.8 0 0 0 0 33.6 33.6
## [,119] [,120] [,121] [,122] [,123] [,124] [,125] [,126] [,127] [,128]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 33.6 33.6 67.2 67.2 67.2 67.2 100.8 100.8 100.8 100.8
## [,129] [,130] [,131] [,132] [,133] [,134] [,135] [,136]
## [1,] 1.0 1.0 1.0 1.0 1 1 1 1
## [2,] 134.4 134.4 134.4 134.4 168 168 168 168
<- x_transpuesta %*% matriz_x
x_transx %>% head() x_transx
## [,1] [,2]
## [1,] 136.0 14246.4
## [2,] 14246.4 2211758.1
<- x_transx %>% solve()
inversa_x_tranx inversa_x_tranx
## [,1] [,2]
## [1,] 0.0226060556 -1.456104e-04
## [2,] -0.0001456104 1.390036e-06
<- x_transpuesta %*% vector_y
transx_y transx_y
## [,1]
## [1,] 1540.451
## [2,] 178225.535
<- inversa_x_tranx %*% transx_y
betas betas
## [,1]
## [1,] 8.87203185
## [2,] 0.02343431
set.seed(2023)
<- initial_split(data = datos, prop = 0.7, strata = "yield")
particion <- training(particion)
datos_train <- testing(particion) datos_test
particion
## <Training/Testing/Total>
## <92/44/136>
%>%
datos_train summarise(mean(yield))
%>%
datos_test summarise(mean(yield))
<- vfold_cv(data = datos_train, v = 5, strata = "yield")
k_fold k_fold
$splits[[1]] %>% training() k_fold