library(tidyverse) # Manipulación y visualización de datos
library(agridat) # Base de datos de ejemplo
library(ggpubr) # Gráfico cuantil-cuantil
library(yardstick) # Métricas de error: R-cuadrado y para el RMSE
library(plotly) # Gráficos interactivos
library(rsample)datos <- hernandez.nitrogen
datos %>% head()ggqqplot(datos$yield)datos %>%
ggplot(aes(x = yield)) +
geom_density()\[\rho_{(X,Y)} = \frac{Cov_{(X,Y)}}{\sigma_X\times\sigma_Y} = \frac{\sum_{i=1}^{n}(X_i-\mu_X)(Y_i-\mu_Y)}{\sigma_X\times\sigma_Y}\]
covarianza <- cov(x = datos$nitro, y = datos$yield)
desv_nitro <- sd(datos$nitro)
desv_yield <- sd(datos$yield)
covarianza / (desv_nitro * desv_yield)## [1] 0.7159447
cor(x = datos$nitro, y = datos$yield, method = "pearson")## [1] 0.7159447
datos %>%
ggplot(aes(x = nitro, y = yield)) +
geom_point()\[Y_i = \beta_0\ +\ \beta_1X\] \[\hat{Y_i} = \hat{\beta_0}\ + \hat{\beta_1}X_i\ + \hat{\epsilon_i}\]
\[Y_i = E(Y|X_i)\ +\ \epsilon_i\\\]
\[Y_i = E(Y|X_i)\ +\ \epsilon_i\\ Y_i = \beta_0 +\ \beta_1X_i\ +\ \epsilon_i\]
modelo <- lm(yield ~ nitro, data = datos)
summary(modelo)##
## Call:
## lm(formula = yield ~ nitro, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.1752 -0.8803 0.1086 1.1148 3.4049
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.872032 0.251752 35.24 <2e-16 ***
## nitro 0.023434 0.001974 11.87 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.674 on 134 degrees of freedom
## Multiple R-squared: 0.5126, Adjusted R-squared: 0.5089
## F-statistic: 140.9 on 1 and 134 DF, p-value: < 2.2e-16
\[Y_i = 8.87203185 + 0.02343431 \times Dosis\]
modelo %>% coef()## (Intercept) nitro
## 8.87203185 0.02343431
modelo %>% confint(level = 0.95)## 2.5 % 97.5 %
## (Intercept) 8.37411051 9.36995320
## nitro 0.01952984 0.02733877
nuevos_datos <- data.frame(nitro = c(122.4, 12.5, 33.8))
predict(modelo, newdata = nuevos_datos)## 1 2 3
## 11.740391 9.164961 9.664111
predict(modelo, newdata = nuevos_datos, interval = "confidence")## fit lwr upr
## 1 11.740391 11.448177 12.032605
## 2 9.164961 8.706284 9.623637
## 3 9.664111 9.267389 10.060834
\[\epsilon_i\ \overset{\text{i.i.d.}}\sim\ N(\mu = 0,\ \sigma)\]
yield_real <- datos$yield
yield_pred <- modelo$fitted.values
residuales <- yield_real - yield_pred
residuales %>% head()## 1 2 3 4 5 6
## -5.1751819 -4.9939019 -4.4833319 -4.2326419 -2.7606746 -0.8994646
residuals(modelo) %>% head()## 1 2 3 4 5 6
## -5.1751819 -4.9939019 -4.4833319 -4.2326419 -2.7606746 -0.8994646
df_real_estimado <- bind_cols(real = yield_real, estimado = yield_pred)
df_real_estimado %>% head()df_real_estimado %>%
ggplot(aes(x = real, y = estimado)) +
geom_point()ggqqplot(residuales)residuales %>%
enframe(value = "residual") %>%
ggplot(aes(x = residual)) +
geom_density()residuales %>% mean()## [1] 1.232164e-17
df_real_estimado %>%
mutate(residual = residuales) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")\[ RMSE = \sqrt{\frac{\sum_{i=1}^{n}(y_i - \hat{y_i})^2}{n}} \]
Donde \(y_i\) es el valor real y \(\hat{y_i}\) es el valor predicho o estimado por el modelo.
sqrt(sum((yield_real - yield_pred)^2) / length(yield_real))## [1] 1.662047
rmse_vec(truth = yield_real, estimate = yield_pred)## [1] 1.662047
\[ R^2 = 1 - \frac{\sum_{i=1}^{n} (y_i - \hat{y})^2}{\sum_{i = 1}^{n} (y_i - \bar{y})^2} \]
1 - ( sum((yield_real - yield_pred)^2) / sum((yield_real - mean(yield_real))^2) )## [1] 0.5125768
rsq_vec(truth = yield_real, estimate = yield_pred)## [1] 0.5125768
cor(x = datos$nitro, y = datos$yield, method = "pearson")^2## [1] 0.5125768
datos %>%
ggplot(aes(x = nitro, y = yield)) +
geom_point() +
geom_smooth(method = "lm")datos %>%
ggplot(aes(x = nitro, y = log(yield))) +
geom_point() +
geom_smooth(method = "lm")modelo_log <- lm(log(yield) ~ nitro, data = datos)
modelo_log %>% summary()##
## Call:
## lm(formula = log(yield) ~ nitro, data = datos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.84331 -0.07030 0.03508 0.11883 0.28701
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.1507960 0.0283609 75.84 <2e-16 ***
## nitro 0.0023708 0.0002224 10.66 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.1886 on 134 degrees of freedom
## Multiple R-squared: 0.4589, Adjusted R-squared: 0.4549
## F-statistic: 113.6 on 1 and 134 DF, p-value: < 2.2e-16
exp(2.1507960) # B0## [1] 8.591695
exp(0.0023708) # B1## [1] 1.002374
yield_pred_log <- modelo_log$fitted.values %>% exp()
yield_real_log <- yield_real
rmse_vec(truth = yield_real_log, estimate = yield_pred_log)## [1] 1.78757
rsq_vec(truth = yield_real_log, estimate = yield_pred_log)## [1] 0.4610944
df_real_estimado_log <- bind_cols(real = yield_real_log, estimado = yield_pred_log)
df_real_estimado_log %>% head()df_real_estimado_log %>%
ggplot(aes(x = real, y = estimado)) +
geom_point()ggqqplot(residuals(modelo_log))df_real_estimado_log %>%
mutate(residual = residuals(modelo_log)) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red")dosis_nitrogeno <- seq(from = 0, to = 300, length.out = 100)
dosis_nitrogeno %>% head()## [1] 0.000000 3.030303 6.060606 9.090909 12.121212 15.151515
nuevas_dosis <- data.frame(nitro = dosis_nitrogeno)
nuevas_dosis %>% head()pred_original <- predict(modelo, newdata = nuevas_dosis)
pred_logaritmos <- predict(modelo_log, newdata = nuevas_dosis) %>% exp()ggplotly(
nuevas_dosis %>%
mutate(pred_orig = pred_original,
pred_log = pred_logaritmos) %>%
pivot_longer(cols = -nitro, names_to = "modelo", values_to = "pred_kg") %>%
ggplot(aes(x = nitro, y = pred_kg, color = modelo)) +
geom_line()
)\[ (X^TX)^{-1}X^TY \]
matriz_x <- cbind(1, datos$nitro)
matriz_x %>% head()## [,1] [,2]
## [1,] 1 0.0
## [2,] 1 0.0
## [3,] 1 0.0
## [4,] 1 0.0
## [5,] 1 33.6
## [6,] 1 33.6
model.matrix(yield ~ nitro, data = datos) %>% head()## (Intercept) nitro
## 1 1 0.0
## 2 1 0.0
## 3 1 0.0
## 4 1 0.0
## 5 1 33.6
## 6 1 33.6
vector_y <- datos$yield
vector_y %>% head()## [1] 3.69685 3.87813 4.38870 4.63939 6.89875 8.75996
x_transpuesta <- matriz_x %>% t()
x_transpuesta %>% head()## [,1] [,2] [,3] [,4] [,5] [,6] [,7] [,8] [,9] [,10] [,11] [,12] [,13] [,14]
## [1,] 1 1 1 1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 0 0 0 0 33.6 33.6 33.6 33.6 67.2 67.2 67.2 67.2 100.8 100.8
## [,15] [,16] [,17] [,18] [,19] [,20] [,21] [,22] [,23] [,24] [,25] [,26]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1 1 1 1 1.0 1.0
## [2,] 100.8 100.8 134.4 134.4 134.4 134.4 168 168 168 168 201.6 201.6
## [,27] [,28] [,29] [,30] [,31] [,32] [,33] [,34] [,35] [,36] [,37] [,38]
## [1,] 1.0 1.0 1 1 1 1 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 201.6 201.6 0 0 0 0 33.6 33.6 33.6 33.6 67.2 67.2
## [,39] [,40] [,41] [,42] [,43] [,44] [,45] [,46] [,47] [,48] [,49] [,50]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1 1
## [2,] 67.2 67.2 100.8 100.8 100.8 100.8 134.4 134.4 134.4 134.4 168 168
## [,51] [,52] [,53] [,54] [,55] [,56] [,57] [,58] [,59] [,60] [,61] [,62]
## [1,] 1 1 1.0 1.0 1.0 1.0 1 1 1 1 1.0 1.0
## [2,] 168 168 201.6 201.6 201.6 201.6 0 0 0 0 33.6 33.6
## [,63] [,64] [,65] [,66] [,67] [,68] [,69] [,70] [,71] [,72] [,73] [,74]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 33.6 33.6 67.2 67.2 67.2 67.2 100.8 100.8 100.8 100.8 134.4 134.4
## [,75] [,76] [,77] [,78] [,79] [,80] [,81] [,82] [,83] [,84] [,85] [,86]
## [1,] 1.0 1.0 1 1 1 1 1.0 1.0 1.0 1.0 1 1
## [2,] 134.4 134.4 168 168 168 168 201.6 201.6 201.6 201.6 0 0
## [,87] [,88] [,89] [,90] [,91] [,92] [,93] [,94] [,95] [,96] [,97] [,98]
## [1,] 1 1 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 0 0 44.8 44.8 44.8 44.8 89.6 89.6 89.6 89.6 134.4 134.4
## [,99] [,100] [,101] [,102] [,103] [,104] [,105] [,106] [,107] [,108]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1 1 1 1
## [2,] 134.4 134.4 179.2 179.2 179.2 179.2 224 224 224 224
## [,109] [,110] [,111] [,112] [,113] [,114] [,115] [,116] [,117] [,118]
## [1,] 1.0 1.0 1.0 1.0 1 1 1 1 1.0 1.0
## [2,] 268.8 268.8 268.8 268.8 0 0 0 0 33.6 33.6
## [,119] [,120] [,121] [,122] [,123] [,124] [,125] [,126] [,127] [,128]
## [1,] 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0 1.0
## [2,] 33.6 33.6 67.2 67.2 67.2 67.2 100.8 100.8 100.8 100.8
## [,129] [,130] [,131] [,132] [,133] [,134] [,135] [,136]
## [1,] 1.0 1.0 1.0 1.0 1 1 1 1
## [2,] 134.4 134.4 134.4 134.4 168 168 168 168
x_transx <- x_transpuesta %*% matriz_x
x_transx %>% head()## [,1] [,2]
## [1,] 136.0 14246.4
## [2,] 14246.4 2211758.1
inversa_x_tranx <- x_transx %>% solve()
inversa_x_tranx## [,1] [,2]
## [1,] 0.0226060556 -1.456104e-04
## [2,] -0.0001456104 1.390036e-06
transx_y <- x_transpuesta %*% vector_y
transx_y## [,1]
## [1,] 1540.451
## [2,] 178225.535
betas <- inversa_x_tranx %*% transx_y
betas## [,1]
## [1,] 8.87203185
## [2,] 0.02343431
set.seed(2023)
particion <- initial_split(data = datos, prop = 0.7, strata = "yield")
datos_train <- training(particion)
datos_test <- testing(particion)particion## <Training/Testing/Total>
## <92/44/136>
datos_train %>%
summarise(mean(yield))datos_test %>%
summarise(mean(yield))k_fold <- vfold_cv(data = datos_train, v = 5, strata = "yield")
k_foldk_fold$splits[[1]] %>% training()