library(tidyverse)
library(ggplot2)
library(janitor)
library(readxl)
library(skimr)
library(qqplotr)
library(corrplot)
library(yardstick)
library(car)
library(ggpubr)
library(rsample)
library(parsnip)
library(workflows)
library(tune)
train <- read.csv("train.csv")
test <- read.csv("test_sinY.csv")
ejemplo <- read.csv("ejemplo_submission.csv")
train %>% head()train %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()train %>%
pivot_longer(cols = everything(),
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()correlacion_predictoras <-
train %>%
pivot_longer(cols = -N,
names_to = "variable",
values_to = "valor") %>%
group_by(variable) %>%
summarise(correlacion = cor(N , valor, method = "spearman")) %>%
ungroup() %>%
arrange(desc(abs(correlacion))) %>%
filter(abs(correlacion) > 0.10)
correlacion_predictorasvariables_predictoras <- correlacion_predictoras$variable
datos_modelos <- train %>%
select(N, variables_predictoras)
modelo1 <- lm(N ~ ., data = datos_modelos)
modelo1 %>%
summary()##
## Call:
## lm(formula = N ~ ., data = datos_modelos)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.116746 -0.025108 -0.002902 0.021801 0.167052
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.260e-02 5.523e-02 0.409 0.682422
## June_SI -2.555e-01 7.875e-02 -3.245 0.001226 **
## may_red -2.505e-05 1.074e-05 -2.332 0.019978 *
## june_NIR -8.107e-03 9.442e-02 -0.086 0.931601
## march_green 6.514e-05 4.542e-05 1.434 0.151978
## april_red 1.098e-05 3.244e-05 0.338 0.735152
## may_blue 4.415e-06 1.042e-05 0.423 0.672048
## May_RI -6.955e-11 5.717e-11 -1.217 0.224159
## April_RI 3.577e-11 1.390e-11 2.573 0.010268 *
## april_NIR -3.362e-06 2.137e-05 -0.157 0.875018
## march_NIR -3.726e-05 1.997e-05 -1.866 0.062480 .
## March_NDVI 3.234e-01 1.069e-01 3.024 0.002580 **
## Elevation 2.899e-04 7.793e-05 3.719 0.000214 ***
## April_NDVI -5.732e-02 1.040e-01 -0.551 0.581583
## May_BI -2.120e-18 8.419e-18 -0.252 0.801237
## April_CI -3.446e-02 4.358e-02 -0.791 0.429299
## March_CI 1.384e-01 1.022e-01 1.354 0.175987
## id -1.002e-05 4.917e-06 -2.038 0.041901 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03937 on 782 degrees of freedom
## Multiple R-squared: 0.3204, Adjusted R-squared: 0.3056
## F-statistic: 21.68 on 17 and 782 DF, p-value: < 2.2e-16
modelo_forward <- step(modelo1, direction = "forward", trace = 0)
modelo_backward <- step(modelo1, direction = "backward", trace = 0)
modelo_both <- step(modelo1, direction = "both", trace = 0)N_real <- train$N
predichos_mod1 <- modelo1$fitted.values
predichos_mod_forw <- modelo_forward$fitted.values
predichos_mod_back <- modelo_backward$fitted.values
predichos_mod_both <- modelo_both$fitted.values
rmse_vec(truth = N_real, estimate = predichos_mod1)## [1] 0.03892663
rmse_vec(truth = N_real, estimate = predichos_mod_forw)## [1] 0.03892663
rmse_vec(truth = N_real, estimate = predichos_mod_back)## [1] 0.03895322
rmse_vec(truth = N_real, estimate = predichos_mod_both)## [1] 0.03895322
AIC(modelo1, modelo_backward, modelo_forward, modelo_both)BIC(modelo1, modelo_backward, modelo_forward, modelo_both)modelo_both %>%
vif()## June_SI may_red march_green May_RI April_RI march_NIR
## 1.964574 5.749282 57.471942 2.106692 1.771450 27.813733
## March_NDVI Elevation April_NDVI March_CI id
## 22.478749 1.298369 3.535719 9.907085 1.018340
modelo_1 <- lm(N ~ June_SI + May_RI + April_RI + Elevation + April_NDVI, data = train)
summary(modelo_1)##
## Call:
## lm(formula = N ~ June_SI + May_RI + April_RI + Elevation + April_NDVI,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.12492 -0.02480 -0.00403 0.02175 0.16551
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 4.016e-03 2.166e-02 0.185 0.853
## June_SI -3.839e-01 3.732e-02 -10.288 < 2e-16 ***
## May_RI -1.162e-11 4.846e-11 -0.240 0.810
## April_RI 4.897e-11 1.111e-11 4.406 1.2e-05 ***
## Elevation 3.645e-04 7.123e-05 5.117 3.9e-07 ***
## April_NDVI 2.524e-02 1.575e-02 1.602 0.109
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04014 on 794 degrees of freedom
## Multiple R-squared: 0.2827, Adjusted R-squared: 0.2781
## F-statistic: 62.57 on 5 and 794 DF, p-value: < 2.2e-16
predicciones <- predict(modelo_1, newdata = test)
predicciones %>% head()## 1 2 3 4 5 6
## 0.10651258 0.12191399 0.06924964 0.13685306 0.14132219 0.09399901
Modelo basado segun las distribucciones
modelo_2 <- lm(N ~ June_SI + April_CI + a_plan_cur + april_NIR + March_CI + march_NIR + S_Topo_Posi_Index + S_TWI + S_Plan_cur, data = train)
summary(modelo_2)##
## Call:
## lm(formula = N ~ June_SI + April_CI + a_plan_cur + april_NIR +
## March_CI + march_NIR + S_Topo_Posi_Index + S_TWI + S_Plan_cur,
## data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.11861 -0.02637 -0.00407 0.02220 0.16591
##
## Coefficients: (1 not defined because of singularities)
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.482e-01 1.379e-02 10.747 <2e-16 ***
## June_SI -4.579e-01 3.743e-02 -12.234 <2e-16 ***
## April_CI 5.932e-02 3.526e-02 1.682 0.0929 .
## a_plan_cur 1.209e-02 7.101e-03 1.703 0.0890 .
## april_NIR -7.548e-06 7.979e-06 -0.946 0.3444
## March_CI -1.213e-02 3.681e-02 -0.329 0.7419
## march_NIR -3.230e-06 8.127e-06 -0.397 0.6911
## S_Topo_Posi_Index -6.323e-04 8.433e-04 -0.750 0.4536
## S_TWI NA NA NA NA
## S_Plan_cur 4.486e-03 9.420e-02 0.048 0.9620
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04159 on 791 degrees of freedom
## Multiple R-squared: 0.2328, Adjusted R-squared: 0.225
## F-statistic: 30 on 8 and 791 DF, p-value: < 2.2e-16
predicciones <- predict(modelo_2, newdata = test)
predicciones %>% head()## 1 2 3 4 5 6
## 0.12126114 0.12493828 0.07904479 0.14912355 0.13700282 0.10265968
En este caso vamos a asumir que nuestro modelo va a realizar predicciones con base en el mes de abril
modelo_3 <- lm(N ~ april_red + april_NIR + April_BI + April_CI + April_RI, data = train)
summary(modelo_3)##
## Call:
## lm(formula = N ~ april_red + april_NIR + April_BI + April_CI +
## April_RI, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.116651 -0.026723 -0.006384 0.022392 0.187294
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.672e-01 1.340e-02 12.476 < 2e-16 ***
## april_red -3.779e-05 5.527e-06 -6.838 1.6e-11 ***
## april_NIR -1.682e-06 4.615e-06 -0.365 0.71554
## April_BI 1.737e-18 5.599e-18 0.310 0.75653
## April_CI 9.134e-03 3.369e-02 0.271 0.78633
## April_RI 4.156e-11 1.311e-11 3.170 0.00159 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.04236 on 794 degrees of freedom
## Multiple R-squared: 0.2014, Adjusted R-squared: 0.1963
## F-statistic: 40.04 on 5 and 794 DF, p-value: < 2.2e-16
predicciones <- predict(modelo_3, newdata = test)
predicciones %>% head()## 1 2 3 4 5 6
## 0.09498155 0.10205585 0.10295509 0.10288908 0.12724644 0.10623683
BIC(modelo_1, modelo_2, modelo_3)N_real <- train$N
predichos_mod1 <- modelo_1$fitted.values
rmse_vec(truth = N_real, estimate = predichos_mod1)## [1] 0.03999154
predichos_mod2 <- modelo_2$fitted.values
rmse_vec(truth = N_real, estimate = predichos_mod2)## [1] 0.04135911
predichos_mod3 <- modelo_3$fitted.values
rmse_vec(truth = N_real, estimate = predichos_mod3)## [1] 0.04219678
El modelo que finalmente seleccionamos fue el modelo 1
N_real <- train$N
N_pred <- modelo_1$fitted.values
residuales <- residuals(modelo_1)
residuales %>% head()## 1 2 3 4 5 6
## -0.029443286 -0.035034739 -0.014103524 0.074355015 -0.018681738 -0.005942524
df_real_estimado <- bind_cols(real = N_real, estimado = N_pred)
df_real_estimado %>%
ggplot(aes(x = real, y = estimado)) +
geom_point()df_real_estimado %>%
mutate(residual = residuales) %>%
ggplot(aes(x = estimado, y = residual)) +
geom_point() +
geom_hline(yintercept = 0, color = "red") ggqqplot(residuales)train %>%
pivot_longer(cols = May_RI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()train %>%
pivot_longer(cols = June_SI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()train %>%
pivot_longer(cols = April_RI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()train %>%
pivot_longer(cols = Elevation,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()train %>%
pivot_longer(cols = April_NDVI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(x = valor)) +
facet_wrap(~variable, scales = "free") +
geom_density()train %>%
pivot_longer(cols = June_SI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()train %>%
pivot_longer(cols = Elevation,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()train %>%
pivot_longer(cols = May_RI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()train %>%
pivot_longer(cols = April_RI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()train %>%
pivot_longer(cols = April_NDVI,
names_to = "variable",
values_to = "valor") %>%
ggplot(aes(sample = valor)) +
facet_wrap(~variable, scales = "free") +
stat_qq_band() +
stat_qq_line() +
stat_qq_point()NOTA: las variables de mayor importancia según sus gráficas de distribución y basandonos en las bandas de confianza de los gráficos cuantil-cuantil son: June_SI y la Elevation