Haique, Ana Karina anahaique@gmail.com
Rubio, Natalia rubionatalia@yahoo.com.ar
Malaspina, Maria Laura mlmalaspina@hotmail.com
Gimenez Gustavo gunegim@gmail.com
https://cran.r-project.org/web/packages/visdat/vignettes/using_visdat.html
https://rdocumentation.org/packages/visdat/versions/0.0.4.9500
library(visdat)
library(naniar)
library(simputation)
##
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
##
## impute_median
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6 ✔ purrr 0.3.4
## ✔ tibble 3.1.7 ✔ dplyr 1.0.9
## ✔ tidyr 1.2.0 ✔ stringr 1.4.0
## ✔ readr 2.1.2 ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom 1.0.0 ✔ rsample 1.0.0
## ✔ dials 1.0.0 ✔ tune 1.0.0
## ✔ infer 1.0.2 ✔ workflows 1.0.0
## ✔ modeldata 1.0.0 ✔ workflowsets 1.0.0
## ✔ parsnip 1.0.0 ✔ yardstick 1.0.0
## ✔ recipes 1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(modeltime)
library(timetk)
library(ggplot2)
# Data
data <- read.csv2("LechonSim.csv")
data_tbl <- data %>% as_tibble() %>%
mutate(Fecha = dmy(Fecha))
data_tbl
## # A tibble: 97 × 7
## Fecha Chu_Cor Chu_Cor_NA Chu_Va_Inf Patagonia_A Patagonia_B PPLechon
## <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 2014-04-01 47.5 47.5 47.5 39.5 40 43.6
## 2 2014-04-01 47.5 47.5 47.5 39.5 40 43.6
## 3 2014-05-01 44 44 44 40 40 42
## 4 2014-06-01 47.5 47.5 47.5 40 48 45.8
## 5 2014-07-01 47.5 47.5 47.5 40 46 45.2
## 6 2014-08-01 47.5 47.5 47.5 40 46 45.2
## 7 2014-09-01 50 50 50 40 48 47
## 8 2014-10-01 50 50 50 37 48 46.2
## 9 2014-11-01 57.5 57.5 57.5 37 51.5 50.9
## 10 2015-01-01 56.5 56.5 56.5 40 55 52
## # … with 87 more rows
# Visualizar la serie (Chu_Cor) con los datos completos:
data_tbl %>%
plot_time_series(.date_var = Fecha,
.value = Chu_Cor,
.smooth = FALSE,
.line_size = 1,
.title = ""
)
# Modelamos
data_tbl %>%
tk_augment_timeseries_signature() %>%
select(Fecha, index.num, Chu_Cor) %>%
plot_time_series_regression(
.line_size = 1,
.title = "",
.date_var = Fecha,
.formula = Chu_Cor ~ splines::ns(index.num,
df = 1,
knots = quantile(index.num, probs = c(0.4,0.7))
),
.show_summary = TRUE
)
## tk_augment_timeseries_signature(): Using the following .date_var variable: Fecha
##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84.484 -5.184 0.688 6.036 74.616
##
## Coefficients:
## Estimate
## (Intercept) 43.612
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 67.807
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 469.655
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 592.258
## Std. Error
## (Intercept) 6.714
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 8.732
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 17.211
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 7.543
## t value
## (Intercept) 6.496
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 7.765
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 27.288
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 78.519
## Pr(>|t|)
## (Intercept) 4.03e-09
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 1.05e-11
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 < 2e-16
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 < 2e-16
##
## (Intercept) ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.85 on 93 degrees of freedom
## Multiple R-squared: 0.9858, Adjusted R-squared: 0.9853
## F-statistic: 2150 on 3 and 93 DF, p-value: < 2.2e-16
#Multiple R-squared: 0.9879
# Se retira un 9% de las observaciones (es un porcentaje similar al
# que mostraron las series con NA de la base de datos)
## Correlacion entre las variaables:
data_cor_matrix <- data_tbl %>% select(-Fecha) %>% cor(use = "complete.obs")
library(corrplot)
## corrplot 0.92 loaded
corrplot(data_cor_matrix,method = "number",number.digits = 4)
## Visualizacion de los NA:
#Chu_Va_inf
data_tbl %>% select(Fecha,Chu_Cor,Chu_Va_Inf, PPLechon) %>% vis_dat()
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
data_tbl %>% select(Fecha,Chu_Cor,Chu_Va_Inf, PPLechon) %>% vis_miss()
data_tbl %>%
ggplot(aes(x=PPLechon,y = Chu_Cor_NA)) +
geom_miss_point()
## Imputacion Lineal:
data_tbl %>%
#label if Chu_Va_inf is missing
add_label_missings(Chu_Cor_NA) %>%
#Imputacion lineal
impute_lm(Chu_Cor_NA ~ PPLechon) %>%
#Visualizacion
ggplot(aes(PPLechon, Chu_Cor_NA, color = any_missing)) +
geom_point()
#Para obtener la tabla completa:
data_lm_imp <- data_tbl %>%
#label if Chu_Va_inf is missing
add_label_missings(Chu_Cor_NA) %>%
#Imputacion lineal
impute_lm(Chu_Cor_NA ~ PPLechon)
#Volvemos a correr el modelo Natural Spline:
data_lm_imp %>%
tk_augment_timeseries_signature() %>%
select(Fecha, index.num, Chu_Cor_NA) %>%
plot_time_series_regression(
.line_size = 1,
.title = "",
.date_var = Fecha,
.formula = Chu_Cor_NA ~ splines::ns(index.num,
df = 1,
knots = quantile(index.num, probs = c(0.4,0.7))
),
.show_summary = TRUE
)
## tk_augment_timeseries_signature(): Using the following .date_var variable: Fecha
##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84.351 -5.725 0.379 5.633 76.827
##
## Coefficients:
## Estimate
## (Intercept) 42.592
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 62.592
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 472.543
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 591.849
## Std. Error
## (Intercept) 6.606
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 8.592
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 16.935
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 7.422
## t value
## (Intercept) 6.447
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 7.285
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 27.904
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 79.744
## Pr(>|t|)
## (Intercept) 5.02e-09
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 1.03e-10
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 < 2e-16
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 < 2e-16
##
## (Intercept) ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.52 on 93 degrees of freedom
## Multiple R-squared: 0.9862, Adjusted R-squared: 0.9857
## F-statistic: 2211 on 3 and 93 DF, p-value: < 2.2e-16
#Multiple R-squared: 0.9862
## Imputacion Ramdom Forest
data_tbl %>%
#label if Chu_Va_inf is missing
add_label_missings(Chu_Cor_NA) %>%
#Imputacion lineal
impute_rf(Chu_Cor_NA ~ PPLechon) %>%
#Visualizacion
ggplot(aes(PPLechon, Chu_Cor_NA, color = any_missing)) +
geom_point()
#Para obtener la tabla completa:
data_rf_imp <- data_tbl %>%
#label if Chu_Va_inf is missing
add_label_missings(Chu_Cor_NA) %>%
#Imputacion lineal
impute_rf(Chu_Cor_NA ~ PPLechon)
#Modelamos
data_rf_imp %>%
tk_augment_timeseries_signature() %>%
select(Fecha, index.num, Chu_Cor_NA) %>%
plot_time_series_regression(
.line_size = 1,
.title = "",
.date_var = Fecha,
.formula = Chu_Cor_NA ~ splines::ns(index.num,
df = 1,
knots = quantile(index.num, probs = c(0.4,0.7))
),
.show_summary = TRUE
)
## tk_augment_timeseries_signature(): Using the following .date_var variable: Fecha
##
## Call:
## stats::lm(formula = .formula, data = df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84.327 -5.815 0.834 5.671 76.121
##
## Coefficients:
## Estimate
## (Intercept) 43.142
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 64.350
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 470.766
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 592.064
## Std. Error
## (Intercept) 6.754
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 8.784
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 17.313
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 7.588
## t value
## (Intercept) 6.388
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 7.326
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 27.192
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 78.031
## Pr(>|t|)
## (Intercept) 6.58e-09
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 8.45e-11
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 < 2e-16
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 < 2e-16
##
## (Intercept) ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 20.98 on 93 degrees of freedom
## Multiple R-squared: 0.9856, Adjusted R-squared: 0.9851
## F-statistic: 2119 on 3 and 93 DF, p-value: < 2.2e-16
#Multiple R-squared: 0.9853,