Informacion de los autores:

Haique, Ana Karina

Rubio, Natalia

Malaspina, Maria Laura

Gimenez Gustavo

Referencias Bibliograficas

https://cran.r-project.org/web/packages/visdat/vignettes/using_visdat.html

https://rdocumentation.org/packages/visdat/versions/0.0.4.9500

https://cran.r-project.org/web/packages/naniar/naniar.pdf

Librerias Principales

library(visdat)
library(naniar)
library(simputation)
## 
## Attaching package: 'simputation'
## The following object is masked from 'package:naniar':
## 
##     impute_median
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✔ ggplot2 3.3.6     ✔ purrr   0.3.4
## ✔ tibble  3.1.7     ✔ dplyr   1.0.9
## ✔ tidyr   1.2.0     ✔ stringr 1.4.0
## ✔ readr   2.1.2     ✔ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(lubridate)
## 
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.0.0 ──
## ✔ broom        1.0.0     ✔ rsample      1.0.0
## ✔ dials        1.0.0     ✔ tune         1.0.0
## ✔ infer        1.0.2     ✔ workflows    1.0.0
## ✔ modeldata    1.0.0     ✔ workflowsets 1.0.0
## ✔ parsnip      1.0.0     ✔ yardstick    1.0.0
## ✔ recipes      1.0.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
library(modeltime)
library(timetk)
library(ggplot2)

Código de trabajo

# Data

data <- read.csv2("LechonSim.csv")

data_tbl <- data %>% as_tibble() %>% 
  mutate(Fecha = dmy(Fecha))

data_tbl 
## # A tibble: 97 × 7
##    Fecha      Chu_Cor Chu_Cor_NA Chu_Va_Inf Patagonia_A Patagonia_B PPLechon
##    <date>       <dbl>      <dbl>      <dbl>       <dbl>       <dbl>    <dbl>
##  1 2014-04-01    47.5       47.5       47.5        39.5        40       43.6
##  2 2014-04-01    47.5       47.5       47.5        39.5        40       43.6
##  3 2014-05-01    44         44         44          40          40       42  
##  4 2014-06-01    47.5       47.5       47.5        40          48       45.8
##  5 2014-07-01    47.5       47.5       47.5        40          46       45.2
##  6 2014-08-01    47.5       47.5       47.5        40          46       45.2
##  7 2014-09-01    50         50         50          40          48       47  
##  8 2014-10-01    50         50         50          37          48       46.2
##  9 2014-11-01    57.5       57.5       57.5        37          51.5     50.9
## 10 2015-01-01    56.5       56.5       56.5        40          55       52  
## # … with 87 more rows
# Visualizar la serie (Chu_Cor) con los datos completos:

data_tbl %>% 
  plot_time_series(.date_var    = Fecha, 
                   .value       = Chu_Cor,
                   .smooth      = FALSE,
                   
                   .line_size   = 1,
                   .title       = ""
  )
# Modelamos

data_tbl %>% 
  tk_augment_timeseries_signature() %>% 
  select(Fecha, index.num, Chu_Cor) %>% 
  plot_time_series_regression(
    .line_size   = 1,
    .title    = "",
    .date_var = Fecha,
    .formula  = Chu_Cor ~ splines::ns(index.num, 
                                         df = 1,
                                         knots = quantile(index.num, probs = c(0.4,0.7))
                                      ),
    .show_summary = TRUE
  )
## tk_augment_timeseries_signature(): Using the following .date_var variable: Fecha
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -84.484  -5.184   0.688   6.036  74.616 
## 
## Coefficients:
##                                                                                   Estimate
## (Intercept)                                                                         43.612
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1   67.807
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  469.655
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  592.258
##                                                                                   Std. Error
## (Intercept)                                                                            6.714
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1      8.732
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2     17.211
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3      7.543
##                                                                                   t value
## (Intercept)                                                                         6.496
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1   7.765
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  27.288
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  78.519
##                                                                                   Pr(>|t|)
## (Intercept)                                                                       4.03e-09
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 1.05e-11
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  < 2e-16
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  < 2e-16
##                                                                                      
## (Intercept)                                                                       ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.85 on 93 degrees of freedom
## Multiple R-squared:  0.9858, Adjusted R-squared:  0.9853 
## F-statistic:  2150 on 3 and 93 DF,  p-value: < 2.2e-16
#Multiple R-squared:  0.9879

# Se retira un 9% de las observaciones (es un porcentaje similar al 
# que mostraron las series con NA de la base de datos)


## Correlacion entre las variaables:

data_cor_matrix <- data_tbl %>% select(-Fecha) %>% cor(use = "complete.obs")

library(corrplot)
## corrplot 0.92 loaded
corrplot(data_cor_matrix,method = "number",number.digits = 4)

## Visualizacion de los NA:

#Chu_Va_inf

data_tbl %>% select(Fecha,Chu_Cor,Chu_Va_Inf, PPLechon) %>% vis_dat()
## Warning: `gather_()` was deprecated in tidyr 1.2.0.
## Please use `gather()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

data_tbl %>% select(Fecha,Chu_Cor,Chu_Va_Inf, PPLechon) %>% vis_miss()

data_tbl %>% 
  ggplot(aes(x=PPLechon,y = Chu_Cor_NA)) +
  geom_miss_point()

## Imputacion Lineal:

data_tbl %>% 
  #label if Chu_Va_inf is missing
  add_label_missings(Chu_Cor_NA) %>% 
  #Imputacion lineal
  impute_lm(Chu_Cor_NA ~ PPLechon) %>% 
  #Visualizacion
  ggplot(aes(PPLechon, Chu_Cor_NA, color = any_missing)) +
  geom_point()

#Para obtener la tabla completa:

data_lm_imp <- data_tbl %>% 
  #label if Chu_Va_inf is missing
  add_label_missings(Chu_Cor_NA) %>% 
  #Imputacion lineal
  impute_lm(Chu_Cor_NA ~ PPLechon)

#Volvemos a correr el modelo Natural Spline:

data_lm_imp %>% 
  tk_augment_timeseries_signature() %>% 
  select(Fecha, index.num, Chu_Cor_NA) %>% 
  plot_time_series_regression(
    .line_size   = 1,
    .title    = "",
    .date_var = Fecha,
    .formula  = Chu_Cor_NA ~ splines::ns(index.num, 
                                      df = 1,
                                      knots = quantile(index.num, probs = c(0.4,0.7))
    ),
    .show_summary = TRUE
  )
## tk_augment_timeseries_signature(): Using the following .date_var variable: Fecha
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -84.351  -5.725   0.379   5.633  76.827 
## 
## Coefficients:
##                                                                                   Estimate
## (Intercept)                                                                         42.592
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1   62.592
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  472.543
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  591.849
##                                                                                   Std. Error
## (Intercept)                                                                            6.606
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1      8.592
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2     16.935
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3      7.422
##                                                                                   t value
## (Intercept)                                                                         6.447
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1   7.285
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  27.904
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  79.744
##                                                                                   Pr(>|t|)
## (Intercept)                                                                       5.02e-09
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 1.03e-10
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  < 2e-16
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  < 2e-16
##                                                                                      
## (Intercept)                                                                       ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.52 on 93 degrees of freedom
## Multiple R-squared:  0.9862, Adjusted R-squared:  0.9857 
## F-statistic:  2211 on 3 and 93 DF,  p-value: < 2.2e-16
#Multiple R-squared:  0.9862

## Imputacion Ramdom Forest

data_tbl %>% 
  #label if Chu_Va_inf is missing
  add_label_missings(Chu_Cor_NA) %>% 
  #Imputacion lineal
  impute_rf(Chu_Cor_NA ~ PPLechon) %>% 
  #Visualizacion
  ggplot(aes(PPLechon, Chu_Cor_NA, color = any_missing)) +
  geom_point()

#Para obtener la tabla completa:

data_rf_imp  <- data_tbl %>% 
  #label if Chu_Va_inf is missing
  add_label_missings(Chu_Cor_NA) %>% 
  #Imputacion lineal
  impute_rf(Chu_Cor_NA ~ PPLechon)

#Modelamos 

data_rf_imp %>% 
  tk_augment_timeseries_signature() %>% 
  select(Fecha, index.num, Chu_Cor_NA) %>% 
  plot_time_series_regression(
    .line_size   = 1,
    .title    = "",
    .date_var = Fecha,
    .formula  = Chu_Cor_NA ~ splines::ns(index.num, 
                                         df = 1,
                                         knots = quantile(index.num, probs = c(0.4,0.7))
    ),
    .show_summary = TRUE
  )  
## tk_augment_timeseries_signature(): Using the following .date_var variable: Fecha
## 
## Call:
## stats::lm(formula = .formula, data = df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -84.327  -5.815   0.834   5.671  76.121 
## 
## Coefficients:
##                                                                                   Estimate
## (Intercept)                                                                         43.142
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1   64.350
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  470.766
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  592.064
##                                                                                   Std. Error
## (Intercept)                                                                            6.754
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1      8.784
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2     17.313
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3      7.588
##                                                                                   t value
## (Intercept)                                                                         6.388
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1   7.326
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  27.192
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  78.031
##                                                                                   Pr(>|t|)
## (Intercept)                                                                       6.58e-09
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 8.45e-11
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2  < 2e-16
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3  < 2e-16
##                                                                                      
## (Intercept)                                                                       ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))1 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))2 ***
## splines::ns(index.num, df = 1, knots = quantile(index.num, probs = c(0.4, 0.7)))3 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 20.98 on 93 degrees of freedom
## Multiple R-squared:  0.9856, Adjusted R-squared:  0.9851 
## F-statistic:  2119 on 3 and 93 DF,  p-value: < 2.2e-16
#Multiple R-squared:  0.9853,