BasesPerdidas

Base de datos

library(ggplot2)
library(readr)
library(tidyverse)
library(dplyr)
library(janitor)
library(cowplot)
theme_set(theme_cowplot())
library(reshape2)
library(lubridate)
library(forecast)
library(tseries)
library(tsibble)
library(stats)
library(ggfortify)
library(ggplot2)
library(dplyr)
library(cowbell)
library(xtable)
library(viridis)
library(ggsci)

Base de datos del clima

The data was obtained from the National Aeronautics and Space Administration (NASA) Langley Research Center (LaRC) Prediction of Worldwide Energy Resource (POWER) Project funded through the NASA Earth Science/Applied Science Program.

The data was obtained from the POWER Project’s Hourly 2.x.x version on 2023/09/03.

https://power.larc.nasa.gov/data-access-viewer/

https://power.larc.nasa.gov/docs/referencing/

clima <- read_csv("POWER_Point_Monthly_Timeseries_1981_2021_040d7732N_073d9721W_LST.csv",   skip = 25)

Limpieza de datos

clima1 <- clima%>%pivot_longer(cols = JAN:ANN, 
               names_to = "MES", 
               values_to = "VALUE")%>%
  filter(MES!="ANN")%>%
  mutate(DATE=ym(paste0(YEAR,"-",MES ))) %>% 
  select(-c(YEAR, MES)) %>% 
  pivot_wider(names_from = PARAMETER, values_from = VALUE)%>% 
  clean_names()%>%
  remove_empty()%>%
  mutate(year=year(date))%>%
  select_if(~all(!is.na(.)))
  
  ggplot(clima1, aes(x=date, y=t2m))+geom_line()+geom_hline(yintercept = 24, linetype = "dashed", color = "red")+theme(aspect.ratio = .4)+geom_hline(yintercept = -5, linetype = "dashed", color = "blue")

train<-clima1%>%filter(year<=2010)
test<-clima1%>%filter(year>2010)


train_tseries <- ts(train$t2m, start = c(min(year(train$date)), min(month(train$date))) , frequency = 12)

test_tseries <- ts(test$t2m, start = c(min(year(test$date)), min(month(test$date))) , frequency = 12)

full_tseries<-ts(clima1$t2m, start = c(min(year(test$date)), min(month(test$date))) , frequency = 12)


melted_data <- melt(clima1, id.vars = "date") 
# ggplot(melted_data, aes(x = date, y = log10(value), color = variable)) + geom_line()

#DE ACA NO VAMOS A USAR NINGUN GRAF

tabla<-function(t_series, origen){
  mean_temp <- mean(t_series)
  median_temp <- median(t_series)
  min_temp <- min(t_series)
  max_temp <- max(t_series)
  std_dev_temp <- sd(t_series)
  summary_df <- data.frame(
    Statistic = c("Media", "Mediana", "Mínimo", "Máximo", "Desviación estándar"),
    Value = c(mean_temp, median_temp, min_temp, max_temp, std_dev_temp)
)
  colnames(summary_df)<-c("Estadístico", origen)
  return(summary_df)
}
resumen_train<-tabla(train_tseries, "Train")
resumen_test<-tabla(test_tseries, "Test")
resumen_completo<-tabla(full_tseries, "Base completa")

resumen <- resumen_train%>%
  left_join( resumen_test, by = "Estadístico") %>%
  left_join(resumen_completo, by = "Estadístico")

# Create the LaTeX table
table1<-xtable(resumen, digits = c(0,3,3,3, 3), align = c("l", "r", "r", "r", "r"))

# Print the LaTeX table
print(table1, include.rownames = FALSE)

% latex table generated in R 4.3.1 by xtable 1.8-4 package
% Mon Nov 20 02:56:21 2023
\begin{table}[ht]
\centering
\begin{tabular}{rrrr}
  \hline
Estadístico & Train & Test & Base completa \\ 
  \hline
Media & 10.419 & 11.144 & 10.613 \\ 
  Mediana & 10.075 & 11.480 & 10.390 \\ 
  Mínimo & -7.100 & -8.590 & -8.590 \\ 
  Máximo & 26.230 & 25.620 & 26.230 \\ 
  Desviación estándar & 9.195 & 9.356 & 9.234 \\ 
   \hline
\end{tabular}
\end{table}

Ajuste del modelo

train_tseries %>% ggseasonplot() +
  labs(x = "Mes",
       y = "Temperatura",
       title = NULL,
       fill = "Año",
       caption = "Fuente: Elaboración propia con datos de NASA")+
  theme(plot.caption = element_text(hjust = 0)) + 
  theme_cowplot()

pdf("trainSPlot.pdf", width = 10)
train_tseries %>% ggseasonplot() +
  labs(x = "Mes",
       y = "Temperatura",
       title = NULL,
       fill = "Año",
       caption = "Fuente: Elaboración propia con datos de NASA")+
  theme(plot.caption = element_text(hjust = 0)) + 
  theme_cowplot()
  # scale_color_viridis(discrete = TRUE, option = "D")+
  # scale_fill_viridis(discrete = TRUE) 
dev.off()

png 
  2

test_tseries %>% ggseasonplot() +
  labs(x = "Mes",
       y = "Temperatura",
       title = NULL,
       caption = "Fuente: Elaboración propia con datos de NASA")+
  theme(plot.caption = element_text(hjust = 0)) +
  theme_cowplot()

pdf("testSPlot.pdf", width = 10)
test_tseries %>% ggseasonplot() +
  labs(x = "Mes",
       y = "Temperatura",
       title = NULL,
       caption = "Fuente: Elaboración propia con datos de NASA")+
  theme(plot.caption = element_text(hjust = 0)) +
  theme_cowplot()
dev.off()

png 
  2

plot(train_tseries, xlab = ("Año"))
title(sub = "Fuente: Elaboración propia con datos de NASA", adj = 0)
modeloLineal = lm(train_tseries ~ time(train_tseries))
abline(reg = modeloLineal)

ylab("Temperatura")

$y
[1] "Temperatura"

attr(,"class")
[1] "labels"

xlab("Año")

$x
[1] "Año"

attr(,"class")
[1] "labels"

pdf("trainLM.pdf", width = 10)

plot(train_tseries, xlab = ("Año"))
title(sub = "Fuente: Elaboración propia con datos de NASA", adj = 0)
modeloLineal = lm(train_tseries ~ time(train_tseries))
abline(reg = modeloLineal) 
ylab("Temperatura")

$y
[1] "Temperatura"

attr(,"class")
[1] "labels"

xlab("Año")

$x
[1] "Año"

attr(,"class")
[1] "labels"

dev.off()

png 
  2

media = aggregate(train_tseries,FUN=mean)
plot(media, xlab = ("Año"), ylab = ("Temperatura promedio"))
title(sub = "Fuente: Elaboración propia con datos de NASA", adj = 0)

pdf("trainMA.pdf", width = 10)
media = aggregate(train_tseries,FUN=mean)
plot(media, xlab = ("Año"), ylab = ("Temperatura promedio"))
title(sub = "Fuente: Elaboración propia con datos de NASA", adj = 0)
dev.off()

png 
  2

boxplot(train_tseries ~ cycle(train_tseries), xlab = ("Mes"), ylab = "Temperatura", col = plasma(12))
title(sub = "Fuente: Elaboración propia con datos de NASA", adj = 0)

pdf("boxplot.pdf", width = 10)
boxplot(train_tseries ~ cycle(train_tseries), xlab = ("Mes"), ylab = "Temperatura", col = plasma(12))
title(sub = "Fuente: Elaboración propia con datos de NASA", adj = 0)
dev.off()

png 
  2

fit_temp <- auto.arima(train$t2m, ic="bic")
prediccion_temp <- forecast(fit_temp, h = length(test$t2m))
summary(fit_temp)

Series: train$t2m 
ARIMA(2,0,1) with non-zero mean 

Coefficients:
         ar1      ar2      ma1     mean
      1.7144  -0.9765  -0.7729  10.4039
s.e.  0.0113   0.0109   0.0245   0.0956

sigma^2 = 4.324:  log likelihood = -775.22
AIC=1560.45   AICc=1560.62   BIC=1579.88

Training set error measures:
                      ME     RMSE      MAE      MPE     MAPE      MASE
Training set -0.02546485 2.067822 1.605699 19.81736 73.14933 0.3545635
                   ACF1
Training set -0.3299556

plot(prediccion_temp)

adf.test(train_tseries)


    Augmented Dickey-Fuller Test

data:  train_tseries
Dickey-Fuller = -10.646, Lag order = 7, p-value = 0.01
alternative hypothesis: stationary

sarima_model <- auto.arima(train_tseries, seasonal = TRUE)

# Make predictions
# sarima_forecast <- forecast(sarima_model, h = length(test_tseries))

sarima_interval95 <-forecast(sarima_model, h = length(test_tseries), level=c(95))

esperanza <- sarima_interval95$mean
lower <- sarima_interval95$lower
upper <- sarima_interval95$upper
pruebaModelo<-data.frame(real=test_tseries,
                         predicho=esperanza,
                         intervalAbajo=lower, 
                         intervalArriba=upper,
                         Fecha=seq(from=as.Date("2011-01-01"), to=as.Date("2021-12-01"), by = "month"))

colnames(pruebaModelo)<-c("Temperatura real",
                          "Temperatura predicha",
                          "Límite inferior del intervalo (95%)", 
                          "Límite superior del intervalo (95%)", 
                          "Fecha"
                          )
pruebaModelo<-pruebaModelo%>%pivot_longer(cols=-Fecha,
                                          names_to = "Temperatura",
                                          values_to = "Observación")

accuracy_metrics <- accuracy(esperanza, test_tseries)

accuracy_table <- xtable(accuracy_metrics)

print(accuracy_table, digits = c(0,3,3, 3, 3, 3, 3, 3))

% latex table generated in R 4.3.1 by xtable 1.8-4 package
% Mon Nov 20 02:56:39 2023
\begin{table}[ht]
\centering
\begin{tabular}{rrrrrrrr}
  \hline
 & ME & RMSE & MAE & MPE & MAPE & ACF1 & Theil's U \\ 
  \hline
Test set & -0.01 & 2.04 & 1.54 & -263.96 & 359.57 & 0.24 & 1.40 \\ 
   \hline
\end{tabular}
\end{table}

# plot(test_tseries, main = NULL, xlab = ("Año"), ylab = "Temperaturas esperadas del conjunto de prueba")# "Actual vs. Predicted Temperatures")
# lines(sarima_forecast$mean, col = "red")
# legend("topleft", legend = c("Actual", "Predicción"), col = c("black", "red"), lty = 1)

# pdf("predTemp.pdf", width = 20)

# pdf("predTemp2.pdf", width = 20)

(ggplot(data = pruebaModelo, aes(x = Fecha, y = Observación, color = Temperatura)) +
  geom_line() +
  labs(caption = "Fuente: Elaboración propia con datos de NASA") +
  scale_color_manual(values = c("Temperatura real" = "black",
                                "Temperatura predicha"= "red",
                          "Límite inferior del intervalo (95%)"= "cyan", 
                          "Límite superior del intervalo (95%)"= "pink"))+
  theme_cowplot() +
  theme(plot.caption = element_text(hjust = 0)))

# dev.off()

library(xtable)
resultados <- data.frame(
  Prueba = c(
    # "Prueba de normalidad",
    "Prueba de Ljung-Box",
    "Prueba de heterocedasticidad",
    "Prueba de estacionariedad"
  ),
  Resultado = c(
    # shapiro.test(sarima_model$residuals)$p.value,
    Box.test(sarima_model$residuals, lag = 10, type = "Ljung-Box")$p.value,
    white.test(sarima_model$residuals)$p.value,
    adf.test(sarima_model$residuals)$p.value
  )
)
tabla_latex <- xtable(resultados, 
                      caption = "Resultados de las pruebas en los residuos del modelo SARIMA")
print(tabla_latex, type = "latex")

% latex table generated in R 4.3.1 by xtable 1.8-4 package
% Mon Nov 20 02:56:39 2023
\begin{table}[ht]
\centering
\begin{tabular}{rlr}
  \hline
 & Prueba & Resultado \\ 
  \hline
1 & Prueba de Ljung-Box & 0.86 \\ 
  2 & Prueba de heterocedasticidad & 0.47 \\ 
  3 & Prueba de estacionariedad & 0.01 \\ 
   \hline
\end{tabular}
\caption{Resultados de las pruebas en los residuos del modelo SARIMA} 
\end{table}

resumen<-summary(sarima_model)
coefficients <- resumen$coef
std_errors <- sqrt(diag(resumen$var.coef))
aic <- resumen$aic
bic <- resumen$bic
resumen_sarimaModel <- data.frame(
  Coefficients = coefficients,
  StandardErrors = std_errors
  # PValues = p_values
)
resumen_sarimaModel <- rbind(
  resumen_sarimaModel,
  c("AIC", aic, NA),
  c("BIC", bic, NA)
)
resumen_latex <- xtable(resumen_sarimaModel, caption = "Resumen del modelo ajustado")

print(resumen_latex, digits = c(0,3,3), align = c("l", "r", "r"))

% latex table generated in R 4.3.1 by xtable 1.8-4 package
% Mon Nov 20 02:56:39 2023
\begin{table}[ht]
\centering
\begin{tabular}{rll}
  \hline
 & Coefficients & StandardErrors \\ 
  \hline
ar1 & -0.216620226258099 & 0.289795046931894 \\ 
  ar2 & 0.372864986197777 & 0.242383365423148 \\ 
  ma1 & 0.424676067184716 & 0.305839459534777 \\ 
  ma2 & -0.16623733419521 & 0.25189477889407 \\ 
  sar1 & -0.687475433816859 & 0.0502949821880856 \\ 
  sar2 & -0.369423878628048 & 0.0514046999109033 \\ 
  drift & 0.00332001884099555 & 0.00582992935573155 \\ 
  8 & AIC & 1404.02492735219 \\ 
  9 & BIC & 1434.84254719039 \\ 
   \hline
\end{tabular}
\caption{Resumen del modelo ajustado} 
\end{table}

Box.test(sarima_model$resid,type="Ljung-Box")


    Box-Ljung test

data:  sarima_model$resid
X-squared = 0.0096337, df = 1, p-value = 0.9218

(acfPlot <- ggAcf(train_tseries, lag.max = 36)+
  labs(title="",caption = "Fuente: Elaboración propia con datos de NASA") +
  theme_cowplot()+
  theme(plot.caption = element_text(hjust = 0)) +
  xlab("Rezago")
)

# pdf("acf.pdf", width = 10)
acfPlot

# dev.off()

(pacfPlot <- ggPacf(train_tseries, lag.max = 36)+
  labs(title="",caption = "Fuente: Elaboración propia con datos de NASA") +
  theme_cowplot()+
  theme(plot.caption = element_text(hjust = 0)) +
  xlab("Rezago")
)

# pdf("pacf.pdf", width = 10)
pacfPlot

# dev.off()