library(TSstudio)
data("USgas")
ts_plot(USgas,
title = "US Monthly Natural Gas consumption",
Ytitle = "Billion Cubic Feet",
Xtitle = "Year")
En este grafico podemos observar una tendencia positiva o asendente , en la serie de tiempo. Tambien los datos resultantes tienden a aumentar conforme transcurre el tiempo.
ts_info(USgas)
## The USgas series is a ts object with 1 variable and 238 observations
## Frequency: 12
## Start time: 2000 1
## End time: 2019 10
ts_decompose(USgas)
En este grafico podemos observar la decomposition de las categorias usadas para la creacionde una serie de tiempo del consumo de gas en Estados unidos. Las categorias presentadas en los cuatro graficos son, aleatoriedad, observaciones, estacionalidad y tendencia.
USgas_df <- ts_to_prophet(USgas)
head(USgas_df)
## ds y
## 1 2000-01-01 2510.5
## 2 2000-02-01 2330.7
## 3 2000-03-01 2050.6
## 4 2000-04-01 1783.3
## 5 2000-05-01 1632.9
## 6 2000-06-01 1513.1
Matris de datos.
USgas_df$trend <- 1:nrow(USgas_df)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
USgas_df$seasonal <- month(USgas_df$ds, label = T)
head(USgas_df)
## ds y trend seasonal
## 1 2000-01-01 2510.5 1 ene
## 2 2000-02-01 2330.7 2 feb
## 3 2000-03-01 2050.6 3 mar
## 4 2000-04-01 1783.3 4 abr
## 5 2000-05-01 1632.9 5 may
## 6 2000-06-01 1513.1 6 jun
Matris de datos.
h <- 12 # setting a testing partition length
train <- USgas_df[1:(nrow(USgas_df) - h), ]
test <- USgas_df[(nrow(USgas_df) - h + 1):nrow(USgas_df), ]
md_trend <- lm(y ~ trend, data = train)
summary(md_trend)
##
## Call:
## lm(formula = y ~ trend, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -547.2 -307.4 -153.2 333.1 1052.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1751.0074 52.6435 33.26 < 2e-16 ***
## trend 2.4489 0.4021 6.09 4.86e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 394.4 on 224 degrees of freedom
## Multiple R-squared: 0.1421, Adjusted R-squared: 0.1382
## F-statistic: 37.09 on 1 and 224 DF, p-value: 4.861e-09
train$yhat <- predict(md_trend, newdata = train)
test$yhat <- predict(md_trend, newdata = test)
library(plotly)
## Loading required package: ggplot2
##
## Attaching package: 'plotly'
## The following object is masked from 'package:ggplot2':
##
## last_plot
## The following object is masked from 'package:stats':
##
## filter
## The following object is masked from 'package:graphics':
##
## layout
plot_lm <- function(data, train, test, title = NULL){
p <- plot_ly(data = data,
x = ~ ds,
y = ~ y,
type = "scatter",
mode = "line",
name = "Actual") %>%
add_lines(x = ~ train$ds,
y = ~ train$yhat,
line = list(color = "red"),
name = "Fitted") %>%
add_lines(x = ~ test$ds,
y = ~ test$yhat,
line = list(color = "green", dash = "dot", width = 3),
name = "Forecasted") %>%
layout(title = title,
xaxis = list(title = ""),
yaxis = list(title = "Billion Cubic Feet"),
legend = list(x = 0.05, y = 0.95))
return(p)
}
plot_lm(data = USgas_df,
train = train,
test = test,
title = "Predicting the Trend Component of the Series")
En este grafico se observa la serie de tiempo de la grafica anterior. En el cual hay una funcion que indica el valor ajustado de la media de la prediccion de los datos introducidos.Sin embargo podemos ver que su predicciion no es correcta puesto que no oscilacion. Tambien la linea verde punteada indica el comportamiento que va a tener la funcion, al igual funcion roja no es completaamente confiable.
mape_trend <- c(mean(abs(train$y - train$yhat) / train$y),
mean(abs(test$y - test$yhat) / test$y))
mape_trend
## [1] 0.1644088 0.1299951
md_seasonal <- lm(y ~ seasonal, data = train)
summary(md_seasonal)
##
## Call:
## lm(formula = y ~ seasonal, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -608.98 -162.34 -50.77 148.40 566.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2030.88 14.43 140.747 < 2e-16 ***
## seasonal.L -480.00 50.24 -9.554 < 2e-16 ***
## seasonal.Q 1103.83 50.17 22.000 < 2e-16 ***
## seasonal.C 72.42 50.05 1.447 0.149389
## seasonal^4 174.60 50.07 3.487 0.000592 ***
## seasonal^5 288.01 50.13 5.745 3.13e-08 ***
## seasonal^6 -44.67 50.09 -0.892 0.373460
## seasonal^7 -187.91 49.96 -3.762 0.000218 ***
## seasonal^8 84.95 49.84 1.705 0.089706 .
## seasonal^9 46.16 49.78 0.927 0.354828
## seasonal^10 77.55 49.76 1.559 0.120587
## seasonal^11 -11.09 49.75 -0.223 0.823856
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 216.9 on 214 degrees of freedom
## Multiple R-squared: 0.7521, Adjusted R-squared: 0.7394
## F-statistic: 59.04 on 11 and 214 DF, p-value: < 2.2e-16
train$yhat <- predict(md_seasonal, newdata = train)
test$yhat <- predict(md_seasonal, newdata = test)
plot_lm(data = USgas_df,
train = train,
test = test,
title = "Predicting the Seasonal Component of the Series")
En el grafico se vuelve a presentar la misma serie de tiempo, pero esta vez el modelo de predicccion se ajusta para aproximarse a los datos introducidos. La funcion de color rojo, toma una asimilacion a la tendencia de suvida y vajada de la serie de tiempo. Sin embargo, se mantien paralela al eje, lo cual no evidencia la tendencia creciente del consumo de gas a partir del 2012.Por lo tanto, la linea punteada tampoco va a ser completamente fiable.
mape_seasonal <- c(mean(abs(train$y - train$yhat) / train$y),
mean(abs(test$y - test$yhat) / test$y))
mape_seasonal
## [1] 0.08628973 0.21502100
md1 <- lm(y ~ seasonal + trend, data = train)
summary(md1)
##
## Call:
## lm(formula = y ~ seasonal + trend, data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -514.73 -77.17 -17.70 85.80 336.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1733.7153 17.0794 101.509 < 2e-16 ***
## seasonal.L -498.1709 29.6354 -16.810 < 2e-16 ***
## seasonal.Q 1115.2951 29.5872 37.695 < 2e-16 ***
## seasonal.C 78.9835 29.5103 2.676 0.00802 **
## seasonal^4 175.6505 29.5196 5.950 1.09e-08 ***
## seasonal^5 285.0192 29.5568 9.643 < 2e-16 ***
## seasonal^6 -49.3611 29.5319 -1.671 0.09610 .
## seasonal^7 -192.3050 29.4540 -6.529 4.77e-10 ***
## seasonal^8 81.8787 29.3835 2.787 0.00581 **
## seasonal^9 44.4849 29.3480 1.516 0.13106
## seasonal^10 76.8636 29.3372 2.620 0.00943 **
## seasonal^11 -11.2755 29.3353 -0.384 0.70109
## trend 2.6182 0.1305 20.065 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 127.9 on 213 degrees of freedom
## Multiple R-squared: 0.9142, Adjusted R-squared: 0.9094
## F-statistic: 189.2 on 12 and 213 DF, p-value: < 2.2e-16
train$yhat <- predict(md1, newdata = train)
test$yhat <- predict(md1, newdata = test)
plot_lm(data = USgas_df,
train = train,
test = test,
title = "Predicting the Seasonal Component of the Series")
Ahora en esta nueva grafica se ve al modelo predictivo seguir una la tendencia de los datos , con osilaciones parecidas a las que presentan los datos introducidos. Sin embargo, hay maximos y minimos que no se terminan de ajustar con verasidad a los datos introducidos.
mape_md1 <- c(mean(abs(train$y - train$yhat) / train$y),
mean(abs(test$y - test$yhat) / test$y))
mape_md1
## [1] 0.04774945 0.09143290
md2 <- lm(y ~ seasonal + trend + I(trend^2), data = train)
summary(md2)
##
## Call:
## lm(formula = y ~ seasonal + trend + I(trend^2), data = train)
##
## Residuals:
## Min 1Q Median 3Q Max
## -468.47 -54.66 -2.21 63.11 294.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.882e+03 2.199e+01 85.568 < 2e-16 ***
## seasonal.L -4.917e+02 2.530e+01 -19.438 < 2e-16 ***
## seasonal.Q 1.121e+03 2.525e+01 44.381 < 2e-16 ***
## seasonal.C 8.247e+01 2.518e+01 3.275 0.00123 **
## seasonal^4 1.763e+02 2.519e+01 6.999 3.35e-11 ***
## seasonal^5 2.835e+02 2.522e+01 11.243 < 2e-16 ***
## seasonal^6 -5.175e+01 2.520e+01 -2.054 0.04123 *
## seasonal^7 -1.946e+02 2.513e+01 -7.741 3.97e-13 ***
## seasonal^8 8.030e+01 2.507e+01 3.203 0.00157 **
## seasonal^9 4.362e+01 2.504e+01 1.742 0.08293 .
## seasonal^10 7.651e+01 2.503e+01 3.057 0.00253 **
## seasonal^11 -1.137e+01 2.503e+01 -0.454 0.65005
## trend -1.270e+00 4.472e-01 -2.840 0.00494 **
## I(trend^2) 1.713e-02 1.908e-03 8.977 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109.1 on 212 degrees of freedom
## Multiple R-squared: 0.9379, Adjusted R-squared: 0.9341
## F-statistic: 246.1 on 13 and 212 DF, p-value: < 2.2e-16
train$yhat <- predict(md2, newdata = train)
test$yhat <- predict(md2, newdata = test)
plot_lm(data = USgas_df,
train = train,
test = test,
title = "Predicting the Seasonal Component of the Series")
mape_md2 <- c(mean(abs(train$y - train$yhat) / train$y),
mean(abs(test$y - test$yhat) / test$y))
mape_md2
## [1] 0.03688770 0.04212618
En este ultimo grafico que presentara a la serie de tiempo inicial, se puede observar un modelo de prediciion modificado, el cual es mas exacto en cuanto a los puntos maximos y minimos.Por lo tanto, el ajuste al modelo de prediccion es mas cercano, a la realidad del consumo de gas.
USgas_split <- ts_split(USgas, sample.out = h)
train.ts <- USgas_split$train
test.ts <- USgas_split$test
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
md3 <- tslm(train.ts ~ season + trend + I(trend^2))
summary(md3)
##
## Call:
## tslm(formula = train.ts ~ season + trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -468.47 -54.66 -2.21 63.11 294.32
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.635e+03 3.224e+01 81.738 < 2e-16 ***
## season2 -3.004e+02 3.540e+01 -8.487 3.69e-15 ***
## season3 -4.841e+02 3.540e+01 -13.676 < 2e-16 ***
## season4 -9.128e+02 3.540e+01 -25.787 < 2e-16 ***
## season5 -1.099e+03 3.540e+01 -31.033 < 2e-16 ***
## season6 -1.118e+03 3.540e+01 -31.588 < 2e-16 ***
## season7 -9.547e+02 3.540e+01 -26.968 < 2e-16 ***
## season8 -9.322e+02 3.541e+01 -26.329 < 2e-16 ***
## season9 -1.136e+03 3.541e+01 -32.070 < 2e-16 ***
## season10 -1.046e+03 3.541e+01 -29.532 < 2e-16 ***
## season11 -8.001e+02 3.590e+01 -22.286 < 2e-16 ***
## season12 -2.618e+02 3.590e+01 -7.293 5.95e-12 ***
## trend -1.270e+00 4.472e-01 -2.840 0.00494 **
## I(trend^2) 1.713e-02 1.908e-03 8.977 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109.1 on 212 degrees of freedom
## Multiple R-squared: 0.9379, Adjusted R-squared: 0.9341
## F-statistic: 246.1 on 13 and 212 DF, p-value: < 2.2e-16
r <- which(USgas_df$ds == as.Date("2014-01-01"))
USgas_df$s_break <- ifelse(year(USgas_df$ds) >= 2010, 1, 0)
USgas_df$s_break[r] <- 1
md3 <- tslm(USgas ~ season + trend + I(trend^2) + s_break, data = USgas_df)
summary(md3)
##
## Call:
## tslm(formula = USgas ~ season + trend + I(trend^2) + s_break,
## data = USgas_df)
##
## Residuals:
## Min 1Q Median 3Q Max
## -469.25 -50.68 -2.66 63.63 275.89
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 2.661e+03 3.200e+01 83.164 < 2e-16 ***
## season2 -3.054e+02 3.448e+01 -8.858 2.61e-16 ***
## season3 -4.849e+02 3.448e+01 -14.062 < 2e-16 ***
## season4 -9.272e+02 3.449e+01 -26.885 < 2e-16 ***
## season5 -1.108e+03 3.449e+01 -32.114 < 2e-16 ***
## season6 -1.127e+03 3.450e+01 -32.660 < 2e-16 ***
## season7 -9.568e+02 3.450e+01 -27.730 < 2e-16 ***
## season8 -9.340e+02 3.451e+01 -27.061 < 2e-16 ***
## season9 -1.138e+03 3.452e+01 -32.972 < 2e-16 ***
## season10 -1.040e+03 3.453e+01 -30.122 < 2e-16 ***
## season11 -7.896e+02 3.497e+01 -22.577 < 2e-16 ***
## season12 -2.649e+02 3.498e+01 -7.571 9.72e-13 ***
## trend -1.928e+00 4.479e-01 -4.304 2.51e-05 ***
## I(trend^2) 1.862e-02 1.676e-03 11.113 < 2e-16 ***
## s_break 6.060e+01 2.836e+01 2.137 0.0337 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 109 on 223 degrees of freedom
## Multiple R-squared: 0.9423, Adjusted R-squared: 0.9387
## F-statistic: 260.3 on 14 and 223 DF, p-value: < 2.2e-16
library(UKgrid)
UKdaily <- extract_grid(type = "data.frame",
columns = "ND",
aggregate = "daily")
head(UKdaily)
## TIMESTAMP ND
## 1 2005-04-01 1920069
## 2 2005-04-02 1674699
## 3 2005-04-03 1631352
## 4 2005-04-04 1916693
## 5 2005-04-05 1952082
## 6 2005-04-06 1964584
ts_plot(UKdaily,
title = "The UK National Demand for Electricity",
Ytitle = "MW",
Xtitle = "Year")
El presente grafico nos brinda un conjunto de datos relacionados a la demanda de electricidad en el reino unido. En el periodo comprendio entre 2006 y 2018(eje x), y (eje y) el consumo en millones de watts. En este ejemplo podemos ver cambios en los valores de los datos, que tienden a decreser con el paso del tiempo, pero la magnitud del cambio estacional o periodico(Por que mantienen un patron que se repite en un perio) sigue siendo la misma.
ts_heatmap(UKdaily[which(year(UKdaily$TIMESTAMP) >= 2016),],
title = "UK the Daily National Grid Demand Heatmap")
Este es un grafico tipo mapa de calor. Los datos son tomados en el mismo periodo de tiempo semanal, en diferentes años(2018,2019 y 2020). La densidad de los datos se denotan con colores, entre mas oscuro se encuntre el color mayor densidad tendra.Estos datos son medidos en millones. Se puede observar que, a los margens laterales(izquierdo y derecho) se presentan gran cantidad de datos, con respecto al centro.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
UKdaily <- UKdaily %>%
mutate(wday = wday(TIMESTAMP, label = TRUE),
month = month(TIMESTAMP, label = TRUE),
lag365 = dplyr::lag(ND, 365)) %>%
filter(!is.na(lag365)) %>%
arrange(TIMESTAMP)
str(UKdaily)
## 'data.frame': 4939 obs. of 5 variables:
## $ TIMESTAMP: Date, format: "2006-04-01" "2006-04-02" ...
## $ ND : int 1718405 1691341 1960325 2023886 2026204 2008422 1981175 1770440 1749715 2012865 ...
## $ wday : Ord.factor w/ 7 levels "dom\\."<"lun\\."<..: 7 1 2 3 4 5 6 7 1 2 ...
## $ month : Ord.factor w/ 12 levels "ene"<"feb"<"mar"<..: 4 4 4 4 4 4 4 4 4 4 ...
## $ lag365 : int 1920069 1674699 1631352 1916693 1952082 1964584 1990895 2003982 1811436 1684720 ...
start_date <- min(UKdaily$TIMESTAMP)
UK_ts <- ts(UKdaily$ND,
start = c(year(start_date), yday(start_date)),
frequency = 365)
ts_cor(UK_ts, lag.max = 365 * 4)
h <- 365
UKpartitions <- ts_split(UK_ts, sample.out = h)
train_ts <- UKpartitions$train
test_ts <- UKpartitions$test
train_df <- UKdaily[1:(nrow(UKdaily) - h), ]
test_df <- UKdaily[(nrow(UKdaily) - h + 1):nrow(UKdaily), ]
md_tslm1 <- tslm(train_ts ~ season + trend)
fc_tslm1 <- forecast(md_tslm1, h = h)
test_forecast(actual = UK_ts,
forecast.obj = fc_tslm1,
test = test_ts)
accuracy(fc_tslm1, test_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set -4.781286e-12 121551.4 100439.64 -0.5721337 6.294111 0.8418677
## Test set -1.740215e+04 123156.6 96785.27 -1.8735336 7.160573 0.8112374
## ACF1 Theil's U
## Training set 0.5277884 NA
## Test set 0.5106681 1.071899
En este grafico se puede observar que la funcion de color rojo (“fitted”) tiene una forma parecida a la funcion que forman los datos introducidos, lo mismo aplica para el modelo (“Forestcat”). Ambas funciones tienen una tendencia negativa, manteniendo un intervalo de oscilacion estable.
md_tslm2 <- tslm(train_ts ~ season + trend + wday, data = train_df)
fc_tslm2 <- forecast(md_tslm2, h = h, newdata = test_df)
test_forecast(actual = UK_ts,
forecast.obj = fc_tslm2,
test = test_ts)
accuracy(fc_tslm2, test_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set 8.172823e-12 70245.98 52146.79 -0.1738708 3.167605 0.4370853
## Test set -1.764563e+04 80711.71 65373.21 -1.3715505 4.682071 0.5479470
## ACF1 Theil's U
## Training set 0.7513664 NA
## Test set 0.6075598 0.68445
En este grafico se puede observar que la funcion de color rojo (“fitted”) tiene una forma parecida a la funcion que forman lod datos introducidos, lo mismo aplica para el modelo (“Forestcat”). Ambas funciones tienen una tendencia negativa, manteniendo un intervalo de oscilacion estable. Pero en esta nueva grafica se visualiza que tamto la funcion de color rojo, como el modelo cubren casi todos los datos introducidos. Por lo tanto, encontramos que el modelo ha sido modificado para una mayor presicion. Sin embargo, se visualizan maximos y minimos relativos que quedan por fuera del modelo predictivo.
md_tslm3 <- tslm(train_ts ~ season + trend + wday + month + lag365, data = train_df)
fc_tslm3 <- forecast(md_tslm3, h = h, newdata = test_df)
test_forecast(actual = UK_ts,
forecast.obj = fc_tslm3,
test = test_ts)
accuracy(fc_tslm3, test_ts)
## ME RMSE MAE MPE MAPE MASE
## Training set -9.836939e-13 69904.92 52006.75 -0.1717563 3.163385 0.4359116
## Test set -1.754061e+04 81783.55 65957.66 -1.3613252 4.722083 0.5528457
## ACF1 Theil's U
## Training set 0.7500146 NA
## Test set 0.6094666 0.6925767
E n esta nueva grafica, obtenemos un modelo mucho mas acertado, que el del grafico anterior.
summary(md_tslm3)$coefficients %>% tail(1)
## Estimate Std. Error t value Pr(>|t|)
## lag365 -0.06038328 0.01545495 -3.907051 9.490321e-05
anova(md_tslm3)
## Analysis of Variance Table
##
## Response: train_ts
## Df Sum Sq Mean Sq F value Pr(>F)
## season 364 1.4279e+14 3.9227e+11 73.5348 < 2.2e-16 ***
## trend 1 7.2634e+13 7.2634e+13 13615.7078 < 2.2e-16 ***
## wday 6 4.5009e+13 7.5016e+12 1406.2214 < 2.2e-16 ***
## month 11 1.3721e+11 1.2473e+10 2.3382 0.007266 **
## lag365 1 8.1432e+10 8.1432e+10 15.2650 9.49e-05 ***
## Residuals 4190 2.2352e+13 5.3345e+09
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
final_md <- tslm(UK_ts ~ season + trend + wday + month + lag365,
data = UKdaily)
checkresiduals(final_md)
##
## Breusch-Godfrey test for serial correlation of order up to 730
##
## data: Residuals from Linear regression model
## LM test = 3301.1, df = 730, p-value < 2.2e-16
UK_fc_df <- data.frame(date = seq.Date(from = max(UKdaily$TIMESTAMP) + days(1),
by = "day",
length.out = h))
UK_fc_df$wday <- factor(lubridate::wday(UK_fc_df$date, label = TRUE), ordered = FALSE)
UK_fc_df$month <- factor(month(UK_fc_df$date, label = TRUE), ordered = FALSE)
UK_fc_df$lag365 <- tail(UKdaily$ND, h)
UKgrid_fc <- forecast(final_md, h = h, newdata = UK_fc_df)
plot_forecast(UKgrid_fc,
title = "The UK National Demand for Electricity Forecast",
Ytitle = "MW",
Xtitle = "Year")