R Notebook

library(TSstudio)

data("USgas")

ts_plot(USgas,
        title = "US Monthly Natural Gas consumption",
        Ytitle = "Billion Cubic Feet", Xtitle = "Year")

ts_info(USgas)

##  The USgas series is a ts object with 1 variable and 235 observations
##  Frequency: 12 
##  Start time: 2000 1 
##  End time: 2019 7

ts_decompose(USgas)

# transform the series from a ts object to a data.frame object. ”
# The function transforms the ts object into two columns of data.frame, where the two columns represent the time and numeric components of the series
USgas_df <- ts_to_prophet(USgas)
head(USgas)

## [1] 2510.5 2330.7 2050.6 1783.3 1632.9 1513.1

head(USgas_df)

# 1）“The first feature we will create is the series trend. ”
USgas_df$trend <- 1:nrow(USgas_df)

# 2）“The second feature we want to create is the seasonal component.”
# use the month function from the lubridate package to extract the month of the year from the ds date variable
library(lubridate)

## 
## Attaching package: 'lubridate'

## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union

USgas_df$seasonal <- month(USgas_df$ds, label = T)
head(USgas_df)

h <- 12 
train <- USgas_df[1:(nrow(USgas_df) - h), ]
test <- USgas_df[(nrow(USgas_df) - h + 1):nrow(USgas_df), ]

md_trend <- lm(y ~ trend, data = train)

summary(md_trend)

## 
## Call:
## lm(formula = y ~ trend, data = train)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -547.5 -307.6 -156.5  346.0 1038.5 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1750.3156    53.2454  32.873  < 2e-16 ***
## trend          2.4576     0.4122   5.963 9.74e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 396.2 on 221 degrees of freedom
## Multiple R-squared:  0.1386, Adjusted R-squared:  0.1347 
## F-statistic: 35.55 on 1 and 221 DF,  p-value: 9.739e-09

train$yhat <- predict(md_trend, newdata = train)
test$yhat <- predict(md_trend, newdata = test)

library(plotly)

## Loading required package: ggplot2

## 
## Attaching package: 'plotly'

## The following object is masked from 'package:ggplot2':
## 
##     last_plot

## The following object is masked from 'package:stats':
## 
##     filter

## The following object is masked from 'package:graphics':
## 
##     layout

plot_lm <- function(data, train, test, title = NULL){
  p <- plot_ly(data = data, 
               x = ~ ds, 
               y = ~ y, 
               type = "scatter",
               mode = "line",
               name = "Actual") %>%
    add_lines(x =  ~ train$ds,
              y = ~ train$yhat,
              line = list(color = "red"),
              name = "Fitted") %>%
    add_lines(x =  ~ test$ds,
              y = ~ test$yhat,
              line = list(color = "green", dash = "dot", width = 3),
              name = "Forecasted") %>%
    layout(title = title,
           xaxis = list(title = ""),
           yaxis = list(title = "Billion Cubic Feet"),
           legend = list(x = 0.05, y = 0.95))
  return(p)
}
# -------- Code Chank 13 --------
plot_lm(data = USgas_df, 
        train = train, 
        test = test,
        title = "Predicting the Trend Component of the Series")

# -------- Code Chank 14 --------
mape_trend <- c(mean(abs(train$y - train$yhat) / train$y), mean(abs(test$y - test$yhat) / test$y))
mape_trend

## [1] 0.1661594 0.1240414

md_seasonal <- lm(y ~ seasonal, data = train)
summary(md_seasonal)

## 
## Call:
## lm(formula = y ~ seasonal, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -606.98 -152.27  -52.98  141.70  549.23 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2023.62      14.00 144.494  < 2e-16 ***
## seasonal.L   -496.21      48.62 -10.206  < 2e-16 ***
## seasonal.Q   1112.76      48.49  22.948  < 2e-16 ***
## seasonal.C     96.77      48.61   1.991 0.047815 *  
## seasonal^4    185.39      48.50   3.823 0.000174 ***
## seasonal^5    278.16      48.59   5.725 3.53e-08 ***
## seasonal^6    -61.14      48.51  -1.260 0.208911    
## seasonal^7   -195.16      48.55  -4.020 8.12e-05 ***
## seasonal^8     85.22      48.52   1.756 0.080516 .  
## seasonal^9     44.72      48.48   0.922 0.357355    
## seasonal^10    71.65      48.55   1.476 0.141551    
## seasonal^11   -17.28      48.22  -0.358 0.720457    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 209.1 on 211 degrees of freedom
## Multiple R-squared:  0.771,  Adjusted R-squared:  0.7591 
## F-statistic: 64.59 on 11 and 211 DF,  p-value: < 2.2e-16

train$yhat <- predict(md_seasonal, newdata = train)
test$yhat <- predict(md_seasonal, newdata = test)

plot_lm(data = USgas_df, 
        train = train, 
        test = test,
        title = "Predicting the Seasonal Component of the Series")

# -------- Code Chank 17 --------
mape_seasonal <- c(mean(abs(train$y - train$yhat) / train$y), mean(abs(test$y - test$yhat) / test$y))
mape_seasonal

## [1] 0.08340338 0.20408252

md1 <- lm(y ~ seasonal + trend, data = train)

summary(md1)

## 
## Call:
## lm(formula = y ~ seasonal + trend, data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -515.91  -72.13  -13.97   85.37  328.98 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1740.2757    16.8832 103.077  < 2e-16 ***
## seasonal.L  -504.2525    29.2110 -17.262  < 2e-16 ***
## seasonal.Q  1117.6084    29.1299  38.366  < 2e-16 ***
## seasonal.C    89.3614    29.2045   3.060  0.00250 ** 
## seasonal^4   180.6430    29.1331   6.201 2.94e-09 ***
## seasonal^5   281.5304    29.1901   9.645  < 2e-16 ***
## seasonal^6   -56.6069    29.1400  -1.943  0.05340 .  
## seasonal^7  -196.1621    29.1676  -6.725 1.63e-10 ***
## seasonal^8    81.0555    29.1509   2.781  0.00592 ** 
## seasonal^9    43.7237    29.1241   1.501  0.13478    
## seasonal^10   75.1242    29.1685   2.576  0.01070 *  
## seasonal^11  -13.4849    28.9703  -0.465  0.64207    
## trend          2.5299     0.1307  19.357  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 125.6 on 210 degrees of freedom
## Multiple R-squared:  0.9178, Adjusted R-squared:  0.9131 
## F-statistic: 195.3 on 12 and 210 DF,  p-value: < 2.2e-16

# -------- Code Chank 19 --------
train$yhat <- predict(md1, newdata = train)
test$yhat <- predict(md1, newdata = test)


plot_lm(data = USgas_df, 
        train = train, 
        test = test,
        title = "Predicting the Seasonal Component of the Series")

# -------- Code Chank 20 --------
mape_md1 <- c(mean(abs(train$y - train$yhat) / train$y),
              mean(abs(test$y - test$yhat) / test$y))
mape_md1

## [1] 0.04652981 0.08404506

md2 <- lm(y ~ seasonal + trend + I(trend^2), data = train)
summary(md2)

## 
## Call:
## lm(formula = y ~ seasonal + trend + I(trend^2), data = train)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470.11  -54.52   -3.69   61.13  297.64 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 1877.37494   22.20507  84.547  < 2e-16 ***
## seasonal.L  -493.60713   25.47215 -19.378  < 2e-16 ***
## seasonal.Q  1119.88762   25.37023  44.142  < 2e-16 ***
## seasonal.C    85.88620   25.43713   3.376 0.000876 ***
## seasonal^4   178.25462   25.37315   7.025 2.96e-11 ***
## seasonal^5   283.10061   25.42187  11.136  < 2e-16 ***
## seasonal^6   -54.39236   25.37889  -2.143 0.033252 *  
## seasonal^7  -196.60962   25.40160  -7.740 4.19e-13 ***
## seasonal^8    79.04275   25.38814   3.113 0.002109 ** 
## seasonal^9    43.22206   25.36376   1.704 0.089851 .  
## seasonal^10   76.79860   25.40316   3.023 0.002814 ** 
## seasonal^11  -11.64491   25.23068  -0.462 0.644893    
## trend         -1.12357    0.45779  -2.454 0.014932 *  
## I(trend^2)     0.01631    0.00198   8.239 1.89e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109.4 on 209 degrees of freedom
## Multiple R-squared:  0.9379, Adjusted R-squared:  0.9341 
## F-statistic: 242.9 on 13 and 209 DF,  p-value: < 2.2e-16

train$yhat <- predict(md2, newdata = train)
test$yhat <- predict(md2, newdata = test)


plot_lm(data = USgas_df, 
        train = train, 
        test = test,
        title = "Predicting the Seasonal Component of the Series")

mape_md2 <- c(mean(abs(train$y - train$yhat) / train$y),
              mean(abs(test$y - test$yhat) / test$y))

mape_md2

## [1] 0.03696217 0.03962603

USgas_split <- ts_split(USgas, sample.out = 12)
train.ts <- USgas_split$train
test.ts <- USgas_split$test

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

md3 <- tslm(train.ts ~ season + trend + I(trend^2))
summary(md3)

## 
## Call:
## tslm(formula = train.ts ~ season + trend + I(trend^2))
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -470.11  -54.52   -3.69   61.13  297.64 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.631e+03  3.237e+01  81.258  < 2e-16 ***
## season2     -2.993e+02  3.549e+01  -8.434 5.49e-15 ***
## season3     -4.828e+02  3.549e+01 -13.604  < 2e-16 ***
## season4     -9.115e+02  3.549e+01 -25.686  < 2e-16 ***
## season5     -1.096e+03  3.549e+01 -30.895  < 2e-16 ***
## season6     -1.116e+03  3.549e+01 -31.458  < 2e-16 ***
## season7     -9.529e+02  3.549e+01 -26.849  < 2e-16 ***
## season8     -9.316e+02  3.599e+01 -25.888  < 2e-16 ***
## season9     -1.138e+03  3.599e+01 -31.614  < 2e-16 ***
## season10    -1.049e+03  3.599e+01 -29.161  < 2e-16 ***
## season11    -7.986e+02  3.599e+01 -22.191  < 2e-16 ***
## season12    -2.610e+02  3.599e+01  -7.253 7.81e-12 ***
## trend       -1.124e+00  4.578e-01  -2.454   0.0149 *  
## I(trend^2)   1.631e-02  1.980e-03   8.239 1.89e-14 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 109.4 on 209 degrees of freedom
## Multiple R-squared:  0.9379, Adjusted R-squared:  0.9341 
## F-statistic: 242.9 on 13 and 209 DF,  p-value: < 2.2e-16

r <- which(USgas_df$ds == as.Date("2014-01-01"))
USgas_df$s_break <- ifelse(year(USgas_df$ds) >= 2010, 1, 0)
USgas_df$s_break[r] <- 1
md3 <- tslm(USgas ~ season + trend + I(trend^2) + s_break, data = USgas_df)
summary(md3)

## 
## Call:
## tslm(formula = USgas ~ season + trend + I(trend^2) + s_break, 
##     data = USgas_df)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -469.46  -51.75   -5.44   66.41  277.89 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  2.656e+03  3.183e+01  83.458  < 2e-16 ***
## season2     -3.043e+02  3.428e+01  -8.876 2.46e-16 ***
## season3     -4.834e+02  3.428e+01 -14.101  < 2e-16 ***
## season4     -9.256e+02  3.428e+01 -27.001  < 2e-16 ***
## season5     -1.105e+03  3.429e+01 -32.230  < 2e-16 ***
## season6     -1.124e+03  3.429e+01 -32.789  < 2e-16 ***
## season7     -9.534e+02  3.430e+01 -27.796  < 2e-16 ***
## season8     -9.341e+02  3.475e+01 -26.883  < 2e-16 ***
## season9     -1.137e+03  3.475e+01 -32.727  < 2e-16 ***
## season10    -1.047e+03  3.476e+01 -30.135  < 2e-16 ***
## season11    -7.881e+02  3.477e+01 -22.670  < 2e-16 ***
## season12    -2.646e+02  3.478e+01  -7.608 8.06e-13 ***
## trend       -1.758e+00  4.498e-01  -3.908 0.000124 ***
## I(trend^2)   1.742e-02  1.722e-03  10.112  < 2e-16 ***
## s_break      6.945e+01  2.837e+01   2.448 0.015129 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 108.4 on 220 degrees of freedom
## Multiple R-squared:  0.9429, Adjusted R-squared:  0.9393 
## F-statistic: 259.4 on 14 and 220 DF,  p-value: < 2.2e-16

library(UKgrid)

UKdaily <- extract_grid(type = "data.frame", columns = "ND", aggregate = "daily")

head(UKdaily)

ts_plot(UKdaily,
        title = "The UK National Demand for Electricity",
        Ytitle = "MW",
        Xtitle = "Year")

# -------- Code Chank 26 --------
ts_heatmap(UKdaily[which(year(UKdaily$TIMESTAMP) >= 2016),],
           title = "UK the Daily National Grid Demand Heatmap")

# -------- Code Chank 27 --------
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

UKdaily <- UKdaily %>%
  mutate(wday = wday(TIMESTAMP, label = TRUE),
         month = month(TIMESTAMP, label = TRUE),
         lag365 = dplyr::lag(ND, 365)) %>%
  filter(!is.na(lag365)) %>%
  arrange(TIMESTAMP)
str(UKdaily)

## 'data.frame':    4939 obs. of  5 variables:
##  $ TIMESTAMP: Date, format: "2006-04-01" "2006-04-02" ...
##  $ ND       : int  1718405 1691341 1960325 2023886 2026204 2008422 1981175 1770440 1749715 2012865 ...
##  $ wday     : Ord.factor w/ 7 levels "Sun"<"Mon"<"Tue"<..: 7 1 2 3 4 5 6 7 1 2 ...
##  $ month    : Ord.factor w/ 12 levels "Jan"<"Feb"<"Mar"<..: 4 4 4 4 4 4 4 4 4 4 ...
##  $ lag365   : int  1920069 1674699 1631352 1916693 1952082 1964584 1990895 2003982 1811436 1684720 ...

# -------- Code Chank 28 --------
start_date <- min(UKdaily$TIMESTAMP)

head(UKdaily)

UK_ts <- ts(UKdaily$ND, 
            start = c(year(start_date), yday(start_date)),
            frequency = 365)
head(UK_ts)

## Time Series:
## Start = c(2006, 91) 
## End = c(2006, 96) 
## Frequency = 365 
## [1] 1718405 1691341 1960325 2023886 2026204 2008422

ts_acf(UK_ts, lag.max = 365 * 4)

## Warning: The 'ts_acf' function is deprecated, please use 'ts_cor' instead

h <-  365
UKpartitions <- ts_split(UK_ts, sample.out = h)
train_ts <- UKpartitions$train
test_ts <- UKpartitions$test

train_df <- UKdaily[1:(nrow(UKdaily) - h), ]
test_df <- UKdaily[(nrow(UKdaily) - h + 1):nrow(UKdaily), ]

md_tslm1 <- tslm(train_ts ~ season + trend)
fc_tslm1 <- forecast(md_tslm1, h = h)
test_forecast(actual = UK_ts,
              forecast.obj = fc_tslm1,
              test = test_ts)

accuracy(fc_tslm1, test_ts)

##                         ME     RMSE       MAE        MPE     MAPE      MASE
## Training set -9.030002e-12 121551.4 100439.64 -0.5721337 6.294111 0.8418677
## Test set     -1.740215e+04 123156.6  96785.27 -1.8735336 7.160573 0.8112374
##                   ACF1 Theil's U
## Training set 0.5277884        NA
## Test set     0.5106681  1.071899

md_tslm2 <- tslm(train_ts ~ season + trend + wday, data = train_df)
fc_tslm2 <- forecast(md_tslm2, h = h, newdata = test_df)
test_forecast(actual = UK_ts,
              forecast.obj = fc_tslm2,
              test = test_ts)

accuracy(fc_tslm2, test_ts)

##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -9.812170e-13 70245.98 52146.79 -0.1738708 3.167605 0.4370853
## Test set     -1.764563e+04 80711.71 65373.21 -1.3715505 4.682071 0.5479470
##                   ACF1 Theil's U
## Training set 0.7513664        NA
## Test set     0.6075598   0.68445

md_tslm3 <- tslm(train_ts ~ season + trend + wday + month + lag365, data = train_df)
fc_tslm3 <- forecast(md_tslm3, h = h, newdata = test_df)
test_forecast(actual = UK_ts,
              forecast.obj = fc_tslm3,
              test = test_ts)

accuracy(fc_tslm3, test_ts)

##                         ME     RMSE      MAE        MPE     MAPE      MASE
## Training set -1.175109e-11 69904.92 52006.75 -0.1717563 3.163385 0.4359116
## Test set     -1.754061e+04 81783.55 65957.66 -1.3613252 4.722083 0.5528457
##                   ACF1 Theil's U
## Training set 0.7500146        NA
## Test set     0.6094666 0.6925767

summary(md_tslm3)$coefficients %>% tail(1)

##           Estimate Std. Error   t value     Pr(>|t|)
## lag365 -0.06038328 0.01545495 -3.907051 9.490321e-05

anova(md_tslm3)

final_md <- tslm(UK_ts ~ season + trend + wday + month + lag365, data = UKdaily)
checkresiduals(final_md)

## 
##  Breusch-Godfrey test for serial correlation of order up to 730
## 
## data:  Residuals from Linear regression model
## LM test = 3301.1, df = 730, p-value < 2.2e-16

UK_fc_df <- data.frame(date = seq.Date(from = max(UKdaily$TIMESTAMP) + days(1), 
                                       by = "day", length.out = h))
UK_fc_df$wday <- factor(lubridate::wday(UK_fc_df$date, label = TRUE), ordered = FALSE)
UK_fc_df$month <- factor(month(UK_fc_df$date, label = TRUE), ordered = FALSE)
UK_fc_df$lag365 <- tail(UKdaily$ND, h)
head(UK_fc_df)

UKgrid_fc <- forecast(final_md, h = h, newdata = UK_fc_df)
plot_forecast(UKgrid_fc,
              title = "The UK National Demand for Electricity Forecast",
              Ytitle = "MW", Xtitle = "Year")