Homework

library(dplyr)
library(ggplot2)
library(tidyr)
library(tibble)
library(tsibble)
library(ggfortify)
library(tidyverse)
library(fpp3)
library(moments)
library(zoo)
library(fable)
library(readxl)
library(seasonal)
library(caTools)

#Problem 1

jan2014_elec <-
    vic_elec %>% 
    filter(yearmonth(Time) == yearmonth('2014 Jan')) %>% 
    index_by(Day = as_date(Time)) %>% 
    summarise(
        Demand = sum(Demand),
        Temperature = max(Temperature)
    )

#1A

jan2014_elec %>%
    pivot_longer(c(Demand, Temperature), names_to = 'Measure', values_to = 'Value') %>% 
    ggplot() +
    geom_line(aes(Day, Value)) +
    facet_grid(rows = vars(Measure), scales = 'free') +
    labs(
        x = 'Date',
        y = 'Temperature (Celcius) & Demand (GWh)',
        title = 'Victorian Electricity Demand & Temperature',
        subtitle = 'January 2014'
    )

jan2014_elec %>%
    pivot_longer(c(Demand, Temperature), names_to = 'Measure', values_to = 'Value') %>% 
    ggplot() +
    geom_point(aes(Day, Value)) +
    facet_grid(rows = vars(Measure), scales = 'free') +
    labs(
        x = 'Date',
        y = 'Temperature (Celcius) & Demand (GWh)',
        title = 'Victorian Electricity Demand & Temperature',
        subtitle = "January 2014"
)

jan2014_elec %>% 
    ggplot(aes(Temperature, Demand)) +
    geom_point() +
    geom_smooth(method = 'lm', formula = 'y~x') +
  labs(title = "Victorian Electricity Demand as a Regression Model", subtitle = "January 2014")

#There is a positive relationship between demand and temperature because people will want to keep their houses cooler (there demand more electricity) as temperatures rise.

#1B

jan2014_residual <-
    jan2014_elec %>% 
    model(lm = TSLM(Demand ~ Temperature))
jan2014_residual %>% 
    augment() %>% 
    ggplot() +
    geom_point(aes(Demand, .resid)) +
    labs(
        title = 'Victorian Electricity Demand - Jan 2014',
        subtitle = 'Linear Model Residuals',
        x = 'Demand (GWh)',
        y = 'Model Residuals'
    )

#There do not appear to be any outliers but there appears to be some heteroskedasticity, meaning this model can’t be very good for forecasting.

#1C

jan2014_residual %>% 
    forecast(
        new_data(jan2014_elec, 1) %>% mutate(Temperature = 15)
    ) %>% 
    autoplot(jan2014_elec)

jan2014_residual %>% 
    forecast(
        new_data(jan2014_elec, 1) %>% mutate(Temperature = 35)
    ) %>%  
    autoplot(jan2014_elec)

#I don’t believe these forecasts for a second. The prediction intervals and point forecasts fall way outside the normal range of electricity demand.

#1D

jan2014_residual %>% 
    forecast(
        new_data(jan2014_elec, 1) %>% mutate(Temperature = 15)
    ) %>% 
    hilo(level = 80) %>% 
    pull(`80%`)

## <hilo[1]>
## [1] [117908.1, 184888.6]80

#These are the prediction intervals at the 80% precent level.

#1E

all_vic_elec <- vic_elec %>%
    index_by(Day = as_date(Time)) %>% 
    summarise(
        Demand = sum(Demand),
        Temperature = max(Temperature)
    )

all_vic_elec %>%
    pivot_longer(c(Demand, Temperature), names_to = 'Measure', values_to = 'Value') %>% 
    ggplot() +
    geom_line(aes(Day, Value)) +
    facet_grid(rows = vars(Measure), scales = 'free') +
    labs(
        x = 'Date',
        y = 'Temperature (Celcius) & Demand (GWh)',
        title = 'Victorian Electricity Demand & Temperature')

all_vic_elec %>% 
    ggplot(aes(Temperature, Demand)) +
    geom_point() +
    geom_smooth(method = 'lm', formula = 'y~x') +
  labs(title = "Victorian Electricity Demand as a Regression Model")

#Our model suffers from the same issue of the entire data set. There is obvious heteroskedasticity in this plot, meaning that we would be unable to make accurate forecasts using this data without trying to force some homoskedasticity.

#PROBLEM 2

#2A

autoplot(olympic_running) +
  facet_wrap(~Length + Sex, scales = "free") +
  labs(title = "Winning Olympic Running Times", x = "Year", y = "Time") +
  theme(legend.position = "none")

## Plot variable not specified, automatically selected `.vars = Time`

#These plots show that the winning times for almost every event and across both sexes has been decreasing steadily over time.

autoplot(olympic_running) +
  facet_wrap(~Length + Sex, scales = "free") +
  labs(title = "Winning Olympic Running Times w/Regression", x = "Year", y = "Time") +
  theme(legend.position = "none") +
  geom_smooth(method="lm", se=FALSE)

## Plot variable not specified, automatically selected `.vars = Time`
## `geom_smooth()` using formula 'y ~ x'

## Warning: Removed 31 rows containing non-finite values (stat_smooth).

olympic_avg <- lm(Year ~ ., olympic_running)
olympic_avg

## 
## Call:
## lm(formula = Year ~ ., data = olympic_running)
## 
## Coefficients:
## (Intercept)       Length     Sexwomen         Time  
##  1947.93654      0.05872     34.13151     -0.32515

#The average winning time has been decreasing by 0.3 seconds each year roughly.

library(forecast)

## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo

## Registered S3 methods overwritten by 'forecast':
##   method                 from     
##   autoplot.Arima         ggfortify
##   autoplot.acf           ggfortify
##   autoplot.ar            ggfortify
##   autoplot.bats          ggfortify
##   autoplot.decomposed.ts ggfortify
##   autoplot.ets           ggfortify
##   autoplot.forecast      ggfortify
##   autoplot.stl           ggfortify
##   autoplot.ts            ggfortify
##   fitted.ar              ggfortify
##   fortify.ts             ggfortify
##   residuals.ar           ggfortify

## 
## Attaching package: 'forecast'

## The following objects are masked from 'package:fabletools':
## 
##     accuracy, forecast

checkresiduals(olympic_avg)

## 
##  Breusch-Godfrey test for serial correlation of order up to 10
## 
## data:  Residuals
## LM test = 179.57, df = 10, p-value < 2.2e-16

detach_package <- function(pkg, character.only = FALSE)
{
  if(!character.only)
  {
    pkg <- deparse(substitute(pkg))
  }
  search_item <- paste("package", pkg, sep = ":")
  while(search_item %in% search())
  {
    detach(search_item, unload = TRUE, character.only = TRUE)
  }
}

detach_package(forecast)

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "100") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 100 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "100") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
   labs(title = "Women's 100 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "200") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 200 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "200") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Women's 200 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "400") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 400 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "400") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Women's 400 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "800") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 800 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "800") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Women's 800 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "1500") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 1500 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "1500") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Women's 1500 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "5000") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 5000 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "5000") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Women's 5000 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "men") %>%
  filter(Length == "10000") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Men's 10000 Meter Winning Time Forecast")

olympic_running %>%
  filter(Sex == "women") %>%
  filter(Length == "10000") %>%
  model(TSLM(Time ~ Year)) %>%
  forecast(new_data(olympic_running, 1) %>%
             mutate(Year = 2020)) %>%
  autoplot(olympic_running) +
  labs(title = "Women's 10000 Meter Winning Time Forecast")

#Problem 3

autoplot(souvenirs) + 
  labs(title = "Time Series of Souvenirs")

## Plot variable not specified, automatically selected `.vars = Sales`

#There are consistent spikes aroyund December of each year. This is likely due to the fact summer happens in Australia when it is December for the United States.

souvenirs_numeric <- souvenirs
souvenirs_sales <- as.numeric(souvenirs$Sales)
hist(souvenirs_sales)

souvenirs_log <- souvenirs_numeric
souvenirs_log_sales <- log(souvenirs_numeric$Sales)
hist(souvenirs_log_sales)

#Because of the dramatic spikes in December, the data is not normally distributed, meaning logging the data is necessary.

souvenir_ts <- ts(souvenirs$Sales,start=c(1987,1), end=c(1993,6), frequency=12)
souvenir_log2 <- log(souvenir_ts)
dummysurffest <- rep(0, length(souvenir_ts))
dummysurffest[seq_along(dummysurffest)%%12 == 3] = 1
dummysurffest[3] = 0
dummysurffest <- ts(dummysurffest, freq = 12, start=c(1987,1))
Bothofem <- data.frame(souvenir_log2, dummysurffest)

library(forecast)

## 
## Attaching package: 'forecast'

## The following objects are masked from 'package:fabletools':
## 
##     accuracy, forecast

fitted_souvenir <- tslm(souvenir_log2 ~ trend + season + dummysurffest, data = Bothofem)

forecast_souvenir <- data.frame(dummysurffest = rep(0, 12))
forecast_souvenir[3,] = 1
forecast(fitted_souvenir, newdata = forecast_souvenir)

##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## Jul 1993       9.883136  9.637329 10.128943  9.503919 10.262353
## Aug 1993       9.867772  9.621965 10.113579  9.488555 10.246989
## Sep 1993      10.531942 10.191069 10.872815 10.006062 11.057822
## Oct 1993      10.092605  9.846798 10.338412  9.713388 10.471822
## Nov 1993      10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993      11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994       9.430933  9.186093  9.675774  9.053207  9.808659
## Feb 1994       9.704370  9.459530  9.949210  9.326644 10.082096
## Mar 1994       9.695742  9.365756 10.025727  9.186658 10.204825
## Apr 1994       9.881046  9.636206 10.125887  9.503320 10.258772
## May 1994       9.928500  9.683659 10.173340  9.550774 10.306226
## Jun 1994       9.989861  9.745020 10.234701  9.612135 10.367587

plot(fitted_souvenir$residuals)

#The residuals appear to be white noise. There isn’t much trend or seasonality here.

boxplot(residuals(fitted_souvenir)~cycle(residuals(fitted_souvenir)))

#The boxplot shows show the correlations between the variables. According to the chart, 11 and 12 have the greatest impact on sales.

checkresiduals(fitted_souvenir)

## 
##  Breusch-Godfrey test for serial correlation of order up to 17
## 
## data:  Residuals from Linear regression model
## LM test = 31.535, df = 17, p-value = 0.01718

fitted_souvenir$coefficients

##   (Intercept)         trend       season2       season3       season4 
##     7.6662400     0.0207611     0.2526756     0.2232860     0.3878297 
##       season5       season6       season7       season8       season9 
##     0.4145219     0.4551219     0.5767690     0.5406444     0.6296713 
##      season10      season11      season12 dummysurffest 
##     0.7239552     1.1957774     1.9473841     0.5543818

forecast_souvenir2 <- data.frame(dummysurffest = rep(0, 36))
souvenir_predictions <- forecast(fitted_souvenir, newdata = forecast_souvenir2)
souvenir_predictions

##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## Jul 1993       9.883136  9.637329 10.128943  9.503919 10.262353
## Aug 1993       9.867772  9.621965 10.113579  9.488555 10.246989
## Sep 1993       9.977560  9.731753 10.223367  9.598343 10.356777
## Oct 1993      10.092605  9.846798 10.338412  9.713388 10.471822
## Nov 1993      10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993      11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994       9.430933  9.186093  9.675774  9.053207  9.808659
## Feb 1994       9.704370  9.459530  9.949210  9.326644 10.082096
## Mar 1994       9.695742  9.365756 10.025727  9.186658 10.204825
## Apr 1994       9.881046  9.636206 10.125887  9.503320 10.258772
## May 1994       9.928500  9.683659 10.173340  9.550774 10.306226
## Jun 1994       9.989861  9.745020 10.234701  9.612135 10.367587
## Jul 1994      10.132269  9.883394 10.381144  9.748319 10.516219
## Aug 1994      10.116905  9.868031 10.365780  9.732955 10.500856
## Sep 1994      10.226693  9.977819 10.475568  9.842743 10.610644
## Oct 1994      10.341738 10.092864 10.590613  9.957788 10.725689
## Nov 1994      10.834322 10.585447 11.083197 10.450372 11.218272
## Dec 1994      11.606690 11.357815 11.855564 11.222739 11.990640
## Jan 1995       9.680067  9.431764  9.928369  9.296999 10.063134
## Feb 1995       9.953503  9.705201 10.201806  9.570436 10.336570
## Mar 1995       9.944875  9.610605 10.279144  9.429182 10.460567
## Apr 1995      10.130180  9.881877 10.378482  9.747112 10.513247
## May 1995      10.177633  9.929330 10.425935  9.794566 10.560700
## Jun 1995      10.238994  9.990691 10.487296  9.855927 10.622061
## Jul 1995      10.381402 10.128745 10.634059  9.991617 10.771188
## Aug 1995      10.366039 10.113381 10.618696  9.976253 10.755824
## Sep 1995      10.475827 10.223169 10.728484 10.086041 10.865612
## Oct 1995      10.590872 10.338214 10.843529 10.201086 10.980657
## Nov 1995      11.083455 10.830798 11.336112 10.693669 11.473240
## Dec 1995      11.855823 11.603165 12.108480 11.466037 12.245608
## Jan 1996       9.929200  9.676730 10.181669  9.539704 10.318696
## Feb 1996      10.202636  9.950167 10.455106  9.813141 10.592132
## Mar 1996      10.194008  9.854949 10.533067  9.670926 10.717090
## Apr 1996      10.379313 10.126843 10.631782  9.989817 10.768809
## May 1996      10.426766 10.174296 10.679236 10.037270 10.816262
## Jun 1996      10.488127 10.235658 10.740597 10.098631 10.877623

autoplot(souvenir_predictions)

#The Lijung-Box tells us if there is a statistical difference between our residuals and white noise. Lower values indicate that residuals are not white noise.

#Problem 4

us_gasoline2005 <- us_gasoline %>%
  filter(year(Week) < "2005")

fourier_us_gas <- us_gasoline2005 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 1)))
report(fourier_us_gas)

## Series: Barrels 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.969489 -0.197166 -0.002252  0.200869  0.975792 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.092e+00  2.131e-02 332.782  < 2e-16 ***
## trend()              2.807e-03  5.081e-05  55.237  < 2e-16 ***
## fourier(K = 1)C1_52 -1.238e-01  1.505e-02  -8.226 9.01e-16 ***
## fourier(K = 1)S1_52 -2.383e-01  1.505e-02 -15.832  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2865 on 722 degrees of freedom
## Multiple R-squared: 0.8248,  Adjusted R-squared: 0.8241
## F-statistic:  1133 on 3 and 722 DF, p-value: < 2.22e-16

augment(fourier_us_gas) %>%
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
  scale_color_manual(values = c(Data = "black", Fitted = "green"))

#The fitted values don’t seem to catch the extremeties of the peeks and troughs of the data.

detach_package(forecast)

fourier_us_gas %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

fourier_us_gas2 <- us_gasoline2005 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 2)))
report(fourier_us_gas2)

## Series: Barrels 
## Model: TSLM 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.9375162 -0.1897569 -0.0006692  0.2058275  1.0016928 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.094e+00  2.121e-02 334.493  < 2e-16 ***
## trend()              2.802e-03  5.057e-05  55.420  < 2e-16 ***
## fourier(K = 2)C1_52 -1.237e-01  1.497e-02  -8.265 6.71e-16 ***
## fourier(K = 2)S1_52 -2.383e-01  1.497e-02 -15.917  < 2e-16 ***
## fourier(K = 2)C2_52  4.493e-02  1.495e-02   3.006  0.00274 ** 
## fourier(K = 2)S2_52  1.054e-02  1.498e-02   0.704  0.48193    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.285 on 720 degrees of freedom
## Multiple R-squared: 0.8271,  Adjusted R-squared: 0.8259
## F-statistic: 688.8 on 5 and 720 DF, p-value: < 2.22e-16

augment(fourier_us_gas2) %>%
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
  scale_color_manual(values = c(Data = "black", Fitted = "green"))

fourier_us_gas2 %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

fourier_us_gas5 <- us_gasoline2005 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 5)))
report(fourier_us_gas5)

## Series: Barrels 
## Model: TSLM 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.8568361 -0.1759670 -0.0002609  0.1916623  0.9380241 
## 
## Coefficients:
##                       Estimate Std. Error t value Pr(>|t|)    
## (Intercept)          7.095e+00  2.043e-02 347.198  < 2e-16 ***
## trend()              2.798e-03  4.873e-05  57.428  < 2e-16 ***
## fourier(K = 5)C1_52 -1.242e-01  1.442e-02  -8.610  < 2e-16 ***
## fourier(K = 5)S1_52 -2.390e-01  1.442e-02 -16.570  < 2e-16 ***
## fourier(K = 5)C2_52  4.517e-02  1.440e-02   3.137  0.00178 ** 
## fourier(K = 5)S2_52  9.760e-03  1.443e-02   0.676  0.49898    
## fourier(K = 5)C3_52  9.586e-02  1.442e-02   6.646    6e-11 ***
## fourier(K = 5)S3_52  2.543e-04  1.440e-02   0.018  0.98592    
## fourier(K = 5)C4_52  2.854e-02  1.442e-02   1.979  0.04821 *  
## fourier(K = 5)S4_52  2.861e-02  1.440e-02   1.987  0.04733 *  
## fourier(K = 5)C5_52 -3.364e-02  1.440e-02  -2.336  0.01974 *  
## fourier(K = 5)S5_52  3.123e-02  1.443e-02   2.165  0.03073 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2746 on 714 degrees of freedom
## Multiple R-squared: 0.8409,  Adjusted R-squared: 0.8384
## F-statistic:   343 on 11 and 714 DF, p-value: < 2.22e-16

augment(fourier_us_gas5) %>%
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
  scale_color_manual(values = c(Data = "black", Fitted = "green"))

fourier_us_gas5 %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

fourier_us_gas10 <- us_gasoline2005 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 10)))
report(fourier_us_gas10)

## Series: Barrels 
## Model: TSLM 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.8625657 -0.1755251  0.0003646  0.1817594  0.9806206 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            7.094e+00  2.002e-02 354.363  < 2e-16 ***
## trend()                2.799e-03  4.774e-05  58.625  < 2e-16 ***
## fourier(K = 10)C1_52  -1.245e-01  1.413e-02  -8.814  < 2e-16 ***
## fourier(K = 10)S1_52  -2.395e-01  1.413e-02 -16.945  < 2e-16 ***
## fourier(K = 10)C2_52   4.529e-02  1.411e-02   3.210  0.00139 ** 
## fourier(K = 10)S2_52   9.203e-03  1.414e-02   0.651  0.51524    
## fourier(K = 10)C3_52   9.636e-02  1.413e-02   6.819 1.98e-11 ***
## fourier(K = 10)S3_52  -4.035e-06  1.411e-02   0.000  0.99977    
## fourier(K = 10)C4_52   2.905e-02  1.413e-02   2.056  0.04015 *  
## fourier(K = 10)S4_52   2.884e-02  1.411e-02   2.044  0.04134 *  
## fourier(K = 10)C5_52  -3.349e-02  1.410e-02  -2.375  0.01783 *  
## fourier(K = 10)S5_52   3.176e-02  1.413e-02   2.247  0.02494 *  
## fourier(K = 10)C6_52  -6.569e-02  1.412e-02  -4.653 3.91e-06 ***
## fourier(K = 10)S6_52   2.815e-02  1.412e-02   1.993  0.04660 *  
## fourier(K = 10)C7_52  -2.240e-02  1.413e-02  -1.585  0.11340    
## fourier(K = 10)S7_52   3.279e-02  1.411e-02   2.324  0.02038 *  
## fourier(K = 10)C8_52  -1.671e-02  1.411e-02  -1.184  0.23676    
## fourier(K = 10)S8_52  -1.432e-03  1.412e-02  -0.101  0.91926    
## fourier(K = 10)C9_52  -1.768e-02  1.411e-02  -1.253  0.21066    
## fourier(K = 10)S9_52  -6.335e-04  1.413e-02  -0.045  0.96424    
## fourier(K = 10)C10_52  1.274e-02  1.412e-02   0.902  0.36751    
## fourier(K = 10)S10_52 -2.368e-02  1.411e-02  -1.678  0.09387 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.269 on 704 degrees of freedom
## Multiple R-squared: 0.8494,  Adjusted R-squared: 0.8449
## F-statistic: 189.1 on 21 and 704 DF, p-value: < 2.22e-16

augment(fourier_us_gas10) %>%
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
  scale_color_manual(values = c(Data = "black", Fitted = "green"))

fourier_us_gas10 %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

fourier_us_gas20 <- us_gasoline2005 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 20)))
report(fourier_us_gas20)

## Series: Barrels 
## Model: TSLM 
## 
## Residuals:
##        Min         1Q     Median         3Q        Max 
## -0.8455538 -0.1703796  0.0009278  0.1724455  0.9995200 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            7.095e+00  2.001e-02 354.550  < 2e-16 ***
## trend()                2.798e-03  4.772e-05  58.649  < 2e-16 ***
## fourier(K = 20)C1_52  -1.245e-01  1.412e-02  -8.814  < 2e-16 ***
## fourier(K = 20)S1_52  -2.394e-01  1.413e-02 -16.946  < 2e-16 ***
## fourier(K = 20)C2_52   4.525e-02  1.410e-02   3.209  0.00139 ** 
## fourier(K = 20)S2_52   9.317e-03  1.413e-02   0.659  0.50988    
## fourier(K = 20)C3_52   9.624e-02  1.413e-02   6.814 2.09e-11 ***
## fourier(K = 20)S3_52   3.202e-05  1.410e-02   0.002  0.99819    
## fourier(K = 20)C4_52   2.895e-02  1.412e-02   2.050  0.04072 *  
## fourier(K = 20)S4_52   2.877e-02  1.410e-02   2.040  0.04175 *  
## fourier(K = 20)C5_52  -3.349e-02  1.410e-02  -2.376  0.01779 *  
## fourier(K = 20)S5_52   3.165e-02  1.413e-02   2.240  0.02541 *  
## fourier(K = 20)C6_52  -6.559e-02  1.411e-02  -4.649 4.01e-06 ***
## fourier(K = 20)S6_52   2.808e-02  1.411e-02   1.990  0.04701 *  
## fourier(K = 20)C7_52  -2.229e-02  1.413e-02  -1.578  0.11498    
## fourier(K = 20)S7_52   3.283e-02  1.410e-02   2.329  0.02018 *  
## fourier(K = 20)C8_52  -1.668e-02  1.411e-02  -1.183  0.23735    
## fourier(K = 20)S8_52  -1.323e-03  1.412e-02  -0.094  0.92536    
## fourier(K = 20)C9_52  -1.775e-02  1.410e-02  -1.259  0.20862    
## fourier(K = 20)S9_52  -5.484e-04  1.412e-02  -0.039  0.96903    
## fourier(K = 20)C10_52  1.263e-02  1.412e-02   0.894  0.37140    
## fourier(K = 20)S10_52 -2.368e-02  1.411e-02  -1.679  0.09360 .  
## fourier(K = 20)C11_52 -2.835e-02  1.411e-02  -2.008  0.04500 *  
## fourier(K = 20)S11_52  2.542e-02  1.411e-02   1.801  0.07208 .  
## fourier(K = 20)C12_52 -1.166e-02  1.411e-02  -0.827  0.40869    
## fourier(K = 20)S12_52 -2.505e-02  1.411e-02  -1.775  0.07633 .  
## fourier(K = 20)C13_52  6.269e-03  1.411e-02   0.444  0.65703    
## fourier(K = 20)S13_52  1.421e-02  1.411e-02   1.007  0.31442    
## fourier(K = 20)C14_52 -3.088e-03  1.411e-02  -0.219  0.82685    
## fourier(K = 20)S14_52  1.816e-02  1.411e-02   1.287  0.19865    
## fourier(K = 20)C15_52 -1.403e-03  1.411e-02  -0.099  0.92086    
## fourier(K = 20)S15_52  1.743e-02  1.411e-02   1.235  0.21708    
## fourier(K = 20)C16_52 -1.224e-02  1.412e-02  -0.867  0.38624    
## fourier(K = 20)S16_52 -1.567e-03  1.411e-02  -0.111  0.91160    
## fourier(K = 20)C17_52  7.639e-03  1.410e-02   0.542  0.58827    
## fourier(K = 20)S17_52 -2.197e-02  1.412e-02  -1.556  0.12021    
## fourier(K = 20)C18_52  2.355e-03  1.411e-02   0.167  0.86749    
## fourier(K = 20)S18_52  1.760e-02  1.412e-02   1.247  0.21299    
## fourier(K = 20)C19_52 -2.098e-03  1.412e-02  -0.149  0.88197    
## fourier(K = 20)S19_52 -7.173e-05  1.410e-02  -0.005  0.99594    
## fourier(K = 20)C20_52 -2.844e-03  1.411e-02  -0.202  0.84034    
## fourier(K = 20)S20_52  7.698e-04  1.411e-02   0.055  0.95652    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.2688 on 684 degrees of freedom
## Multiple R-squared: 0.8538,  Adjusted R-squared: 0.8451
## F-statistic: 97.46 on 41 and 684 DF, p-value: < 2.22e-16

augment(fourier_us_gas20) %>%
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
  scale_color_manual(values = c(Data = "black", Fitted = "green"))

fourier_us_gas20 %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

us_gasoline2005_fit <- us_gasoline2005 %>%
  model(K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
        K2 = TSLM(Barrels ~ trend() + fourier(K = 2)),
        K5 = TSLM(Barrels ~ trend() + fourier(K = 5)),
        K10 = TSLM(Barrels ~ trend() + fourier(K = 10)),
        K20 = TSLM(Barrels ~ trend() + fourier(K = 20)))
       
glance(us_gasoline2005_fit) %>% select(.model, r_squared, adj_r_squared, AICc)

## # A tibble: 5 × 4
##   .model r_squared adj_r_squared   AICc
##   <chr>      <dbl>         <dbl>  <dbl>
## 1 K1         0.825         0.824 -1809.
## 2 K2         0.827         0.826 -1814.
## 3 K5         0.841         0.838 -1862.
## 4 K10        0.849         0.845 -1881.
## 5 K20        0.854         0.845 -1859.

us_gasoline2005_fit2 <- us_gasoline2005 %>%
  model(K11 = TSLM(Barrels ~ trend() + fourier(K = 11)),
        K12 = TSLM(Barrels ~ trend() + fourier(K = 12)),
        K13 = TSLM(Barrels ~ trend() + fourier(K = 13)),
        K14 = TSLM(Barrels ~ trend() + fourier(K = 14)),
        K15 = TSLM(Barrels ~ trend() + fourier(K = 15)),
        K16 = TSLM(Barrels ~ trend() + fourier(K = 16)),
        K17 = TSLM(Barrels ~ trend() + fourier(K = 17)),
        K18 = TSLM(Barrels ~ trend() + fourier(K = 18)),
        K19 = TSLM(Barrels ~ trend() + fourier(K = 19)))
glance(us_gasoline2005_fit2) %>% select(.model, r_squared, adj_r_squared, AICc)

## # A tibble: 9 × 4
##   .model r_squared adj_r_squared   AICc
##   <chr>      <dbl>         <dbl>  <dbl>
## 1 K11        0.851         0.846 -1885.
## 2 K12        0.852         0.847 -1884.
## 3 K13        0.852         0.846 -1881.
## 4 K14        0.852         0.846 -1879.
## 5 K15        0.853         0.846 -1876.
## 6 K16        0.853         0.846 -1872.
## 7 K17        0.853         0.846 -1871.
## 8 K18        0.854         0.846 -1868.
## 9 K19        0.854         0.846 -1864.

fourier_us_gas11 <- us_gasoline2005 %>%
  model(TSLM(Barrels ~ trend() + fourier(K = 11)))
report(fourier_us_gas11)

## Series: Barrels 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.896208 -0.177835  0.003786  0.173355  1.014511 
## 
## Coefficients:
##                         Estimate Std. Error t value Pr(>|t|)    
## (Intercept)            7.095e+00  1.994e-02 355.717  < 2e-16 ***
## trend()                2.799e-03  4.756e-05  58.844  < 2e-16 ***
## fourier(K = 11)C1_52  -1.245e-01  1.407e-02  -8.845  < 2e-16 ***
## fourier(K = 11)S1_52  -2.394e-01  1.408e-02 -17.007  < 2e-16 ***
## fourier(K = 11)C2_52   4.529e-02  1.405e-02   3.223  0.00133 ** 
## fourier(K = 11)S2_52   9.256e-03  1.408e-02   0.657  0.51125    
## fourier(K = 11)C3_52   9.632e-02  1.408e-02   6.842 1.70e-11 ***
## fourier(K = 11)S3_52   3.762e-05  1.405e-02   0.003  0.99787    
## fourier(K = 11)C4_52   2.899e-02  1.408e-02   2.060  0.03980 *  
## fourier(K = 11)S4_52   2.884e-02  1.406e-02   2.052  0.04054 *  
## fourier(K = 11)C5_52  -3.354e-02  1.405e-02  -2.387  0.01725 *  
## fourier(K = 11)S5_52   3.172e-02  1.408e-02   2.253  0.02458 *  
## fourier(K = 11)C6_52  -6.569e-02  1.406e-02  -4.671 3.59e-06 ***
## fourier(K = 11)S6_52   2.808e-02  1.407e-02   1.996  0.04628 *  
## fourier(K = 11)C7_52  -2.235e-02  1.408e-02  -1.588  0.11283    
## fourier(K = 11)S7_52   3.274e-02  1.405e-02   2.330  0.02010 *  
## fourier(K = 11)C8_52  -1.664e-02  1.406e-02  -1.183  0.23704    
## fourier(K = 11)S8_52  -1.428e-03  1.407e-02  -0.102  0.91916    
## fourier(K = 11)C9_52  -1.763e-02  1.406e-02  -1.254  0.21019    
## fourier(K = 11)S9_52  -5.731e-04  1.407e-02  -0.041  0.96753    
## fourier(K = 11)C10_52  1.272e-02  1.407e-02   0.904  0.36621    
## fourier(K = 11)S10_52 -2.359e-02  1.406e-02  -1.678  0.09375 .  
## fourier(K = 11)C11_52 -2.837e-02  1.407e-02  -2.016  0.04414 *  
## fourier(K = 11)S11_52  2.555e-02  1.406e-02   1.817  0.06962 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.268 on 702 degrees of freedom
## Multiple R-squared: 0.851,   Adjusted R-squared: 0.8461
## F-statistic: 174.3 on 23 and 702 DF, p-value: < 2.22e-16

augment(fourier_us_gas11) %>%
  ggplot(aes(x = Week)) +
  geom_line(aes(y = Barrels, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
  scale_color_manual(values = c(Data = "black", Fitted = "green"))

fourier_us_gas11 %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

fourier_us_gas11 %>%
  gg_tsresiduals()

augment(fourier_us_gas11) %>%
  features(.resid, ljung_box, lag = 104, dof = 0)

## # A tibble: 1 × 3
##   .model                                    lb_stat lb_pvalue
##   <chr>                                       <dbl>     <dbl>
## 1 TSLM(Barrels ~ trend() + fourier(K = 11))    149.   0.00268

us_gasoline2005_plot <- us_gasoline2005 %>%
  model(
    Mean = MEAN(Barrels),
    Naive = NAIVE(Barrels),
    Seasonal_naive = SNAIVE(Barrels),
    Drift = RW(Barrels ~ drift()))
us_gasoline2005_plot %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

fourier_us_gas11 %>%
  forecast(h = 52) %>%
  autoplot(us_gasoline2005)

#The best forecast appears to be when k = 11. #Our model is fairly accurate and gives a reasonable prediction interval.

#Problem 5

global_economy %>%
  filter(Country=="Afghanistan") %>%
  tsibble(key = Code, index = Year) %>%
  autoplot(Population, show.legend =  FALSE) +
  labs(title= "Afghanistan Population", y = "Population")

global_economy %>%
  filter(Country=="Afghanistan")%>%
  model(TSLM(Population ~ Year)) %>%
  report()

## Series: Population 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -5794518 -2582559   744761  2259222  6036280 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -829292529   49730866  -16.68   <2e-16 ***
## Year            425774      25008   17.02   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3188000 on 56 degrees of freedom
## Multiple R-squared: 0.8381,  Adjusted R-squared: 0.8352
## F-statistic: 289.9 on 1 and 56 DF, p-value: < 2.22e-16

global_economy %>%
  filter(Country=="Afghanistan")%>%
  filter(Year<1980)%>%
  model(TSLM(Population ~ Year)) %>%
  report()

## Series: Population 
## Model: TSLM 
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -146380.8 -110290.6    -451.2  105877.8  202881.3 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -470734527    8787062  -53.57   <2e-16 ***
## Year            244657       4462   54.84   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 115100 on 18 degrees of freedom
## Multiple R-squared: 0.994,   Adjusted R-squared: 0.9937
## F-statistic:  3007 on 1 and 18 DF, p-value: < 2.22e-16

global_economy %>%
  filter(Country=="Afghanistan")%>%
  filter(Year>1989)%>%
  model(TSLM(Population ~ Year)) %>%
  report()

## Series: Population 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -619234 -212927    6598  234280  612277 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -1.670e+09  1.640e+07  -101.8   <2e-16 ***
## Year         8.451e+05  8.184e+03   103.3   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 349800 on 26 degrees of freedom
## Multiple R-squared: 0.9976,  Adjusted R-squared: 0.9975
## F-statistic: 1.066e+04 on 1 and 26 DF, p-value: < 2.22e-16

Afghanistan_forecast1 <- global_economy %>%
  filter(Country=="Afghanistan")%>%
  model(TSLM(Population ~ Year)) 

Afghanistan_forecast2 <- global_economy %>%
  filter(Country=="Afghanistan")%>%
  filter(Year>1989)%>%
  model(TSLM(Population ~ Year)) 

Afghanistan <- global_economy %>%
  filter(Country=="Afghanistan")

Afghanistan_rep <- Afghanistan %>%
  model(piecewise = TSLM(Population ~ trend(knots = c(1980, 1989))))
report(Afghanistan_rep)

## Series: Population 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -577590 -174198  -16784  187226  679947 
## 
## Coefficients:
##                                      Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                           8697573     131122   66.33   <2e-16 ***
## trend(knots = c(1980, 1989))trend      224372       9623   23.32   <2e-16 ***
## trend(knots = c(1980, 1989))trend_21  -456804      24498  -18.65   <2e-16 ***
## trend(knots = c(1980, 1989))trend_30  1082782      21418   50.55   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 300900 on 54 degrees of freedom
## Multiple R-squared: 0.9986,  Adjusted R-squared: 0.9985
## F-statistic: 1.293e+04 on 3 and 54 DF, p-value: < 2.22e-16

forecast(Afghanistan_forecast1, h = 5)

## # A fable: 5 x 5 [1Y]
## # Key:     Country, .model [1]
##   Country     .model                   Year          Population     .mean
##   <fct>       <chr>                   <dbl>              <dist>     <dbl>
## 1 Afghanistan TSLM(Population ~ Year)  2018   N(3e+07, 1.1e+13) 29919575.
## 2 Afghanistan TSLM(Population ~ Year)  2019   N(3e+07, 1.1e+13) 30345349.
## 3 Afghanistan TSLM(Population ~ Year)  2020 N(3.1e+07, 1.1e+13) 30771123.
## 4 Afghanistan TSLM(Population ~ Year)  2021 N(3.1e+07, 1.1e+13) 31196897.
## 5 Afghanistan TSLM(Population ~ Year)  2022 N(3.2e+07, 1.1e+13) 31622671.

forecast(Afghanistan_forecast2, h = 5)

## # A fable: 5 x 5 [1Y]
## # Key:     Country, .model [1]
##   Country     .model                   Year          Population     .mean
##   <fct>       <chr>                   <dbl>              <dist>     <dbl>
## 1 Afghanistan TSLM(Population ~ Year)  2018 N(3.6e+07, 1.4e+11) 35925602.
## 2 Afghanistan TSLM(Population ~ Year)  2019 N(3.7e+07, 1.4e+11) 36770747.
## 3 Afghanistan TSLM(Population ~ Year)  2020 N(3.8e+07, 1.4e+11) 37615892.
## 4 Afghanistan TSLM(Population ~ Year)  2021 N(3.8e+07, 1.5e+11) 38461037.
## 5 Afghanistan TSLM(Population ~ Year)  2022 N(3.9e+07, 1.5e+11) 39306182.

Afghanistan_drift <- Afghanistan %>%
  model(RW(Population ~ drift())) %>%
  forecast(h = "5 years") %>%
  autoplot(Afghanistan) +
  labs(title = "Drift Forecast of Afghanistan Population", y = "Population in Millions")

Afghanistan_drift

#The drift forecast for this data appears to be fairly accurate.

#Problem 6

#Assumptions of OLS: #1 The model is linear, correctly specifiec, and has an additive error term. This basically means we are using a linear regression that accounts for unknowns in the error variable.

#2 No Perfect Multicollinearity. This means that two of our x (predictor) variables cannot have a correlation = 1, meaning that they are the exact same thing (IE BMI and bodyweight/height^2).

#3 The error term has 0 population mean. This just means we have no omitted variable.

#4 No Serial Correlation (AKA Autocorrelation). This means errors from past time periods cannot be predict future values of our error terms.

#5 No predictor variables are correlated with the error term. If this is false the model will have omitted variable bais.

#6 Homoskedasticity, meaning that there is constant variance in the error term across the model.

#7 Error term is normally distributed. This just means that our forecasted value should be as close to the actual value as possible.

#An unbiased estimator means that your estimate is the actual observed value, meaning that your model is completely clear of bias. A consistent estimator gets more accurate as the sample size gets larger and larger, but it is still ultimately biased.

#Both R^2 and Adj. R^2 are values that tell us what percent of variation in y in our data can be explained. The difference is that Adj. R^2 accounts for the number of predictor variables you have in your model, because increasing the number of predictors tends to increase R^2.

Homework_4

Peter Cook

2022-10-04