library(dplyr)
library(ggplot2)
library(tidyr)
library(tibble)
library(tsibble)
library(ggfortify)
library(tidyverse)
library(fpp3)
library(moments)
library(zoo)
library(fable)
library(readxl)
library(seasonal)
library(caTools)
#Problem 1
jan2014_elec <-
vic_elec %>%
filter(yearmonth(Time) == yearmonth('2014 Jan')) %>%
index_by(Day = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
#1A
jan2014_elec %>%
pivot_longer(c(Demand, Temperature), names_to = 'Measure', values_to = 'Value') %>%
ggplot() +
geom_line(aes(Day, Value)) +
facet_grid(rows = vars(Measure), scales = 'free') +
labs(
x = 'Date',
y = 'Temperature (Celcius) & Demand (GWh)',
title = 'Victorian Electricity Demand & Temperature',
subtitle = 'January 2014'
)
jan2014_elec %>%
pivot_longer(c(Demand, Temperature), names_to = 'Measure', values_to = 'Value') %>%
ggplot() +
geom_point(aes(Day, Value)) +
facet_grid(rows = vars(Measure), scales = 'free') +
labs(
x = 'Date',
y = 'Temperature (Celcius) & Demand (GWh)',
title = 'Victorian Electricity Demand & Temperature',
subtitle = "January 2014"
)
jan2014_elec %>%
ggplot(aes(Temperature, Demand)) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y~x') +
labs(title = "Victorian Electricity Demand as a Regression Model", subtitle = "January 2014")
#There is a positive relationship between demand and temperature because people will want to keep their houses cooler (there demand more electricity) as temperatures rise.
#1B
jan2014_residual <-
jan2014_elec %>%
model(lm = TSLM(Demand ~ Temperature))
jan2014_residual %>%
augment() %>%
ggplot() +
geom_point(aes(Demand, .resid)) +
labs(
title = 'Victorian Electricity Demand - Jan 2014',
subtitle = 'Linear Model Residuals',
x = 'Demand (GWh)',
y = 'Model Residuals'
)
#There do not appear to be any outliers but there appears to be some heteroskedasticity, meaning this model can’t be very good for forecasting.
#1C
jan2014_residual %>%
forecast(
new_data(jan2014_elec, 1) %>% mutate(Temperature = 15)
) %>%
autoplot(jan2014_elec)
jan2014_residual %>%
forecast(
new_data(jan2014_elec, 1) %>% mutate(Temperature = 35)
) %>%
autoplot(jan2014_elec)
#I don’t believe these forecasts for a second. The prediction intervals and point forecasts fall way outside the normal range of electricity demand.
#1D
jan2014_residual %>%
forecast(
new_data(jan2014_elec, 1) %>% mutate(Temperature = 15)
) %>%
hilo(level = 80) %>%
pull(`80%`)
## <hilo[1]>
## [1] [117908.1, 184888.6]80
#These are the prediction intervals at the 80% precent level.
#1E
all_vic_elec <- vic_elec %>%
index_by(Day = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
all_vic_elec %>%
pivot_longer(c(Demand, Temperature), names_to = 'Measure', values_to = 'Value') %>%
ggplot() +
geom_line(aes(Day, Value)) +
facet_grid(rows = vars(Measure), scales = 'free') +
labs(
x = 'Date',
y = 'Temperature (Celcius) & Demand (GWh)',
title = 'Victorian Electricity Demand & Temperature')
all_vic_elec %>%
ggplot(aes(Temperature, Demand)) +
geom_point() +
geom_smooth(method = 'lm', formula = 'y~x') +
labs(title = "Victorian Electricity Demand as a Regression Model")
#Our model suffers from the same issue of the entire data set. There is obvious heteroskedasticity in this plot, meaning that we would be unable to make accurate forecasts using this data without trying to force some homoskedasticity.
#PROBLEM 2
#2A
autoplot(olympic_running) +
facet_wrap(~Length + Sex, scales = "free") +
labs(title = "Winning Olympic Running Times", x = "Year", y = "Time") +
theme(legend.position = "none")
## Plot variable not specified, automatically selected `.vars = Time`
#These plots show that the winning times for almost every event and across both sexes has been decreasing steadily over time.
autoplot(olympic_running) +
facet_wrap(~Length + Sex, scales = "free") +
labs(title = "Winning Olympic Running Times w/Regression", x = "Year", y = "Time") +
theme(legend.position = "none") +
geom_smooth(method="lm", se=FALSE)
## Plot variable not specified, automatically selected `.vars = Time`
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 31 rows containing non-finite values (stat_smooth).
olympic_avg <- lm(Year ~ ., olympic_running)
olympic_avg
##
## Call:
## lm(formula = Year ~ ., data = olympic_running)
##
## Coefficients:
## (Intercept) Length Sexwomen Time
## 1947.93654 0.05872 34.13151 -0.32515
#The average winning time has been decreasing by 0.3 seconds each year roughly.
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
##
## Attaching package: 'forecast'
## The following objects are masked from 'package:fabletools':
##
## accuracy, forecast
checkresiduals(olympic_avg)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 179.57, df = 10, p-value < 2.2e-16
detach_package <- function(pkg, character.only = FALSE)
{
if(!character.only)
{
pkg <- deparse(substitute(pkg))
}
search_item <- paste("package", pkg, sep = ":")
while(search_item %in% search())
{
detach(search_item, unload = TRUE, character.only = TRUE)
}
}
detach_package(forecast)
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "100") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 100 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "100") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 100 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "200") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 200 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "200") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 200 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "400") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 400 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "400") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 400 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "800") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 800 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "800") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 800 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "1500") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 1500 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "1500") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 1500 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "5000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 5000 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "5000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 5000 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "men") %>%
filter(Length == "10000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Men's 10000 Meter Winning Time Forecast")
olympic_running %>%
filter(Sex == "women") %>%
filter(Length == "10000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(new_data(olympic_running, 1) %>%
mutate(Year = 2020)) %>%
autoplot(olympic_running) +
labs(title = "Women's 10000 Meter Winning Time Forecast")
#Problem 3
autoplot(souvenirs) +
labs(title = "Time Series of Souvenirs")
## Plot variable not specified, automatically selected `.vars = Sales`
#There are consistent spikes aroyund December of each year. This is likely due to the fact summer happens in Australia when it is December for the United States.
souvenirs_numeric <- souvenirs
souvenirs_sales <- as.numeric(souvenirs$Sales)
hist(souvenirs_sales)
souvenirs_log <- souvenirs_numeric
souvenirs_log_sales <- log(souvenirs_numeric$Sales)
hist(souvenirs_log_sales)
#Because of the dramatic spikes in December, the data is not normally distributed, meaning logging the data is necessary.
souvenir_ts <- ts(souvenirs$Sales,start=c(1987,1), end=c(1993,6), frequency=12)
souvenir_log2 <- log(souvenir_ts)
dummysurffest <- rep(0, length(souvenir_ts))
dummysurffest[seq_along(dummysurffest)%%12 == 3] = 1
dummysurffest[3] = 0
dummysurffest <- ts(dummysurffest, freq = 12, start=c(1987,1))
Bothofem <- data.frame(souvenir_log2, dummysurffest)
library(forecast)
##
## Attaching package: 'forecast'
## The following objects are masked from 'package:fabletools':
##
## accuracy, forecast
fitted_souvenir <- tslm(souvenir_log2 ~ trend + season + dummysurffest, data = Bothofem)
forecast_souvenir <- data.frame(dummysurffest = rep(0, 12))
forecast_souvenir[3,] = 1
forecast(fitted_souvenir, newdata = forecast_souvenir)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 1993 9.883136 9.637329 10.128943 9.503919 10.262353
## Aug 1993 9.867772 9.621965 10.113579 9.488555 10.246989
## Sep 1993 10.531942 10.191069 10.872815 10.006062 11.057822
## Oct 1993 10.092605 9.846798 10.338412 9.713388 10.471822
## Nov 1993 10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993 11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994 9.430933 9.186093 9.675774 9.053207 9.808659
## Feb 1994 9.704370 9.459530 9.949210 9.326644 10.082096
## Mar 1994 9.695742 9.365756 10.025727 9.186658 10.204825
## Apr 1994 9.881046 9.636206 10.125887 9.503320 10.258772
## May 1994 9.928500 9.683659 10.173340 9.550774 10.306226
## Jun 1994 9.989861 9.745020 10.234701 9.612135 10.367587
plot(fitted_souvenir$residuals)
#The residuals appear to be white noise. There isn’t much trend or seasonality here.
boxplot(residuals(fitted_souvenir)~cycle(residuals(fitted_souvenir)))
#The boxplot shows show the correlations between the variables. According to the chart, 11 and 12 have the greatest impact on sales.
checkresiduals(fitted_souvenir)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 31.535, df = 17, p-value = 0.01718
fitted_souvenir$coefficients
## (Intercept) trend season2 season3 season4
## 7.6662400 0.0207611 0.2526756 0.2232860 0.3878297
## season5 season6 season7 season8 season9
## 0.4145219 0.4551219 0.5767690 0.5406444 0.6296713
## season10 season11 season12 dummysurffest
## 0.7239552 1.1957774 1.9473841 0.5543818
forecast_souvenir2 <- data.frame(dummysurffest = rep(0, 36))
souvenir_predictions <- forecast(fitted_souvenir, newdata = forecast_souvenir2)
souvenir_predictions
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 1993 9.883136 9.637329 10.128943 9.503919 10.262353
## Aug 1993 9.867772 9.621965 10.113579 9.488555 10.246989
## Sep 1993 9.977560 9.731753 10.223367 9.598343 10.356777
## Oct 1993 10.092605 9.846798 10.338412 9.713388 10.471822
## Nov 1993 10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993 11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994 9.430933 9.186093 9.675774 9.053207 9.808659
## Feb 1994 9.704370 9.459530 9.949210 9.326644 10.082096
## Mar 1994 9.695742 9.365756 10.025727 9.186658 10.204825
## Apr 1994 9.881046 9.636206 10.125887 9.503320 10.258772
## May 1994 9.928500 9.683659 10.173340 9.550774 10.306226
## Jun 1994 9.989861 9.745020 10.234701 9.612135 10.367587
## Jul 1994 10.132269 9.883394 10.381144 9.748319 10.516219
## Aug 1994 10.116905 9.868031 10.365780 9.732955 10.500856
## Sep 1994 10.226693 9.977819 10.475568 9.842743 10.610644
## Oct 1994 10.341738 10.092864 10.590613 9.957788 10.725689
## Nov 1994 10.834322 10.585447 11.083197 10.450372 11.218272
## Dec 1994 11.606690 11.357815 11.855564 11.222739 11.990640
## Jan 1995 9.680067 9.431764 9.928369 9.296999 10.063134
## Feb 1995 9.953503 9.705201 10.201806 9.570436 10.336570
## Mar 1995 9.944875 9.610605 10.279144 9.429182 10.460567
## Apr 1995 10.130180 9.881877 10.378482 9.747112 10.513247
## May 1995 10.177633 9.929330 10.425935 9.794566 10.560700
## Jun 1995 10.238994 9.990691 10.487296 9.855927 10.622061
## Jul 1995 10.381402 10.128745 10.634059 9.991617 10.771188
## Aug 1995 10.366039 10.113381 10.618696 9.976253 10.755824
## Sep 1995 10.475827 10.223169 10.728484 10.086041 10.865612
## Oct 1995 10.590872 10.338214 10.843529 10.201086 10.980657
## Nov 1995 11.083455 10.830798 11.336112 10.693669 11.473240
## Dec 1995 11.855823 11.603165 12.108480 11.466037 12.245608
## Jan 1996 9.929200 9.676730 10.181669 9.539704 10.318696
## Feb 1996 10.202636 9.950167 10.455106 9.813141 10.592132
## Mar 1996 10.194008 9.854949 10.533067 9.670926 10.717090
## Apr 1996 10.379313 10.126843 10.631782 9.989817 10.768809
## May 1996 10.426766 10.174296 10.679236 10.037270 10.816262
## Jun 1996 10.488127 10.235658 10.740597 10.098631 10.877623
autoplot(souvenir_predictions)
#The Lijung-Box tells us if there is a statistical difference between our residuals and white noise. Lower values indicate that residuals are not white noise.
#Problem 4
us_gasoline2005 <- us_gasoline %>%
filter(year(Week) < "2005")
fourier_us_gas <- us_gasoline2005 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 1)))
report(fourier_us_gas)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.969489 -0.197166 -0.002252 0.200869 0.975792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.092e+00 2.131e-02 332.782 < 2e-16 ***
## trend() 2.807e-03 5.081e-05 55.237 < 2e-16 ***
## fourier(K = 1)C1_52 -1.238e-01 1.505e-02 -8.226 9.01e-16 ***
## fourier(K = 1)S1_52 -2.383e-01 1.505e-02 -15.832 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2865 on 722 degrees of freedom
## Multiple R-squared: 0.8248, Adjusted R-squared: 0.8241
## F-statistic: 1133 on 3 and 722 DF, p-value: < 2.22e-16
augment(fourier_us_gas) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "green"))
#The fitted values don’t seem to catch the extremeties of the peeks and troughs of the data.
detach_package(forecast)
fourier_us_gas %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
fourier_us_gas2 <- us_gasoline2005 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 2)))
report(fourier_us_gas2)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9375162 -0.1897569 -0.0006692 0.2058275 1.0016928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+00 2.121e-02 334.493 < 2e-16 ***
## trend() 2.802e-03 5.057e-05 55.420 < 2e-16 ***
## fourier(K = 2)C1_52 -1.237e-01 1.497e-02 -8.265 6.71e-16 ***
## fourier(K = 2)S1_52 -2.383e-01 1.497e-02 -15.917 < 2e-16 ***
## fourier(K = 2)C2_52 4.493e-02 1.495e-02 3.006 0.00274 **
## fourier(K = 2)S2_52 1.054e-02 1.498e-02 0.704 0.48193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.285 on 720 degrees of freedom
## Multiple R-squared: 0.8271, Adjusted R-squared: 0.8259
## F-statistic: 688.8 on 5 and 720 DF, p-value: < 2.22e-16
augment(fourier_us_gas2) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "green"))
fourier_us_gas2 %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
fourier_us_gas5 <- us_gasoline2005 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 5)))
report(fourier_us_gas5)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8568361 -0.1759670 -0.0002609 0.1916623 0.9380241
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.095e+00 2.043e-02 347.198 < 2e-16 ***
## trend() 2.798e-03 4.873e-05 57.428 < 2e-16 ***
## fourier(K = 5)C1_52 -1.242e-01 1.442e-02 -8.610 < 2e-16 ***
## fourier(K = 5)S1_52 -2.390e-01 1.442e-02 -16.570 < 2e-16 ***
## fourier(K = 5)C2_52 4.517e-02 1.440e-02 3.137 0.00178 **
## fourier(K = 5)S2_52 9.760e-03 1.443e-02 0.676 0.49898
## fourier(K = 5)C3_52 9.586e-02 1.442e-02 6.646 6e-11 ***
## fourier(K = 5)S3_52 2.543e-04 1.440e-02 0.018 0.98592
## fourier(K = 5)C4_52 2.854e-02 1.442e-02 1.979 0.04821 *
## fourier(K = 5)S4_52 2.861e-02 1.440e-02 1.987 0.04733 *
## fourier(K = 5)C5_52 -3.364e-02 1.440e-02 -2.336 0.01974 *
## fourier(K = 5)S5_52 3.123e-02 1.443e-02 2.165 0.03073 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2746 on 714 degrees of freedom
## Multiple R-squared: 0.8409, Adjusted R-squared: 0.8384
## F-statistic: 343 on 11 and 714 DF, p-value: < 2.22e-16
augment(fourier_us_gas5) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "green"))
fourier_us_gas5 %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
fourier_us_gas10 <- us_gasoline2005 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 10)))
report(fourier_us_gas10)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8625657 -0.1755251 0.0003646 0.1817594 0.9806206
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+00 2.002e-02 354.363 < 2e-16 ***
## trend() 2.799e-03 4.774e-05 58.625 < 2e-16 ***
## fourier(K = 10)C1_52 -1.245e-01 1.413e-02 -8.814 < 2e-16 ***
## fourier(K = 10)S1_52 -2.395e-01 1.413e-02 -16.945 < 2e-16 ***
## fourier(K = 10)C2_52 4.529e-02 1.411e-02 3.210 0.00139 **
## fourier(K = 10)S2_52 9.203e-03 1.414e-02 0.651 0.51524
## fourier(K = 10)C3_52 9.636e-02 1.413e-02 6.819 1.98e-11 ***
## fourier(K = 10)S3_52 -4.035e-06 1.411e-02 0.000 0.99977
## fourier(K = 10)C4_52 2.905e-02 1.413e-02 2.056 0.04015 *
## fourier(K = 10)S4_52 2.884e-02 1.411e-02 2.044 0.04134 *
## fourier(K = 10)C5_52 -3.349e-02 1.410e-02 -2.375 0.01783 *
## fourier(K = 10)S5_52 3.176e-02 1.413e-02 2.247 0.02494 *
## fourier(K = 10)C6_52 -6.569e-02 1.412e-02 -4.653 3.91e-06 ***
## fourier(K = 10)S6_52 2.815e-02 1.412e-02 1.993 0.04660 *
## fourier(K = 10)C7_52 -2.240e-02 1.413e-02 -1.585 0.11340
## fourier(K = 10)S7_52 3.279e-02 1.411e-02 2.324 0.02038 *
## fourier(K = 10)C8_52 -1.671e-02 1.411e-02 -1.184 0.23676
## fourier(K = 10)S8_52 -1.432e-03 1.412e-02 -0.101 0.91926
## fourier(K = 10)C9_52 -1.768e-02 1.411e-02 -1.253 0.21066
## fourier(K = 10)S9_52 -6.335e-04 1.413e-02 -0.045 0.96424
## fourier(K = 10)C10_52 1.274e-02 1.412e-02 0.902 0.36751
## fourier(K = 10)S10_52 -2.368e-02 1.411e-02 -1.678 0.09387 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.269 on 704 degrees of freedom
## Multiple R-squared: 0.8494, Adjusted R-squared: 0.8449
## F-statistic: 189.1 on 21 and 704 DF, p-value: < 2.22e-16
augment(fourier_us_gas10) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "green"))
fourier_us_gas10 %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
fourier_us_gas20 <- us_gasoline2005 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 20)))
report(fourier_us_gas20)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8455538 -0.1703796 0.0009278 0.1724455 0.9995200
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.095e+00 2.001e-02 354.550 < 2e-16 ***
## trend() 2.798e-03 4.772e-05 58.649 < 2e-16 ***
## fourier(K = 20)C1_52 -1.245e-01 1.412e-02 -8.814 < 2e-16 ***
## fourier(K = 20)S1_52 -2.394e-01 1.413e-02 -16.946 < 2e-16 ***
## fourier(K = 20)C2_52 4.525e-02 1.410e-02 3.209 0.00139 **
## fourier(K = 20)S2_52 9.317e-03 1.413e-02 0.659 0.50988
## fourier(K = 20)C3_52 9.624e-02 1.413e-02 6.814 2.09e-11 ***
## fourier(K = 20)S3_52 3.202e-05 1.410e-02 0.002 0.99819
## fourier(K = 20)C4_52 2.895e-02 1.412e-02 2.050 0.04072 *
## fourier(K = 20)S4_52 2.877e-02 1.410e-02 2.040 0.04175 *
## fourier(K = 20)C5_52 -3.349e-02 1.410e-02 -2.376 0.01779 *
## fourier(K = 20)S5_52 3.165e-02 1.413e-02 2.240 0.02541 *
## fourier(K = 20)C6_52 -6.559e-02 1.411e-02 -4.649 4.01e-06 ***
## fourier(K = 20)S6_52 2.808e-02 1.411e-02 1.990 0.04701 *
## fourier(K = 20)C7_52 -2.229e-02 1.413e-02 -1.578 0.11498
## fourier(K = 20)S7_52 3.283e-02 1.410e-02 2.329 0.02018 *
## fourier(K = 20)C8_52 -1.668e-02 1.411e-02 -1.183 0.23735
## fourier(K = 20)S8_52 -1.323e-03 1.412e-02 -0.094 0.92536
## fourier(K = 20)C9_52 -1.775e-02 1.410e-02 -1.259 0.20862
## fourier(K = 20)S9_52 -5.484e-04 1.412e-02 -0.039 0.96903
## fourier(K = 20)C10_52 1.263e-02 1.412e-02 0.894 0.37140
## fourier(K = 20)S10_52 -2.368e-02 1.411e-02 -1.679 0.09360 .
## fourier(K = 20)C11_52 -2.835e-02 1.411e-02 -2.008 0.04500 *
## fourier(K = 20)S11_52 2.542e-02 1.411e-02 1.801 0.07208 .
## fourier(K = 20)C12_52 -1.166e-02 1.411e-02 -0.827 0.40869
## fourier(K = 20)S12_52 -2.505e-02 1.411e-02 -1.775 0.07633 .
## fourier(K = 20)C13_52 6.269e-03 1.411e-02 0.444 0.65703
## fourier(K = 20)S13_52 1.421e-02 1.411e-02 1.007 0.31442
## fourier(K = 20)C14_52 -3.088e-03 1.411e-02 -0.219 0.82685
## fourier(K = 20)S14_52 1.816e-02 1.411e-02 1.287 0.19865
## fourier(K = 20)C15_52 -1.403e-03 1.411e-02 -0.099 0.92086
## fourier(K = 20)S15_52 1.743e-02 1.411e-02 1.235 0.21708
## fourier(K = 20)C16_52 -1.224e-02 1.412e-02 -0.867 0.38624
## fourier(K = 20)S16_52 -1.567e-03 1.411e-02 -0.111 0.91160
## fourier(K = 20)C17_52 7.639e-03 1.410e-02 0.542 0.58827
## fourier(K = 20)S17_52 -2.197e-02 1.412e-02 -1.556 0.12021
## fourier(K = 20)C18_52 2.355e-03 1.411e-02 0.167 0.86749
## fourier(K = 20)S18_52 1.760e-02 1.412e-02 1.247 0.21299
## fourier(K = 20)C19_52 -2.098e-03 1.412e-02 -0.149 0.88197
## fourier(K = 20)S19_52 -7.173e-05 1.410e-02 -0.005 0.99594
## fourier(K = 20)C20_52 -2.844e-03 1.411e-02 -0.202 0.84034
## fourier(K = 20)S20_52 7.698e-04 1.411e-02 0.055 0.95652
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2688 on 684 degrees of freedom
## Multiple R-squared: 0.8538, Adjusted R-squared: 0.8451
## F-statistic: 97.46 on 41 and 684 DF, p-value: < 2.22e-16
augment(fourier_us_gas20) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "green"))
fourier_us_gas20 %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
us_gasoline2005_fit <- us_gasoline2005 %>%
model(K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
K2 = TSLM(Barrels ~ trend() + fourier(K = 2)),
K5 = TSLM(Barrels ~ trend() + fourier(K = 5)),
K10 = TSLM(Barrels ~ trend() + fourier(K = 10)),
K20 = TSLM(Barrels ~ trend() + fourier(K = 20)))
glance(us_gasoline2005_fit) %>% select(.model, r_squared, adj_r_squared, AICc)
## # A tibble: 5 × 4
## .model r_squared adj_r_squared AICc
## <chr> <dbl> <dbl> <dbl>
## 1 K1 0.825 0.824 -1809.
## 2 K2 0.827 0.826 -1814.
## 3 K5 0.841 0.838 -1862.
## 4 K10 0.849 0.845 -1881.
## 5 K20 0.854 0.845 -1859.
us_gasoline2005_fit2 <- us_gasoline2005 %>%
model(K11 = TSLM(Barrels ~ trend() + fourier(K = 11)),
K12 = TSLM(Barrels ~ trend() + fourier(K = 12)),
K13 = TSLM(Barrels ~ trend() + fourier(K = 13)),
K14 = TSLM(Barrels ~ trend() + fourier(K = 14)),
K15 = TSLM(Barrels ~ trend() + fourier(K = 15)),
K16 = TSLM(Barrels ~ trend() + fourier(K = 16)),
K17 = TSLM(Barrels ~ trend() + fourier(K = 17)),
K18 = TSLM(Barrels ~ trend() + fourier(K = 18)),
K19 = TSLM(Barrels ~ trend() + fourier(K = 19)))
glance(us_gasoline2005_fit2) %>% select(.model, r_squared, adj_r_squared, AICc)
## # A tibble: 9 × 4
## .model r_squared adj_r_squared AICc
## <chr> <dbl> <dbl> <dbl>
## 1 K11 0.851 0.846 -1885.
## 2 K12 0.852 0.847 -1884.
## 3 K13 0.852 0.846 -1881.
## 4 K14 0.852 0.846 -1879.
## 5 K15 0.853 0.846 -1876.
## 6 K16 0.853 0.846 -1872.
## 7 K17 0.853 0.846 -1871.
## 8 K18 0.854 0.846 -1868.
## 9 K19 0.854 0.846 -1864.
fourier_us_gas11 <- us_gasoline2005 %>%
model(TSLM(Barrels ~ trend() + fourier(K = 11)))
report(fourier_us_gas11)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.896208 -0.177835 0.003786 0.173355 1.014511
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.095e+00 1.994e-02 355.717 < 2e-16 ***
## trend() 2.799e-03 4.756e-05 58.844 < 2e-16 ***
## fourier(K = 11)C1_52 -1.245e-01 1.407e-02 -8.845 < 2e-16 ***
## fourier(K = 11)S1_52 -2.394e-01 1.408e-02 -17.007 < 2e-16 ***
## fourier(K = 11)C2_52 4.529e-02 1.405e-02 3.223 0.00133 **
## fourier(K = 11)S2_52 9.256e-03 1.408e-02 0.657 0.51125
## fourier(K = 11)C3_52 9.632e-02 1.408e-02 6.842 1.70e-11 ***
## fourier(K = 11)S3_52 3.762e-05 1.405e-02 0.003 0.99787
## fourier(K = 11)C4_52 2.899e-02 1.408e-02 2.060 0.03980 *
## fourier(K = 11)S4_52 2.884e-02 1.406e-02 2.052 0.04054 *
## fourier(K = 11)C5_52 -3.354e-02 1.405e-02 -2.387 0.01725 *
## fourier(K = 11)S5_52 3.172e-02 1.408e-02 2.253 0.02458 *
## fourier(K = 11)C6_52 -6.569e-02 1.406e-02 -4.671 3.59e-06 ***
## fourier(K = 11)S6_52 2.808e-02 1.407e-02 1.996 0.04628 *
## fourier(K = 11)C7_52 -2.235e-02 1.408e-02 -1.588 0.11283
## fourier(K = 11)S7_52 3.274e-02 1.405e-02 2.330 0.02010 *
## fourier(K = 11)C8_52 -1.664e-02 1.406e-02 -1.183 0.23704
## fourier(K = 11)S8_52 -1.428e-03 1.407e-02 -0.102 0.91916
## fourier(K = 11)C9_52 -1.763e-02 1.406e-02 -1.254 0.21019
## fourier(K = 11)S9_52 -5.731e-04 1.407e-02 -0.041 0.96753
## fourier(K = 11)C10_52 1.272e-02 1.407e-02 0.904 0.36621
## fourier(K = 11)S10_52 -2.359e-02 1.406e-02 -1.678 0.09375 .
## fourier(K = 11)C11_52 -2.837e-02 1.407e-02 -2.016 0.04414 *
## fourier(K = 11)S11_52 2.555e-02 1.406e-02 1.817 0.06962 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.268 on 702 degrees of freedom
## Multiple R-squared: 0.851, Adjusted R-squared: 0.8461
## F-statistic: 174.3 on 23 and 702 DF, p-value: < 2.22e-16
augment(fourier_us_gas11) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "US Gas Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "green"))
fourier_us_gas11 %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
fourier_us_gas11 %>%
gg_tsresiduals()
augment(fourier_us_gas11) %>%
features(.resid, ljung_box, lag = 104, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Barrels ~ trend() + fourier(K = 11)) 149. 0.00268
us_gasoline2005_plot <- us_gasoline2005 %>%
model(
Mean = MEAN(Barrels),
Naive = NAIVE(Barrels),
Seasonal_naive = SNAIVE(Barrels),
Drift = RW(Barrels ~ drift()))
us_gasoline2005_plot %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
fourier_us_gas11 %>%
forecast(h = 52) %>%
autoplot(us_gasoline2005)
#The best forecast appears to be when k = 11. #Our model is fairly accurate and gives a reasonable prediction interval.
#Problem 5
global_economy %>%
filter(Country=="Afghanistan") %>%
tsibble(key = Code, index = Year) %>%
autoplot(Population, show.legend = FALSE) +
labs(title= "Afghanistan Population", y = "Population")
global_economy %>%
filter(Country=="Afghanistan")%>%
model(TSLM(Population ~ Year)) %>%
report()
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -5794518 -2582559 744761 2259222 6036280
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -829292529 49730866 -16.68 <2e-16 ***
## Year 425774 25008 17.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3188000 on 56 degrees of freedom
## Multiple R-squared: 0.8381, Adjusted R-squared: 0.8352
## F-statistic: 289.9 on 1 and 56 DF, p-value: < 2.22e-16
global_economy %>%
filter(Country=="Afghanistan")%>%
filter(Year<1980)%>%
model(TSLM(Population ~ Year)) %>%
report()
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -146380.8 -110290.6 -451.2 105877.8 202881.3
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -470734527 8787062 -53.57 <2e-16 ***
## Year 244657 4462 54.84 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 115100 on 18 degrees of freedom
## Multiple R-squared: 0.994, Adjusted R-squared: 0.9937
## F-statistic: 3007 on 1 and 18 DF, p-value: < 2.22e-16
global_economy %>%
filter(Country=="Afghanistan")%>%
filter(Year>1989)%>%
model(TSLM(Population ~ Year)) %>%
report()
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -619234 -212927 6598 234280 612277
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -1.670e+09 1.640e+07 -101.8 <2e-16 ***
## Year 8.451e+05 8.184e+03 103.3 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 349800 on 26 degrees of freedom
## Multiple R-squared: 0.9976, Adjusted R-squared: 0.9975
## F-statistic: 1.066e+04 on 1 and 26 DF, p-value: < 2.22e-16
Afghanistan_forecast1 <- global_economy %>%
filter(Country=="Afghanistan")%>%
model(TSLM(Population ~ Year))
Afghanistan_forecast2 <- global_economy %>%
filter(Country=="Afghanistan")%>%
filter(Year>1989)%>%
model(TSLM(Population ~ Year))
Afghanistan <- global_economy %>%
filter(Country=="Afghanistan")
Afghanistan_rep <- Afghanistan %>%
model(piecewise = TSLM(Population ~ trend(knots = c(1980, 1989))))
report(Afghanistan_rep)
## Series: Population
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -577590 -174198 -16784 187226 679947
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8697573 131122 66.33 <2e-16 ***
## trend(knots = c(1980, 1989))trend 224372 9623 23.32 <2e-16 ***
## trend(knots = c(1980, 1989))trend_21 -456804 24498 -18.65 <2e-16 ***
## trend(knots = c(1980, 1989))trend_30 1082782 21418 50.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 300900 on 54 degrees of freedom
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9985
## F-statistic: 1.293e+04 on 3 and 54 DF, p-value: < 2.22e-16
forecast(Afghanistan_forecast1, h = 5)
## # A fable: 5 x 5 [1Y]
## # Key: Country, .model [1]
## Country .model Year Population .mean
## <fct> <chr> <dbl> <dist> <dbl>
## 1 Afghanistan TSLM(Population ~ Year) 2018 N(3e+07, 1.1e+13) 29919575.
## 2 Afghanistan TSLM(Population ~ Year) 2019 N(3e+07, 1.1e+13) 30345349.
## 3 Afghanistan TSLM(Population ~ Year) 2020 N(3.1e+07, 1.1e+13) 30771123.
## 4 Afghanistan TSLM(Population ~ Year) 2021 N(3.1e+07, 1.1e+13) 31196897.
## 5 Afghanistan TSLM(Population ~ Year) 2022 N(3.2e+07, 1.1e+13) 31622671.
forecast(Afghanistan_forecast2, h = 5)
## # A fable: 5 x 5 [1Y]
## # Key: Country, .model [1]
## Country .model Year Population .mean
## <fct> <chr> <dbl> <dist> <dbl>
## 1 Afghanistan TSLM(Population ~ Year) 2018 N(3.6e+07, 1.4e+11) 35925602.
## 2 Afghanistan TSLM(Population ~ Year) 2019 N(3.7e+07, 1.4e+11) 36770747.
## 3 Afghanistan TSLM(Population ~ Year) 2020 N(3.8e+07, 1.4e+11) 37615892.
## 4 Afghanistan TSLM(Population ~ Year) 2021 N(3.8e+07, 1.5e+11) 38461037.
## 5 Afghanistan TSLM(Population ~ Year) 2022 N(3.9e+07, 1.5e+11) 39306182.
Afghanistan_drift <- Afghanistan %>%
model(RW(Population ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(Afghanistan) +
labs(title = "Drift Forecast of Afghanistan Population", y = "Population in Millions")
Afghanistan_drift
#The drift forecast for this data appears to be fairly accurate.
#Problem 6
#Assumptions of OLS: #1 The model is linear, correctly specifiec, and has an additive error term. This basically means we are using a linear regression that accounts for unknowns in the error variable.
#2 No Perfect Multicollinearity. This means that two of our x (predictor) variables cannot have a correlation = 1, meaning that they are the exact same thing (IE BMI and bodyweight/height^2).
#3 The error term has 0 population mean. This just means we have no omitted variable.
#4 No Serial Correlation (AKA Autocorrelation). This means errors from past time periods cannot be predict future values of our error terms.
#5 No predictor variables are correlated with the error term. If this is false the model will have omitted variable bais.
#6 Homoskedasticity, meaning that there is constant variance in the error term across the model.
#7 Error term is normally distributed. This just means that our forecasted value should be as close to the actual value as possible.
#An unbiased estimator means that your estimate is the actual observed value, meaning that your model is completely clear of bias. A consistent estimator gets more accurate as the sample size gets larger and larger, but it is still ultimately biased.
#Both R^2 and Adj. R^2 are values that tell us what percent of variation in y in our data can be explained. The difference is that Adj. R^2 accounts for the number of predictor variables you have in your model, because increasing the number of predictors tends to increase R^2.