library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
#Question 1
#Question 1 Part a
jan14_vic_elec <- vic_elec %>%
filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature))
ggplot(jan14_vic_elec, mapping = aes(x = Temperature, y = Demand)) +
geom_point()+
geom_smooth(method = "lm", se = TRUE)+
labs(title = "January 14th Electricity Demand with Regression Line")
## `geom_smooth()` using formula 'y ~ x'
#There is a positive trend since demand for cooling via air conditioning rises as temperature increases.
#Question 1 Part b
res_test <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature))
res_test %>%
gg_tsresiduals()
#The model appears to be adequate, as there is no statistically significant autocorrelation. However, the histogram is centered around 0 but not well normally distributed.
#Question 1 Part c
demand15 <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)
) %>%
autoplot(jan14_vic_elec) +
labs(title = "15C Forecasted")
demand35 <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 35)
) %>%
autoplot(jan14_vic_elec) +
labs(title = "35C Forecasted")
demand15
demand35
#The 35C forecast is very believable but the 15C forecast is way too low compared to other lows. This could be because the model is missing controls and is overestimating the effect of lower temperatures on electricity demand.
#Question 1 Part d
#I ran hilo() after autoplot() in the previous part for the below intervals. However, the forecast would not plot with it. I kept getting errors when trying to do hilo() completely separately.
#35C: [242088.4, 306880.1], 15C: [117908.1, 184888.6]
#Question 1 Part e
plot(Demand~Temperature, data=vic_elec, main= "Temperature and Demand")
#The relationship is not entirely linear, likely due to heating-related electricity demand at lower temperatures. This would throw off a linear regression and lead to inaccurate forecasts. This could explain the strange value when I forecasted 15C.
#Question 2
#Question 2 Part a
autoplot(olympic_running)
## Plot variable not specified, automatically selected `.vars = Time`
#Obviously as length increases winning times decrease. It also appears as if men generally have faster times. There is a slow but steady downward trend for most events, likely due to better running technology (shoes, better training, etc.).
#Question 2 Part b
olympic_reg <- lm(Time ~ ., olympic_running)
olympic_reg
##
## Call:
## lm(formula = Time ~ ., data = olympic_running)
##
## Coefficients:
## (Intercept) Year Length Sexwomen
## 734.9863 -0.3905 0.1766 32.9732
#Average winning times have been decreasing at a rate of .39 seconds per year.
#Question 2 Part c
library(forecast) #To use checkresiduals function. I could not get gg_tsresiduals() to work.
## Registered S3 methods overwritten by 'forecast':
## method from
## autoplot.Arima ggfortify
## autoplot.acf ggfortify
## autoplot.ar ggfortify
## autoplot.bats ggfortify
## autoplot.decomposed.ts ggfortify
## autoplot.ets ggfortify
## autoplot.forecast ggfortify
## autoplot.stl ggfortify
## autoplot.ts ggfortify
## fitted.ar ggfortify
## fortify.ts ggfortify
## residuals.ar ggfortify
##
## Attaching package: 'forecast'
##
## The following objects are masked from 'package:fabletools':
##
## accuracy, forecast
checkresiduals(olympic_reg)
##
## Breusch-Godfrey test for serial correlation of order up to 10
##
## data: Residuals
## LM test = 179.8, df = 10, p-value < 2.2e-16
#The residuals are autocorrelated, do not resemble white noise, and are heavily skewed to the right. This is likely affecting the suitability of the model.
detach("package:forecast", unload = TRUE) #unloading the forecast package to prevent issues with accuracy().
#Question 2 Part d
olympic_running %>%
distinct(Length)
## # A tibble: 7 × 1
## Length
## <int>
## 1 100
## 2 200
## 3 400
## 4 800
## 5 1500
## 6 5000
## 7 10000
#Men 100M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="100") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Men 200M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="200") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Men 400M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="400") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Men 800M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="800") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Men 1500M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="1500") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Men 5000M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="5000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Men 10000M
olympic_running %>%
filter(Sex=="men") %>%
filter(Length=="10000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 100M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="100") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 200M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="200") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 400M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="400") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 800M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="800") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 1500M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="1500") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 5000M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="5000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#Women 10000M
olympic_running %>%
filter(Sex=="women") %>%
filter(Length=="10000") %>%
model(TSLM(Time ~ Year)) %>%
forecast(
new_data(olympic_running, 1) %>%
mutate(Year = 2020)
) %>%
autoplot(olympic_running)
#These calculations assume that trends will continue with no exogenous factors affecting times. In the real 2020, the Olympics did not happen at all so this forecast would have obviously been inaccurate.
#Question 3
#Question 3 Part a
ggplot(souvenirs, mapping = aes(x = Month, y = Sales)) +
geom_line()
#There appears to be a strong surge in March and Christmas, as the question states. There is certainly an upwards trend despite the peaks.
#Question 3 Part b
#The surges will skew the data and can throw off a predictive model. If one hopes to capture the true trend, less the peaks, a logarithm should be taken.
#Question 3 Part c
souvenirts <- ts(souvenirs$Sales,start = c(1987,1), end = c(1993,6), frequency=12)
souvenir_log <- log(souvenirts)
dsurf <- rep(0, length(souvenirts))
dsurf[seq_along(dsurf)%%12 == 3] = 1
dsurf[3] = 0
dsurf <- ts(dsurf, freq = 12, start = c(1987,1))
souvenir_final <- data.frame(souvenir_log, dsurf)
library(forecast) #Loaded for forecast() function and checkresiduals().
##
## Attaching package: 'forecast'
## The following objects are masked from 'package:fabletools':
##
## accuracy, forecast
souvenir_fit <- tslm(souvenir_log ~ trend + season + dsurf, data = souvenir_final)
souvenir_forecast <- data.frame(dsurf = rep(0, 12))
souvenir_forecast[3,] = 1
forecast(souvenir_fit, newdata = souvenir_forecast)
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 1993 9.883136 9.637329 10.128943 9.503919 10.262353
## Aug 1993 9.867772 9.621965 10.113579 9.488555 10.246989
## Sep 1993 10.531942 10.191069 10.872815 10.006062 11.057822
## Oct 1993 10.092605 9.846798 10.338412 9.713388 10.471822
## Nov 1993 10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993 11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994 9.430933 9.186093 9.675774 9.053207 9.808659
## Feb 1994 9.704370 9.459530 9.949210 9.326644 10.082096
## Mar 1994 9.695742 9.365756 10.025727 9.186658 10.204825
## Apr 1994 9.881046 9.636206 10.125887 9.503320 10.258772
## May 1994 9.928500 9.683659 10.173340 9.550774 10.306226
## Jun 1994 9.989861 9.745020 10.234701 9.612135 10.367587
#Question 3 Part d
checkresiduals(souvenir_fit)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 31.535, df = 17, p-value = 0.01718
#These residuals do not appear to be white noise. They are autocorrelated at multiple lags and the mean is not centered around 0.
#Question 3 Part e
boxplot(residuals(souvenir_fit)~cycle(residuals(souvenir_fit)))
#There is much less variation in the middle months than the summer months, causing heteroskedasticity. This is likely due to Australia's summer months being the last and first months of each year and the town attracting more tourists at that time.
#Question 3 Part f
souvenir_fit$coefficients
## (Intercept) trend season2 season3 season4 season5
## 7.6662400 0.0207611 0.2526756 0.2232860 0.3878297 0.4145219
## season6 season7 season8 season9 season10 season11
## 0.4551219 0.5767690 0.5406444 0.6296713 0.7239552 1.1957774
## season12 dsurf
## 1.9473841 0.5543818
#The value of each coefficient tells the effect of each season on sales. For example, in December sales increase by 1.95 relative to the intercept of 7.67.
#Question 3 Part g
checkresiduals(souvenir_fit)
##
## Breusch-Godfrey test for serial correlation of order up to 17
##
## data: Residuals from Linear regression model
## LM test = 31.535, df = 17, p-value = 0.01718
#The p-value of .01718 is less than 5%, so my residuals are statistically differentiable from white noise.
#Question 3 Part h
souvenir_skip <- data.frame(dsurf= rep(0, 36)) #number of months to skip ahead for forecasting.
souvenir_3yearplot <- forecast(souvenir_fit, newdata = souvenir_skip)
souvenir_3yearplot
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 1993 9.883136 9.637329 10.128943 9.503919 10.262353
## Aug 1993 9.867772 9.621965 10.113579 9.488555 10.246989
## Sep 1993 9.977560 9.731753 10.223367 9.598343 10.356777
## Oct 1993 10.092605 9.846798 10.338412 9.713388 10.471822
## Nov 1993 10.585189 10.339382 10.830995 10.205971 10.964406
## Dec 1993 11.357556 11.111749 11.603363 10.978339 11.736773
## Jan 1994 9.430933 9.186093 9.675774 9.053207 9.808659
## Feb 1994 9.704370 9.459530 9.949210 9.326644 10.082096
## Mar 1994 9.695742 9.365756 10.025727 9.186658 10.204825
## Apr 1994 9.881046 9.636206 10.125887 9.503320 10.258772
## May 1994 9.928500 9.683659 10.173340 9.550774 10.306226
## Jun 1994 9.989861 9.745020 10.234701 9.612135 10.367587
## Jul 1994 10.132269 9.883394 10.381144 9.748319 10.516219
## Aug 1994 10.116905 9.868031 10.365780 9.732955 10.500856
## Sep 1994 10.226693 9.977819 10.475568 9.842743 10.610644
## Oct 1994 10.341738 10.092864 10.590613 9.957788 10.725689
## Nov 1994 10.834322 10.585447 11.083197 10.450372 11.218272
## Dec 1994 11.606690 11.357815 11.855564 11.222739 11.990640
## Jan 1995 9.680067 9.431764 9.928369 9.296999 10.063134
## Feb 1995 9.953503 9.705201 10.201806 9.570436 10.336570
## Mar 1995 9.944875 9.610605 10.279144 9.429182 10.460567
## Apr 1995 10.130180 9.881877 10.378482 9.747112 10.513247
## May 1995 10.177633 9.929330 10.425935 9.794566 10.560700
## Jun 1995 10.238994 9.990691 10.487296 9.855927 10.622061
## Jul 1995 10.381402 10.128745 10.634059 9.991617 10.771188
## Aug 1995 10.366039 10.113381 10.618696 9.976253 10.755824
## Sep 1995 10.475827 10.223169 10.728484 10.086041 10.865612
## Oct 1995 10.590872 10.338214 10.843529 10.201086 10.980657
## Nov 1995 11.083455 10.830798 11.336112 10.693669 11.473240
## Dec 1995 11.855823 11.603165 12.108480 11.466037 12.245608
## Jan 1996 9.929200 9.676730 10.181669 9.539704 10.318696
## Feb 1996 10.202636 9.950167 10.455106 9.813141 10.592132
## Mar 1996 10.194008 9.854949 10.533067 9.670926 10.717090
## Apr 1996 10.379313 10.126843 10.631782 9.989817 10.768809
## May 1996 10.426766 10.174296 10.679236 10.037270 10.816262
## Jun 1996 10.488127 10.235658 10.740597 10.098631 10.877623
autoplot(souvenir_3yearplot)
#Question 3 Part i
#Since logging did not work, a Box-Cox using guerrero could help smooth the data by finding an optimal lambda specifically for this series.
#Question 4
#Question 4 Parts a and b
#Since the data is weekly, the maximum value of k is 26.
us_filter <- us_gasoline %>%
filter(year(Week) < "2005")
us_filter
## # A tsibble: 726 x 2 [1W]
## Week Barrels
## <week> <dbl>
## 1 1991 W06 6.62
## 2 1991 W07 6.43
## 3 1991 W08 6.58
## 4 1991 W09 7.22
## 5 1991 W10 6.88
## 6 1991 W11 6.95
## 7 1991 W12 7.33
## 8 1991 W13 6.78
## 9 1991 W14 7.50
## 10 1991 W15 6.92
## # … with 716 more rows
gas_fit <- us_filter %>%
model(K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
K2 = TSLM(Barrels ~ trend() + fourier(K = 2)),
K5 = TSLM(Barrels ~ trend() + fourier(K = 5)),
K10 = TSLM(Barrels ~ trend() + fourier(K = 10)),
K20 = TSLM(Barrels ~ trend() + fourier(K = 20)),
K26 = TSLM(Barrels ~ trend() + fourier(K = 26)))
glance(gas_fit) %>% select(.model, r_squared, adj_r_squared, AICc)
## # A tibble: 6 × 4
## .model r_squared adj_r_squared AICc
## <chr> <dbl> <dbl> <dbl>
## 1 K1 0.825 0.824 -1809.
## 2 K2 0.827 0.826 -1814.
## 3 K5 0.841 0.838 -1862.
## 4 K10 0.849 0.845 -1881.
## 5 K20 0.854 0.845 -1859.
## 6 K26 0.857 0.846 -1851.
fourier_gas1 <- us_filter %>%
model(TSLM(Barrels ~ trend() + fourier(K = 1)))
report(fourier_gas1)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.969489 -0.197166 -0.002252 0.200869 0.975792
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.092e+00 2.131e-02 332.782 < 2e-16 ***
## trend() 2.807e-03 5.081e-05 55.237 < 2e-16 ***
## fourier(K = 1)C1_52 -1.238e-01 1.505e-02 -8.226 9.01e-16 ***
## fourier(K = 1)S1_52 -2.383e-01 1.505e-02 -15.832 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2865 on 722 degrees of freedom
## Multiple R-squared: 0.8248, Adjusted R-squared: 0.8241
## F-statistic: 1133 on 3 and 722 DF, p-value: < 2.22e-16
#I chose k = 1 as it minimized AICc.
augment(fourier_gas1) %>%
ggplot(aes(x = Week)) +
geom_line(aes(y = Barrels, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Thousand of Barrels Per Day", title = "K = 1 Harmonic Regression") +
scale_color_manual(values = c(Data = "black", Fitted = "blue"))
#Question 4 Part c
fourier_gas1 %>%
gg_tsresiduals()
augment(fourier_gas1) %>%
features(.resid, ljung_box, lag = 104, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Barrels ~ trend() + fourier(K = 1)) 258. 4.44e-15
#The residuals appear to resemble white noise in the plot. However, the Ljung-Box test p-value is less than 5%, suggesting otherwise.
#Question 4 Part d
#Couldn't get this to work.
#Question 5
#Question 5 Part a
afghanistan <- global_economy %>%
filter(Country == "Afghanistan") %>%
mutate(mil_pop = Population/1000000)
autoplot(afghanistan, mil_pop) +
labs(title = "Afghanistan Population", x = "Year", y = "Population (Millions)")
#The impact of the war is very noticeable from war-related deaths and Afghans fleeing the country. Immigration likely slowed as well.
#Question 5 Part b
ols <- afghanistan %>%
model(TSLM(mil_pop ~ Year))
report(ols)
## Series: mil_pop
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.7945 -2.5826 0.7448 2.2592 6.0363
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -829.29253 49.73087 -16.68 <2e-16 ***
## Year 0.42577 0.02501 17.02 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.188 on 56 degrees of freedom
## Multiple R-squared: 0.8381, Adjusted R-squared: 0.8352
## F-statistic: 289.9 on 1 and 56 DF, p-value: < 2.22e-16
ols %>%
gg_tsresiduals()
piecewise5 <- afghanistan %>%
model(piecewise = TSLM(mil_pop ~ trend(knots = c(1980,1989))))
report(piecewise5)
## Series: mil_pop
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.57759 -0.17420 -0.01678 0.18723 0.67995
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 8.697573 0.131122 66.33 <2e-16 ***
## trend(knots = c(1980, 1989))trend 0.224372 0.009623 23.32 <2e-16 ***
## trend(knots = c(1980, 1989))trend_21 -0.456803 0.024498 -18.65 <2e-16 ***
## trend(knots = c(1980, 1989))trend_30 1.082782 0.021418 50.55 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.3009 on 54 degrees of freedom
## Multiple R-squared: 0.9986, Adjusted R-squared: 0.9985
## F-statistic: 1.293e+04 on 3 and 54 DF, p-value: < 2.22e-16
#The R-squared on the piecewise is significantly higher than the OLS, potentially indicating a higher predictive power.
#Question 5 Part c
detach("package:forecast", unload = TRUE)
drift5 <- afghanistan %>%
model(RW(mil_pop ~ drift())) %>%
forecast(h = "5 years") %>%
autoplot(afghanistan) +
labs(title = "Drift Forecast", y = "Population (Millions)")
drift5
naive5 <- afghanistan %>%
model(NAIVE(mil_pop)) %>%
forecast(h = "5 years") %>%
autoplot(afghanistan) +
labs(title = "Naive Forecast", y = "Population (Millions)")
naive5
mean5 <- afghanistan %>%
model(MEAN(mil_pop)) %>%
forecast(h = "5 years") %>%
autoplot(afghanistan) +
labs(title = "Mean Forecast", y = "Population (Millions)")
mean5
#I did not do SNAIVE as there was no seasonality to account for. The drift forecast is likely best as it accounts for the positive growth trend without being pulled down by past values.
#Question 6
#Question 6 Part a
#Assumption 1: Assume that the model is linear. There can be no exponentiality in any variables and no omitted variables. We also add an error term to account for unexplained variation.
#Assumption 2: No perfect multicollinearity. No variables can be perfect functions of each other (ie. BMI and body weight). R and STATA will automatically drop perfect multicollinear variables.
#Assumption 3: Sum of errors is 0. The sum of all errors must always be 0 to prevent bias.
#Assumption 4: No heteroskedasticity. Heteroskedasticity is when variance in the error term is not constant, leading to biaded standard errors.
#Assumption 5: No autocorrelation. Autocorrelation leads to serial correlation, which correlated error terms to other error terms in previous periods.
#Assumption 6: Errors are uncorrelated to predictor variables. No independent variables can be correlated to the error term.
#Assumption 7: Errors follow a normal distribution. The histogram of the error terms must be normally distributed to prevent bias in any direction.
#Question 6 Part b
#Unbiased estimator means a model is as close to the true coefficients as possible.
#Consistent estimator means that as n increases the model's coefficients get closer to the true coefficients. The key difference between the two is that consistency relates to a change in sample size.
#Question 6 Part c
#R-squared is the variation in the dependent variable that can be explained by the independent variables. Mathematically speaking, it can never decrease despite the amount of included controls. Thus, a high R-squared can be concerning in some models as it may indicate irrelevent variables.
#Adjusted R-squared is R-squared adjusted for degrees of freedom, meaning it can move down due to irrelevant variables.