library(fastDummies)
library(lubridate)
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
library(generics)
##
## Attaching package: 'generics'
## The following object is masked from 'package:caret':
##
## train
## The following object is masked from 'package:lubridate':
##
## as.difftime
## The following objects are masked from 'package:base':
##
## as.difftime, as.factor, as.ordered, intersect, is.element, setdiff,
## setequal, union
library(tsibble)
##
## Attaching package: 'tsibble'
## The following object is masked from 'package:lubridate':
##
## interval
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(dplyr)
##
## Attaching package: 'dplyr'
## The following object is masked from 'package:generics':
##
## explain
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✓ tibble 3.1.6 ✓ feasts 0.2.2
## ✓ tidyr 1.2.0 ✓ fable 0.3.1
## ✓ tsibbledata 0.4.0
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x tsibble::intersect() masks generics::intersect(), base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag() masks stats::lag()
## x fabletools::MAE() masks caret::MAE()
## x fabletools::RMSE() masks caret::RMSE()
## x tsibble::setdiff() masks generics::setdiff(), base::setdiff()
## x tsibble::union() masks generics::union(), base::union()
library(modeest)
library(forecast)
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
## Registered S3 method overwritten by 'forecast':
## method from
## predict.default statip
##
## Attaching package: 'forecast'
## The following object is masked from 'package:modeest':
##
## naive
## The following objects are masked from 'package:generics':
##
## accuracy, forecast
library(latex2exp)
library(seasonal)
##
## Attaching package: 'seasonal'
## The following object is masked from 'package:tibble':
##
## view
##Answer Based on the plots, Monaco has the highest GDP per capita, which has only varied a couple of times throughout the plot.
global_economy\(Year <- as.numeric(global_economy\)Year) global_economy %>% group_by(Year, Country) %>% summarise(Max = max(GDPperCapita), )
summary(global_economy)
## Country Code Year GDP
## Afghanistan : 58 ABW : 58 Min. :1960 Min. :8.824e+06
## Albania : 58 AFG : 58 1st Qu.:1974 1st Qu.:2.155e+09
## Algeria : 58 AGO : 58 Median :1989 Median :1.483e+10
## American Samoa: 58 ALB : 58 Mean :1989 Mean :1.034e+12
## Andorra : 58 AND : 58 3rd Qu.:2003 3rd Qu.:1.839e+11
## Angola : 58 ARB : 58 Max. :2017 Max. :8.074e+13
## (Other) :14802 (Other):14802 NA's :3322
## Growth CPI Imports Exports
## Min. :-64.047 Min. : 0.00 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.617 1st Qu.: 13.46 1st Qu.: 21.61 1st Qu.: 17.54
## Median : 3.913 Median : 53.86 Median : 31.13 Median : 26.95
## Mean : 3.909 Mean : 56.93 Mean : 38.03 Mean : 33.39
## 3rd Qu.: 6.242 3rd Qu.: 90.28 3rd Qu.: 49.08 3rd Qu.: 42.25
## Max. :149.973 Max. :4583.70 Max. :427.58 Max. :433.22
## NA's :3756 NA's :7480 NA's :4554 NA's :4563
## Population
## Min. :4.279e+03
## 1st Qu.:9.251e+05
## Median :6.345e+06
## Mean :2.060e+08
## 3rd Qu.:4.220e+07
## Max. :7.530e+09
## NA's :3
global_economy$GDPperCapita <- global_economy$GDP/global_economy$Population
global_economy %>% autoplot(GDPperCapita, show.legend = FALSE) +
labs(y = "US Dollars",
title = "GDP per Capita by Year, Globally")
## Warning: Removed 3242 row(s) containing missing values (geom_path).
global_economy %>%
filter(GDPperCapita == max(GDPperCapita, na.rm = TRUE)) %>%
select(Country, Year, GDPperCapita)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country Year GDPperCapita
## <fct> <dbl> <dbl>
## 1 Monaco 2014 185153.
In this instance I did transform GDP into GDP per capita to see if population had an effect, but in this case the transformation is not necessary as seen in the two plots being essentially identical showing no population effect on GDP.
global_economy %>% filter(Country == "United States") %>%
autoplot(GDP/10 ^ 12) +
labs(y = "US Dollars, Trillions",
title = "GDP by Year, United States")
## Population Adjustment
global_economy %>% filter(Country == "United States") %>%
autoplot(GDPperCapita) +
labs(y = "US Dollars",
title = "GDP per Capita by Year, United States")
o Slaughter of Victorian “Bulls, bullocks and steers” in aus_livestock.
In this case, I did not see any reason to transform the data set. The number of slaughters appears fairly varied and random, with only a slight decreasing trend at the beginning of the plot.
aus_livestock %>% filter(Animal == "Bulls, bullocks and steers", State == "Victoria") %>%
autoplot(Count) +
labs(title = "Slaughter of Victorian Bulls, Bullocks and Steers")
o Victorian Electricity Demand from vic_elec.
For this data set I noticed that in the daily plot there was some seasonality but really difficult to clearly visualize it. Thus, I performed a date transformation to make the plot show weekly electricity demanded and you can see the clear seasonal trends.
vic_elec %>%
autoplot(Demand) +
labs(y= "Demand, in MW",
title = "Victorian Electricity Demand")
## Date Transformation
vic_elec %>% group_by(Date) %>%
index_by(Date = yearweek(Time)) %>%
summarise(Demand = sum(Demand)) %>%
autoplot(Demand)
o Gas production from aus_production.
I performed a Box-Cox transformation on this data set because of the constant increase in variability through time. The lambda = 0.12, and you can see the transformation dampens the affect of variability in the plot.
aus_production %>%
autoplot(Gas) +
labs(y= "Production, in Petajoules",
title = "Australian Gas production")
## Box-Cox Transformation
lambda <- aus_production %>%
features(Gas, features = guerrero) %>%
pull(lambda_guerrero)
aus_production %>%
autoplot(box_cox(Gas, lambda)) +
labs(y = "Production, in Petajoules",
title = latex2exp::TeX(paste0(
"Transformed gas production with $\\lambda$ = ",
round(lambda,2))))
I attempted a Box-Cox transformation, but I believe it is unhelpful as the variation at first increases with time but then flattens out and actually decreases a bit in the middle portion of the plot before it increases again. If it were a constant increase without the drop perhaps a transformation could have been helpful.
canadian_gas %>%
autoplot(Volume) +
labs(y= "Production, in billions of cubic meters",
title = "Canadian Gas production")
## Box-Cox Transformation
lambda <- canadian_gas %>%
features(Volume, features = guerrero) %>%
pull(lambda_guerrero)
canadian_gas %>%
autoplot(box_cox(Volume, lambda)) +
labs(y = "Production, in billions of cubic meters",
title = latex2exp::TeX(paste0(
"Transformed gas production with $\\lambda$ = ",
round(lambda,2))))
For retail data, I performed a Box-Cox transformation and the lambda selected was equal to -0.02. From both the plot of the data and the transformed data, we can see the transformation has slightly lower variance but the effects were not great in this case.
set.seed(12345678)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>%
autoplot(Turnover) +
labs(y= "$Million AUD",
title = "Retail Turnover")
## Box-Cox Transformation
lambda <- myseries %>%
features(Turnover, features = guerrero) %>%
pull(lambda_guerrero)
myseries %>%
autoplot(box_cox(Turnover, lambda)) +
labs(y = "$Million AUD",
title = latex2exp::TeX(paste0(
"Transformed Retail Turnover with $\\lambda$ = ",
round(lambda,2))))
When examining the data of Tobacco from aus_production I performed a Box-Cox transformation in which lambda = 0.93. Since this lamda is very close to 1 it was unsurprising that the transformation had very little effect on the variance of the data, indicating that this data does not need to be transformed with this method.
Looking at the Economy class passengers between Melbourne and Sydney from ansett, I performed a Box-Cox with lambda = 2. From the plots we can see that the transformation helps with the variance, but there appears to be some outlier data as the plot severely drops in 1989.
Finally, with Pedestrian Counts at Southern Cross Station from pedestrian, I performed a Box-Cox with lambda = -0.23. The variance does not change much here. Perhaps, I should performed a date transformation before the Box-Cox.
## Tobacco
aus_production %>%
autoplot(Tobacco) +
labs(y= "Production, in tonnes",
title = "Australian Tobacco and Cigarette production")
## Warning: Removed 24 row(s) containing missing values (geom_path).
# Box-Cox Transformation
lambda <- aus_production %>%
features(Tobacco, features = guerrero) %>%
pull(lambda_guerrero)
aus_production %>%
autoplot(box_cox(Tobacco, lambda)) +
labs(y = "Production, in tonnes",
title = latex2exp::TeX(paste0(
"Transformed Tobacco and Cigarette production with $\\lambda$ = ",
round(lambda,2))))
## Warning: Removed 24 row(s) containing missing values (geom_path).
## Economy Class Passengers
ansett %>% filter(Airports == "MEL-SYD") %>% filter(Class == "Economy") %>%
autoplot(Passengers) +
labs(y= "Total Passengers",
title = "Economy Class Passengers Traveling Between Melbourne and Sydney")
# Box-Cox Transformation
lambda <- ansett %>% filter(Airports == "MEL-SYD") %>% filter(Class == "Economy") %>%
features(Passengers, features = guerrero) %>%
pull(lambda_guerrero)
ansett %>% filter(Airports == "MEL-SYD") %>% filter(Class == "Economy") %>%
autoplot(box_cox(Passengers, lambda)) +
labs(y = "Total Passengers",
title = latex2exp::TeX(paste0(
"Transformed Passengers Traveling with $\\lambda$ = ",
round(lambda,2))))
## Pedestrian Counts
south <- pedestrian %>% filter(Sensor == "Southern Cross Station")
autoplot(south, Count) +
labs(y= "Count",
title = "Hourly Pedestrian Count at Southern Cross Station")
# Box-Cox Transformation
lambda <- south %>%
features(Count, features = guerrero) %>%
pull(lambda_guerrero)
south %>%
autoplot(box_cox(Count, lambda)) +
labs(y = "Count",
title = latex2exp::TeX(paste0(
"Transformed Pedestrian Count with $\\lambda$ = ",
round(lambda,2))))
5 MA ==> ((Y1 + Y2 + Y3 + Y4 + Y5)/5) + ((Y2 + Y3 + Y4 + Y5 + Y6)/5) + ((Y3 + Y4 + Y5 + Y6 + Y7)/5)
So it really = 1/3 * 1/5 (1Y1 + 2Y2 + 3Y3 + 3Y4 + 3Y5 + 2Y6 + 1Y7)
==> 3X5 MA = (1/15)Y1 + (2/15)Y2 + (1/5)Y3 + (1/5)Y4 + (1/5)Y5 + (2/15)Y6 + (1/15)Y7
## Checking math to see if weights equal 7 term weighted average ()
a <- 1/15
b <- 2/15
c <- 1/5
d <- 1/5
e <- 1/5
f <- 2/15
g <- 1/15
c(a,b,c,d,e,f,g)
## [1] 0.06666667 0.13333333 0.20000000 0.20000000 0.20000000 0.13333333 0.06666667
Based on the plot below we can see low gas production in Quarter 1 which steadily increases until it peaks around the middle of the year and declines as it approaches the beginning of the year. The trend cycle appears to be a steady increase. We can see that the peak each year has grown.
Classical decomposition results can be seen below under part b.
The results definitely show a clear trend upwards in gas production, as well as similar seasonal fluctuations from the original plot.
Plot below under part d.
The effect of the outlier is to create a massive peak in both the plot and the decompositions trend and random components. The seasonally adjusted also has a large peak that makes it difficult to accurately calculate it.
The only difference is that in this case the plot has the peak at the end of the plot, the affect on the decomposition and the seasonally adjusted calculations is the same.
gas <- tail(aus_production, 5*4) %>% select(Gas)
##a
gas %>%
autoplot(Gas) +
labs(y= "Production, in Petajoules",
title = "Australian Gas production in Last Five Years")
##b
gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Classical Multiplicative Decomposition of Gas Production from last 5 years")
## Warning: Removed 2 row(s) containing missing values (geom_path).
##d
decomp <- gas %>%
model(classical_decomposition(Gas, type = "multiplicative")) %>%
components()
decomp %>%
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas, colour = "Data")) +
geom_line(aes(y = season_adjust,
colour = "Seasonally Adjusted")) +
labs(y = "Production, in Petajoules",
title = "Seasonally Adjusted Gas Production") +
scale_colour_manual(
values = c("gray", "blue"),
breaks = c("Data", "Seasonally Adjusted")
)
##e
outlier_gas <- gas
outlier_gas$Gas[9] <- outlier_gas$Gas[9]+300
outlier_gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Classical Multiplicative Decomposition of Gas Production with Outlier")
## Warning: Removed 2 row(s) containing missing values (geom_path).
outlier_decomp <- outlier_gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components()
outlier_decomp %>%
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas, colour = "Data")) +
geom_line(aes(y = season_adjust,
colour = "Seasonally Adjusted")) +
labs(y = "Production, in Petajoules",
title = "Seasonally Adjusted Gas Production with Outlier") +
scale_colour_manual(
values = c("gray", "blue"),
breaks = c("Data", "Seasonally Adjusted")
)
##f
outlier_gas2 <- gas
outlier_gas2$Gas[20] <- outlier_gas2$Gas[20]+300
outlier_gas2 %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Classical Multiplicative Decomposition of Gas Production with Outlier at end")
## Warning: Removed 2 row(s) containing missing values (geom_path).
outlier_decomp2 <- outlier_gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components()
outlier_decomp2 %>%
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Gas, colour = "Data")) +
geom_line(aes(y = season_adjust,
colour = "Seasonally Adjusted")) +
labs(y = "Production, in Petajoules",
title = "Seasonally Adjusted Gas Production with Outlier at end") +
scale_colour_manual(
values = c("gray", "blue"),
breaks = c("Data", "Seasonally Adjusted")
)
With the X-11 decomposition we see the irregulaties from the portion of the data between 2000 and 2010. The seasonality and the trend are noticeably decreased from the rest of the plot.
set.seed(12345678)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
x11_dcmp <- myseries %>%
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
components()
autoplot(x11_dcmp) +
labs(title =
"Decomposition of total retail Turnover using X-11.")
Looking at both plots the irregularity (remainder) component sticks out as it is fairly constant until there is a drop around 1990. Looking at the trend we see a steady increase as the scale increases. We can identify seasonality as well as especially when looking at the seasonal component plot, in which December shows the largest increases in the labor force throughout.
Yes, the recession is definitely apparent in the estimated components, especially the remainder where we see a sudden decrease. It is not as visible on the value plot but can still clearly be seen.
Looking at the seasonal component plot we can see that over time the seasonal shape has become a bit more sporadic, as opposed to the fairly flat early years. Looking at the plot as a whole the middle portion from about 1970-1980 was the most sporadic.
Plot of seasonally adjusted series below in part c.
Based on the plot, very similar results from both the SEATS and X-11 methods. They seemed to have further dampened seasonal effects.
canadian_gas %>%
autoplot(Volume) +
labs(y= "Production, in billions of cubic meters",
title = "Canadian Gas production")
canadian_gas %>%
gg_subseries(Volume) +
labs(y= "Production, in billions of cubic meters",
title = "Canadian Gas production, Subseries Plot")
canadian_gas %>%
gg_season(Volume) +
labs(y= "Production, in billions of cubic meters",
title = "Canadian Gas production, Season Plot")
##a
canadian_gas %>%
model(
STL(Volume ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE)) %>%
components() %>%
autoplot()
##b
stl <- canadian_gas %>%
model(
STL(Volume ~ trend(window = 7) +
season(window = "periodic"),
robust = TRUE)) %>%
components()
stl %>%
gg_season(season_year) +
labs(y= "",
title = "Seasonal Gas production,")
##c
ggplot(data = stl,
aes(x = Month,
y = season_adjust)) +
geom_line(col = "blue") +
geom_line(data = canadian_gas,
aes(x = Month,
y = Volume),
col = "grey",
lty = "dashed") +
labs(x = "Month",
y = "Volume",
title = "Seasonally Adjusted vs. Actual Volume")
##d
x11 <-
canadian_gas %>%
model(x11 = X_13ARIMA_SEATS(Volume~x11())) %>%
components()
seats <-
canadian_gas %>%
model(seats = X_13ARIMA_SEATS(Volume~seats())) %>%
components()
ggplot(data = stl,
aes(x = Month,
y = season_adjust)) +
geom_line(col = "blue") +
geom_line(data = canadian_gas,
aes(x = Month,
y = Volume),
col = "grey",
lty = "dashed") +
geom_line(data = x11,
aes(x = Month,
y = season_adjust),
col = "green") +
geom_line(data = seats,
aes(x = Month,
y = season_adjust),
col = "red") +
labs(x = "Month",
y = "Volume",
title = "Seasonally Adjusted vs. Actual Volume",
subtitle = "STL v X11 v SEATS")
There is a positive relationship between Demand and temperature due to the fact that as temperature increases, energy needed for running air conditioning increases with it.
The model appears to be adequate based off the residual plots. The first day along with a couple of other days of the model have lower temperatures and thus lower output of energy than usual but it does not appear to overly influence the model.
Looking at both of these models it is difficult to believe the model with the forecast for the day with 15 degrees as it does not approach the real data, but the 35 degree forecast seems to have reasonable intervals that lead me to believe it could be a reliable forecast for this data.
Intervals below in part d.
The plot shows that my model is non-linear, and is more likely quadratic. This is intuitive as the hotter months would show increased electric use and the colder months decreased.
jan14_vic_elec <- vic_elec %>%
filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
##a
jan14_vic_elec %>%
autoplot(Demand) +
labs(title = "Electricity Demanded, Jan 2014")
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
report()
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49978.2 -10218.9 -121.3 18533.2 35440.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59083.9 17424.8 3.391 0.00203 **
## Temperature 6154.3 601.3 10.235 3.89e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
## F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
##b
model <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature))
model %>% gg_tsresiduals()
##c
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)
) %>%
autoplot(jan14_vic_elec)
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 35)
) %>%
autoplot(jan14_vic_elec)
##d
interval15 <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)
) %>%
hilo()
interval15$Demand
## <distribution[1]>
## 1
## N(151398, 6.8e+08)
interval30 <- jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 30)
) %>%
hilo()
interval30$Demand
## <distribution[1]>
## 1
## N(243713, 6.2e+08)
##e
ggplot(vic_elec, aes(x = Temperature, y = Demand)) +
geom_point() +
labs(x = "Temperature",
y = "Electricity Demand (MW)",
title = "Victoria Electricity Demand")
Overall these plots show a decreased trend in the winning times for most races. This is not the case for the 1500M for women’s and for men’s in more recent olympics.
The fitted regression lines can be seen below (had difficult time displaying the slope on the plot thus could not calculate average rate per year).
The residuals for each race can be seen below, they seem to indicate that the fitted line is not the most whole encompassing model.
Predictions and intervals below in part d. I am making the assumption that the winning times will continue to follow the downward trend.
##a
#100M
olympic_running %>% filter(Length == 100) %>%
autoplot() +
labs(x= "Year", y= "Time", title = "Winning Times by Year, 100M")
## Plot variable not specified, automatically selected `.vars = Time`
#200M
olympic_running %>% filter(Length == 200) %>%
autoplot() +
labs(x= "Year", y= "Time", title = "Winning Times by Year, 200M")
## Plot variable not specified, automatically selected `.vars = Time`
#400M
olympic_running %>% filter(Length == 400) %>%
autoplot() +
labs(x= "Year", y= "Time", title = "Winning Times by Year, 400M")
## Plot variable not specified, automatically selected `.vars = Time`
#800M
olympic_running %>% filter(Length == 800) %>%
autoplot() +
labs(x= "Year", y= "Time", title = "Winning Times by Year, 800M")
## Plot variable not specified, automatically selected `.vars = Time`
#1500M
olympic_running %>% filter(Length == 1500) %>%
autoplot() +
labs(x= "Year", y= "Time", title = "Winning Times by Year, 1500M")
## Plot variable not specified, automatically selected `.vars = Time`
#5kM
olympic_running %>% filter(Length == 5000) %>%
autoplot() +
labs(x= "Year", y= "Time", title = "Winning Times by Year, 5K")
## Plot variable not specified, automatically selected `.vars = Time`
#10kM
olympic_running %>% filter(Length == 10000) %>%
autoplot() +
labs(x= "Year", y= "Winning Time", title = "Winning Times by Year, 10K")
## Plot variable not specified, automatically selected `.vars = Time`
##b
olympic_running %>% filter(Length == 100) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "100M Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 5 rows containing non-finite values (stat_smooth).
## Warning: Removed 5 rows containing missing values (geom_point).
olympic_running %>% filter(Length == 200) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "200M Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
olympic_running %>% filter(Length == 400) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "400M Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).
olympic_running %>% filter(Length == 800) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "800M Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 11 rows containing non-finite values (stat_smooth).
## Warning: Removed 11 rows containing missing values (geom_point).
olympic_running %>% filter(Length == 1500) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "1500M Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Warning: Removed 3 rows containing missing values (geom_point).
olympic_running %>% filter(Length == 5000) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title = "5k Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).
olympic_running %>% filter(Length == 10000) %>%
as.data.frame() %>%
ggplot(aes(x=Year, y=Time)) +
geom_point() +
geom_smooth(method="lm", se=FALSE) +
labs(title= "10K Winning Times by Year")
## `geom_smooth()` using formula 'y ~ x'
## Warning: Removed 3 rows containing non-finite values (stat_smooth).
## Removed 3 rows containing missing values (geom_point).
##c
model100men <- olympic_running %>% filter(Length == 100, Sex == "men") %>%
model(TSLM(Time ~ Year))
model100men %>% gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).
model100women <- olympic_running %>% filter(Length == 100, Sex == "women") %>%
model(TSLM(Time ~ Year))
model100women %>% gg_tsresiduals()
## Warning: Removed 2 rows containing missing values (geom_point).
## Warning: Removed 2 rows containing non-finite values (stat_bin).
model200men <- olympic_running %>% filter(Length == 200, Sex == "men") %>%
model(TSLM(Time ~ Year))
model200men %>% gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).
model200women <- olympic_running %>% filter(Length == 200, Sex == "women") %>%
model(TSLM(Time ~ Year))
model200women %>% gg_tsresiduals()
model400men <- olympic_running %>% filter(Length == 400, Sex == "men") %>%
model(TSLM(Time ~ Year))
model400men %>% gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).
model400women <- olympic_running %>% filter(Length == 400, Sex == "women") %>%
model(TSLM(Time ~ Year))
model400women %>% gg_tsresiduals()
model800men <- olympic_running %>% filter(Length == 800, Sex == "men") %>%
model(TSLM(Time ~ Year))
model800men %>% gg_tsresiduals()
## Warning: Removed 4 rows containing missing values (geom_point).
## Warning: Removed 4 rows containing non-finite values (stat_bin).
model800women <- olympic_running %>% filter(Length == 800, Sex == "women") %>%
model(TSLM(Time ~ Year))
model800women %>% gg_tsresiduals()
## Warning: Removed 7 rows containing missing values (geom_point).
## Warning: Removed 7 rows containing non-finite values (stat_bin).
model1500men <- olympic_running %>% filter(Length == 1500, Sex == "men") %>%
model(TSLM(Time ~ Year))
model1500men %>% gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Warning: Removed 3 rows containing non-finite values (stat_bin).
model1500women <- olympic_running %>% filter(Length == 1500, Sex == "women") %>%
model(TSLM(Time ~ Year))
model1500women %>% gg_tsresiduals()
model5kmen <- olympic_running %>% filter(Length == 5000, Sex == "men") %>%
model(TSLM(Time ~ Year))
model5kmen %>% gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).
model5kwomen <- olympic_running %>% filter(Length == 5000, Sex == "women") %>%
model(TSLM(Time ~ Year))
model5kwomen %>% gg_tsresiduals()
model10kmen <- olympic_running %>% filter(Length == 10000, Sex == "men") %>%
model(TSLM(Time ~ Year))
model10kmen %>% gg_tsresiduals()
## Warning: Removed 3 rows containing missing values (geom_point).
## Removed 3 rows containing non-finite values (stat_bin).
model10kwomen <- olympic_running %>% filter(Length == 10000, Sex == "women") %>%
model(TSLM(Time ~ Year))
model10kwomen %>% gg_tsresiduals()
##d
mens100 <- olympic_running %>% filter(Length == 100, Sex == "men")
fc_100men <- forecast(
model100men,
newdata = data.frame(mens100 = 2020)
)
fc_100men
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 100 men TSLM(Time ~ Year) 2020 N(9.5, 0.061) 9.53
## 2 100 men TSLM(Time ~ Year) 2024 N(9.5, 0.062) 9.48
womens100 <- olympic_running %>% filter(Length == 100, Sex == "women")
fc_100women <- forecast(
model100women,
newdata = data.frame(womens100 = 2020)
)
fc_100women
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 100 women TSLM(Time ~ Year) 2020 N(11, 0.059) 10.5
## 2 100 women TSLM(Time ~ Year) 2024 N(10, 0.061) 10.5
mens200 <- olympic_running %>% filter(Length == 200, Sex == "men")
fc_200men <- forecast(
model200men,
newdata = data.frame(mens200 = 2020)
)
fc_200men
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 200 men TSLM(Time ~ Year) 2020 N(19, 0.13) 19.1
## 2 200 men TSLM(Time ~ Year) 2024 N(19, 0.13) 19.0
womens200 <- olympic_running %>% filter(Length == 200, Sex == "women")
fc_200women <- forecast(
model200women,
newdata = data.frame(womens200 = 2020)
)
fc_200women
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 200 women TSLM(Time ~ Year) 2020 N(21, 0.31) 21.2
## 2 200 women TSLM(Time ~ Year) 2024 N(21, 0.32) 21.1
mens400 <- olympic_running %>% filter(Length == 400, Sex == "men")
fc_400men <- forecast(
model400men,
newdata = data.frame(mens400 = 2020)
)
fc_400men
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 400 men TSLM(Time ~ Year) 2020 N(42, 1.5) 42.0
## 2 400 men TSLM(Time ~ Year) 2024 N(42, 1.5) 41.8
womens400 <- olympic_running %>% filter(Length == 400, Sex == "women")
fc_400women <- forecast(
model400women,
newdata = data.frame(womens400 = 2020)
)
fc_400women
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 400 women TSLM(Time ~ Year) 2020 N(48, 1.4) 48.4
## 2 400 women TSLM(Time ~ Year) 2024 N(48, 1.5) 48.3
mens100 <- olympic_running %>% filter(Length == 100, Sex == "men")
fc_100men <- forecast(
model100men,
newdata = data.frame(mens100 = 2020)
)
fc_100men
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 100 men TSLM(Time ~ Year) 2020 N(9.5, 0.061) 9.53
## 2 100 men TSLM(Time ~ Year) 2024 N(9.5, 0.062) 9.48
womens800 <- olympic_running %>% filter(Length == 800, Sex == "women")
fc_800women <- forecast(
model800women,
newdata = data.frame(womens800 = 2020)
)
fc_800women
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 800 women TSLM(Time ~ Year) 2020 N(111, 14) 111.
## 2 800 women TSLM(Time ~ Year) 2024 N(111, 15) 111.
mens1500 <- olympic_running %>% filter(Length == 1500, Sex == "men")
fc_1500men <- forecast(
model1500men,
newdata = data.frame(mens1500 = 2020)
)
fc_1500men
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 1500 men TSLM(Time ~ Year) 2020 N(207, 74) 207.
## 2 1500 men TSLM(Time ~ Year) 2024 N(206, 75) 206.
womens1500 <- olympic_running %>% filter(Length == 1500, Sex == "women")
fc_1500women <- forecast(
model1500women,
newdata = data.frame(womens1500 = 2020)
)
fc_1500women
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 1500 women TSLM(Time ~ Year) 2020 N(245, 35) 245.
## 2 1500 women TSLM(Time ~ Year) 2024 N(246, 38) 246.
mens5k <- olympic_running %>% filter(Length == 5000, Sex == "men")
fc_5kmen <- forecast(
model5kmen,
newdata = data.frame(mens5k = 2020)
)
fc_5kmen
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 5000 men TSLM(Time ~ Year) 2020 N(773, 285) 773.
## 2 5000 men TSLM(Time ~ Year) 2024 N(769, 290) 769.
womens5k <- olympic_running %>% filter(Length == 5000, Sex == "women")
fc_5kwomen <- forecast(
model5kwomen,
newdata = data.frame(womens5k = 2020)
)
fc_5kwomen
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 5000 women TSLM(Time ~ Year) 2020 N(892, 1562) 892.
## 2 5000 women TSLM(Time ~ Year) 2024 N(891, 1944) 891.
mens10k <- olympic_running %>% filter(Length == 10000, Sex == "men")
fc_10kmen <- forecast(
model10kmen,
newdata = data.frame(mens10k = 2020)
)
fc_10kmen
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 10000 men TSLM(Time ~ Year) 2020 N(1580, 970) 1580.
## 2 10000 men TSLM(Time ~ Year) 2024 N(1570, 986) 1570.
womens10k <- olympic_running %>% filter(Length == 10000, Sex == "women")
fc_10kwomen <- forecast(
model10kwomen,
newdata = data.frame(womens10k = 2020)
)
fc_10kwomen
## # A fable: 2 x 6 [4Y]
## # Key: Length, Sex, .model [1]
## Length Sex .model Year Time .mean
## <int> <chr> <chr> <dbl> <dist> <dbl>
## 1 10000 women TSLM(Time ~ Year) 2020 N(1763, 530) 1763.
## 2 10000 women TSLM(Time ~ Year) 2024 N(1749, 608) 1749.
##Answer
Elasticity ==> β1=(dy/dx)*(x/y) Inverse of this ==> β1(dx/x)=(dy/y). Taking the integral of this expresses in terms of y as a function of x and shows that β1 is the elasticity coefficient.
Overall an increasing trend as time goes on, except for an unexpected dip around 1991 possibly due to the recession.
The data shows variation as time goes on as sales increases with the level of the series. Taking the log reduces the variation and the changes will be percentage rather than a count.
Regression model below in part c.
The residual plots seem to indicate that intervals produced might not encompass the data, with areas of over and underprediction.
From the boxplots there are varying levels of accuracy of the model on each month. Some months such as June and July are fairly encouraging, while others indicate the model may not be sufficient.
The coefficients indicate that the volume of Sales is trending upwards approximately 2% each month, in February it increases about 25% compared to January, in March it increases about 27% compared to January, in April it increases about 38% compared to January, in May it increases about 41% compared to January, in June it increases about 45% compared to January, in July it increases about 61% compared to January, in August it increases about 59% compared to January, in September it increases about 67% compared to January, in October it increases about 75% compared to January, in November it increases about 121% compared to January, in December it increases about 196% compared to January, and the surfing festival increases sales volume in March by about 50%.
It tells me that there could be autocorrelation of the models residuals.
Had issue producing these forecasts, attempted code shown below.
Perhaps with more information on tourism attractions outside of the festival we could create more dummy variables and better describe certain abnormalities in the model.
##a
autoplot(souvenirs, Sales) +
labs(title = "Monthly Sales Figures")
##c
souvenirs$logSales <- log(souvenirs$Sales)
souvenirs$dummynew <- rep(0, length(souvenirs$Month))
souvenirs$dummynew[seq_along(souvenirs$dummynew)%%12 == 3] <- 1
souvenirs$dummynew[3] <- 0
salesmodel <- souvenirs %>% model(TSLM(logSales ~ trend() + season() + dummynew))
report(salesmodel)
## Series: logSales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.336727 -0.127571 0.002568 0.109106 0.376714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend() 0.0220198 0.0008268 26.634 < 2e-16 ***
## season()year2 0.2514168 0.0956790 2.628 0.010555 *
## season()year3 0.2660828 0.1934044 1.376 0.173275
## season()year4 0.3840535 0.0957075 4.013 0.000148 ***
## season()year5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season()year6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season()year7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season()year8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season()year9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season()year10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season()year11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season()year12 1.9622412 0.0961066 20.417 < 2e-16 ***
## dummynew 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.22e-16
##d
salesmodel %>% gg_tsresiduals()
##e
residuals <- (resid(salesmodel))
residuals$Month <- as.Date(residuals$Month, format = c('%m/%Y'))
residuals$Month <- as.factor(month(residuals$Month))
Jan <- residuals %>% filter(Month == 1)
boxplot(Jan$.resid, xlab= "January")
Feb <- residuals %>% filter(Month == 2)
boxplot(Feb$.resid, xlab= "February")
Mar <- residuals %>% filter(Month == 3)
boxplot(Mar$.resid, xlab= "March")
Apr <- residuals %>% filter(Month == 4)
boxplot(Apr$.resid, xlab= "April")
May <- residuals %>% filter(Month == 5)
boxplot(May$.resid, xlab= "May")
Jun <- residuals %>% filter(Month == 6)
boxplot(Jun$.resid, xlab= "June")
Jul <- residuals %>% filter(Month == 7)
boxplot(Jul$.resid, xlab= "July")
Aug <- residuals %>% filter(Month == 8)
boxplot(Aug$.resid, xlab= "August")
Sep <- residuals %>% filter(Month == 9)
boxplot(Sep$.resid, xlab= "September")
Oct <- residuals %>% filter(Month == 10)
boxplot(Oct$.resid, xlab= "October")
Nov <- residuals %>% filter(Month == 11)
boxplot(Nov$.resid, xlab= "November")
Dec <- residuals %>% filter(Month == 12)
boxplot(Dec$.resid, xlab= "December")
##f
report(salesmodel)
## Series: logSales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.336727 -0.127571 0.002568 0.109106 0.376714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend() 0.0220198 0.0008268 26.634 < 2e-16 ***
## season()year2 0.2514168 0.0956790 2.628 0.010555 *
## season()year3 0.2660828 0.1934044 1.376 0.173275
## season()year4 0.3840535 0.0957075 4.013 0.000148 ***
## season()year5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season()year6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season()year7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season()year8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season()year9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season()year10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season()year11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season()year12 1.9622412 0.0961066 20.417 < 2e-16 ***
## dummynew 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.22e-16
##g (Do not think I did this part right)
ljung_box(residuals$.resid, lag = 1, dof = 0)
## lb_stat lb_pvalue
## 2.545074e+01 4.538236e-07
##h- not done had a very difficult time with this one
"df15 <- data.frame(logSales = souvenirs$logSales, dummynew = souvenirs$dummynew, Month = souvenirs$Month)
smodel <- df15 %>% model(TSLM(logSales ~ trend() + season() + dummynew))
newdata <- data.frame(dummynew = rep(0, 36))
fc_sales <- forecast(
salesmodel,
newdata = newdata
)
fc_sales"
## [1] "df15 <- data.frame(logSales = souvenirs$logSales, dummynew = souvenirs$dummynew, Month = souvenirs$Month)\nsmodel <- df15 %>% model(TSLM(logSales ~ trend() + season() + dummynew))\nnewdata <- data.frame(dummynew = rep(0, 36))\nfc_sales <- forecast(\n salesmodel, \n newdata = newdata\n )\nfc_sales"
Plot below. It is difficult to figure out which regression model fits the fitted values just by looking at the plots. All produce very similar results.
Shown below. Based on the results, of the three models, the third model with K=7 is the most approriate model based on having the lowest CV and low AICc value.
Based on the residuals, the model appears to be random and normal although the Ljung-Box indicates there could be autocorrelation.
The forecasts seem to be pretty good, the plot of the forecast seems to fall in line with the overall trend of the data and has normal seasonal effects.
gas <- us_gasoline
gas$Year <- year(gas$Week)
gas$Year <-NULL
gas_new <- gas[1:726,]
##a
fourier_gas <- gas_new %>%
model(TSLM(Barrels ~ trend() + fourier(K = 2)))
report(fourier_gas)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.9375162 -0.1897569 -0.0006692 0.2058275 1.0016928
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.094e+00 2.121e-02 334.493 < 2e-16 ***
## trend() 2.802e-03 5.057e-05 55.420 < 2e-16 ***
## fourier(K = 2)C1_52 -1.237e-01 1.497e-02 -8.265 6.71e-16 ***
## fourier(K = 2)S1_52 -2.383e-01 1.497e-02 -15.917 < 2e-16 ***
## fourier(K = 2)C2_52 4.493e-02 1.495e-02 3.006 0.00274 **
## fourier(K = 2)S2_52 1.054e-02 1.498e-02 0.704 0.48193
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.285 on 720 degrees of freedom
## Multiple R-squared: 0.8271, Adjusted R-squared: 0.8259
## F-statistic: 688.8 on 5 and 720 DF, p-value: < 2.22e-16
autoplot(gas_new, series="Data", color ="blue") +
autolayer(fitted(fourier_gas), series="Fitted") +
xlab("Year") + ylab("Gas Supply") +
ggtitle("Gas Supply (K=2)")
## Plot variable not specified, automatically selected `.vars = Barrels`
## Warning: Ignoring unknown parameters: series
## Plot variable not specified, automatically selected `.vars = .fitted`
## Warning: Ignoring unknown parameters: series
fourier_gas2 <- gas_new %>%
model(TSLM(Barrels ~ trend() + fourier(K = 7)))
report(fourier_gas2)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8709966 -0.1723461 -0.0004569 0.1780736 0.9484129
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.095e+00 2.003e-02 354.233 < 2e-16 ***
## trend() 2.798e-03 4.776e-05 58.591 < 2e-16 ***
## fourier(K = 7)C1_52 -1.244e-01 1.413e-02 -8.804 < 2e-16 ***
## fourier(K = 7)S1_52 -2.394e-01 1.414e-02 -16.932 < 2e-16 ***
## fourier(K = 7)C2_52 4.528e-02 1.411e-02 3.208 0.0014 **
## fourier(K = 7)S2_52 9.330e-03 1.414e-02 0.660 0.5096
## fourier(K = 7)C3_52 9.626e-02 1.414e-02 6.809 2.10e-11 ***
## fourier(K = 7)S3_52 7.197e-05 1.411e-02 0.005 0.9959
## fourier(K = 7)C4_52 2.892e-02 1.413e-02 2.046 0.0411 *
## fourier(K = 7)S4_52 2.881e-02 1.412e-02 2.041 0.0416 *
## fourier(K = 7)C5_52 -3.355e-02 1.411e-02 -2.378 0.0177 *
## fourier(K = 7)S5_52 3.165e-02 1.414e-02 2.238 0.0255 *
## fourier(K = 7)C6_52 -6.565e-02 1.412e-02 -4.648 3.99e-06 ***
## fourier(K = 7)S6_52 2.803e-02 1.413e-02 1.984 0.0476 *
## fourier(K = 7)C7_52 -2.228e-02 1.414e-02 -1.576 0.1155
## fourier(K = 7)S7_52 3.274e-02 1.411e-02 2.320 0.0206 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2691 on 710 degrees of freedom
## Multiple R-squared: 0.848, Adjusted R-squared: 0.8448
## F-statistic: 264.1 on 15 and 710 DF, p-value: < 2.22e-16
autoplot(gas_new, series="Data", color = "blue") +
autolayer(fitted(fourier_gas2), series="Fitted") +
xlab("Year") + ylab("Gas Supply") +
ggtitle("Gas Supply (K=7)")
## Plot variable not specified, automatically selected `.vars = Barrels`
## Warning: Ignoring unknown parameters: series
## Plot variable not specified, automatically selected `.vars = .fitted`
## Warning: Ignoring unknown parameters: series
fourier_gas3 <- gas_new %>%
model(TSLM(Barrels ~ trend() + fourier(K = 12)))
report(fourier_gas3)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.8699494 -0.1759126 0.0007382 0.1704002 1.0418007
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.095e+00 1.992e-02 356.199 < 2e-16 ***
## trend() 2.799e-03 4.750e-05 58.923 < 2e-16 ***
## fourier(K = 12)C1_52 -1.244e-01 1.406e-02 -8.852 < 2e-16 ***
## fourier(K = 12)S1_52 -2.393e-01 1.406e-02 -17.023 < 2e-16 ***
## fourier(K = 12)C2_52 4.527e-02 1.403e-02 3.226 0.00132 **
## fourier(K = 12)S2_52 9.367e-03 1.406e-02 0.666 0.50562
## fourier(K = 12)C3_52 9.622e-02 1.406e-02 6.844 1.68e-11 ***
## fourier(K = 12)S3_52 8.916e-05 1.404e-02 0.006 0.99493
## fourier(K = 12)C4_52 2.889e-02 1.406e-02 2.055 0.04023 *
## fourier(K = 12)S4_52 2.880e-02 1.404e-02 2.052 0.04059 *
## fourier(K = 12)C5_52 -3.357e-02 1.403e-02 -2.392 0.01701 *
## fourier(K = 12)S5_52 3.162e-02 1.406e-02 2.248 0.02487 *
## fourier(K = 12)C6_52 -6.563e-02 1.404e-02 -4.673 3.56e-06 ***
## fourier(K = 12)S6_52 2.800e-02 1.405e-02 1.993 0.04667 *
## fourier(K = 12)C7_52 -2.225e-02 1.406e-02 -1.582 0.11404
## fourier(K = 12)S7_52 3.273e-02 1.403e-02 2.333 0.01996 *
## fourier(K = 12)C8_52 -1.657e-02 1.404e-02 -1.180 0.23838
## fourier(K = 12)S8_52 -1.354e-03 1.405e-02 -0.096 0.92328
## fourier(K = 12)C9_52 -1.764e-02 1.404e-02 -1.257 0.20924
## fourier(K = 12)S9_52 -4.747e-04 1.405e-02 -0.034 0.97306
## fourier(K = 12)C10_52 1.264e-02 1.405e-02 0.900 0.36864
## fourier(K = 12)S10_52 -2.354e-02 1.404e-02 -1.677 0.09401 .
## fourier(K = 12)C11_52 -2.845e-02 1.405e-02 -2.025 0.04322 *
## fourier(K = 12)S11_52 2.552e-02 1.404e-02 1.818 0.06957 .
## fourier(K = 12)C12_52 -1.183e-02 1.404e-02 -0.842 0.40001
## fourier(K = 12)S12_52 -2.507e-02 1.405e-02 -1.784 0.07480 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2676 on 700 degrees of freedom
## Multiple R-squared: 0.8518, Adjusted R-squared: 0.8465
## F-statistic: 160.9 on 25 and 700 DF, p-value: < 2.22e-16
autoplot(gas_new, series="Data", color = "blue") +
autolayer(fitted(fourier_gas3), series="Fitted") +
xlab("Year") + ylab("Gas Supply") +
ggtitle("Gas Supply (K=12)")
## Plot variable not specified, automatically selected `.vars = Barrels`
## Warning: Ignoring unknown parameters: series
## Plot variable not specified, automatically selected `.vars = .fitted`
## Warning: Ignoring unknown parameters: series
##b
glance(fourier_gas) %>% select(CV, AICc)
## # A tibble: 1 × 2
## CV AICc
## <dbl> <dbl>
## 1 0.0819 -1814.
glance(fourier_gas2) %>% select(CV, AICc)
## # A tibble: 1 × 2
## CV AICc
## <dbl> <dbl>
## 1 0.0740 -1887.
glance(fourier_gas3) %>% select(CV, AICc)
## # A tibble: 1 × 2
## CV AICc
## <dbl> <dbl>
## 1 0.0742 -1884.
##c
fourier_gas2 %>% gg_tsresiduals()
resid <- resid(fourier_gas2)
ljung_box(resid$.resid, lag=1, dof=0)
## lb_stat lb_pvalue
## 5.23083693 0.02218985
##d
gascast <- forecast(fourier_gas2, newdata=data.frame(fourier(gas_new,12,52)))
gas_new2<- gas[727:780,]
gascast %>%
autoplot(gas_new, level = NULL) +
autolayer(gas_new2, Barrels, colour = "black") +
labs(x = "Year", y = "Gas Supply") +
guides(colour = guide_legend(title = "Forecast"))
Plot below. Yes, we can see in the plot a period of time in which the population is convex and its during the time of the war.
Looking at both models it is clear that the piecewise trend model performs better, and it is shown visually with how well it aligns with the actual data.
From the plots we can see that again the piecewise forecast is seemingly better than the linear trend model.
##a
global_economy %>% filter(Country == "Afghanistan") %>%
autoplot(Population) +
labs(y = "Population",
title = "Population by Year, Afghanistan")
##b
afghan_pop <- global_economy %>% filter(Country == "Afghanistan")
fit_trends <- afghan_pop %>%
model(trend_model = TSLM(Population ~ trend()), piecewise = TSLM(Population ~ trend(knots = c(1980, 1989))))
fc_trends <- fit_trends %>% forecast(h = 10)
afghan_pop %>%
autoplot(Population) +
geom_line(data = fitted(fit_trends),
aes(y = .fitted, colour = .model)) +
autolayer(fc_trends, alpha = 0.5, level = 95) +
labs(y = "Population",
title = "Population of Afghanistan")
##c- Kept getting 2 year forecasts, could not figure out a fix
popcast <- forecast(fit_trends, newdata=data.frame(afghan_pop = 2018, 2019, 2020, 2021, 2022))
popcast %>%
autoplot(afghan_pop, level = NULL) +
labs(x = "Year", y = "Population") +
guides(colour = guide_legend(title = "Forecast"))
(For advanced readers following on from Section 7.9). Using matrix notation it was shown that if y=Xβ+ε, where ε has mean 0 and variance matrix σ2I, the estimated coefficients are given by ^β=(X′X)−1X′y and a forecast is given by y=x∗β=x∗(X′X)−1X′y where x∗ is a row vector containing the values of the predictors for the forecast (in the same format as X), and the forecast variance is given by Var(^y)=σ2[1+x∗(X′X)−1(x∗)′]. Consider the simple time trend model where yt=β0+β1t.
Using the following results,T∑t=1t=12T(T+1),T∑t=1t2=16T(T+1)(2T+1)
derive the following expressions: a. X′X=16[6T3T(T+1)3T(T+1)T(T+1)(2T+1)] b. (X′X)−1=2T(T2−1)[(T+1)(2T+1)−3(T+1)−3(T+1)6] c. ^β0=2T(T−1)[(2T+1)T∑t=1yt−3T∑t=1tyt] ^β1=6T(T2−1)[2T∑t=1tyt−(T+1)T∑t=1yt] d. Var(yt)=σ2[1+2T(T−1)(1−4T−6h+6(T+h)2T+1)]