Libraries:
# Libraries ---------------------------------------------------------------
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('lubridate')
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library('forecast')
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library('feasts')
## Loading required package: fabletools
##
## Attaching package: 'fabletools'
## The following objects are masked from 'package:forecast':
##
## accuracy, forecast
library('fpp3')
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tsibble 1.1.1 v fable 0.3.1
## v tsibbledata 0.4.0
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x fabletools::forecast() masks forecast::forecast()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag() masks stats::lag()
## x tsibble::setdiff() masks base::setdiff()
## x tsibble::union() masks base::union()
library('gridExtra')
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Chapter 3:
Question 1
# data cleaning/Exploration -----------------------------------------------
view(global_economy)
colSums(is.na(global_economy))
## Country Code Year GDP Growth CPI Imports
## 0 0 0 3322 3756 7480 4554
## Exports Population
## 4563 3
nrow(global_economy)
## [1] 15150
colSums(global_economy == "")
## Country Code Year GDP Growth CPI Imports
## 0 0 0 NA NA NA NA
## Exports Population
## NA NA
global <- na.omit(global_economy)
Though it may not be best practice, especially with official data, the NAs were removed because some of them were causing issues when plotting.
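A lighter-touch alternative (a sketch; global_alt is just an illustrative name) would be to drop rows only where the variables this question actually uses are missing:
# keep rows where GDP and Population are present, leaving other NAs alone
global_alt <- global_economy %>%
filter(!is.na(GDP), !is.na(Population))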
#creating the GDP Per Capita column
global <- mutate(global, GPC = GDP/Population)
#too many countries to plot at once, so keep only the highest GPCs; anything over the third quartile (8526.97) should do
summary(global$GPC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37.52 661.88 2147.46 7884.37 8526.97 119225.38
global %>%
filter(GPC > 8526.97) %>%
ggplot(aes(Year, GPC, col = Country)) +
geom_line()
#I'm colorblind so this doesn't help :(
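Since the default palette is hard to tell apart, a colourblind-friendly scale might help. A minimal sketch using ggplot2's built-in viridis scale:
global %>%
filter(GPC > 8526.97) %>%
ggplot(aes(Year, GPC, col = Country)) +
geom_line() +
scale_colour_viridis_d() # perceptually uniform palette, easier to distinguish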
summary(global$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1961 1983 1997 1995 2008 2017
global %>%
filter(Year == 1961) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 United States 3067. 1961
# US w/ 3067
global %>%
filter(Year == 1985) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 United States 18269. 1985
#US w/ 18269
global %>%
filter(Year == 2000) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 Luxembourg 48736. 2000
#Luxembourg at 48736
global %>%
filter(Year == 2017) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 Luxembourg 104103. 2017
global %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 Luxembourg 119225. 2014
global %>%
filter(Country == "Luxembourg") %>%
autoplot(GPC) +
labs(title = "Luxembourg GDP Per Capita",
y = "GDP Per Capita",
x = "Year") +
autolayer(filter(global, Country == "United States"), GPC, color = "Blue")
The country with the largest GDP per capita (GPC) was Luxembourg in 2014, at a GPC of 119,225. Sampling the data across years shows this: originally the United States had the largest GPC, but Luxembourg took the lead over the years. Luxembourg’s GPC skyrocketed in the 2000s and started to stagnate around the 2010s.
Question 2
Global Economy Data
global %>%
filter(Country == "United States") %>%
ggplot(aes(Year, log(GDP))) +
geom_line() +
labs(y = "Log GDP", title = "United States GDP")
The log of the data was taken to more accurately reflect the pattern of the series while keeping the numbers manageable, since the log scale expresses changes in percentage terms.
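As a quick illustration of that percentage interpretation (a sketch; growth_pct is just an illustrative column name), first differences of log(GDP) approximate year-over-year growth rates:
global %>%
filter(Country == "United States") %>%
mutate(growth_pct = 100 * (log(GDP) - lag(log(GDP)))) %>%
select(Year, GDP, growth_pct)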
Australian Livestock Data
aus_livestock %>%
filter(Animal == "Bulls, bullocks and steers" & State == "Victoria") %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Count`
A Box-Cox transformation could be taken to help with the seasonal variation, but none was taken because the data are counts.
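Out of curiosity, Guerrero's method can still be asked what lambda it would pick here (a sketch):
aus_livestock %>%
filter(Animal == "Bulls, bullocks and steers", State == "Victoria") %>%
features(Count, features = guerrero)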
Victorian Electricity Demand
vic_elec %>%
filter(year(Date) == 2014)%>%
gg_season(Demand, period = "year")
view(vic_elec)
No transformation was done for this data. It has high seasonal variation and could benefit from a Box-Cox transformation, but with this many observations the benefit isn’t visible. This data would benefit from a decomposition to understand it better.
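A minimal sketch of that suggested decomposition, aggregating to daily totals and letting STL pick its defaults:
vic_elec %>%
index_by(Date = as_date(Time)) %>%
summarise(Demand = sum(Demand)) %>%
model(STL(Demand)) %>%
components() %>%
autoplot()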
Australian Beer Production
lam_beer <- aus_production %>%
features(Beer, features = guerrero) %>%
pull(lambda_guerrero)
aus_production %>%
autoplot(box_cox(Beer, lam_beer))
A Box-Cox transformation was used for this data to help with the increasing seasonal variation. The seasonality is still there, but to investigate the data further a decomposition should be used.
Question 3
head(canadian_gas)
## # A tsibble: 6 x 2 [1M]
## Month Volume
## <mth> <dbl>
## 1 1960 Jan 1.43
## 2 1960 Feb 1.31
## 3 1960 Mar 1.40
## 4 1960 Apr 1.17
## 5 1960 May 1.12
## 6 1960 Jun 1.01
plot_1 <- canadian_gas %>%
autoplot() +
labs(title = "Original Data: Canadian Gas Volume")
## Plot variable not specified, automatically selected `.vars = Volume`
lambda <- canadian_gas %>%
features(Volume, features = guerrero) %>%
pull(lambda_guerrero)
Lam_plot <- canadian_gas %>%
autoplot(box_cox(Volume, lambda)) +
labs(y = "",
title = paste("Transformed Volume with lambda =", round(lambda, 2)))
grid.arrange(plot_1, Lam_plot, ncol = 2)
The Box-Cox transformation helps reduce the seasonal variation of the data, especially in the ’80s and ’90s.
Question 4
set.seed(1234567)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>%
autoplot() +
labs(title = "Australian Retail Turnover")
## Plot variable not specified, automatically selected `.vars = Turnover`
retail_lam <- myseries %>%
features(Turnover, features = guerrero) %>%
pull(lambda_guerrero)
retail_lam
## [1] 0.1541981
myseries %>%
autoplot(box_cox(Turnover, retail_lam)) +
labs(y = "",
title = paste("Transformed Turnover with lambda =", round(retail_lam, 2)))
According to the Guerrero approach to choosing a lambda, the value selected is roughly 0.15. Though I have used set.seed(), there still seems to be some variation between runs, so the selected lambda might be subject to change.
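One way to pin the draw down (a sketch; whether this removes the instability depends on where it comes from) is to evaluate sample() exactly once, outside filter():
set.seed(1234567)
series_id <- sample(aus_retail$`Series ID`, 1) # drawn a single time
myseries <- aus_retail %>%
filter(`Series ID` == series_id)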
Question 5
tobacco_lam <- aus_production %>%
features(Tobacco, features = guerrero) %>%
pull(lambda_guerrero)
tobacco_lam
## [1] 0.9289402
air <- ansett %>%
filter(Airports == "MEL-SYD" & Class =="Economy")
Econ_lam <- air %>%
features(Passengers, features = guerrero) %>%
pull(lambda_guerrero)
Econ_lam
## [1] 1.999927
ped <- pedestrian %>%
filter(Sensor == "Southern Cross Station")
ped_lam <- ped %>%
features(Count, features = guerrero) %>%
pull(lambda_guerrero)
ped_lam
## [1] -0.2255423
To find the lambdas, I used the Guerrero method to generate them automatically. The lambdas found to fit the data best were 0.93 for Australian tobacco production, 1.99 (roughly 2) for MEL-SYD Economy class passengers, and -0.23 for pedestrian counts at Southern Cross Station.
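As a quick sanity check on one of these (a sketch), the pedestrian series can be plotted with its Guerrero lambda applied:
ped %>%
autoplot(box_cox(Count, ped_lam))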
Question 6
#Show that a 3×5 MA is equivalent to a
#7-term weighted moving average with weights of 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, and 0.067
#first 5-MA: the average of the first five weights above
(0.067+0.133+0.200+0.200+0.200)/5
## [1] 0.16
#0.16
#second 5-MA: the average of the middle five weights
(0.133+0.200+0.200+0.200+0.133)/5
## [1] 0.1732
#0.1732
(0.200+0.200+0.200+0.133+0.067)/5
## [1] 0.16
#0.16
# the 3x5 MA: a 3-MA of the three 5-MA values, giving 0.1644
(0.16+0.1732+0.16)/3
## [1] 0.1644
# 7-term weighted moving average applied to the same values
(1/15*(0.067))+2/15*(0.133)+(3/15*(0.2))+(3/15*(0.2))+(3/15*(0.2))+(2/15*(0.133))+(1/15*(0.067))
## [1] 0.1644
Above are the calculations of the three 5-MAs and then their 3-MA, alongside the 7-term weighted moving average applied to the same values. Both give 0.1644, showing the two are equivalent.
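The weight algebra can also be checked directly: convolving the 5-MA weights (1/5 each) with the 3-MA weights (1/3 each) should reproduce the seven weights quoted in the question. A sketch:
convolve(rep(1/5, 5), rep(1/3, 3), type = "open")
# expected: 1/15, 2/15, 3/15, 3/15, 3/15, 2/15, 1/15
# i.e. 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067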
Question 7
#A Seasonal Fluctuations
gas <- tail(aus_production, 5*4) %>% select(Gas)
gas %>%
autoplot(Gas) +
labs(title = "Australian Gas Production",
y= "Gas Production",
x= "Time in Quarters")
Plot_gas <- gas %>%
autoplot(Gas) +
labs(title = "Australian Gas Production",
y= "Gas Production",
x= "Time in Quarters")
The trend is positive and upward sloping throughout the data. The seasonal pattern is an increase from Q1 to Q2, then a decrease from Q3 to Q4, but the seasonal rises are larger than the falls.
#B Multiplicative Decomposition
gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Australian Gas Production")
## Warning: Removed 2 row(s) containing missing values (geom_path).
Part C
The analysis of the trend in part A was accurate: positive and upward sloping. The interpretation of the seasonality was slightly off; the reason the seasonal rises looked larger than the falls was the upward trend. The seasonal component does retain the pattern of increasing from Q1 to Q2 and decreasing from Q2 through Q4.
#D Seasonally adjusted data
gas_dcmp <- gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")) %>%
components()
gas_seas <- as_tsibble(bind_cols(gas$Quarter,gas_dcmp$season_adjust))
## New names:
## Using `...1` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_seas <- gas_seas %>%
autoplot() +
labs(title = "Australian Gas Production without Seasonality",
y = "Gas Production",
x = "Time in Quarters")
## Plot variable not specified, automatically selected `.vars = ...2`
grid.arrange(Plot_gas,plot_seas)
#E Outlier
gas_adjusted <- gas
gas_adjusted[12,1] <- gas[12,1] + 500
view(gas_adjusted)
adjust_dcmp <- gas_adjusted %>%
model(
classical_decomposition(Gas, type = "multiplicative")) %>%
components()
gas_adj <- as_tsibble(bind_cols(gas$Quarter,adjust_dcmp$season_adjust))
## New names:
## Using `...1` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_adjust <- gas_adj %>%
autoplot() +
labs(title = "Australian Gas Production Adjusted",
y = "Gas Production",
x = "Time in Quarters")
## Plot variable not specified, automatically selected `.vars = ...2`
grid.arrange(Plot_gas, plot_seas, plot_adjust)
The outlier overwhelms the trend in the seasonally adjusted plot, making it hard to see, while the seasonality of the data is still represented in the graph.
#F Outlier near the end
adjusted_2 <- gas
adjusted_2[18,1] <- gas[18,1] + 500
adjusted_2 %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Gas`
A2_dcmp <- adjusted_2 %>%
model(
classical_decomposition(Gas, type = "multiplicative")) %>%
components()
gas_A2 <- as_tsibble(bind_cols(gas$Quarter,A2_dcmp$season_adjust))
## New names:
## Using `...1` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_A2 <- gas_A2 %>%
autoplot() +
labs(title = "Australian Gas Production Adjusted",
y = "Gas Production",
x = "Time in Quarters")
## Plot variable not specified, automatically selected `.vars = ...2`
grid.arrange(Plot_gas, plot_seas,plot_adjust, plot_A2)
If the outlier is near the end, you can still get a good sense of the seasonality in the data. If the outlier is more towards the middle, it is more difficult to pick the trend out of the data.
Question 8
set.seed(12345678)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Turnover`
myseries %>%
gg_season()
## Plot variable not specified, automatically selected `y = Turnover`
myseries %>%
gg_subseries()
## Plot variable not specified, automatically selected `y = Turnover`
myseries %>%
gg_lag()
## Plot variable not specified, automatically selected `y = Turnover`
x11_dcmp <- myseries %>%
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
components()
autoplot(x11_dcmp) +
labs(title =
"Decomposition of Autralian Turnover using X-11.")
The irregular portion of the decomposition provides more detail than the functions used in Section 2.10 #8. The irregular panel shows outliers that weren’t otherwise visible, such as the dip in the late ’80s and another around 2011.
Question 9
The decomposition of the Australian labor force is interesting because the value and trend panels are both positive and upward sloping. The seasonal panel fluctuates between plus and minus 100, while the remainder’s scale runs from 0 to -400. Though the labor force is always increasing, the remainder shows that some sort of recession happened. The recession can be faintly seen in the value panel, where there is a consistent dip for about two years, but it isn’t as noticeable as in the remainder panel.
The recession is noticeable in the seasonal components graph as well. Most months show a dip after 1990, most prevalently March, April, August, and November.
Question 10
#A Functions
canadian_gas %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Volume`
canadian_gas %>%
gg_subseries()
## Plot variable not specified, automatically selected `y = Volume`
canadian_gas %>%
gg_season()
## Plot variable not specified, automatically selected `y = Volume`
#B STL Decomposition
can_dcmp <- canadian_gas %>%
model(
STL(Volume ~ trend(window = 13) +
season(window = 9),
robust = TRUE)) %>%
components()
#C Seasonal adjustments
can_season <- as_tsibble(bind_cols(can_dcmp$season_year, canadian_gas$Month))
## New names:
## Using `...2` as index variable.
## * `` -> `...1`
## * `` -> `...2`
can_season %>%
gg_season() +
labs(title = "Seasonal Analysis: Canadian Gas",
y = "Seasonal Adjustment",
x= "Time in Months")
## Plot variable not specified, automatically selected `y = ...1`
The seasonal shape changes over time in an overall U-shaped pattern. There are seasonal peaks and valleys, with the largest being in February.
#D Plausible Seasonally Adjusted Series
Can_adj <- as_tsibble(bind_cols(can_dcmp$season_adjust, canadian_gas$Month))
## New names:
## Using `...2` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_stl <- Can_adj %>%
autoplot()+
labs(title = "Seasonally Adjusted Gas Production(STL)",
y = "Volume",
x= "Time in Months")
## Plot variable not specified, automatically selected `.vars = ...1`
plot_stl
#E Seats X-11
x11_gas <- canadian_gas %>%
model(x11 = X_13ARIMA_SEATS(Volume ~ x11())) %>%
components()
autoplot(x11_gas) +
labs(title =
"Decomposition of Canadian Gas Production using X-11.")
Cad_x11 <- as_tsibble(bind_cols(x11_gas$season_adjust, canadian_gas$Month))
## New names:
## Using `...2` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_cadx11 <- Cad_x11 %>%
autoplot()+
labs(title = "Seasonally Adjusted Gas Production (X11)",
y = "Volume",
x= "Time in Months")
## Plot variable not specified, automatically selected `.vars = ...1`
plot_cadx11
grid.arrange(plot_stl, plot_cadx11)
The decomposition using X-11 differs from the STL one. The X-11 seasonal component shows more inflated seasonality at the beginning than STL does. The seasonally adjusted graphs look similar overall, most likely because the trends are about the same. Small bits of seasonality are still visible in the data, and they differ between the two.
Chapter 7:
Question 1
jan14_vic_elec <- vic_elec %>%
filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
daily<- ts(jan14_vic_elec)
#A.)plotting Demand~temperature
ggplot(jan14_vic_elec, aes(Temperature, Demand)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
The relationship is positive and roughly linear: as temperatures get higher, people want to cool their homes and businesses, which requires more electricity to run ACs at higher output.
#B.) Residual plot
fit_jan <- jan14_vic_elec %>%
model(TSLM(Demand~Temperature))
report(fit_jan)
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49978.2 -10218.9 -121.3 18533.2 35440.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59083.9 17424.8 3.391 0.00203 **
## Temperature 6154.3 601.3 10.235 3.89e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
## F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
fit_jan %>% gg_tsresiduals()
There seems to be some heteroskedasticity in the model: the residuals are right-skewed in all of the plots. The exception is the ACF plot, which has a spike at the first lag and little else, so there is no autocorrelation according to the ACF. This type of model doesn’t fit the data well even though it has an R-squared of 0.78; given how the observations cluster, a nonparametric model like KNN might fit better if this weren’t time series data.
#C Forecast
#Forecast for 15 degrees
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)
) %>%
autoplot(jan14_vic_elec)
#forecast for 35 degrees
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 35)
) %>%
autoplot(jan14_vic_elec)
#D Prediction interval
temp_model <- tslm(Demand~Temperature, data = daily)
fcasttemp15 <- forecast(temp_model, newdata=data.frame(Temperature=15))
fcasttemp15
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 32 151398.4 117127.2 185669.5 97951.22 204845.5
fcasttemp35 <- forecast(temp_model, newdata=data.frame(Temperature=35))
fcasttemp35
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 32 274484.2 241333 307635.5 222783.7 326184.8
The forecasted demand is 151,398 when the temperature is 15 degrees and 274,484 when the temperature is 35 degrees. The 95% prediction interval is 97,951 - 204,846 for 15 degrees and 222,784 - 326,185 for 35 degrees.
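The same intervals can also be pulled from the fable model in part C with hilo(), staying inside the tidyverts framework (a sketch for the 15-degree case):
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)) %>%
hilo(level = 95) # 95% prediction interval as an interval column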
#E
total_elc <- vic_elec %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
ggplot(total_elc, aes(Temperature, Demand)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
fit_elec <- total_elc %>%
model(TSLM(Demand~Temperature))
report(fit_elec)
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -62818.41 -13699.52 -25.83 17452.70 118947.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220345.8 2742.3 80.352 <2e-16 ***
## Temperature 172.0 125.9 1.366 0.172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25470 on 1094 degrees of freedom
## Multiple R-squared: 0.001702, Adjusted R-squared: 0.0007897
## F-statistic: 1.865 on 1 and 1094 DF, p-value: 0.17229
The new model has an adjusted R^2 of 0.00079 compared with the original model’s adjusted R^2 of 0.78. This shows that the original model fit its sample of the data well but was not representative of the population.
Question 2:
#A.)
colSums(is.na(olympic_running))
## Year Length Sex Time
## 0 0 0 31
colSums(olympic_running == "")
## Year Length Sex Time
## 0 0 0 NA
#There are NAs during the World Wars; na.omit() was used for a better look at the data
run<- na.omit(olympic_running)
run
## # A tsibble: 281 x 4 [4Y]
## # Key: Length, Sex [14]
## Year Length Sex Time
## <int> <int> <chr> <dbl>
## 1 1896 100 men 12
## 2 1900 100 men 11
## 3 1904 100 men 11
## 4 1908 100 men 10.8
## 5 1912 100 men 10.8
## 6 1920 100 men 10.8
## 7 1924 100 men 10.6
## 8 1928 100 men 10.8
## 9 1932 100 men 10.3
## 10 1936 100 men 10.3
## # ... with 271 more rows
run %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Time`
run %>%
filter(Sex == "men") %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Time`
run %>%
filter(Sex == "women") %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Time`
#The times seem to be slightly decreasing
The times seem to be decreasing with every new Olympics. There is a clear difference in times between the men’s and women’s events. There were originally gaps in the data due to the World Wars, so those years were omitted.
#B Regression
#Seeing the data we will build a regression for
run %>%
filter(Sex == "men" & Length == 100) %>%
ggplot(aes(Year, Time)) +
geom_line() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
#men 100 - Building the model
men_100 <- run %>%
filter(Sex == "men" & Length == 100) %>%
model(TSLM(Time~trend()))
#getting the forecast
report(men_100)
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3443995 -0.1081360 0.0007715 0.0750701 0.9015537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.148896 0.090895 122.66 < 2e-16 ***
## trend() -0.050450 0.004794 -10.52 7.24e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2321 on 26 degrees of freedom
## Multiple R-squared: 0.8099, Adjusted R-squared: 0.8026
## F-statistic: 110.8 on 1 and 26 DF, p-value: 7.2403e-11
one_fore <- forecast(men_100)
one_fore$Time
## <distribution[2]>
## 1 2
## N(9.5, 0.061) N(9.5, 0.062)
#took the forecasted time of 9.5 from above and plugged it into similar code used in question 1
run %>%
filter(Sex == "men" & Length == 100) %>%
model(TSLM(Time~trend())) %>%
forecast(
new_data(run,1)%>%
mutate(Time = 9.5)
) %>%
autoplot(run %>%
filter(Sex == "men" & Length == 100))
#men 200
men_200 <- run %>%
filter(Sex == "men" & Length == 200) %>%
model(TSLM(Time~trend()))
two_fore <- forecast(men_200)
two_fore$Time
## <distribution[2]>
## 1 2
## N(19, 0.13) N(19, 0.13)
run %>%
filter(Sex == "men" & Length == 200) %>%
model(TSLM(Time~trend())) %>%
forecast(
new_data(run,1)%>%
mutate(Time = 19)
) %>%
autoplot(run %>%
filter(Sex == "men" & Length == 200))
olympic_model <- run %>%
model(TSLM(Time~trend()))
report(olympic_model)
## Warning in report.mdl_df(olympic_model): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 14 x 17
## Length Sex .model r_squared adj_r_squared sigma2 statistic p_value df
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 100 men TSLM(T~ 0.810 0.803 5.39e-2 111. 7.24e-11 2
## 2 100 women TSLM(T~ 0.751 0.738 4.98e-2 57.4 3.72e- 7 2
## 3 200 men TSLM(T~ 0.883 0.878 1.10e-1 189. 3.80e-13 2
## 4 200 women TSLM(T~ 0.686 0.667 2.51e-1 35.0 2.17e- 5 2
## 5 400 men TSLM(T~ 0.823 0.817 1.29e+0 121. 2.75e-11 2
## 6 400 women TSLM(T~ 0.316 0.259 1.06e+0 5.54 3.65e- 2 2
## 7 800 men TSLM(T~ 0.746 0.736 1.15e+1 73.5 6.47e- 9 2
## 8 800 women TSLM(T~ 0.655 0.631 1.16e+1 26.6 1.45e- 4 2
## 9 1500 men TSLM(T~ 0.686 0.674 6.55e+1 56.9 5.23e- 8 2
## 10 1500 women TSLM(T~ 0.161 0.0769 2.57e+1 1.92 1.96e- 1 2
## 11 5000 men TSLM(T~ 0.816 0.808 2.45e+2 97.6 1.50e- 9 2
## 12 5000 women TSLM(T~ 0.00762 -0.240 8.37e+2 0.0307 8.69e- 1 2
## 13 10000 men TSLM(T~ 0.897 0.893 8.35e+2 192. 2.37e-12 2
## 14 10000 women TSLM(T~ 0.806 0.774 3.30e+2 24.9 2.47e- 3 2
## # ... with 8 more variables: log_lik <dbl>, AIC <dbl>, AICc <dbl>, BIC <dbl>,
## # CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>
olympic_forecast <- olympic_model %>%
forecast()
This code was originally applied to all of the data, but the result was too condensed and didn’t provide any value, so it was broken down into two examples to show the approach works for all of them. The models were originally split between men and women to lessen the clutter. A forecast was then run on all the series to see how it would come out.
#c Residuals against the year
augment(men_200) %>%
ggplot(aes(x = Year, y = .resid)) +
geom_point() +
labs(
y = "Residuals",
x = "Year)",
title = "Residuals Plotted Over Time"
) +
geom_hline(yintercept = 0) # zero line for the residuals
There is a slight pattern in the residuals, indicating some heteroskedasticity.
#d Predict winning times
oly_predict <- olympic_forecast %>%
filter(Year == 2020 & Sex =='men')
table(oly_predict$Length, oly_predict$Time)
##
## N(9.5, 0.061) N(19, 0.13) N(42, 1.5) N(99, 13) N(207, 74) N(773, 285)
## 100 1 0 0 0 0 0
## 200 0 1 0 0 0 0
## 400 0 0 1 0 0 0
## 800 0 0 0 1 0 0
## 1500 0 0 0 0 1 0
## 5000 0 0 0 0 0 1
## 10000 0 0 0 0 0 0
##
## N(1580, 970)
## 100 0
## 200 0
## 400 0
## 800 0
## 1500 0
## 5000 0
## 10000 1
oly_women <- olympic_forecast %>%
filter(Year == 2020 & Sex =='women')
table(oly_women$Length, oly_women$Time)
##
## N(11, 0.059) N(21, 0.31) N(48, 1.4) N(111, 14) N(245, 35) N(892, 1562)
## 100 1 0 0 0 0 0
## 200 0 1 0 0 0 0
## 400 0 0 1 0 0 0
## 800 0 0 0 1 0 0
## 1500 0 0 0 0 1 0
## 5000 0 0 0 0 0 1
## 10000 0 0 0 0 0 0
##
## N(1763, 530)
## 100 0
## 200 0
## 400 0
## 800 0
## 1500 0
## 5000 0
## 10000 1
The assumptions we make for this data are that the errors have mean zero, that there is no autocorrelation, and that the predictor variable is unrelated to the errors.
Question 3
The equation for elasticity is B = (dY/dX)*(X/Y). Taking the derivative of the log-log model with respect to X yields exactly this quantity, so the slope coefficient is the elasticity.
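Writing the step out (a sketch of the derivation): the log-log model is log(Y) = B0 + B1*log(X). Differentiating both sides gives dY/Y = B1*(dX/X), and rearranging yields B1 = (dY/dX)*(X/Y), which is the elasticity.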
Question 4
#A Time Series Plot
souvenirs %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Sales`
There is a seasonal increase that can be clearly seen around Christmas, especially in 1994, and another slight increase around March for the surfing festival.
#B Logarithm
souvenirs %>%
ggplot(aes(x=Month, y = log(Sales)))+
geom_line()
Taking the log of a variable puts changes in percentage terms. This makes large numbers easier to interpret and their pattern more recognizable in a graph. From a modelling perspective, an additional unit of the predictor changes y by a percentage, which is easier to interpret for large numbers and useful for various mathematical/economic purposes like elasticity.
#C tslm model with dummy variables
#creating the surfing festival dummy variable
festival <- rep(0, nrow(souvenirs))
festival[seq_along(festival) %% 12 == 3] <- 1 # flag March of every year (series starts Jan 1987)
festival[3] <- 0 # the festival only began in 1988, so March 1987 is reset to 0
festival <- ts(festival, frequency = 12, start = c(1987, 1))
#getting the data together
souv <- souvenirs
souv$Sales <- log(souv$Sales)
souv <- bind_cols(souv, festival)
## New names:
## * `` -> `...3`
names(souv)[names(souv) == "...3"] <- "festival"
souv
## # A tsibble: 84 x 3 [1M]
## Month Sales festival
## <mth> <dbl> <dbl>
## 1 1987 Jan 7.42 0
## 2 1987 Feb 7.78 0
## 3 1987 Mar 7.95 0
## 4 1987 Apr 8.17 0
## 5 1987 May 8.23 0
## 6 1987 Jun 8.22 0
## 7 1987 Jul 8.38 0
## 8 1987 Aug 8.18 0
## 9 1987 Sep 8.52 0
## 10 1987 Oct 8.77 0
## # ... with 74 more rows
souvlm <- souv %>%
model(TSLM(Sales~trend()+season()+festival))
report(souvlm)
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.336727 -0.127571 0.002568 0.109106 0.376714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend() 0.0220198 0.0008268 26.634 < 2e-16 ***
## season()year2 0.2514168 0.0956790 2.628 0.010555 *
## season()year3 0.2660828 0.1934044 1.376 0.173275
## season()year4 0.3840535 0.0957075 4.013 0.000148 ***
## season()year5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season()year6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season()year7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season()year8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season()year9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season()year10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season()year11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season()year12 1.9622412 0.0961066 20.417 < 2e-16 ***
## festival 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.22e-16
#D residuals and fitted values
res <- residuals(souvlm)
fit <- as.vector(fitted(souvlm))
fit_res <- as.vector(residuals(souvlm))
#residuals over time
res %>%
autoplot() +
labs(title = "Residuals Against Time",
y = "Residuals",
x = "Year")
## Plot variable not specified, automatically selected `.vars = .resid`
#residuals against fitted values
res %>% ggplot(aes(x = fit$.fitted, y = .resid)) +
geom_point() +
labs(
y = "Residuals",
x = "Fitted",
title = "Fitted Values v. Residuals"
)
Overall, both plots are fairly random in nature. There seems to be some rightward skew, which could indicate some heteroskedasticity.
#E Boxplot
res.month <- data.frame(res, month = rep(1:12, 7))
boxplot(res$.resid~month, data = res.month)
The residuals are not normally distributed across the months. April, June, and November show the least variation.
#F
report(souvlm)
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.336727 -0.127571 0.002568 0.109106 0.376714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend() 0.0220198 0.0008268 26.634 < 2e-16 ***
## season()year2 0.2514168 0.0956790 2.628 0.010555 *
## season()year3 0.2660828 0.1934044 1.376 0.173275
## season()year4 0.3840535 0.0957075 4.013 0.000148 ***
## season()year5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season()year6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season()year7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season()year8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season()year9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season()year10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season()year11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season()year12 1.9622412 0.0961066 20.417 < 2e-16 ***
## festival 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.22e-16
souvlm %>% gg_tsresiduals()
The seasonal coefficients increase throughout the year, showing a clear pattern. The residual plots are skewed, and the ACF plot shows autocorrelation.
#G Box test
augment(souvlm) %>%
features(.innov, ljung_box, lag = 10, dof = 4)
## # A tibble: 1 x 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Sales ~ trend() + season() + festival) 69.8 4.54e-13
With a p-value of less than 0.05, it can be confirmed that the residuals are distinguishable from white noise and the model has autocorrelation.
#H plot it anyway
festival_2 <- rep(0,36)
festival_2[seq_along(festival_2)%%12 == 3] <- 1
more_festive <- data.frame(festival_2)
y <- tsibble(Month = rep(yearmonth("1994 Jan")+0:35))
## Using `Month` as index variable.
y <- bind_cols(y, more_festive)
names(y)[names(y) == "mth"] <- "Month"
names(y)[names(y) == "festival_2"] <- "festival"
y
## # A tsibble: 36 x 2 [1M]
## Month festival
## <mth> <dbl>
## 1 1994 Jan 0
## 2 1994 Feb 0
## 3 1994 Mar 1
## 4 1994 Apr 0
## 5 1994 May 0
## 6 1994 Jun 0
## 7 1994 Jul 0
## 8 1994 Aug 0
## 9 1994 Sep 0
## 10 1994 Oct 0
## # ... with 26 more rows
fore_souv <- forecast(souvlm, new_data = y)
fore_souv %>%
autoplot(souv)
fore_souv$Sales
## <distribution[36]>
## 1 2 3 4 5
## N(9.5, 0.038) N(9.8, 0.038) N(10, 0.039) N(9.9, 0.038) N(10, 0.038)
## 6 7 8 9 10
## N(10, 0.038) N(10, 0.038) N(10, 0.038) N(10, 0.038) N(10, 0.038)
## 11 12 13 14 15
## N(11, 0.038) N(12, 0.038) N(9.8, 0.039) N(10, 0.039) N(11, 0.039)
## 16 17 18 19 20
## N(10, 0.039) N(10, 0.039) N(10, 0.039) N(10, 0.039) N(10, 0.039)
## 21 22 23 24 25
## N(11, 0.039) N(11, 0.039) N(11, 0.039) N(12, 0.039) N(10, 0.04)
## 26 27 28 29 30
## N(10, 0.04) N(11, 0.04) N(10, 0.04) N(11, 0.04) N(11, 0.04)
## 31 32 33 34 35
## N(11, 0.04) N(11, 0.04) N(11, 0.04) N(11, 0.04) N(11, 0.04)
## 36
## N(12, 0.04)
Question 5
gas <- ts(us_gasoline$Barrels, frequency = 52, start = 1991)
gas %>%
autoplot()
fourier_gas <- us_gasoline %>%
model(TSLM(Barrels ~ trend() + fourier(K = 4)))
report(fourier_gas)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15495 -0.31437 0.01426 0.34554 0.95759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.579e+00 2.302e-02 329.272 < 2e-16 ***
## trend() 1.438e-03 2.941e-05 48.877 < 2e-16 ***
## fourier(K = 4)C1_52 -1.134e-01 1.628e-02 -6.964 5.17e-12 ***
## fourier(K = 4)S1_52 -2.318e-01 1.625e-02 -14.265 < 2e-16 ***
## fourier(K = 4)C2_52 4.333e-02 1.626e-02 2.665 0.00779 **
## fourier(K = 4)S2_52 3.001e-02 1.626e-02 1.845 0.06522 .
## fourier(K = 4)C3_52 8.517e-02 1.625e-02 5.241 1.86e-07 ***
## fourier(K = 4)S3_52 3.532e-02 1.627e-02 2.171 0.03013 *
## fourier(K = 4)C4_52 1.847e-02 1.627e-02 1.135 0.25658
## fourier(K = 4)S4_52 4.180e-02 1.625e-02 2.573 0.01020 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4232 on 1345 degrees of freedom
## Multiple R-squared: 0.6671, Adjusted R-squared: 0.6648
## F-statistic: 299.4 on 9 and 1345 DF, p-value: < 2.22e-16
There is little change from using more than one Fourier term; the adjusted R-squared only moves by about 0.01. This is probably due to the relatively stable seasonality of the weekly gasoline data.
#B
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 1), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1856335 -2279.9142617 -2279.8697843 -2253.8564780 0.6537409
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 2), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1847782 -2286.1806647 -2286.0975170 -2249.6997676 0.6558450
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 3), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1810039 -2314.1676504 -2314.0338214 -2267.2636398 0.6633752
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 4), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1804806 -2318.1079209 -2317.9113460 -2260.7807968 0.6648444
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 5), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1804563 -2318.3222016 -2318.0507623 -2250.5719640 0.6653876
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 10), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1813963 -2311.5414205 -2310.7119689 -2191.6756156 0.6661503
Among these fits, the lowest CV and AIC actually occur at K = 5, with K = 4 nearly identical; the simpler K = 1 model is carried forward below.
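The same comparison can be run inside fable, avoiding the separate ts object (a sketch; the model names K1 and K5 are just labels):
us_gasoline %>%
model(
K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
K5 = TSLM(Barrels ~ trend() + fourier(K = 5))
) %>%
glance() %>%
select(.model, AICc, CV)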
#C
gas_lm <-us_gasoline %>%
model(TSLM(Barrels ~ trend() + fourier(K = 1)))
gas_lm %>%
gg_tsresiduals()
augment(gas_lm) %>%
features(.innov, ljung_box, lag = 5, dof = 3)
## # A tibble: 1 x 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Barrels ~ trend() + fourier(K = 1)) 2940. 0
With a p-value of 0, there is autocorrelation within the model. This is reinforced by the ACF plot, with every lag outside the bounds.
#D
plot_fivegas<- us_gasoline %>%
filter(year(Week) == 2005) %>%
autoplot() +
labs(title = "2005 Production Per Week",
x = "Weeks")
## Plot variable not specified, automatically selected `.vars = Barrels`
us_gasoline[1355,]
## # A tsibble: 1 x 2 [1W]
## Week Barrels
## <week> <dbl>
## 1 2017 W03 8.04
newgas <- tsibble(Week = rep(yearweek("2017 W04")+0:51))
## Using `Week` as index variable.
newgas
## # A tsibble: 52 x 1 [1W]
## Week
## <week>
## 1 2017 W04
## 2 2017 W05
## 3 2017 W06
## 4 2017 W07
## 5 2017 W08
## 6 2017 W09
## 7 2017 W10
## 8 2017 W11
## 9 2017 W12
## 10 2017 W13
## # ... with 42 more rows
fivegas <- us_gasoline %>%
filter(year(Week)>= 2015)
foregas <- forecast(gas_lm, newgas)
plot_foregas <- foregas %>%
autoplot()
grid.arrange(plot_fivegas, plot_foregas)
I was having issues overlaying the forecast on the 2005 data because of the differing time indexes. The forecast is fairly similar to the 2005 data but smoothed: it starts low, peaks around week 26, declines, and then starts to settle.
Question 6:
#A Population over time
afg <- global_economy %>%
filter(Code == "AFG")
afg%>%
ggplot(aes(x= Year, y = Population)) +
geom_line()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
There is a decline in population starting just before 1980, and growth only resumes just after 1985. The Soviet-Afghan War ran from 1979 to 1989, and the start of the population dip lines up with that period.
# B linear and piecewise models
fit_trends <- afg %>%
model(
linear = TSLM(Population ~ trend()),
piecewise = TSLM(Population ~ trend(knots = c(1979, 1989)))
)
afg%>%
ggplot(aes(x= Year, y = Population)) +
geom_line()+
geom_line(data = fitted(fit_trends),
aes(y=.fitted,colour = .model))+
labs(title = "AFG Population: Linear v. Piecewise")
# C forecasting
fc_trends <- fit_trends %>% forecast(h = 10)
afg%>%
ggplot(aes(x= Year, y = Population)) +
geom_line()+
geom_line(data = fitted(fit_trends),
aes(y=.fitted,colour = .model))+
autolayer(fc_trends, alpha = 0.5, level = 95) +
labs(title = "AFG Population: Linear v. Piecewise with Forecast")
There is significantly more variation in the prediction interval for the linear model than for the piecewise model, suggesting that the piecewise model gives the more plausible forecast.