Libraries:
# Libraries ---------------------------------------------------------------
library('tidyverse')
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.6 v dplyr 1.0.8
## v tidyr 1.2.0 v stringr 1.4.0
## v readr 2.1.2 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library('lubridate')
##
## Attaching package: 'lubridate'
## The following objects are masked from 'package:base':
##
## date, intersect, setdiff, union
library('forecast')
## Registered S3 method overwritten by 'quantmod':
## method from
## as.zoo.data.frame zoo
library('feasts')
## Loading required package: fabletools
##
## Attaching package: 'fabletools'
## The following objects are masked from 'package:forecast':
##
## accuracy, forecast
library('fpp3')
## -- Attaching packages -------------------------------------------- fpp3 0.4.0 --
## v tsibble 1.1.1 v fable 0.3.1
## v tsibbledata 0.4.0
## -- Conflicts ------------------------------------------------- fpp3_conflicts --
## x lubridate::date() masks base::date()
## x dplyr::filter() masks stats::filter()
## x fabletools::forecast() masks forecast::forecast()
## x tsibble::intersect() masks base::intersect()
## x tsibble::interval() masks lubridate::interval()
## x dplyr::lag() masks stats::lag()
## x tsibble::setdiff() masks base::setdiff()
## x tsibble::union() masks base::union()
library('gridExtra')
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
Chapter 3:
Question 1
# data cleaning/Exploration -----------------------------------------------
view(global_economy)
colSums(is.na(global_economy))
## Country Code Year GDP Growth CPI Imports
## 0 0 0 3322 3756 7480 4554
## Exports Population
## 4563 3
nrow(global_economy)
## [1] 15150
colSums(global_economy == "")
## Country Code Year GDP Growth CPI Imports
## 0 0 0 NA NA NA NA
## Exports Population
## NA NA
global <- na.omit(global_economy)
Though it may not be best practice, especially with official data, the NAs were removed because some of them were causing issues when plotting.
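A lighter-touch alternative (a sketch; global_alt is just an illustrative name) would be to drop rows only where the variables this question actually uses are missing:
# keep rows where GDP and Population are present, leaving other NAs alone
global_alt <- global_economy %>%
filter(!is.na(GDP), !is.na(Population))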
#creating the GDP Per Capita column
global <- mutate(global, GPC = GDP/Population)
#too many countries to plot at once, so keep only the highest GPCs; anything over the third quartile (8526.97) should do
summary(global$GPC)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 37.52 661.88 2147.46 7884.37 8526.97 119225.38
global %>%
filter(GPC > 8526.97) %>%
ggplot(aes(Year, GPC, col = Country)) +
geom_line()
#I'm colorblind so this doesn't help :(
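Since the default palette is hard to tell apart, a colourblind-friendly scale might help. A minimal sketch using ggplot2's built-in viridis scale:
global %>%
filter(GPC > 8526.97) %>%
ggplot(aes(Year, GPC, col = Country)) +
geom_line() +
scale_colour_viridis_d() # perceptually uniform palette, easier to distinguish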
summary(global$Year)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1961 1983 1997 1995 2008 2017
global %>%
filter(Year == 1961) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 United States 3067. 1961
# US w/ 3067
global %>%
filter(Year == 1985) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 United States 18269. 1985
#US w/ 18269
global %>%
filter(Year == 2000) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 Luxembourg 48736. 2000
#Luxembourg at 48736
global %>%
filter(Year == 2017) %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 Luxembourg 104103. 2017
global %>%
filter(GPC == max(GPC)) %>%
select(Country, GPC)
## # A tsibble: 1 x 3 [1Y]
## # Key: Country [1]
## Country GPC Year
## <fct> <dbl> <dbl>
## 1 Luxembourg 119225. 2014
global %>%
filter(Country == "Luxembourg") %>%
autoplot(GPC) +
labs(title = "Luxembourg GDP Per Capita",
y = "GDP Per Capita",
x = "Year") +
autolayer(filter(global, Country == "United States"), GPC, color = "Blue")
The country with the largest GDP per capita (GPC) was Luxembourg in 2014, at a GPC of 119,225. Sampling the data across years shows this: originally the United States had the largest GPC, but Luxembourg took the lead over the years. Luxembourg’s GPC skyrocketed in the 2000s and started to stagnate around the 2010s.
Question 2
Global Economy Data
global %>%
filter(Country == "United States") %>%
ggplot(aes(Year, log(GDP))) +
geom_line() +
labs(y = "Log GDP", title = "United States GDP")
The log of the data was taken to more accurately reflect the pattern of the series while keeping the numbers manageable, since the log scale expresses changes in percentage terms.
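As a quick illustration of that percentage interpretation (a sketch; growth_pct is just an illustrative column name), first differences of log(GDP) approximate year-over-year growth rates:
global %>%
filter(Country == "United States") %>%
mutate(growth_pct = 100 * (log(GDP) - lag(log(GDP)))) %>%
select(Year, GDP, growth_pct)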
Australian Livestock Data
aus_livestock %>%
filter(Animal == "Bulls, bullocks and steers" & State == "Victoria") %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Count`
A Box-Cox transformation could be taken to help with the seasonal variation, but none was taken because the data are counts.
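Out of curiosity, Guerrero's method can still be asked what lambda it would pick here (a sketch):
aus_livestock %>%
filter(Animal == "Bulls, bullocks and steers", State == "Victoria") %>%
features(Count, features = guerrero)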
Victorian Electricity Demand
vic_elec %>%
filter(year(Date) == 2014)%>%
gg_season(Demand, period = "year")
view(vic_elec)
No transformation was done for this data. It has high seasonal variation and could benefit from a Box-Cox transformation, but with this many observations the benefit isn’t visible. This data would benefit from a decomposition to understand it better.
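A minimal sketch of that suggested decomposition, aggregating to daily totals and letting STL pick its defaults:
vic_elec %>%
index_by(Date = as_date(Time)) %>%
summarise(Demand = sum(Demand)) %>%
model(STL(Demand)) %>%
components() %>%
autoplot()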
Australian Beer Production
lam_beer <- aus_production %>%
features(Beer, features = guerrero) %>%
pull(lambda_guerrero)
aus_production %>%
autoplot(box_cox(Beer, lam_beer))
A Box-Cox transformation was used for this data to help with the increasing seasonal variation. The seasonality is still there, but to investigate the data further a decomposition should be used.
Question 3
head(canadian_gas)
## # A tsibble: 6 x 2 [1M]
## Month Volume
## <mth> <dbl>
## 1 1960 Jan 1.43
## 2 1960 Feb 1.31
## 3 1960 Mar 1.40
## 4 1960 Apr 1.17
## 5 1960 May 1.12
## 6 1960 Jun 1.01
plot_1 <- canadian_gas %>%
autoplot() +
labs(title = "Original Data: Canadian Gas Volume")
## Plot variable not specified, automatically selected `.vars = Volume`
lambda <- canadian_gas %>%
features(Volume, features = guerrero) %>%
pull(lambda_guerrero)
Lam_plot <- canadian_gas %>%
autoplot(box_cox(Volume, lambda)) +
labs(y = "",
title = paste("Transformed Volume with lambda =", round(lambda, 2)))
grid.arrange(plot_1, Lam_plot, ncol = 2)
The Box-Cox transformation helps reduce the seasonal variation of the data, especially in the ’80s and ’90s.
Question 4
set.seed(1234567)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>%
autoplot() +
labs(title = "Australian Retail Turnover")
## Plot variable not specified, automatically selected `.vars = Turnover`
retail_lam <- myseries %>%
features(Turnover, features = guerrero) %>%
pull(lambda_guerrero)
retail_lam
## [1] 0.1541981
myseries %>%
autoplot(box_cox(Turnover, retail_lam)) +
labs(y = "",
title = paste("Transformed Turnover with lambda =", round(retail_lam, 2)))
According to the Guerrero approach to choosing a lambda, the value selected is roughly 0.15. Though I have used set.seed(), there still seems to be some variation between runs, so the selected lambda might be subject to change.
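One way to pin the draw down (a sketch; whether this removes the instability depends on where it comes from) is to evaluate sample() exactly once, outside filter():
set.seed(1234567)
series_id <- sample(aus_retail$`Series ID`, 1) # drawn a single time
myseries <- aus_retail %>%
filter(`Series ID` == series_id)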
Question 5
tobacco_lam <- aus_production %>%
features(Tobacco, features = guerrero) %>%
pull(lambda_guerrero)
tobacco_lam
## [1] 0.9289402
air <- ansett %>%
filter(Airports == "MEL-SYD" & Class =="Economy")
Econ_lam <- air %>%
features(Passengers, features = guerrero) %>%
pull(lambda_guerrero)
Econ_lam
## [1] 1.999927
ped <- pedestrian %>%
filter(Sensor == "Southern Cross Station")
ped_lam <- ped %>%
features(Count, features = guerrero) %>%
pull(lambda_guerrero)
ped_lam
## [1] -0.2255423
To find the lambdas, I used the Guerrero method to generate them automatically. The lambdas found to fit the data best were 0.93 for Australian tobacco production, 1.99 (roughly 2) for MEL-SYD Economy class passengers, and -0.23 for pedestrian counts at Southern Cross Station.
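As a quick sanity check on one of these (a sketch), the pedestrian series can be plotted with its Guerrero lambda applied:
ped %>%
autoplot(box_cox(Count, ped_lam))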
Question 6
#Show that a 3×5 MA is equivalent to a
#7-term weighted moving average with weights of 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, and 0.067
#first 5-MA: the average of the first five weights above
(0.067+0.133+0.200+0.200+0.200)/5
## [1] 0.16
#0.16
#second 5-MA: the average of the middle five weights
(0.133+0.200+0.200+0.200+0.133)/5
## [1] 0.1732
#0.1732
(0.200+0.200+0.200+0.133+0.067)/5
## [1] 0.16
#0.16
# the 3x5 MA: a 3-MA of the three 5-MA values, giving 0.1644
(0.16+0.1732+0.16)/3
## [1] 0.1644
# 7-term weighted moving average applied to the same values
(1/15*(0.067))+2/15*(0.133)+(3/15*(0.2))+(3/15*(0.2))+(3/15*(0.2))+(2/15*(0.133))+(1/15*(0.067))
## [1] 0.1644
Above are the calculations of the three 5-MAs and then their 3-MA, alongside the 7-term weighted moving average applied to the same values. Both give 0.1644, showing the two are equivalent.
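The weight algebra can also be checked directly: convolving the 5-MA weights (1/5 each) with the 3-MA weights (1/3 each) should reproduce the seven weights quoted in the question. A sketch:
convolve(rep(1/5, 5), rep(1/3, 3), type = "open")
# expected: 1/15, 2/15, 3/15, 3/15, 3/15, 2/15, 1/15
# i.e. 0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067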
Question 7
#A Seasonal Fluctuations
gas <- tail(aus_production, 5*4) %>% select(Gas)
gas %>%
autoplot(Gas) +
labs(title = "Australian Gas Production",
y= "Gas Production",
x= "Time in Quarters")
Plot_gas <- gas %>%
autoplot(Gas) +
labs(title = "Australian Gas Production",
y= "Gas Production",
x= "Time in Quarters")
The trend is positive and upward sloping throughout the data. The seasonal pattern is an increase from Q1 to Q2, then a decrease from Q3 to Q4, but the seasonal rises are larger than the falls.
#B Multiplicative Decomposition
gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")
) %>%
components() %>%
autoplot() +
labs(title = "Australian Gas Production")
## Warning: Removed 2 row(s) containing missing values (geom_path).
Part C
The analysis of the trend in part A was accurate: positive and upward sloping. The interpretation of the seasonality was slightly off; the reason the seasonal rises looked larger than the falls was the upward trend. The seasonal component does retain the pattern of increasing from Q1 to Q2 and decreasing from Q2 through Q4.
#D Seasonally adjusted data
gas_dcmp <- gas %>%
model(
classical_decomposition(Gas, type = "multiplicative")) %>%
components()
gas_seas <- as_tsibble(bind_cols(gas$Quarter,gas_dcmp$season_adjust))
## New names:
## Using `...1` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_seas <- gas_seas %>%
autoplot() +
labs(title = "Australian Gas Production without Seasonality",
y = "Gas Production",
x = "Time in Quarters")
## Plot variable not specified, automatically selected `.vars = ...2`
grid.arrange(Plot_gas,plot_seas)
#E Outlier
gas_adjusted <- gas
gas_adjusted[12,1] <- gas[12,1] + 500
view(gas_adjusted)
adjust_dcmp <- gas_adjusted %>%
model(
classical_decomposition(Gas, type = "multiplicative")) %>%
components()
gas_adj <- as_tsibble(bind_cols(gas$Quarter,adjust_dcmp$season_adjust))
## New names:
## Using `...1` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_adjust <- gas_adj %>%
autoplot() +
labs(title = "Australian Gas Production Adjusted",
y = "Gas Production",
x = "Time in Quarters")
## Plot variable not specified, automatically selected `.vars = ...2`
grid.arrange(Plot_gas, plot_seas, plot_adjust)
The outlier overwhelms the trend in the seasonally adjusted plot, making it hard to see, while the seasonality of the data is still represented in the graph.
#F Outlier near the end
adjusted_2 <- gas
adjusted_2[18,1] <- gas[18,1] + 500
adjusted_2 %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Gas`
A2_dcmp <- adjusted_2 %>%
model(
classical_decomposition(Gas, type = "multiplicative")) %>%
components()
gas_A2 <- as_tsibble(bind_cols(gas$Quarter,A2_dcmp$season_adjust))
## New names:
## Using `...1` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_A2 <- gas_A2 %>%
autoplot() +
labs(title = "Australian Gas Production Adjusted",
y = "Gas Production",
x = "Time in Quarters")
## Plot variable not specified, automatically selected `.vars = ...2`
grid.arrange(Plot_gas, plot_seas,plot_adjust, plot_A2)
If the outlier is near the end, you can still get a good sense of the seasonality in the data. If the outlier is more towards the middle, it is more difficult to pick the trend out of the data.
Question 8
set.seed(12345678)
myseries <- aus_retail %>%
filter(`Series ID` == sample(aus_retail$`Series ID`,1))
myseries %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Turnover`
myseries %>%
gg_season()
## Plot variable not specified, automatically selected `y = Turnover`
myseries %>%
gg_subseries()
## Plot variable not specified, automatically selected `y = Turnover`
myseries %>%
gg_lag()
## Plot variable not specified, automatically selected `y = Turnover`
x11_dcmp <- myseries %>%
model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
components()
autoplot(x11_dcmp) +
labs(title =
"Decomposition of Autralian Turnover using X-11.")
The irregular portion of the decomposition provides more detail than the functions used in Section 2.10 #8. The irregular panel shows outliers that weren’t otherwise visible, such as the dip in the late ’80s and another around 2011.
Question 9
The decomposition of the Australian labor force is interesting because the value and trend panels are both positive and upward sloping. The seasonal panel fluctuates between plus and minus 100, while the remainder’s scale runs from 0 to -400. Though the labor force is always increasing, the remainder shows that some sort of recession happened. The recession can be faintly seen in the value panel, where there is a consistent dip for about two years, but it isn’t as noticeable as in the remainder panel.
The recession is noticeable in the seasonal components graph as well. Most months show a dip after 1990, most prevalently March, April, August, and November.
Question 10
#A Functions
canadian_gas %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Volume`
canadian_gas %>%
gg_subseries()
## Plot variable not specified, automatically selected `y = Volume`
canadian_gas %>%
gg_season()
## Plot variable not specified, automatically selected `y = Volume`
#B STL Decomposition
can_dcmp <- canadian_gas %>%
model(
STL(Volume ~ trend(window = 13) +
season(window = 9),
robust = TRUE)) %>%
components()
#C Seasonal adjustments
can_season <- as_tsibble(bind_cols(can_dcmp$season_year, canadian_gas$Month))
## New names:
## Using `...2` as index variable.
## * `` -> `...1`
## * `` -> `...2`
can_season %>%
gg_season() +
labs(title = "Seasonal Analysis: Canadian Gas",
y = "Seasonal Adjustment",
x= "Time in Months")
## Plot variable not specified, automatically selected `y = ...1`
The seasonal shape changes over time in an overall U-shaped pattern. There are seasonal peaks and valleys, with the largest being in February.
#D Plausible Seasonally Adjusted Series
Can_adj <- as_tsibble(bind_cols(can_dcmp$season_adjust, canadian_gas$Month))
## New names:
## Using `...2` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_stl <- Can_adj %>%
autoplot()+
labs(title = "Seasonally Adjusted Gas Production(STL)",
y = "Volume",
x= "Time in Months")
## Plot variable not specified, automatically selected `.vars = ...1`
plot_stl
#E Seats X-11
x11_gas <- canadian_gas %>%
model(x11 = X_13ARIMA_SEATS(Volume ~ x11())) %>%
components()
autoplot(x11_gas) +
labs(title =
"Decomposition of Canadian Gas Production using X-11.")
Cad_x11 <- as_tsibble(bind_cols(x11_gas$season_adjust, canadian_gas$Month))
## New names:
## Using `...2` as index variable.
## * `` -> `...1`
## * `` -> `...2`
plot_cadx11 <- Cad_x11 %>%
autoplot()+
labs(title = "Seasonally Adjusted Gas Production (X11)",
y = "Volume",
x= "Time in Months")
## Plot variable not specified, automatically selected `.vars = ...1`
plot_cadx11
grid.arrange(plot_stl, plot_cadx11)
The decomposition using X-11 differs from the STL one. The X-11 seasonal component shows more inflated seasonality at the beginning than STL does. The seasonally adjusted graphs look similar overall, most likely because the trends are about the same. Small bits of seasonality are still visible in the data, and they differ between the two.
Chapter 7:
Question 1
jan14_vic_elec <- vic_elec %>%
filter(yearmonth(Time) == yearmonth("2014 Jan")) %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
daily<- ts(jan14_vic_elec)
#A.)plotting Demand~temperature
ggplot(jan14_vic_elec, aes(Temperature, Demand)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
The relationship is positive and roughly linear: as temperatures get higher, people want to cool their homes and businesses, which requires more electricity to run ACs at higher output.
#B.) Residual plot
fit_jan <- jan14_vic_elec %>%
model(TSLM(Demand~Temperature))
report(fit_jan)
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -49978.2 -10218.9 -121.3 18533.2 35440.6
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 59083.9 17424.8 3.391 0.00203 **
## Temperature 6154.3 601.3 10.235 3.89e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 24540 on 29 degrees of freedom
## Multiple R-squared: 0.7832, Adjusted R-squared: 0.7757
## F-statistic: 104.7 on 1 and 29 DF, p-value: 3.8897e-11
fit_jan %>% gg_tsresiduals()
There seems to be some heteroskedasticity in the model: the residuals are right-skewed in all of the plots. The exception is the ACF plot, which has a spike at the first lag and little else, so there is no autocorrelation according to the ACF. This type of model doesn’t fit the data well even though it has an R-squared of 0.78; given how the observations cluster, a nonparametric model like KNN might fit better if this weren’t time series data.
#C Forecast
#Forecast for 15 degrees
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)
) %>%
autoplot(jan14_vic_elec)
#forecast for 35 degrees
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(
new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 35)
) %>%
autoplot(jan14_vic_elec)
#D Prediction interval
temp_model <- tslm(Demand~Temperature, data = daily)
fcasttemp15 <- forecast(temp_model, newdata=data.frame(Temperature=15))
fcasttemp15
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 32 151398.4 117127.2 185669.5 97951.22 204845.5
fcasttemp35 <- forecast(temp_model, newdata=data.frame(Temperature=35))
fcasttemp35
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 32 274484.2 241333 307635.5 222783.7 326184.8
The forecasted demand is 151,398 when the temperature is 15 degrees and 274,484 when the temperature is 35 degrees. The 95% prediction interval is 97,951 - 204,846 for 15 degrees and 222,784 - 326,185 for 35 degrees.
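The same intervals can also be pulled from the fable model in part C with hilo(), staying inside the tidyverts framework (a sketch for the 15-degree case):
jan14_vic_elec %>%
model(TSLM(Demand ~ Temperature)) %>%
forecast(new_data(jan14_vic_elec, 1) %>%
mutate(Temperature = 15)) %>%
hilo(level = 95) # 95% prediction interval as an interval column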
#E
total_elc <- vic_elec %>%
index_by(Date = as_date(Time)) %>%
summarise(
Demand = sum(Demand),
Temperature = max(Temperature)
)
ggplot(total_elc, aes(Temperature, Demand)) +
geom_point() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
fit_elec <- total_elc %>%
model(TSLM(Demand~Temperature))
report(fit_elec)
## Series: Demand
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -62818.41 -13699.52 -25.83 17452.70 118947.95
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 220345.8 2742.3 80.352 <2e-16 ***
## Temperature 172.0 125.9 1.366 0.172
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 25470 on 1094 degrees of freedom
## Multiple R-squared: 0.001702, Adjusted R-squared: 0.0007897
## F-statistic: 1.865 on 1 and 1094 DF, p-value: 0.17229
The new model has an adjusted R^2 of 0.00079 compared with the original model’s adjusted R^2 of 0.78. This shows that the original model fit its sample of the data well but was not representative of the population.
Question 2:
#A.)
colSums(is.na(olympic_running))
## Year Length Sex Time
## 0 0 0 31
colSums(olympic_running == "")
## Year Length Sex Time
## 0 0 0 NA
#There are NAs during the World Wars; na.omit() was used for a better look at the data
run<- na.omit(olympic_running)
run
## # A tsibble: 281 x 4 [4Y]
## # Key: Length, Sex [14]
## Year Length Sex Time
## <int> <int> <chr> <dbl>
## 1 1896 100 men 12
## 2 1900 100 men 11
## 3 1904 100 men 11
## 4 1908 100 men 10.8
## 5 1912 100 men 10.8
## 6 1920 100 men 10.8
## 7 1924 100 men 10.6
## 8 1928 100 men 10.8
## 9 1932 100 men 10.3
## 10 1936 100 men 10.3
## # ... with 271 more rows
run %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Time`
run %>%
filter(Sex == "men") %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Time`
run %>%
filter(Sex == "women") %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Time`
#The times seem to be slightly decreasing
The times seem to be decreasing with every new Olympics. There is a clear difference in times between the men’s and women’s events. There were originally gaps in the data due to the World Wars, so those years were omitted.
#B Regression
#Seeing the data we will build a regression for
run %>%
filter(Sex == "men" & Length == 100) %>%
ggplot(aes(Year, Time)) +
geom_line() +
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
#men 100 - Building the model
men_100 <- run %>%
filter(Sex == "men" & Length == 100) %>%
model(TSLM(Time~trend()))
#getting the forecast
report(men_100)
## Series: Time
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3443995 -0.1081360 0.0007715 0.0750701 0.9015537
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 11.148896 0.090895 122.66 < 2e-16 ***
## trend() -0.050450 0.004794 -10.52 7.24e-11 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2321 on 26 degrees of freedom
## Multiple R-squared: 0.8099, Adjusted R-squared: 0.8026
## F-statistic: 110.8 on 1 and 26 DF, p-value: 7.2403e-11
one_fore <- forecast(men_100)
one_fore$Time
## <distribution[2]>
## 1 2
## N(9.5, 0.061) N(9.5, 0.062)
#took the forecasted time of 9.5 from above and plugged it into similar code used in question 1
run %>%
filter(Sex == "men" & Length == 100) %>%
model(TSLM(Time~trend())) %>%
forecast(
new_data(run,1)%>%
mutate(Time = 9.5)
) %>%
autoplot(run %>%
filter(Sex == "men" & Length == 100))
#men 200
men_200 <- run %>%
filter(Sex == "men" & Length == 200) %>%
model(TSLM(Time~trend()))
two_fore <- forecast(men_200)
two_fore$Time
## <distribution[2]>
## 1 2
## N(19, 0.13) N(19, 0.13)
run %>%
filter(Sex == "men" & Length == 200) %>%
model(TSLM(Time~trend())) %>%
forecast(
new_data(run,1)%>%
mutate(Time = 19)
) %>%
autoplot(run %>%
filter(Sex == "men" & Length == 200))
olympic_model <- run %>%
model(TSLM(Time~trend()))
report(olympic_model)
## Warning in report.mdl_df(olympic_model): Model reporting is only supported for
## individual models, so a glance will be shown. To see the report for a specific
## model, use `select()` and `filter()` to identify a single model.
## # A tibble: 14 x 17
## Length Sex .model r_squared adj_r_squared sigma2 statistic p_value df
## <int> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <int>
## 1 100 men TSLM(T~ 0.810 0.803 5.39e-2 111. 7.24e-11 2
## 2 100 women TSLM(T~ 0.751 0.738 4.98e-2 57.4 3.72e- 7 2
## 3 200 men TSLM(T~ 0.883 0.878 1.10e-1 189. 3.80e-13 2
## 4 200 women TSLM(T~ 0.686 0.667 2.51e-1 35.0 2.17e- 5 2
## 5 400 men TSLM(T~ 0.823 0.817 1.29e+0 121. 2.75e-11 2
## 6 400 women TSLM(T~ 0.316 0.259 1.06e+0 5.54 3.65e- 2 2
## 7 800 men TSLM(T~ 0.746 0.736 1.15e+1 73.5 6.47e- 9 2
## 8 800 women TSLM(T~ 0.655 0.631 1.16e+1 26.6 1.45e- 4 2
## 9 1500 men TSLM(T~ 0.686 0.674 6.55e+1 56.9 5.23e- 8 2
## 10 1500 women TSLM(T~ 0.161 0.0769 2.57e+1 1.92 1.96e- 1 2
## 11 5000 men TSLM(T~ 0.816 0.808 2.45e+2 97.6 1.50e- 9 2
## 12 5000 women TSLM(T~ 0.00762 -0.240 8.37e+2 0.0307 8.69e- 1 2
## 13 10000 men TSLM(T~ 0.897 0.893 8.35e+2 192. 2.37e-12 2
## 14 10000 women TSLM(T~ 0.806 0.774 3.30e+2 24.9 2.47e- 3 2
## # ... with 8 more variables: log_lik <dbl>, AIC <dbl>, AICc <dbl>, BIC <dbl>,
## # CV <dbl>, deviance <dbl>, df.residual <int>, rank <int>
olympic_forecast <- olympic_model %>%
forecast()
This code was originally applied to all of the data, but the result was too condensed and didn’t provide any value, so it was broken down into two examples to show the approach works for all of them. The models were originally split between men and women to lessen the clutter. A forecast was then run on all the series to see how it would come out.
#c Residuals against the year
augment(men_200) %>%
ggplot(aes(x = Year, y = .resid)) +
geom_point() +
labs(
y = "Residuals",
x = "Year)",
title = "Residuals Plotted Over Time"
) +
geom_hline(yintercept = 0) # zero line for the residuals
There is a slight pattern in the residuals, indicating some heteroskedasticity.
#d Predict winning times
oly_predict <- olympic_forecast %>%
filter(Year == 2020 & Sex =='men')
table(oly_predict$Length, oly_predict$Time)
##
## N(9.5, 0.061) N(19, 0.13) N(42, 1.5) N(99, 13) N(207, 74) N(773, 285)
## 100 1 0 0 0 0 0
## 200 0 1 0 0 0 0
## 400 0 0 1 0 0 0
## 800 0 0 0 1 0 0
## 1500 0 0 0 0 1 0
## 5000 0 0 0 0 0 1
## 10000 0 0 0 0 0 0
##
## N(1580, 970)
## 100 0
## 200 0
## 400 0
## 800 0
## 1500 0
## 5000 0
## 10000 1
oly_women <- olympic_forecast %>%
filter(Year == 2020 & Sex =='women')
table(oly_women$Length, oly_women$Time)
##
## N(11, 0.059) N(21, 0.31) N(48, 1.4) N(111, 14) N(245, 35) N(892, 1562)
## 100 1 0 0 0 0 0
## 200 0 1 0 0 0 0
## 400 0 0 1 0 0 0
## 800 0 0 0 1 0 0
## 1500 0 0 0 0 1 0
## 5000 0 0 0 0 0 1
## 10000 0 0 0 0 0 0
##
## N(1763, 530)
## 100 0
## 200 0
## 400 0
## 800 0
## 1500 0
## 5000 0
## 10000 1
The assumptions we make for this data are that the errors have mean zero, that there is no autocorrelation, and that the predictor variable is unrelated to the errors.
Question 3
The equation for elasticity is B = (dY/dX)*(X/Y). Taking the derivative of the log-log model with respect to X yields exactly this quantity, so the slope coefficient is the elasticity.
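Writing the step out (a sketch of the derivation): the log-log model is log(Y) = B0 + B1*log(X). Differentiating both sides gives dY/Y = B1*(dX/X), and rearranging yields B1 = (dY/dX)*(X/Y), which is the elasticity.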
Question 4
#A Time Series Plot
souvenirs %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = Sales`
There is a seasonal increase that can be clearly seen around Christmas, especially in 1994, and another slight increase around March for the surfing festival.
#B Logarithm
souvenirs %>%
ggplot(aes(x=Month, y = log(Sales)))+
geom_line()
Taking the log of a variable puts changes in percentage terms. This makes large numbers easier to interpret and their pattern more recognizable in a graph. From a modelling perspective, an additional unit of the predictor changes y by a percentage, which is easier to interpret for large numbers and useful for various mathematical/economic purposes like elasticity.
#C tslm model with dummy variables
#creating the surfing festival dummy variable
festival <- rep(0, nrow(souvenirs))
festival[seq_along(festival) %% 12 == 3] <- 1 # flag March of every year (series starts Jan 1987)
festival[3] <- 0 # the festival only began in 1988, so March 1987 is reset to 0
festival <- ts(festival, frequency = 12, start = c(1987, 1))
#getting the data together
souv <- souvenirs
souv$Sales <- log(souv$Sales)
souv <- bind_cols(souv, festival)
## New names:
## * `` -> `...3`
names(souv)[names(souv) == "...3"] <- "festival"
souv
## # A tsibble: 84 x 3 [1M]
## Month Sales festival
## <mth> <dbl> <dbl>
## 1 1987 Jan 7.42 0
## 2 1987 Feb 7.78 0
## 3 1987 Mar 7.95 0
## 4 1987 Apr 8.17 0
## 5 1987 May 8.23 0
## 6 1987 Jun 8.22 0
## 7 1987 Jul 8.38 0
## 8 1987 Aug 8.18 0
## 9 1987 Sep 8.52 0
## 10 1987 Oct 8.77 0
## # ... with 74 more rows
souvlm <- souv %>%
model(TSLM(Sales~trend()+season()+festival))
report(souvlm)
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.336727 -0.127571 0.002568 0.109106 0.376714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend() 0.0220198 0.0008268 26.634 < 2e-16 ***
## season()year2 0.2514168 0.0956790 2.628 0.010555 *
## season()year3 0.2660828 0.1934044 1.376 0.173275
## season()year4 0.3840535 0.0957075 4.013 0.000148 ***
## season()year5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season()year6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season()year7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season()year8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season()year9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season()year10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season()year11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season()year12 1.9622412 0.0961066 20.417 < 2e-16 ***
## festival 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.22e-16
#D residuals and fitted values
res <- residuals(souvlm)
fit <- as.vector(fitted(souvlm))
fit_res <- as.vector(residuals(souvlm))
#residuals over time
res %>%
autoplot() +
labs(title = "Residuals Against Time",
y = "Residuals",
x = "Year")
## Plot variable not specified, automatically selected `.vars = .resid`
#residuals against fitted values
res %>% ggplot(aes(x = fit$.fitted, y = .resid)) +
geom_point() +
labs(
y = "Residuals",
x = "Fitted",
title = "Fitted Values v. Residuals"
)
Overall, both plots are fairly random in nature. There seems to be some rightward skew, which could indicate some heteroskedasticity.
#E Boxplot
res.month <- data.frame(res, month = rep(1:12, 7))
boxplot(res$.resid~month, data = res.month)
The residuals are not normally distributed across the months. April, June, and November show the least variation.
#F
report(souvlm)
## Series: Sales
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.336727 -0.127571 0.002568 0.109106 0.376714
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.6196670 0.0742471 102.626 < 2e-16 ***
## trend() 0.0220198 0.0008268 26.634 < 2e-16 ***
## season()year2 0.2514168 0.0956790 2.628 0.010555 *
## season()year3 0.2660828 0.1934044 1.376 0.173275
## season()year4 0.3840535 0.0957075 4.013 0.000148 ***
## season()year5 0.4094870 0.0957325 4.277 5.88e-05 ***
## season()year6 0.4488283 0.0957647 4.687 1.33e-05 ***
## season()year7 0.6104545 0.0958039 6.372 1.71e-08 ***
## season()year8 0.5879644 0.0958503 6.134 4.53e-08 ***
## season()year9 0.6693299 0.0959037 6.979 1.36e-09 ***
## season()year10 0.7473919 0.0959643 7.788 4.48e-11 ***
## season()year11 1.2067479 0.0960319 12.566 < 2e-16 ***
## season()year12 1.9622412 0.0961066 20.417 < 2e-16 ***
## festival 0.5015151 0.1964273 2.553 0.012856 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared: 0.9567, Adjusted R-squared: 0.9487
## F-statistic: 119 on 13 and 70 DF, p-value: < 2.22e-16
souvlm %>% gg_tsresiduals()
The seasonal coefficients increase throughout the year, showing a clear pattern. The residual plots are skewed, and the ACF plot shows autocorrelation.
#G Box test
augment(souvlm) %>%
features(.innov, ljung_box, lag = 10, dof = 4)
## # A tibble: 1 x 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Sales ~ trend() + season() + festival) 69.8 4.54e-13
With a p-value of less than 0.05, it can be confirmed that the residuals are distinguishable from white noise and the model has autocorrelation.
#H plot it anyway
festival_2 <- rep(0,36)
festival_2[seq_along(festival_2)%%12 == 3] <- 1
more_festive <- data.frame(festival_2)
y <- tsibble(Month = rep(yearmonth("1994 Jan")+0:35))
## Using `Month` as index variable.
y <- bind_cols(y, more_festive)
names(y)[names(y) == "mth"] <- "Month"
names(y)[names(y) == "festival_2"] <- "festival"
y
## # A tsibble: 36 x 2 [1M]
## Month festival
## <mth> <dbl>
## 1 1994 Jan 0
## 2 1994 Feb 0
## 3 1994 Mar 1
## 4 1994 Apr 0
## 5 1994 May 0
## 6 1994 Jun 0
## 7 1994 Jul 0
## 8 1994 Aug 0
## 9 1994 Sep 0
## 10 1994 Oct 0
## # ... with 26 more rows
fore_souv <- forecast(souvlm, new_data = y)
fore_souv %>%
autoplot(souv)
fore_souv$Sales
## <distribution[36]>
## 1 2 3 4 5
## N(9.5, 0.038) N(9.8, 0.038) N(10, 0.039) N(9.9, 0.038) N(10, 0.038)
## 6 7 8 9 10
## N(10, 0.038) N(10, 0.038) N(10, 0.038) N(10, 0.038) N(10, 0.038)
## 11 12 13 14 15
## N(11, 0.038) N(12, 0.038) N(9.8, 0.039) N(10, 0.039) N(11, 0.039)
## 16 17 18 19 20
## N(10, 0.039) N(10, 0.039) N(10, 0.039) N(10, 0.039) N(10, 0.039)
## 21 22 23 24 25
## N(11, 0.039) N(11, 0.039) N(11, 0.039) N(12, 0.039) N(10, 0.04)
## 26 27 28 29 30
## N(10, 0.04) N(11, 0.04) N(10, 0.04) N(11, 0.04) N(11, 0.04)
## 31 32 33 34 35
## N(11, 0.04) N(11, 0.04) N(11, 0.04) N(11, 0.04) N(11, 0.04)
## 36
## N(12, 0.04)
Question 5
gas <- ts(us_gasoline$Barrels, frequency = 52, start = 1991)
gas %>%
autoplot()
fourier_gas <- us_gasoline %>%
model(TSLM(Barrels ~ trend() + fourier(K = 4)))
report(fourier_gas)
## Series: Barrels
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.15495 -0.31437 0.01426 0.34554 0.95759
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.579e+00 2.302e-02 329.272 < 2e-16 ***
## trend() 1.438e-03 2.941e-05 48.877 < 2e-16 ***
## fourier(K = 4)C1_52 -1.134e-01 1.628e-02 -6.964 5.17e-12 ***
## fourier(K = 4)S1_52 -2.318e-01 1.625e-02 -14.265 < 2e-16 ***
## fourier(K = 4)C2_52 4.333e-02 1.626e-02 2.665 0.00779 **
## fourier(K = 4)S2_52 3.001e-02 1.626e-02 1.845 0.06522 .
## fourier(K = 4)C3_52 8.517e-02 1.625e-02 5.241 1.86e-07 ***
## fourier(K = 4)S3_52 3.532e-02 1.627e-02 2.171 0.03013 *
## fourier(K = 4)C4_52 1.847e-02 1.627e-02 1.135 0.25658
## fourier(K = 4)S4_52 4.180e-02 1.625e-02 2.573 0.01020 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4232 on 1345 degrees of freedom
## Multiple R-squared: 0.6671, Adjusted R-squared: 0.6648
## F-statistic: 299.4 on 9 and 1345 DF, p-value: < 2.22e-16
There is little change from using more than one Fourier term; the adjusted R-squared only moves by about 0.01. This is probably due to the relatively stable seasonality of the weekly gasoline data.
#B
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 1), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1856335 -2279.9142617 -2279.8697843 -2253.8564780 0.6537409
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 2), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1847782 -2286.1806647 -2286.0975170 -2249.6997676 0.6558450
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 3), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1810039 -2314.1676504 -2314.0338214 -2267.2636398 0.6633752
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 4), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1804806 -2318.1079209 -2317.9113460 -2260.7807968 0.6648444
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 5), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1804563 -2318.3222016 -2318.0507623 -2250.5719640 0.6653876
tslm(us_gasoline$Barrels~trend+ fourier(gas, K= 10), data = gas)%>%
CV()
## CV AIC AICc BIC AdjR2
## 0.1813963 -2311.5414205 -2310.7119689 -2191.6756156 0.6661503
Among these fits, the lowest CV and AIC actually occur at K = 5, with K = 4 nearly identical; the simpler K = 1 model is carried forward below.
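The same comparison can be run inside fable, avoiding the separate ts object (a sketch; the model names K1 and K5 are just labels):
us_gasoline %>%
model(
K1 = TSLM(Barrels ~ trend() + fourier(K = 1)),
K5 = TSLM(Barrels ~ trend() + fourier(K = 5))
) %>%
glance() %>%
select(.model, AICc, CV)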
#C
gas_lm <-us_gasoline %>%
model(TSLM(Barrels ~ trend() + fourier(K = 1)))
gas_lm %>%
gg_tsresiduals()
augment(gas_lm) %>%
features(.innov, ljung_box, lag = 5, dof = 3)
## # A tibble: 1 x 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Barrels ~ trend() + fourier(K = 1)) 2940. 0
With a p-value of 0, there is autocorrelation within the model. This is reinforced by the ACF plot, with every lag outside the bounds.
#D
plot_fivegas<- us_gasoline %>%
filter(year(Week) == 2005) %>%
autoplot() +
labs(title = "2005 Production Per Week",
x = "Weeks")
## Plot variable not specified, automatically selected `.vars = Barrels`
us_gasoline[1355,]
## # A tsibble: 1 x 2 [1W]
## Week Barrels
## <week> <dbl>
## 1 2017 W03 8.04
newgas <- tsibble(Week = rep(yearweek("2017 W04")+0:51))
## Using `Week` as index variable.
newgas
## # A tsibble: 52 x 1 [1W]
## Week
## <week>
## 1 2017 W04
## 2 2017 W05
## 3 2017 W06
## 4 2017 W07
## 5 2017 W08
## 6 2017 W09
## 7 2017 W10
## 8 2017 W11
## 9 2017 W12
## 10 2017 W13
## # ... with 42 more rows
fivegas <- us_gasoline %>%
filter(year(Week)>= 2015)
foregas <- forecast(gas_lm, newgas)
plot_foregas <- foregas %>%
autoplot()
grid.arrange(plot_fivegas, plot_foregas)
I was having issues overlaying the forecast on the 2005 data because of the differing time indexes. The forecast is fairly similar to the 2005 data but smoothed: it starts low, peaks around week 26, declines, and then starts to settle.
Question 6:
#A Population over time
afg <- global_economy %>%
filter(Code == "AFG")
afg%>%
ggplot(aes(x= Year, y = Population)) +
geom_line()+
geom_smooth(method = "lm")
## `geom_smooth()` using formula 'y ~ x'
There is a decline in population starting just before 1980, and growth only resumes just after 1985. The Soviet-Afghan War ran from 1979 to 1989, and the start of the population dip lines up with that period.
# B linear and piecewise models
fit_trends <- afg %>%
model(
linear = TSLM(Population ~ trend()),
piecewise = TSLM(Population ~ trend(knots = c(1979, 1989)))
)
afg%>%
ggplot(aes(x= Year, y = Population)) +
geom_line()+
geom_line(data = fitted(fit_trends),
aes(y=.fitted,colour = .model))+
labs(title = "AFG Population: Linear v. Piecewise")
# C forecasting
fc_trends <- fit_trends %>% forecast(h = 10)
afg%>%
ggplot(aes(x= Year, y = Population)) +
geom_line()+
geom_line(data = fitted(fit_trends),
aes(y=.fitted,colour = .model))+
autolayer(fc_trends, alpha = 0.5, level = 95) +
labs(title = "AFG Population: Linear v. Piecewise with Forecast")
There is significantly more variation in the prediction interval for the linear model than for the piecewise model, suggesting that the piecewise model gives the more plausible forecast.