suppressMessages(library(fpp3))

## Warning: package 'lubridate' was built under R version 4.1.2

## Warning: package 'tsibble' was built under R version 4.1.2

## Warning: package 'tsibbledata' was built under R version 4.1.2

## Warning: package 'feasts' was built under R version 4.1.2

## Warning: package 'fabletools' was built under R version 4.1.2

## Warning: package 'fable' was built under R version 4.1.2

y <- tsibble(
               Year = 2015:2019,
               Observation = c(123, 39, 78, 52, 110),
               index = Year
)


y

## # A tsibble: 5 x 2 [1Y]
##    Year Observation
##   <int>       <dbl>
## 1  2015         123
## 2  2016          39
## 3  2017          78
## 4  2018          52
## 5  2019         110

prison <- readr::read_csv("https://OTexts.com/fpp3/extrafiles/prison_population.csv")

## Rows: 3072 Columns: 6

## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr  (4): State, Gender, Legal, Indigenous
## dbl  (1): Count
## date (1): Date

## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

prison <- prison %>% 
               mutate(Quarter = yearquarter(Date)) %>% 
               select(-Date) %>% 
               as_tibble(key = c(State, Gender, Legal, Indigenous),
                         index = Quarter)


prison

## # A tibble: 3,072 x 6
##    State Gender Legal     Indigenous Count Quarter
##    <chr> <chr>  <chr>     <chr>      <dbl>   <qtr>
##  1 ACT   Female Remanded  ATSI           0 2005 Q1
##  2 ACT   Female Remanded  Non-ATSI       2 2005 Q1
##  3 ACT   Female Sentenced ATSI           0 2005 Q1
##  4 ACT   Female Sentenced Non-ATSI       5 2005 Q1
##  5 ACT   Male   Remanded  ATSI           7 2005 Q1
##  6 ACT   Male   Remanded  Non-ATSI      58 2005 Q1
##  7 ACT   Male   Sentenced ATSI           5 2005 Q1
##  8 ACT   Male   Sentenced Non-ATSI     101 2005 Q1
##  9 NSW   Female Remanded  ATSI          51 2005 Q1
## 10 NSW   Female Remanded  Non-ATSI     131 2005 Q1
## # ... with 3,062 more rows

Time plots

theme_set(theme_light())

melsyd_economy <- ansett %>% 
               filter(Airports == "MEL-SYD", Class == "Economy") %>% 
               mutate(Passengers = Passengers/1000)

autoplot(melsyd_economy, Passengers) + 
               labs(title = "Ansett airlines economy class",
                    subtitle = "Melbourne-Sydney",
                    y = "Passengers ('000)")

The time plot immediately reveals some interesting features - There was a period in 1989 when no passengers were carried - this was due to an industrial dispute - There was a period of reduced load in 1992. This was due to a trial in which some economy class seats were replaced by business class seats - A large increase in passenger load occured in the second half of 1991 - There are some large dips in load around the start of each year. These are due to holiday effects - There is a long-term fluctuation in the level of the series which increases during 1987, decreases in 1989, and increases again through 1990 and 1991.

Any model will need to take all these features into account in order to effectively forecast the passengers load into the future.

A simpler time series is shown in Figure 2.2, using the a10 data saved earlier

Time series patterns

Seasonal plots

vic_elec %>% gg_season(Demand, period = "year") +
  theme(legend.position = "none") +
  labs(y="MWh", title="Electricity demand: Victoria")

vic_elec %>% gg_season(Demand, period = "week") +
  theme(legend.position = "none") +
  labs(y="MWh", title="Electricity demand: Victoria")

vic_elec %>% gg_season(Demand, period = "year") +
  labs(y="MWh", title="Electricity demand: Victoria")

Subseries

holidays <- tourism %>%
  filter(Purpose == "Holiday") %>%
  group_by(State) %>%
  summarise(Trips = sum(Trips))

autoplot(holidays, Trips) +
  labs(y = "Overnight trips ('000)",
       title = "Australian domestic holidays")

gg_season(holidays, Trips) +
  labs(y = "Overnight trips ('000)",
       title = "Australian domestic holidays")

holidays %>%
  gg_subseries(Trips) +
  labs(y = "Overnight trips ('000)",
       title = "Australian domestic holidays")

Scatterplots

vic_elec %>%
  filter(year(Time) == 2014) %>%
  autoplot(Demand) +
  labs(y = "GW",
       title = "Half-hourly electricity demand: Victoria")

vic_elec %>%
  filter(year(Time) == 2014) %>%
  autoplot(Temperature) +
  labs(
    y = "Degrees Celsius",
    title = "Half-hourly temperatures: Melbourne, Australia"
  )

We can study the relationship between demand and temperature by plotting one series against the other

vic_elec %>%
  filter(year(Time) == 2014) %>%
  ggplot(aes(x = Temperature, y = Demand)) +
  geom_point() +
  labs(x = "Temperature (degrees Celsius)",
       y = "Electricity demand (GW)")

Scaterplot matrices

When there are several potential predictor variables, it is useful to plot each variable against each opther variable. Consider the eight time series, showing quarterly vistor numbers across states and territories of Australia

visitors <- tourism %>%
  group_by(State) %>%
  summarise(Trips = sum(Trips))
visitors %>%
  ggplot(aes(x = Quarter, y = Trips)) +
  geom_line() +
  facet_grid(vars(State), scales = "free_y") +
  labs(title = "Australian domestic tourism",
       y= "Overnight trips ('000)")

To see the relationships betwen these eight time series, we can plot each time series against the others. These plots can be arranged in a scaterplot matrix

visitors %>% 
               pivot_wider(values_from = Trips, names_from = State) %>% 
               GGally::ggpairs(columns = 2:9)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

Lag plots

Displays scatterplots of quarterly Australian beer production, where the horizontal axis shows lagged values of the time series. Each graph show yt ploted against y(t-k) for different values of k.

recent_production <- aus_production %>% 
               filter(year(Quarter) >= 2000) 

recent_production %>% 
               gg_lag(Beer, geom = 'point') + 
               labs(x = "lag(Beer, k)")

Here the colours indicate the quarter of the variable on the verticval axis. The relationship is storngly positive at lags 4 and 8, reflecting the strong seasonality in the data.

Autocorrelation

recent_production %>% ACF(Beer, lag_max = 9)

## # A tsibble: 9 x 2 [1Q]
##     lag      acf
##   <lag>    <dbl>
## 1    1Q -0.0530 
## 2    2Q -0.758  
## 3    3Q -0.0262 
## 4    4Q  0.802  
## 5    5Q -0.0775 
## 6    6Q -0.657  
## 7    7Q  0.00119
## 8    8Q  0.707  
## 9    9Q -0.0888

The values in the acf column are r1,…,r9, coresponding to the 9 scatterplots. The plot is sometimes known as a correlogram

recent_production %>% 
               ACF(Beer) %>%
               autoplot() +
               labs(title = "Australian beer production")

In this graph:

r4 is higher than for the other lags. This is due to the seasonal pattern in the data: the peaks tend to be four quarters apart and the troughs tend to be four quarters apart.
r2 is more negative than for the other lags because troughs tend to be two quarters behind peaks.
The dashed blue lines indicate whether the correlations are significantly different from zero (as explained in Section 2.9).

White Noise

set.seed(30)
y <- tsibble(sample = 1:50, wn = rnorm(50), index = sample)
y %>% autoplot(wn) + labs(title = "White noise", y = "")

y %>%
  ACF(wn) %>%
  autoplot() + labs(title = "White noise")

For white noise series, we expect each autocorrelation to be close to zero. Of course, they will not exactly equal to zero as there is some random variation.

Exercises

Use the help function to explore what the series gafa_stock, PBS, vic_elec and pelt represent

gafa_stock is a tsible containing data on irregular trading days:
PBS: is a monthly tsible with 2 values ({“Scripts” : “Total number of scripts”, “Cost:” : “Cost of the scripts in $AUD”})
vic_elec: is a half-hourly tsible with 3 values
Demand: Total electricity demand in MW
Temperature: Temperature of Melbourne
Holiday:: Indicator for if that day is a public holiday.
pelt: Hudson Bay Company trading records for Snowshoe Hare and Canadian Lynx furs from 1845 to 1935. This data contains trade records for all areas of the company.
Hare: The number of Snowshoe Hare pelts traded
Lynx:: The number of Canadian Lynx pelts traded

Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock

gafa_stock %>% 
               group_by(Symbol) %>%
               mutate(max_closing_price = max(Close),
                      diff_max = max_closing_price - Close) %>%
               ungroup() %>%
               filter(diff_max == 0) %>% distinct

## # A tibble: 4 x 10
##   Symbol Date        Open  High   Low Close Adj_Close   Volume max_closing_price
##   <chr>  <date>     <dbl> <dbl> <dbl> <dbl>     <dbl>    <dbl>             <dbl>
## 1 AAPL   2018-10-03  230.  233.  230.  232.      230. 28654800              232.
## 2 AMZN   2018-09-04 2026. 2050. 2013  2040.     2040.  5721100             2040.
## 3 FB     2018-07-25  216.  219.  214.  218.      218. 58954200              218.
## 4 GOOG   2018-07-26 1251  1270. 1249. 1268.     1268.  2405600             1268.
## # ... with 1 more variable: diff_max <dbl>

setwd("C:/Users/DellPC/Desktop")
suppressMessages(library(tidyverse))

tute1 <- read.csv("tute1.csv")

View(tute1)

Convert the data to time series

mytimeseries <- tute1 %>%
               mutate(Quarter = yearmonth(Quarter)) %>%
               as_tsibble(index = Quarter)

mytimeseries

## # A tsibble: 100 x 4 [3M]
##     Quarter Sales AdBudget   GDP
##       <mth> <dbl>    <dbl> <dbl>
##  1 1981 Mar 1020.     659.  252.
##  2 1981 Jun  889.     589   291.
##  3 1981 Sep  795      512.  291.
##  4 1981 Dec 1004.     614.  292.
##  5 1982 Mar 1058.     647.  279.
##  6 1982 Jun  944.     602   254 
##  7 1982 Sep  778.     531.  296.
##  8 1982 Dec  932.     608.  272.
##  9 1983 Mar  996.     638.  260.
## 10 1983 Jun  908.     582.  280.
## # ... with 90 more rows

Construct time series plots of each of the three series

mytimeseries %>%
  pivot_longer(-Quarter) %>%
  ggplot(aes(x = Quarter, y = value, colour = name)) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")

The USgas package contains data on the demand for natural gas in the US.

suppressMessages(library(USgas))

## Warning: package 'USgas' was built under R version 4.1.2

us_total <- tsibble(us_total, index = year, key = state)

new_en_area <- us_total %>%
  filter(state %in% c('Maine', 'Vermont', 'New Hampshire', 'Massachusetts', 'Connecticut', 'Rhode Island'))

Download tourism.xlsx from the book website and read it into R using

suppressMessages(library(readxl))

tourism <- read_excel('tourism.xlsx')

Transformation and Adjustment

lambda <- aus_production %>% 
  features(Gas, features = guerrero) %>% 
  pull(lambda_guerrero)

lambda

## [1] 0.1205077

aus_production %>%
  autoplot(box_cox(Gas, lambda)) +
  labs(y = "",
       title = paste0(
         "Transformed gas production with $\\lambda$ = ",
         round(lambda,2)))

Example: Employment in the US retail sector

We will look at several methods for obtaining the component St, Tt and Rt. We will decompose the number of persions employed in retail. The data shows the total monthly number of persions in thousands employed in the retail sector across the US since 1990.

us_retail_employment <- us_employment %>%
  filter(year(Month) >= 1990, Title == "Retail Trade") %>%
  select(-Series_ID) 

autoplot(us_retail_employment, Employed) + 
  labs(y = "Persons (thousands)",
       title = "Total employment in uS retail")

To illustrate the ideas, we will use the STL decomposition method

dcmp <- us_retail_employment %>%
  model(stl = STL(Employed))

components(dcmp)

## # A dable: 357 x 7 [1M]
## # Key:     .model [1]
## # :        Employed = trend + season_year + remainder
##    .model    Month Employed  trend season_year remainder season_adjust
##    <chr>     <mth>    <dbl>  <dbl>       <dbl>     <dbl>         <dbl>
##  1 stl    1990 Jan   13256. 13288.      -33.0      0.836        13289.
##  2 stl    1990 Feb   12966. 13269.     -258.     -44.6          13224.
##  3 stl    1990 Mar   12938. 13250.     -290.     -22.1          13228.
##  4 stl    1990 Apr   13012. 13231.     -220.       1.05         13232.
##  5 stl    1990 May   13108. 13211.     -114.      11.3          13223.
##  6 stl    1990 Jun   13183. 13192.      -24.3     15.5          13207.
##  7 stl    1990 Jul   13170. 13172.      -23.2     21.6          13193.
##  8 stl    1990 Aug   13160. 13151.       -9.52    17.8          13169.
##  9 stl    1990 Sep   13113. 13131.      -39.5     22.0          13153.
## 10 stl    1990 Oct   13185. 13110.       61.6     13.2          13124.
## # ... with 347 more rows

The trend column (containing the trend-cycle Tt) follows the overall movement of the series, ignoring any seasonality and random fluctuations

components(dcmp) %>%
  as_tsibble() %>%
  autoplot(Employed, colour = "gray") + 
  geom_line(aes(y = trend), colour = "#D55E00") +
  geom_line(aes(y = season_year), colour = "steelblue") +
  geom_line(aes(y = remainder), colour = "green", linetype = 1)

  labs( 
    y = "Persons (thousands)",
    title = "Total employment in US retail"
    )

## $y
## [1] "Persons (thousands)"
## 
## $title
## [1] "Total employment in US retail"
## 
## attr(,"class")
## [1] "labels"

We can plot all of the components in a single figure using autoplot()

components(dcmp) %>% autoplot()

The three components are shown separately in the bottom three panels. These components can be added together to reconstruct the data shown in the top pannel. Notice that the seasonal components changes over time, so that any two consecutive years have similar patterns, but years far apart may have different seasonal patterns. The remainder components shown in the bottom panel is what is left over when the seasonal and trend-cycle componetns have been substracted from the data.

The grey bars to the left of each panel show the relative scales of the componetns. Each grey bar represents the same length but because the plots are on different scales, the bars vary in size. The large grey bar in the bottom panel shows that the variation in the remainder component is smallest compared to the variation in the data.

Seasonally adjusted data

IF the seasonal component is removed from the original data, the resulting values are the “seasonal adjusted” data.

components(dcmp)

## # A dable: 357 x 7 [1M]
## # Key:     .model [1]
## # :        Employed = trend + season_year + remainder
##    .model    Month Employed  trend season_year remainder season_adjust
##    <chr>     <mth>    <dbl>  <dbl>       <dbl>     <dbl>         <dbl>
##  1 stl    1990 Jan   13256. 13288.      -33.0      0.836        13289.
##  2 stl    1990 Feb   12966. 13269.     -258.     -44.6          13224.
##  3 stl    1990 Mar   12938. 13250.     -290.     -22.1          13228.
##  4 stl    1990 Apr   13012. 13231.     -220.       1.05         13232.
##  5 stl    1990 May   13108. 13211.     -114.      11.3          13223.
##  6 stl    1990 Jun   13183. 13192.      -24.3     15.5          13207.
##  7 stl    1990 Jul   13170. 13172.      -23.2     21.6          13193.
##  8 stl    1990 Aug   13160. 13151.       -9.52    17.8          13169.
##  9 stl    1990 Sep   13113. 13131.      -39.5     22.0          13153.
## 10 stl    1990 Oct   13185. 13110.       61.6     13.2          13124.
## # ... with 347 more rows

components(dcmp) %>%
  as_tsibble() %>%
  autoplot(Employed, colour = "gray") + 
  geom_line(aes(y=season_adjust), colour = "#0072B2") +
  labs(y = "Persons (thousands)",
       title ="Total employment in US retail")

If the variation due to seasonality is not of primary interest, the seasonally adjusted series can be useful. For exampl,e monthly unemployment data are usually seasonally adjusted in order to highlight variation due to the underlying state of the economy rather than the seasonal variation

global_economy %>%
  filter(Country == "Australia") %>%
  autoplot(Exports) +
  labs(y = "% of GDP", title = "Total Australian exports")

This is easily computed using slide_dbl() from the slider package which applies a function to “sliding” time windows. In this case, we use the mean() function with a window of size 5.

aus_exports <- global_economy %>% 
  filter(Country == "Australia") %>%
  mutate(
    `5-MA` = slider::slide_dbl(Exports, mean,
                               .before = 2,
                               .after = 2, 
                               .complete = TRUE)
  )

To see what the trend looks like, we plot it along with the original data

aus_exports %>%
  autoplot(Exports) +
  geom_line(aes(y = `5-MA`), colour = "#D55E00") +
  labs(y = "% of GDP",
       title = "Total Australian exports")

## Warning: Removed 4 row(s) containing missing values (geom_path).

Notice that the trend-cycle (in organge) is smoother than the original data and captures the main movement ofthe time series without all of the minor fluctations. In general, a large order (m period) means a smoother curve Simple moving averages such as these are usually of an odd order.

Methods used by official statistics agencies

x11_dcmp <- us_retail_employment %>%
  model(x11 = X_13ARIMA_SEATS(Employed ~ x11())) %>%
  components()

autoplot(x11_dcmp) + 
  labs(title = "Decomposition of total US retail employment using X-11")

x11_dcmp %>% 
  ggplot(aes(x = Month)) + 
  geom_line(aes(y = Employed, colour = "Data")) + 
  geom_line(aes(y = season_adjust,
                colour = "Seasonally Adjusted")) +
  geom_line(aes(y = trend, colour = "Trend")) +
  labs(y = "Persons (thousands)",
       title = "Total employment in US retail") +
   scale_colour_manual(
    values = c("gray", "#0072B2", "#D55E00"),
    breaks = c("Data", "Seasonally Adjusted", "Trend")
  )

It’s really hard to differentiate the trend line and seasonal adjusted line

x11_dcmp %>% 
  gg_subseries(seasonal)

SEATS method

seats_dcmp <- us_retail_employment %>% 
  model(seats = X_13ARIMA_SEATS(Employed ~ seats())) %>%
  components() 

autoplot(seats_dcmp) +
labs(title = "Decomposition of total US retail employment using SEATS")

STL decomposition

us_retail_employment %>%
  model(
    STL(Employed ~ trend(window = 7) + season(window = "periodic"), 
        robust = TRUE)
  ) %>%
  components() %>% 
  autoplot()

The two main parameters to be chosen when using STL are ther trend-cycle window trend(window = ?) and season. These control how rapidly the trrend-cycle and seasonal components can change. Small values allow for more rapid changes. Both trend and seasonal windows should be odd numbers - Trend windown is the number of consecutive observations to be used when estimating the trend-cycle - seasonal window is the number of consecutive years to be used in estimating each value in the seasonal component.

lambda <- canadian_gas %>% 
  features(Volume, features = guerrero) %>% 
  pull(lambda_guerrero)

canadian_gas %>% 
  autoplot(box_cox(Volume, lambda))

A good transformation is one which makes the size of the seasonal variation about the same across the whole series –> forecasting model simpler

us_retail_employment %>% 
  features(Employed, features = guerrero) %>% 
  pull(lambda_guerrero)

## [1] 1.680167

lambda <- us_retail_employment %>% 
  features(Employed, features = guerrero) %>% 
  pull(lambda_guerrero)


us_retail_employment %>% 
  mutate(Employed_Box_Cox = box_cox(Employed, lambda))

## # A tsibble: 357 x 4 [1M]
##       Month Title        Employed Employed_Box_Cox
##       <mth> <chr>           <dbl>            <dbl>
##  1 1990 Jan Retail Trade   13256.         5023161.
##  2 1990 Feb Retail Trade   12966.         4840214.
##  3 1990 Mar Retail Trade   12938.         4822603.
##  4 1990 Apr Retail Trade   13012.         4869099.
##  5 1990 May Retail Trade   13108.         4929606.
##  6 1990 Jun Retail Trade   13183.         4976771.
##  7 1990 Jul Retail Trade   13170.         4968718.
##  8 1990 Aug Retail Trade   13160.         4962000.
##  9 1990 Sep Retail Trade   13113.         4932829.
## 10 1990 Oct Retail Trade   13185.         4978293.
## # ... with 347 more rows

tourism %>% mutate(Quarter = as.Date(Quarter)) %>% 
  group_by(Quarter, Region, State, Purpose) %>% 
  summarise(Trips = sum(Trips)) %>% ungroup()  %>%
  as_tsibble(index = Quarter, key = c(Region, State, Purpose)) %>%
  features(Trips, feat_acf)

## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.

## # A tibble: 304 x 10
##    Region         State Purpose     acf1 acf10 diff1_acf1 diff1_acf10 diff2_acf1
##    <chr>          <chr> <chr>      <dbl> <dbl>      <dbl>       <dbl>      <dbl>
##  1 Adelaide       Sout~ Busine~  0.0333  0.131     -0.520       0.463     -0.676
##  2 Adelaide       Sout~ Holiday  0.0456  0.372     -0.343       0.614     -0.487
##  3 Adelaide       Sout~ Other    0.517   1.15      -0.409       0.383     -0.675
##  4 Adelaide       Sout~ Visiti~  0.0684  0.294     -0.394       0.452     -0.518
##  5 Adelaide Hills Sout~ Busine~  0.0709  0.134     -0.580       0.415     -0.750
##  6 Adelaide Hills Sout~ Holiday  0.131   0.313     -0.536       0.500     -0.716
##  7 Adelaide Hills Sout~ Other    0.261   0.330     -0.253       0.317     -0.457
##  8 Adelaide Hills Sout~ Visiti~  0.139   0.117     -0.472       0.239     -0.626
##  9 Alice Springs  Nort~ Busine~  0.217   0.367     -0.500       0.381     -0.658
## 10 Alice Springs  Nort~ Holiday -0.00660 2.11      -0.153       2.11      -0.274
## # ... with 294 more rows, and 2 more variables: diff2_acf10 <dbl>,
## #   season_acf1 <dbl>

STL Features

tourism %>% mutate(Quarter = as.Date(Quarter)) %>% 
  group_by(Quarter, Region, State, Purpose) %>% 
  summarise(Trips = sum(Trips)) %>% ungroup()  %>%
  as_tsibble(index = Quarter, key = c(Region, State, Purpose)) %>%
  features(Trips, feat_stl)

## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.

## # A tibble: 304 x 12
##    Region         State Purpose trend_strength seasonal_streng~ seasonal_peak_w~
##    <chr>          <chr> <chr>            <dbl>            <dbl>            <dbl>
##  1 Adelaide       Sout~ Busine~         0.255            0.136                 3
##  2 Adelaide       Sout~ Holiday         0.235            0.0737                3
##  3 Adelaide       Sout~ Other           0.542            0.0996                0
##  4 Adelaide       Sout~ Visiti~         0.250            0.0404                5
##  5 Adelaide Hills Sout~ Busine~         0.308            0.153                 6
##  6 Adelaide Hills Sout~ Holiday         0.351            0.299                 4
##  7 Adelaide Hills Sout~ Other           0.282            0.306                 2
##  8 Adelaide Hills Sout~ Visiti~         0.339            0.167                 6
##  9 Alice Springs  Nort~ Busine~         0.484            0.328                 2
## 10 Alice Springs  Nort~ Holiday         0.0550           0.0310                3
## # ... with 294 more rows, and 6 more variables: seasonal_trough_week <dbl>,
## #   spikiness <dbl>, linearity <dbl>, curvature <dbl>, stl_e_acf1 <dbl>,
## #   stl_e_acf10 <dbl>

tourism %>% mutate(Quarter = as.Date(Quarter)) %>% 
  group_by(Quarter, Region, State, Purpose) %>% 
  summarise(Trips = sum(Trips)) %>% ungroup()  %>%
  as_tsibble(index = Quarter, key = c(Region, State, Purpose)) %>%
  features(Trips, feat_stl) %>% 
  ggplot(aes(x = trend_strength, y = seasonal_strength_week,
             col = Purpose)) +
  geom_point() +
  facet_wrap(vars(State))

## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.

Exploring Australian tourism data

tourism_features <- tourism %>%
  mutate(yearmonth = yearmonth(Quarter)) %>% 
  group_by(yearmonth, Region, State, Purpose) %>%
  summarise(Trips = sum(Trips)) %>%
  ungroup() %>%
  as_tsibble(index = yearmonth,
             key = c(Region, State, Purpose)) %>%
  features(Trips, feature_set(pkgs = "feasts"))

## `summarise()` has grouped output by 'yearmonth', 'Region', 'State'. You can override using the `.groups` argument.

## Warning: `n_flat_spots()` was deprecated in feasts 0.1.5.
## Please use `longest_flat_spot()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

data.table::as.data.table(tourism_features)

##               Region           State  Purpose trend_strength
##   1:        Adelaide South Australia Business      0.4638070
##   2:        Adelaide South Australia  Holiday      0.5542186
##   3:        Adelaide South Australia    Other      0.7457291
##   4:        Adelaide South Australia Visiting      0.4354986
##   5:  Adelaide Hills South Australia Business      0.4641066
##  ---                                                        
## 300:         Wimmera        Victoria Visiting      0.4713401
## 301: Yorke Peninsula South Australia Business      0.4255492
## 302: Yorke Peninsula South Australia  Holiday      0.4444709
## 303: Yorke Peninsula South Australia    Other      0.3396787
## 304: Yorke Peninsula South Australia Visiting      0.4776542
##      seasonal_strength_year seasonal_peak_year seasonal_trough_year
##   1:              0.4069585                  3                    1
##   2:              0.6187292                  1                    2
##   3:              0.2020916                  2                    1
##   4:              0.4518154                  1                    3
##   5:              0.1793947                  3                    0
##  ---                                                               
## 300:              0.1856479                  0                    3
## 301:              0.4052251                  3                    2
## 302:              0.8285899                  1                    3
## 303:              0.1829069                  3                    2
## 304:              0.4744151                  1                    3
##         spikiness  linearity curvature stl_e_acf1 stl_e_acf10         acf1
##   1: 1.584763e+02 -5.3126332 71.584178 -0.5323114   0.5939747  0.033270698
##   2: 9.167156e+00 49.0407477 78.741593 -0.5104023   0.5608486  0.045569980
##   3: 2.097208e+00 95.0866802 43.397808 -0.3513377   0.4033702  0.516805734
##   4: 5.612952e+01 34.5767879 71.375453 -0.5009820   1.0070351  0.068365364
##   5: 1.029723e-01  0.9684728 -3.218993 -0.5998442   0.4966204  0.070914982
##  ---                                                                      
## 300: 1.934541e-01 -6.8774356 12.293419 -0.3442779   0.2693228  0.157781095
## 301: 4.032906e-02 -7.0508954 -6.297616 -0.4496479   0.3776330  0.031072440
## 302: 3.779598e+00 19.2821863 -2.630294 -0.5013775   0.5355811 -0.005009535
## 303: 7.039313e-03  4.6116130  4.165383 -0.6572410   0.7372501 -0.107387687
## 304: 4.581695e-01  8.7233883 22.724485 -0.4070941   0.3536849  0.067882317
##           acf10 diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10  season_acf1
##   1: 0.13064555 -0.5196224   0.4625367 -0.6764291   0.7413571  0.200875465
##   2: 0.37189807 -0.3426251   0.6143649 -0.4868330   0.5582436  0.351151346
##   3: 1.15359755 -0.4086204   0.3828913 -0.6751196   0.7923533  0.341780651
##   4: 0.29423657 -0.3936543   0.4522912 -0.5179591   0.4472958  0.344930472
##   5: 0.13447305 -0.5795037   0.4145373 -0.7504035   0.7460039 -0.062808374
##  ---                                                                      
## 300: 0.08197425 -0.3940686   0.2330508 -0.6312346   0.5195782  0.009475965
## 301: 0.11829404 -0.4046258   0.2996577 -0.5586419   0.3850501  0.218627751
## 302: 2.01274146 -0.1444313   2.0602538 -0.2560175   1.5709391  0.578855599
## 303: 0.14948393 -0.6337660   0.6221219 -0.6892032   0.7790063  0.160689464
## 304: 0.19616217 -0.3520264   0.4193485 -0.5240023   0.4558189  0.225135365
##           pacf5 diff1_pacf5 diff2_pacf5 season_pacf zero_run_mean
##   1: 0.04792863   0.5603609   1.1300541  0.19571809      0.000000
##   2: 0.14985231   0.5379268   0.6975047  0.32944749      0.000000
##   3: 0.44673126   0.4758081   0.8503707  0.29463321      0.000000
##   4: 0.18562269   0.6306896   1.0003484  0.31401005      0.000000
##   5: 0.07570925   0.4084429   1.0004127 -0.11186325      1.470588
##  ---                                                             
## 300: 0.04951515   0.4033058   0.7416009  0.05036312      0.000000
## 301: 0.06199786   0.5192551   0.8138003  0.20483857      2.000000
## 302: 0.50330202   0.7819035   0.9643191  0.25446231      0.000000
## 303: 0.09473928   0.5306137   0.5994322  0.12430695      1.062500
## 304: 0.08411755   0.5548266   0.9489343  0.20169272      0.000000
##      nonzero_squared_cv zero_start_prop zero_end_prop lambda_guerrero
##   1:         0.05253095           0.000        0.0000      0.51426156
##   2:         0.02988750           0.000        0.0000     -0.16946420
##   3:         0.09398659           0.000        0.0000     -0.22712799
##   4:         0.02508551           0.000        0.0000      0.87027361
##   5:         1.48000233           0.025        0.0125      0.12417795
##  ---                                                                 
## 300:         0.25150909           0.000        0.0000      0.37635213
## 301:         0.46662715           0.000        0.0000      0.53473211
## 302:         0.10657602           0.000        0.0000      0.31004649
## 303:         0.90447870           0.000        0.0125      0.16413204
## 304:         0.12858538           0.000        0.0000     -0.04376923
##       kpss_stat kpss_pvalue    pp_stat pp_pvalue ndiffs nsdiffs      bp_stat
##   1: 0.22097756  0.10000000  -8.442164      0.01      0       0  0.088555145
##   2: 0.44856481  0.05622207  -8.452134      0.01      0       0  0.166129844
##   3: 1.40621121  0.01000000  -4.799338      0.01      1       0 21.367053370
##   4: 0.26365274  0.10000000  -8.207348      0.01      0       0  0.373905839
##   5: 0.05954492  0.10000000  -8.257608      0.01      0       0  0.402314775
##  ---                                                                        
## 300: 0.23362531  0.10000000  -7.433464      0.01      0       0  1.991589922
## 301: 0.28907527  0.10000000  -8.510207      0.01      0       0  0.077239724
## 302: 0.08743528  0.10000000 -10.012092      0.01      0       1  0.002007635
## 303: 0.48683801  0.04463108  -9.783816      0.01      1       0  0.922569233
## 304: 0.20190106  0.10000000  -7.966597      0.01      0       0  0.368640721
##         bp_pvalue      lb_stat    lb_pvalue var_tiled_var var_tiled_mean
##   1: 7.660221e-01  0.091917999 7.617528e-01     0.5052217     0.31776899
##   2: 6.835745e-01  0.172438572 6.779536e-01     0.3401192     0.31227922
##   3: 3.792324e-06 22.178460460 2.484455e-06     0.4720950     0.57371512
##   4: 5.408829e-01  0.388104795 5.332973e-01     0.5315660     0.26745418
##   5: 5.258962e-01  0.417592552 5.181408e-01     3.0933547     0.29365329
##  ---                                                                    
## 300: 1.581747e-01  2.067219919 1.504956e-01     1.0268901     0.29735856
## 301: 7.810735e-01  0.080172878 7.770633e-01     0.8889341     0.24336852
## 302: 9.642614e-01  0.002083875 9.635896e-01     0.1624422     0.06510158
## 303: 3.368013e-01  0.957603508 3.277914e-01     2.7993875     0.25287825
## 304: 5.437461e-01  0.382639736 5.361939e-01     0.7109478     0.21400002
##      shift_level_max shift_level_index shift_var_max shift_var_index
##   1:       69.558469                28    4383.15988              31
##   2:       60.047999                 2    3778.85614               3
##   3:       30.255541                63     754.48986              74
##   4:       45.772850                 2    2635.01493              66
##   5:       10.828389                31     163.26490              24
##  ---                                                                
## 300:       16.326673                19     222.56808              22
## 301:        7.347688                19      94.73032              63
## 302:       43.151041                 2    1242.07207              10
## 303:        3.424730                 2      25.77065              77
## 304:       13.417147                75     256.95876              25
##      shift_kl_max shift_kl_index spectral_entropy n_crossing_points
##   1:     3.393246             27        0.8433183                43
##   2:     1.876747             41        0.7526746                36
##   3:     8.890109             73        0.7930121                28
##   4:     2.208410              3        0.7681266                42
##   5:    27.793506             73        0.9907541                36
##  ---                                                               
## 300:     4.938615             18        0.9970247                41
## 301:     4.581601             27        1.0000000                41
## 302:     1.037124             73        0.4913552                40
## 303:     4.402667              2        0.9636301                43
## 304:     1.721313              9        1.0000000                44
##      longest_flat_spot coef_hurst stat_arch_lm
##   1:                 4  0.5708798   0.18290707
##   2:                 3  0.5575531   0.16104496
##   3:                 3  0.8621758   0.45592319
##   4:                 2  0.5827218   0.17333973
##   5:                10  0.5547129   0.01436185
##  ---                                          
## 300:                 3  0.5885009   0.06833493
## 301:                 3  0.5080092   0.08241034
## 302:                 2  0.5000458   0.41425311
## 303:                 4  0.5306369   0.79318205
## 304:                 2  0.5011130   0.10578048

library(glue)

## 
## Attaching package: 'glue'

## The following object is masked from 'package:dplyr':
## 
##     collapse

tourism_features %>%
  select_at(vars(contains("season"), Purpose)) %>%
  mutate(
    seasonal_peak_year = seasonal_peak_year +
      4*(seasonal_peak_year==0),
    seasonal_trough_year = seasonal_trough_year +
      4*(seasonal_trough_year==0),
    seasonal_peak_year = glue("Q{seasonal_peak_year}"),
    seasonal_trough_year = glue("Q{seasonal_trough_year}"),
  ) %>%
  GGally::ggpairs(mapping = aes(colour = Purpose)) +
  scale_fill_viridis_d()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Here, the Purpose variable is mapped to colour. There is a lot of information in this figure, and we will highlight just a few things we can learn.

The three numerical measures related to seasonality (seasonal_strength_year, season_acf1 and season_pacf) are all positively correlated.
The bottom left panel and the top right panel both show that the most strongly seasonal series in the bottom row of the seasonal_peak_year and seasonal_trough_year column show that seasonal peaks in Business travel occurs most often in Quarter3, and least often in Quarter 1.
The bar plots in the bottom row of the seasonal_peak_year and seasonal_trough_year columns show that seasonal peaks in Business travel occurs most often in Quarter 3, and least often in Quarter 1.

It is difficult to explore more than a handful of variables in this way. A useful way to handle many more variables is to use a dimension reduction technique such as principal components. This give linear combinations of variables that explain the most variation in the original data. We can compute the principal components of the tourism features as follows.

suppressMessages(library(tidyverse))
suppressMessages(library(tidymodels))

pcs <- tourism_features %>% 
  select(-State, -Region, -Purpose) %>%
  prcomp(scale = TRUE) %>%
  augment(tourism_features)

pcs %>%
  ggplot(aes(x = .fittedPC1, y = .fittedPC2, col = Purpose)) +
  geom_point() +
  theme(aspect.ratio = 1)

pcs %>%
  ggplot(aes(x = .fittedPC2, y = .fittedPC3, col = Purpose)) +
  geom_point() +
  theme(aspect.ratio = 1)

outliers <- pcs %>%
  filter(.fittedPC1 > 10) %>%
  select(Region, State, Purpose)

outliers %>%
  left_join(tourism %>%
  mutate(Quarter = yearmonth(Quarter)) %>% 
  group_by(Quarter, Region, State, Purpose) %>%
  summarise(Trips = sum(Trips)) %>%
  ungroup() %>%
  as_tsibble(index = Quarter,
             key = c(Region, State, Purpose)), by = c("State", "Region", "Purpose")) %>%
  mutate(
    Series = glue("{State}", "{Region}", "{Purpose}",
                  .sep = "\n\n")
  ) %>%
  ggplot(aes(x = Quarter, y = Trips)) +
  geom_line() +
  facet_grid(Series ~ ., scales = "free") +
  labs(title = "Outlying time series in PC space")

## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.

We can speculate why these series are identified as unusual

Holiday visits to the south coast of NSW is highly seasonal but almost no trend, whereas most holiday destination in Australia show some trend over time.
Melbourne is an unusual holiday destination because it has almost no seasonality, whereas most holiday destinations in Australia have highly seasonal tourism.
The north western corner of Western Australia is unusual because it shows an increase in business tourism in the last few year of data, but little or no seasonality
The south western corner of Western Australia is unusual because it shows both an increase in holiday tourism in the last few years in a high level of seasonality

#Max of average Cost

PBS %>% group_by(Concession, Type, ATC1) %>%
  summarise(mean_cost = mean(Cost, na.rm = TRUE), 
            sd_cost = sd(Cost, na.rm = TRUE)) %>% 
  ungroup() %>%
  inner_join (
PBS %>% group_by(Concession, Type, ATC1) %>%
  summarise(mean_cost = mean(Cost, na.rm = TRUE), 
            sd_cost = sd(Cost, na.rm = TRUE)) %>% 
  ungroup() %>%
  filter(mean_cost == max(mean_cost)),
by = c("Concession" = "Concession", "Type" = "Type", "ATC1" = "ATC1")
) %>% 
  autoplot()

## Plot variable not specified, automatically selected `.vars = mean_cost.x`

#Max of sd cost

PBS %>% group_by(Concession, Type, ATC1) %>%
  summarise(mean_cost = mean(Cost, na.rm = TRUE), 
            sd_cost = sd(Cost, na.rm = TRUE)) %>% 
  ungroup() %>%
  inner_join (
PBS %>% group_by(Concession, Type, ATC1) %>%
  summarise(mean_cost = mean(Cost, na.rm = TRUE), 
            sd_cost = sd(Cost, na.rm = TRUE)) %>% 
  ungroup() %>%
  filter(sd_cost == max(sd_cost, na.rm = TRUE)),
by = c("Concession" = "Concession", "Type" = "Type", "ATC1" = "ATC1")
) %>% 
  autoplot()

## Plot variable not specified, automatically selected `.vars = mean_cost.x`

tourism %>%
  mutate(yearmonth = yearmonth(Quarter)) %>% 
  filter(Purpose == "Holiday") %>%
  group_by(yearmonth, Region, State, Purpose) %>%
  summarise(Trips = sum(Trips)) %>%
  ungroup() %>%
  as_tsibble(index = yearmonth,
             key = c(Region, State, Purpose)) %>%
  features(Trips, feature_set(pkgs = "feasts")) %>%
  select_at(vars(contains("season"),  State))

## `summarise()` has grouped output by 'yearmonth', 'Region', 'State'. You can override using the `.groups` argument.

## # A tibble: 76 x 6
##    seasonal_strength_~ seasonal_peak_y~ seasonal_trough~ season_acf1 season_pacf
##                  <dbl>            <dbl>            <dbl>       <dbl>       <dbl>
##  1               0.619                1                2      0.351      0.329  
##  2               0.296                2                1      0.208      0.181  
##  3               0.832                3                1      0.729      0.495  
##  4               0.308                3                0      0.400      0.208  
##  5               0.393                1                0      0.460      0.396  
##  6               0.876                3                1      0.640      0.467  
##  7               0.875                1                3      0.777      0.620  
##  8               0.337                1                3      0.0496     0.00825
##  9               0.626                3                1      0.412      0.322  
## 10               0.194                2                3     -0.146     -0.177  
## # ... with 66 more rows, and 1 more variable: State <chr>

Untitled

Nguyen_LSCM

1/31/2022