suppressMessages(library(fpp3))
## Warning: package 'lubridate' was built under R version 4.1.2
## Warning: package 'tsibble' was built under R version 4.1.2
## Warning: package 'tsibbledata' was built under R version 4.1.2
## Warning: package 'feasts' was built under R version 4.1.2
## Warning: package 'fabletools' was built under R version 4.1.2
## Warning: package 'fable' was built under R version 4.1.2
y <- tsibble(
Year = 2015:2019,
Observation = c(123, 39, 78, 52, 110),
index = Year
)
y
## # A tsibble: 5 x 2 [1Y]
## Year Observation
## <int> <dbl>
## 1 2015 123
## 2 2016 39
## 3 2017 78
## 4 2018 52
## 5 2019 110
prison <- readr::read_csv("https://OTexts.com/fpp3/extrafiles/prison_population.csv")
## Rows: 3072 Columns: 6
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (4): State, Gender, Legal, Indigenous
## dbl (1): Count
## date (1): Date
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
prison <- prison %>%
mutate(Quarter = yearquarter(Date)) %>%
select(-Date) %>%
as_tsibble(key = c(State, Gender, Legal, Indigenous),
index = Quarter)
prison
## # A tsibble: 3,072 x 6 [1Q]
## # Key: State, Gender, Legal, Indigenous [64]
## State Gender Legal Indigenous Count Quarter
## <chr> <chr> <chr> <chr> <dbl> <qtr>
## 1 ACT Female Remanded ATSI 0 2005 Q1
## 2 ACT Female Remanded Non-ATSI 2 2005 Q1
## 3 ACT Female Sentenced ATSI 0 2005 Q1
## 4 ACT Female Sentenced Non-ATSI 5 2005 Q1
## 5 ACT Male Remanded ATSI 7 2005 Q1
## 6 ACT Male Remanded Non-ATSI 58 2005 Q1
## 7 ACT Male Sentenced ATSI 5 2005 Q1
## 8 ACT Male Sentenced Non-ATSI 101 2005 Q1
## 9 NSW Female Remanded ATSI 51 2005 Q1
## 10 NSW Female Remanded Non-ATSI 131 2005 Q1
## # ... with 3,062 more rows
theme_set(theme_light())
melsyd_economy <- ansett %>%
filter(Airports == "MEL-SYD", Class == "Economy") %>%
mutate(Passengers = Passengers/1000)
autoplot(melsyd_economy, Passengers) +
labs(title = "Ansett airlines economy class",
subtitle = "Melbourne-Sydney",
y = "Passengers ('000)")
The time plot immediately reveals some interesting features:
- There was a period in 1989 when no passengers were carried; this was due to an industrial dispute.
- There was a period of reduced load in 1992, when a trial replaced some economy class seats with business class seats.
- A large increase in passenger load occurred in the second half of 1991.
- There are some large dips in load around the start of each year, due to holiday effects.
- There is a long-term fluctuation in the level of the series: it increases during 1987, decreases in 1989, and increases again through 1990 and 1991.
Any model will need to take all these features into account in order to forecast the passenger load into the future effectively.
A simpler time series is shown in Figure 2.2, using the a10 data saved earlier.
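The a10 series is not rebuilt in this section. If it is not already in the workspace, a minimal sketch to recreate and plot it from the PBS data, following the book's definition, is:
# Sketch: antidiabetic (ATC2 == "A10") drug sales, scaled to $AUD millions
a10 <- PBS %>%
  filter(ATC2 == "A10") %>%
  select(Month, Concession, Type, Cost) %>%
  summarise(TotalC = sum(Cost)) %>%
  mutate(Cost = TotalC / 1e6)
autoplot(a10, Cost) +
  labs(y = "$ (millions)", title = "Australian antidiabetic drug sales")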
vic_elec %>% gg_season(Demand, period = "year") +
theme(legend.position = "none") +
labs(y="MWh", title="Electricity demand: Victoria")
vic_elec %>% gg_season(Demand, period = "week") +
theme(legend.position = "none") +
labs(y="MWh", title="Electricity demand: Victoria")
vic_elec %>% gg_season(Demand, period = "year") +
labs(y="MWh", title="Electricity demand: Victoria")
holidays <- tourism %>%
filter(Purpose == "Holiday") %>%
group_by(State) %>%
summarise(Trips = sum(Trips))
autoplot(holidays, Trips) +
labs(y = "Overnight trips ('000)",
title = "Australian domestic holidays")
gg_season(holidays, Trips) +
labs(y = "Overnight trips ('000)",
title = "Australian domestic holidays")
holidays %>%
gg_subseries(Trips) +
labs(y = "Overnight trips ('000)",
title = "Australian domestic holidays")
vic_elec %>%
filter(year(Time) == 2014) %>%
autoplot(Demand) +
labs(y = "GW",
title = "Half-hourly electricity demand: Victoria")
vic_elec %>%
filter(year(Time) == 2014) %>%
autoplot(Temperature) +
labs(
y = "Degrees Celsius",
title = "Half-hourly temperatures: Melbourne, Australia"
)
We can study the relationship between demand and temperature by plotting one series against the other.
vic_elec %>%
filter(year(Time) == 2014) %>%
ggplot(aes(x = Temperature, y = Demand)) +
geom_point() +
labs(x = "Temperature (degrees Celsius)",
y = "Electricity demand (GW)")
When there are several potential predictor variables, it is useful to plot each variable against each other variable. Consider the eight time series showing quarterly visitor numbers across states and territories of Australia.
visitors <- tourism %>%
group_by(State) %>%
summarise(Trips = sum(Trips))
visitors %>%
ggplot(aes(x = Quarter, y = Trips)) +
geom_line() +
facet_grid(vars(State), scales = "free_y") +
labs(title = "Australian domestic tourism",
y= "Overnight trips ('000)")
To see the relationships between these eight time series, we can plot each time series against the others. These plots can be arranged in a scatterplot matrix.
visitors %>%
pivot_wider(values_from = Trips, names_from = State) %>%
GGally::ggpairs(columns = 2:9)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
The lag plot displays scatterplots of quarterly Australian beer production, where the horizontal axis shows lagged values of the time series. Each graph shows y_t plotted against y_{t-k} for different values of k.
recent_production <- aus_production %>%
filter(year(Quarter) >= 2000)
recent_production %>%
gg_lag(Beer, geom = 'point') +
labs(x = "lag(Beer, k)")
Here the colours indicate the quarter of the variable on the vertical axis. The relationship is strongly positive at lags 4 and 8, reflecting the strong seasonality in the data.
recent_production %>% ACF(Beer, lag_max = 9)
## # A tsibble: 9 x 2 [1Q]
## lag acf
## <lag> <dbl>
## 1 1Q -0.0530
## 2 2Q -0.758
## 3 3Q -0.0262
## 4 4Q 0.802
## 5 5Q -0.0775
## 6 6Q -0.657
## 7 7Q 0.00119
## 8 8Q 0.707
## 9 9Q -0.0888
The values in the acf column are r1, ..., r9, corresponding to the nine scatterplots above. A plot of the ACF is sometimes known as a correlogram.
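Each r_k is the sample autocorrelation r_k = sum_{t=k+1}^{T} (y_t - ybar)(y_{t-k} - ybar) / sum_{t=1}^{T} (y_t - ybar)^2. As a sketch, the lag-4 value can be reproduced by hand (using recent_production from above):
# Sketch: lag-4 autocorrelation computed manually; should match 0.802 above
beer <- recent_production$Beer
ybar <- mean(beer)
k <- 4
sum((beer[-(1:k)] - ybar) * (head(beer, -k) - ybar)) / sum((beer - ybar)^2)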
recent_production %>%
ACF(Beer) %>%
autoplot() +
labs(title = "Australian beer production")
In this graph, r4 is higher than for the other lags because the seasonal pattern in the data means the peaks tend to be four quarters apart, while r2 is more negative than the other lags because troughs tend to be two quarters behind peaks.
set.seed(30)
y <- tsibble(sample = 1:50, wn = rnorm(50), index = sample)
y %>% autoplot(wn) + labs(title = "White noise", y = "")
y %>%
ACF(wn) %>%
autoplot() + labs(title = "White noise")
For white noise series, we expect each autocorrelation to be close to zero. Of course, they will not be exactly equal to zero, as there is some random variation.
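The dashed bounds on the ACF plot lie at +/- 1.96/sqrt(T); for white noise we expect about 95% of the autocorrelations to fall inside them. For this series of length 50:
# 95% bounds for white-noise autocorrelations with T = 50
T_len <- 50
c(lower = -1.96 / sqrt(T_len), upper = 1.96 / sqrt(T_len))
## lower upper
## -0.2771859 0.2771859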
Exercise: explore what gafa_stock, PBS, vic_elec and pelt represent. gafa_stock is a tsibble containing data on irregular trading days; PBS is a tsibble with two measured values (Scripts: the total number of scripts, and Cost: the cost of the scripts in $AUD); vic_elec is a tsibble with three values. Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock.
gafa_stock %>%
group_by(Symbol) %>%
mutate(max_closing_price = max(Close),
diff_max = max_closing_price - Close) %>%
ungroup() %>%
filter(diff_max == 0) %>% distinct()
## # A tibble: 4 x 10
## Symbol Date Open High Low Close Adj_Close Volume max_closing_price
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 AAPL 2018-10-03 230. 233. 230. 232. 230. 28654800 232.
## 2 AMZN 2018-09-04 2026. 2050. 2013 2040. 2040. 5721100 2040.
## 3 FB 2018-07-25 216. 219. 214. 218. 218. 58954200 218.
## 4 GOOG 2018-07-26 1251 1270. 1249. 1268. 1268. 2405600 1268.
## # ... with 1 more variable: diff_max <dbl>
setwd("C:/Users/DellPC/Desktop")
suppressMessages(library(tidyverse))
tute1 <- read.csv("tute1.csv")
View(tute1)
mytimeseries <- tute1 %>%
mutate(Quarter = yearquarter(Quarter)) %>%
as_tsibble(index = Quarter)
mytimeseries
## # A tsibble: 100 x 4 [1Q]
## Quarter Sales AdBudget GDP
## <qtr> <dbl> <dbl> <dbl>
## 1 1981 Q1 1020. 659. 252.
## 2 1981 Q2 889. 589 291.
## 3 1981 Q3 795 512. 291.
## 4 1981 Q4 1004. 614. 292.
## 5 1982 Q1 1058. 647. 279.
## 6 1982 Q2 944. 602 254
## 7 1982 Q3 778. 531. 296.
## 8 1982 Q4 932. 608. 272.
## 9 1983 Q1 996. 638. 260.
## 10 1983 Q2 908. 582. 280.
## # ... with 90 more rows
mytimeseries %>%
pivot_longer(-Quarter) %>%
ggplot(aes(x = Quarter, y = value, colour = name)) +
geom_line() +
facet_grid(name ~ ., scales = "free_y")
The USgas package contains data on the demand for natural gas in the US.
suppressMessages(library(USgas))
## Warning: package 'USgas' was built under R version 4.1.2
us_total <- as_tsibble(us_total, index = year, key = state)
new_en_area <- us_total %>%
filter(state %in% c('Maine', 'Vermont', 'New Hampshire', 'Massachusetts', 'Connecticut', 'Rhode Island'))
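new_en_area is created but never plotted; a minimal sketch (assuming, as in USgas::us_total, that the consumption column is named y):
# Sketch: annual natural gas consumption, one line per New England state
autoplot(new_en_area, y) +
  labs(y = "Consumption", title = "Annual natural gas consumption: New England")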
suppressMessages(library(readxl))
tourism <- read_excel('tourism.xlsx')
lambda <- aus_production %>%
features(Gas, features = guerrero) %>%
pull(lambda_guerrero)
lambda
## [1] 0.1205077
aus_production %>%
autoplot(box_cox(Gas, lambda)) +
labs(y = "",
title = latex2exp::TeX(paste0(
"Transformed gas production with $\\lambda$ = ",
round(lambda,2))))
We will look at several methods for obtaining the components S_t, T_t and R_t, which (for an additive decomposition) combine as y_t = S_t + T_t + R_t. We will decompose the number of persons employed in retail. The data shows the total monthly number of persons, in thousands, employed in the retail sector across the US since 1990.
us_retail_employment <- us_employment %>%
filter(year(Month) >= 1990, Title == "Retail Trade") %>%
select(-Series_ID)
autoplot(us_retail_employment, Employed) +
labs(y = "Persons (thousands)",
title = "Total employment in uS retail")
To illustrate the ideas, we will use the STL decomposition method.
dcmp <- us_retail_employment %>%
model(stl = STL(Employed))
components(dcmp)
## # A dable: 357 x 7 [1M]
## # Key: .model [1]
## # STL Decomposition: Employed = trend + season_year + remainder
## .model Month Employed trend season_year remainder season_adjust
## <chr> <mth> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 stl 1990 Jan 13256. 13288. -33.0 0.836 13289.
## 2 stl 1990 Feb 12966. 13269. -258. -44.6 13224.
## 3 stl 1990 Mar 12938. 13250. -290. -22.1 13228.
## 4 stl 1990 Apr 13012. 13231. -220. 1.05 13232.
## 5 stl 1990 May 13108. 13211. -114. 11.3 13223.
## 6 stl 1990 Jun 13183. 13192. -24.3 15.5 13207.
## 7 stl 1990 Jul 13170. 13172. -23.2 21.6 13193.
## 8 stl 1990 Aug 13160. 13151. -9.52 17.8 13169.
## 9 stl 1990 Sep 13113. 13131. -39.5 22.0 13153.
## 10 stl 1990 Oct 13185. 13110. 61.6 13.2 13124.
## # ... with 347 more rows
The trend column (containing the trend-cycle Tt) follows the overall movement of the series, ignoring any seasonality and random fluctuations
components(dcmp) %>%
as_tsibble() %>%
autoplot(Employed, colour = "gray") +
geom_line(aes(y = trend), colour = "#D55E00") +
geom_line(aes(y = season_year), colour = "steelblue") +
geom_line(aes(y = remainder), colour = "green", linetype = 1) +
labs(
y = "Persons (thousands)",
title = "Total employment in US retail"
)
We can plot all of the components in a single figure using autoplot().
components(dcmp) %>% autoplot()
The three components are shown separately in the bottom three panels. These components can be added together to reconstruct the data shown in the top panel. Notice that the seasonal component changes over time, so that any two consecutive years have similar patterns, but years far apart may have different seasonal patterns. The remainder component shown in the bottom panel is what is left over when the seasonal and trend-cycle components have been subtracted from the data.
The grey bars to the left of each panel show the relative scales of the components. Each grey bar represents the same length, but because the plots are on different scales, the bars vary in size. The large grey bar in the bottom panel shows that the variation in the remainder component is small compared to the variation in the data.
If the seasonal component is removed from the original data, the resulting values are the "seasonally adjusted" data.
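For an additive decomposition the seasonally adjusted series is simply the data minus the seasonal component, which can be verified directly (a sketch using the dcmp object from above):
# Sketch: season_adjust should equal Employed - season_year (up to rounding)
components(dcmp) %>%
  as_tibble() %>%
  summarise(max_abs_diff = max(abs(Employed - season_year - season_adjust)))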
components(dcmp)
## # A dable: 357 x 7 [1M]
## # Key: .model [1]
## # STL Decomposition: Employed = trend + season_year + remainder
## .model Month Employed trend season_year remainder season_adjust
## <chr> <mth> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 stl 1990 Jan 13256. 13288. -33.0 0.836 13289.
## 2 stl 1990 Feb 12966. 13269. -258. -44.6 13224.
## 3 stl 1990 Mar 12938. 13250. -290. -22.1 13228.
## 4 stl 1990 Apr 13012. 13231. -220. 1.05 13232.
## 5 stl 1990 May 13108. 13211. -114. 11.3 13223.
## 6 stl 1990 Jun 13183. 13192. -24.3 15.5 13207.
## 7 stl 1990 Jul 13170. 13172. -23.2 21.6 13193.
## 8 stl 1990 Aug 13160. 13151. -9.52 17.8 13169.
## 9 stl 1990 Sep 13113. 13131. -39.5 22.0 13153.
## 10 stl 1990 Oct 13185. 13110. 61.6 13.2 13124.
## # ... with 347 more rows
components(dcmp) %>%
as_tsibble() %>%
autoplot(Employed, colour = "gray") +
geom_line(aes(y=season_adjust), colour = "#0072B2") +
labs(y = "Persons (thousands)",
title ="Total employment in US retail")
If the variation due to seasonality is not of primary interest, the seasonally adjusted series can be useful. For example, monthly unemployment data are usually seasonally adjusted in order to highlight variation due to the underlying state of the economy rather than the seasonal variation.
global_economy %>%
filter(Country == "Australia") %>%
autoplot(Exports) +
labs(y = "% of GDP", title = "Total Australian exports")
A moving average of order m averages each observation together with the k = (m-1)/2 observations on either side, so a 5-MA uses each value and its two neighbours on each side. This is easily computed using slide_dbl() from the slider package, which applies a function to "sliding" time windows. In this case, we use the mean() function with a window of size 5.
aus_exports <- global_economy %>%
filter(Country == "Australia") %>%
mutate(
`5-MA` = slider::slide_dbl(Exports, mean,
.before = 2,
.after = 2,
.complete = TRUE)
)
To see what the trend-cycle estimate looks like, we plot it along with the original data.
aus_exports %>%
autoplot(Exports) +
geom_line(aes(y = `5-MA`), colour = "#D55E00") +
labs(y = "% of GDP",
title = "Total Australian exports")
## Warning: Removed 4 row(s) containing missing values (geom_path).
Notice that the trend-cycle (in orange) is smoother than the original data and captures the main movement of the time series without all of the minor fluctuations. In general, a larger order m means a smoother curve. Simple moving averages such as these are usually of an odd order.
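To see the effect of the order, here is a sketch of a 9-MA computed the same way; the wider window gives a smoother curve:
# Sketch: order-9 moving average, for comparison with the 5-MA above
aus_exports %>%
  mutate(`9-MA` = slider::slide_dbl(Exports, mean,
                                    .before = 4, .after = 4,
                                    .complete = TRUE)) %>%
  autoplot(Exports) +
  geom_line(aes(y = `9-MA`), colour = "#D55E00") +
  labs(y = "% of GDP", title = "Total Australian exports (9-MA)")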
x11_dcmp <- us_retail_employment %>%
model(x11 = X_13ARIMA_SEATS(Employed ~ x11())) %>%
components()
autoplot(x11_dcmp) +
labs(title = "Decomposition of total US retail employment using X-11")
x11_dcmp %>%
ggplot(aes(x = Month)) +
geom_line(aes(y = Employed, colour = "Data")) +
geom_line(aes(y = season_adjust,
colour = "Seasonally Adjusted")) +
geom_line(aes(y = trend, colour = "Trend")) +
labs(y = "Persons (thousands)",
title = "Total employment in US retail") +
scale_colour_manual(
values = c("gray", "#0072B2", "#D55E00"),
breaks = c("Data", "Seasonally Adjusted", "Trend")
)
It is really hard to differentiate the trend line from the seasonally adjusted line in this figure.
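One workaround is to facet the three series rather than overlay them; a sketch:
# Sketch: separate panels make the trend and seasonally adjusted lines easier to compare
x11_dcmp %>%
  as_tibble() %>%
  select(Month, Employed, season_adjust, trend) %>%
  pivot_longer(-Month, names_to = "series") %>%
  ggplot(aes(x = Month, y = value)) +
  geom_line() +
  facet_grid(series ~ ., scales = "free_y")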
x11_dcmp %>%
gg_subseries(seasonal)
seats_dcmp <- us_retail_employment %>%
model(seats = X_13ARIMA_SEATS(Employed ~ seats())) %>%
components()
autoplot(seats_dcmp) +
labs(title = "Decomposition of total US retail employment using SEATS")
us_retail_employment %>%
model(
STL(Employed ~ trend(window = 7) + season(window = "periodic"),
robust = TRUE)
) %>%
components() %>%
autoplot()
The two main parameters to be chosen when using STL are the trend-cycle window, trend(window = ?), and the seasonal window, season(window = ?). These control how rapidly the trend-cycle and seasonal components can change; small values allow for more rapid changes. Both trend and seasonal windows should be odd numbers:
- the trend window is the number of consecutive observations to be used when estimating the trend-cycle;
- the seasonal window is the number of consecutive years to be used in estimating each value in the seasonal component.
lambda <- canadian_gas %>%
features(Volume, features = guerrero) %>%
pull(lambda_guerrero)
canadian_gas %>%
autoplot(box_cox(Volume, lambda))
A good transformation is one which makes the size of the seasonal variation about the same across the whole series; this makes the forecasting model simpler.
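For reference, box_cox() applies the modified Box-Cox transformation: log(y) when lambda = 0, and (sign(y) |y|^lambda - 1) / lambda otherwise. A hand-rolled sketch, checked against the fabletools version:
# Sketch of the modified Box-Cox transformation used by box_cox()
box_cox_manual <- function(y, lambda) {
  if (lambda == 0) log(y) else (sign(y) * abs(y)^lambda - 1) / lambda
}
all.equal(box_cox_manual(canadian_gas$Volume, lambda),
          as.numeric(box_cox(canadian_gas$Volume, lambda)))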
us_retail_employment %>%
features(Employed, features = guerrero) %>%
pull(lambda_guerrero)
## [1] 1.680167
lambda <- us_retail_employment %>%
features(Employed, features = guerrero) %>%
pull(lambda_guerrero)
us_retail_employment %>%
mutate(Employed_Box_Cox = box_cox(Employed, lambda))
## # A tsibble: 357 x 4 [1M]
## Month Title Employed Employed_Box_Cox
## <mth> <chr> <dbl> <dbl>
## 1 1990 Jan Retail Trade 13256. 5023161.
## 2 1990 Feb Retail Trade 12966. 4840214.
## 3 1990 Mar Retail Trade 12938. 4822603.
## 4 1990 Apr Retail Trade 13012. 4869099.
## 5 1990 May Retail Trade 13108. 4929606.
## 6 1990 Jun Retail Trade 13183. 4976771.
## 7 1990 Jul Retail Trade 13170. 4968718.
## 8 1990 Aug Retail Trade 13160. 4962000.
## 9 1990 Sep Retail Trade 13113. 4932829.
## 10 1990 Oct Retail Trade 13185. 4978293.
## # ... with 347 more rows
tourism %>% mutate(Quarter = as.Date(Quarter)) %>%
group_by(Quarter, Region, State, Purpose) %>%
summarise(Trips = sum(Trips)) %>% ungroup() %>%
as_tsibble(index = Quarter, key = c(Region, State, Purpose)) %>%
features(Trips, feat_acf)
## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.
## # A tibble: 304 x 10
## Region State Purpose acf1 acf10 diff1_acf1 diff1_acf10 diff2_acf1
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Adelaide Sout~ Busine~ 0.0333 0.131 -0.520 0.463 -0.676
## 2 Adelaide Sout~ Holiday 0.0456 0.372 -0.343 0.614 -0.487
## 3 Adelaide Sout~ Other 0.517 1.15 -0.409 0.383 -0.675
## 4 Adelaide Sout~ Visiti~ 0.0684 0.294 -0.394 0.452 -0.518
## 5 Adelaide Hills Sout~ Busine~ 0.0709 0.134 -0.580 0.415 -0.750
## 6 Adelaide Hills Sout~ Holiday 0.131 0.313 -0.536 0.500 -0.716
## 7 Adelaide Hills Sout~ Other 0.261 0.330 -0.253 0.317 -0.457
## 8 Adelaide Hills Sout~ Visiti~ 0.139 0.117 -0.472 0.239 -0.626
## 9 Alice Springs Nort~ Busine~ 0.217 0.367 -0.500 0.381 -0.658
## 10 Alice Springs Nort~ Holiday -0.00660 2.11 -0.153 2.11 -0.274
## # ... with 294 more rows, and 2 more variables: diff2_acf10 <dbl>,
## # season_acf1 <dbl>
tourism %>% mutate(Quarter = as.Date(Quarter)) %>%
group_by(Quarter, Region, State, Purpose) %>%
summarise(Trips = sum(Trips)) %>% ungroup() %>%
as_tsibble(index = Quarter, key = c(Region, State, Purpose)) %>%
features(Trips, feat_stl)
## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.
## # A tibble: 304 x 12
## Region State Purpose trend_strength seasonal_streng~ seasonal_peak_w~
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 Adelaide Sout~ Busine~ 0.255 0.136 3
## 2 Adelaide Sout~ Holiday 0.235 0.0737 3
## 3 Adelaide Sout~ Other 0.542 0.0996 0
## 4 Adelaide Sout~ Visiti~ 0.250 0.0404 5
## 5 Adelaide Hills Sout~ Busine~ 0.308 0.153 6
## 6 Adelaide Hills Sout~ Holiday 0.351 0.299 4
## 7 Adelaide Hills Sout~ Other 0.282 0.306 2
## 8 Adelaide Hills Sout~ Visiti~ 0.339 0.167 6
## 9 Alice Springs Nort~ Busine~ 0.484 0.328 2
## 10 Alice Springs Nort~ Holiday 0.0550 0.0310 3
## # ... with 294 more rows, and 6 more variables: seasonal_trough_week <dbl>,
## # spikiness <dbl>, linearity <dbl>, curvature <dbl>, stl_e_acf1 <dbl>,
## # stl_e_acf10 <dbl>
tourism %>% mutate(Quarter = as.Date(Quarter)) %>%
group_by(Quarter, Region, State, Purpose) %>%
summarise(Trips = sum(Trips)) %>% ungroup() %>%
as_tsibble(index = Quarter, key = c(Region, State, Purpose)) %>%
features(Trips, feat_stl) %>%
ggplot(aes(x = trend_strength, y = seasonal_strength_week,
col = Purpose)) +
geom_point() +
facet_wrap(vars(State))
## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.
tourism_features <- tourism %>%
mutate(yearmonth = yearmonth(Quarter)) %>%
group_by(yearmonth, Region, State, Purpose) %>%
summarise(Trips = sum(Trips)) %>%
ungroup() %>%
as_tsibble(index = yearmonth,
key = c(Region, State, Purpose)) %>%
features(Trips, feature_set(pkgs = "feasts"))
## `summarise()` has grouped output by 'yearmonth', 'Region', 'State'. You can override using the `.groups` argument.
## Warning: `n_flat_spots()` was deprecated in feasts 0.1.5.
## Please use `longest_flat_spot()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
data.table::as.data.table(tourism_features)
## Region State Purpose trend_strength
## 1: Adelaide South Australia Business 0.4638070
## 2: Adelaide South Australia Holiday 0.5542186
## 3: Adelaide South Australia Other 0.7457291
## 4: Adelaide South Australia Visiting 0.4354986
## 5: Adelaide Hills South Australia Business 0.4641066
## ---
## 300: Wimmera Victoria Visiting 0.4713401
## 301: Yorke Peninsula South Australia Business 0.4255492
## 302: Yorke Peninsula South Australia Holiday 0.4444709
## 303: Yorke Peninsula South Australia Other 0.3396787
## 304: Yorke Peninsula South Australia Visiting 0.4776542
## seasonal_strength_year seasonal_peak_year seasonal_trough_year
## 1: 0.4069585 3 1
## 2: 0.6187292 1 2
## 3: 0.2020916 2 1
## 4: 0.4518154 1 3
## 5: 0.1793947 3 0
## ---
## 300: 0.1856479 0 3
## 301: 0.4052251 3 2
## 302: 0.8285899 1 3
## 303: 0.1829069 3 2
## 304: 0.4744151 1 3
## spikiness linearity curvature stl_e_acf1 stl_e_acf10 acf1
## 1: 1.584763e+02 -5.3126332 71.584178 -0.5323114 0.5939747 0.033270698
## 2: 9.167156e+00 49.0407477 78.741593 -0.5104023 0.5608486 0.045569980
## 3: 2.097208e+00 95.0866802 43.397808 -0.3513377 0.4033702 0.516805734
## 4: 5.612952e+01 34.5767879 71.375453 -0.5009820 1.0070351 0.068365364
## 5: 1.029723e-01 0.9684728 -3.218993 -0.5998442 0.4966204 0.070914982
## ---
## 300: 1.934541e-01 -6.8774356 12.293419 -0.3442779 0.2693228 0.157781095
## 301: 4.032906e-02 -7.0508954 -6.297616 -0.4496479 0.3776330 0.031072440
## 302: 3.779598e+00 19.2821863 -2.630294 -0.5013775 0.5355811 -0.005009535
## 303: 7.039313e-03 4.6116130 4.165383 -0.6572410 0.7372501 -0.107387687
## 304: 4.581695e-01 8.7233883 22.724485 -0.4070941 0.3536849 0.067882317
## acf10 diff1_acf1 diff1_acf10 diff2_acf1 diff2_acf10 season_acf1
## 1: 0.13064555 -0.5196224 0.4625367 -0.6764291 0.7413571 0.200875465
## 2: 0.37189807 -0.3426251 0.6143649 -0.4868330 0.5582436 0.351151346
## 3: 1.15359755 -0.4086204 0.3828913 -0.6751196 0.7923533 0.341780651
## 4: 0.29423657 -0.3936543 0.4522912 -0.5179591 0.4472958 0.344930472
## 5: 0.13447305 -0.5795037 0.4145373 -0.7504035 0.7460039 -0.062808374
## ---
## 300: 0.08197425 -0.3940686 0.2330508 -0.6312346 0.5195782 0.009475965
## 301: 0.11829404 -0.4046258 0.2996577 -0.5586419 0.3850501 0.218627751
## 302: 2.01274146 -0.1444313 2.0602538 -0.2560175 1.5709391 0.578855599
## 303: 0.14948393 -0.6337660 0.6221219 -0.6892032 0.7790063 0.160689464
## 304: 0.19616217 -0.3520264 0.4193485 -0.5240023 0.4558189 0.225135365
## pacf5 diff1_pacf5 diff2_pacf5 season_pacf zero_run_mean
## 1: 0.04792863 0.5603609 1.1300541 0.19571809 0.000000
## 2: 0.14985231 0.5379268 0.6975047 0.32944749 0.000000
## 3: 0.44673126 0.4758081 0.8503707 0.29463321 0.000000
## 4: 0.18562269 0.6306896 1.0003484 0.31401005 0.000000
## 5: 0.07570925 0.4084429 1.0004127 -0.11186325 1.470588
## ---
## 300: 0.04951515 0.4033058 0.7416009 0.05036312 0.000000
## 301: 0.06199786 0.5192551 0.8138003 0.20483857 2.000000
## 302: 0.50330202 0.7819035 0.9643191 0.25446231 0.000000
## 303: 0.09473928 0.5306137 0.5994322 0.12430695 1.062500
## 304: 0.08411755 0.5548266 0.9489343 0.20169272 0.000000
## nonzero_squared_cv zero_start_prop zero_end_prop lambda_guerrero
## 1: 0.05253095 0.000 0.0000 0.51426156
## 2: 0.02988750 0.000 0.0000 -0.16946420
## 3: 0.09398659 0.000 0.0000 -0.22712799
## 4: 0.02508551 0.000 0.0000 0.87027361
## 5: 1.48000233 0.025 0.0125 0.12417795
## ---
## 300: 0.25150909 0.000 0.0000 0.37635213
## 301: 0.46662715 0.000 0.0000 0.53473211
## 302: 0.10657602 0.000 0.0000 0.31004649
## 303: 0.90447870 0.000 0.0125 0.16413204
## 304: 0.12858538 0.000 0.0000 -0.04376923
## kpss_stat kpss_pvalue pp_stat pp_pvalue ndiffs nsdiffs bp_stat
## 1: 0.22097756 0.10000000 -8.442164 0.01 0 0 0.088555145
## 2: 0.44856481 0.05622207 -8.452134 0.01 0 0 0.166129844
## 3: 1.40621121 0.01000000 -4.799338 0.01 1 0 21.367053370
## 4: 0.26365274 0.10000000 -8.207348 0.01 0 0 0.373905839
## 5: 0.05954492 0.10000000 -8.257608 0.01 0 0 0.402314775
## ---
## 300: 0.23362531 0.10000000 -7.433464 0.01 0 0 1.991589922
## 301: 0.28907527 0.10000000 -8.510207 0.01 0 0 0.077239724
## 302: 0.08743528 0.10000000 -10.012092 0.01 0 1 0.002007635
## 303: 0.48683801 0.04463108 -9.783816 0.01 1 0 0.922569233
## 304: 0.20190106 0.10000000 -7.966597 0.01 0 0 0.368640721
## bp_pvalue lb_stat lb_pvalue var_tiled_var var_tiled_mean
## 1: 7.660221e-01 0.091917999 7.617528e-01 0.5052217 0.31776899
## 2: 6.835745e-01 0.172438572 6.779536e-01 0.3401192 0.31227922
## 3: 3.792324e-06 22.178460460 2.484455e-06 0.4720950 0.57371512
## 4: 5.408829e-01 0.388104795 5.332973e-01 0.5315660 0.26745418
## 5: 5.258962e-01 0.417592552 5.181408e-01 3.0933547 0.29365329
## ---
## 300: 1.581747e-01 2.067219919 1.504956e-01 1.0268901 0.29735856
## 301: 7.810735e-01 0.080172878 7.770633e-01 0.8889341 0.24336852
## 302: 9.642614e-01 0.002083875 9.635896e-01 0.1624422 0.06510158
## 303: 3.368013e-01 0.957603508 3.277914e-01 2.7993875 0.25287825
## 304: 5.437461e-01 0.382639736 5.361939e-01 0.7109478 0.21400002
## shift_level_max shift_level_index shift_var_max shift_var_index
## 1: 69.558469 28 4383.15988 31
## 2: 60.047999 2 3778.85614 3
## 3: 30.255541 63 754.48986 74
## 4: 45.772850 2 2635.01493 66
## 5: 10.828389 31 163.26490 24
## ---
## 300: 16.326673 19 222.56808 22
## 301: 7.347688 19 94.73032 63
## 302: 43.151041 2 1242.07207 10
## 303: 3.424730 2 25.77065 77
## 304: 13.417147 75 256.95876 25
## shift_kl_max shift_kl_index spectral_entropy n_crossing_points
## 1: 3.393246 27 0.8433183 43
## 2: 1.876747 41 0.7526746 36
## 3: 8.890109 73 0.7930121 28
## 4: 2.208410 3 0.7681266 42
## 5: 27.793506 73 0.9907541 36
## ---
## 300: 4.938615 18 0.9970247 41
## 301: 4.581601 27 1.0000000 41
## 302: 1.037124 73 0.4913552 40
## 303: 4.402667 2 0.9636301 43
## 304: 1.721313 9 1.0000000 44
## longest_flat_spot coef_hurst stat_arch_lm
## 1: 4 0.5708798 0.18290707
## 2: 3 0.5575531 0.16104496
## 3: 3 0.8621758 0.45592319
## 4: 2 0.5827218 0.17333973
## 5: 10 0.5547129 0.01436185
## ---
## 300: 3 0.5885009 0.06833493
## 301: 3 0.5080092 0.08241034
## 302: 2 0.5000458 0.41425311
## 303: 4 0.5306369 0.79318205
## 304: 2 0.5011130 0.10578048
library(glue)
##
## Attaching package: 'glue'
## The following object is masked from 'package:dplyr':
##
## collapse
tourism_features %>%
select_at(vars(contains("season"), Purpose)) %>%
mutate(
seasonal_peak_year = seasonal_peak_year +
4*(seasonal_peak_year==0),
seasonal_trough_year = seasonal_trough_year +
4*(seasonal_trough_year==0),
seasonal_peak_year = glue("Q{seasonal_peak_year}"),
seasonal_trough_year = glue("Q{seasonal_trough_year}"),
) %>%
GGally::ggpairs(mapping = aes(colour = Purpose)) +
scale_fill_viridis_d()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
Here, the Purpose variable is mapped to colour. There is a lot of information in this figure, and we will highlight just a few things we can learn.
- The measures of seasonality (seasonal_strength_year, season_acf1 and season_pacf) are all positively correlated.
- The seasonal_peak_year and seasonal_trough_year columns show that seasonal peaks in Business travel occur most often in Quarter 3, and least often in Quarter 1.
It is difficult to explore more than a handful of variables in this way. A useful way to handle many more variables is to use a dimension reduction technique such as principal components. This gives linear combinations of the variables that explain the most variation in the original data. We can compute the principal components of the tourism features as follows.
suppressMessages(library(tidyverse))
suppressMessages(library(tidymodels))
pcs <- tourism_features %>%
select(-State, -Region, -Purpose) %>%
prcomp(scale = TRUE) %>%
augment(tourism_features)
pcs %>%
ggplot(aes(x = .fittedPC1, y = .fittedPC2, col = Purpose)) +
geom_point() +
theme(aspect.ratio = 1)
pcs %>%
ggplot(aes(x = .fittedPC2, y = .fittedPC3, col = Purpose)) +
geom_point() +
theme(aspect.ratio = 1)
outliers <- pcs %>%
filter(.fittedPC1 > 10) %>%
select(Region, State, Purpose)
outliers %>%
left_join(tourism %>%
mutate(Quarter = yearmonth(Quarter)) %>%
group_by(Quarter, Region, State, Purpose) %>%
summarise(Trips = sum(Trips)) %>%
ungroup() %>%
as_tsibble(index = Quarter,
key = c(Region, State, Purpose)), by = c("State", "Region", "Purpose")) %>%
mutate(
Series = glue("{State}", "{Region}", "{Purpose}",
.sep = "\n\n")
) %>%
ggplot(aes(x = Quarter, y = Trips)) +
geom_line() +
facet_grid(Series ~ ., scales = "free") +
labs(title = "Outlying time series in PC space")
## `summarise()` has grouped output by 'Quarter', 'Region', 'State'. You can override using the `.groups` argument.
We can speculate why these series are identified as unusual.
#Max of average Cost
PBS %>% group_by(Concession, Type, ATC1) %>%
summarise(mean_cost = mean(Cost, na.rm = TRUE),
sd_cost = sd(Cost, na.rm = TRUE)) %>%
ungroup() %>%
inner_join (
PBS %>% group_by(Concession, Type, ATC1) %>%
summarise(mean_cost = mean(Cost, na.rm = TRUE),
sd_cost = sd(Cost, na.rm = TRUE)) %>%
ungroup() %>%
filter(mean_cost == max(mean_cost)),
by = c("Concession" = "Concession", "Type" = "Type", "ATC1" = "ATC1")
) %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = mean_cost.x`
#Max of sd cost
PBS %>% group_by(Concession, Type, ATC1) %>%
summarise(mean_cost = mean(Cost, na.rm = TRUE),
sd_cost = sd(Cost, na.rm = TRUE)) %>%
ungroup() %>%
inner_join (
PBS %>% group_by(Concession, Type, ATC1) %>%
summarise(mean_cost = mean(Cost, na.rm = TRUE),
sd_cost = sd(Cost, na.rm = TRUE)) %>%
ungroup() %>%
filter(sd_cost == max(sd_cost, na.rm = TRUE)),
by = c("Concession" = "Concession", "Type" = "Type", "ATC1" = "ATC1")
) %>%
autoplot()
## Plot variable not specified, automatically selected `.vars = mean_cost.x`
tourism %>%
mutate(yearmonth = yearmonth(Quarter)) %>%
filter(Purpose == "Holiday") %>%
group_by(yearmonth, Region, State, Purpose) %>%
summarise(Trips = sum(Trips)) %>%
ungroup() %>%
as_tsibble(index = yearmonth,
key = c(Region, State, Purpose)) %>%
features(Trips, feature_set(pkgs = "feasts")) %>%
select_at(vars(contains("season"), State))
## `summarise()` has grouped output by 'yearmonth', 'Region', 'State'. You can override using the `.groups` argument.
## # A tibble: 76 x 6
## seasonal_strength_~ seasonal_peak_y~ seasonal_trough~ season_acf1 season_pacf
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 0.619 1 2 0.351 0.329
## 2 0.296 2 1 0.208 0.181
## 3 0.832 3 1 0.729 0.495
## 4 0.308 3 0 0.400 0.208
## 5 0.393 1 0 0.460 0.396
## 6 0.876 3 1 0.640 0.467
## 7 0.875 1 3 0.777 0.620
## 8 0.337 1 3 0.0496 0.00825
## 9 0.626 3 1 0.412 0.322
## 10 0.194 2 3 -0.146 -0.177
## # ... with 66 more rows, and 1 more variable: State <chr>