library(tsibble)
## Warning: package 'tsibble' was built under R version 4.3.3
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
##
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
##
## intersect, setdiff, union
library(fpp3)
## Warning: package 'fpp3' was built under R version 4.3.3
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.0 ──
## ✔ tibble 3.2.1 ✔ tsibbledata 0.4.1
## ✔ dplyr 1.1.4 ✔ feasts 0.3.2
## ✔ tidyr 1.3.1 ✔ fable 0.3.4
## ✔ lubridate 1.9.3 ✔ fabletools 0.4.2
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ lubridate::interval() masks tsibble::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
Explore the following four time series: Bricks from aus_production, Lynx from pelt, Close from gafa_stock, Demand from vic_elec.
# Keep only the Bricks measurement; the printed preview shows that the
# Quarter index column is retained alongside it.
bricks_data <- select(aus_production, Bricks)
bricks_data |>
  head() |>
  print()
## # A tsibble: 6 x 2 [1Q]
## Bricks Quarter
## <dbl> <qtr>
## 1 189 1956 Q1
## 2 204 1956 Q2
## 3 208 1956 Q3
## 4 197 1956 Q4
## 5 187 1957 Q1
## 6 214 1957 Q2
The time interval for the Bricks series is quarters.
# Time plot of the Bricks series.
bricks_data |>
  autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Keep only the Lynx measurement from pelt and preview the first rows.
lynx_data <- select(pelt, Lynx)
lynx_data |>
  head() |>
  print()
## # A tsibble: 6 x 2 [1Y]
## Lynx Year
## <dbl> <dbl>
## 1 30090 1845
## 2 45150 1846
## 3 49150 1847
## 4 39520 1848
## 5 21230 1849
## 6 8420 1850
The time interval for the Lynx series is years.
# Time plot of the Lynx series.
lynx_data |>
  autoplot(Lynx)
# Keep only the Close measurement from gafa_stock; the printed preview shows
# that the Date index and the Symbol key stay attached.
close_data <- select(gafa_stock, Close)
close_data |>
  head() |>
  print()
## # A tsibble: 6 x 3 [!]
## # Key: Symbol [1]
## Close Date Symbol
## <dbl> <date> <chr>
## 1 79.0 2014-01-02 AAPL
## 2 77.3 2014-01-03 AAPL
## 3 77.7 2014-01-06 AAPL
## 4 77.1 2014-01-07 AAPL
## 5 77.6 2014-01-08 AAPL
## 6 76.6 2014-01-09 AAPL
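The time interval for the Close series is days, but only trading days are recorded (weekends are skipped), so the tsibble reports an irregular interval (`[!]`).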
# Time plot of the Close series.
close_data |>
  autoplot(Close)
# Keep only the Demand measurement from vic_elec and preview the first rows.
demand_data <- select(vic_elec, Demand)
demand_data |>
  head() |>
  print()
## # A tsibble: 6 x 2 [30m] <Australia/Melbourne>
## Demand Time
## <dbl> <dttm>
## 1 4383. 2012-01-01 00:00:00
## 2 4263. 2012-01-01 00:30:00
## 3 4049. 2012-01-01 01:00:00
## 4 3878. 2012-01-01 01:30:00
## 5 4036. 2012-01-01 02:00:00
## 6 3866. 2012-01-01 02:30:00
The time interval for the Demand series is half-hours.
# Time plot of the half-hourly Demand series with descriptive labels.
# The y-axis label previously lacked its closing parenthesis.
autoplot(demand_data, Demand) +
  labs(
    x = "Date",
    y = "Total Demand (MWh)",
    title = "Victoria, Australia Electricity Demand"
  )
Use filter() to find what days corresponded to the peak closing price for each of the four stocks in gafa_stock.
# Within each stock symbol, keep the row(s) whose Close equals that
# symbol's maximum Close, then print the result.
close_data |>
  group_by(Symbol) |>
  filter(Close == max(Close)) |>
  print()
## # A tsibble: 4 x 3 [!]
## # Key: Symbol [4]
## # Groups: Symbol [4]
## Close Date Symbol
## <dbl> <date> <chr>
## 1 232. 2018-10-03 AAPL
## 2 2040. 2018-09-04 AMZN
## 3 218. 2018-07-25 FB
## 4 1268. 2018-07-26 GOOG
The peak closing price for each stock is shown in the table above.
Convert the data to time series
# Load tute1.csv (path is relative to the working directory)
library(readr)
tute1 <- "tute1.csv" |>
  read_csv()
## Rows: 100 Columns: 4
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## dbl (3): Sales, AdBudget, GDP
## date (1): Quarter
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Code provided by the textbook (written here as nested calls):
# convert Quarter to a yearquarter value and use it as the tsibble index.
mytimeseries <- as_tsibble(
  mutate(tute1, Quarter = yearquarter(Quarter)),
  index = Quarter
)
Construct time series plots of each of the three series. Check what happens when you don’t include facet_grid().
# Code provided by the textbook (written here as nested calls):
# reshape every non-index column to long form, then draw one facet row per
# series with an independent y scale.
ggplot(
  pivot_longer(mytimeseries, -Quarter),
  aes(x = Quarter, y = value, colour = name)
) +
  geom_line() +
  facet_grid(name ~ ., scales = "free_y")
Now, without including facet_grid:
# Same long-form plot, but with all series sharing a single panel and scale.
ggplot(
  pivot_longer(mytimeseries, -Quarter),
  aes(x = Quarter, y = value, colour = name)
) +
  geom_line()
Without facet_grid, all three variables are plotted on the same grid with a single scale that applies to all three. This could be useful to compare the same quantity in multiple contexts (for example, natural gas consumption in different states across the same time period. Spoilers!) With facet_grid, each numeric variable gets its own graph with its own scale.
Create a tsibble from us_total with year as the index and state as the key.
library(USgas)
# Build a tsibble with `state` as the key and `year` as the index,
# then preview the first rows.
gas_tsibble <- as_tsibble(us_total, key = state, index = year)
gas_tsibble |>
  head() |>
  print()
## # A tsibble: 6 x 3 [1Y]
## # Key: state [1]
## year state y
## <int> <chr> <int>
## 1 1997 Alabama 324158
## 2 1998 Alabama 329134
## 3 1999 Alabama 337270
## 4 2000 Alabama 353614
## 5 2001 Alabama 332693
## 6 2002 Alabama 379343
Plot the annual natural gas consumption by state for the New England area (comprising the states of Maine, Vermont, New Hampshire, Massachusetts, Connecticut and Rhode Island).
# The six New England states named in the exercise.
ne_states <- c(
  "Maine", "Vermont", "New Hampshire",
  "Massachusetts", "Connecticut", "Rhode Island"
)
# Restrict the gas data to those states and plot the `y` column.
ne_gas <- filter(gas_tsibble, state %in% ne_states)
ne_gas |>
  autoplot(y)
Download tourism.xlsx from the book website and read it into R using readxl::read_excel().
library(readxl)
# Read tourism.xlsx (path is relative to the working directory) and preview
# the first rows.
tourism <- "tourism.xlsx" |>
  read_excel()
tourism |>
  head() |>
  print()
## # A tibble: 6 × 5
## Quarter Region State Purpose Trips
## <chr> <chr> <chr> <chr> <dbl>
## 1 1998-01-01 Adelaide South Australia Business 135.
## 2 1998-04-01 Adelaide South Australia Business 110.
## 3 1998-07-01 Adelaide South Australia Business 166.
## 4 1998-10-01 Adelaide South Australia Business 127.
## 5 1999-01-01 Adelaide South Australia Business 137.
## 6 1999-04-01 Adelaide South Australia Business 200.
Create a tsibble which is identical to the tourism tsibble from the tsibble package.
# Quarter is read in as character, so convert it to a yearquarter value first,
# then declare the key (Region, State, Purpose) and the index (Quarter).
tourism_tsibble <- as_tsibble(
  mutate(tourism, Quarter = yearquarter(Quarter)),
  key = c(Region, State, Purpose),
  index = Quarter
)
tourism_tsibble |>
  head()
## # A tsibble: 6 x 5 [1Q]
## # Key: Region, State, Purpose [1]
## Quarter Region State Purpose Trips
## <qtr> <chr> <chr> <chr> <dbl>
## 1 1998 Q1 Adelaide South Australia Business 135.
## 2 1998 Q2 Adelaide South Australia Business 110.
## 3 1998 Q3 Adelaide South Australia Business 166.
## 4 1998 Q4 Adelaide South Australia Business 127.
## 5 1999 Q1 Adelaide South Australia Business 137.
## 6 1999 Q2 Adelaide South Australia Business 200.
Find what combination of Region and Purpose had the maximum number of overnight trips on average.
This question seems ambiguous to me; I can’t tell if it’s asking for:
1. The values of Region and Purpose for the row with the maximum possible Trips value; or
2. The values of Region and Purpose that have the highest average value across the data set.
So, I did it both ways!
# Interpretation 1: the row(s) whose Trips value equals the overall maximum.
max_combo_1 <- filter(tourism_tsibble, Trips == max(Trips))
max_combo_1 |>
  print()
## # A tsibble: 1 x 5 [1Q]
## # Key: Region, State, Purpose [1]
## Quarter Region State Purpose Trips
## <qtr> <chr> <chr> <chr> <dbl>
## 1 2017 Q4 Melbourne Victoria Visiting 985.
In interpretation 1, Visiting Melbourne had the maximum number of overnight trips (985.2784 in Q4 of 2017).
# Interpretation 2: average Trips for every Region/Purpose pair, sorted from
# largest to smallest.
# This starts from the plain `tourism` tibble; summarising the tsibble by
# group keeps one row per Quarter (as the total-by-State result further down
# shows), which is not wanted here.
# summarise() already yields one row per Region/Purpose pair containing only
# Region, Purpose and avg_trips, so no select()/distinct() step is needed.
max_combo_2 <- tourism |>
  group_by(Region, Purpose) |>
  summarise(avg_trips = mean(Trips)) |>
  arrange(desc(avg_trips))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
# Show only the top-ranked Region/Purpose pair.
max_combo_2 |>
  head(1) |>
  print()
## # A tibble: 1 × 3
## # Groups: Region [1]
## Region Purpose avg_trips
## <chr> <chr> <dbl>
## 1 Sydney Visiting 747.
In interpretation 2, the combination with the highest average value in the Trips column across the data set was Visiting Sydney.
Create a new tsibble which combines the Purposes and Regions, and just has total trips by State.
# Sum Trips over every Region and Purpose within each State. Because the input
# is a tsibble, the printed result has one row per State and Quarter.
total_by_state <- summarise(
  group_by(tourism_tsibble, State),
  total_trips = sum(Trips)
)
total_by_state |>
  head() |>
  print()
## # A tsibble: 6 x 3 [1Q]
## # Key: State [1]
## State Quarter total_trips
## <chr> <qtr> <dbl>
## 1 ACT 1998 Q1 551.
## 2 ACT 1998 Q2 416.
## 3 ACT 1998 Q3 436.
## 4 ACT 1998 Q4 450.
## 5 ACT 1999 Q1 379.
## 6 ACT 1999 Q2 558.
Use the following graphics functions: autoplot(), gg_season(), gg_subseries(), gg_lag(), ACF() and explore features from the following time series: “Total Private” Employed from us_employment, Bricks from aus_production, Hare from pelt, “H02” Cost from PBS, and Barrels from us_gasoline.
# "Total Private" employment series from us_employment.
priv_employed_data <- us_employment |>
  filter(Title == "Total Private")
# Pass the measured variable (Employed) explicitly, as is done for the other
# series in this analysis, so the plotting functions do not have to guess it
# and no "Plot variable not specified" messages are emitted.
autoplot(priv_employed_data, Employed)
gg_season(priv_employed_data, Employed)
gg_subseries(priv_employed_data, Employed)
gg_lag(priv_employed_data, Employed)
ACF(priv_employed_data, Employed) |> autoplot()
There is no seasonality or cyclicity evident in the data, but the overall trend is up over time. From the gg_season plot we can see that there is no particular seasonal pattern (as the line for each year is close to flat), but both the original line plot and the subseries plot show the dramatic impact that the financial crisis of 2008 had on payrolls. I’m not sure how useful this data is, since the number of employed persons will almost certainly increase as the population increases over time. I wonder if this analysis would be more meaningful if it used the labor force participation rate, or unemployment rate, or some other rate stat that accounted for the growth in population.
# Bricks from aus_production: time, seasonal, subseries, lag and ACF plots.
# The warnings below report rows with missing Bricks values being dropped.
bricks_data <- select(aus_production, Bricks)
bricks_data |> autoplot(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
bricks_data |> gg_season(Bricks)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
bricks_data |> gg_subseries(Bricks)
## Warning: Removed 5 rows containing missing values or values outside the scale range
## (`geom_line()`).
bricks_data |> gg_lag(Bricks)
## Warning: Removed 20 rows containing missing values (gg_lag).
autoplot(ACF(bricks_data, Bricks))
This data shows a general trend upward, followed by a general trend downward. There are different cyclical patterns evident: in some years there is almost always a Q2 increase, and this is sometimes followed by a Q3 increase and sometimes followed by a Q3 decrease. Four specific major outlier drops are visible in the data.
# Hare from pelt: time, subseries, lag and ACF plots.
hare_data <- select(pelt, Hare)
hare_data |> autoplot(Hare)
hare_data |> gg_subseries(Hare)
hare_data |> gg_lag(Hare)
autoplot(ACF(hare_data, Hare))
The overall trend in this data is roughly flat. We do see a cyclical nature to it, but it is quite a long cycle, appearing to be about a decade. There are a couple of unusually high peaks.
# Cost for the rows of PBS whose ATC2 code is "H02": time, seasonal,
# subseries and ACF plots.
ho2_data <- select(filter(PBS, ATC2 == "H02"), Cost)
ho2_data |> autoplot(Cost)
ho2_data |> gg_season(Cost)
ho2_data |> gg_subseries(Cost)
autoplot(ACF(ho2_data, Cost))
This data contains four separate time series. The Concessional/Co-payments series shows a clear overall trend upward, with some evidence of seasonal variation (generally, higher in the summer months). The General/Co-payments series is largely flat with no clear trend or seasonality/cyclicity. The Concessional/Safety net series has a clear overall trend upward with an even clearer seasonal pattern (starting high in January before dropping very low from February-April and then trending back up again until the following January). The General/Safety net series also shows that seasonal pattern, but the overall trend there is flat (with a few noticeable outliers at the beginning of the data set).
# Barrels from us_gasoline: time, seasonal, subseries, lag and ACF plots.
barrels_data <- us_gasoline
barrels_data |> autoplot(Barrels)
barrels_data |> gg_season(Barrels)
barrels_data |> gg_subseries(Barrels)
barrels_data |> gg_lag(Barrels)
autoplot(ACF(barrels_data, Barrels))
This series shows a trend upward until around 2006, followed by a trend downward until about 2011, followed by another trend upward. There is something of a seasonal pattern evident, with most years higher in the summer months than the winter months, but it is not nearly as dramatic of a seasonal pattern as there was in the two “safety net” series from the PBS data. There are a couple of visible outlier years, but the data fluctuates a fair amount which makes the outliers look less extreme.