# DO NOT FORGET TO CALL THESE 3 packages FIRST
library(fpp3) # forecasting package
## Warning in system("timedatectl", intern = TRUE): running command 'timedatectl'
## had status 1
## Registered S3 method overwritten by 'tsibble':
## method from
## as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble 3.2.1 ✔ tsibble 1.1.6
## ✔ dplyr 1.1.0 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.3.0 ✔ feasts 0.4.1
## ✔ lubridate 1.9.2 ✔ fable 0.4.1
## ✔ ggplot2 3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(tidyverse) # graphs and tidy
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0 ✔ readr 2.1.4
## ✔ purrr 1.0.1 ✔ stringr 1.5.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl) # reading excel data
# download the data from the web page.
download.file("http://OTexts.com/fpp3/extrafiles/tourism.xlsx",
tourism_file <- tempfile())
# reads the downloaded tourism data by excel
my_tourism <- readxl::read_excel(tourism_file)
## Your turn: create a tsibble format of the data below, and rename it as my_tourism:
# A1.Answer:
my_tourism <- my_tourism %>%
mutate(Quarter = yearquarter(Quarter)) %>%
as_tsibble(index = Quarter, key = c(Region, State, Purpose))
# Check data structure
glimpse(my_tourism)
## Rows: 24,320
## Columns: 5
## Key: Region, State, Purpose [304]
## $ Quarter <qtr> 1998 Q1, 1998 Q2, 1998 Q3, 1998 Q4, 1999 Q1, 1999 Q2, 1999 Q3,…
## $ Region <chr> "Adelaide", "Adelaide", "Adelaide", "Adelaide", "Adelaide", "A…
## $ State <chr> "South Australia", "South Australia", "South Australia", "Sout…
## $ Purpose <chr> "Business", "Business", "Business", "Business", "Business", "B…
## $ Trips <dbl> 135.0777, 109.9873, 166.0347, 127.1605, 137.4485, 199.9126, 16…
# A2.Answer:
# 1. Check if it's annual or quarterly
start_date <- min(my_tourism$Quarter)
end_date <- max(my_tourism$Quarter)
# Answer: This is quarterly data starting from 1998 Q1 to 2017 Q4.
# 2. Cross-table of State & Purpose
table(my_tourism$State, my_tourism$Purpose)
##
## Business Holiday Other Visiting
## ACT 80 80 80 80
## New South Wales 1040 1040 1040 1040
## Northern Territory 560 560 560 560
## Queensland 960 960 960 960
## South Australia 960 960 960 960
## Tasmania 400 400 400 400
## Victoria 1680 1680 1680 1680
## Western Australia 400 400 400 400
# There are 8 states and 4 trip purposes.
# 3. Group by Region & Purpose (in tibble format)
grouped <- my_tourism %>%
as_tibble() %>%
group_by(Region, Purpose)
# 4. Average trips per Region-Purpose combo
summary_table <- grouped %>%
summarise(Trips = mean(Trips, na.rm = TRUE), .groups = "drop")
# 5. Find Region-Purpose with max average trips
max_trips <- summary_table %>%
ungroup() %>%
filter(Trips == max(Trips))
max_trips
## # A tibble: 1 × 3
## Region Purpose Trips
## <chr> <chr> <dbl>
## 1 Sydney Visiting 747.
# A3.Answer:
state_tourism <- my_tourism %>%
group_by(State) %>%
summarise(Trips = sum(Trips, na.rm = TRUE), .groups = "drop")
# View the result to check
print(state_tourism)
## # A tsibble: 640 x 3 [1Q]
## # Key: State [8]
## State Quarter Trips
## <chr> <qtr> <dbl>
## 1 ACT 1998 Q1 551.
## 2 ACT 1998 Q2 416.
## 3 ACT 1998 Q3 436.
## 4 ACT 1998 Q4 450.
## 5 ACT 1999 Q1 379.
## 6 ACT 1999 Q2 558.
## 7 ACT 1999 Q3 449.
## 8 ACT 1999 Q4 595.
## 9 ACT 2000 Q1 600.
## 10 ACT 2000 Q2 557.
## # … with 630 more rows
# Bricks from aus_production
# Lynx from pelt
# Close from gafa_stock
# Demand from vic_elec
# Bricks from aus_production
autoplot(aus_production, Bricks) +
ggtitle("Bricks Production in Australia") +
ylab("Bricks (Millions)")
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).
# Lynx from pelt
autoplot(pelt, Lynx) +
ggtitle("Lynx Trappings in Canada") +
ylab("Number of Lynx")
# Close from gafa_stock
autoplot(gafa_stock %>% filter(Symbol == "GOOG"), Close) +
ggtitle("Google Stock Price (Close)") +
ylab("Price (USD)")
# Demand from vic_elec
autoplot(vic_elec, Demand) +
ggtitle("Electricity Demand in Victoria") +
ylab("Demand (MW)")
# B1.Answer:
### 1. Bricks Production in Australia
# 1. The plot shows a strong upward trend from the 1950s to around 1980.
# 2. After 1980, production becomes more volatile with sharp dips and peaks.
### 2. Lynx Trappings in Canada
# 1. Clear cyclic pattern with peaks occurring roughly every 10 years.
# 2. The trapping counts swing dramatically between low and high values.
# 3. No obvious long-term upward or downward trend — the data fluctuates around similar levels over time.
### 3. Google Stock Price (Close)
# 1. Clear upward trend from 2014 to 2018, with some periods of short-term fluctuations.
# 2. A sharp increase after 2016 likely reflects major market events.
# 3. The x-axis label contains a minor formatting issue ("Date [!]").
### 4. Electricity Demand in Victoria:
# 1. The demand data fluctuates heavily, with frequent sharp spikes.
# 2. Seasonal cycles are visible, with peaks and troughs repeating regularly.
# 3. The plot is very dense, making it difficult to observe smaller trends directly.
snowy <- my_tourism %>%
filter(Region == "Snowy Mountains")
head(snowy)
## # A tsibble: 6 x 5 [1Q]
## # Key: Region, State, Purpose [1]
## Quarter Region State Purpose Trips
## <qtr> <chr> <chr> <chr> <dbl>
## 1 1998 Q1 Snowy Mountains New South Wales Business 15.9
## 2 1998 Q2 Snowy Mountains New South Wales Business 20.3
## 3 1998 Q3 Snowy Mountains New South Wales Business 36.2
## 4 1998 Q4 Snowy Mountains New South Wales Business 9.15
## 5 1999 Q1 Snowy Mountains New South Wales Business 20.9
## 6 1999 Q2 Snowy Mountains New South Wales Business 33.3
Question: Take snowy data. Then sums up all trips in State and Purpose by each quarter every year by using summarizer() commands. Then Use autoplot(), gg_season() and gg_subseries() to explore the quarterly trips of snowy data. What do you observe? What type of pattern do you see. Write your comment on Answer below:
# Visualize the trips directly for Snowy Mountains
autoplot(snowy, Trips) +
ggtitle("Snowy Mountains - Quarterly Trips") +
ylab("Number of Trips")
# Seasonal plot (compares same quarter across years)
gg_season(snowy, Trips) +
ggtitle("Seasonal Plot: Snowy Mountains - Trips by Quarter")
# Subseries plot (each quarter shown in its own time series)
gg_subseries(snowy, Trips) +
ggtitle("Subseries Plot: Snowy Mountains - Trips by Quarter")
# Now, aggregate trips by Quarter (summed across State and Purpose)
snowy_quarterly <- snowy %>%
index_by(Quarter) %>% # Group by time index (Quarter)
summarise(Trips = sum(Trips, na.rm = TRUE)) # Sum trips across states and purposes
# Visualize the aggregated data
autoplot(snowy_quarterly, Trips) +
ggtitle("Total Quarterly Trips - Snowy Mountains")
gg_season(snowy_quarterly, Trips) +
ggtitle("Seasonal Plot: Total Quarterly Trips - Snowy Mountains")
gg_subseries(snowy_quarterly, Trips) +
ggtitle("Subseries Plot: Total Quarterly Trips - Snowy Mountains")
# C2.Answer:
cat("
Observations:
- Clear seasonal pattern: Trips peak in Quarter 3, likely due to the skiing season.
- There is some variation across years, but the seasonal pattern holds consistently.
- Over time, total trips appear relatively stable, but Q3 consistently stands out as the peak quarter.
- This reflects strong seasonality, driven by the popularity of the Snowy Mountains during winter months (ski tourism).
")
##
## Observations:
## - Clear seasonal pattern: Trips peak in Quarter 3, likely due to the skiing season.
## - There is some variation across years, but the seasonal pattern holds consistently.
## - Over time, total trips appear relatively stable, but Q3 consistently stands out as the peak quarter.
## - This reflects strong seasonality, driven by the popularity of the Snowy Mountains during winter months (ski tourism).
# D1.Answer:
# i) Bricks from aus_production
gg_lag(aus_production, Bricks) +
ggtitle("Lag Plot: Bricks Production (Australia)")
## Warning: Removed 20 rows containing missing values (gg_lag).
ACF(aus_production, Bricks) %>%
autoplot() +
ggtitle("ACF: Bricks Production (Australia)")
# ii) Lynx from pelt
gg_lag(pelt, Lynx) +
ggtitle("Lag Plot: Lynx Trappings (Canada)")
ACF(pelt, Lynx) %>%
autoplot() +
ggtitle("ACF: Lynx Trappings (Canada)")
# iii) Victorian Electricity Demand from aus_elec
gg_lag(vic_elec, Demand) +
ggtitle("Lag Plot: Victorian Electricity Demand")
ACF(vic_elec, Demand) %>%
autoplot() +
ggtitle("ACF: Victorian Electricity Demand")
# My comments:
# Bricks Production: Shows strong quarterly seasonality with high correlation between adjacent quarters, indicating predictable production cycles across years.
#Lynx Trappings: Displays clear multi-year cycles, reflecting population booms and crashes, which is common in ecological predator-prey systems.
#Victorian Electricity Demand: Exhibits daily seasonality, with strong correlation at short lags (half-hourly intervals), influenced by regular human activity patterns like work hours and evening peaks.
# D2.Answer:
# Seasonality: Strong quarterly seasonality with repeating production cycles every year.
# Trend: Slight upward trend over time indicating gradual increase in production.
# Cyclicity: No clear long-term cyclicity; behavior is mainly seasonal.
# Learning: Bricks production follows regular seasonal construction patterns driven by weather and economic activity.
# D2.Answer:
# Seasonality: No regular annual seasonality, but long-term cycles are visible.
# Trend: No steady trend; population rises and falls repeatedly.
# Cyclicity: Strong multi-year cyclic pattern (around 10 years), characteristic of predator-prey population cycles.
# Learning: Lynx trappings reflect ecological population dynamics rather than seasonal economic factors.
# D2.Answer:
# Seasonality: Strong daily seasonality, repeating every 24 hours (48 half-hour intervals).
# Trend: Slight upward trend over time due to population growth and increased demand.
# Cyclicity: No strong irregular cycles — the main pattern is daily seasonality.
# Learning: Electricity demand is tightly linked to daily human activities, with clear peaks during waking and working hours.
goog <- gafa_stock %>%
filter(Symbol == "GOOG", year(Date) >= 2018)
# E1. Answer:
dgoog = goog %>% # get google daily data(>2018)
mutate(trading_day = row_number()) %>% #missing dates, create rownumber()-trading days!
update_tsibble(index = trading_day, regular = TRUE) %>% #update tsibble() with new index.
mutate(diff = difference(Close)) #calculates the first difference of a series with difference() command. it calculates the daily changes in the stock price.
# Check the result
head(dgoog)
## # A tsibble: 6 x 10 [1]
## # Key: Symbol [1]
## Symbol Date Open High Low Close Adj_Close Volume trading_day diff
## <chr> <date> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <int> <dbl>
## 1 GOOG 2018-01-02 1048. 1067. 1045. 1065 1065 1237600 1 NA
## 2 GOOG 2018-01-03 1064. 1086. 1063. 1082. 1082. 1430200 2 17.5
## 3 GOOG 2018-01-04 1088 1094. 1084. 1086. 1086. 1004600 3 3.92
## 4 GOOG 2018-01-05 1094 1104. 1092 1102. 1102. 1279100 4 15.8
## 5 GOOG 2018-01-08 1102. 1111. 1102. 1107. 1107. 1047600 5 4.71
## 6 GOOG 2018-01-09 1109. 1111. 1101. 1106. 1106. 902500 6 -0.680
autoplot(dgoog, diff) + ggtitle(“First Difference of Google Stock Price (GOOG)”) + ylab(“Daily Change in Close Price”)
ACF(dgoog, diff) %>% autoplot() + ggtitle(“ACF of First Difference (GOOG)”)
# E2.Answer:
#### The first plot shows the daily changes in the closing price of Google stock.There is no clear trend or seasonality visible — the data fluctuates around zero. The variance (spread of the values) seems to be somewhat stable over time, though there may be occasional larger spikes. This behavior is consistent with a potential white noise process, though this is not enough evidence alone.
#### In the ACF plot, most of the autocorrelations are close to zero, and they stay within the blue significance bands. This suggests that the first-differenced series does not exhibit strong autocorrelation at any lag, which is another characteristic of white noise. There are a few spikes slightly outside the bands, but they are minor and could be attributed to randomness.