# DO NOT FORGET TO CALL THESE 3 packages FIRST
library(fpp3)       # forecasting package
## Registered S3 method overwritten by 'tsibble':
##   method               from 
##   as_tibble.grouped_df dplyr
## ── Attaching packages ──────────────────────────────────────────── fpp3 1.0.1 ──
## ✔ tibble      3.2.1     ✔ tsibble     1.1.5
## ✔ dplyr       1.1.4     ✔ tsibbledata 0.4.1
## ✔ tidyr       1.3.1     ✔ feasts      0.4.1
## ✔ lubridate   1.9.3     ✔ fable       0.4.0
## ✔ ggplot2     3.5.1
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date()    masks base::date()
## ✖ dplyr::filter()      masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval()  masks lubridate::interval()
## ✖ dplyr::lag()         masks stats::lag()
## ✖ tsibble::setdiff()   masks base::setdiff()
## ✖ tsibble::union()     masks base::union()
library(tidyverse)  # graphs and tidy
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ forcats 1.0.0     ✔ readr   2.1.5
## ✔ purrr   1.0.2     ✔ stringr 1.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()     masks stats::filter()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag()        masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(readxl)     # reading excel data

PART A:

A1: Get tourism data as given below from the link. Then convert the data into tsibble format by using tsibble functions. Save it as my_tourism

# download the data from the web page. 

download.file("http://OTexts.com/fpp3/extrafiles/tourism.xlsx",
              tourism_file <- tempfile()) 

# reads the downloaded tourism data by excel

my_tourism <- readxl::read_excel("C:/Users/vishe/Downloads/tourism.xlsx") 

## Your turn: create a tsibble format of the data below, and rename it as my_tourism:

# A1.Answer: 
my_tourism <- my_tourism %>%
  mutate(Quarter = yearquarter(Quarter)) %>%  
  as_tsibble(index = Quarter, key = c(Region, State, Purpose))  
# View the tsibble
print(my_tourism)
## # A tsibble: 24,320 x 5 [1Q]
## # Key:       Region, State, Purpose [304]
##    Quarter Region   State           Purpose  Trips
##      <qtr> <chr>    <chr>           <chr>    <dbl>
##  1 1998 Q1 Adelaide South Australia Business  135.
##  2 1998 Q2 Adelaide South Australia Business  110.
##  3 1998 Q3 Adelaide South Australia Business  166.
##  4 1998 Q4 Adelaide South Australia Business  127.
##  5 1999 Q1 Adelaide South Australia Business  137.
##  6 1999 Q2 Adelaide South Australia Business  200.
##  7 1999 Q3 Adelaide South Australia Business  169.
##  8 1999 Q4 Adelaide South Australia Business  134.
##  9 2000 Q1 Adelaide South Australia Business  154.
## 10 2000 Q2 Adelaide South Australia Business  169.
## # ℹ 24,310 more rows

A2. To analyze the data, first view my_tourism set, and see the dates of the data.

1. Is it annual or quarterly data? Which year does it start from?

2. Use table(State,Purpose) command to see the cross-table. How many States are there in the data? How many Purpose of trips category?

3. Group the data by Region and Purpose by using group() command.To eliminate time effect, use tibble format. (i.e as_tibble() %>% group_by(Region, Purpose))

4. After grouping the data in #3, use summarize() function to get the average of Trips for each combination, and assign it as Trips( i.e., summarise(Trips = mean(Trips)).

5. Find which region has Purpose had the maximum number of overnight trips on average. ungroup the data to find the result( i.e ungroup() %>% filter(Trips == max(Trips)))

# A2.Answer: 
is_quarterly <- !is.null(as_tsibble(my_tourism)$index)
## Warning: Unknown or uninitialised column: `index`.
start_year <- min(year(as_tsibble(my_tourism)$Quarter))

# part-1 :Check if data is quarterly
if (is_quarterly) {
  print("The data is quarterly.")
} else {
  print("The data is annual.")
}
## [1] "The data is annual."
# Print the starting year
print(paste("The data starts from:", start_year))
## [1] "The data starts from: 1998"
# part-2: Cross-table of 'State' and 'Purpose'
state_purpose_table <- table(my_tourism$State, my_tourism$Purpose)

# Print the cross-table
print(state_purpose_table)
##                     
##                      Business Holiday Other Visiting
##   ACT                      80      80    80       80
##   New South Wales        1040    1040  1040     1040
##   Northern Territory      560     560   560      560
##   Queensland              960     960   960      960
##   South Australia         960     960   960      960
##   Tasmania                400     400   400      400
##   Victoria               1680    1680  1680     1680
##   Western Australia       400     400   400      400
# Count the unique values in 'State' and 'Purpose'
num_states <- n_distinct(my_tourism$State)
num_purposes <- n_distinct(my_tourism$Purpose)

# Print the results
print(paste("There are", num_states, "states in the data."))
## [1] "There are 8 states in the data."
print(paste("There are", num_purposes, "trip purpose categories in the data."))
## [1] "There are 4 trip purpose categories in the data."
#part-3:
# Group the data by 'Region' and 'Purpose'
grouped_data <- my_tourism %>%
  as_tibble() %>%  # Convert to a tibble to remove the tsibble properties (time effect)
  group_by(Region, Purpose)

# View the grouped data
print(grouped_data)
## # A tibble: 24,320 × 5
## # Groups:   Region, Purpose [304]
##    Quarter Region   State           Purpose  Trips
##      <qtr> <chr>    <chr>           <chr>    <dbl>
##  1 1998 Q1 Adelaide South Australia Business  135.
##  2 1998 Q2 Adelaide South Australia Business  110.
##  3 1998 Q3 Adelaide South Australia Business  166.
##  4 1998 Q4 Adelaide South Australia Business  127.
##  5 1999 Q1 Adelaide South Australia Business  137.
##  6 1999 Q2 Adelaide South Australia Business  200.
##  7 1999 Q3 Adelaide South Australia Business  169.
##  8 1999 Q4 Adelaide South Australia Business  134.
##  9 2000 Q1 Adelaide South Australia Business  154.
## 10 2000 Q2 Adelaide South Australia Business  169.
## # ℹ 24,310 more rows
#Part-4:
average_trips <- grouped_data %>%
  summarise(Trips = mean(Trips, na.rm = TRUE))
## `summarise()` has grouped output by 'Region'. You can override using the
## `.groups` argument.
# View the summarized data
print(average_trips)
## # A tibble: 304 × 3
## # Groups:   Region [76]
##    Region         Purpose   Trips
##    <chr>          <chr>     <dbl>
##  1 Adelaide       Business 156.  
##  2 Adelaide       Holiday  157.  
##  3 Adelaide       Other     56.6 
##  4 Adelaide       Visiting 205.  
##  5 Adelaide Hills Business   2.66
##  6 Adelaide Hills Holiday   10.5 
##  7 Adelaide Hills Other      1.40
##  8 Adelaide Hills Visiting  14.2 
##  9 Alice Springs  Business  14.6 
## 10 Alice Springs  Holiday   31.9 
## # ℹ 294 more rows
#part-5:
max_trips <- average_trips %>%
  ungroup() %>%
  filter(Trips == max(Trips, na.rm = TRUE)) %>%
  slice(1)  # Select only the first occurrence if there are ties

# Print the result
print(max_trips)
## # A tibble: 1 × 3
##   Region Purpose  Trips
##   <chr>  <chr>    <dbl>
## 1 Sydney Visiting  747.

A3.Question: Create a new tsibble which combines the Purposes and Regions,and just has total trips by State. (i.e. group_by(State) %>%summarise(Trips = sum(Trips)) ). Then ungroup() the data. Save it as state_tourism.

# A3.Answer: 
state_tourism <- my_tourism %>%
  as_tibble() %>%  # Convert to tibble to simplify manipulation
  group_by(State, Quarter) %>%  # Group by State and Quarter to retain the time index
  summarise(Trips = sum(Trips, na.rm = TRUE), .groups = "drop") %>%  # Sum the trips for each state
  as_tsibble(index = Quarter, key = State)  # Convert back to tsibble with Quarter as the time index

# Print the new tsibble
print(state_tourism)
## # A tsibble: 640 x 3 [1Q]
## # Key:       State [8]
##    State Quarter Trips
##    <chr>   <qtr> <dbl>
##  1 ACT   1998 Q1  551.
##  2 ACT   1998 Q2  416.
##  3 ACT   1998 Q3  436.
##  4 ACT   1998 Q4  450.
##  5 ACT   1999 Q1  379.
##  6 ACT   1999 Q2  558.
##  7 ACT   1999 Q3  449.
##  8 ACT   1999 Q4  595.
##  9 ACT   2000 Q1  600.
## 10 ACT   2000 Q2  557.
## # ℹ 630 more rows

PART B:

B1.Question: Create plots of the following time series. Analyze it visually. What do you see? comment on the below in Answer.

# Bricks from aus_production 
# Lynx from pelt 
# Close from gafa_stock 
# Demand from vic_elec
# B1.Answer: 
# Required libraries
library(fpp3)

# B1: Plotting the Time Series

# 1. Bricks from aus_production
bricks_plot <- aus_production %>%
  autoplot(Bricks) + 
  ggtitle("Australian Brick Production") +
  ylab("Bricks Produced (millions)")

# 2. Lynx from pelt
lynx_plot <- pelt %>%
  autoplot(Lynx) + 
  ggtitle("Annual Lynx Trappings") +
  ylab("Number of Lynx Trapped")

# 3. Close from gafa_stock
gafa_stock_plot <- gafa_stock %>%
  filter(Symbol == "GOOG") %>%  # Example: Plotting Google's stock prices
  autoplot(Close) + 
  ggtitle("Google Stock Closing Prices") +
  ylab("Closing Price (USD)")

# 4. Demand from vic_elec
vic_elec_plot <- vic_elec %>%
  autoplot(Demand) + 
  ggtitle("Electricity Demand in Victoria") +
  ylab("Demand (MW)")

# Display the plots
print(bricks_plot)
## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

print(lynx_plot)

print(gafa_stock_plot)

print(vic_elec_plot)

PART C:

C1. Question: Use my_tourism data you created as tsibble in A1. Filter Region for Snowy_Mountains and save it as snowy.

snowy <- my_tourism %>%
  filter(Region == "Snowy Mountains")

# View the filtered dataset
print(snowy)
## # A tsibble: 320 x 5 [1Q]
## # Key:       Region, State, Purpose [4]
##    Quarter Region          State           Purpose  Trips
##      <qtr> <chr>           <chr>           <chr>    <dbl>
##  1 1998 Q1 Snowy Mountains New South Wales Business 15.9 
##  2 1998 Q2 Snowy Mountains New South Wales Business 20.3 
##  3 1998 Q3 Snowy Mountains New South Wales Business 36.2 
##  4 1998 Q4 Snowy Mountains New South Wales Business  9.15
##  5 1999 Q1 Snowy Mountains New South Wales Business 20.9 
##  6 1999 Q2 Snowy Mountains New South Wales Business 33.3 
##  7 1999 Q3 Snowy Mountains New South Wales Business 21.6 
##  8 1999 Q4 Snowy Mountains New South Wales Business  8.45
##  9 2000 Q1 Snowy Mountains New South Wales Business 20.1 
## 10 2000 Q2 Snowy Mountains New South Wales Business 15.2 
## # ℹ 310 more rows

C2. Question: Use autoplot(), gg_season() and gg_subseries() to explore the snowy data. What do you observe? What type of pattern do you see. Write your comment on Answer below:

Question: Take snowy data. Then sums up all trips in State and Purpose by each quarter every year by using summarizer() commands. Then Use autoplot(), gg_season() and gg_subseries() to explore the quarterly trips of snowy data. What do you observe? What type of pattern do you see. Write your comment on Answer below:

# C2.Answer: 
# 1. autoplot(): Plot the total trips over time for the "Snowy Mountains"
autoplot(snowy, Trips) +
  ggtitle("Snowy Mountains - Total Trips Over Time") +
  ylab("Number of Trips")

# 2. gg_season(): Visualizing seasonal patterns in the data
gg_season(snowy, Trips) +
  ggtitle("Seasonal Plot of Trips in Snowy Mountains") +
  ylab("Number of Trips")

# 3. gg_subseries(): Exploring subseries patterns across different quarters
gg_subseries(snowy, Trips) +
  ggtitle("Subseries Plot of Trips in Snowy Mountains") +
    ylab("Number of Trips")

observation: The autoplot() reveals strong seasonal patterns, with regular peaks likely in winter. The gg_season() plot confirms that trips spike in Q1, hinting at winter sports activity. The gg_subseries() further highlights consistent peaks within specific quarters, reinforcing the seasonal tourism trend.

PART D:

D1.Question: Use these two functions 1) gg_lag 2) ACF to explore the following time series: i)Bricks from aus_production ii)Lynx from pelt iii) Victorian Electricity Demand from aus_elec. Write your comments about each graphs you created

# D1.Answer:
# gg_lag for Bricks
gg_lag(aus_production, Bricks, geom = "point") +
  ggtitle("Lag Plot for Australian Brick Production")
## Warning: Removed 20 rows containing missing values (gg_lag).

# ACF for Bricks
aus_production %>%
  ACF(Bricks) %>%
  autoplot() +
  ggtitle("ACF for Australian Brick Production")

# D1.2: Lynx from pelt
# gg_lag for Lynx
gg_lag(pelt, Lynx, geom = "point") +
  ggtitle("Lag Plot for Lynx Trappings")

# ACF for Lynx
pelt %>%
  ACF(Lynx) %>%
  autoplot() +
  ggtitle("ACF for Lynx Trappings")

# D1.3: Victorian Electricity Demand from Electricity Demand
# gg_lag for Demand
# gg_lag for Demand
gg_lag(vic_elec, Demand, geom = "point") +
  ggtitle("Lag Plot for Victorian Electricity Demand")

# ACF for Demand
vic_elec %>%
  ACF(Demand) %>%
  autoplot() +
  ggtitle("ACF for Victorian Electricity Demand")

observation : Observations for D1:

Bricks:
- gg_lag(): Indicates strong seasonal correlation. - ACF(): Spikes every 4 quarters confirm seasonality and persistence.

Lynx:
- gg_lag(): Shows a cyclic pattern, matching ecological cycles. - ACF(): Peaks every 10 years indicate cyclic behavior.

Electricity Demand:
- gg_lag(): Highlights daily and weekly patterns. - ACF(): Peaks at 24-hour intervals confirm daily seasonality.

D2.Question: after using these functions, Can you spot any seasonality, cyclicity and trend? What do you learn about the series? Write your comments below for each series:

i) Bricks

D2.Answer:

The lag plots (gg_lag()) and ACF plots for the Bricks data reveal: Seasonality: A clear repeating pattern at regular intervals, with significant peaks in the ACF plot every 4 quarters, highlighting a strong yearly seasonal component. Cyclicity: No indication of long-term cycles beyond the yearly seasonality, aligning with typical annual production patterns. Trend: The autoplot() shows some variations, but there’s no evidence of a sustained upward or downward trend. Production appears stable, mainly fluctuating due to seasonal changes.

ii) Lynx from pelt:

D2.Answer:

Seasonality: Since the data is annual, the lag plot doesnt show a regular seasonal pattern typical of quarterly or monthly series. However, the ACF plot reveals periodic peaks, indicating some cyclic behavior, though not in a simple yearly cycle. Cyclicity: The ACF plot shows strong peaks roughly every 10 years, suggesting a cyclic pattern. This aligns with ecological data, where animal populations follow multi-year cycles due to factors like predator-prey dynamics. Trend: There’s no clear long-term trend in the Lynx data. The cyclic peaks remain fairly stable, indicating a repeating ecological cycle.

iii) Electricty

D2.Answer:

’’’ Seasonality: The lag plot reveals a clear daily pattern with consistent peaks, indicating strong daily seasonality. The ACF plot confirms this with spikes at 24-hour intervals.

Cyclicity: In addition to daily seasonality, the ACF plot hints at weekly cycles, likely reflecting higher demand on certain days, such as weekdays.

Trend: The autoplot() suggests a potential upward trend over time, possibly driven by factors like population growth and changes in usage habits. ’’’

PART E: See the data below for Google Stock price from the gafa_stock data set.

E1: Calculate the first difference of the “goog” series.

E1: Calculate the first difference of the Google stock price series

goog <- gafa_stock %>% filter(Symbol == “GOOG”, year(Date) >= 2018) # Filter for Google and dates from 2018 onwards # Calculate the first difference dgoog <- goog %>% mutate(trading_day = row_number()) %>% # Create a sequential trading day index update_tsibble(index = trading_day, regular = TRUE) %>% # Update the tsibble with the new index mutate(diff = difference(Close)) # Calculate the first difference of the “Close” price # Display the first few rows of the result print(dgoog)

E2. Question: Does “diff” (difference of the series) look like white noise? Recall the definition of white noise. Recall what function do you use to check if a series is white noise or not. Use the necessary graph that shows if a series is white noise? Comment based on the graph.

E2.Answer:

E2: Visualizing and Testing if “diff” is White Noise

Plot the “diff” series

autoplot(dgoog, diff) + ggtitle(“First Difference of Google Stock Price (Close)”) + ylab(“Daily Change in Stock Price”)

ACF plot to check for autocorrelation

dgoog %>% ACF(diff) %>% autoplot() + ggtitle(“ACF of First Difference (diff) of Google Stock Price”)

Perform Ljung-Box test to check for white noise

ljung_box_test <- dgoog %>% features(diff, ljung_box, lag = 10)

Display the result of the Ljung-Box test

print(ljung_box_test)

Visual Inspection: If “diff” fluctuates randomly around a constant mean (zero) without trends or patterns, it may indicate white noise.

ACF Plot: Autocorrelation values within the 95% confidence interval suggest no significant autocorrelation, supporting the white noise hypothesis.

Ljung-Box Test: A high p-value (over 0.05) indicates no significant autocorrelation, suggesting the series is white noise.