DATA 624 HW 2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.4.3

## Warning: package 'ggplot2' was built under R version 4.4.3

## Warning: package 'dplyr' was built under R version 4.4.3

library(openintro)

## Warning: package 'openintro' was built under R version 4.4.3

library(fpp3)

## Warning: package 'fpp3' was built under R version 4.4.3

## Warning: package 'tsibble' was built under R version 4.4.3

## Warning: package 'tsibbledata' was built under R version 4.4.3

## Warning: package 'feasts' was built under R version 4.4.3

## Warning: package 'fabletools' was built under R version 4.4.3

## Warning: package 'fable' was built under R version 4.4.3

library(ggplot2)
library(stringr)

Exercise 3.1

# -------------------------------------------------
# 1. Create GDP per capita variable
# -------------------------------------------------


# Create GDP per capita
gdp_pc_data <- global_economy |>
  mutate(GDP_pc = GDP / Population)

gdp_pc_data |>
  ggplot(aes(x = Year, y = GDP_pc, group = Country, color = GDP_pc)) +
  geom_line(alpha = 0.6, linewidth = 0.6) +
  scale_color_viridis_c(option = "plasma") +
  labs(title = "GDP per Capita Over Time by Country",
       x = "Year",
       y = "GDP per Capita",
       color = "GDP per Capita") +
  theme_minimal(base_size = 14)

## Warning: Removed 3242 rows containing missing values or values outside the scale range
## (`geom_line()`).

# -------------------------------------------------
# 2. Plot GDP per capita for ALL countries
# -------------------------------------------------
gdp_pc_data |>
  ggplot(aes(x = Year, y = GDP_pc, group = Country)) +
  geom_line(alpha = 0.25) +
  labs(title = "GDP per Capita Over Time by Country",
       x = "Year",
       y = "GDP per Capita") +
  theme_minimal()

## Warning: Removed 3242 rows containing missing values or values outside the scale range
## (`geom_line()`).

# -------------------------------------------------
# 3. Find the country with the highest GDP per capita
# -------------------------------------------------
highest_country <- gdp_pc_data |>
  slice_max(GDP_pc, n = 1)

highest_country

# Print only the country name
highest_country |>
  pull(Country)

## [1] Monaco
## 263 Levels: Afghanistan Albania Algeria American Samoa Andorra ... Zimbabwe

# -------------------------------------------------
# 4. Plot GDP per capita for that country (Monaco if highest)
# -------------------------------------------------
gdp_pc_data |>
  filter(Country == "Monaco") |>
  ggplot(aes(x = Year, y = GDP_pc)) +
  geom_line(color = "blue", linewidth = 1) +
  labs(title = "GDP per Capita Over Time: Monaco",
       x = "Year",
       y = "GDP per Capita") +
  theme_minimal()

## Warning: Removed 11 rows containing missing values or values outside the scale range
## (`geom_line()`).

Exercise 3.2

# -------------------------------------------------
# 1. United States GDP (global_economy)
# -------------------------------------------------
us_gdp <- global_economy |>
  filter(Country == "United States")

autoplot(us_gdp, GDP) +
  labs(title = "United States GDP", y = "GDP")

# Exponential growth → log transformation appropriate
autoplot(us_gdp, log(GDP)) +
  labs(title = "United States GDP (Log Transformed)",
       y = "log(GDP)")

# -------------------------------------------------
# 2. Victorian Bulls, Bullocks and Steers (aus_livestock)
# -------------------------------------------------
vic_bulls <- aus_livestock |>
  filter(State == "Victoria",
         Animal == "Bulls, bullocks and steers")

autoplot(vic_bulls, Count) +
  labs(title = "Victorian Bulls, Bullocks and Steers",
       y = "Count")

# Variance appears relatively stable → no transformation needed


# -------------------------------------------------
# 3. Victorian Electricity Demand (vic_elec)
# -------------------------------------------------
autoplot(vic_elec, Demand) +
  labs(title = "Victorian Electricity Demand",
       y = "Demand")

# Variance increases with level → log transformation helpful
autoplot(vic_elec, log(Demand)) +
  labs(title = "Victorian Electricity Demand (Log Transformed)",
       y = "log(Demand)")

# -------------------------------------------------
# 4. Gas Production (aus_production)
# -------------------------------------------------
gas_prod <- aus_production |> select(Quarter, Gas)

autoplot(gas_prod, Gas) +
  labs(title = "Australian Gas Production",
       y = "Gas")

# Slight increasing variance → log transformation reasonable
autoplot(gas_prod, log(Gas)) +
  labs(title = "Gas Production (Log Transformed)",
       y = "log(Gas)")

Exercise 3.3

# Check variable names (Gas is NOT in canadian_gas)
colnames(canadian_gas)

## [1] "Month"  "Volume"

# Box-Cox lambda (use Volume)
canadian_gas |>
  features(Volume, guerrero)

lambda <- canadian_gas |>
  features(Volume, guerrero) |>
  pull(lambda_guerrero)

canadian_gas |>
  mutate(Volume_bc = box_cox(Volume, lambda)) |>
  autoplot(Volume_bc) +
  labs(title = "Canadian gas (Box-Cox transformed)", y = "Box-Cox(Volume)")

#The estimated Box-Cox parameter λ is approximately 0.58. Since λ is between 0 and 1, 
#this suggests a moderate transformation is appropriate. The variance increases
#slightly with the level of the series, so a square-root-like transformation can help 
#stabilize the variance. However, the need for transformation is not extreme and would not be helpful.

Exercise 3.4

# Reproducible random series selection
set.seed(12345678)

series_id <- sample(unique(aus_retail$`Series ID`), 1)

myseries <- aus_retail |>
  filter(`Series ID` == series_id)

# Show which series was selected (ID + descriptors)
series_id

## [1] "A3349639C"

myseries |>
  distinct(`Series ID`, State, Industry)

# Plot the selected retail series
myseries |>
  autoplot(Turnover) +
  labs(
    title = paste("Selected retail series:", series_id),
    y = "Turnover",
    x = NULL
  )

# Estimate Box-Cox lambda using Guerrero method
lambda_tbl <- myseries |>
  features(Turnover, guerrero)

lambda_tbl

lambda <- lambda_tbl |>
  pull(lambda_guerrero)

lambda

## [1] -0.353785

##As the Lmabda = -0.353785 we recommend Negative Box–Cox transformation 
##(inverse-type transformation).

Exercise 3.5

# 1) Tobacco
lambda_tob <- aus_production |>
  select(Tobacco) |>
  features(Tobacco, guerrero) |>
  pull(lambda_guerrero)

lambda_tob

## [1] 0.9264636

# 2) Economy passengers MEL–SYD
lambda_ans <- ansett |>
  filter(Airports == "MEL-SYD", Class == "Economy") |>
  select(Passengers) |>
  features(Passengers, guerrero) |>
  pull(lambda_guerrero)

lambda_ans

## [1] 1.999927

# 3) Pedestrian counts Southern Cross
lambda_ped <- pedestrian |>
  filter(str_detect(Sensor, "Southern Cross")) |>
  select(Count) |>
  features(Count, guerrero) |>
  pull(lambda_guerrero)

lambda_ped

## [1] -0.2501616

# Box-Cox Transformation Suggestions

#A Tobacco: λ ≈ 0.93
# → No strong transformation needed (approximately linear)

#B Economy passengers: λ ≈ 2
# → Square transformation

#C Pedestrian counts: λ ≈ -0.25
# → Reciprocal power transformation

Exercise 3.7

# -------------------------------------------------
# Select last 5 years (20 quarters)
# -------------------------------------------------
gas <- tail(aus_production, 5*4) |>
  select(Quarter, Gas)

# (A) Plot original series
autoplot(gas, Gas) +
  labs(title = "Gas Production (Last 5 Years)",
       x = "Quarter", y = "Gas")

# The time series shows clear quarterly seasonal fluctuations
# with regular peaks and troughs each year.
# There is also a slight upward trend over the five-year period.

# -------------------------------------------------
# (B) Classical multiplicative decomposition
# -------------------------------------------------
gas_fit <- gas |>
  model(classical_decomposition(Gas, type = "multiplicative"))

gas_comp <- components(gas_fit)

autoplot(gas_comp) +
  labs(title = "Multiplicative Classical Decomposition")

## Warning: Removed 2 rows containing missing values or values outside the scale range
## (`geom_line()`).

# The multiplicative classical decomposition confirms stable
# seasonal effects and a smooth upward trend-cycle component.

# -------------------------------------------------
# (C) Seasonally adjusted data
# -------------------------------------------------
gas_sa <- gas_comp |>
  mutate(SA = Gas / seasonal)

autoplot(gas_sa, SA) +
  labs(title = "Seasonally Adjusted Gas",
       y = "Seasonally Adjusted Gas")

# The seasonally adjusted series removes the regular seasonal
# variation and reveals the underlying trend more clearly.

# -------------------------------------------------
# (D) Add outlier in middle
# -------------------------------------------------
gas_mid <- gas
gas_mid$Gas[10] <- gas_mid$Gas[10] + 300

gas_mid_fit <- gas_mid |>
  model(classical_decomposition(Gas, type = "multiplicative"))

gas_mid_sa <- components(gas_mid_fit) |>
  mutate(SA = Gas / seasonal)

# Compare original vs middle outlier
bind_rows(
  gas_sa |> as_tibble() |> mutate(Type = "Original"),
  gas_mid_sa |> as_tibble() |> mutate(Type = "Middle Outlier")
) |>
  ggplot(aes(x = Quarter, y = SA, color = Type)) +
  geom_line(linewidth = 1) +
  labs(title = "Effect of Middle Outlier on SA")

# When an outlier is introduced in the middle of the series,
# it noticeably distorts the seasonally adjusted values and
# slightly affects the estimated trend and seasonal components.

# -------------------------------------------------
# (E) Add outlier near end
# -------------------------------------------------
gas_end <- gas
gas_end$Gas[20] <- gas_end$Gas[20] + 300

gas_end_fit <- gas_end |>
  model(classical_decomposition(Gas, type = "multiplicative"))

gas_end_sa <- components(gas_end_fit) |>
  mutate(SA = Gas / seasonal)

# Compare all three
bind_rows(
  gas_sa |> as_tibble() |> mutate(Type = "Original"),
  gas_mid_sa |> as_tibble() |> mutate(Type = "Middle Outlier"),
  gas_end_sa |> as_tibble() |> mutate(Type = "End Outlier")
) |>
  ggplot(aes(x = Quarter, y = SA, color = Type)) +
  geom_line(linewidth = 1) +
  labs(title = "Effect of Outlier Location on SA")

# When the outlier is placed near the end of the series,
# the effect on the seasonally adjusted data is smaller
# because fewer moving-average calculations are influenced.



# F.
# Yes, the location of the outlier matters.
# A middle outlier causes greater distortion,
# while an end outlier has a more limited impact.

Exercise 3.8

data(aus_retail)

# Create myseries (replace with your State/Industry from Exercise 7)
myseries <- aus_retail %>%
  filter(State == "Victoria",
         Industry == "Department stores")

# X-11 decomposition
x11_dcmp <- myseries %>%
  model(x11 = X_13ARIMA_SEATS(Turnover ~ x11())) %>%
  components()

# Plot
autoplot(x11_dcmp) +
  labs(title = "X-11 Decomposition of Retail Turnover")

#The irregular component of the decomposition shows increased volatility the the early 1990s and again in the late 2000s. These spikes correspond to recession periods during those times. Apart from these disturbances, the series displays a steady upward trend and a consistent seasonal pattern in retail turnover.

Exercise 3.9

A.The STL decomposition shows a steady upward trend in the Australian civilian labour force over time. The seasonal pattern is consistent each year, with regular peaks#and troughs, and its magnitude (about ±100) is small compared to the overall leve of the series (6,500–9,000). This indicates that long-term growth explains most of the variation. The remainder component is generally small but shows a noticeable

#B.disturbance in the early 1990s. Yes, the 1991/1992 recession is visible. It appears as a sharp negative spike in the remainder component and a temporary slowing in the trend, while the seasonal pattern remains stable.