624_HW3_Luis_Hernandez

#5.10 Q1

#Australian Population
#Here we have annualized data with a consistent upward trend so I chose a Random Walk with drift
aus_pop <- global_economy |>
  filter(Country == "Australia")

pop_fit<-aus_pop |>
  model(RW(Population ~ drift()))

pop_fc <- pop_fit |> 
  forecast(h = 10)

autoplot(pop_fc, aus_pop)

#Bricks
#This is seasonal Bricks data with a strong seasonal pattern so I chose a seasonal naive
bricks_fit <- aus_production |>
  select(Quarter, Bricks) |>
  model(SNAIVE(Bricks))

bricks_fc <- bricks_fit |> 
  forecast(h = 12)

autoplot(bricks_fc, aus_production)

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf
## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

#NSW Lambs
#Similarly livestock is seasonally affected data that has a trend from different months year over year
nsw_lambs <- aus_livestock |>
  filter(State == "New South Wales", Animal == "Lambs")

nsw_lambs

## # A tsibble: 558 x 4 [1M]
## # Key:       Animal, State [1]
##       Month Animal State            Count
##       <mth> <fct>  <fct>            <dbl>
##  1 1972 Jul Lambs  New South Wales 587600
##  2 1972 Aug Lambs  New South Wales 553700
##  3 1972 Sep Lambs  New South Wales 494900
##  4 1972 Oct Lambs  New South Wales 533500
##  5 1972 Nov Lambs  New South Wales 574300
##  6 1972 Dec Lambs  New South Wales 517500
##  7 1973 Jan Lambs  New South Wales 562600
##  8 1973 Feb Lambs  New South Wales 426900
##  9 1973 Mar Lambs  New South Wales 496300
## 10 1973 Apr Lambs  New South Wales 496000
## # ℹ 548 more rows

lambs_fit <- nsw_lambs |>
  model(SNAIVE(Count))

lambs_fc <- lambs_fit |>
  forecast(h = 12)
lambs_fc

## # A fable: 12 x 6 [1M]
## # Key:     Animal, State, .model [1]
##    Animal State           .model           Month
##    <fct>  <fct>           <chr>            <mth>
##  1 Lambs  New South Wales SNAIVE(Count) 2019 Jan
##  2 Lambs  New South Wales SNAIVE(Count) 2019 Feb
##  3 Lambs  New South Wales SNAIVE(Count) 2019 Mar
##  4 Lambs  New South Wales SNAIVE(Count) 2019 Apr
##  5 Lambs  New South Wales SNAIVE(Count) 2019 May
##  6 Lambs  New South Wales SNAIVE(Count) 2019 Jun
##  7 Lambs  New South Wales SNAIVE(Count) 2019 Jul
##  8 Lambs  New South Wales SNAIVE(Count) 2019 Aug
##  9 Lambs  New South Wales SNAIVE(Count) 2019 Sep
## 10 Lambs  New South Wales SNAIVE(Count) 2019 Oct
## 11 Lambs  New South Wales SNAIVE(Count) 2019 Nov
## 12 Lambs  New South Wales SNAIVE(Count) 2019 Dec
## # ℹ 2 more variables: Count <dist>, .mean <dbl>

autoplot(lambs_fc, nsw_lambs)

#Household Wealth
#Yearly data with a likely increasing trend so I chose a RW model
wealth_fit <- hh_budget |>
  model(RW(Wealth ~ drift()))

wealth_fc <- wealth_fit |> forecast(h = 8)
wealth_fc

## # A fable: 32 x 5 [1Y]
## # Key:     Country, .model [4]
##    Country   .model                Year
##    <chr>     <chr>                <dbl>
##  1 Australia RW(Wealth ~ drift())  2017
##  2 Australia RW(Wealth ~ drift())  2018
##  3 Australia RW(Wealth ~ drift())  2019
##  4 Australia RW(Wealth ~ drift())  2020
##  5 Australia RW(Wealth ~ drift())  2021
##  6 Australia RW(Wealth ~ drift())  2022
##  7 Australia RW(Wealth ~ drift())  2023
##  8 Australia RW(Wealth ~ drift())  2024
##  9 Canada    RW(Wealth ~ drift())  2017
## 10 Canada    RW(Wealth ~ drift())  2018
## # ℹ 22 more rows
## # ℹ 2 more variables: Wealth <dist>, .mean <dbl>

autoplot(wealth_fc, hh_budget)

#Takeaway Food Turnover
#
takeaway_data <- aus_retail |>
  filter(Industry == "Takeaway food services")

takeaway_fit <- takeaway_data |>
  model(SNAIVE(Turnover))

takeaway_fc <- takeaway_fit |>
  forecast(h = "2 years")

autoplot(takeaway_fc, takeaway_data) +
  facet_wrap(~State, ncol = 2, scales = "free_y") +
  theme(strip.text = element_text(size = 10))

Including Plots

#5.10 Q2

#Filter then build monthly tsibble
fb_monthly <- gafa_stock |>
  filter(Symbol == "FB") |>
  as_tibble() |>                              
  mutate(Month = yearmonth(Date)) |>
  group_by(Month) |>
  summarise(Close = last(Close)) |>
  as_tsibble(index = Month)

#Time plot
autoplot(fb_monthly, Close) +
  labs(title = "Facebook (FB) Monthly Closing Price",
       y = "Close price")

# Drift forecast (1 month ahead)
fb_fit_drift <- fb_monthly |>
  model(RW(Close ~ drift()))

fb_fc_drift <- fb_fit_drift |>
  forecast(h = 1)

#Plot of forecast 
autoplot(fb_fc_drift, fb_monthly) +
  labs(title = "Facebook (FB) Drift Forecast (1 Month Ahead)",
       y = "Close price")

#Show drift forecasts are equal extending the line between first & last observations
y_first <- first(fb_monthly$Close)
y_last  <- last(fb_monthly$Close)
T <- nrow(fb_monthly)

c_drift <- (y_last - y_first) / (T - 1)

# Manual drift forecasts for h steps 

h <- 1

manual_mean <- y_last + c_drift * (1:h)

# Forecast show they are equal to line drawn between first and last obs
fpp3_mean <- fb_fc_drift |>
  as_tibble() |>
  pull(.mean)

all.equal(fpp3_mean, manual_mean)

## [1] TRUE

first_month <- first(fb_monthly$Month)
last_month  <- last(fb_monthly$Month)

# Create future months to extend the line
future_months <- yearmonth(last_month) + 1:h

line_months <- c(fb_monthly$Month, future_months)
line_values <- y_first + c_drift * (0:(length(line_months) - 1))

line_df <- tibble(Month = line_months, Line = line_values)

autoplot(fb_monthly, Close) +
  geom_line(data = line_df, aes(x = Month, y = Line), linewidth = 1) +
  labs(
    title = "Drift = Straight-Line Extension (First → Last Observation)",
    y = "Close price")

#Other benchmark forecasts. 
#Used 12 months ahead instead of 30 days and plotted models together for comparison to analyze later
fb_models <- fb_monthly |>
  model(
    mean  = MEAN(Close),
    naive = NAIVE(Close),
    drift = RW(Close ~ drift())
  )

fb_fc_all <- fb_models |>
  forecast(h = 12)   

autoplot(fb_fc_all, fb_monthly) +
  labs(
    title = "Benchmark Forecasts for Facebook (FB)",
    y = "Close price"
  )

# Train/test split on monthly data
fb_train <- fb_monthly |> filter_index(. ~ "2017 Dec")
fb_test  <- fb_monthly |> filter_index("2018 Jan" ~ .)

fit_split <- fb_train |>
  model(
    mean  = MEAN(Close),
    naive = NAIVE(Close),
    drift = RW(Close ~ drift())
  )

fc_split <- fit_split |> forecast(new_data = fb_test)
accuracy(fc_split, fb_test)

## # A tibble: 3 × 10
##   .model .type     ME  RMSE   MAE    MPE  MAPE  MASE RMSSE  ACF1
##   <chr>  <chr>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 drift  Test  -23.9   35.0  26.4 -16.2   17.5   NaN   NaN 0.625
## 2 mean   Test   59.2   62.1  59.2  34.3   34.3   NaN   NaN 0.569
## 3 naive  Test   -8.18  20.6  15.8  -6.30  10.3   NaN   NaN 0.569

#conclusion
#The Facebook monthly closing price shows an upward trend with noticeable volatility. The drift method projects average monthly increase in the future, equivalent to extending the straight line between the first and last observations. When comparing methods, the naive forecast it seems performs best for stock prices. The mean forecast performs poorly since stock prices do not typically return to a fixed long term average but really move in trends that may not be seasonal but more likely associated with recent actions until a turn happens. This caused a spread far too wide but we can see contrary to that was the naive model since stocks are a more akin to a random walk by nature.This is supported in the final table by the lower RMSE and MAPE.

#5.10 Q3
# Extract data of interest
recent_production <- aus_production |>
  filter(year(Quarter) >= 1992)
# Define and estimate a model
fit <- recent_production |> model(SNAIVE(Beer))
# Look at the residuals
fit |> gg_tsresiduals()

## Warning: `gg_tsresiduals()` was deprecated in feasts 0.4.2.
## ℹ Please use `ggtime::gg_tsresiduals()` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 4 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 4 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Look a some forecasts
fit |> forecast() |> autoplot(recent_production)

#The seasonal naive model captures the quarterly seasonal pattern in beer production. The residuals are hovering around zero and show no strong remaining seasonal structure. There might be slight autocorrelation remainder, suggesting the may not just be white noise. The forecasts repeat that seasonal pattern too and don't account for long-term trend. Overall, the seasonal naive method provides a good baseline forecast, but a more advanced model may improve accuracy. This could be a combination model, additional parameters or perhaps an moving model.

#5.10 Q4
# Extract Australian exports
exports <- global_economy |>
  filter(Country == "Australia") |>
  select(Year, Exports)

# Fit naive model
fit_exports <- exports |>
  model(NAIVE(Exports))


fit_exports |> gg_tsresiduals()

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 1 row containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 1 row containing missing values or values outside the scale range
## (`geom_rug()`).

# Forecasts
fit_exports |>
  forecast(h = 10) |>
  autoplot(exports) +
  labs(title = "Naive Forecasts for Australian Exports",
       y = "Exports")

#Bricks data
bricks <- aus_production |>
  select(Quarter, Bricks)

# Seasonal naive
fit_bricks <- bricks |>
  model(SNAIVE(Bricks))

# Residual diagnostics
fit_bricks |> gg_tsresiduals()

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 24 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_rug()`).

# Forecasts
fit_bricks |>
  forecast(h = 24) |>
  autoplot(bricks) +
  labs(title = "Seasonal Naive Forecasts for Bricks",
       y = "Bricks")

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning in max(ids, na.rm = TRUE): no non-missing arguments to max; returning
## -Inf

## Warning: Removed 24 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 20 rows containing missing values or values outside the scale range
## (`geom_line()`).

#added this because the ribbon wasn't completely clear
fit |> forecast(h = 24)

## # A fable: 24 x 4 [1Q]
## # Key:     .model [1]
##    .model       Quarter
##    <chr>          <qtr>
##  1 SNAIVE(Beer) 2010 Q3
##  2 SNAIVE(Beer) 2010 Q4
##  3 SNAIVE(Beer) 2011 Q1
##  4 SNAIVE(Beer) 2011 Q2
##  5 SNAIVE(Beer) 2011 Q3
##  6 SNAIVE(Beer) 2011 Q4
##  7 SNAIVE(Beer) 2012 Q1
##  8 SNAIVE(Beer) 2012 Q2
##  9 SNAIVE(Beer) 2012 Q3
## 10 SNAIVE(Beer) 2012 Q4
## # ℹ 14 more rows
## # ℹ 2 more variables: Beer <dist>, .mean <dbl>

#Conclusion:For the Australian Exports, the naive method was chosen since the data are annual and do not exhibit a seasonal pattern that was observed. The residuals suggest the model removes short-term randomness, some trend-related structure may remain, whihc is indicating that naive provides only a solid benchmark for this series as the model handles the immediate jumps well but still doesn't capture the overall trend. That is what makes it good as a comparison model to look at. The Bricks series displays strong quarterly seasonality, making the seasonal naive method the better approach. The residual plots for Bricks show that most of the seasonal structure are removed, and the residuals fluctuate around zero. The ACF plot shows significant autocorrelation at several lags, indicating that the residuals are not just white noise. This is why SNAIVE is a solid baseline for the Bricks data, while NAIVE serves as a good baseline for Australian Exports.

#5.10 Q7
set.seed(111)

my_retail <- aus_retail |>
  filter(`Series ID` == sample(aus_retail$`Series ID`, 1))
#a.Create a training dataset consisting of observations before 2011 using
myseries_train <- my_retail |>
  filter(year(Month) < 2011)

#b.Check that your data have been split appropriately by producing the following plot.
#This is a training dataset of observations prior to 2011, as shown by the red segment in the plot. Observations from 2011 onward form the test 

autoplot(my_retail, Turnover) +
  autolayer(myseries_train, Turnover, colour = "red")

#c.Fit a seasonal naïve model using SNAIVE() applied to your training data (myseries_train).

fit <- myseries_train |>
  model(SNAIVE(Turnover))
#d.Check the residuals.

fit |> gg_tsresiduals()

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_line()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_point()`).

## Warning: Removed 12 rows containing non-finite outside the scale range
## (`stat_bin()`).

## Warning: Removed 12 rows containing missing values or values outside the scale range
## (`geom_rug()`).

#Do the residuals appear to be uncorrelated and normally distributed?
#The residuals don't appear to be white noise. The ACF shows strong positive autocorrelation at lag 1 and several significant lags thereafter, indicating that the seasonal naive model has not fully removed other dependence in the series. Although the residuals are centered near zero, the presence of autocorrelation suggests that there is remaining structure in the data that is not accounted for.

#e.Produce forecasts for the test data
#The forecasts repeat seasonal pattern but may not capture the continuing upward trend observed in the test data. As a result, forecast errors increase over time, and the model underestimates the actual values in the later period as the forecast fully sits below
fc <- fit |>
  forecast(new_data = anti_join(my_retail, myseries_train))

## Joining with `by = join_by(State, Industry, `Series ID`, Month, Turnover)`

fc |> autoplot(my_retail)

#f.Compare the accuracy of your forecasts against the actual values.
#Forecast accuracy is far worse on the test set. The RMSE increases from 37.6 in the training sample to 116.4 in the test sample, the MAPE increases from approximately 6.8% to 10.8%. This is suggesting that the seasonal naive model performs much worse on unseen data and fails to account for the upward trend that occurs towards the end near 2015.
fit |> accuracy()

## # A tibble: 1 × 12
##   State    Industry .model .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>    <chr>    <chr>  <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Victoria Cafes, … SNAIV… Trai…  22.1  37.6  27.6  6.81  8.76     1     1 0.829

fc |> accuracy(my_retail)

## # A tibble: 1 × 12
##   .model    State Industry .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>     <chr> <chr>    <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(T… Vict… Cafes, … Test   69.0  116.  89.9  7.68  10.8  3.25  3.09 0.921

#g.How sensitive are the accuracy measures to the amount of training data used?
# The seasonal model captures the seasonal pattern in the retail turnover series but the residuals diagnostic table shows us that the residuals are not white noise. The ACF shows strong positive autocorrelation at lag 1 and several other lags, suggesting structure in the data. The histogram indicates good symmetry but it fails to eliminate concerns about dependence. The forecasts seem to capture the seasonal pattern but are simply missing that upward trend towards the end. Our measures confirm this as RMSE increases from 37.6 in the training sample to 116.4 in the test, and MAPE increases from 6.8% to 10.8%. This suggests that forecast accuracy is sensitive to the training period chosen, and the growth after 2011 reduces predictive performance of the seasonal model.

Note that the echo = FALSE parameter was added to the code chunk to prevent printing of the R code that generated the plot.