library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✔ tibble 3.1.8 ✔ tsibble 1.1.2
## ✔ dplyr 1.0.9 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.2.0 ✔ feasts 0.2.2
## ✔ lubridate 1.8.0 ✔ fable 0.3.1
## ✔ ggplot2 3.3.6
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(readxl)
library(ggplot2)
library(tsibble)
library(lubridate)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.2 ──
## ✔ readr 2.1.2 ✔ stringr 1.4.1
## ✔ purrr 0.3.4 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks lubridate::intersect(), base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks lubridate::setdiff(), base::setdiff()
## ✖ tsibble::union() masks lubridate::union(), base::union()
library(dplyr)
library(seasonal)
##
## Attaching package: 'seasonal'
##
## The following object is masked from 'package:tibble':
##
## view
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyr)
library(tibble)
library(ggfortify)
library(moments)
library(fable)
library(brew)
library(sos)
##
## Attaching package: 'sos'
##
## The following object is masked from 'package:tidyr':
##
## matches
##
## The following object is masked from 'package:dplyr':
##
## matches
##
## The following object is masked from 'package:utils':
##
## ?
library(slider)
library(x13binary)
library(USgas)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
GA_data <- read_excel("C:/Users/ryanf/OneDrive/Pictures/final paper/GA data.xlsx")
ga_data <- GA_data %>%
  as_tsibble(index = Year)
head(ga_data)
## # A tsibble: 6 x 5 [1Y]
## Year GDP Income Consumption Total_employment
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1998 9 6.6 8.1 2.6
## 2 1999 5.7 8.2 8.2 2.8
## 3 2000 3.9 5 4.8 0.2
## 4 2001 3 2.8 3.8 0.3
## 5 2002 4.7 3.9 5.6 0.8
## 6 2003 6.5 5.4 6.3 2.6
gg <- GA_data %>%
  as_tibble()
head(gg)
## # A tibble: 6 × 5
## Year GDP Income Consumption Total_employment
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1998 9 6.6 8.1 2.6
## 2 1999 5.7 8.2 8.2 2.8
## 3 2000 3.9 5 4.8 0.2
## 4 2001 3 2.8 3.8 0.3
## 5 2002 4.7 3.9 5.6 0.8
## 6 2003 6.5 5.4 6.3 2.6
gg %>%
  ggpairs()
Year doesn't appear to have any relationship with the rest of the variables. All of the remaining relationships are statistically significant, most at the 90% level. Income and Consumption are significant at the 99% level with GDP, and Consumption is significant at the 95% level with Income. All three of the remaining variables (Consumption, Income, and Total_employment) have a positive correlation with GDP. Personal income has a positive correlation with total employment and consumption, and consumption has a positive correlation with total employment.
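To back up the ggpairs reading numerically, the correlation matrix can also be printed directly; a minimal sketch, assuming the gg tibble built above:
# Pairwise correlations among the measured variables; Year is dropped
# since it is the time index rather than a measurement
gg %>%
  select(-Year) %>%
  cor() %>%
  round(2)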
ga_model <- ga_data %>%
model(TSLM(Consumption ~ GDP + Income + Total_employment))
report(ga_model)
## Series: Consumption
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.82914 -0.81453 -0.06476 0.79437 3.51296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09569 0.89627 0.107 0.916094
## GDP 0.89891 0.19253 4.669 0.000167 ***
## Income 0.12738 0.21102 0.604 0.553218
## Total_employment 0.10114 0.35788 0.283 0.780533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.505 on 19 degrees of freedom
## Multiple R-squared: 0.7294, Adjusted R-squared: 0.6867
## F-statistic: 17.08 on 3 and 19 DF, p-value: 1.2684e-05
The intercept has a value of 0.09569, meaning that when GDP, Income, and Total_employment all change by 0%, Consumption changes by about 0.095%.
GDP has a coefficient of 0.89, meaning a 1% increase in GDP is associated with a 0.89% increase in Consumption, holding the other variables fixed. This is the only coefficient that is statistically significant, and it is significant at the 99% level.
Income has a coefficient of 0.12, meaning a 1% increase in Income is associated with a 0.12% increase in Consumption.
Total_employment has a coefficient of 0.10, meaning a 1% increase in Total_employment is associated with a 0.1% increase in Consumption.
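The same coefficient table can be pulled into a tibble for further use; a minimal sketch using fabletools' tidy() and glance() on the fitted mable:
# Coefficient estimates, standard errors, t values, and p-values
tidy(ga_model)
# Model-level fit statistics (R-squared, adjusted R-squared, etc.)
glance(ga_model)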
augment(ga_model) %>%
ggplot(aes(x = Year)) +
geom_line(aes(y = Consumption, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Percentage Change", title = "Percent Change in Georgia Consumption") +
scale_colour_manual(values = c(Data = "black", Fitted = "blue")) +
guides(colour = guide_legend(title = NULL))
fit_ga <- ga_data %>%
model(lm = TSLM(Consumption ~ GDP + Income + Total_employment))
ga_scenarios <- scenarios(
Increase = new_data(ga_data, 3) %>%
mutate(GDP = 1, Income = 1, Total_employment = 1),
names_to = "Scenario")
ga_forecast1 <- forecast(fit_ga, new_data = ga_scenarios)
ga_scenarios2 <- scenarios(
  Increase = new_data(ga_data, 3) %>%
    mutate(GDP = 0, Income = 1, Total_employment = 0),
  Decrease = new_data(ga_data, 3) %>%
    mutate(GDP = 0, Income = -.5, Total_employment = 0),
  Increase1 = new_data(ga_data, 3) %>%
    mutate(GDP = 0, Income = .8, Total_employment = 0),
  names_to = "Scenario")
ga_forecast2 <- forecast(fit_ga, new_data = ga_scenarios2)
ga_data %>%
autoplot(Consumption) +
autolayer(ga_forecast1) +
autolayer(ga_forecast2) +
labs(title = "Georgia Scenario 1 and 2 on Consumption Forecast", y = "Percentage Change")
I plotted both sets of scenarios with prediction intervals. Since the most recent data covers the COVID period, the percent change in consumption is high due to financial stimulus. I could see the drop being fairly drastic as stimulus slows down and concerns over inflation and recession increase. Consumption growth most likely would not slow all the way to zero, but that outcome might be within the prediction interval.
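To check whether a fall to zero growth actually sits inside the interval, the prediction intervals can be printed numerically; a minimal sketch using fabletools' hilo() on the scenario fable:
# 95% prediction intervals for each scenario and forecast horizon
ga_forecast2 %>%
  hilo(level = 95)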
PROBLEM 2
M3_exam1 <- read_excel("C:/Users/ryanf/Downloads/M3_exam1.xlsx")
#View(M3_exam1)
ryan_data <- M3_exam1 %>%
  select(-Series, -N, -NF, -Category, -`Starting Year`, -`Starting Quarter`) %>%
  filter(`Student Name` == 'Ryan') %>%
  pivot_longer(-`Student Name`) %>%
  select(-`Student Name`, -name) %>%
  rename(Micro = value)
ryan_data <- na.omit(ryan_data)
ryan_data <- ryan_data %>%
mutate(Date = yearquarter(57:99)) %>%
select(Date, Micro) %>%
as_tsibble(index = Date)
ryan_train <- ryan_data %>%
filter(year(Date) <= 1990)
ryan_test <- ryan_data %>%
filter(year(Date) > 1990)
ryan_drift_prep <- ryan_data %>%
filter(year(Date) > 1987)
ryan_drift_prep_train <- ryan_drift_prep %>%
filter(year(Date) < 1993)
ryan_drift_prep_test <- ryan_drift_prep %>%
filter(year(Date) > 1992)
#ryan_data
ryan_data %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Micro)) +
labs(title = "Micro Data", y = "Micro Data Index")
ryan_data %>%
features(Micro, features = guerrero)
## # A tibble: 1 × 1
## lambda_guerrero
## <dbl>
## 1 1.69
lambda <- 1.69
ryan_data %>%
  autoplot(box_cox(Micro, lambda)) +
labs(title = "Box-Cox Transformed Micro Data", x = "Date")
ryan_stl <- ryan_data %>%
model(STL(Micro ~ season(window = 4), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition:Micro Data")
ryan_stl
I did a Box-Cox transformation and an STL decomposition to help me understand my data. The Box-Cox transformation did not change the variance of my data very much. The STL decomposition helped me understand the increasing trend and also ruled out seasonal methods such as the seasonal naive forecast, given the unlikelihood of seasonal patterns.
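One way to quantify that reading is with the STL-based strength features from feasts; a minimal sketch (trend_strength and the seasonal strength column are on a 0-1 scale, and the exact name of the seasonal column depends on the seasonal period):
# High trend_strength and low seasonal strength would support ruling
# out seasonal methods
ryan_data %>%
  features(Micro, feat_stl)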
ryan_mod <- ryan_train %>%
model(
Seasonal_naive = SNAIVE(Micro),
Naive = NAIVE(Micro),
Drift = RW(Micro ~ drift()),
Mean = MEAN(Micro)
)
ryan_fore <- ryan_mod %>%
forecast(h = 4)
accuracy(ryan_fore, ryan_data)
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test 63.4 206. 201. 0.643 2.07 0.312 0.286 -0.0531
## 2 Mean Test 2627. 2629. 2627. 27.0 27.0 4.08 3.64 -0.274
## 3 Naive Test 485. 498. 485. 4.97 4.97 0.755 0.691 -0.274
## 4 Seasonal_naive Test 822. 857. 822. 8.43 8.43 1.28 1.19 -0.141
ryan_fore %>%
autoplot() +
autolayer(ryan_data)
## Plot variable not specified, automatically selected `.vars = Micro`
Based on my training data, the drift method seems to be the best fit. This is backed up by the drift method having the best accuracy numbers of the four benchmarks. I expected this given the strong trend shown in my STL decomposition.
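As a sanity check on what the drift method computes, its point forecasts can be reproduced by hand: the h-step forecast is the last observation plus h times the average historical change. A minimal sketch against the training set above:
# Drift point forecasts by hand: y_T + h * (y_T - y_1) / (T - 1)
y <- ryan_train$Micro
n <- length(y)
slope <- (y[n] - y[1]) / (n - 1)
y[n] + (1:4) * slope # should match the Drift point forecasts in ryan_fore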
ryan_drift2 <- ryan_drift_prep_train %>%
model(Drift = RW(Micro ~ drift()))
ryan_drift2_forecast <- ryan_drift2 %>%
forecast(h = "1 year")
ryan_drift2 %>%
gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
augment(ryan_drift2) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Drift 6.14 0.803
accuracy(ryan_drift2_forecast, ryan_data)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -41.6 84.5 69.2 -0.499 0.819 0.0948 0.102 -0.617
ryan_drift3 <- ryan_data %>%
model(Drift = RW(Micro ~ drift()))
ryan_drift3_forecast <- ryan_drift3 %>%
forecast(h = "1 year")
ryan_data %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Micro)) +
autolayer(ryan_drift3_forecast) +
labs(title = "Drift Forecast", y = "Micro Data Index")
Because I tried all of the benchmark forecast methods on my training data, I could already be fairly confident that drift was the best method for me: it had the best accuracy. Visually, my data already had a strong trend, so drift looked to be a good fit. The residuals are centered at a mean of zero, which is good. I do wish the residual plot looked less structured, though; it makes me think the residuals might not be completely white noise. There was also a pretty big outlier in 1992. Still, my p-value in the Ljung-Box test was fairly high.
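To pin down that 1992 outlier, the residuals can be filtered directly; a sketch flagging residuals more than two standard deviations from zero, using the same .resid column as the Ljung-Box check:
# Flag unusually large residuals from the drift fit
augment(ryan_drift2) %>%
  filter(abs(.resid) > 2 * sd(.resid, na.rm = TRUE))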
In conclusion, drift is the best forecast method for my data: it has homoskedastic residuals distributed around a mean of zero that are indistinguishable from white noise based on the Ljung-Box test. Furthermore, the drift forecast has the best accuracy of all tested methods and is visually convincing.