library(readxl)
library(tidyquant)
library(fpp3)
library(moments)
library(tsibble)
library(tsibbledata)
library(ggplot2)
library(dplyr)
library(seasonal)
library(seasonalview)
library(fabletools)
library(fable)
library(ggfortify)
library(quantmod)
library(GGally)

#QUESTION 1

#1 and #2

col <- read_excel("/Users/Peter Cook/Documents/Economics and Finance/Business Forecasting/Forecasting Data/Colorado.xlsx")
col
## # A tibble: 23 × 5
##     Year   GDP Income Consumption Employment
##    <dbl> <dbl>  <dbl>       <dbl>      <dbl>
##  1  1998   8.9    7.5         9.6        2.8
##  2  1999  10     11.3        10.3        3.9
##  3  2000   3.7    4.9         5.3        0.8
##  4  2001   1.8    0.1         2.8       -0.8
##  5  2002   3.2    2.1         4.4       -0.3
##  6  2003   4.1    3.8         6.1        1.7
##  7  2004   8.1    6.8         5.9        2.6
##  8  2005   5.4    8.2         6.1        2.4
##  9  2006   6.8    6.7         5.9        3.5
## 10  2007   4.2    3.9         3.5        1  
## # … with 13 more rows
## # ℹ Use `print(n = ...)` to see more rows
ts_col <- col %>%
  as_tsibble(index=Year)
ts_col
## # A tsibble: 23 x 5 [1Y]
##     Year   GDP Income Consumption Employment
##    <dbl> <dbl>  <dbl>       <dbl>      <dbl>
##  1  1998   8.9    7.5         9.6        2.8
##  2  1999  10     11.3        10.3        3.9
##  3  2000   3.7    4.9         5.3        0.8
##  4  2001   1.8    0.1         2.8       -0.8
##  5  2002   3.2    2.1         4.4       -0.3
##  6  2003   4.1    3.8         6.1        1.7
##  7  2004   8.1    6.8         5.9        2.6
##  8  2005   5.4    8.2         6.1        2.4
##  9  2006   6.8    6.7         5.9        3.5
## 10  2007   4.2    3.9         3.5        1  
## # … with 13 more rows
## # ℹ Use `print(n = ...)` to see more rows
ggpairs(ts_col %>% select(-Year))

#Imported the data set (cleaned it in Excel) and converted it into a tsibble, then plotted the variables against one another using ggpairs.

#3

#All of the variables are positively and very strongly correlated with one another, with each pairwise correlation statistically significant at the 99.9% level. This makes sense because they are all indicators of the economic health of a state.
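
#As a quick numeric check on the ggpairs panel, the correlation matrix can
#also be computed directly with base R's cor() (a minimal sketch, added for
#illustration):
col %>%
  select(-Year) %>%
  cor()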

#4

col_model <- ts_col %>%
  model(lm = TSLM(Consumption ~ Income + GDP + Employment))
report(col_model)
## Series: Consumption 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.40850 -0.99901 -0.04727  0.78164  2.56161 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept)   1.2604     0.5658   2.228   0.0382 *  
## Income       -0.1421     0.1296  -1.097   0.2865    
## GDP           0.9475     0.1689   5.609 2.08e-05 ***
## Employment    0.1306     0.2795   0.467   0.6457    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.304 on 19 degrees of freedom
## Multiple R-squared: 0.8503,  Adjusted R-squared: 0.8267
## F-statistic: 35.99 on 3 and 19 DF, p-value: 4.8942e-08

#Estimated a linear model of consumption as a function of GDP, personal income, and employment.

#5

#Holding the other regressors fixed, our model says that a one percentage point increase in employment growth is associated with a 0.1306 point rise in consumption growth, and a one point increase in GDP growth with a 0.9475 point rise. The income coefficient is negative: a one point increase in income growth is associated with a 0.1421 point fall in consumption growth, though this estimate (like the employment one) is not statistically significant. The effect of GDP on consumption is statistically significant at the 99.9% level. The intercept implies that if income, GDP, and employment growth are all zero, consumption still grows by about 1.26% per year.
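
#To make the interpretation concrete, the fitted equation is
#Consumption = 1.2604 - 0.1421*Income + 0.9475*GDP + 0.1306*Employment.
#A hand-computed sketch with hypothetical growth rates (2% income, 3% GDP,
#1% employment; values chosen purely for illustration):
1.2604 - 0.1421 * 2 + 0.9475 * 3 + 0.1306 * 1
## [1] 3.9493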

#6

augment(col_model) %>%
  ggplot(aes(x = Year)) +
  geom_line(aes(y = Consumption, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  scale_color_manual(
    values = c(Data = "black", Fitted = "green")
  ) +
  labs(y = "Consumption",
       title = "Colorado Consumption") +
  guides(colour = guide_legend(title = "Series"))

#This graph overlays the model's fitted values on the actual consumption series over the full sample period.
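
#As an additional (optional) check on the fit, gg_tsresiduals() from feasts
#(loaded via fpp3) plots the model's residuals, their ACF, and a histogram
#in one panel (a minimal sketch):
col_model %>% gg_tsresiduals()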

#7

forecasted_scenarios <- scenarios(
  Increase = new_data(ts_col, 3) %>%
    mutate(Income=1, GDP=1, Employment=1),
  names_to = "Scenario")

forecast1 <- forecast(col_model, new_data = forecasted_scenarios)

 

ts_col %>%
  autoplot(Consumption) +
  autolayer(forecast1) +
  labs(title = "Colorado Consumption", y = "Percent Change")

#Forecasted consumption three years ahead under a scenario of constant 1% growth in GDP, personal income, and employment.

future_scenarios2 <- scenarios(
  Increase = new_data(ts_col, 3) %>%
    mutate(Income= 1, GDP = 0, Employment = 0),
  Decrease = new_data(ts_col, 3) %>%
    mutate(Income = -0.5, GDP = 0, Employment = 0),
  Increase8 = new_data(ts_col, 3) %>%
    mutate(Income = 0.8, GDP = 0, Employment = 0),
  names_to = "Scenario")

 

forecast2 <- forecast(col_model, new_data = future_scenarios2)

 

ts_col %>%
  autoplot(Consumption) +
  autolayer(forecast2) +
  labs(title = "Colorado Consumption", y = "Percent Change")

forecast2
## # A fable: 9 x 8 [1Y]
## # Key:     Scenario, .model [3]
##   Scenario  .model  Year Consumption .mean Income   GDP Employment
##   <chr>     <chr>  <dbl>      <dist> <dbl>  <dbl> <dbl>      <dbl>
## 1 Increase  lm      2021   N(1.1, 2)  1.12    1       0          0
## 2 Increase  lm      2022   N(1.1, 2)  1.12    1       0          0
## 3 Increase  lm      2023   N(1.1, 2)  1.12    1       0          0
## 4 Decrease  lm      2021 N(1.3, 2.1)  1.33   -0.5     0          0
## 5 Decrease  lm      2022 N(1.3, 2.1)  1.33   -0.5     0          0
## 6 Decrease  lm      2023 N(1.3, 2.1)  1.33   -0.5     0          0
## 7 Increase8 lm      2021   N(1.1, 2)  1.15    0.8     0          0
## 8 Increase8 lm      2022   N(1.1, 2)  1.15    0.8     0          0
## 9 Increase8 lm      2023   N(1.1, 2)  1.15    0.8     0          0

#The prediction intervals for the different income scenarios are roughly the same, with similar point forecasts.

#Comparing the two sets of forecasts, I would have expected the different rates of income growth to have a more significant effect on the prediction intervals and point forecasts, but all of the scenarios produce roughly equivalent values. In hindsight this follows from the regression output: the income coefficient is small (-0.1421) and statistically insignificant, so even sizable differences in assumed income growth move the forecast very little.
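
#A quick arithmetic sketch of why the scenarios barely differ: the gap between
#the Increase (Income = 1) and Decrease (Income = -0.5) point forecasts should
#equal the income coefficient times the difference in assumed income growth.
-0.1421 * (1 - (-0.5))
## [1] -0.21315
#This matches the fable above: 1.12 - 1.33 = -0.21.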

#QUESTION 2

data <- read_excel("/Users/Peter Cook/Documents/Economics and Finance/Business Forecasting/Forecasting Data/Exam1.xlsx")

data_ts <- data %>%
  mutate(Quarter = yearquarter(Quarter)) %>%
  as_tsibble(index=Quarter)


data_ts
## # A tsibble: 44 x 2 [1Q]
##    Quarter  Data
##      <qtr> <dbl>
##  1 1984 Q1 2183.
##  2 1984 Q2 2190.
##  3 1984 Q3 2137.
##  4 1984 Q4 2156.
##  5 1985 Q1 2142.
##  6 1985 Q2 2267.
##  7 1985 Q3 2308.
##  8 1985 Q4 2428.
##  9 1986 Q1 2444.
## 10 1986 Q2 2520.
## # … with 34 more rows
## # ℹ Use `print(n = ...)` to see more rows

#Imported the data for my variable (cleaned most of it in Excel) and converted it into a tsibble.

data_ts %>% 
  model(MEAN(Data)) %>%
  forecast(h = 4) %>%
  autoplot(data_ts) +
  labs(title = "Forecast of Data using MEAN Method")

#Forecasted my data four quarters into the future using the MEAN method.

data_ts %>% 
  model(SNAIVE(Data)) %>%
  forecast(h = 4) %>%
  autoplot(data_ts) +
  labs(title = "Forecast of Data using SNAIVE Method")

#Forecasted my data using the SNAIVE method.

data_ts %>% 
  model(NAIVE(Data)) %>%
  forecast(h = 4) %>%
  autoplot(data_ts) +
  labs(title = "Forecast of Data using NAIVE Method")

#Forecasted my data using the NAIVE method.

data_ts %>% 
  model(RW(Data ~ drift())) %>%
  forecast(h = 4) %>%
  autoplot(data_ts) +
  labs(title = "Forecast of Data using Drift Method")

#Forecasted my data using the Drift Method.

#If we are going just by pure eyeball analysis, the drift method seems like it would be the best forecast for this data. There is no seasonality. The data looks like it could be cyclical, and there are two large trends: an upward trend from 1984 to the early 90s and a large downward trend from the early 90s through 1994. Because the size of the fluctuations does not appear to change with the level of the series (and a Box-Cox transformation addresses changing variance, not seasonality), a transformation is not necessary.
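
#To back up the eyeball judgment that no transformation is needed, the Guerrero
#feature from feasts (loaded via fpp3) suggests a Box-Cox lambda; a lambda
#near 1 would indicate little transformation is warranted (a sketch, added
#for illustration):
data_ts %>%
  features(Data, features = guerrero)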

#All of these forecasts have pretty large prediction intervals, which makes me wonder what the autocorrelation structure of the series looks like.

data_ts %>% ACF(Data, lag_max = 8) %>% autoplot() + labs(title = "ACF Plot of the Data")

#The ACF confirms my suspicions. The series has a significant trend component: the autocorrelations are large and positive at short lags and decline slowly as the lag increases, which is the classic signature of trend. Every autocorrelation spikes outside the blue dashed lines (the ±1.96/√T bounds for a 95% test), so the series is clearly not white noise, and the residuals from a simple benchmark model are unlikely to behave like normally distributed white noise either. Because normal-based prediction intervals are therefore suspect, I should use bootstrapping to produce forecasts of my data.
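
#A formal version of this check is a Ljung-Box test on the drift model's
#innovation residuals (a sketch; ljung_box comes from feasts, loaded via
#fpp3). A small p-value would confirm the residuals are not white noise.
data_ts %>%
  model(RW(Data ~ drift())) %>%
  augment() %>%
  features(.innov, ljung_box, lag = 8)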

fitted <- data_ts %>%
  model(RW(Data ~ drift()))

#Draw a handful of bootstrapped sample paths from the fitted drift model.
simulated <- fitted %>% generate(h = 4, times = 5, bootstrap = TRUE)

#Forecast with prediction intervals built from 5000 bootstrapped paths.
Bootstrapped <- fitted %>% forecast(h = 4, bootstrap = TRUE, times = 5000)

autoplot(Bootstrapped, data_ts) + labs(title = "Drift Method Forecast of my Data using Bootstrapped Values")

#I created a set of bootstrapped sample paths for my data in response to the ACF plot and used them to forecast the data four periods into the future with the drift method. The prediction interval for the forecast is built from 5000 bootstrapped sample paths rather than from a normality assumption.
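
#The bootstrapped prediction intervals can also be read off numerically with
#hilo() from fabletools (a minimal sketch):
Bootstrapped %>% hilo(level = 95)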

trained <- data_ts %>%
  filter(between(year(Quarter), 1984, 1991))


tested <- data_ts %>%
  filter(year(Quarter) >= 1992)

data_trained <- trained %>%
    model(
    Mean = MEAN(Data),
    Naive = NAIVE(Data),
    Seasonal_naive = SNAIVE(Data),
    Drift = RW(Data ~ drift())
  )

data_fc <- data_trained %>%
  forecast(h = 4)

accuracy(data_trained)
## # A tibble: 4 × 10
##   .model         .type           ME  RMSE   MAE     MPE  MAPE  MASE RMSSE  ACF1
##   <chr>          <chr>        <dbl> <dbl> <dbl>   <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Mean           Training  1.71e-13 1136. 1115. -11.7   35.5  2.85  1.75  0.935
## 2 Naive          Training  8.55e+ 1  247.  108.   2.34   2.93 0.275 0.380 0.100
## 3 Seasonal_naive Training  3.85e+ 2  649.  392.   9.99  10.2  1     1     0.758
## 4 Drift          Training -1.91e-13  231.  103.  -0.316  2.91 0.264 0.357 0.100

#I created a training set (1984-1991) and a test set (1992 onward) from my data. The model() call estimates the four forecasting methods (MEAN, NAIVE, SNAIVE, and Drift) on the training set, and this table reports their training-set accuracy. Based on the MAPE (which I am using because it is scale-independent), the drift method fits best.
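
#As a sanity check on the table, the training MAPE can be recomputed by hand
#from the fitted residuals (a sketch; MAPE = mean(|residual / actual|) * 100):
augment(data_trained) %>%
  as_tibble() %>%
  group_by(.model) %>%
  summarise(MAPE = mean(abs(.resid / Data)) * 100)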

accuracy(data_fc, tested)
## # A tibble: 4 × 10
##   .model         .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>          <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 Drift          Test  -295.   555.  300. -7.18  7.26   NaN   NaN -0.0346
## 2 Mean           Test  1205.  1269. 1205. 24.8  24.8    NaN   NaN -0.118 
## 3 Naive          Test   -81.5  406.  301. -2.51  6.90   NaN   NaN -0.118 
## 4 Seasonal_naive Test  -108.   397.  275. -3.02  6.39   NaN   NaN -0.0130

#Interestingly, the test-set accuracy says that SNAIVE is the most accurate model (according to the MAPE score). This doesn't make much sense because there doesn't appear to be any seasonality in the data, and the ACF plot reveals a strong trend component. I don't think this ultimately knocks out the drift method, because the drift method is better at capturing trends in the data. But, for the sake of being thorough, let's try cross validation and compare Drift and SNAIVE.

data_stretched <- data_ts %>%
  stretch_tsibble(.init = 3, .step = 1)

drift_cv <- data_stretched %>%
  model(RW(Data ~ drift()))

SNAIVE_cv <- data_stretched %>%
  model(SNAIVE(Data))


drift_cv_fc <- drift_cv %>%
  forecast(h=4)

SNAIVE_cv_fc <- SNAIVE_cv %>%
  forecast(h=4)

drift_cv_fc %>% accuracy(data_ts)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 4 observations are missing between 1995 Q1 and 1995 Q4
## # A tibble: 1 × 10
##   .model             .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>              <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 RW(Data ~ drift()) Test  -84.2  517.  328. -1.77  8.37 0.789 0.815 0.630

#This shows the accuracy measures using cross validation with the drift model.

SNAIVE_cv_fc %>% accuracy(data_ts)
## Warning: The future dataset is incomplete, incomplete out-of-sample data will be treated as missing. 
## 4 observations are missing between 1995 Q1 and 1995 Q4
## # A tibble: 1 × 10
##   .model       .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE  ACF1
##   <chr>        <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 SNAIVE(Data) Test   147.  640.  423.  3.69  11.0  1.02  1.01 0.699

#This shows the accuracy measures using cross validation with the SNAIVE model.

#I created two separate accuracy measures for the Drift and SNAIVE models using cross validation in order to settle the discrepancy discovered previously. As expected, the drift method has the lower MAPE score under cross validation, meaning it produces the more accurate point forecasts.
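
#Cross-validated accuracy can also be broken down by forecast horizon, using
#the standard fpp3 idiom of numbering each origin's forecasts (a sketch):
drift_cv_fc %>%
  group_by(.id) %>%
  mutate(h = row_number()) %>%
  ungroup() %>%
  as_fable(response = "Data", distribution = Data) %>%
  accuracy(data_ts, by = c("h", ".model"))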

#Based on my initial eyeball analysis, the drift method appeared to be the best. The ACF plot revealed a strong trend, meaning the series is not white noise; to guard against non-normal forecast errors, I produced a bootstrapped forecast, still using the drift method. The drift method also fit the training set best, and although the simple test-set comparison favored SNAIVE, cross validation showed that the drift method produces the most accurate forecasts. Because of all of this, I believe that, for my data, a bootstrapped forecast using the drift method is the best.