Take Home Exam

Load packages and data

library(ggplot2)
library(tidyverse)
library(tidyquant)
library(forecast)
library(dplyr)
library(feasts)
library(fpp3)
library(seasonal)
library(fable)
library(readxl)
library(GGally)

# Data

Table1 <- read_excel("Table1.xlsx")
M3_exam1 <- read_excel("C:/Users/14047/Downloads/M3_exam1.xlsx")

Exercise 1

Question 1

# 1.

# Since the data occurs only once per year, we can 
# easily change it into a tsibble with Year as the index.

tsib <- Table1 %>% 
  as_tsibble(index = Year)
tsib

## # A tsibble: 23 x 5 [1Y]
##     Year   GDP Personal_income Personal_cons_exp Total_emp
##    <dbl> <dbl>           <dbl>             <dbl>     <dbl>
##  1  1998   8.2             7.4               7.5       2.3
##  2  1999   9.3            10.2               8.8       2  
##  3  2000   1.4             3.4               5         0.6
##  4  2001   3.1             1.4               4.3       0.1
##  5  2002   5.6             4.5               5.5       0.6
##  6  2003   6               6.1               6.8       1.6
##  7  2004   7               5.8               7         1.7
##  8  2005   6.7             7.8               6.4       1.9
##  9  2006   4.8             4.6               6         1.9
## 10  2007   2.4             1.4               3.1      -1.1
## # … with 13 more rows
## # ℹ Use `print(n = ...)` to see more rows

Question 2

# 2. 

# Using my hint and transforming my tsibble into a tibble 
# before I plot it.

tib <- tsib %>% 
  as_tibble()
tib

## # A tibble: 23 × 5
##     Year   GDP Personal_income Personal_cons_exp Total_emp
##    <dbl> <dbl>           <dbl>             <dbl>     <dbl>
##  1  1998   8.2             7.4               7.5       2.3
##  2  1999   9.3            10.2               8.8       2  
##  3  2000   1.4             3.4               5         0.6
##  4  2001   3.1             1.4               4.3       0.1
##  5  2002   5.6             4.5               5.5       0.6
##  6  2003   6               6.1               6.8       1.6
##  7  2004   7               5.8               7         1.7
##  8  2005   6.7             7.8               6.4       1.9
##  9  2006   4.8             4.6               6         1.9
## 10  2007   2.4             1.4               3.1      -1.1
## # … with 13 more rows
## # ℹ Use `print(n = ...)` to see more rows

# Now, I am plotting the tibble using ggpairs

ggpairs(tib, columns = 2:5)

# The plot shows the scatterplot of each variable against one
# another to show their relationship with one another. Along with
# this, the correlation between the variables align across the
# plots.

Question 3

# 3.

# Again, each variable is plotted against one another. GDP and 
# personal_income have a positive, linear moderately strong relationship 
# with a correlation of 0.618. GDP has a very strong positive, linear 
# relationship with a correlation of 0.923. GDP and Total employment have 
# a positive, moderately strong relationship (correlation of 0.771). 
# Personal income and personal consumption expenditures have a positive 
# moderately strong, linear relationship with few outliers (correlation of 
# 0.487). Personal income and total employment have a positive, moderately 
# strong relationship (correlation of 0.413). Personal consumption 
# expenditures have a strong positive, linear relationship (correlation 0.754).

Question 4

# 4. 

# Since every variable has a linear relationship, I am going to 
# use the TSLM model to explain the change in personal consumption 
# expenditures of a function personal income, GDP, and total employment.

fit_tsib <- tsib %>% 
  model(lm = TSLM(Personal_cons_exp ~ Personal_income + GDP + Total_emp))

# Now, using the report function to view the results 

report(fit_tsib)

## Series: Personal_cons_exp 
## Model: TSLM 
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.8341 -0.6512  0.0199  0.7086  3.5230 
## 
## Coefficients:
##                 Estimate Std. Error t value Pr(>|t|)    
## (Intercept)       0.4718     0.6429   0.734    0.472    
## Personal_income  -0.1450     0.1228  -1.181    0.252    
## GDP               1.0125     0.1666   6.077 7.62e-06 ***
## Total_emp         0.1345     0.2106   0.639    0.531    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.31 on 19 degrees of freedom
## Multiple R-squared: 0.866,   Adjusted R-squared: 0.8448
## F-statistic: 40.92 on 3 and 19 DF, p-value: 1.7295e-08

Question 5

# 5. 

# Intercept estimate: When Personal income, GDP, and total employment equal 
# zero, personal consumption expenditures equals 0.4718 which is not statistically 
# significant at the 5 percent level since the t value is less than 1.96.

# The estimate of the coefficient for personal income means that as it increases 
# by one unit, consumption goes down by -0.1450 units. It is not statistically 
# significant at the 5 percent level. 

# The estimate of the coefficient for GDP indicates that as GDP 
# increases by one unit, consumption goes up by 1.0125 units. It 
# happens to be statistically significant at the 5 percent level.

# The estimate of the coefficient for Total employment indicates that 
# as total employment goes up by one unit, consumption goes up 
# by 0.1345 units. This is not statistically significant at the 5 
# percent level.

Question 6

# 6. 

# I am plotting the fitted line of the regression equation to the actual data 
# values below:

augment(fit_tsib) %>% 
  ggplot(aes(x = Year)) +
  geom_line(aes(y = Personal_cons_exp, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  labs(title = "Personal Consumption Expenditures Yearly", y = "Percentages") +
  scale_colour_manual(values = c(Data = "black", Fitted = "red"))

Question 7

# 7. 

# I am now forecasting consumption for the next three years with 
# two separate scenarios 

# a. a constant growth 

# With a constant growth of 1% in all variables:

future_scenarios <- scenarios(
  Increase = new_data(tsib, 3) %>% 
    mutate(Personal_income = 1, GDP = 1, Total_emp = 1),
  names_to = "Scenario1"
)

# Now, I am forecasting the first scenario 

fc <- forecast(fit_tsib, new_data = future_scenarios)

tsib %>% 
  autoplot(Personal_cons_exp) +
  autolayer(fc)

# b. 

# My scenario 2 has income change by 1, -0.5, and 0.8 percent

future_scen <- scenarios(
  Increase = new_data(tsib, 3) %>% 
    mutate(Personal_income = 1, GDP = 0, Total_emp = 0),
  Decrease = new_data(tsib, 3) %>% 
    mutate(Personal_income = -0.5, GDP = 0, Total_emp = 0),
  Increase1 = new_data(tsib, 3) %>% 
    mutate(Personal_income = 0.8, GDP = 0, Total_emp = 0),
  names_to = "Scenario2"
) 


fc2 <- forecast(fit_tsib, new_data = future_scen)

tsib %>% 
  autoplot(Personal_cons_exp) +
  autolayer(fc2) +
  labs(y = "Consumption (percentages)")

Question 8

# 8. 

# Scenario one has a pretty low forecast point compared to the 
# last observation, Thus, it seems pretty questionable. However, it seems
# to fit when we take into account the overall historical data. 
# The prediction intervals seem not to be too precise which is good.

# Scenario two includes the forecast of consumption when income has different
# percentage growth rates. Since the coefficient of income is negative, it makes
# sense that the forecast of consumption decreases as the growth of income percentages
# increases. Again, the forecast point is low relative to the last observation of the 
# data, however, seems to fit the overall historical data pretty well.

Exercise 2

Data Preparation

# I am going to use the 5 steps of forecasting a model 

# 1. Data Preparation (tidy)


Data <- M3_exam1 %>% 
  filter(`Student Name` == "Summer") %>% 
  pivot_longer(cols = 8:43, names_to = "Values", values_to = "count") %>% 
  select(-c(8:43)) %>% 
  select(-Values) %>% 
  mutate(Macro_Values = count) %>% 
  select(-count) %>% 
  na.omit()
View(Data)

# It took me awhile to tidy the data, but it looks pretty nice now.
# I pivoted my data and omitted the NA values I had. Now, I am going
# to add the rest of the quarters following the initial. I am now adding
# the rest of the years and quarters for my data and selecting the 
# certain rows:

Data1 <- Data %>% 
  mutate(Quarter = yearquarter(68:91)) %>% 
  select(Quarter, Macro_Values) %>% 
  as_tsibble(index = Quarter)
Data1

## # A tsibble: 24 x 2 [1Q]
##    Quarter Macro_Values
##      <qtr>        <dbl>
##  1 1987 Q1         3095
##  2 1987 Q2         3276
##  3 1987 Q3         3051
##  4 1987 Q4         3427
##  5 1988 Q1         3380
##  6 1988 Q2         3327
##  7 1988 Q3         3143
##  8 1988 Q4         3577
##  9 1989 Q1         3441
## 10 1989 Q2         3557
## # … with 14 more rows
## # ℹ Use `print(n = ...)` to see more rows

View(Data1)

Data Visualisation

# 2. Plot the Data (visualize)

ggplot(Data1, aes(x = Quarter, y = Macro_Values)) +
  geom_line()

# The data seems to follow a seasonal pattern and a 
# positive trend. 

# Now, I am decomposing my data to see that different 
# components within my data using the STL decomposition.


Data1 %>% 
  model(STL(Macro_Values ~ season() + trend(), robust = TRUE)) %>% 
  components() %>% 
  autoplot()

# Like I said before, I have an increasing trend and yearly seasonality.

Define a Model

# 3. Define a model (specify)


# I hypothesize that my most accurate model will be the 
# snaive model since my plot has seasonality. However, I am 
# going to employ a couple of methods based on what I think 
# would be most appropriate.

Train the Model (estimate)

# 4. Train the model (estimate)

recent <- Data1 %>% 
  filter(year(Quarter) >= 1987)
recent

## # A tsibble: 24 x 2 [1Q]
##    Quarter Macro_Values
##      <qtr>        <dbl>
##  1 1987 Q1         3095
##  2 1987 Q2         3276
##  3 1987 Q3         3051
##  4 1987 Q4         3427
##  5 1988 Q1         3380
##  6 1988 Q2         3327
##  7 1988 Q3         3143
##  8 1988 Q4         3577
##  9 1989 Q1         3441
## 10 1989 Q2         3557
## # … with 14 more rows
## # ℹ Use `print(n = ...)` to see more rows

# Test Data is about 20% of my data

test <- Data1 %>% 
  filter(year(Quarter) > 1990)

# My train data is about 80% of my data 

train <- Data1 %>% 
  filter(year(Quarter) <= 1990)
train

## # A tsibble: 16 x 2 [1Q]
##    Quarter Macro_Values
##      <qtr>        <dbl>
##  1 1987 Q1         3095
##  2 1987 Q2         3276
##  3 1987 Q3         3051
##  4 1987 Q4         3427
##  5 1988 Q1         3380
##  6 1988 Q2         3327
##  7 1988 Q3         3143
##  8 1988 Q4         3577
##  9 1989 Q1         3441
## 10 1989 Q2         3557
## 11 1989 Q3         3327
## 12 1989 Q4         3831
## 13 1990 Q1         3766
## 14 1990 Q2         3742
## 15 1990 Q3         3664
## 16 1990 Q4         4025

# Now I am fitting my data to my training data 

macro_fit <- train %>% 
  model(mean = MEAN(Macro_Values),
        naive = NAIVE(Macro_Values),
        snaive = SNAIVE(Macro_Values),
        drift = RW(Macro_Values ~ drift()))
macro_fit

## # A mable: 1 x 4
##      mean   naive   snaive         drift
##   <model> <model>  <model>       <model>
## 1  <MEAN> <NAIVE> <SNAIVE> <RW w/ drift>

# Now, I am forecasting the four models above 
# to plot them below to graphically observe them.

macro_fc <- macro_fit %>% 
  forecast(h = 8) %>% 
  autoplot(recent)
macro_fc

# And now I am doing the accuracy to see which of these
# four models is best (#4 Checking model performance):

macro_fc1 <- macro_fit %>% 
  forecast(h = 8)

accuracy(macro_fc1, recent)

## # A tibble: 4 × 10
##   .model .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>  <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 drift  Test  -330.   370.  330. -8.50  8.50 1.69  1.71  -0.542
## 2 mean   Test   497.   537.  497. 12.3  12.3  2.54  2.48  -0.233
## 3 naive  Test   -51.5  211.  182. -1.57  4.67 0.931 0.976 -0.233
## 4 snaive Test   174.   222.  182.  4.26  4.46 0.928 1.03   0.193

# It shows that the naive model has the lowest RMSE, so 
# it is my most accurate model thus far. 



# I am now going to employ a few other techniques that I 
# think are applicable to my data. 

# I am now forecasting with Regression with trend and season 
# as my special functions. (7.6 from book)

fit2 <- train %>% 
  model(TSLM(Macro_Values ~ trend() + season()))

fc2 <- fit2 %>% 
  forecast(h = 8) %>% 
  autoplot(recent)
fc2

# This model looks very accurate. I am now going to check the 
# accuracy of my regression model. (#4 checking model performance) 

fc02 <- fit2 %>% 
  forecast(h = 8)

accuracy(fc02, recent)

## # A tibble: 1 × 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 TSLM(Macro_Values ~ tr… Test  -86.3  104.  95.4 -2.22  2.44 0.487 0.480 -0.292

# My RMSE for this model is much lower than for my naive! I am 
# thinking that this is the one, but I am wanting to see what 
# seasonal dummies with trend can do as well. 

# I have quarterly data so:

t <- nrow(train)
trend <- seq(1:t)

recent2 <- train %>% 
  mutate(q2 = ifelse(quarter(Quarter) == 2,1,0)) %>% 
  mutate(q3 = ifelse(quarter(Quarter) == 3,1,0)) %>% 
  mutate(q4 = ifelse(quarter(Quarter) == 4,1,0)) %>% 
  mutate(td = trend)
recent2

## # A tsibble: 16 x 6 [1Q]
##    Quarter Macro_Values    q2    q3    q4    td
##      <qtr>        <dbl> <dbl> <dbl> <dbl> <int>
##  1 1987 Q1         3095     0     0     0     1
##  2 1987 Q2         3276     1     0     0     2
##  3 1987 Q3         3051     0     1     0     3
##  4 1987 Q4         3427     0     0     1     4
##  5 1988 Q1         3380     0     0     0     5
##  6 1988 Q2         3327     1     0     0     6
##  7 1988 Q3         3143     0     1     0     7
##  8 1988 Q4         3577     0     0     1     8
##  9 1989 Q1         3441     0     0     0     9
## 10 1989 Q2         3557     1     0     0    10
## 11 1989 Q3         3327     0     1     0    11
## 12 1989 Q4         3831     0     0     1    12
## 13 1990 Q1         3766     0     0     0    13
## 14 1990 Q2         3742     1     0     0    14
## 15 1990 Q3         3664     0     1     0    15
## 16 1990 Q4         4025     0     0     1    16

# Now I am modeling the data

fit3 <- recent2 %>% 
  model(TSLM(Macro_Values ~ q2 + q3 + q4 + td))

# Now, I am using my training data set to forecast the model 
# and check the accuracy.

augment(fit3) %>% 
  ggplot(aes(x = Quarter)) +
  geom_line(aes(y = Macro_Values, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  scale_color_manual(values = c(Data = "black", Fitted = "red"))

# The fitted values seems to fit the actual values pretty well. 

# I couldn't forecast the seasonal dummies for some odd reason, but 
# forecasting with fourier terms with k = m/2 = 4/2 = 2 will be the 
# exact same forecast as the seasonal dummies. 

fourier_macro <- train %>% 
  model(TSLM(Macro_Values ~ trend() + fourier(K = 2)))

# Now I am checking to see if the fitted values are the same as the 
# seasonal dummies. 

augment(fourier_macro) %>% 
  ggplot(aes(x = Quarter)) +
  geom_line(aes(y = Macro_Values, color = "Data")) +
  geom_line(aes(y = .fitted, color = "Fitted")) +
  scale_color_manual(values = c(Data = "black", Fitted = "red"))

# They are! So now, I am forecasting it.

fc4 <- fourier_macro %>% 
  forecast(h = 8) %>% 
  autoplot(recent)
fc4

# The forecast seems pretty accurate to the data, let me check
# the accuracy (#4 checking Model Performance)

fc5 <- fourier_macro %>% 
  forecast(h = 8)

accuracy(fc5, recent)

## # A tibble: 1 × 10
##   .model                  .type    ME  RMSE   MAE   MPE  MAPE  MASE RMSSE   ACF1
##   <chr>                   <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>  <dbl>
## 1 TSLM(Macro_Values ~ tr… Test  -86.3  104.  95.4 -2.22  2.44 0.487 0.480 -0.292

# The RMSE is exactly the same as the TSLM model with 
# season and trend as my special functions.

Produce the Forecast

# 5. Produce Forecasts  


# My Chosen forecast is the Regression forecast of the TSLM model with 
# special functions trend and season since it has the lowest RMSE. Now, I 
# am going to check the residuals by using certain diagnostic 
# checks to make sure there is no autocorrelation, my mean is centered aroud
# zero, I have constant variance, and I have a normal distribution all 
# for my residuals. This is to see if all of the valuable information used for 
# my forecast does not remain in the residuals and to see if my PI will be 
# easily calculated. Since there are no possible predictors within my given data,
# I can perform any techniques to pick and choose with respect to AIC, CV, and
# AIC corrected. 

# Using my forecast from before, I am finding my fitted values to then plot them 
# against my actual values (just to observe):

fitt <- augment(fit2)
View(fitt)


# My residuals are equal to my innovative residuals since I have not 
# used any sort of transformation for my model. 

ggplot(fitt, aes(x = Quarter, y = .resid)) +
  geom_line()

# From the graph, my variance remains to be within the (-100, 100) range.
# However, there seems to be some type of pattern. 

Data1 %>% 
  model(TSLM(Macro_Values ~ trend() + season())) %>% 
  gg_tsresiduals()

# To make sure there is no autocorrelation within my model, I am going 
# to employ the ljung-box test. Since I have seasonal data, I am going to 
# use lag = 2m = 2(1) = 2 and dof = 0.

fitt %>% 
  features(.innov, ljung_box, lag = 2, dof = 0)

## # A tibble: 1 × 3
##   .model                                  lb_stat lb_pvalue
##   <chr>                                     <dbl>     <dbl>
## 1 TSLM(Macro_Values ~ trend() + season())    1.51     0.471

# Since my p value > 0.05, it is not significant! And so, my residuals 
# are indistinguishable from a white noise series.


# There is not autocorrelation within my data and the mean seems as 
# if it could be zero, meaning, all of my information has been 
# extrapolated from my residuals. However, the variance is not constant and 
# it is not normally distributed. Thus, I am going to use a transformation. 

# I am using Box-Cox technique. First, I am finding my lambda to then choose 
# my appropriate transformation. 

Data1 %>% 
  features(Macro_Values, features = guerrero)

## # A tibble: 1 × 1
##   lambda_guerrero
##             <dbl>
## 1           0.591

Data1 %>% 
  autoplot(box_cox(Macro_Values, 0.591))

# The variance doesn't seem anymore constant than it was 
# before, but I will follow through with my diagnostics. 

# Since my lambda is equal close to 1/2, I am going to 
# transform it into a square root plus linear transformation.

DData <- Data1 %>% 
  mutate(Macro_Values = (Macro_Values)^(1/2))

# To now fitting the transformed model 


fit_DData <- DData %>% 
  model(TSLM(Macro_Values ~ trend() + season()))

aug_data <- augment(fit_DData)
View(aug_data)

# After finding my residuals, I've plotted the values to see that the
# transformation has actually caused my variance to increase. 

ggplot(aug_data, aes(x = Quarter, y = .innov)) +
  geom_line()

# Therefore, I am going to keep my original data. 

# Since I have compromised with my original data, the 
# assumption of my residuals being normally distributed is 
# unreasonable. Therefore, I am going to forecast my model 
# with prediction intervals from bootstrapped residuals with 
# 80% and 95% PI levels. 

Forecast <- Data1 %>% 
  model(TSLM(Macro_Values ~ trend() + season())) %>% 
  forecast(h = 4, bootstrap = TRUE, times = 1000) %>% 
  autoplot(Data1) +
  labs(title = "Forecast of Macro values for the next year")
Forecast

# My forecast PI seems to be a bit too accurate; however, I trust the 
# bootstrap residuals to produce 80 and 90 percent confidence intervals.
# After many diagnostics tests and accuracy performances, I have finally 
# forecast my data.

Take Home Exam

Summer Ligon

2022-10-21

Load packages and data

Exercise 1

Question 1

Question 2

Question 3

Question 4

Question 5

Question 6

Question 7

Question 8

Exercise 2

Data Preparation

Data Visualisation

Define a Model

Train the Model (estimate)

Produce the Forecast