Exercise 1
Question 1
# 1.
# Since the data occurs only once per year, we can
# easily change it into a tsibble with Year as the index.
tsib <- Table1 %>%
as_tsibble(index = Year)
tsib
## # A tsibble: 23 x 5 [1Y]
## Year GDP Personal_income Personal_cons_exp Total_emp
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1998 8.2 7.4 7.5 2.3
## 2 1999 9.3 10.2 8.8 2
## 3 2000 1.4 3.4 5 0.6
## 4 2001 3.1 1.4 4.3 0.1
## 5 2002 5.6 4.5 5.5 0.6
## 6 2003 6 6.1 6.8 1.6
## 7 2004 7 5.8 7 1.7
## 8 2005 6.7 7.8 6.4 1.9
## 9 2006 4.8 4.6 6 1.9
## 10 2007 2.4 1.4 3.1 -1.1
## # … with 13 more rows
## # ℹ Use `print(n = ...)` to see more rows
Question 2
# 2.
# Using my hint and transforming my tsibble into a tibble
# before I plot it.
tib <- tsib %>%
as_tibble()
tib
## # A tibble: 23 × 5
## Year GDP Personal_income Personal_cons_exp Total_emp
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1998 8.2 7.4 7.5 2.3
## 2 1999 9.3 10.2 8.8 2
## 3 2000 1.4 3.4 5 0.6
## 4 2001 3.1 1.4 4.3 0.1
## 5 2002 5.6 4.5 5.5 0.6
## 6 2003 6 6.1 6.8 1.6
## 7 2004 7 5.8 7 1.7
## 8 2005 6.7 7.8 6.4 1.9
## 9 2006 4.8 4.6 6 1.9
## 10 2007 2.4 1.4 3.1 -1.1
## # … with 13 more rows
## # ℹ Use `print(n = ...)` to see more rows
# Now, I am plotting the tibble using ggpairs
ggpairs(tib, columns = 2:5)

# The plot shows the scatterplot of each variable against one
# another to show their relationship with one another. Along with
# this, the correlation between the variables align across the
# plots.
Question 3
# 3.
# Again, each variable is plotted against one another. GDP and
# personal_income have a positive, linear moderately strong relationship
# with a correlation of 0.618. GDP has a very strong positive, linear
# relationship with a correlation of 0.923. GDP and Total employment have
# a positive, moderately strong relationship (correlation of 0.771).
# Personal income and personal consumption expenditures have a positive
# moderately strong, linear relationship with few outliers (correlation of
# 0.487). Personal income and total employment have a positive, moderately
# strong relationship (correlation of 0.413). Personal consumption
# expenditures have a strong positive, linear relationship (correlation 0.754).
Question 4
# 4.
# Since every variable has a linear relationship, I am going to
# use the TSLM model to explain the change in personal consumption
# expenditures of a function personal income, GDP, and total employment.
fit_tsib <- tsib %>%
model(lm = TSLM(Personal_cons_exp ~ Personal_income + GDP + Total_emp))
# Now, using the report function to view the results
report(fit_tsib)
## Series: Personal_cons_exp
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.8341 -0.6512 0.0199 0.7086 3.5230
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.4718 0.6429 0.734 0.472
## Personal_income -0.1450 0.1228 -1.181 0.252
## GDP 1.0125 0.1666 6.077 7.62e-06 ***
## Total_emp 0.1345 0.2106 0.639 0.531
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.31 on 19 degrees of freedom
## Multiple R-squared: 0.866, Adjusted R-squared: 0.8448
## F-statistic: 40.92 on 3 and 19 DF, p-value: 1.7295e-08
Question 5
# 5.
# Intercept estimate: When Personal income, GDP, and total employment equal
# zero, personal consumption expenditures equals 0.4718 which is not statistically
# significant at the 5 percent level since the t value is less than 1.96.
# The estimate of the coefficient for personal income means that as it increases
# by one unit, consumption goes down by -0.1450 units. It is not statistically
# significant at the 5 percent level.
# The estimate of the coefficient for GDP indicates that as GDP
# increases by one unit, consumption goes up by 1.0125 units. It
# happens to be statistically significant at the 5 percent level.
# The estimate of the coefficient for Total employment indicates that
# as total employment goes up by one unit, consumption goes up
# by 0.1345 units. This is not statistically significant at the 5
# percent level.
Question 6
# 6.
# I am plotting the fitted line of the regression equation to the actual data
# values below:
augment(fit_tsib) %>%
ggplot(aes(x = Year)) +
geom_line(aes(y = Personal_cons_exp, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(title = "Personal Consumption Expenditures Yearly", y = "Percentages") +
scale_colour_manual(values = c(Data = "black", Fitted = "red"))

Question 7
# 7.
# I am now forecasting consumption for the next three years with
# two separate scenarios
# a. a constant growth
# With a constant growth of 1% in all variables:
future_scenarios <- scenarios(
Increase = new_data(tsib, 3) %>%
mutate(Personal_income = 1, GDP = 1, Total_emp = 1),
names_to = "Scenario1"
)
# Now, I am forecasting the first scenario
fc <- forecast(fit_tsib, new_data = future_scenarios)
tsib %>%
autoplot(Personal_cons_exp) +
autolayer(fc)

# b.
# My scenario 2 has income change by 1, -0.5, and 0.8 percent
future_scen <- scenarios(
Increase = new_data(tsib, 3) %>%
mutate(Personal_income = 1, GDP = 0, Total_emp = 0),
Decrease = new_data(tsib, 3) %>%
mutate(Personal_income = -0.5, GDP = 0, Total_emp = 0),
Increase1 = new_data(tsib, 3) %>%
mutate(Personal_income = 0.8, GDP = 0, Total_emp = 0),
names_to = "Scenario2"
)
fc2 <- forecast(fit_tsib, new_data = future_scen)
tsib %>%
autoplot(Personal_cons_exp) +
autolayer(fc2) +
labs(y = "Consumption (percentages)")

Question 8
# 8.
# Scenario one has a pretty low forecast point compared to the
# last observation, Thus, it seems pretty questionable. However, it seems
# to fit when we take into account the overall historical data.
# The prediction intervals seem not to be too precise which is good.
# Scenario two includes the forecast of consumption when income has different
# percentage growth rates. Since the coefficient of income is negative, it makes
# sense that the forecast of consumption decreases as the growth of income percentages
# increases. Again, the forecast point is low relative to the last observation of the
# data, however, seems to fit the overall historical data pretty well.
Exercise 2
Data Preparation
# I am going to use the 5 steps of forecasting a model
# 1. Data Preparation (tidy)
Data <- M3_exam1 %>%
filter(`Student Name` == "Summer") %>%
pivot_longer(cols = 8:43, names_to = "Values", values_to = "count") %>%
select(-c(8:43)) %>%
select(-Values) %>%
mutate(Macro_Values = count) %>%
select(-count) %>%
na.omit()
View(Data)
# It took me awhile to tidy the data, but it looks pretty nice now.
# I pivoted my data and omitted the NA values I had. Now, I am going
# to add the rest of the quarters following the initial. I am now adding
# the rest of the years and quarters for my data and selecting the
# certain rows:
Data1 <- Data %>%
mutate(Quarter = yearquarter(68:91)) %>%
select(Quarter, Macro_Values) %>%
as_tsibble(index = Quarter)
Data1
## # A tsibble: 24 x 2 [1Q]
## Quarter Macro_Values
## <qtr> <dbl>
## 1 1987 Q1 3095
## 2 1987 Q2 3276
## 3 1987 Q3 3051
## 4 1987 Q4 3427
## 5 1988 Q1 3380
## 6 1988 Q2 3327
## 7 1988 Q3 3143
## 8 1988 Q4 3577
## 9 1989 Q1 3441
## 10 1989 Q2 3557
## # … with 14 more rows
## # ℹ Use `print(n = ...)` to see more rows
View(Data1)
Data Visualisation
# 2. Plot the Data (visualize)
ggplot(Data1, aes(x = Quarter, y = Macro_Values)) +
geom_line()

# The data seems to follow a seasonal pattern and a
# positive trend.
# Now, I am decomposing my data to see that different
# components within my data using the STL decomposition.
Data1 %>%
model(STL(Macro_Values ~ season() + trend(), robust = TRUE)) %>%
components() %>%
autoplot()

# Like I said before, I have an increasing trend and yearly seasonality.
Define a Model
# 3. Define a model (specify)
# I hypothesize that my most accurate model will be the
# snaive model since my plot has seasonality. However, I am
# going to employ a couple of methods based on what I think
# would be most appropriate.
Train the Model (estimate)
# 4. Train the model (estimate)
recent <- Data1 %>%
filter(year(Quarter) >= 1987)
recent
## # A tsibble: 24 x 2 [1Q]
## Quarter Macro_Values
## <qtr> <dbl>
## 1 1987 Q1 3095
## 2 1987 Q2 3276
## 3 1987 Q3 3051
## 4 1987 Q4 3427
## 5 1988 Q1 3380
## 6 1988 Q2 3327
## 7 1988 Q3 3143
## 8 1988 Q4 3577
## 9 1989 Q1 3441
## 10 1989 Q2 3557
## # … with 14 more rows
## # ℹ Use `print(n = ...)` to see more rows
# Test Data is about 20% of my data
test <- Data1 %>%
filter(year(Quarter) > 1990)
# My train data is about 80% of my data
train <- Data1 %>%
filter(year(Quarter) <= 1990)
train
## # A tsibble: 16 x 2 [1Q]
## Quarter Macro_Values
## <qtr> <dbl>
## 1 1987 Q1 3095
## 2 1987 Q2 3276
## 3 1987 Q3 3051
## 4 1987 Q4 3427
## 5 1988 Q1 3380
## 6 1988 Q2 3327
## 7 1988 Q3 3143
## 8 1988 Q4 3577
## 9 1989 Q1 3441
## 10 1989 Q2 3557
## 11 1989 Q3 3327
## 12 1989 Q4 3831
## 13 1990 Q1 3766
## 14 1990 Q2 3742
## 15 1990 Q3 3664
## 16 1990 Q4 4025
# Now I am fitting my data to my training data
macro_fit <- train %>%
model(mean = MEAN(Macro_Values),
naive = NAIVE(Macro_Values),
snaive = SNAIVE(Macro_Values),
drift = RW(Macro_Values ~ drift()))
macro_fit
## # A mable: 1 x 4
## mean naive snaive drift
## <model> <model> <model> <model>
## 1 <MEAN> <NAIVE> <SNAIVE> <RW w/ drift>
# Now, I am forecasting the four models above
# to plot them below to graphically observe them.
macro_fc <- macro_fit %>%
forecast(h = 8) %>%
autoplot(recent)
macro_fc

# And now I am doing the accuracy to see which of these
# four models is best (#4 Checking model performance):
macro_fc1 <- macro_fit %>%
forecast(h = 8)
accuracy(macro_fc1, recent)
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 drift Test -330. 370. 330. -8.50 8.50 1.69 1.71 -0.542
## 2 mean Test 497. 537. 497. 12.3 12.3 2.54 2.48 -0.233
## 3 naive Test -51.5 211. 182. -1.57 4.67 0.931 0.976 -0.233
## 4 snaive Test 174. 222. 182. 4.26 4.46 0.928 1.03 0.193
# It shows that the naive model has the lowest RMSE, so
# it is my most accurate model thus far.
# I am now going to employ a few other techniques that I
# think are applicable to my data.
# I am now forecasting with Regression with trend and season
# as my special functions. (7.6 from book)
fit2 <- train %>%
model(TSLM(Macro_Values ~ trend() + season()))
fc2 <- fit2 %>%
forecast(h = 8) %>%
autoplot(recent)
fc2

# This model looks very accurate. I am now going to check the
# accuracy of my regression model. (#4 checking model performance)
fc02 <- fit2 %>%
forecast(h = 8)
accuracy(fc02, recent)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSLM(Macro_Values ~ tr… Test -86.3 104. 95.4 -2.22 2.44 0.487 0.480 -0.292
# My RMSE for this model is much lower than for my naive! I am
# thinking that this is the one, but I am wanting to see what
# seasonal dummies with trend can do as well.
# I have quarterly data so:
t <- nrow(train)
trend <- seq(1:t)
recent2 <- train %>%
mutate(q2 = ifelse(quarter(Quarter) == 2,1,0)) %>%
mutate(q3 = ifelse(quarter(Quarter) == 3,1,0)) %>%
mutate(q4 = ifelse(quarter(Quarter) == 4,1,0)) %>%
mutate(td = trend)
recent2
## # A tsibble: 16 x 6 [1Q]
## Quarter Macro_Values q2 q3 q4 td
## <qtr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1987 Q1 3095 0 0 0 1
## 2 1987 Q2 3276 1 0 0 2
## 3 1987 Q3 3051 0 1 0 3
## 4 1987 Q4 3427 0 0 1 4
## 5 1988 Q1 3380 0 0 0 5
## 6 1988 Q2 3327 1 0 0 6
## 7 1988 Q3 3143 0 1 0 7
## 8 1988 Q4 3577 0 0 1 8
## 9 1989 Q1 3441 0 0 0 9
## 10 1989 Q2 3557 1 0 0 10
## 11 1989 Q3 3327 0 1 0 11
## 12 1989 Q4 3831 0 0 1 12
## 13 1990 Q1 3766 0 0 0 13
## 14 1990 Q2 3742 1 0 0 14
## 15 1990 Q3 3664 0 1 0 15
## 16 1990 Q4 4025 0 0 1 16
# Now I am modeling the data
fit3 <- recent2 %>%
model(TSLM(Macro_Values ~ q2 + q3 + q4 + td))
# Now, I am using my training data set to forecast the model
# and check the accuracy.
augment(fit3) %>%
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Macro_Values, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
scale_color_manual(values = c(Data = "black", Fitted = "red"))

# The fitted values seems to fit the actual values pretty well.
# I couldn't forecast the seasonal dummies for some odd reason, but
# forecasting with fourier terms with k = m/2 = 4/2 = 2 will be the
# exact same forecast as the seasonal dummies.
fourier_macro <- train %>%
model(TSLM(Macro_Values ~ trend() + fourier(K = 2)))
# Now I am checking to see if the fitted values are the same as the
# seasonal dummies.
augment(fourier_macro) %>%
ggplot(aes(x = Quarter)) +
geom_line(aes(y = Macro_Values, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
scale_color_manual(values = c(Data = "black", Fitted = "red"))

# They are! So now, I am forecasting it.
fc4 <- fourier_macro %>%
forecast(h = 8) %>%
autoplot(recent)
fc4

# The forecast seems pretty accurate to the data, let me check
# the accuracy (#4 checking Model Performance)
fc5 <- fourier_macro %>%
forecast(h = 8)
accuracy(fc5, recent)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSLM(Macro_Values ~ tr… Test -86.3 104. 95.4 -2.22 2.44 0.487 0.480 -0.292
# The RMSE is exactly the same as the TSLM model with
# season and trend as my special functions.
Produce the Forecast
# 5. Produce Forecasts
# My Chosen forecast is the Regression forecast of the TSLM model with
# special functions trend and season since it has the lowest RMSE. Now, I
# am going to check the residuals by using certain diagnostic
# checks to make sure there is no autocorrelation, my mean is centered aroud
# zero, I have constant variance, and I have a normal distribution all
# for my residuals. This is to see if all of the valuable information used for
# my forecast does not remain in the residuals and to see if my PI will be
# easily calculated. Since there are no possible predictors within my given data,
# I can perform any techniques to pick and choose with respect to AIC, CV, and
# AIC corrected.
# Using my forecast from before, I am finding my fitted values to then plot them
# against my actual values (just to observe):
fitt <- augment(fit2)
View(fitt)
# My residuals are equal to my innovative residuals since I have not
# used any sort of transformation for my model.
ggplot(fitt, aes(x = Quarter, y = .resid)) +
geom_line()

# From the graph, my variance remains to be within the (-100, 100) range.
# However, there seems to be some type of pattern.
Data1 %>%
model(TSLM(Macro_Values ~ trend() + season())) %>%
gg_tsresiduals()

# To make sure there is no autocorrelation within my model, I am going
# to employ the ljung-box test. Since I have seasonal data, I am going to
# use lag = 2m = 2(1) = 2 and dof = 0.
fitt %>%
features(.innov, ljung_box, lag = 2, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(Macro_Values ~ trend() + season()) 1.51 0.471
# Since my p value > 0.05, it is not significant! And so, my residuals
# are indistinguishable from a white noise series.
# There is not autocorrelation within my data and the mean seems as
# if it could be zero, meaning, all of my information has been
# extrapolated from my residuals. However, the variance is not constant and
# it is not normally distributed. Thus, I am going to use a transformation.
# I am using Box-Cox technique. First, I am finding my lambda to then choose
# my appropriate transformation.
Data1 %>%
features(Macro_Values, features = guerrero)
## # A tibble: 1 × 1
## lambda_guerrero
## <dbl>
## 1 0.591
Data1 %>%
autoplot(box_cox(Macro_Values, 0.591))

# The variance doesn't seem anymore constant than it was
# before, but I will follow through with my diagnostics.
# Since my lambda is equal close to 1/2, I am going to
# transform it into a square root plus linear transformation.
DData <- Data1 %>%
mutate(Macro_Values = (Macro_Values)^(1/2))
# To now fitting the transformed model
fit_DData <- DData %>%
model(TSLM(Macro_Values ~ trend() + season()))
aug_data <- augment(fit_DData)
View(aug_data)
# After finding my residuals, I've plotted the values to see that the
# transformation has actually caused my variance to increase.
ggplot(aug_data, aes(x = Quarter, y = .innov)) +
geom_line()

# Therefore, I am going to keep my original data.
# Since I have compromised with my original data, the
# assumption of my residuals being normally distributed is
# unreasonable. Therefore, I am going to forecast my model
# with prediction intervals from bootstrapped residuals with
# 80% and 95% PI levels.
Forecast <- Data1 %>%
model(TSLM(Macro_Values ~ trend() + season())) %>%
forecast(h = 4, bootstrap = TRUE, times = 1000) %>%
autoplot(Data1) +
labs(title = "Forecast of Macro values for the next year")
Forecast

# My forecast PI seems to be a bit too accurate; however, I trust the
# bootstrap residuals to produce 80 and 90 percent confidence intervals.
# After many diagnostics tests and accuracy performances, I have finally
# forecast my data.