library(fpp3)
## ── Attaching packages ──────────────────────────────────────────── fpp3 0.4.0 ──
## ✔ tibble 3.1.8 ✔ tsibble 1.1.2
## ✔ dplyr 1.0.9 ✔ tsibbledata 0.4.1
## ✔ tidyr 1.2.0 ✔ feasts 0.2.2
## ✔ lubridate 1.8.0 ✔ fable 0.3.1
## ✔ ggplot2 3.3.6
## ── Conflicts ───────────────────────────────────────────────── fpp3_conflicts ──
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks base::setdiff()
## ✖ tsibble::union() masks base::union()
library(readxl)
library(ggplot2)
library(tsibble)
library(lubridate)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────── tidyverse 1.3.2 ──
## ✔ readr 2.1.2 ✔ stringr 1.4.1
## ✔ purrr 0.3.4 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ lubridate::as.difftime() masks base::as.difftime()
## ✖ lubridate::date() masks base::date()
## ✖ dplyr::filter() masks stats::filter()
## ✖ tsibble::intersect() masks lubridate::intersect(), base::intersect()
## ✖ tsibble::interval() masks lubridate::interval()
## ✖ dplyr::lag() masks stats::lag()
## ✖ tsibble::setdiff() masks lubridate::setdiff(), base::setdiff()
## ✖ tsibble::union() masks lubridate::union(), base::union()
library(dplyr)
library(seasonal)
##
## Attaching package: 'seasonal'
##
## The following object is masked from 'package:tibble':
##
## view
library(zoo)
##
## Attaching package: 'zoo'
##
## The following object is masked from 'package:tsibble':
##
## index
##
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(tidyr)
library(tibble)
library(ggfortify)
library(moments)
library(fable)
library(brew)
library(sos)
##
## Attaching package: 'sos'
##
## The following object is masked from 'package:tidyr':
##
## matches
##
## The following object is masked from 'package:dplyr':
##
## matches
##
## The following object is masked from 'package:utils':
##
## ?
library(slider)
library(x13binary)
library(USgas)
library(GGally)
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
GA_data <- read_excel("C:/Users/ryanf/OneDrive/Pictures/final paper/GA data.xlsx")
ga_data <- GA_data %>%
  as_tsibble(index = Year)
head(ga_data)
## # A tsibble: 6 x 5 [1Y]
## Year GDP Income Consumption Total_employment
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1998 9 6.6 8.1 2.6
## 2 1999 5.7 8.2 8.2 2.8
## 3 2000 3.9 5 4.8 0.2
## 4 2001 3 2.8 3.8 0.3
## 5 2002 4.7 3.9 5.6 0.8
## 6 2003 6.5 5.4 6.3 2.6
gg <- GA_data %>%
  as_tibble()
head(gg)
## # A tibble: 6 × 5
## Year GDP Income Consumption Total_employment
## <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 1998 9 6.6 8.1 2.6
## 2 1999 5.7 8.2 8.2 2.8
## 3 2000 3.9 5 4.8 0.2
## 4 2001 3 2.8 3.8 0.3
## 5 2002 4.7 3.9 5.6 0.8
## 6 2003 6.5 5.4 6.3 2.6
gg %>%
  ggpairs()
Year doesn't appear to have any relationship with the rest of the variables. All of the remaining relationships are statistically significant, most at the 90% level. Income and Consumption are significant at the 99% level with GDP, and Consumption is significant at the 95% level with Income. All three of the remaining variables (Consumption, Income, and Total_employment) have a positive correlation with GDP. Personal income has a positive correlation with total employment and consumption, and consumption has a positive correlation with total employment.
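To back up the ggpairs reading numerically, the correlation matrix can also be printed directly; a minimal sketch, assuming the gg tibble built above:
# Pairwise correlations among the measured variables; Year is dropped
# since it is the time index rather than a measurement
gg %>%
  select(-Year) %>%
  cor() %>%
  round(2)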
ga_model <- ga_data %>%
model(TSLM(Consumption ~ GDP + Income + Total_employment))
report(ga_model)
## Series: Consumption
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.82914 -0.81453 -0.06476 0.79437 3.51296
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.09569 0.89627 0.107 0.916094
## GDP 0.89891 0.19253 4.669 0.000167 ***
## Income 0.12738 0.21102 0.604 0.553218
## Total_employment 0.10114 0.35788 0.283 0.780533
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.505 on 19 degrees of freedom
## Multiple R-squared: 0.7294, Adjusted R-squared: 0.6867
## F-statistic: 17.08 on 3 and 19 DF, p-value: 1.2684e-05
The intercept has a value of 0.09569, meaning that when GDP, Income, and Total_employment all change by 0%, Consumption changes by about 0.095%.
GDP has a coefficient of 0.89, meaning a 1% increase in GDP is associated with a 0.89% increase in Consumption, holding the other variables fixed. This is the only coefficient that is statistically significant, and it is significant at the 99% level.
Income has a coefficient of 0.12, meaning a 1% increase in Income is associated with a 0.12% increase in Consumption.
Total_employment has a coefficient of 0.10, meaning a 1% increase in Total_employment is associated with a 0.1% increase in Consumption.
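The same coefficient table can be pulled into a tibble for further use; a minimal sketch using fabletools' tidy() and glance() on the fitted mable:
# Coefficient estimates, standard errors, t values, and p-values
tidy(ga_model)
# Model-level fit statistics (R-squared, adjusted R-squared, etc.)
glance(ga_model)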
augment(ga_model) %>%
ggplot(aes(x = Year)) +
geom_line(aes(y = Consumption, color = "Data")) +
geom_line(aes(y = .fitted, color = "Fitted")) +
labs(y = "Percentage Change", title = "Percent Change in Georgia Consumption") +
scale_colour_manual(values = c(Data = "black", Fitted = "blue")) +
guides(colour = guide_legend(title = NULL))
fit_ga <- ga_data %>%
model(lm = TSLM(Consumption ~ GDP + Income + Total_employment))
ga_scenarios <- scenarios(
Increase = new_data(ga_data, 3) %>%
mutate(GDP = 1, Income = 1, Total_employment = 1),
names_to = "Scenario")
ga_forecast1 <- forecast(fit_ga, new_data = ga_scenarios)
ga_scenarios2 <- scenarios(
  Increase = new_data(ga_data, 3) %>%
    mutate(GDP = 0, Income = 1, Total_employment = 0),
  Decrease = new_data(ga_data, 3) %>%
    mutate(GDP = 0, Income = -.5, Total_employment = 0),
  Increase1 = new_data(ga_data, 3) %>%
    mutate(GDP = 0, Income = .8, Total_employment = 0),
  names_to = "Scenario")
ga_forecast2 <- forecast(fit_ga, new_data = ga_scenarios2)
ga_data %>%
autoplot(Consumption) +
autolayer(ga_forecast1) +
autolayer(ga_forecast2) +
labs(title = "Georgia Scenario 1 and 2 on Consumption Forecast", y = "Percentage Change")
I plotted both sets of scenarios with prediction intervals. Since the most recent data covers the COVID period, the percent change in consumption is high due to financial stimulus. I could see the drop being fairly drastic as stimulus slows down and concerns over inflation and recession increase. Consumption growth most likely would not slow all the way to zero, but that outcome might be within the prediction interval.
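To check whether a fall to zero growth actually sits inside the interval, the prediction intervals can be printed numerically; a minimal sketch using fabletools' hilo() on the scenario fable:
# 95% prediction intervals for each scenario and forecast horizon
ga_forecast2 %>%
  hilo(level = 95)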
PROBLEM 2
M3_exam1 <- read_excel("C:/Users/ryanf/Downloads/M3_exam1.xlsx")
#View(M3_exam1)
ryan_data <- M3_exam1 %>%
  select(-Series, -N, -NF, -Category, -`Starting Year`, -`Starting Quarter`) %>%
  filter(`Student Name` == 'Ryan') %>%
  pivot_longer(-`Student Name`) %>%
  select(-`Student Name`, -name) %>%
  rename(Micro = value)
ryan_data <- na.omit(ryan_data)
ryan_data <- ryan_data %>%
mutate(Date = yearquarter(57:99)) %>%
select(Date, Micro) %>%
as_tsibble(index = Date)
ryan_train <- ryan_data %>%
filter(year(Date) <= 1990)
ryan_test <- ryan_data %>%
filter(year(Date) > 1990)
ryan_drift_prep <- ryan_data %>%
filter(year(Date) > 1987)
ryan_drift_prep_train <- ryan_drift_prep %>%
filter(year(Date) < 1993)
ryan_drift_prep_test <- ryan_drift_prep %>%
filter(year(Date) > 1992)
#ryan_data
ryan_data %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Micro)) +
labs(title = "Micro Data", y = "Micro Data Index")
ryan_data %>%
features(Micro, features = guerrero)
## # A tibble: 1 × 1
## lambda_guerrero
## <dbl>
## 1 1.69
lambda <- 1.69
ryan_data %>%
  autoplot(box_cox(Micro, lambda)) +
labs(title = "Box-Cox Transformed Micro Data", x = "Date")
ryan_stl <- ryan_data %>%
model(STL(Micro ~ season(window = 4), robust = TRUE)) %>%
components() %>%
autoplot() +
labs(title = "STL Decomposition:Micro Data")
ryan_stl
I did a Box-Cox transformation and an STL decomposition to help me understand my data. The Box-Cox transformation did not change the variance of my data very much. The STL decomposition helped me understand the increasing trend and also ruled out seasonal methods such as the seasonal naive forecast, given the unlikelihood of seasonal patterns.
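One way to quantify that reading is with the STL-based strength features from feasts; a minimal sketch (trend_strength and the seasonal strength column are on a 0-1 scale, and the exact name of the seasonal column depends on the seasonal period):
# High trend_strength and low seasonal strength would support ruling
# out seasonal methods
ryan_data %>%
  features(Micro, feat_stl)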
ryan_mod <- ryan_train %>%
model(
Seasonal_naive = SNAIVE(Micro),
Naive = NAIVE(Micro),
Drift = RW(Micro ~ drift()),
Mean = MEAN(Micro)
)
ryan_fore <- ryan_mod %>%
forecast(h = 4)
accuracy(ryan_fore, ryan_data)
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test 63.4 206. 201. 0.643 2.07 0.312 0.286 -0.0531
## 2 Mean Test 2627. 2629. 2627. 27.0 27.0 4.08 3.64 -0.274
## 3 Naive Test 485. 498. 485. 4.97 4.97 0.755 0.691 -0.274
## 4 Seasonal_naive Test 822. 857. 822. 8.43 8.43 1.28 1.19 -0.141
ryan_fore %>%
autoplot() +
autolayer(ryan_data)
## Plot variable not specified, automatically selected `.vars = Micro`
Based on my training data, the drift method seems to be the best fit. This is backed up by the drift method having the best accuracy numbers of the four benchmarks. I expected this given the strong trend shown in my STL decomposition.
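As a sanity check on what the drift method computes, its point forecasts can be reproduced by hand: the h-step forecast is the last observation plus h times the average historical change. A minimal sketch against the training set above:
# Drift point forecasts by hand: y_T + h * (y_T - y_1) / (T - 1)
y <- ryan_train$Micro
n <- length(y)
slope <- (y[n] - y[1]) / (n - 1)
y[n] + (1:4) * slope # should match the Drift point forecasts in ryan_fore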
ryan_drift2 <- ryan_drift_prep_train %>%
model(Drift = RW(Micro ~ drift()))
ryan_drift2_forecast <- ryan_drift2 %>%
forecast(h = "1 year")
ryan_drift2 %>%
gg_tsresiduals()
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
augment(ryan_drift2) %>%
features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 Drift 6.14 0.803
accuracy(ryan_drift2_forecast, ryan_data)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Drift Test -41.6 84.5 69.2 -0.499 0.819 0.0948 0.102 -0.617
ryan_drift3 <- ryan_data %>%
model(Drift = RW(Micro ~ drift()))
ryan_drift3_forecast <- ryan_drift3 %>%
forecast(h = "1 year")
ryan_data %>%
ggplot(aes(x = Date)) +
geom_line(aes(y = Micro)) +
autolayer(ryan_drift3_forecast) +
labs(title = "Drift Forecast", y = "Micro Data Index")
Because I tried all of the benchmark forecast methods on my training data, I could already be fairly confident that drift was the best method for me: it had the best accuracy. Visually, my data already had a strong trend, so drift looked to be a good fit. The residuals are centered at a mean of zero, which is good. I do wish the residual plot looked less structured, though; it makes me think the residuals might not be completely white noise. There was also a pretty big outlier in 1992. Still, my p-value in the Ljung-Box test was fairly high.
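To pin down that 1992 outlier, the residuals can be filtered directly; a sketch flagging residuals more than two standard deviations from zero, using the same .resid column as the Ljung-Box check:
# Flag unusually large residuals from the drift fit
augment(ryan_drift2) %>%
  filter(abs(.resid) > 2 * sd(.resid, na.rm = TRUE))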
In conclusion, drift is the best forecast method for my data: it has homoskedastic residuals distributed around a mean of zero that are indistinguishable from white noise based on the Ljung-Box test. Furthermore, the drift forecast has the best accuracy of all tested methods and is visually convincing.