library(fpp3)
library(readxl)
library(GGally)
# install and load any packages necessary
va <- readxl::read_excel("//Users//mattmullis//Downloads//bahhh.xlsx")
va
## # A tibble: 4 × 24
## Description `1998-1999` `1999-2000` `2000-2001` `2001-2002` `2002-2003`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 GDP 8 7.5 6.4 3.4 5.5
## 2 PI 7.2 8.4 5.3 2.7 6.1
## 3 Exp 8.1 8.2 5 5.4 6.3
## 4 Emp 2.2 3.2 0.4 0.1 1
## # … with 18 more variables: `2003-2004` <dbl>, `2004-2005` <dbl>,
## # `2005-2006` <dbl>, `2006-2007` <dbl>, `2007-2008` <dbl>, `2008-2009` <dbl>,
## # `2009-2010` <dbl>, `2010-2011` <dbl>, `2011-2012` <dbl>, `2012-2013` <dbl>,
## # `2013-2014` <dbl>, `2014-2015` <dbl>, `2015-2016` <dbl>, `2016-2017` <dbl>,
## # `2017-2018` <dbl>, `2018-2019` <dbl>, `2019-2020` <dbl>, `2020-2021` <dbl>
va1 <- va %>%
  pivot_longer(!Description, names_to = "Year", values_to = "Count") %>%
  pivot_wider(names_from = Description, values_from = Count)
# reshaped: one row per year, one column per economic series
va1
## # A tibble: 23 × 5
## Year GDP PI Exp Emp
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 1998-1999 8 7.2 8.1 2.2
## 2 1999-2000 7.5 8.4 8.2 3.2
## 3 2000-2001 6.4 5.3 5 0.4
## 4 2001-2002 3.4 2.7 5.4 0.1
## 5 2002-2003 5.5 6.1 6.3 1
## 6 2003-2004 6.6 6.9 7.9 2.7
## 7 2004-2005 7.8 6.7 7.7 2.5
## 8 2005-2006 5.1 6.9 6 1.7
## 9 2006-2007 4.1 5.4 5.2 1.8
## 10 2007-2008 3.3 4 3.8 0
## # … with 13 more rows
va2 <- va1 %>%
  mutate(n = row_number())
vats <- va2 %>%
  as_tsibble(index = n)
vats
## # A tsibble: 23 x 6 [1]
## Year GDP PI Exp Emp n
## <chr> <dbl> <dbl> <dbl> <dbl> <int>
## 1 1998-1999 8 7.2 8.1 2.2 1
## 2 1999-2000 7.5 8.4 8.2 3.2 2
## 3 2000-2001 6.4 5.3 5 0.4 3
## 4 2001-2002 3.4 2.7 5.4 0.1 4
## 5 2002-2003 5.5 6.1 6.3 1 5
## 6 2003-2004 6.6 6.9 7.9 2.7 6
## 7 2004-2005 7.8 6.7 7.7 2.5 7
## 8 2005-2006 5.1 6.9 6 1.7 8
## 9 2006-2007 4.1 5.4 5.2 1.8 9
## 10 2007-2008 3.3 4 3.8 0 10
## # … with 13 more rows
# scatterplot matrix of all series against each other
vatb <- vats %>%
  as_tibble() %>%
  select(-Year)
ggpairs(vatb)
# PI and GDP are positively correlated at the 99% significance level. Exp and
# GDP, and Emp and GDP, are positively correlated at the 99.99% level. Exp and
# PI, Emp and PI, and Exp and Emp are each positively correlated at the 99%
# level. All four series decline as n (time) increases, but only the n-GDP
# correlation is significant at the 99% level.
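# As a cross-check on the significance stars above, the same tests can be run
# directly (a minimal sketch using base R; cor.test() tests one pair at a time):
cor(vatb)                   # full correlation matrix
cor.test(vatb$GDP, vatb$PI) # Pearson test for a single pair, e.g. GDP vs. PI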
#5
aaa <- vats %>%
  model(TSLM(Exp ~ GDP + Emp + PI)) %>%
  report()
## Series: Exp
## Model: TSLM
##
## Residuals:
## Min 1Q Median 3Q Max
## -2.62662 -0.80104 -0.09362 0.59371 3.29110
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) -0.48586 0.74375 -0.653 0.521
## GDP 0.95160 0.18286 5.204 5.05e-05 ***
## Emp 0.49270 0.30558 1.612 0.123
## PI 0.09381 0.16350 0.574 0.573
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 1.386 on 19 degrees of freedom
## Multiple R-squared: 0.8316, Adjusted R-squared: 0.805
## F-statistic: 31.27 on 3 and 19 DF, p-value: 1.4903e-07
# Interpretation: GDP is the only significant predictor (p = 5.05e-05,
# significant at the 0.1% level). The intercept says that with no change in
# GDP, employment, or PI, expenditure would fall by about 0.49 points, though
# the intercept itself is not significant. The adjusted R-squared of 0.805 is
# high, so the model explains most of the variance in expenditure. All three
# predictors have positive coefficients, but only GDP has a significant effect
# on expenditure.
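# The same numbers can be pulled out programmatically (a sketch; tidy() and
# glance() are fabletools generics that work on the fitted mable):
tidy(aaa)                                                # coefficient table
glance(aaa) %>% select(.model, r_squared, adj_r_squared, AICc)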
#6
augment(aaa) %>%
  ggplot(aes(x = n)) +
  geom_line(aes(y = Exp, colour = "Data")) +
  geom_line(aes(y = .fitted, colour = "Fitted")) +
  labs(y = NULL,
       title = "Percent change in US consumption expenditure") +
  scale_colour_manual(values = c(Data = "black", Fitted = "#D55E00")) +
  guides(colour = guide_legend(title = NULL))
#7
fs <- scenarios(
  one = new_data(vats, 3) %>%
    mutate(PI = 1, GDP = 1, Emp = 1),
  names_to = "Scenario")
fs1 <- scenarios(
  one = new_data(vats, 3) %>%
    mutate(PI = 1, GDP = 0, Emp = 0),
  names_to = "Scenario")
fs2 <- scenarios(
  one = new_data(vats, 3) %>%
    mutate(PI = -0.5, GDP = 0, Emp = 0),
  names_to = "Scenario")
fs3 <- scenarios(
  one = new_data(vats, 3) %>%
    mutate(PI = 0.8, GDP = 0, Emp = 0),
  names_to = "Scenario")
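# Note: all four settings could also live in a single scenarios() call so one
# forecast() and one autoplot() handle them together (a sketch; the scenario
# names below are just illustrative):
fs_all <- scenarios(
  all_one = new_data(vats, 3) %>% mutate(PI = 1, GDP = 1, Emp = 1),
  pi_one  = new_data(vats, 3) %>% mutate(PI = 1, GDP = 0, Emp = 0),
  pi_neg  = new_data(vats, 3) %>% mutate(PI = -0.5, GDP = 0, Emp = 0),
  pi_pos  = new_data(vats, 3) %>% mutate(PI = 0.8, GDP = 0, Emp = 0),
  names_to = "Scenario")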
fcast <- forecast(aaa, new_data = fs)
fcast
## # A fable: 23 x 8 [1]
## # Key: Scenario, .model [1]
## Scenario .model n Exp .mean PI GDP Emp
## <chr> <chr> <dbl> <dist> <dbl> <dbl> <dbl> <dbl>
## 1 one TSLM(Exp ~ GDP + Emp + PI) 24 N(1.1, 2.4) 1.05 1 1 1
## 2 one TSLM(Exp ~ GDP + Emp + PI) 25 N(1.1, 2.4) 1.05 1 1 1
## 3 one TSLM(Exp ~ GDP + Emp + PI) 26 N(1.1, 2.4) 1.05 1 1 1
## 4 one TSLM(Exp ~ GDP + Emp + PI) 27 N(1.1, 2.4) 1.05 1 1 1
## 5 one TSLM(Exp ~ GDP + Emp + PI) 28 N(1.1, 2.4) 1.05 1 1 1
## 6 one TSLM(Exp ~ GDP + Emp + PI) 29 N(1.1, 2.4) 1.05 1 1 1
## 7 one TSLM(Exp ~ GDP + Emp + PI) 30 N(1.1, 2.4) 1.05 1 1 1
## 8 one TSLM(Exp ~ GDP + Emp + PI) 31 N(1.1, 2.4) 1.05 1 1 1
## 9 one TSLM(Exp ~ GDP + Emp + PI) 32 N(1.1, 2.4) 1.05 1 1 1
## 10 one TSLM(Exp ~ GDP + Emp + PI) 33 N(1.1, 2.4) 1.05 1 1 1
## # … with 13 more rows
fcast1 <- forecast(aaa, new_data = fs1)
fcast1
## # A fable: 23 x 8 [1]
## # Key: Scenario, .model [1]
## Scenario .model n Exp .mean PI GDP Emp
## <chr> <chr> <dbl> <dist> <dbl> <dbl> <dbl> <dbl>
## 1 one TSLM(Exp ~ GDP + Emp +… 24 N(-0.39, 2.4) -0.392 1 0 0
## 2 one TSLM(Exp ~ GDP + Emp +… 25 N(-0.39, 2.4) -0.392 1 0 0
## 3 one TSLM(Exp ~ GDP + Emp +… 26 N(-0.39, 2.4) -0.392 1 0 0
## 4 one TSLM(Exp ~ GDP + Emp +… 27 N(-0.39, 2.4) -0.392 1 0 0
## 5 one TSLM(Exp ~ GDP + Emp +… 28 N(-0.39, 2.4) -0.392 1 0 0
## 6 one TSLM(Exp ~ GDP + Emp +… 29 N(-0.39, 2.4) -0.392 1 0 0
## 7 one TSLM(Exp ~ GDP + Emp +… 30 N(-0.39, 2.4) -0.392 1 0 0
## 8 one TSLM(Exp ~ GDP + Emp +… 31 N(-0.39, 2.4) -0.392 1 0 0
## 9 one TSLM(Exp ~ GDP + Emp +… 32 N(-0.39, 2.4) -0.392 1 0 0
## 10 one TSLM(Exp ~ GDP + Emp +… 33 N(-0.39, 2.4) -0.392 1 0 0
## # … with 13 more rows
fcast2 <- forecast(aaa, new_data = fs2)
fcast2
## # A fable: 23 x 8 [1]
## # Key: Scenario, .model [1]
## Scenario .model n Exp .mean PI GDP Emp
## <chr> <chr> <dbl> <dist> <dbl> <dbl> <dbl> <dbl>
## 1 one TSLM(Exp ~ GDP + Emp +… 24 N(-0.53, 2.5) -0.533 -0.5 0 0
## 2 one TSLM(Exp ~ GDP + Emp +… 25 N(-0.53, 2.5) -0.533 -0.5 0 0
## 3 one TSLM(Exp ~ GDP + Emp +… 26 N(-0.53, 2.5) -0.533 -0.5 0 0
## 4 one TSLM(Exp ~ GDP + Emp +… 27 N(-0.53, 2.5) -0.533 -0.5 0 0
## 5 one TSLM(Exp ~ GDP + Emp +… 28 N(-0.53, 2.5) -0.533 -0.5 0 0
## 6 one TSLM(Exp ~ GDP + Emp +… 29 N(-0.53, 2.5) -0.533 -0.5 0 0
## 7 one TSLM(Exp ~ GDP + Emp +… 30 N(-0.53, 2.5) -0.533 -0.5 0 0
## 8 one TSLM(Exp ~ GDP + Emp +… 31 N(-0.53, 2.5) -0.533 -0.5 0 0
## 9 one TSLM(Exp ~ GDP + Emp +… 32 N(-0.53, 2.5) -0.533 -0.5 0 0
## 10 one TSLM(Exp ~ GDP + Emp +… 33 N(-0.53, 2.5) -0.533 -0.5 0 0
## # … with 13 more rows
fcast3 <- forecast(aaa, new_data = fs3)
fcast3
## # A fable: 23 x 8 [1]
## # Key: Scenario, .model [1]
## Scenario .model n Exp .mean PI GDP Emp
## <chr> <chr> <dbl> <dist> <dbl> <dbl> <dbl> <dbl>
## 1 one TSLM(Exp ~ GDP + Emp +… 24 N(-0.41, 2.4) -0.411 0.8 0 0
## 2 one TSLM(Exp ~ GDP + Emp +… 25 N(-0.41, 2.4) -0.411 0.8 0 0
## 3 one TSLM(Exp ~ GDP + Emp +… 26 N(-0.41, 2.4) -0.411 0.8 0 0
## 4 one TSLM(Exp ~ GDP + Emp +… 27 N(-0.41, 2.4) -0.411 0.8 0 0
## 5 one TSLM(Exp ~ GDP + Emp +… 28 N(-0.41, 2.4) -0.411 0.8 0 0
## 6 one TSLM(Exp ~ GDP + Emp +… 29 N(-0.41, 2.4) -0.411 0.8 0 0
## 7 one TSLM(Exp ~ GDP + Emp +… 30 N(-0.41, 2.4) -0.411 0.8 0 0
## 8 one TSLM(Exp ~ GDP + Emp +… 31 N(-0.41, 2.4) -0.411 0.8 0 0
## 9 one TSLM(Exp ~ GDP + Emp +… 32 N(-0.41, 2.4) -0.411 0.8 0 0
## 10 one TSLM(Exp ~ GDP + Emp +… 33 N(-0.41, 2.4) -0.411 0.8 0 0
## # … with 13 more rows
vats %>%
  autoplot(Exp) +
  autolayer(fcast) +
  labs(title = "US consumption", y = "% change")
vats %>%
  autoplot(Exp) +
  autolayer(fcast1) +
  labs(title = "US consumption", y = "% change")
vats %>%
  autoplot(Exp) +
  autolayer(fcast2) +
  labs(title = "US consumption", y = "% change")
vats %>%
  autoplot(Exp) +
  autolayer(fcast3) +
  labs(title = "US consumption", y = "% change")
#8
# There isn't a big difference between the scenarios: the forecasts are flat
# and surprisingly low in every case, with no cycle or other structure, so I
# don't trust them much on their own. (The sanity check below shows where the
# low values come from.)
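# Sanity check: a TSLM forecast mean is just the fitted equation evaluated at
# the scenario values, so the flat, low forecasts can be reproduced by hand
# from the coefficient table in #5:
-0.48586 + 0.95160*1 + 0.49270*1 + 0.09381*1    # =  1.052, matches fcast
-0.48586 + 0.09381*1                            # = -0.392, matches fcast1
-0.48586 + 0.09381*(-0.5)                       # = -0.533, matches fcast2
-0.48586 + 0.09381*0.8                          # = -0.411, matches fcast3
# So the low values come from the negative intercept plus the small scenario
# inputs, not from a bug in forecast().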
data <- readxl::read_excel("//Users//mattmullis//Downloads//M3_exam1.xlsx")
view(data)
data1 <- data %>% filter(StudentName == "Matt")
# filter to my series, then drop the metadata columns
data2 <- data1 %>%
  select(-c(Series, N, NF, Category, StartingYear, StartingQuarter))
data2
## # A tibble: 1 × 73
## StudentName `1` `2` `3` `4` `5` `6` `7` `8` `9` `10` `11`
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Matt 2820 2360 2900 3980 3820 6280 5220 4440 1220 1840 1860
## # … with 61 more variables: `12` <dbl>, `13` <dbl>, `14` <dbl>, `15` <dbl>,
## # `16` <dbl>, `17` <dbl>, `18` <dbl>, `19` <dbl>, `20` <dbl>, `21` <dbl>,
## # `22` <dbl>, `23` <dbl>, `24` <dbl>, `25` <dbl>, `26` <dbl>, `27` <dbl>,
## # `28` <dbl>, `29` <dbl>, `30` <dbl>, `31` <dbl>, `32` <dbl>, `33` <dbl>,
## # `34` <dbl>, `35` <dbl>, `36` <dbl>, `37` <dbl>, `38` <dbl>, `39` <dbl>,
## # `40` <dbl>, `41` <dbl>, `42` <dbl>, `43` <dbl>, `44` <dbl>, `45` <dbl>,
## # `46` <dbl>, `47` <dbl>, `48` <dbl>, `49` <dbl>, `50` <dbl>, `51` <dbl>, …
view(data2)
data3 <- data2 %>%
  pivot_longer(-StudentName) %>%
  na.omit() %>%
  mutate(Date = yearquarter(1:52)) %>%  # index the 52 observations as consecutive quarters
  as_tsibble(index = Date)
view(data3)
autoplot(data3)
## Plot variable not specified, automatically selected `.vars = value`
# The variance looks fairly stable over time, so the data doesn't need a transformation.
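# A quick check of that claim (sketch): the guerrero feature from feasts picks
# a Box-Cox lambda; a value near 1 supports leaving the data untransformed.
data3 %>% features(value, features = guerrero)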
# creating training and test sets (the test filter starts right after the
# training cutoff, so no year falls in neither set)
trdata3 <- data3 %>%
  filter(year(Date) <= 1980)
tedata3 <- data3 %>%
  filter(year(Date) > 1980)
data3add <- data3 %>%
  model(classical_decomposition(value, type = "additive")) %>%
  components() %>%
  autoplot() +
  labs(title = "add")
data3add
## Warning: Removed 2 row(s) containing missing values (geom_path).
data3mult <- data3 %>%
  model(classical_decomposition(value, type = "multiplicative")) %>%
  components() %>%
  autoplot() +
  labs(title = "mult")
data3mult
## Warning: Removed 2 row(s) containing missing values (geom_path).
data3stl <- data3 %>%
  model(STL(value ~ season(window = 4), robust = TRUE)) %>%
  components() %>%
  autoplot()
data3stl
# I wasn't sure which decomposition method to use, so I ran all three I know.
# Additive and multiplicative look similar. There is strong seasonality and an
# obvious trend. The classical remainders look close to white noise but still
# echo the data a little. In the STL decomposition the seasonal component
# grows over time and the remainder is more stable. All three show one large
# jump.
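# If a seasonally adjusted series is wanted, it can be pulled from the STL
# components directly (a sketch, reusing the same STL spec as above):
data3 %>%
  model(STL(value ~ season(window = 4), robust = TRUE)) %>%
  components() %>%
  as_tsibble() %>%
  autoplot(season_adjust)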
#benchmarks:
data3bench <- trdata3 %>%
  model(Mean = MEAN(value),
        seasonal_naive = SNAIVE(value),
        drift = RW(value ~ drift()),
        naive = NAIVE(value))
bigone <- data3bench %>%
  forecast(h = 4)
autoplot(bigone) +
  autolayer(data3)
## Plot variable not specified, automatically selected `.vars = value`
accuracy(bigone, data3)
## # A tibble: 4 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 drift Test -427. 1014. 857. -12.5 19.3 0.365 0.364 0.0151
## 2 Mean Test 643. 1159. 1023. 9.18 19.5 0.436 0.417 0.0470
## 3 naive Test -277. 1003. 797. -9.66 17.8 0.340 0.361 0.0470
## 4 seasonal_naive Test -1732. 1982. 1732. -35.5 35.5 0.738 0.713 -0.0354
# Benchmark results: the error measures are all fairly high. RMSE is lowest
# for the naive method, which makes sense for data that behaves like a random
# walk, as financial data often does. Next I will try forecasting with
# regression.
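# Ranking the benchmarks explicitly (a sketch):
accuracy(bigone, data3) %>%
  arrange(RMSE) %>%
  select(.model, RMSE, MAE, MASE)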
blah <- data3bench %>%
  select(naive)
augment(blah) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 naive 12.7 0.242
gg_tsresiduals(blah)
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).
# The residuals look like white noise: the Ljung-Box p-value (0.242) is high,
# so they are not distinguishable from white noise, which is what we want from
# a benchmark.
tstrdata3 <- trdata3 %>%
  model(TSLM(value ~ season() + trend()))
fctstrdata3 <- tstrdata3 %>%
  forecast(h = 4)
fctstrdata3 %>%
  autoplot(trdata3)
accuracy(fctstrdata3, data3)
## # A tibble: 1 × 10
## .model .type ME RMSE MAE MPE MAPE MASE RMSSE ACF1
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 TSLM(value ~ season(… Test -1889. 1938. 1889. -40.5 40.5 0.806 0.697 -0.0831
# Interesting that the regression model forecasts worse than the naive
# benchmark (higher RMSE and MASE).
gg_tsresiduals(tstrdata3)
# These residuals are poor: the mean may be near zero, but they are
# heteroskedastic, not normally distributed, and highly autocorrelated.
augment(tstrdata3) %>%
  features(.resid, ljung_box, lag = 10, dof = 0)
## # A tibble: 1 × 3
## .model lb_stat lb_pvalue
## <chr> <dbl> <dbl>
## 1 TSLM(value ~ season() + trend()) 65.9 2.74e-10
# The tiny p-value confirms these residuals are not white noise, which is
# consistent with the poor forecast.
### Conclusion: the naive method is best. Its residuals are indistinguishable
### from white noise and it has the lowest error measures (RMSE, MAE, MASE) on
### the test period.
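# With the naive method chosen, a final forecast would be produced by
# refitting on the full series (a sketch):
data3 %>%
  model(NAIVE(value)) %>%
  forecast(h = 4) %>%
  autoplot(data3)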