Load packages and data

library(fpp3)
library(readxl)
library(GGally)

#install and load any package necessary

Questions

Exercise 1

va <- readxl::read_excel("//Users//mattmullis//Downloads//bahhh.xlsx")
va
## # A tibble: 4 × 24
##   Description `1998-1999` `1999-2000` `2000-2001` `2001-2002` `2002-2003`
##   <chr>             <dbl>       <dbl>       <dbl>       <dbl>       <dbl>
## 1 GDP                 8           7.5         6.4         3.4         5.5
## 2 PI                  7.2         8.4         5.3         2.7         6.1
## 3 Exp                 8.1         8.2         5           5.4         6.3
## 4 Emp                 2.2         3.2         0.4         0.1         1  
## # … with 18 more variables: `2003-2004` <dbl>, `2004-2005` <dbl>,
## #   `2005-2006` <dbl>, `2006-2007` <dbl>, `2007-2008` <dbl>, `2008-2009` <dbl>,
## #   `2009-2010` <dbl>, `2010-2011` <dbl>, `2011-2012` <dbl>, `2012-2013` <dbl>,
## #   `2013-2014` <dbl>, `2014-2015` <dbl>, `2015-2016` <dbl>, `2016-2017` <dbl>,
## #   `2017-2018` <dbl>, `2018-2019` <dbl>, `2019-2020` <dbl>, `2020-2021` <dbl>
va1 <- va %>% 
  pivot_longer(!Description, names_to = "Year", values_to = "Count" ) %>% 
  pivot_wider(names_from= Description, values_from= Count)
#ts
va1
## # A tibble: 23 × 5
##    Year        GDP    PI   Exp   Emp
##    <chr>     <dbl> <dbl> <dbl> <dbl>
##  1 1998-1999   8     7.2   8.1   2.2
##  2 1999-2000   7.5   8.4   8.2   3.2
##  3 2000-2001   6.4   5.3   5     0.4
##  4 2001-2002   3.4   2.7   5.4   0.1
##  5 2002-2003   5.5   6.1   6.3   1  
##  6 2003-2004   6.6   6.9   7.9   2.7
##  7 2004-2005   7.8   6.7   7.7   2.5
##  8 2005-2006   5.1   6.9   6     1.7
##  9 2006-2007   4.1   5.4   5.2   1.8
## 10 2007-2008   3.3   4     3.8   0  
## # … with 13 more rows
va2 <-  va1 %>% 
  mutate(n=row_number())
vats <-  va2 %>% 
  as_tsibble(index=n)
vats
## # A tsibble: 23 x 6 [1]
##    Year        GDP    PI   Exp   Emp     n
##    <chr>     <dbl> <dbl> <dbl> <dbl> <int>
##  1 1998-1999   8     7.2   8.1   2.2     1
##  2 1999-2000   7.5   8.4   8.2   3.2     2
##  3 2000-2001   6.4   5.3   5     0.4     3
##  4 2001-2002   3.4   2.7   5.4   0.1     4
##  5 2002-2003   5.5   6.1   6.3   1       5
##  6 2003-2004   6.6   6.9   7.9   2.7     6
##  7 2004-2005   7.8   6.7   7.7   2.5     7
##  8 2005-2006   5.1   6.9   6     1.7     8
##  9 2006-2007   4.1   5.4   5.2   1.8     9
## 10 2007-2008   3.3   4     3.8   0      10
## # … with 13 more rows
#graph
vatb <- vats %>% 
  as_tibble() %>% 
  select(-Year)
 ggpairs(vatb)

 # personal income and GDP have a positive correlation at the 99% significance level. Expenditures and GDP have a positive correlation at the 99.99% significance level.Employment and GDP have a positive correlation coefficient at the 99.99% significance level. expenditure and PI are positively correlated at the 99% stat level. Employment and personal income are positively correlated at the 99% level. Expenditure and employment are positively correlated at the 99% level. As n increases, all the variables decrease, but only one is significant (year and gdp) at the 99% level. 

 #5
 aaa <- vats %>% 
   model(TSLM(Exp~GDP+Emp+PI)) %>% report()
## Series: Exp 
## Model: TSLM 
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.62662 -0.80104 -0.09362  0.59371  3.29110 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) -0.48586    0.74375  -0.653    0.521    
## GDP          0.95160    0.18286   5.204 5.05e-05 ***
## Emp          0.49270    0.30558   1.612    0.123    
## PI           0.09381    0.16350   0.574    0.573    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.386 on 19 degrees of freedom
## Multiple R-squared: 0.8316,  Adjusted R-squared: 0.805
## F-statistic: 31.27 on 3 and 19 DF, p-value: 1.4903e-07
#interpret the results- the only result with any real statistical significance is GDP, which is significant on the 99% level. Intercept- with no change in GDP, employment or PI, expenditure. My adj r2 is very high, so my model explains a lot of the variance of the dependent variable. All three of my independent variables have a positive relationship with my dependent variable, but none except GDP has a significant affect on expenditure. 
 #6

augment(aaa) %>%
  ggplot(aes(x = n)) +
  geom_line(aes(y = Exp, colour = "Data")) +
  geom_line(aes(y = .fitted, colour = "Fitted")) +
  labs(y = NULL,
    title = "Percent change in US consumption expenditure"
  ) +
  scale_colour_manual(values=c(Data="black",Fitted="#D55E00")) +
  guides(colour = guide_legend(title = NULL))

 #7

fs <- scenarios(
  one=new_data(vats, 3) %>% 
    mutate(PI=1, GDP= 1, Emp= 1 ), names_to = "Scenario")
fs1 <- scenarios(
  one=new_data(vats, 3) %>% 
    mutate(PI=1, GDP= 0, Emp= 0 ), names_to = "Scenario")
fs2 <- scenarios(
  one=new_data(vats, 3) %>% 
    mutate(PI=-.5, GDP= 0, Emp= 0 ), names_to = "Scenario")
fs3 <- scenarios(
  one=new_data(vats,3) %>% 
    mutate(PI=.8, GDP=0, Emp=0), names_to = "Scenario"
)
  
fcast <- forecast(aaa, new_data = fs)
fcast
## # A fable: 23 x 8 [1]
## # Key:     Scenario, .model [1]
##    Scenario .model                         n         Exp .mean    PI   GDP   Emp
##    <chr>    <chr>                      <dbl>      <dist> <dbl> <dbl> <dbl> <dbl>
##  1 one      TSLM(Exp ~ GDP + Emp + PI)    24 N(1.1, 2.4)  1.05     1     1     1
##  2 one      TSLM(Exp ~ GDP + Emp + PI)    25 N(1.1, 2.4)  1.05     1     1     1
##  3 one      TSLM(Exp ~ GDP + Emp + PI)    26 N(1.1, 2.4)  1.05     1     1     1
##  4 one      TSLM(Exp ~ GDP + Emp + PI)    27 N(1.1, 2.4)  1.05     1     1     1
##  5 one      TSLM(Exp ~ GDP + Emp + PI)    28 N(1.1, 2.4)  1.05     1     1     1
##  6 one      TSLM(Exp ~ GDP + Emp + PI)    29 N(1.1, 2.4)  1.05     1     1     1
##  7 one      TSLM(Exp ~ GDP + Emp + PI)    30 N(1.1, 2.4)  1.05     1     1     1
##  8 one      TSLM(Exp ~ GDP + Emp + PI)    31 N(1.1, 2.4)  1.05     1     1     1
##  9 one      TSLM(Exp ~ GDP + Emp + PI)    32 N(1.1, 2.4)  1.05     1     1     1
## 10 one      TSLM(Exp ~ GDP + Emp + PI)    33 N(1.1, 2.4)  1.05     1     1     1
## # … with 13 more rows
fcast1 <- forecast(aaa, new_data = fs1)
fcast1
## # A fable: 23 x 8 [1]
## # Key:     Scenario, .model [1]
##    Scenario .model                      n           Exp  .mean    PI   GDP   Emp
##    <chr>    <chr>                   <dbl>        <dist>  <dbl> <dbl> <dbl> <dbl>
##  1 one      TSLM(Exp ~ GDP + Emp +…    24 N(-0.39, 2.4) -0.392     1     0     0
##  2 one      TSLM(Exp ~ GDP + Emp +…    25 N(-0.39, 2.4) -0.392     1     0     0
##  3 one      TSLM(Exp ~ GDP + Emp +…    26 N(-0.39, 2.4) -0.392     1     0     0
##  4 one      TSLM(Exp ~ GDP + Emp +…    27 N(-0.39, 2.4) -0.392     1     0     0
##  5 one      TSLM(Exp ~ GDP + Emp +…    28 N(-0.39, 2.4) -0.392     1     0     0
##  6 one      TSLM(Exp ~ GDP + Emp +…    29 N(-0.39, 2.4) -0.392     1     0     0
##  7 one      TSLM(Exp ~ GDP + Emp +…    30 N(-0.39, 2.4) -0.392     1     0     0
##  8 one      TSLM(Exp ~ GDP + Emp +…    31 N(-0.39, 2.4) -0.392     1     0     0
##  9 one      TSLM(Exp ~ GDP + Emp +…    32 N(-0.39, 2.4) -0.392     1     0     0
## 10 one      TSLM(Exp ~ GDP + Emp +…    33 N(-0.39, 2.4) -0.392     1     0     0
## # … with 13 more rows
fcast2 <- forecast(aaa, new_data = fs2)
fcast2
## # A fable: 23 x 8 [1]
## # Key:     Scenario, .model [1]
##    Scenario .model                      n           Exp  .mean    PI   GDP   Emp
##    <chr>    <chr>                   <dbl>        <dist>  <dbl> <dbl> <dbl> <dbl>
##  1 one      TSLM(Exp ~ GDP + Emp +…    24 N(-0.53, 2.5) -0.533  -0.5     0     0
##  2 one      TSLM(Exp ~ GDP + Emp +…    25 N(-0.53, 2.5) -0.533  -0.5     0     0
##  3 one      TSLM(Exp ~ GDP + Emp +…    26 N(-0.53, 2.5) -0.533  -0.5     0     0
##  4 one      TSLM(Exp ~ GDP + Emp +…    27 N(-0.53, 2.5) -0.533  -0.5     0     0
##  5 one      TSLM(Exp ~ GDP + Emp +…    28 N(-0.53, 2.5) -0.533  -0.5     0     0
##  6 one      TSLM(Exp ~ GDP + Emp +…    29 N(-0.53, 2.5) -0.533  -0.5     0     0
##  7 one      TSLM(Exp ~ GDP + Emp +…    30 N(-0.53, 2.5) -0.533  -0.5     0     0
##  8 one      TSLM(Exp ~ GDP + Emp +…    31 N(-0.53, 2.5) -0.533  -0.5     0     0
##  9 one      TSLM(Exp ~ GDP + Emp +…    32 N(-0.53, 2.5) -0.533  -0.5     0     0
## 10 one      TSLM(Exp ~ GDP + Emp +…    33 N(-0.53, 2.5) -0.533  -0.5     0     0
## # … with 13 more rows
fcast3 <-  forecast(aaa, new_data = fs3)
fcast3
## # A fable: 23 x 8 [1]
## # Key:     Scenario, .model [1]
##    Scenario .model                      n           Exp  .mean    PI   GDP   Emp
##    <chr>    <chr>                   <dbl>        <dist>  <dbl> <dbl> <dbl> <dbl>
##  1 one      TSLM(Exp ~ GDP + Emp +…    24 N(-0.41, 2.4) -0.411   0.8     0     0
##  2 one      TSLM(Exp ~ GDP + Emp +…    25 N(-0.41, 2.4) -0.411   0.8     0     0
##  3 one      TSLM(Exp ~ GDP + Emp +…    26 N(-0.41, 2.4) -0.411   0.8     0     0
##  4 one      TSLM(Exp ~ GDP + Emp +…    27 N(-0.41, 2.4) -0.411   0.8     0     0
##  5 one      TSLM(Exp ~ GDP + Emp +…    28 N(-0.41, 2.4) -0.411   0.8     0     0
##  6 one      TSLM(Exp ~ GDP + Emp +…    29 N(-0.41, 2.4) -0.411   0.8     0     0
##  7 one      TSLM(Exp ~ GDP + Emp +…    30 N(-0.41, 2.4) -0.411   0.8     0     0
##  8 one      TSLM(Exp ~ GDP + Emp +…    31 N(-0.41, 2.4) -0.411   0.8     0     0
##  9 one      TSLM(Exp ~ GDP + Emp +…    32 N(-0.41, 2.4) -0.411   0.8     0     0
## 10 one      TSLM(Exp ~ GDP + Emp +…    33 N(-0.41, 2.4) -0.411   0.8     0     0
## # … with 13 more rows
vats %>%
  autoplot(Exp) +
  autolayer(fcast) +
  labs(title = "US consumption", y = "% change")

vats %>%
  autoplot(Exp) +
  autolayer(fcast1) +
  labs(title = "US consumption", y = "% change")

vats %>%
  autoplot(Exp) +
  autolayer(fcast2) +
  labs(title = "US consumption", y = "% change")

vats %>%
  autoplot(Exp) +
  autolayer(fcast3) +
  labs(title = "US consumption", y = "% change")

 #8

###8 seems like there isnt a big difference with either. I think there must be a problem because the forecasts are unreasonably low for both. I do not trust it at all, there is no cycle or anything in my forecast.

Exercise 2(comments on bottom. )

data <- readxl::read_excel("//Users//mattmullis//Downloads//M3_exam1.xlsx")
view(data)
data1 <-data %>%  filter(StudentName== "Matt")
#filtering/cleaning everything. I could have done this better. 
data2 <- data1 %>% 
  select(-Series) %>% 
  select(-N) %>% 
  select(-NF) %>% 
  select(-Category) %>% 
  select(-StartingYear) %>% 
  select(-StartingQuarter)
data2
## # A tibble: 1 × 73
##   StudentName   `1`   `2`   `3`   `4`   `5`   `6`   `7`   `8`   `9`  `10`  `11`
##   <chr>       <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 Matt         2820  2360  2900  3980  3820  6280  5220  4440  1220  1840  1860
## # … with 61 more variables: `12` <dbl>, `13` <dbl>, `14` <dbl>, `15` <dbl>,
## #   `16` <dbl>, `17` <dbl>, `18` <dbl>, `19` <dbl>, `20` <dbl>, `21` <dbl>,
## #   `22` <dbl>, `23` <dbl>, `24` <dbl>, `25` <dbl>, `26` <dbl>, `27` <dbl>,
## #   `28` <dbl>, `29` <dbl>, `30` <dbl>, `31` <dbl>, `32` <dbl>, `33` <dbl>,
## #   `34` <dbl>, `35` <dbl>, `36` <dbl>, `37` <dbl>, `38` <dbl>, `39` <dbl>,
## #   `40` <dbl>, `41` <dbl>, `42` <dbl>, `43` <dbl>, `44` <dbl>, `45` <dbl>,
## #   `46` <dbl>, `47` <dbl>, `48` <dbl>, `49` <dbl>, `50` <dbl>, `51` <dbl>, …
view(data2)
data3 <- data2 %>% 
pivot_longer(-StudentName) %>% 
  na.omit %>% 
  mutate(Date = yearquarter(1:52)) %>% 
  as_tsibble(index=Date)
view(data3)
autoplot(data3)
## Plot variable not specified, automatically selected `.vars = value`

#the data doesn't need a transformation. 
#creating test and training sets. 
trdata3 <- data3 %>% 
  filter(year(Date)<=1980)
tedata3 <- data3 %>% 
  filter(year(Date)>1981)

data3add <- data3%>%
  model(classical_decomposition(value, type = "additive")) %>%
  components() %>%
  autoplot() +
  labs("add")
data3add
## Warning: Removed 2 row(s) containing missing values (geom_path).

data3mult <- data3 %>% 
  model(classical_decomposition(value,type="multiplicative")) %>% 
  components() %>% 
  autoplot()+
  labs("mult")
  data3mult
## Warning: Removed 2 row(s) containing missing values (geom_path).

  data3stl <- data3 %>% 
  model(STL(value ~ season(window = 4), robust = TRUE)) %>%
  components() %>% 
  autoplot() 
  data3stl

#I wasn't 100% sure which method to use, so I used all the decomp methods i know. There isn't too much difference in additive vs multiplicative. There is a lot of seasonality and an obvious trend. The remainder is almost a random walk, however it does mimic the data a little bit. For the STL, the seasonality increases as time increases. the remainder is a little more constant. all three have a big jump. 

  
#benchmarks:
  data3bench <- trdata3 %>% 
    model(Mean=MEAN(value),
          seasonal_naive=SNAIVE(value),
          drift=RW(value~drift()),
          naive=NAIVE(value)
          )
  bigone <- data3bench %>% 
    forecast(h=4)
  autoplot(bigone)+
    autolayer(data3)
## Plot variable not specified, automatically selected `.vars = value`

  accuracy(bigone,data3)
## # A tibble: 4 × 10
##   .model         .type     ME  RMSE   MAE    MPE  MAPE  MASE RMSSE    ACF1
##   <chr>          <chr>  <dbl> <dbl> <dbl>  <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 drift          Test   -427. 1014.  857. -12.5   19.3 0.365 0.364  0.0151
## 2 Mean           Test    643. 1159. 1023.   9.18  19.5 0.436 0.417  0.0470
## 3 naive          Test   -277. 1003.  797.  -9.66  17.8 0.340 0.361  0.0470
## 4 seasonal_naive Test  -1732. 1982. 1732. -35.5   35.5 0.738 0.713 -0.0354
### for the benchmarks, #These all are very high values. The RMSE is lowest for the Naive method, which makes sense because it is financial data. Next I will try forecasting using regression. 
  blah <- data3bench %>% 
    select(naive)
  augment(blah) %>% 
    features(.resid,ljung_box, lag=10,dof=0)
## # A tibble: 1 × 3
##   .model lb_stat lb_pvalue
##   <chr>    <dbl>     <dbl>
## 1 naive     12.7     0.242
  gg_tsresiduals(blah)
## Warning: Removed 1 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_point).
## Warning: Removed 1 rows containing non-finite values (stat_bin).

  #looks like it could be wn!!!

#high p value lets gooooo. this is not distinguishable from wn. 
tstrdata3 <- trdata3 %>% 
  model(TSLM(value~season()+trend()))
  fctstrdata3 <- tstrdata3 %>% forecast(h=4)
  fctstrdata3 %>% 
    autoplot(trdata3)

  accuracy(fctstrdata3,data3)
## # A tibble: 1 × 10
##   .model                .type     ME  RMSE   MAE   MPE  MAPE  MASE RMSSE    ACF1
##   <chr>                 <chr>  <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>   <dbl>
## 1 TSLM(value ~ season(… Test  -1889. 1938. 1889. -40.5  40.5 0.806 0.697 -0.0831
#very interesting how the regression data isn't as good as the naive... 
  
  gg_tsresiduals(tstrdata3)

#literally awful, the mean may be zero, but there is heteroskedasiticty and it doesnt follow a normal distribution. there is high autocorrelation.
  augment(tstrdata3) %>% 
    features(.resid,ljung_box,lag=10, dof=0)
## # A tibble: 1 × 3
##   .model                           lb_stat lb_pvalue
##   <chr>                              <dbl>     <dbl>
## 1 TSLM(value ~ season() + trend())    65.9  2.74e-10
#this is not white noise and my forecast is terrible. 

###naive method is the best because the residuals follow a random walk and it has the lowest accuracy scores!!!