# Prepare needed packages
packages <- c("psych", "tidyr", "ggplot2",
                "caret", "dplyr", "car", "fpp",
                "forecast","ResourceSelection",
                "ggpubr", "scales", "stringr",
                "gridExtra", "pROC", "reshape2", 
                "corrplot","glmnet", "plyr",
                "knitr", "purrr", "readxl", "seasonal", 
                "fpp2")
  for (i in 1:length(packages)) {
    if (!packages[i] %in% rownames(installed.packages())) {
      install.packages(packages[i])
    }
    library(packages[i], character.only = TRUE) # Loads package in
  }
## 
## Attaching package: 'ggplot2'
## The following objects are masked from 'package:psych':
## 
##     %+%, alpha
## Loading required package: lattice
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## Loading required package: carData
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
## The following object is masked from 'package:psych':
## 
##     logit
## Loading required package: forecast
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
## Loading required package: fma
## Loading required package: expsmooth
## Loading required package: lmtest
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
## Loading required package: tseries
## ResourceSelection 0.3-5   2019-07-22
## 
## Attaching package: 'ggpubr'
## The following object is masked from 'package:forecast':
## 
##     gghistogram
## 
## Attaching package: 'scales'
## The following objects are masked from 'package:psych':
## 
##     alpha, rescale
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
## Type 'citation("pROC")' for a citation.
## 
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
## 
##     cov, smooth, var
## 
## Attaching package: 'reshape2'
## The following object is masked from 'package:tidyr':
## 
##     smiths
## corrplot 0.84 loaded
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## The following objects are masked from 'package:tidyr':
## 
##     expand, pack, unpack
## Loaded glmnet 4.1-1
## ------------------------------------------------------------------------------
## You have loaded plyr after dplyr - this is likely to cause problems.
## If you need functions from both plyr and dplyr, please load plyr first, then dplyr:
## library(plyr); library(dplyr)
## ------------------------------------------------------------------------------
## 
## Attaching package: 'plyr'
## The following object is masked from 'package:ggpubr':
## 
##     mutate
## The following object is masked from 'package:fma':
## 
##     ozone
## The following objects are masked from 'package:dplyr':
## 
##     arrange, count, desc, failwith, id, mutate, rename, summarise,
##     summarize
## 
## Attaching package: 'purrr'
## The following object is masked from 'package:plyr':
## 
##     compact
## The following object is masked from 'package:scales':
## 
##     discard
## The following object is masked from 'package:car':
## 
##     some
## The following object is masked from 'package:caret':
## 
##     lift
## 
## Attaching package: 'seasonal'
## The following object is masked from 'package:psych':
## 
##     outlier
## 
## Attaching package: 'fpp2'
## The following objects are masked from 'package:fpp':
## 
##     ausair, ausbeer, austa, austourists, debitcards, departures,
##     elecequip, euretail, guinearice, oil, sunspotarea, usmelec
# Remove unused objects
rm(packages)
rm(i)

# Set work directory
setwd("~/Desktop/Predictive:Forecasting/Week2")

Chapter 5

1.

#(a)  
head(elecdaily)
## Time Series:
## Start = c(1, 4) 
## End = c(2, 2) 
## Frequency = 7 
##            Demand WorkDay Temperature
## 1.428571 174.8963       0        26.0
## 1.571429 188.5909       1        23.0
## 1.714286 188.9169       1        22.2
## 1.857143 173.8142       0        20.3
## 2.000000 169.5152       0        26.1
## 2.142857 195.7288       1        19.6
daily20 <- head(elecdaily,20)

model <- lm(Demand~Temperature, elecdaily)
summary(model)
## 
## Call:
## lm(formula = Demand ~ Temperature, data = elecdaily)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -55.474 -15.719  -0.336  15.767 117.184 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 212.3856     5.0080  42.409   <2e-16 ***
## Temperature   0.4182     0.2263   1.848   0.0654 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 26.55 on 363 degrees of freedom
## Multiple R-squared:  0.009322,   Adjusted R-squared:  0.006592 
## F-statistic: 3.416 on 1 and 363 DF,  p-value: 0.0654
## The positive relationship is because the higher the temperature, the more likely to use air conditioning so that the demand of electricity will be higher as well. 

#(b)
plot(model)

## There are a few outliers that locate around the fitter value 230, which could influence the fitted curve. Furthermore, most data points are not randomly dispersed, and they show an u-shape pattern which indicates a nonlinear model may be more appropriate. 

#(c)
tslm_elec <- tslm(Demand~Temperature, daily20)
newdata <- data.frame(Temperature=c(15,35))
fcast_model = forecast(tslm_elec, newdata)
fcast_model 
##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 4.285714       140.5701 108.6810 172.4591  90.21166 190.9285
## 4.428571       275.7146 245.2278 306.2014 227.57056 323.8586
#(d)
autoplot(daily20, facets=TRUE)

daily20 %>%
  as.data.frame() %>%
  ggplot(aes(x=Temperature, y=Demand)) +
  geom_point() +
  geom_smooth(method="lm", se=FALSE)
## `geom_smooth()` using formula 'y ~ x'

fit <- tslm(Demand ~ Temperature, data=daily20)
checkresiduals(fit)

## 
##  Breusch-Godfrey test for serial correlation of order up to 5
## 
## data:  Residuals from Linear regression model
## LM test = 3.8079, df = 5, p-value = 0.5774
forecast(fit, newdata=data.frame(Temperature=c(15,35)))
##          Point Forecast    Lo 80    Hi 80     Lo 95    Hi 95
## 4.285714       140.5701 108.6810 172.4591  90.21166 190.9285
## 4.428571       275.7146 245.2278 306.2014 227.57056 323.8586
fcast_model$lower[,1] 
## Time Series:
## Start = c(4, 3) 
## End = c(4, 4) 
## Frequency = 7 
## [1] 108.6810 245.2278
fcast_model$upper[,1]
## Time Series:
## Start = c(4, 3) 
## End = c(4, 4) 
## Frequency = 7 
## [1] 172.4591 306.2014
fcast_model$lower[,2] 
## Time Series:
## Start = c(4, 3) 
## End = c(4, 4) 
## Frequency = 7 
## [1]  90.21166 227.57056
fcast_model$upper[,2]
## Time Series:
## Start = c(4, 3) 
## End = c(4, 4) 
## Frequency = 7 
## [1] 190.9285 323.8586
#(e)
plot(Demand ~ Temperature, elecdaily, main="Demand vs Temperature (all data)")

## Looks like the model would not be a good fit to the data with all data points. 

2.

#(a)
mens400
## Time Series:
## Start = 1896 
## End = 2016 
## Frequency = 0.25 
##  [1] 54.20 49.40 49.20 50.00 48.20    NA 49.60 47.60 47.80 46.20 46.50    NA
## [13]    NA 46.20 45.90 46.70 44.90 45.10 43.80 44.66 44.26 44.60 44.27 43.87
## [25] 43.50 43.49 43.84 44.00 43.75 43.94 43.03
autoplot(mens400)+
  ggtitle("Monthly Sales of Product A (Plastics Manufacturer) ")+
  theme(plot.title = element_text(hjust = 0.5,face = "bold"))

## First of all, we can see that there are two places where the curve was not connected, it is because of the missing data at that time periods. Second of all, we see that the winning time have gone gradually smaller and smaller as the time increases. 

#(b)
time400 <- time(mens400)
mens_tslm <- tslm(mens400 ~ time400, mens400)

mens_tslm$coefficients
##  (Intercept)      time400 
## 172.48147736  -0.06457385
## The winning time is decreasing at 0.06457 second per year. 

#(c)
mens_lm <- lm(mens400 ~ time400, mens400)
plot(mens_lm)

## The residual plot shows that the model fits the data well. 

#(d)
timemens = time(mens400)
mens400_lm = lm(mens400 ~ timemens, mens400, na.action=na.exclude)
fcast_mens = forecast(mens400_lm, newdata=data.frame(timemens=2020))
fcast_mens
##   Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1       42.04231 40.44975 43.63487 39.55286 44.53176
## Assumption is that the residuals are normally distributed. At 80% confidence interval, the forecast is range from 40.44975 to 43.63487. At 95% confidence interval, the forecast is range from 39.55286 to 44.53176. 

3.

easter(ausbeer)
##      Qtr1 Qtr2 Qtr3 Qtr4
## 1956 0.67 0.33 0.00 0.00
## 1957 0.00 1.00 0.00 0.00
## 1958 0.00 1.00 0.00 0.00
## 1959 1.00 0.00 0.00 0.00
## 1960 0.00 1.00 0.00 0.00
## 1961 0.33 0.67 0.00 0.00
## 1962 0.00 1.00 0.00 0.00
## 1963 0.00 1.00 0.00 0.00
## 1964 1.00 0.00 0.00 0.00
## 1965 0.00 1.00 0.00 0.00
## 1966 0.00 1.00 0.00 0.00
## 1967 1.00 0.00 0.00 0.00
## 1968 0.00 1.00 0.00 0.00
## 1969 0.00 1.00 0.00 0.00
## 1970 1.00 0.00 0.00 0.00
## 1971 0.00 1.00 0.00 0.00
## 1972 0.33 0.67 0.00 0.00
## 1973 0.00 1.00 0.00 0.00
## 1974 0.00 1.00 0.00 0.00
## 1975 1.00 0.00 0.00 0.00
## 1976 0.00 1.00 0.00 0.00
## 1977 0.00 1.00 0.00 0.00
## 1978 1.00 0.00 0.00 0.00
## 1979 0.00 1.00 0.00 0.00
## 1980 0.00 1.00 0.00 0.00
## 1981 0.00 1.00 0.00 0.00
## 1982 0.00 1.00 0.00 0.00
## 1983 0.00 1.00 0.00 0.00
## 1984 0.00 1.00 0.00 0.00
## 1985 0.00 1.00 0.00 0.00
## 1986 1.00 0.00 0.00 0.00
## 1987 0.00 1.00 0.00 0.00
## 1988 0.00 1.00 0.00 0.00
## 1989 1.00 0.00 0.00 0.00
## 1990 0.00 1.00 0.00 0.00
## 1991 1.00 0.00 0.00 0.00
## 1992 0.00 1.00 0.00 0.00
## 1993 0.00 1.00 0.00 0.00
## 1994 0.00 1.00 0.00 0.00
## 1995 0.00 1.00 0.00 0.00
## 1996 0.00 1.00 0.00 0.00
## 1997 1.00 0.00 0.00 0.00
## 1998 0.00 1.00 0.00 0.00
## 1999 0.00 1.00 0.00 0.00
## 2000 0.00 1.00 0.00 0.00
## 2001 0.00 1.00 0.00 0.00
## 2002 1.00 0.00 0.00 0.00
## 2003 0.00 1.00 0.00 0.00
## 2004 0.00 1.00 0.00 0.00
## 2005 1.00 0.00 0.00 0.00
## 2006 0.00 1.00 0.00 0.00
## 2007 0.00 1.00 0.00 0.00
## 2008 1.00 0.00 0.00 0.00
## 2009 0.00 1.00 0.00 0.00
## 2010 0.00 1.00
## There are many 1s and 0s, so it may have a meaning of indication. The easter() will return a vector of 0s or 1s if Easter is on March or April in a particular year. 

4.

# log(y) = ß0 + ß1log(x) + e
# ß1log(x) = log(y) - ß0 - e
# ß1 = (log(y) - ß0 - e) / log(x) 

5.

#(a)
autoplot(fancy)

## The graph has a seasonality with a frequency of 1 year. The sales volume was increasing gradually from 1987 to 1990. However, there is large increase from 1992 to 1993. 

#(b)
hist(fancy)

## This data is skewed to the right; thus, an log transformation is necessary to transform the skewed data to normality. 

#(c)
time <- time(fancy)
surfing_festival <- c()

for(i in 1:length(time)){
  month <- round(12*(time[i] - floor(time[i]))) + 1
  year <- floor(time[i])
  if(year >= 1988 & month == 3){
    surfing_festival[i] <- 1
  } else {
    surfing_festival[i] <- 0
  }
}

tslm_fancy <- tslm(BoxCox(fancy, 0) ~ trend + season + surfing_festival)
summary(tslm_fancy)
## 
## Call:
## tslm(formula = BoxCox(fancy, 0) ~ trend + season + surfing_festival)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33673 -0.12757  0.00257  0.10911  0.37671 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.6196670  0.0742471 102.626  < 2e-16 ***
## trend            0.0220198  0.0008268  26.634  < 2e-16 ***
## season2          0.2514168  0.0956790   2.628 0.010555 *  
## season3          0.2660828  0.1934044   1.376 0.173275    
## season4          0.3840535  0.0957075   4.013 0.000148 ***
## season5          0.4094870  0.0957325   4.277 5.88e-05 ***
## season6          0.4488283  0.0957647   4.687 1.33e-05 ***
## season7          0.6104545  0.0958039   6.372 1.71e-08 ***
## season8          0.5879644  0.0958503   6.134 4.53e-08 ***
## season9          0.6693299  0.0959037   6.979 1.36e-09 ***
## season10         0.7473919  0.0959643   7.788 4.48e-11 ***
## season11         1.2067479  0.0960319  12.566  < 2e-16 ***
## season12         1.9622412  0.0961066  20.417  < 2e-16 ***
## surfing_festival 0.5015151  0.1964273   2.553 0.012856 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared:  0.9567, Adjusted R-squared:  0.9487 
## F-statistic:   119 on 13 and 70 DF,  p-value: < 2.2e-16
#(d)
autoplot(tslm_fancy$residuals)+ ylab("Residuals") +
  ggtitle("Residuals vs Time")+
  theme(plot.title = element_text(hjust = 0.5,face = "bold"))

cbind(residuals = tslm_fancy$residuals,
      fitted.values = tslm_fancy$fitted.values) %>% as.data.frame() %>%
  ggplot(aes(x = fitted.values,
             y = residuals)) + geom_point() +
  ggtitle("Residuals vs Time")+
  theme(plot.title = element_text(hjust = 0.5,face = "bold"))

## The data points are separated out as the residuals increase, so heteroscedasticity may be a problem here. 

#(e)
cbind.data.frame(
  month = factor(
    month.abb[round(12*(time - floor(time)) + 1)],
    labels = month.abb,
    ordered = TRUE
  ),
  residuals = tslm_fancy$residuals
) %>%
  ggplot(aes(x = month,
             y = residuals)) +
  geom_boxplot()
## Don't know how to automatically pick scale for object of type ts. Defaulting to continuous.

#(f)
summary(tslm_fancy)
## 
## Call:
## tslm(formula = BoxCox(fancy, 0) ~ trend + season + surfing_festival)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.33673 -0.12757  0.00257  0.10911  0.37671 
## 
## Coefficients:
##                   Estimate Std. Error t value Pr(>|t|)    
## (Intercept)      7.6196670  0.0742471 102.626  < 2e-16 ***
## trend            0.0220198  0.0008268  26.634  < 2e-16 ***
## season2          0.2514168  0.0956790   2.628 0.010555 *  
## season3          0.2660828  0.1934044   1.376 0.173275    
## season4          0.3840535  0.0957075   4.013 0.000148 ***
## season5          0.4094870  0.0957325   4.277 5.88e-05 ***
## season6          0.4488283  0.0957647   4.687 1.33e-05 ***
## season7          0.6104545  0.0958039   6.372 1.71e-08 ***
## season8          0.5879644  0.0958503   6.134 4.53e-08 ***
## season9          0.6693299  0.0959037   6.979 1.36e-09 ***
## season10         0.7473919  0.0959643   7.788 4.48e-11 ***
## season11         1.2067479  0.0960319  12.566  < 2e-16 ***
## season12         1.9622412  0.0961066  20.417  < 2e-16 ***
## surfing_festival 0.5015151  0.1964273   2.553 0.012856 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.179 on 70 degrees of freedom
## Multiple R-squared:  0.9567, Adjusted R-squared:  0.9487 
## F-statistic:   119 on 13 and 70 DF,  p-value: < 2.2e-16
## All coefficients are positive which suggest a positive relationship with time. 

#(g)
checkresiduals(tslm_fancy)

## 
##  Breusch-Godfrey test for serial correlation of order up to 17
## 
## data:  Residuals from Linear regression model
## LM test = 37.954, df = 17, p-value = 0.002494
## The test tells us that the autocorrelation problem has occurred in the residuals. 

#(h)
fancy_time = ts(data=time, start=1994, end=c(1996,12), frequency=12)
forecast(tslm_fancy, fancy_time)
## Warning in forecast.lm(tslm_fancy, fancy_time): newdata column names not
## specified, defaulting to first variable required.
##          Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## Jan 1994       1006.002 501.0539 1510.950 227.5849 1784.419
## Feb 1994       1006.317 501.3479 1511.286 227.8675 1784.767
## Mar 1994       1006.396 501.6231 1511.168 228.2491 1784.542
## Apr 1994       1006.577 501.5658 1511.589 228.0624 1785.092
## May 1994       1006.667 501.6339 1511.699 228.1190 1785.214
## Jun 1994       1006.770 501.7159 1511.824 228.1896 1785.350
## Jul 1994       1006.995 501.9201 1512.070 228.3823 1785.608
## Aug 1994       1007.036 501.9403 1512.133 228.3910 1785.682
## Sep 1994       1007.182 502.0643 1512.299 228.5036 1785.860
## Oct 1994       1007.324 502.1850 1512.462 228.6128 1786.034
## Nov 1994       1007.847 502.6870 1513.006 229.1033 1786.590
## Dec 1994       1008.666 503.4851 1513.847 229.8900 1787.442
## Jan 1995       1006.768 501.5678 1511.967 227.9624 1785.573
## Feb 1995       1007.083 501.8618 1512.304 228.2450 1785.921
## Mar 1995       1007.161 502.1369 1512.186 228.6266 1785.696
## Apr 1995       1007.343 502.0797 1512.606 228.4399 1786.246
## May 1995       1007.432 502.1478 1512.717 228.4965 1786.368
## Jun 1995       1007.535 502.2298 1512.841 228.5670 1786.504
## Jul 1995       1007.761 502.4340 1513.088 228.7598 1786.762
## Aug 1995       1007.802 502.4542 1513.150 228.7685 1786.836
## Sep 1995       1007.947 502.5782 1513.317 228.8810 1787.014
## Oct 1995       1008.089 502.6989 1513.480 228.9903 1787.188
## Nov 1995       1008.612 503.2009 1514.024 229.4808 1787.744
## Dec 1995       1009.432 503.9990 1514.865 230.2674 1788.596
## Jan 1996       1007.533 502.0817 1512.985 228.3399 1786.727
## Feb 1996       1007.849 502.3757 1513.321 228.6225 1787.075
## Mar 1996       1007.927 502.6508 1513.203 229.0041 1786.850
## Apr 1996       1008.109 502.5936 1513.624 228.8174 1787.400
## May 1996       1008.198 502.6617 1513.734 228.8740 1787.522
## Jun 1996       1008.301 502.7437 1513.859 228.9445 1787.658
## Jul 1996       1008.527 502.9479 1514.105 229.1373 1787.916
## Aug 1996       1008.568 502.9681 1514.168 229.1460 1787.990
## Sep 1996       1008.713 503.0921 1514.334 229.2585 1788.168
## Oct 1996       1008.855 503.2128 1514.497 229.3678 1788.342
## Nov 1996       1009.378 503.7148 1515.042 229.8583 1788.898
## Dec 1996       1010.198 504.5129 1515.882 230.6449 1789.750
#(i)
trans.fancy = (BoxCox(fancy, 0))
forecast(trans.fancy)
##          Point Forecast     Lo 80     Hi 80     Lo 95     Hi 95
## Jan 1994       9.674641  9.464046  9.885235  9.352565  9.996717
## Feb 1994       9.958925  9.733371 10.184479  9.613970 10.303880
## Mar 1994      10.439733 10.200143 10.679323 10.073312 10.806155
## Apr 1994      10.121042  9.868185 10.373898  9.734331 10.507752
## May 1994      10.157291  9.891822 10.422759  9.751292 10.563289
## Jun 1994      10.227196  9.949682 10.504711  9.802774 10.651618
## Jul 1994      10.403289 10.114223 10.692356  9.961200 10.845378
## Aug 1994      10.382661 10.082480 10.682842  9.923574 10.841748
## Sep 1994      10.518849 10.207944 10.829754 10.043361 10.994337
## Oct 1994      10.612257 10.290980 10.933534 10.120906 11.103608
## Nov 1994      11.108401 10.777070 11.439732 10.601674 11.615129
## Dec 1994      11.898748 11.557653 12.239843 11.377088 12.420408
## Jan 1995       9.985303  9.634703 10.335903  9.449107 10.521499
## Feb 1995      10.269587  9.909735 10.629440  9.719240 10.819934
## Mar 1995      10.750396 10.381517 11.119274 10.186244 11.314547
## Apr 1995      10.431704 10.054009 10.809399  9.854070 11.009338
## May 1995      10.467953 10.081638 10.854268  9.877135 11.058770
## Jun 1995      10.537858 10.143107 10.932610  9.934137 11.141579
## Jul 1995      10.713952 10.310934 11.116969 10.097590 11.330314
## Aug 1995      10.693323 10.282201 11.104445 10.064567 11.322080
## Sep 1995      10.829511 10.410437 11.248586 10.188592 11.470430
## Oct 1995      10.922919 10.496035 11.349803 10.270057 11.575781
## Nov 1995      11.419063 10.984506 11.853621 10.754465 12.083661
## Dec 1995      12.209410 11.767308 12.651512 11.533273 12.885547
#(j)
## Since we have autocorrelation problem, it may be worth to use lambda function because it helps to better transform the data. 

6.

#(a)
head(gasoline)
## Time Series:
## Start = 1991.1 
## End = 1991.19582477755 
## Frequency = 52.1785714285714 
## [1] 6.621 6.433 6.582 7.224 6.875 6.947
gas_04 = window(gasoline, end=2004)
autoplot(gas_04)

for(num in c(1, 5, 10)){
  var_name <- paste("Fit",
                    as.character(num),
                    "_gasoline_2004",
                    sep = "")
  
  assign(var_name,
         tslm(gas_04 ~ trend + fourier(
           gas_04,
           K = num
         ))
  )
  print(
    autoplot(gas_04) +
      autolayer(get(var_name)$fitted.values,
                series = as.character(num)) +
      ggtitle(var_name) +
      ylab("gasoline") +
      guides(colour = guide_legend(title = "Number of Fourier Transform pairs"))+
      scale_colour_manual(values="blue")
  )
}

autoplot(gas_04) +
  autolayer(Fit1_gasoline_2004$fitted.values, series = "1") +
  autolayer(Fit10_gasoline_2004$fitted.values, series = "5") +
  autolayer(Fit10_gasoline_2004$fitted.values, series = "10") +
  guides(colour = guide_legend(title = "Fourier Transform pairs")) +
  scale_color_discrete(breaks = c(1, 5, 10))

## As we use more Fourier pairs is applied, the fitted line is more closely match to the original data. 

#(b)
for(i in c(1, 5, 10)){
  fit_gasoline_2004.name <- paste(
    "Fit", as.character(i), "_gasoline_2004",
    sep = ""
  )
  writeLines(
    paste(
      "\n", fit_gasoline_2004.name, "\n"
    )
  )
  print(CV(get(fit_gasoline_2004.name)))
}
## 
##  Fit1_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  8.493593e-02 -1.659745e+03 -1.659655e+03 -1.637179e+03  7.989587e-01 
## 
##  Fit5_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##  7.879969e-02 -1.710485e+03 -1.709933e+03 -1.651813e+03  8.157116e-01 
## 
##  Fit10_gasoline_2004 
## 
##            CV           AIC          AICc           BIC         AdjR2 
##     0.0748716 -1745.4585912 -1743.7601296 -1641.6542986     0.8275409
## The results suggest that 10 Fourier terms minimize the AIC. 

#(c)
checkresiduals(Fit10_gasoline_2004)

## 
##  Breusch-Godfrey test for serial correlation of order up to 104
## 
## data:  Residuals from Linear regression model
## LM test = 151.05, df = 104, p-value = 0.001781
#(d)
fc_05 <- forecast(Fit10_gasoline_2004, 
               newdata=data.frame(fourier(gas_04,
                                          K = 10,
                                          h = 52)))

#(e)
autoplot(fc_05) 

## The model prediction looks to be in line and displaying a similar seasonal pattern in the early year. 

7.

#(a)
head(huron)
## Time Series:
## Start = 1875 
## End = 1880 
## Frequency = 1 
## [1] 10.38 11.86 10.97 10.80  9.79 10.39
autoplot(huron)

## The plot shows that there was an decreasing trend from 1880 to 1986. Furthermore, it is difficult to tell if there is a seasonality as the pattern are extremely inconsistent. 

#(b)
tslm_huron <- tslm(huron ~ trend)

t.huron <- time(huron)
t.break <- 1915
t.piece <- ts(pmax(0,t.huron-t.break), start=1875)
pw_huron <- tslm(huron ~ t.huron + t.piece)
summary(pw_huron)
## 
## Call:
## tslm(formula = huron ~ t.huron + t.piece)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -2.49626 -0.66240 -0.07139  0.85163  2.39222 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 132.90870   19.97687   6.653 1.82e-09 ***
## t.huron      -0.06498    0.01051  -6.181 1.58e-08 ***
## t.piece       0.06486    0.01563   4.150 7.26e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 1.045 on 95 degrees of freedom
## Multiple R-squared:  0.3841, Adjusted R-squared:  0.3711 
## F-statistic: 29.62 on 2 and 95 DF,  p-value: 1.004e-10
#(c)
fcast_huron <- tslm(huron ~ trend)
forecast(fcast_huron, h=8)
##      Point Forecast    Lo 80    Hi 80    Lo 95     Hi 95
## 1973       7.806127 6.317648 9.294605 5.516501 10.095752
## 1974       7.781926 6.292536 9.271315 5.490899 10.072952
## 1975       7.757724 6.267406 9.248043 5.465269 10.050179
## 1976       7.733523 6.242259 9.224788 5.439613 10.027434
## 1977       7.709322 6.217094 9.201550 5.413929 10.004715
## 1978       7.685121 6.191912 9.178331 5.388219  9.982024
## 1979       7.660920 6.166712 9.155128 5.362481  9.959359
## 1980       7.636719 6.141494 9.131943 5.336717  9.936721
forecast(t.piece, h=8)
##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1973             58 57.86560 58.13440 57.79446 58.20554
## 1974             59 58.70969 59.29031 58.55602 59.44398
## 1975             60 59.51703 60.48297 59.26136 60.73864
## 1976             61 60.29422 61.70578 59.92061 62.07939
## 1977             62 61.04503 62.95497 60.53950 63.46050
## 1978             63 61.77204 64.22796 61.12199 64.87801
## 1979             64 62.47716 65.52284 61.67102 66.32898
## 1980             65 63.16194 66.83806 62.18893 67.81107
## In the linear model, the trend is decreasing. On the other hand, piecewise model is increasing. 

Chapter 6

1.

#weights = c(0.067, 0.133, 0.200, 0.200, 0.200, 0.133, 0.067)

#3x5MA = [[(y1+y2+y3+y4+y5)/5] + [(y2+y3+y4+y5+y6)/5] + [(y3+y4+y5+y6+y7)/5]]/3 
#      = (1/15)y1+ (2/15)y2 + (3/15)y3 + (3/15)y4 + (3/15)y5 + (2/15)y6 +(1/15)y7 

2. (a)

plas.ts <- ts(plastics, frequency = 12)
autoplot(plas.ts) +
  ggtitle("Monthly Sales of Product A (Plastics Manufacturer) ")+
  theme(plot.title = element_text(hjust = 0.5,face = "bold"))

## By looking at the graph, there is indeed a seasonality, and it looks that it has an upward or increase tendency. 

2. (b)

plas.ts %>% decompose(type="multiplicative") %>%
  autoplot() + xlab("Time") +
  ggtitle("Decomposition of Multiplicative Time Series")+
  theme(plot.title = element_text(hjust = 0.5,face = "bold"))

2. (c)

## The multiplicative decomposition shows that it supports the first initial interpretation from part a as the trend presents an increasing pattern and a seasonality. However, we see that there is a decrease in trend-cycle around 5.1 years. 

2. (d)

fit <- plastics %>%
  decompose(type="multiplicative")

autoplot(plastics, series="Data") +
  autolayer(seasadj(fit), series="Seasonally Adjusted") +
  xlab("Year") + 
  ggtitle("Seasonal Adjusted ") +
  scale_colour_manual(values=c("gray","red"),
             breaks=c("Data","Seasonally Adjusted"))

2. (e)

plst.add <- plastics
plst.add[20] <- plst.add[20]+500

fit2 <- plst.add %>%
  decompose(type="multiplicative")

autoplot(plst.add, series="Data") +
  autolayer(seasadj(fit2), series="Seasonally Adjusted") +
  xlab("Year")+
  ggtitle("Seasonal Adjusted ") +
  scale_colour_manual(values=c("gray","blue"),
             breaks=c("Data","Seasonally Adjusted"))

## It shows that the outlier caused a sudden increase in the seasonally adjusted data. 

2. (f)

plst.add2 <- plastics
plst.add2[60] <- plst.add2[60]+500

fit3 <- plst.add2 %>%
  decompose(type="multiplicative")

autoplot(plst.add2, series="Data") +
  autolayer(seasadj(fit3), series="Seasonally Adjusted") +
  xlab("Year") +
  ggtitle("Seasonal Adjusted ") +
  scale_colour_manual(values=c("gray","blue"),
             breaks=c("Data","Seasonally Adjusted"))

## We add 500 to the very last observation. This graph is matching well in terms of the overall trend with the original data; therefore, it suggests that the condition is less severe than outlier in the middle of the time series. 

3.

setwd("~/Desktop/Predictive:Forecasting/Week2")
retail <- readxl::read_excel("retail.xlsx", skip=1)
myts <- ts(retail[,"A3349873A"],
  frequency=12, start=c(1982,4))

myts %>% seas(x11="") -> myts_x11
autoplot(myts_x11) +
  ggtitle("X11 Decomposition of Clothing Retail (New South Wales) ")+
    theme(plot.title = element_text(hjust = 0.5,face = "bold"))

myts %>% decompose(type="multiplicative") %>%
  autoplot()

## We can see that x11 method has captured some sudden falls in 2010 in the trend-cycle, which is better than previous classical method. In addition, we can see that there is a peak around 2001. This was not showed previously because the overall trend looks relatively consistent compared to x11 decomposition. 

4. (a)

## From the graph, we can see that seasonality seems to get stronger as time increases. Also, there may have some outliers around 1992 that caused a drop in the data. Furthermore, the breakdown of monthly seasonal component shows that March and December have larger labor force than other months. 

4. (b)

## We can clearly see that the 1991/1992 recession caused a fall in the remainder and data; hence, the recession is indeed visible in the estimated component. 

5. (a)

autoplot(cangas)

ggsubseriesplot(cangas)

ggseasonplot(cangas)

## the autoplot shows that overall it is increasing from 1960 to 2005. However, when looking at ggseasonplot, the warmer month such as June, July and August have relatively lower gas production compared to months that are colder. The reason may be that summer-blend fuel is more expensive to produce than winter-blend fuel. Therefore, the overall consumption is lower corresponding to the higher prices. 

5. (b)

cangas %>%
  stl(t.window=13, s.window="periodic", robust=TRUE) %>%
  autoplot()

5. (c)

#SEATS
cangas %>% seas() %>%
autoplot() +
  ggtitle("SEATS decomposition of Monthly Canadian Gas Production ")

#X11
cangas %>% seas(x11="") %>%
autoplot() +
  ggtitle("X11 decomposition of Monthly Canadian Gas Production")

## As we can see, two methods are very similar in terms of the trend-cycles and seasonal components. However, looking at the remainder, there are many variations in the SEATS method, and X11 is relatively better. 

6

#(a) 
bricksq%>%
  stl(t.window=13, s.window="periodic", robust=TRUE) %>%
  autoplot()

bricksq%>%
  stl(t.window=13, s.window=50, robust=TRUE) %>%
  autoplot()

#(b)
decom_brick <- bricksq %>%
  decompose(type="multiplicative")

autoplot(bricksq, series="Data") +
  autolayer(seasadj(decom_brick), series="Seasonally Adjusted") +
  xlab("Year")+
  ggtitle("Seasonal Adjusted ") +
  scale_colour_manual(values=c("gray","blue"),
             breaks=c("Data","Seasonally Adjusted"))

#(c)
focas <- stl(bricksq, t.window=13, s.window="periodic",
  robust=F)
focas %>% seasadj() %>% naive() %>%
  autoplot() + ylab("New orders index") +
  ggtitle("Naive forecasts of seasonally adjusted data")

#(d)
fcast <- stlf(bricksq)
fcast
##         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1994 Q4       467.0636 439.0835 495.0437 424.2718 509.8555
## 1995 Q1       426.0736 386.4842 465.6629 365.5268 486.6203
## 1995 Q2       484.0345 435.5220 532.5470 409.8411 558.2280
## 1995 Q3       493.9988 437.9514 550.0462 408.2817 579.7159
## 1995 Q4       467.0636 404.3669 529.7604 371.1772 562.9500
## 1996 Q1       426.0736 357.3555 494.7916 320.9784 531.1688
## 1996 Q2       484.0345 409.7703 558.2988 370.4571 597.6119
## 1996 Q3       493.9988 414.5638 573.4338 372.5134 615.4841
autoplot(fcast)

#(e)
checkresiduals((fcast))
## Warning in checkresiduals((fcast)): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 44.704, df = 6, p-value = 5.358e-08
## 
## Model df: 2.   Total lags used: 8
## the residuals are normal, and correlation has occurred slightly. 

#(f)
fcast1 <- stlf(bricksq, robust=TRUE)
fcast1
##         Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 1994 Q4       461.6172 432.2633 490.9710 416.7243 506.5100
## 1995 Q1       424.3612 382.8257 465.8967 360.8381 487.8842
## 1995 Q2       487.1456 436.2455 538.0458 409.3006 564.9907
## 1995 Q3       493.9985 435.1892 552.8078 404.0574 583.9396
## 1995 Q4       461.6172 395.8271 527.4072 360.9999 562.2344
## 1996 Q1       424.3612 352.2486 496.4738 314.0745 534.6479
## 1996 Q2       487.1456 409.2083 565.0829 367.9508 606.3404
## 1996 Q3       493.9985 410.6299 577.3671 366.4972 621.4997
autoplot(fcast1)

checkresiduals((fcast))
## Warning in checkresiduals((fcast)): The fitted degrees of freedom is based on
## the model used for the seasonally adjusted data.

## 
##  Ljung-Box test
## 
## data:  Residuals from STL +  ETS(M,N,N)
## Q* = 44.704, df = 6, p-value = 5.358e-08
## 
## Model df: 2.   Total lags used: 8
## The normality of residuals are being reduced. More importantly, we see autocorrelation on the residuals which the robust STL did not model the correlation between data points sufficient. 

#(g)
bric.train <-  window(bricksq, end=c(1992,3))
bric.test <- window(bricksq, start=c(1992,4), end= c(1994,3))

snbrick <- snaive(bric.train)
stlfbrick <- stlf(bric.train, robust=TRUE)

snbrick.fcast <- forecast(snbrick, h =8)
autoplot(snbrick.fcast)

stlfbrick.fcast <- forecast(stlfbrick, h =8)
autoplot(stlfbrick.fcast)

## It looks like that stlf() function has less variations in the forecast which perhaps gives a better prediction than snaive(). 

7

writing
##           Jan      Feb      Mar      Apr      May      Jun      Jul      Aug
## 1968  562.674  599.000  668.516  597.798  579.889  668.233  499.232  215.187
## 1969  634.712  639.283  712.182  621.557  621.000  675.989  501.322  220.286
## 1970  646.783  658.442  712.906  687.714  723.916  707.183  629.000  237.530
## 1971  676.155  748.183  810.681  729.363  701.108  790.079  594.621  230.716
## 1972  747.636  773.392  813.788  766.713  728.875  749.197  680.954  241.424
## 1973  795.337  788.421  889.968  797.393  751.000  821.255  691.605  290.655
## 1974  843.038  847.000  941.952  804.309  840.307  871.528  656.330  370.508
## 1975  778.139  856.075  938.833  813.023  783.417  828.110  657.311  310.032
## 1976  895.217  856.075  893.268  875.000  835.088  934.595  832.500  300.000
## 1977  875.024  992.968  976.804  968.697  871.675 1006.852  832.037  345.587
##           Sep      Oct      Nov      Dec
## 1968  555.813  586.935  546.136  571.111
## 1969  560.727  602.530  626.379  605.508
## 1970  613.296  730.444  734.925  651.812
## 1971  617.189  691.389  701.067  705.777
## 1972  680.234  708.326  694.238  772.071
## 1973  727.147  868.355  812.390  799.556
## 1974  742.000  847.152  731.675  898.527
## 1975  780.000  860.000  780.000  807.993
## 1976  791.443  900.000  781.729  880.000
## 1977  849.528  913.871  868.746  993.733
hist(writing) #not randomly distributed; thus, box-cox is used. 

autoplot(writing)

stlf(writing, method='rwdrift')
##          Point Forecast    Lo 80     Hi 80    Lo 95     Hi 95
## Jan 1978       991.5849 932.9189 1050.2509 901.8630 1081.3068
## Feb 1978      1026.5147 943.2006 1109.8288 899.0967 1153.9327
## Mar 1978      1083.0518 980.5889 1185.5146 926.3484 1239.7551
## Apr 1978      1013.4371 894.6353 1132.2389 831.7454 1195.1288
## May 1978       980.1886 846.8209 1113.5563 776.2203 1184.1569
## Jun 1978      1051.1446 904.4549 1197.8342 826.8021 1275.4871
## Jul 1978       902.0899 743.0094 1061.1705 658.7972 1145.3826
## Aug 1978       490.0146 319.2714  660.7578 228.8855  751.1438
## Sep 1978       939.0529 757.2352 1120.8706 660.9867 1217.1191
## Oct 1978      1029.1126 836.7068 1221.5184 734.8534 1323.3718
## Nov 1978       961.3387 758.7551 1163.9223 651.5138 1271.1636
## Dec 1978      1035.7402 823.3300 1248.1504 710.8868 1360.5935
## Jan 1979      1033.5921 811.6598 1255.5243 694.1760 1373.0081
## Feb 1979      1068.5219 837.3345 1299.7092 714.9513 1422.0924
## Mar 1979      1125.0589 884.8526 1365.2653 757.6950 1492.4229
## Apr 1979      1055.4443 806.4293 1304.4593 674.6087 1436.2798
## May 1979      1022.1958 764.5610 1279.8305 628.1774 1416.2141
## Jun 1979      1093.1517 827.0677 1359.2358 686.2114 1500.0921
## Jul 1979       944.0971 669.7185 1218.4756 524.4713 1363.7229
## Aug 1979       532.0218 249.4898  814.5538  99.9264  964.1172
## Sep 1979       981.0601 690.5039 1271.6163 536.6928 1425.4274
## Oct 1979      1071.1198 772.6582 1369.5814 614.6622 1527.5774
## Nov 1979      1003.3459 697.0885 1309.6032 534.9656 1471.7261
## Dec 1979      1077.7473 763.7956 1391.6991 597.5996 1557.8951
writ.box <- stlf(writing, method='rwdrift', s.window = "periodic", robust=TRUE, lambda = BoxCox.lambda(writing))
autoplot(writ.box)

8

fancy
##            Jan       Feb       Mar       Apr       May       Jun       Jul
## 1987   1664.81   2397.53   2840.71   3547.29   3752.96   3714.74   4349.61
## 1988   2499.81   5198.24   7225.14   4806.03   5900.88   4951.34   6179.12
## 1989   4717.02   5702.63   9957.58   5304.78   6492.43   6630.80   7349.62
## 1990   5921.10   5814.58  12421.25   6369.77   7609.12   7224.75   8121.22
## 1991   4826.64   6470.23   9638.77   8821.17   8722.37  10209.48  11276.55
## 1992   7615.03   9849.69  14558.40  11587.33   9332.56  13082.09  16732.78
## 1993  10243.24  11266.88  21826.84  17357.33  15997.79  18601.53  26155.15
##            Aug       Sep       Oct       Nov       Dec
## 1987   3566.34   5021.82   6423.48   7600.60  19756.21
## 1988   4752.15   5496.43   5835.10  12600.08  28541.72
## 1989   8176.62   8573.17   9690.50  15151.84  34061.01
## 1990   7979.25   8093.06   8476.70  17914.66  30114.41
## 1991  12552.22  11637.39  13606.89  21822.11  45060.69
## 1992  19888.61  23933.38  25391.35  36024.80  80721.71
## 1993  28586.52  30505.41  30821.33  46634.38 104660.67
hist(fancy) #Also not randomly distributed. 

autoplot(fancy)

stlf(fancy, method='rwdrift')
##          Point Forecast    Lo 80     Hi 80    Lo 95     Hi 95
## Jan 1994       60219.09 52848.76  67589.41 48947.14  71491.03
## Feb 1994       61810.73 51324.91  72296.55 45774.06  77847.41
## Mar 1994       67525.88 54607.21  80444.55 47768.48  87283.28
## Apr 1994       64392.56 49387.87  79397.25 41444.87  87340.25
## May 1994       64241.86 47368.86  81114.86 38436.83  90046.89
## Jun 1994       65746.22 47156.85  84335.59 37316.23  94176.21
## Jul 1994       68624.78 48432.20  88817.36 37742.90  99506.66
## Aug 1994       70067.94 48360.23  91775.65 36868.86 103267.01
## Sep 1994       71428.85 48276.79  94580.91 36020.82 106836.87
## Oct 1994       72613.64 48075.50  97151.78 35085.79 110141.49
## Nov 1994       82383.43 56508.12 108258.74 42810.56 121956.30
## Dec 1994      113329.03 86158.23 140499.82 71774.89 154883.16
## Jan 1995       68887.44 40457.16  97317.73 25407.07 112367.81
## Feb 1995       70479.09 40820.71 100137.46 25120.52 115837.66
## Mar 1995       76194.23 45335.42 107053.05 28999.75 123388.71
## Apr 1995       73060.91 41026.21 105095.62 24068.06 122053.77
## May 1995       72910.22 39721.55 106098.88 22152.53 123667.90
## Jun 1995       74414.57 40091.67 108737.47 21922.23 126906.92
## Jul 1995       77293.14 41853.83 112732.44 23093.39 131492.88
## Aug 1995       78736.29 42196.78 115275.81 22853.92 134618.66
## Sep 1995       80097.20 42472.25 117722.16 22554.80 137639.61
## Oct 1995       81282.00 42585.14 119978.86 22100.25 140463.74
## Nov 1995       91051.79 51295.45 130808.12 30249.72 151853.85
## Dec 1995      121997.38 81193.05 162801.71 59592.54 184402.22
fanc.box <- stlf(fancy, method='rwdrift',s.window = "periodic", robust=TRUE, lambda = BoxCox.lambda(fancy))
autoplot(fanc.box)