Jake

Chapter 6, #2, 4, and 5

## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --

## v ggplot2 2.2.1     v purrr   0.2.4
## v tibble  1.3.4     v dplyr   0.7.4
## v tidyr   0.7.2     v stringr 1.2.0
## v readr   1.1.1     v forcats 0.2.0

## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

2 Canadian working Hours

CanadianWorkHours <- read_csv("C:/Users/jake/Desktop/class/CanadianWorkHours.csv")

## Parsed with column specification:
## cols(
##   Year = col_integer(),
##   Hours = col_double()
## )

glimpse(CanadianWorkHours)

## Observations: 35
## Variables: 2
## $ Year  <int> 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 19...
## $ Hours <dbl> 37.2, 37.0, 37.4, 37.5, 37.7, 37.7, 37.4, 37.2, 37.3, 37...

work.ts <- ts(CanadianWorkHours[,2] , start = c(1966) , frequency = 1)
autoplot(work.ts)

This is a yearly series whose plot does not display any resemblence of seasonality, so seasonal models are not in play.

Linear trend model

work_lm <- tslm(work.ts ~ trend)
summary(work_lm)

## 
## Call:
## tslm(formula = work.ts ~ trend)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -1.20457 -0.28761  0.04779  0.30210  1.23190 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 37.416134   0.175818 212.811  < 2e-16 ***
## trend       -0.061373   0.008518  -7.205 2.93e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.509 on 33 degrees of freedom
## Multiple R-squared:  0.6113, Adjusted R-squared:  0.5996 
## F-statistic: 51.91 on 1 and 33 DF,  p-value: 2.928e-08

quadratic trend model

work_quad <- tslm(work.ts ~ trend + I(trend^2))
summary(work_quad)

## 
## Call:
## tslm(formula = work.ts ~ trend + I(trend^2))
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -0.94503 -0.20964 -0.01652  0.31862  0.60160 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 38.1644156  0.2176599 175.340  < 2e-16 ***
## trend       -0.1827154  0.0278786  -6.554 2.21e-07 ***
## I(trend^2)   0.0033706  0.0007512   4.487 8.76e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.4049 on 32 degrees of freedom
## Multiple R-squared:  0.7614, Adjusted R-squared:  0.7465 
## F-statistic: 51.07 on 2 and 32 DF,  p-value: 1.1e-10

The quadratic model has a higher r squared value than the linear model. All quadratic terms in both the quadradic and linear models have significant P-Values.

plot the series and fitted forecasts

yrange <- range(work.ts)

# Set up the plot
plot(c(1966, 2000), yrange, type="n", xlab="Year",  ylab="Hours", bty="l", xaxt="n", yaxt="n")

# Add the time series air
lines(work.ts, bty="l")

# Add the x-axis
axis(1, at=seq(1966,2000,1), labels=format(seq(1966,2000,1)))

# Add the y-axis
axis(2, at=seq(34,38,0.5), labels=format(seq(34,38,0.5)), las=2)

# Add the linear trend model
lines(work_lm$fitted, col="red")
lines(work_quad$fitted, col="blue")


# Add a legend
legend(1979,37.5, c("work.ts", "Linear - ADJ RSQ:0.60", "Quadratic - ADJ RSQ: 0.75"), lty=c(1,1,1), col=c("black", "red", "blue"), bty="n" , title("Canadian Working Hours"))

The plot visually confirms what the r squared values tell us, that the quadratic model provies a superior fit to the linear model.

4

DeptStoreSales <- read_csv("C:/Users/jake/Desktop/class/DeptStoreSales.csv")

## Parsed with column specification:
## cols(
##   Quarter = col_integer(),
##   Sales = col_integer()
## )

DeptStoreSales

sales.ts <- ts(DeptStoreSales[,2] , start = c(0,1), frequency = 4)
autoplot(sales.ts)

(a)

To fit a regression based model that accounts for trend, we should take the logarithm of sales.

(b) fit model with exponential trend and seasonality

validlength <- 4
trainlength <- length(sales.ts)- validlength

salestrain <- window(sales.ts , end=c(0,trainlength))
salesvalid <- window(sales.ts , start=c(0,trainlength+1))

s_exp <- tslm( salestrain~trend + season, lambda = 0)
summary(s_exp)

## 
## Call:
## tslm(formula = salestrain ~ trend + season, lambda = 0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.053524 -0.013199 -0.004527  0.014387  0.062681 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 10.748945   0.018725 574.057  < 2e-16 ***
## trend        0.011088   0.001295   8.561 3.70e-07 ***
## season2      0.024956   0.020764   1.202    0.248    
## season3      0.165343   0.020884   7.917 9.79e-07 ***
## season4      0.433746   0.021084  20.572 2.10e-12 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 7.63e+11 on 4 and 15 DF,  p-value: < 2.2e-16

According to the model, Q2 sales are 1.03 time higher than Q1 sales.

(d) forecast quarter 21 and 22

fc1 <- forecast(s_exp , h=2)
fc1

##      Point Forecast    Lo 80    Hi 80    Lo 95    Hi 95
## 5 Q1       58793.71 55790.19 61958.92 54090.84 63905.46
## 5 Q2       60951.51 57837.76 64232.89 56076.04 66250.87

(e) plots

autoplot(salestrain)+autolayer(s_exp$fit) +autolayer(fc1$mean)

autoplot(s_exp$residuals)

The plot of residuals over time suggest that our model with under-forecast the real values of sales. The plot also suggest that the Q22 forecast will be more accurate than the Q21 forecast.

(f)

The residual plot tells us that the trend is not captured very well by the model, because there is a pattern to the residuals and the forecast is behind the trend of the actual series.

(g)

resid_quad <- tslm(s_exp$residuals ~ trend + I(trend^2))
summary(resid_quad)

## 
## Call:
## tslm(formula = s_exp$residuals ~ trend + I(trend^2))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.027018 -0.013586 -0.001159  0.013387  0.022401 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  0.0618489  0.0124722   4.959 0.000119 ***
## trend       -0.0168679  0.0027353  -6.167 1.03e-05 ***
## I(trend^2)   0.0008032  0.0001265   6.349 7.27e-06 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.01676 on 17 degrees of freedom
## Multiple R-squared:  0.7033, Adjusted R-squared:  0.6684 
## F-statistic: 20.15 on 2 and 17 DF,  p-value: 3.268e-05

sales_quad <- tslm(salestrain ~ trend + I(trend^2))
summary(sales_quad)

## 
## Call:
## tslm(formula = salestrain ~ trend + I(trend^2))
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -13567  -7733  -4244   4896  20705 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 56462.70    9097.26   6.207 9.57e-06 ***
## trend        -370.83    1995.17  -0.186    0.855    
## I(trend^2)     68.56      92.29   0.743    0.468    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 12230 on 17 degrees of freedom
## Multiple R-squared:  0.2489, Adjusted R-squared:  0.1605 
## F-statistic: 2.817 on 2 and 17 DF,  p-value: 0.08779

Fitting quadratic trend model the the residuals would be a better way to improve fit.

6

AustralianWines <- read_csv("C:/Users/jake/Desktop/class/AustralianWines.csv")

## Parsed with column specification:
## cols(
##   Month = col_character(),
##   Fortified = col_integer(),
##   Red = col_integer(),
##   Rose = col_character(),
##   sparkling = col_integer(),
##   `Sweet white` = col_integer(),
##   `Dry white` = col_integer()
## )

glimpse(AustralianWines)

## Observations: 188
## Variables: 7
## $ Month         <chr> "Jan-80", "Feb-80", "Mar-80", "Apr-80", "May-80"...
## $ Fortified     <int> 2585, 3368, 3210, 3111, 3756, 4216, 5225, 4426, ...
## $ Red           <int> 464, 675, 703, 887, 1139, 1077, 1318, 1260, 1120...
## $ Rose          <chr> "112", "118", "129", "99", "116", "168", "118", ...
## $ sparkling     <int> 1686, 1591, 2304, 1712, 1471, 1377, 1966, 2453, ...
## $ `Sweet white` <int> 85, 89, 109, 95, 91, 95, 96, 128, 124, 111, 178,...
## $ `Dry white`   <int> 1954, 2302, 3054, 2414, 2226, 2725, 2589, 3470, ...

wineTS <- ts(AustralianWines[,2:7 ] , start = c(1980 , 1) , end = c(1994 , 12), frequency = 12)

## Warning in data.matrix(data): NAs introduced by coercion

autoplot(wineTS)

If I had to choose the model for all of the series I would choose a regression model which accounted for seasonality and trend. I may even want to give each series a log transformation as an attempt to stabilize the variance.

fort_TS <-  ts(AustralianWines[, 2] , start = c(1980 , 1) , end = c(1994 , 12), frequency = 12)

validLength <- 12
trainLength <- length(fort_TS) - validLength

fortTrain <- window(fort_TS, end=c(1980,trainLength))
fortValid <- window(fort_TS, start=c(1980,trainLength+1) , end = c(1994,2))



BoxCox.lambda(fortTrain)

## [1] 0.02277637

lm_t_s <- tslm(fortTrain ~ trend + season , lambda = 0)
summary(lm_t_s)

## 
## Call:
## tslm(formula = fortTrain ~ trend + season, lambda = 0)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -0.179573 -0.062128 -0.001094  0.064548  0.232966 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  7.7802102  0.0256763 303.012  < 2e-16 ***
## trend       -0.0034963  0.0001386 -25.227  < 2e-16 ***
## season2      0.1717197  0.0328450   5.228 5.46e-07 ***
## season3      0.3610223  0.0328459  10.991  < 2e-16 ***
## season4      0.4560481  0.0328473  13.884  < 2e-16 ***
## season5      0.6369148  0.0328494  19.389  < 2e-16 ***
## season6      0.6568734  0.0328520  19.995  < 2e-16 ***
## season7      0.8461903  0.0328552  25.755  < 2e-16 ***
## season8      0.7580526  0.0328590  23.070  < 2e-16 ***
## season9      0.4728853  0.0328634  14.389  < 2e-16 ***
## season10     0.4027478  0.0328684  12.253  < 2e-16 ***
## season11     0.5681648  0.0328739  17.283  < 2e-16 ***
## season12     0.6368115  0.0328801  19.368  < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.0869 on 155 degrees of freedom
## Multiple R-squared:      1,  Adjusted R-squared:      1 
## F-statistic: 1.281e+09 on 12 and 155 DF,  p-value: < 2.2e-16

autoplot(fortTrain) + xlab("Year") + autolayer(lm_t_s$fitted)

The model appears to be a nice fit. R Square is 1.

fc_fort <- forecast(lm_t_s,  h = 2)
accuracy(fc_fort , fortValid)

##                     ME     RMSE       MAE        MPE     MAPE      MASE
## Training set   9.57250 277.2694 212.96781 -0.3473992 6.811106 0.7653653
## Test set     -85.60303 121.0659  85.60996 -7.4180196 7.418461 0.3076657
##                     ACF1    Theil's U
## Training set  0.05621871           NA
## Test set     -0.50000000 1.673107e-05

autoplot(fortTrain , series = "Train Series" )  + autolayer(lm_t_s$fitted , series = "Fitted") + autolayer(fc_fort, PI=FALSE, series="Forecast") + autolayer(fortValid , series = "Actual") + labs(title = "Fortified Wine")