## -- Attaching packages ---------------------------------------------------------------------------- tidyverse 1.2.1 --
## v ggplot2 2.2.1 v purrr 0.2.4
## v tibble 1.3.4 v dplyr 0.7.4
## v tidyr 0.7.2 v stringr 1.2.0
## v readr 1.1.1 v forcats 0.2.0
## -- Conflicts ------------------------------------------------------------------------------- tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
CanadianWorkHours <- read_csv("C:/Users/jake/Desktop/class/CanadianWorkHours.csv")
## Parsed with column specification:
## cols(
## Year = col_integer(),
## Hours = col_double()
## )
glimpse(CanadianWorkHours)
## Observations: 35
## Variables: 2
## $ Year <int> 1966, 1967, 1968, 1969, 1970, 1971, 1972, 1973, 1974, 19...
## $ Hours <dbl> 37.2, 37.0, 37.4, 37.5, 37.7, 37.7, 37.4, 37.2, 37.3, 37...
work.ts <- ts(CanadianWorkHours[,2] , start = c(1966) , frequency = 1)
autoplot(work.ts)
This is a yearly series whose plot does not display any resemblence of seasonality, so seasonal models are not in play.
work_lm <- tslm(work.ts ~ trend)
summary(work_lm)
##
## Call:
## tslm(formula = work.ts ~ trend)
##
## Residuals:
## Min 1Q Median 3Q Max
## -1.20457 -0.28761 0.04779 0.30210 1.23190
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 37.416134 0.175818 212.811 < 2e-16 ***
## trend -0.061373 0.008518 -7.205 2.93e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.509 on 33 degrees of freedom
## Multiple R-squared: 0.6113, Adjusted R-squared: 0.5996
## F-statistic: 51.91 on 1 and 33 DF, p-value: 2.928e-08
work_quad <- tslm(work.ts ~ trend + I(trend^2))
summary(work_quad)
##
## Call:
## tslm(formula = work.ts ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.94503 -0.20964 -0.01652 0.31862 0.60160
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 38.1644156 0.2176599 175.340 < 2e-16 ***
## trend -0.1827154 0.0278786 -6.554 2.21e-07 ***
## I(trend^2) 0.0033706 0.0007512 4.487 8.76e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.4049 on 32 degrees of freedom
## Multiple R-squared: 0.7614, Adjusted R-squared: 0.7465
## F-statistic: 51.07 on 2 and 32 DF, p-value: 1.1e-10
The quadratic model has a higher r squared value than the linear model. All quadratic terms in both the quadradic and linear models have significant P-Values.
yrange <- range(work.ts)
# Set up the plot
plot(c(1966, 2000), yrange, type="n", xlab="Year", ylab="Hours", bty="l", xaxt="n", yaxt="n")
# Add the time series air
lines(work.ts, bty="l")
# Add the x-axis
axis(1, at=seq(1966,2000,1), labels=format(seq(1966,2000,1)))
# Add the y-axis
axis(2, at=seq(34,38,0.5), labels=format(seq(34,38,0.5)), las=2)
# Add the linear trend model
lines(work_lm$fitted, col="red")
lines(work_quad$fitted, col="blue")
# Add a legend
legend(1979,37.5, c("work.ts", "Linear - ADJ RSQ:0.60", "Quadratic - ADJ RSQ: 0.75"), lty=c(1,1,1), col=c("black", "red", "blue"), bty="n" , title("Canadian Working Hours"))
The plot visually confirms what the r squared values tell us, that the quadratic model provies a superior fit to the linear model.
DeptStoreSales <- read_csv("C:/Users/jake/Desktop/class/DeptStoreSales.csv")
## Parsed with column specification:
## cols(
## Quarter = col_integer(),
## Sales = col_integer()
## )
DeptStoreSales
sales.ts <- ts(DeptStoreSales[,2] , start = c(0,1), frequency = 4)
autoplot(sales.ts)
To fit a regression based model that accounts for trend, we should take the logarithm of sales.
validlength <- 4
trainlength <- length(sales.ts)- validlength
salestrain <- window(sales.ts , end=c(0,trainlength))
salesvalid <- window(sales.ts , start=c(0,trainlength+1))
s_exp <- tslm( salestrain~trend + season, lambda = 0)
summary(s_exp)
##
## Call:
## tslm(formula = salestrain ~ trend + season, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.053524 -0.013199 -0.004527 0.014387 0.062681
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 10.748945 0.018725 574.057 < 2e-16 ***
## trend 0.011088 0.001295 8.561 3.70e-07 ***
## season2 0.024956 0.020764 1.202 0.248
## season3 0.165343 0.020884 7.917 9.79e-07 ***
## season4 0.433746 0.021084 20.572 2.10e-12 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.03277 on 15 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 7.63e+11 on 4 and 15 DF, p-value: < 2.2e-16
According to the model, Q2 sales are 1.03 time higher than Q1 sales.
fc1 <- forecast(s_exp , h=2)
fc1
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 5 Q1 58793.71 55790.19 61958.92 54090.84 63905.46
## 5 Q2 60951.51 57837.76 64232.89 56076.04 66250.87
autoplot(salestrain)+autolayer(s_exp$fit) +autolayer(fc1$mean)
autoplot(s_exp$residuals)
The plot of residuals over time suggest that our model with under-forecast the real values of sales. The plot also suggest that the Q22 forecast will be more accurate than the Q21 forecast.
The residual plot tells us that the trend is not captured very well by the model, because there is a pattern to the residuals and the forecast is behind the trend of the actual series.
resid_quad <- tslm(s_exp$residuals ~ trend + I(trend^2))
summary(resid_quad)
##
## Call:
## tslm(formula = s_exp$residuals ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.027018 -0.013586 -0.001159 0.013387 0.022401
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.0618489 0.0124722 4.959 0.000119 ***
## trend -0.0168679 0.0027353 -6.167 1.03e-05 ***
## I(trend^2) 0.0008032 0.0001265 6.349 7.27e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.01676 on 17 degrees of freedom
## Multiple R-squared: 0.7033, Adjusted R-squared: 0.6684
## F-statistic: 20.15 on 2 and 17 DF, p-value: 3.268e-05
sales_quad <- tslm(salestrain ~ trend + I(trend^2))
summary(sales_quad)
##
## Call:
## tslm(formula = salestrain ~ trend + I(trend^2))
##
## Residuals:
## Min 1Q Median 3Q Max
## -13567 -7733 -4244 4896 20705
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 56462.70 9097.26 6.207 9.57e-06 ***
## trend -370.83 1995.17 -0.186 0.855
## I(trend^2) 68.56 92.29 0.743 0.468
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 12230 on 17 degrees of freedom
## Multiple R-squared: 0.2489, Adjusted R-squared: 0.1605
## F-statistic: 2.817 on 2 and 17 DF, p-value: 0.08779
Fitting quadratic trend model the the residuals would be a better way to improve fit.
AustralianWines <- read_csv("C:/Users/jake/Desktop/class/AustralianWines.csv")
## Parsed with column specification:
## cols(
## Month = col_character(),
## Fortified = col_integer(),
## Red = col_integer(),
## Rose = col_character(),
## sparkling = col_integer(),
## `Sweet white` = col_integer(),
## `Dry white` = col_integer()
## )
glimpse(AustralianWines)
## Observations: 188
## Variables: 7
## $ Month <chr> "Jan-80", "Feb-80", "Mar-80", "Apr-80", "May-80"...
## $ Fortified <int> 2585, 3368, 3210, 3111, 3756, 4216, 5225, 4426, ...
## $ Red <int> 464, 675, 703, 887, 1139, 1077, 1318, 1260, 1120...
## $ Rose <chr> "112", "118", "129", "99", "116", "168", "118", ...
## $ sparkling <int> 1686, 1591, 2304, 1712, 1471, 1377, 1966, 2453, ...
## $ `Sweet white` <int> 85, 89, 109, 95, 91, 95, 96, 128, 124, 111, 178,...
## $ `Dry white` <int> 1954, 2302, 3054, 2414, 2226, 2725, 2589, 3470, ...
wineTS <- ts(AustralianWines[,2:7 ] , start = c(1980 , 1) , end = c(1994 , 12), frequency = 12)
## Warning in data.matrix(data): NAs introduced by coercion
autoplot(wineTS)
If I had to choose the model for all of the series I would choose a regression model which accounted for seasonality and trend. I may even want to give each series a log transformation as an attempt to stabilize the variance.
fort_TS <- ts(AustralianWines[, 2] , start = c(1980 , 1) , end = c(1994 , 12), frequency = 12)
validLength <- 12
trainLength <- length(fort_TS) - validLength
fortTrain <- window(fort_TS, end=c(1980,trainLength))
fortValid <- window(fort_TS, start=c(1980,trainLength+1) , end = c(1994,2))
BoxCox.lambda(fortTrain)
## [1] 0.02277637
lm_t_s <- tslm(fortTrain ~ trend + season , lambda = 0)
summary(lm_t_s)
##
## Call:
## tslm(formula = fortTrain ~ trend + season, lambda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.179573 -0.062128 -0.001094 0.064548 0.232966
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7.7802102 0.0256763 303.012 < 2e-16 ***
## trend -0.0034963 0.0001386 -25.227 < 2e-16 ***
## season2 0.1717197 0.0328450 5.228 5.46e-07 ***
## season3 0.3610223 0.0328459 10.991 < 2e-16 ***
## season4 0.4560481 0.0328473 13.884 < 2e-16 ***
## season5 0.6369148 0.0328494 19.389 < 2e-16 ***
## season6 0.6568734 0.0328520 19.995 < 2e-16 ***
## season7 0.8461903 0.0328552 25.755 < 2e-16 ***
## season8 0.7580526 0.0328590 23.070 < 2e-16 ***
## season9 0.4728853 0.0328634 14.389 < 2e-16 ***
## season10 0.4027478 0.0328684 12.253 < 2e-16 ***
## season11 0.5681648 0.0328739 17.283 < 2e-16 ***
## season12 0.6368115 0.0328801 19.368 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0869 on 155 degrees of freedom
## Multiple R-squared: 1, Adjusted R-squared: 1
## F-statistic: 1.281e+09 on 12 and 155 DF, p-value: < 2.2e-16
autoplot(fortTrain) + xlab("Year") + autolayer(lm_t_s$fitted)
The model appears to be a nice fit. R Square is 1.
fc_fort <- forecast(lm_t_s, h = 2)
accuracy(fc_fort , fortValid)
## ME RMSE MAE MPE MAPE MASE
## Training set 9.57250 277.2694 212.96781 -0.3473992 6.811106 0.7653653
## Test set -85.60303 121.0659 85.60996 -7.4180196 7.418461 0.3076657
## ACF1 Theil's U
## Training set 0.05621871 NA
## Test set -0.50000000 1.673107e-05
autoplot(fortTrain , series = "Train Series" ) + autolayer(lm_t_s$fitted , series = "Fitted") + autolayer(fc_fort, PI=FALSE, series="Forecast") + autolayer(fortValid , series = "Actual") + labs(title = "Fortified Wine")