# install.packages("data.table")
library(data.table)
library(forecast)
library(knitr) # for kable table formatting
# workhours = read.csv("/Users/noname/Dropbox/Coursera/Forecasting - Time Series/Tsing Hua Time Series/Week5 Regression Models/CanadianWorkHours.csv")
workhours = read.csv("/Users/bruce/Dropbox/Coursera/Forecasting - Time Series/Tsing Hua Time Series/Week5 Regression Models/CanadianWorkHours.csv")
summary(workhours)
## Year Hours
## Min. :1966 Min. :34.80
## 1st Qu.:1974 1st Qu.:35.65
## Median :1983 Median :36.00
## Mean :1983 Mean :36.31
## 3rd Qu.:1992 3rd Qu.:37.10
## Max. :2000 Max. :37.70
workhours.ts = ts(workhours$Hours, start = 1966, frequency=1)
require(zoo)
require(ggplot2)
## Loading required package: ggplot2
library(scales)
# Plot linear regression with Splines
ggplot(workhours, aes(x=Year, y=Hours)) +
geom_point() +
geom_smooth(method = "lm", formula = y ~ splines::bs(x, 3)) +
ggtitle("Linear regression with Splines") +
ylab("Work Hours")
# Plot linear regression using Loess algo
ggplot(workhours, aes(x=Year, y=Hours)) +
geom_point() +
geom_smooth() +
ggtitle("Loess Algorithm") +
ylab("Work Hours")
Although I can fit some interesting plots such as regression with splines and loess fitting techniques, I am interpreting the rebound of workhours as a structural break with this data. This is because I have no reason to think that the pretty curves, which sort of indicate a possible long term cycle, have any meaning with this limited amount of data.
workhours.train = workhours[1:32, ]
workhours.test = workhours[workhours$Year>1997, ]
nvalid = 3 # number of years in validation set
workhours.post.struc.break.train = workhours.train[workhours.train$Year>1987, ]
workhours.post.struc.break.train.ts = ts(workhours.post.struc.break.train$Hours, start = 1988)
workhours.post.struc.break.train.ts.trend = tslm(workhours.post.struc.break.train.ts ~ trend, lamda = 0)
## Warning: In lm.fit(x, y, offset = offset, singular.ok = singular.ok, ...) :
## extra argument 'lamda' will be disregarded
summary(workhours.post.struc.break.train.ts.trend)
##
## Call:
## tslm(formula = workhours.post.struc.break.train.ts ~ trend, lamda = 0)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.46727 -0.08864 0.01455 0.18136 0.32909
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 35.20000 0.19134 183.963 8.53e-16 ***
## trend 0.06727 0.03084 2.182 0.0607 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.2801 on 8 degrees of freedom
## Multiple R-squared: 0.373, Adjusted R-squared: 0.2946
## F-statistic: 4.759 on 1 and 8 DF, p-value: 0.06072
workhours.post.struc.break.train.ts.trend.pred = forecast(workhours.post.struc.break.train.ts.trend, h = nvalid)
summary(workhours.post.struc.break.train.ts.trend.pred)
##
## Forecast method: Linear regression model
##
## Model Information:
##
## Call:
## tslm(formula = workhours.post.struc.break.train.ts ~ trend, lamda = 0)
##
## Coefficients:
## (Intercept) trend
## 35.20000 0.06727
##
##
## Error measures:
## ME RMSE MAE MPE MAPE MASE
## Training set 0 0.2505267 0.2036364 -0.005004557 0.5735712 1.078075
## ACF1
## Training set 0.3038818
##
## Forecasts:
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## 1998 35.94000 35.46618 36.41382 35.15777 36.72223
## 1999 36.00727 35.51051 36.50403 35.18717 36.82737
## 2000 36.07455 35.55229 36.59680 35.21236 36.93673
# Plot the chart
plot(workhours.post.struc.break.train.ts.trend.pred, main ="Yearly Canandian Work Hours \n interpreted with Structural Break \n around 1988", ylab="Work Hours")
lines(workhours.post.struc.break.train.ts.trend$fitted.values, col="blue", bty="l")
lines(workhours.test, col = "red")
First of all, a test set of 3 years, 3 data points, is not going to be statistically significant. It is easy to get lucky with only 3 data points. Be that as it may, as a toy model example, I interpreted the data as showing a structural break around 1987, and used a simple linear forecast to predict the 3 data points in the test set. As one can see from the graph, the actuals (in red), all fell within the 80% prediction intervals. Since 3 data points is not going to be statistically significant, I didn’t calculate the accuracy numbers.