The task is to analyze the sales count of shampoo over a 3-year period by using different analysis methods. The final goal is to choose the best model from different set of models and give relevant forecasts of sales count of shampoo for the next 5 years. The dataset does not specify which particular years are included but it is specified that it is a 3 year data.
The following packages are loaded.
library(TSA)
## Warning: package 'TSA' was built under R version 3.5.3
##
## Attaching package: 'TSA'
## The following objects are masked from 'package:stats':
##
## acf, arima
## The following object is masked from 'package:utils':
##
## tar
library(tseries)
library(fUnitRoots)
## Warning: package 'fUnitRoots' was built under R version 3.5.3
## Loading required package: timeDate
##
## Attaching package: 'timeDate'
## The following objects are masked from 'package:TSA':
##
## kurtosis, skewness
## Loading required package: timeSeries
## Warning: package 'timeSeries' was built under R version 3.5.3
## Loading required package: fBasics
## Warning: package 'fBasics' was built under R version 3.5.3
library(lmtest)
## Loading required package: zoo
##
## Attaching package: 'zoo'
## The following object is masked from 'package:timeSeries':
##
## time<-
## The following objects are masked from 'package:base':
##
## as.Date, as.Date.numeric
library(forecast)
setwd("C:/Users/ria93/Desktop/Time Series/Assignment 3")
shampoo <- read.csv("sales-of-shampoo-over-a-three-ye.csv", header = TRUE)
head(shampoo)
## Month Sales.of.shampoo.over.a.three.year.period
## 1 1-01 266.0
## 2 1-02 145.9
## 3 1-03 183.1
## 4 1-04 119.3
## 5 1-05 180.3
## 6 1-06 168.5
class(shampoo)
## [1] "data.frame"
names(shampoo) <- c("monthly date", "sales count")
shampoo.ts <- matrix(shampoo$sales, nrow = 37, ncol = 1)
shampoo.ts <- as.vector(t(shampoo.ts))
shampoo.ts <- ts(shampoo.ts,start=c(1,1), end=c(3,12), frequency=12)
A random walk graph is generated using the plot function and the following points are observed:
- Trend : We see a upward trend.
- Behaviour : The series appears to be auto regressive due to multiple succeeding points.
- Seasonality : We see no seasonality in this graph.
- Change in Variance: Change in variance is found.
- Change in point: We do not see a change in point
plot(ts(as.vector(shampoo.ts)),type='o',ylab='Shampoo Sales Count')
plot(shampoo.ts,type='l',ylab='Sales Count',
main = "Time series plot for monthly shampoo sales count with labels")
points(y=shampoo.ts,x=time(shampoo.ts), pch=as.vector(season(shampoo.ts)))
y1 = shampoo.ts
x1 = zlag(shampoo.ts)
index = 2: length(x1)
cor(y1[index], x1[index])
## [1] 0.7194822
plot(y = shampoo.ts, x = zlag(shampoo.ts), ylab = 'Shampoo Sales Count', xlab = 'Previous Year Shampoo Sales Count')
par(mfrow=c(1,2))
acf(shampoo.ts, lag.max = 36)
pacf(shampoo.ts, lag.max = 36)
par(mfrow=c(1,1))
m0.shampoo = arima(shampoo.ts,order=c(0,0,0),seasonal=list(order=c(0,0,0), period=12))
res.m0_shampoo = residuals(m0.shampoo)
par(mfrow=c(3,1))
plot(res.m0_shampoo, xlab='Time',ylab='Residuals',main="Time series plot of the residuals")
acf(res.m0_shampoo, lag.max = 36, main = "The sample ACF of the residuals")
pacf(res.m0_shampoo, lag.max = 36, main = "The sample PACF of the residuals")
par(mfrow=c(1,1))
adf.test(shampoo.ts,alternative = c("stationary", "explosive"),
k = trunc((length(shampoo.ts)-1)^(1/3)))
##
## Augmented Dickey-Fuller Test
##
## data: shampoo.ts
## Dickey-Fuller = -0.93943, Lag order = 3, p-value = 0.9332
## alternative hypothesis: stationary
shampoo.transform = BoxCox.ar(shampoo.ts, method = "yule-walker")
shampoo.transform$ci
## [1] -0.4 0.7
lambda = 0.35
BC.shampoo = ((shampoo.ts^lambda)-1)/lambda
plot(BC.shampoo, type = "o", ylab = "Shampoo sales (in )", main = "Transformed - Shampoo sales count")
qqnorm(BC.shampoo)
qqline(BC.shampoo, col = 2)
hist(BC.shampoo)
adf.test(BC.shampoo)
##
## Augmented Dickey-Fuller Test
##
## data: BC.shampoo
## Dickey-Fuller = -1.7858, Lag order = 3, p-value = 0.6565
## alternative hypothesis: stationary
shapiro.test(BC.shampoo)
##
## Shapiro-Wilk normality test
##
## data: BC.shampoo
## W = 0.96505, p-value = 0.3061
diff.shampoo.1 = diff(BC.shampoo, differences = 1)
plot(diff.shampoo.1,type='o',ylab='Sales Count of Shampoo')
The adf test for the differenced series is done. The null hypothesis states that the series is non stationary whereas the alternate hypothesis states that the series is stationary. The p-value is 0.01 which is less than 0.1, hence we can reject the null hypothesis and the series is stationary now.
We have a stationary series now and we can proceed ahead with fitting the models.
ar(diff(diff.shampoo.1))
##
## Call:
## ar(x = diff(diff.shampoo.1))
##
## Coefficients:
## 1 2 3
## -1.2463 -0.7195 -0.2775
##
## Order selected 3 sigma^2 estimated as 6.601
adfTest(diff.shampoo.1, lags = 3, title = NULL,description = NULL)
## Warning in adfTest(diff.shampoo.1, lags = 3, title = NULL, description =
## NULL): p-value smaller than printed p-value
##
## Title:
## Augmented Dickey-Fuller Test
##
## Test Results:
## PARAMETER:
## Lag Order: 3
## STATISTIC:
## Dickey-Fuller: -3.7302
## P VALUE:
## 0.01
##
## Description:
## Mon Aug 05 22:36:07 2019 by user: ria93
par(mfrow=c(1,2))
acf(diff.shampoo.1)
pacf(diff.shampoo.1)
par(mfrow=c(1,1))
eacf(diff.shampoo.1, ar.max = 5, ma.max = 5)
## AR/MA
## 0 1 2 3 4 5
## 0 x x o o o o
## 1 o x o o o o
## 2 x o o o o o
## 3 x o o o o o
## 4 o o x o o o
## 5 o o o o o o
par(mfrow=c(1,1))
res2 = armasubsets(y=diff.shampoo.1,nar=10,nma=10,y.name='test',ar.method='ols')
## Warning in leaps.setup(x, y, wt = wt, nbest = nbest, nvmax = nvmax,
## force.in = force.in, : 11 linear dependencies found
plot(res2)
** ARIMA(0,1,2), ARIMA(0,1,3),
** ARIMA(1,1,1),ARIMA(1,1,2), ARIMA(1,1,3)
** ARIMA(2,1,2), ARIMA(2,1,3)
arima_012_ml <- arima(shampoo.ts,order=c(0,1,2),method='ML')
coeftest(arima_012_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -1.30391 0.12671 -10.2907 < 2.2e-16 ***
## ma2 1.00000 0.15498 6.4524 1.101e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_012_css <- arima(shampoo.ts,order=c(0,1,2),method='CSS')
coeftest(arima_012_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.95839 0.16753 -5.7208 1.06e-08 ***
## ma2 0.52991 0.17299 3.0632 0.00219 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_013_ml <- arima(shampoo.ts,order=c(0,1,3),method='ML')
coeftest(arima_013_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -1.235375 0.201826 -6.1210 9.3e-10 ***
## ma2 0.890397 0.287547 3.0965 0.001958 **
## ma3 0.083108 0.183766 0.4523 0.651088
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_013_css <- arima(shampoo.ts,order=c(0,1,3),method='CSS')
coeftest(arima_013_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -0.943194 0.159897 -5.8988 3.662e-09 ***
## ma2 0.499605 0.187928 2.6585 0.007849 **
## ma3 0.050904 0.153631 0.3313 0.740389
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_111_ml<- arima(shampoo.ts,order=c(1,1,1),method='ML')
coeftest(arima_111_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.59990 0.16457 -3.6452 0.0002672 ***
## ma1 -0.27538 0.17939 -1.5351 0.1247711
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_111_css <- arima(shampoo.ts,order=c(1,1,1),method='CSS')
coeftest(arima_111_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.58296 0.16366 -3.5621 0.0003679 ***
## ma1 -0.29547 0.17066 -1.7314 0.0833884 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_112_ml <- arima(shampoo.ts,order=c(1,1,2),method='ML')
coeftest(arima_112_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.09008 0.20550 0.4383 0.6611
## ma1 -1.32211 0.14746 -8.9659 < 2.2e-16 ***
## ma2 0.99999 0.16309 6.1316 8.701e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_112_css <- arima(shampoo.ts,order=c(1,1,2),method='CSS')
coeftest(arima_112_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.114390 0.017089 6.6938 2.174e-11 ***
## ma1 -1.430788 0.044042 -32.4866 < 2.2e-16 ***
## ma2 1.179074 0.038294 30.7900 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_113_ml <- arima(shampoo.ts,order=c(1,1,3),method='ML')
coeftest(arima_113_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.079361 0.963045 0.0824 0.9343
## ma1 -1.311308 0.959185 -1.3671 0.1716
## ma2 0.986185 1.222951 0.8064 0.4200
## ma3 0.010450 0.917356 0.0114 0.9909
arima_113_css <- arima(shampoo.ts,order=c(1,1,3),method='CSS')
coeftest(arima_113_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.113786 0.016779 6.7813 1.191e-11 ***
## ma1 -1.520697 0.170842 -8.9012 < 2.2e-16 ***
## ma2 1.309988 0.244597 5.3557 8.522e-08 ***
## ma3 -0.109528 0.201800 -0.5428 0.5873
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_212_ml <- arima(shampoo.ts,order=c(2,1,2),method='ML')
coeftest(arima_212_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.20136 0.21345 0.9434 0.3455
## ar2 0.11534 0.21245 0.5429 0.5872
## ma1 -1.44520 0.19565 -7.3867 1.506e-13 ***
## ma2 0.99997 0.21028 4.7555 1.979e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_212_css <- arima(shampoo.ts,order=c(2,1,2),method='CSS')
coeftest(arima_212_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.123751 0.227072 0.5450 0.5858
## ar2 0.187920 0.142651 1.3173 0.1877
## ma1 -1.313214 0.130081 -10.0954 <2e-16 ***
## ma2 0.842521 0.087371 9.6430 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_213_ml <- arima(shampoo.ts,order=c(2,1,3),method='ML')
coeftest(arima_213_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.26117 0.54594 -0.4784 0.63237
## ar2 0.25208 0.21787 1.1570 0.24727
## ma1 -0.97337 0.55262 -1.7614 0.07818 .
## ma2 0.31123 0.73352 0.4243 0.67135
## ma3 0.47537 0.51258 0.9274 0.35371
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
arima_213_css <- arima(shampoo.ts,order=c(2,1,3),method='CSS')
## Warning in stats::arima(x = x, order = order, seasonal = seasonal, xreg =
## xreg, : possible convergence problem: optim gave code = 1
coeftest(arima_213_css)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 -0.6252985 0.0027770 -225.1688 < 2.2e-16 ***
## ar2 0.2998054 0.0016822 178.2226 < 2.2e-16 ***
## ma1 -0.0605820 0.1109941 -0.5458 0.5851949
## ma2 -0.7345278 0.1963096 -3.7417 0.0001828 ***
## ma3 1.1616477 0.1241044 9.3602 < 2.2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In all the cases it is seen that the p- value is greater than 0.05 and hence the stochastic component of the model is normally distributed.
We do not see any model showing lack of fit.
residual.analysis(arima_012_ml)
## Warning: package 'FitAR' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: leaps
## Warning: package 'leaps' was built under R version 3.5.3
## Loading required package: ltsa
## Warning: package 'ltsa' was built under R version 3.5.2
## Loading required package: bestglm
## Warning: package 'bestglm' was built under R version 3.5.3
##
## Attaching package: 'FitAR'
## The following object is masked from 'package:forecast':
##
## BoxCox
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.97953, p-value = 0.7295
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 4.2329, df = 6, p-value = 0.6452
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_012_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.97848, p-value = 0.6939
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 2.4911, df = 6, p-value = 0.8695
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_013_ml)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.97732, p-value = 0.6545
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 3.6267, df = 6, p-value = 0.727
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_013_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.98252, p-value = 0.827
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 2.6511, df = 6, p-value = 0.8512
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_111_ml)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.95703, p-value = 0.1738
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 2.9521, df = 6, p-value = 0.8148
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_111_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.95387, p-value = 0.1384
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 4.6179, df = 6, p-value = 0.5937
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_112_ml)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.97751, p-value = 0.6607
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 3.6159, df = 6, p-value = 0.7285
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_112_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.96237, p-value = 0.2541
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 7.4386, df = 6, p-value = 0.2822
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_113_ml)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.97747, p-value = 0.6594
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 3.6137, df = 6, p-value = 0.7288
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_113_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.95436, p-value = 0.1434
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 7.1719, df = 6, p-value = 0.3052
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_212_ml)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.96644, p-value = 0.3363
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 3.2903, df = 6, p-value = 0.7716
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_212_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.95842, p-value = 0.1921
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 4.8042, df = 6, p-value = 0.5692
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_213_ml)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.97097, p-value = 0.4525
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 2.8463, df = 6, p-value = 0.8279
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
residual.analysis(arima_213_css)
##
## Shapiro-Wilk normality test
##
## data: res.model
## W = 0.96812, p-value = 0.3765
##
##
## Box-Ljung test
##
## data: (res.model)
## X-squared = 4.8414, df = 6, p-value = 0.5643
## Warning in (ra^2)/(n - (1:lag.max)): longer object length is not a multiple
## of shorter object length
AIC(arima_012_ml, arima_013_ml, arima_111_ml, arima_112_ml, arima_113_ml, arima_212_ml, arima_213_ml)
## df AIC
## arima_012_ml 3 399.5270
## arima_013_ml 4 401.3329
## arima_111_ml 3 406.8310
## arima_112_ml 4 401.3261
## arima_113_ml 5 403.3260
## arima_212_ml 5 403.4060
## arima_213_ml 6 404.9636
BIC(arima_012_ml, arima_013_ml, arima_111_ml, arima_112_ml, arima_113_ml, arima_212_ml, arima_213_ml)
## df BIC
## arima_012_ml 3 404.1931
## arima_013_ml 4 407.5543
## arima_111_ml 3 411.4971
## arima_112_ml 4 407.5475
## arima_113_ml 5 411.1028
## arima_212_ml 5 411.1827
## arima_213_ml 6 414.2957
After selecting the model, we check for overfitting by increasing the level of p and q by 1 each. In the below code, we increase the value of p by 1.
The p- value 0.6611 is insignificant as it is greater than 0.05 .
arima_112_ml <- arima(shampoo.ts,order=c(1,1,2),method='ML')
coeftest(arima_112_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ar1 0.09008 0.20550 0.4383 0.6611
## ma1 -1.32211 0.14746 -8.9659 < 2.2e-16 ***
## ma2 0.99999 0.16309 6.1316 8.701e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
In the below code, we increase the value of q by 1.
The p- value 0.651088 is insignificant as it is greater than 0.05 .
We thus conclude that the model ARIMA(0,1,2) would be an appropriate model for predicting the sales count of shampoo.
arima_013_ml <- arima(shampoo.ts,order=c(0,1,3),method='ML')
coeftest(arima_013_ml)
##
## z test of coefficients:
##
## Estimate Std. Error z value Pr(>|z|)
## ma1 -1.235375 0.201826 -6.1210 9.3e-10 ***
## ma2 0.890397 0.287547 3.0965 0.001958 **
## ma3 0.083108 0.183766 0.4523 0.651088
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
predict(arima_012_ml,n.ahead = 5,newxreg = NULL,se.fit=TRUE)
## $pred
## Jan Feb Mar Apr May
## 4 555.4505 589.0495 589.0495 589.0495 589.0495
##
## $se
## Jan Feb Mar Apr May
## 4 62.66664 64.90589 77.57836 88.45355 98.13082
fit=Arima(shampoo.ts,c(0,1,2))
plot(forecast(fit,h=10))
fit
## Series: shampoo.ts
## ARIMA(0,1,2)
##
## Coefficients:
## ma1 ma2
## -1.3039 1.000
## s.e. 0.1267 0.155
##
## sigma^2 estimated as 3952: log likelihood=-196.76
## AIC=399.53 AICc=400.3 BIC=404.19