library(rdatamarket)
## Loading required package: zoo
## 
## Attaching package: 'zoo'
## The following objects are masked from 'package:base':
## 
##     as.Date, as.Date.numeric
library(forecast)
## Warning: package 'forecast' was built under R version 3.4.4
library(tseries)
## Warning: package 'tseries' was built under R version 3.4.4

a. IMPORT AND CLEAN THE DATA SET

x<-ts(rdatamarket::dmseries("https://datamarket.com/data/set/24kw/monthly-average-residential-electricity-price#!ds=24kw!2r02=2&display=line
")[,1],start(1976,07), frequency=12)
#90% of the data go in the training set, the remaining 10% in the test set
train <- ts(x[1:427], frequency = 12)
test <- ts(x[428:474], frequency = 12)
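An equivalent split can be written with window(), which keeps the original calendar dates on both pieces. This is only a sketch and assumes the series really does start in July 1976 with 474 monthly observations, so that observation 427 falls in January 2012.

train2 <- window(x, end = c(2012, 1))    #observations 1..427 (through Jan 2012)
test2 <- window(x, start = c(2012, 2))   #observations 428..474 (Feb 2012 onward)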

plot the data set

#There is a downward stretch around the 26th year of the series (calendar year 2002), and the plot shows a clear seasonal pattern.
plot(train) #the seasonal pattern repeats every 12 months
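To see the seasonality more directly, the observations can be grouped by calendar month; a minimal sketch using base R's monthplot() and seasonplot() from the forecast package:

monthplot(train)   #one sub-series per month; a stable month-to-month profile indicates seasonality
seasonplot(train)  #year-on-year overlay of the 12-month pattern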

#check stationarity and autocorrelation
#the ADF test below indicates the data are non-stationary, and the ACF shows strong autocorrelation
adf.test(train, alternative="stationary",k=12)
## 
##  Augmented Dickey-Fuller Test
## 
## data:  train
## Dickey-Fuller = -1.0106, Lag order = 12, p-value = 0.9365
## alternative hypothesis: stationary
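As a cross-check on the ADF result, the KPSS test (also in tseries) reverses the hypotheses: its null is stationarity, so a small p-value would again point to a non-stationary series. A minimal sketch:

kpss.test(train, null = "Level")   #small p-value => reject stationarity, agreeing with the ADF conclusion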
#first look at the ACF and PACF
acf(train) #the slowly decaying ACF confirms the series is non-stationary

pacf(train)

b. Identify my ARIMA MODEL

#take a seasonal difference to move the data toward stationarity
#first seasonal difference (lag 12)
diffx = diff(train, lag = 12)
plot(diffx)

acf(diffx)

pacf(diffx)

#first difference 
diffx1=diff(diffx)
plot(diffx1)

acf(diffx1)

pacf(diffx1)
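The differencing choices can be sanity-checked with the forecast package: nsdiffs() estimates how many seasonal differences are needed and ndiffs() how many regular differences. A sketch (the exact answers depend on the unit-root and seasonal-strength tests these functions use):

nsdiffs(train)                 #expected to suggest D = 1 seasonal difference
ndiffs(diff(train, lag = 12))  #regular differences needed after the seasonal one; expected d = 1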

#after one regular difference on top of the seasonal difference the data look stationary, so d = 1
#the ACF and PACF each show a single significant spike at lag 1, an MA(1) signature, so p = 0 and q = 1
#for the seasonal part, after differencing there is a spike at the first seasonal lag in both plots, so P = 1 and Q = 1
fit1<-arima(train, order=c(0,1,1), seasonal =c(1,1,1))
fit1
## 
## Call:
## arima(x = train, order = c(0, 1, 1), seasonal = c(1, 1, 1))
## 
## Coefficients:
##           ma1     sar1     sma1
##       -0.2054  -0.0362  -0.7701
## s.e.   0.0563   0.0697   0.0466
## 
## sigma^2 estimated as 0.03625:  log likelihood = 93.51,  aic = -179.02
tsdisplay(residuals(fit1))
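As a sanity check on the hand-picked order, auto.arima() from the forecast package searches over orders automatically. It will not necessarily return ARIMA(0,1,1)(1,1,1)[12], but a model in the same neighbourhood would support the identification above; a minimal sketch:

fitauto <- auto.arima(train)   #automatic order selection (stepwise search by default)
fitauto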

c. check residuals, white noise?
Box.test(fit1$residuals, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  fit1$residuals
## X-squared = 0.2912, df = 1, p-value = 0.5895
acf(fit1$residuals)

pacf(fit1$residuals)

checkresiduals(fit1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(1,1,1)[12]
## Q* = 41.917, df = 21, p-value = 0.00431
## 
## Model df: 3.   Total lags used: 24
#in the ACF plot most lags stay inside the blue confidence bounds, which is consistent with white noise
#the Ljung-Box test at the default lag gives p = 0.5895 > 0.05, so we fail to reject the null of no autocorrelation; at short lags the residuals look like white noise
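Note that Box.test() with its default lag = 1 only examines the first autocorrelation, while checkresiduals() above used 24 lags (removing 3 degrees of freedom for the estimated coefficients) and rejected with p = 0.00431, so some autocorrelation remains at longer, seasonal lags. That longer test can be reproduced directly:

Box.test(residuals(fit1), lag = 24, type = "Ljung-Box", fitdf = 3)  #should match the Q* = 41.9, p = 0.004 result above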

d. forecasting with ARIMA for the next four years

fcastmyarima=forecast(fit1, h=48)
plot(fcastmyarima)

#the forecast looks reasonable: it follows the trend and the seasonal pattern of the series
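To judge the forecasts against the held-out data, the test observations can be overlaid on the forecast plot. A sketch, assuming the split above (train was built with the default time index, so its 427 observations end at period 7 of "year" 36 and the test set starts at period 8):

plot(fcastmyarima)
lines(ts(as.numeric(test), start = c(36, 8), frequency = 12), col = "red")  #held-out observations in red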

e. Identify my ETS model

#first I try "ZZZ" model, then, I try "MAN"model, to compare their aic, and model"ZZZ"has lower aic, so I choose to use "ZZZ" model
ets1=ets(train,model="ZZZ")
ets1#aic=1141.389
## ETS(M,Ad,M) 
## 
## Call:
##  ets(y = train, model = "ZZZ") 
## 
##   Smoothing parameters:
##     alpha = 0.7288 
##     beta  = 0.0082 
##     gamma = 1e-04 
##     phi   = 0.9765 
## 
##   Initial states:
##     l = 15.5366 
##     b = 0.0663 
##     s = 1.0415 1.0218 0.9928 0.9691 0.9458 0.9351
##            0.9537 0.9897 1.024 1.0373 1.0446 1.0447
## 
##   sigma:  0.0132
## 
##      AIC     AICc      BIC 
## 1141.389 1143.065 1214.411
ets2=ets(train,model= "MAN")
ets2 #aic=1665.419
## ETS(M,A,N) 
## 
## Call:
##  ets(y = train, model = "MAN") 
## 
##   Smoothing parameters:
##     alpha = 0.9618 
##     beta  = 0.8467 
## 
##   Initial states:
##     l = 16.9091 
##     b = -0.6871 
## 
##   sigma:  0.0247
## 
##      AIC     AICc      BIC 
## 1665.419 1665.562 1685.703
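The AIC comparison can be made explicit by pulling the criterion out of each fitted object (ets objects store it in the aic component):

c(ZZZ = ets1$aic, MAN = ets2$aic)   #the automatic ETS(M,Ad,M) fit has the lower AIC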

f. check residuals white noise?

Box.test(ets1$residuals, type="Ljung-Box")
## 
##  Box-Ljung test
## 
## data:  ets1$residuals
## X-squared = 0.50233, df = 1, p-value = 0.4785
acf(ets1$residuals)

pacf(ets1$residuals)

checkresiduals(ets1)

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,Ad,M)
## Q* = 31.654, df = 7, p-value = 4.707e-05
## 
## Model df: 17.   Total lags used: 24
#in the ACF plot most lags fall within the confidence bounds, so the residuals look like white noise
#the Ljung-Box test at the default lag gives p = 0.4785 > 0.05, so again we fail to reject; at short lags the residuals are consistent with white noise
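As with the ARIMA model, the 24-lag Ljung-Box test used by checkresiduals() (here removing 17 degrees of freedom for the fitted ETS parameters) rejects with p = 4.707e-05, so some autocorrelation remains at longer lags. That test can be reproduced as:

Box.test(residuals(ets1), lag = 24, type = "Ljung-Box", fitdf = 17)  #should match the Q* = 31.7 result above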

g. ets forecasting

fcastmyets=forecast(ets1, h=48)
plot(fcastmyets)

#the ETS forecast also tracks the trend and seasonality of the series

h. compare ets and arima

accarima=accuracy(fcastmyarima,test[1:12])
accets=accuracy(fcastmyets,test[1:12])
#compare the two models' accuracy; note that only the first 12 months of the test set are used here
accarima
##                        ME      RMSE       MAE           MPE     MAPE
## Training set -0.001199607 0.1874845 0.1362785 -0.0001672381 0.993814
## Test set     -0.386116524 0.4564924 0.3861165 -3.1154627917 3.115463
##                   MASE       ACF1
## Training set 0.4763421 0.02602294
## Test set     1.3496153         NA
accets
##                       ME      RMSE       MAE         MPE      MAPE
## Training set -0.01208133 0.1812985 0.1331199 -0.08749404 0.9644755
## Test set     -0.40651829 0.5013798 0.4184103 -3.27018885 3.3695008
##                   MASE       ACF1
## Training set 0.4653016 0.05261198
## Test set     1.4624937         NA
#comparing ME, RMSE, MAE, MPE, MAPE, and MASE on the test set, the ARIMA model has the smaller errors, so the ARIMA model forecasts better than the ETS model on this hold-out period
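A compact side-by-side view of the test-set errors makes the comparison explicit (accuracy() returns a matrix indexed by row and column names):

rbind(ARIMA = accarima["Test set", c("RMSE", "MAE", "MAPE", "MASE")],
      ETS = accets["Test set", c("RMSE", "MAE", "MAPE", "MASE")])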