Introduction

This R is the assignment of week 4 discussion for the course of Predictive Analytics. The task is : Pick a time series of interest to you. Build an auto.arima model as well as an ETS model. Which performed better? Now hold out 6 months of data for a test set and try to forecast using the ETS and the auto.arima. Which performs better on the hold-out set?

knitr::opts_chunk$set(echo = TRUE)

library(forecast)
## Warning: package 'forecast' was built under R version 4.0.5
## Registered S3 method overwritten by 'quantmod':
##   method            from
##   as.zoo.data.frame zoo
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.0.5
library(sna)
## Warning: package 'sna' was built under R version 4.0.5
## Loading required package: statnet.common
## Warning: package 'statnet.common' was built under R version 4.0.5
## 
## Attaching package: 'statnet.common'
## The following objects are masked from 'package:base':
## 
##     attr, order
## Loading required package: network
## Warning: package 'network' was built under R version 4.0.5
## 
## 'network' 1.17.1 (2021-06-12), part of the Statnet Project
## * 'news(package="network")' for changes since last version
## * 'citation("network")' for citation information
## * 'https://statnet.org' for help, support, and other information
## sna: Tools for Social Network Analysis
## Version 2.6 created on 2020-10-5.
## copyright (c) 2005, Carter T. Butts, University of California-Irvine
##  For citation information, type citation("sna").
##  Type help(package="sna") to get started.
library(lessR)
## Warning: package 'lessR' was built under R version 4.0.5
## 
## lessR 4.1.6  feedback: gerbing@pdx.edu  web: lessRstats.com/new
## ---------------------------------------------------------------
## > d <- Read("")   Read text, Excel, SPSS, SAS, or R data file
##   d is default data frame, data= in analysis routines optional
## 
## Learn about reading, writing, and manipulating data, graphics,
## testing means and proportions, regression, factor analysis,
## customization, and descriptive statistics from pivot tables.
##   Enter:  browseVignettes("lessR")
## 
## View changes in this or recent versions of lessR.
##   Enter: help(package=lessR)  Click: Package NEWS
##   Enter: interact()  for access to interactive graphics
##   New function: reshape_long() to move data from wide to long
library(feasts)
## Loading required package: fabletools
## 
## Attaching package: 'fabletools'
## The following object is masked from 'package:lessR':
## 
##     model
## The following object is masked from 'package:sna':
## 
##     components
## The following objects are masked from 'package:forecast':
## 
##     accuracy, forecast
library(seasonal)
library(readxl)
library(tsibble)
## 
## Attaching package: 'tsibble'
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, union
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v tibble  3.1.6     v dplyr   1.0.8
## v tidyr   1.2.0     v stringr 1.4.0
## v readr   2.1.2     v forcats 0.5.1
## v purrr   0.3.4
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x dplyr::recode() masks lessR::recode()
## x tibble::view()  masks seasonal::view()
library(fpp2)  
## -- Attaching packages ---------------------------------------------- fpp2 2.4 --
## v fma       2.4     v expsmooth 2.3
## -- Conflicts ------------------------------------------------- fpp2_conflicts --
## x fabletools::forecast() masks forecast::forecast()
library(readr)     # for reading data
library(dplyr)     # for manipulating data
library(lubridate) # for date and time manipulation
## 
## Attaching package: 'lubridate'
## The following object is masked from 'package:tsibble':
## 
##     interval
## The following objects are masked from 'package:base':
## 
##     date, intersect, setdiff, union
library(ggplot2)

Descriptive

The purpose of this section is to preprocess and describe the dataset. The dataset used was the Monthly Sunspot Dataset,this dataset describes a monthly count of the number of observed sunspots for just over 230 years (1749-1983). Here is a first view of the data.

data <- read.csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/monthly-sunspots.csv")
summary(data)
##     Month              Sunspots     
##  Length:2820        Min.   :  0.00  
##  Class :character   1st Qu.: 15.70  
##  Mode  :character   Median : 42.00  
##                     Mean   : 51.27  
##                     3rd Qu.: 74.92  
##                     Max.   :253.80
data2<-ts(data$Sunspot,start=c(1950,1),end=c(1983,12), frequency=12)
autoplot(data2)

Here the data set is splitted

train_data = ts(data$Sunspots,start=c(1950,1), end=c(1983,6), frequency=12)
test_data = ts(data$Sunspots,start=c(1983,7), frequency=12)
train_data
##        Jan   Feb   Mar   Apr   May   Jun   Jul   Aug   Sep   Oct   Nov   Dec
## 1950  58.0  62.6  70.0  55.7  85.0  83.5  94.8  66.3  75.9  75.5 158.6  85.2
## 1951  73.3  75.9  89.2  88.3  90.0 100.0  85.4 103.0  91.2  65.7  63.3  75.4
## 1952  70.0  43.5  45.3  56.4  60.7  50.7  66.3  59.8  23.5  23.2  28.5  44.0
## 1953  35.0  50.0  71.0  59.3  59.7  39.6  78.4  29.3  27.1  46.6  37.6  40.0
## 1954  44.0  32.0  45.7  38.0  36.0  31.7  22.2  39.0  28.0  25.0  20.0   6.7
## 1955   0.0   3.0   1.7  13.7  20.7  26.7  18.8  12.3   8.2  24.1  13.2   4.2
## 1956  10.2  11.2   6.8   6.5   0.0   0.0   8.6   3.2  17.8  23.7   6.8  20.0
## 1957  12.5   7.1   5.4   9.4  12.5  12.9   3.6   6.4  11.8  14.3  17.0   9.4
## 1958  14.1  21.2  26.2  30.0  38.1  12.8  25.0  51.3  39.7  32.5  64.7  33.5
## 1959  37.6  52.0  49.0  72.3  46.4  45.0  44.0  38.7  62.5  37.7  43.0  43.0
## 1960  48.3  44.0  46.8  47.0  49.0  50.0  51.0  71.3  77.2  59.7  46.3  57.0
## 1961  67.3  59.5  74.7  58.3  72.0  48.3  66.0  75.6  61.3  50.6  59.7  61.0
## 1962  70.0  91.0  80.7  71.7 107.2  99.3  94.1  91.1 100.7  88.7  89.7  46.0
## 1963  43.8  72.8  45.7  60.2  39.9  77.1  33.8  67.7  68.5  69.3  77.8  77.2
## 1964  56.5  31.9  34.2  32.9  32.7  35.8  54.2  26.5  68.1  46.3  60.9  61.4
## 1965  59.7  59.7  40.2  34.4  44.3  30.0  30.0  30.0  28.2  28.0  26.0  25.7
## 1966  24.0  26.0  25.0  22.0  20.2  20.0  27.0  29.7  16.0  14.0  14.0  13.0
## 1967  12.0  11.0  36.6   6.0  26.8   3.0   3.3   4.0   4.3   5.0   5.7  19.2
## 1968  27.4  30.0  43.0  32.9  29.8  33.3  21.9  40.8  42.7  44.1  54.7  53.3
## 1969  53.5  66.1  46.3  42.7  77.7  77.4  52.6  66.8  74.8  77.8  90.6 111.8
## 1970  73.9  64.2  64.3  96.7  73.6  94.4 118.6 120.3 148.8 158.2 148.1 112.0
## 1971 104.0 142.5  80.1  51.0  70.1  83.3 109.8 126.3 104.4 103.6 132.2 102.3
## 1972  36.0  46.2  46.7  64.9 152.7 119.5  67.7  58.5 101.4  90.0  99.7  95.7
## 1973 100.9  90.8  31.1  92.2  38.0  57.0  77.3  56.2  50.5  78.6  61.3  64.0
## 1974  54.6  29.0  51.2  32.9  41.1  28.4  27.7  12.7  29.3  26.3  40.9  43.2
## 1975  46.8  65.4  55.7  43.8  51.3  28.5  17.5   6.6   7.9  14.0  17.7  12.2
## 1976   4.4   0.0  11.6  11.2   3.9  12.3   1.0   7.9   3.2   5.6  15.1   7.9
## 1977  21.7  11.6   6.3  21.8  11.2  19.0   1.0  24.2  16.0  30.0  35.0  40.0
## 1978  45.0  36.5  39.0  95.5  80.3  80.7  95.0 112.0 116.2 106.5 146.0 157.3
## 1979 177.3 109.3 134.0 145.0 238.9 171.6 153.0 140.0 171.7 156.3 150.3 105.0
## 1980 114.7 165.7 118.0 145.0 140.0 113.7 143.0 112.0 111.0 124.0 114.0 110.0
## 1981  70.0  98.0  98.0  95.0 107.2  88.0  86.0  86.0  93.7  77.0  60.0  58.7
## 1982  98.7  74.7  53.0  68.3 104.7  97.7  73.5  66.0  51.0  27.3  67.0  35.2
## 1983  54.0  37.5  37.0  41.0  54.3  38.0

ARIMA MODEL

ARIMA_model = auto.arima(train_data)
ARIMA_forecast = forecast(ARIMA_model, h = 6)
summary(ARIMA_model)
## Series: train_data 
## ARIMA(0,1,2)(1,0,0)[12] 
## 
## Coefficients:
##           ma1      ma2     sar1
##       -0.4364  -0.1026  -0.0335
## s.e.   0.0496   0.0465   0.0532
## 
## sigma^2 = 345.6:  log likelihood = -1739.63
## AIC=3487.27   AICc=3487.37   BIC=3503.24
## 
## Training set error measures:
##                       ME     RMSE      MAE  MPE MAPE      MASE        ACF1
## Training set -0.08606159 18.49826 13.07344 -Inf  Inf 0.4831049 0.001273284

Now the performance is calculated.

ARIMA_accuracy = accuracy(ARIMA_forecast, test_data)
print(ARIMA_accuracy)
##                       ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.08606159 18.49826 13.07344     -Inf      Inf 0.4831049
## Test set     25.58548898 28.02941 25.58549 35.30079 35.30079 0.9454648
##                     ACF1 Theil's U
## Training set 0.001273284        NA
## Test set     0.041786505  1.788588

ETS MODEL

ETS_model = ets(train_data)
ETS_forecast = forecast(ETS_model, h = 6)
summary(ETS_model)
## ETS(A,N,N) 
## 
## Call:
##  ets(y = train_data) 
## 
##   Smoothing parameters:
##     alpha = 0.5025 
## 
##   Initial states:
##     l = 62.0691 
## 
##   sigma:  18.6664
## 
##      AIC     AICc      BIC 
## 4767.658 4767.718 4779.647 
## 
## Training set error measures:
##                      ME     RMSE      MAE  MPE MAPE      MASE       ACF1
## Training set -0.0957382 18.61995 13.09579 -Inf  Inf 0.4839309 0.05704476
ETS_accuracy = accuracy(ETS_forecast, test_data)
print(ETS_accuracy)
##                      ME     RMSE      MAE      MPE     MAPE      MASE
## Training set -0.0957382 18.61995 13.09579     -Inf      Inf 0.4839309
## Test set     26.4031817 28.83505 26.40318 36.47763 36.47763 0.9756811
##                    ACF1 Theil's U
## Training set 0.05704476        NA
## Test set     0.08714988  1.839715
ARIMA_forecast
##          Point Forecast     Lo 80    Hi 80     Lo 95    Hi 95
## Jul 1983       41.97421 18.148911 65.79952  5.536557 78.41187
## Aug 1983       43.30317 15.953954 70.65238  1.476156 85.13018
## Sep 1983       43.80563 14.333218 73.27803 -1.268532 88.87978
## Oct 1983       44.59951 13.146909 76.05211 -3.503093 92.70211
## Nov 1983       43.26967  9.954365 76.58497 -7.681691 94.22103
## Dec 1983       44.33488  9.255645 79.41412 -9.314181 97.98394
autoplot(data2) +
  autolayer(ARIMA_forecast, series = "ARIMA FORECAST") #+

  #autolayer(ETS_forecast, series = "ETS FORECAST") #+
  #autolayer(data2, series = "OBSERVED DATA")
autoplot(data2) +
  autolayer(ETS_forecast, series = "ETS FORECAST") #+

  #autolayer(data2, series = "OBSERVED DATA")