This R Markdown document is the Week 4 discussion assignment for the Predictive Analytics course. The task is: pick a time series of interest to you and build an auto.arima model as well as an ETS model. Which performed better? Now hold out 6 months of data for a test set and try to forecast using the ETS and the auto.arima models. Which performs better on the hold-out set?
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE)
library(forecast)
library(ggplot2)
library(sna)
library(lessR)
library(feasts)
library(seasonal)
library(readxl)
library(tsibble)
library(tidyverse)
library(fpp2)
library(readr) # for reading data
library(dplyr) # for manipulating data
library(lubridate) # for date and time manipulation
The purpose of this section is to preprocess and describe the dataset. The dataset used is the Monthly Sunspot Dataset, which records a monthly count of the number of observed sunspots for just over 230 years (1749-1983). Here is a first view of the data.
data <- read.csv("https://raw.githubusercontent.com/jbrownlee/Datasets/master/monthly-sunspots.csv")
summary(data)
## Month Sunspots
## Length:2820 Min. : 0.00
## Class :character 1st Qu.: 15.70
## Mode :character Median : 42.00
## Mean : 51.27
## 3rd Qu.: 74.92
## Max. :253.80
# monthly time series labelled Jan 1950 - Dec 1983
data2 <- ts(data$Sunspots, start = c(1950, 1), end = c(1983, 12), frequency = 12)
autoplot(data2)
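Sunspot activity follows a roughly 11-year solar cycle rather than a strong within-year seasonal pattern, so the autocorrelation structure is worth a quick look before modelling. This optional check was not part of the original output:
# ACF of the monthly series out to five years of lags;
# strong positive autocorrelation at short lags reflects the long solar cycle
ggAcf(data2, lag.max = 60)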
Here the data set is split into a training set and a six-month test set.
train_data = ts(data$Sunspots,start=c(1950,1), end=c(1983,6), frequency=12)
test_data = ts(data$Sunspots,start=c(1983,7), frequency=12)
train_data
## Jan Feb Mar Apr May Jun Jul Aug Sep Oct Nov Dec
## 1950 58.0 62.6 70.0 55.7 85.0 83.5 94.8 66.3 75.9 75.5 158.6 85.2
## 1951 73.3 75.9 89.2 88.3 90.0 100.0 85.4 103.0 91.2 65.7 63.3 75.4
## 1952 70.0 43.5 45.3 56.4 60.7 50.7 66.3 59.8 23.5 23.2 28.5 44.0
## 1953 35.0 50.0 71.0 59.3 59.7 39.6 78.4 29.3 27.1 46.6 37.6 40.0
## 1954 44.0 32.0 45.7 38.0 36.0 31.7 22.2 39.0 28.0 25.0 20.0 6.7
## 1955 0.0 3.0 1.7 13.7 20.7 26.7 18.8 12.3 8.2 24.1 13.2 4.2
## 1956 10.2 11.2 6.8 6.5 0.0 0.0 8.6 3.2 17.8 23.7 6.8 20.0
## 1957 12.5 7.1 5.4 9.4 12.5 12.9 3.6 6.4 11.8 14.3 17.0 9.4
## 1958 14.1 21.2 26.2 30.0 38.1 12.8 25.0 51.3 39.7 32.5 64.7 33.5
## 1959 37.6 52.0 49.0 72.3 46.4 45.0 44.0 38.7 62.5 37.7 43.0 43.0
## 1960 48.3 44.0 46.8 47.0 49.0 50.0 51.0 71.3 77.2 59.7 46.3 57.0
## 1961 67.3 59.5 74.7 58.3 72.0 48.3 66.0 75.6 61.3 50.6 59.7 61.0
## 1962 70.0 91.0 80.7 71.7 107.2 99.3 94.1 91.1 100.7 88.7 89.7 46.0
## 1963 43.8 72.8 45.7 60.2 39.9 77.1 33.8 67.7 68.5 69.3 77.8 77.2
## 1964 56.5 31.9 34.2 32.9 32.7 35.8 54.2 26.5 68.1 46.3 60.9 61.4
## 1965 59.7 59.7 40.2 34.4 44.3 30.0 30.0 30.0 28.2 28.0 26.0 25.7
## 1966 24.0 26.0 25.0 22.0 20.2 20.0 27.0 29.7 16.0 14.0 14.0 13.0
## 1967 12.0 11.0 36.6 6.0 26.8 3.0 3.3 4.0 4.3 5.0 5.7 19.2
## 1968 27.4 30.0 43.0 32.9 29.8 33.3 21.9 40.8 42.7 44.1 54.7 53.3
## 1969 53.5 66.1 46.3 42.7 77.7 77.4 52.6 66.8 74.8 77.8 90.6 111.8
## 1970 73.9 64.2 64.3 96.7 73.6 94.4 118.6 120.3 148.8 158.2 148.1 112.0
## 1971 104.0 142.5 80.1 51.0 70.1 83.3 109.8 126.3 104.4 103.6 132.2 102.3
## 1972 36.0 46.2 46.7 64.9 152.7 119.5 67.7 58.5 101.4 90.0 99.7 95.7
## 1973 100.9 90.8 31.1 92.2 38.0 57.0 77.3 56.2 50.5 78.6 61.3 64.0
## 1974 54.6 29.0 51.2 32.9 41.1 28.4 27.7 12.7 29.3 26.3 40.9 43.2
## 1975 46.8 65.4 55.7 43.8 51.3 28.5 17.5 6.6 7.9 14.0 17.7 12.2
## 1976 4.4 0.0 11.6 11.2 3.9 12.3 1.0 7.9 3.2 5.6 15.1 7.9
## 1977 21.7 11.6 6.3 21.8 11.2 19.0 1.0 24.2 16.0 30.0 35.0 40.0
## 1978 45.0 36.5 39.0 95.5 80.3 80.7 95.0 112.0 116.2 106.5 146.0 157.3
## 1979 177.3 109.3 134.0 145.0 238.9 171.6 153.0 140.0 171.7 156.3 150.3 105.0
## 1980 114.7 165.7 118.0 145.0 140.0 113.7 143.0 112.0 111.0 124.0 114.0 110.0
## 1981 70.0 98.0 98.0 95.0 107.2 88.0 86.0 86.0 93.7 77.0 60.0 58.7
## 1982 98.7 74.7 53.0 68.3 104.7 97.7 73.5 66.0 51.0 27.3 67.0 35.2
## 1983 54.0 37.5 37.0 41.0 54.3 38.0
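As an aside, an equivalent and arguably more idiomatic way to hold out the last six months is window(), which slices the existing ts object instead of rebuilding it from the raw column. This is only a sketch; all results below use the ts() calls above.
# hypothetical alternative split, not used for the results that follow
train_alt <- window(data2, end = c(1983, 6))    # training period through Jun 1983
test_alt <- window(data2, start = c(1983, 7))   # hold-out period from Jul 1983
With the split in place, an ARIMA model is fitted to the training data with auto.arima() and used to forecast the next six months.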
ARIMA_model = auto.arima(train_data)
ARIMA_forecast = forecast(ARIMA_model, h = 6)
summary(ARIMA_model)
## Series: train_data
## ARIMA(0,1,2)(1,0,0)[12]
##
## Coefficients:
## ma1 ma2 sar1
## -0.4364 -0.1026 -0.0335
## s.e. 0.0496 0.0465 0.0532
##
## sigma^2 = 345.6: log likelihood = -1739.63
## AIC=3487.27 AICc=3487.37 BIC=3503.24
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.08606159 18.49826 13.07344 -Inf Inf 0.4831049 0.001273284
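Before comparing models, it is worth checking whether the ARIMA residuals resemble white noise. checkresiduals() from the forecast package plots the residuals and their ACF and reports a Ljung-Box test; this diagnostic is optional and was not part of the original output.
# residual diagnostics for the fitted ARIMA model (optional check)
checkresiduals(ARIMA_model)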
Now the forecast accuracy of the ARIMA model is calculated on the training data and the six-month hold-out set.
ARIMA_accuracy = accuracy(ARIMA_forecast, test_data)
print(ARIMA_accuracy)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.08606159 18.49826 13.07344 -Inf Inf 0.4831049
## Test set 25.58548898 28.02941 25.58549 35.30079 35.30079 0.9454648
## ACF1 Theil's U
## Training set 0.001273284 NA
## Test set 0.041786505 1.788588
ETS_model = ets(train_data)
ETS_forecast = forecast(ETS_model, h = 6)
summary(ETS_model)
## ETS(A,N,N)
##
## Call:
## ets(y = train_data)
##
## Smoothing parameters:
## alpha = 0.5025
##
## Initial states:
## l = 62.0691
##
## sigma: 18.6664
##
## AIC AICc BIC
## 4767.658 4767.718 4779.647
##
## Training set error measures:
## ME RMSE MAE MPE MAPE MASE ACF1
## Training set -0.0957382 18.61995 13.09579 -Inf Inf 0.4839309 0.05704476
ETS_accuracy = accuracy(ETS_forecast, test_data)
print(ETS_accuracy)
## ME RMSE MAE MPE MAPE MASE
## Training set -0.0957382 18.61995 13.09579 -Inf Inf 0.4839309
## Test set 26.4031817 28.83505 26.40318 36.47763 36.47763 0.9756811
## ACF1 Theil's U
## Training set 0.05704476 NA
## Test set 0.08714988 1.839715
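On the training data the two models are nearly indistinguishable (RMSE 18.50 for ARIMA versus 18.62 for ETS). On the six-month hold-out set the ARIMA model does slightly better on every measure, with a test RMSE of 28.03 versus 28.84 and an MAE of 25.59 versus 26.40 for ETS. Stacking the two test-set rows makes this easier to read; the code below is a small convenience step that was not part of the original output.
# side-by-side test-set accuracy; the ARIMA model has the lower RMSE and MAE
rbind(ARIMA = ARIMA_accuracy["Test set", ],
      ETS = ETS_accuracy["Test set", ])
The ARIMA point forecasts for the six hold-out months are printed below, followed by plots of each forecast against the observed series.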
ARIMA_forecast
## Point Forecast Lo 80 Hi 80 Lo 95 Hi 95
## Jul 1983 41.97421 18.148911 65.79952 5.536557 78.41187
## Aug 1983 43.30317 15.953954 70.65238 1.476156 85.13018
## Sep 1983 43.80563 14.333218 73.27803 -1.268532 88.87978
## Oct 1983 44.59951 13.146909 76.05211 -3.503093 92.70211
## Nov 1983 43.26967 9.954365 76.58497 -7.681691 94.22103
## Dec 1983 44.33488 9.255645 79.41412 -9.314181 97.98394
autoplot(data2) +
  autolayer(ARIMA_forecast, series = "ARIMA FORECAST")
autoplot(data2) +
  autolayer(ETS_forecast, series = "ETS FORECAST")
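Both forecasts can also be layered on a single plot of the observed series for a more direct visual comparison; the combined view below is a sketch rather than one of the figures above.
# both six-month forecasts over the observed series, zoomed to recent years
autoplot(window(data2, start = c(1978, 1))) +
  autolayer(ARIMA_forecast, series = "ARIMA FORECAST", PI = FALSE) +
  autolayer(ETS_forecast, series = "ETS FORECAST", PI = FALSE) +
  labs(y = "Sunspots", title = "Six-month forecasts from auto.arima and ETS")
Based on the hold-out accuracy, the auto.arima model is the better of the two for this series, although its margin over the ETS model is small.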