# Load data from git
GET('https://github.com/klgriffen96/summer23_data624/raw/main/project_1/Data%20Set%20for%20Class.xls', write_disk(tmpfile <- tempfile(fileext=".xls")))## Response [https://raw.githubusercontent.com/klgriffen96/summer23_data624/main/project_1/Data%20Set%20for%20Class.xls]
## Date: 2023-06-18 21:53
## Status: 200
## Content-Type: application/octet-stream
## Size: 1.55 MB
## <ON DISK> C:\Users\micha\AppData\Local\Temp\RtmpAnXvtA\file364c692339f6.xls
df_orig <- readxl::read_excel(tmpfile, skip=0)# Set frequency
the_freq <- 5
# Remove blank observations at end
df <- df_orig %>%
filter(SeriesInd <= 43021) %>%
arrange(SeriesInd, category)
# Initial summary
summary(df)## SeriesInd category Var01 Var02
## Min. :40669 Length:9732 Min. : 9.03 Min. : 1339900
## 1st Qu.:41253 Class :character 1st Qu.: 23.10 1st Qu.: 12520675
## Median :41846 Mode :character Median : 38.44 Median : 21086550
## Mean :41843 Mean : 46.98 Mean : 37035741
## 3rd Qu.:42430 3rd Qu.: 66.78 3rd Qu.: 42486700
## Max. :43021 Max. :195.18 Max. :480879500
## NA's :14 NA's :2
## Var03 Var05 Var07
## Min. : 8.82 Min. : 8.99 Min. : 8.92
## 1st Qu.: 22.59 1st Qu.: 22.91 1st Qu.: 22.88
## Median : 37.66 Median : 38.05 Median : 38.05
## Mean : 46.12 Mean : 46.55 Mean : 46.56
## 3rd Qu.: 65.88 3rd Qu.: 66.38 3rd Qu.: 66.31
## Max. :189.36 Max. :195.00 Max. :189.72
## NA's :26 NA's :26 NA's :26
# Fill in the existing NAs with Infs so we can distinguish them later
df[is.na(df)] <- Inf
# The next steps create a complete series of data, filling in gaps in SeriesInd across all categories
# Create a sequence of values starting at the first value of SeriesInd and ending at the last
dftmp2 <- data.frame(SeriesInd=seq(from=min(df$SeriesInd), to=max(df$SeriesInd)))
# Create a tmp dataframe for categories
dftmp3 <- data.frame(category=c('S01', 'S02', 'S03', 'S04', 'S05', 'S06'))
# Create dataframe joining the complete set of SeriesInd to the categories
dftmp4 <- dftmp2 %>%
merge(dftmp3, all=T) %>%
arrange(SeriesInd, category)
# Join the original dataframe with the dataframe containing the full set of SeriesInd and categories
df2 <- dftmp4 %>%
merge(df, by=c('SeriesInd', 'category'), all.x=T) %>%
arrange(SeriesInd, category)
# Summary after creating complete dataframe
summary(df2)## SeriesInd category Var01 Var02
## Min. :40669 Length:14118 Min. : 9.03 Min. : 1339900
## 1st Qu.:41257 Class :character 1st Qu.:23.36 1st Qu.:12521025
## Median :41845 Mode :character Median :38.55 Median :21091200
## Mean :41845 Mean : Inf Mean : Inf
## 3rd Qu.:42433 3rd Qu.:67.12 3rd Qu.:42518000
## Max. :43021 Max. : Inf Max. : Inf
## NA's :4386 NA's :4386
## Var03 Var05 Var07
## Min. : 8.82 Min. : 8.99 Min. : 8.92
## 1st Qu.:22.68 1st Qu.:23.03 1st Qu.:23.04
## Median :37.83 Median :38.24 Median :38.23
## Mean : Inf Mean : Inf Mean : Inf
## 3rd Qu.:66.33 3rd Qu.:66.75 3rd Qu.:66.78
## Max. : Inf Max. : Inf Max. : Inf
## NA's :4386 NA's :4386 NA's :4386
# Define which variables should be forecast for each category
fcvars <- list(c(1, 2), c(2, 3), c(5, 7), c(1, 2), c(2, 3), c(5,7))# Look for missing values now that we have complete values for SeriesInd
summary(df2)## SeriesInd category Var01 Var02
## Min. :40669 Length:14118 Min. : 9.03 Min. : 1339900
## 1st Qu.:41257 Class :character 1st Qu.:23.36 1st Qu.:12521025
## Median :41845 Mode :character Median :38.55 Median :21091200
## Mean :41845 Mean : Inf Mean : Inf
## 3rd Qu.:42433 3rd Qu.:67.12 3rd Qu.:42518000
## Max. :43021 Max. : Inf Max. : Inf
## NA's :4386 NA's :4386
## Var03 Var05 Var07
## Min. : 8.82 Min. : 8.99 Min. : 8.92
## 1st Qu.:22.68 1st Qu.:23.03 1st Qu.:23.04
## Median :37.83 Median :38.24 Median :38.23
## Mean : Inf Mean : Inf Mean : Inf
## 3rd Qu.:66.33 3rd Qu.:66.75 3rd Qu.:66.78
## Max. : Inf Max. : Inf Max. : Inf
## NA's :4386 NA's :4386 NA's :4386
# Create df to hold missing value summary
dfmv <- data.frame(category=c(), var=c())
# Examine missing value count
for (i in seq(1, 6)) {
for (j in seq(1, 2)) {
gaps <- sum(is.na(df2[df2$category==paste0('S0', i), paste0('Var0', fcvars[[i]][j])]))
inf_vals <- sum(is.infinite(df2[df2$category==paste0('S0', i), paste0('Var0', fcvars[[i]][j])]))
total_vals <- length(df2[df2$category==paste0('S0', i), paste0('Var0', fcvars[[i]][j])])
dfmv <- rbind(dfmv, c(paste0('S0', i), paste0('Var0', fcvars[[i]][j]), inf_vals, gaps))
}
}
colnames(dfmv) <- c('category', 'var', 'missing.values', 'gaps')
dfmv %>%
kbl(caption='Time series gaps and missing values') %>%
kable_classic(full_width=F)| category | var | missing.values | gaps |
|---|---|---|---|
| S01 | Var01 | 2 | 731 |
| S01 | Var02 | 0 | 731 |
| S02 | Var02 | 0 | 731 |
| S02 | Var03 | 4 | 731 |
| S03 | Var05 | 4 | 731 |
| S03 | Var07 | 4 | 731 |
| S04 | Var01 | 2 | 731 |
| S04 | Var02 | 0 | 731 |
| S05 | Var02 | 1 | 731 |
| S05 | Var03 | 5 | 731 |
| S06 | Var05 | 5 | 731 |
| S06 | Var07 | 5 | 731 |
We looked for patterns in the data set to determine if there was any seasonality and to evaluate where there were gaps. We assumed SeriesInd was an integer representing the number of days since a certain date, a common one being January 1, 1900 (the “origin” date). Using this logic, we converted SeriesInd to a date and examined where gaps occurred. Using an origin date of January 1, 1900 placed gaps on Friday and Saturday. It seemed more reasonable to assume that the series excluded weekend days, so we shifted the weekdays such that the gaps would fall on weekends.
It is also noted that a number of gaps fall on other weekdays, with the greatest number on Monday (32). This is consistent with a US-based calendar which includes a number of national holidays that fall on Monday (e.g. Memorial Day and Labor Day).
Because an origin date of January 1, 1900 yielded a week with gaps on Fridays and Saturdays, we performed a systematic search to find a reasonable origin that would align the gaps with Saturday and Sunday instead. Going under the assumption that this was a US calendar, we looked for origin dates that would place gaps on both January 1 and on December 25. The first such origin date that met these criteria was August 31, 1915, which would put our first data point in the year 2027. Therefore, we concluded that the data set either does not likely conform to a US calendar. We further concluded that the importance of the origin date was secondary and that the key observation was that the data is likely based on a seven-day week, with regular gaps on exactly two of those days. It is possible these are stock market prices, sales figures, or another such weekly metric.
We also performed a gap analysis on the prediction set to aid in determining whether “weekend” values would need to be imputed (filled). As shown in the table, there are no weekend days in the prediction set.
# Look for patterns in gaps in SeriesInd; first make a copy of df == dfga (gap analysis).
# Filter by just a single category since this will give us a single set of SeriesInd values,
# and the NAs in each category will all be in the same positions so it doesn't matter which category we choose.
# Work from the theory that these are days of the week, and that that SeriesInd is the number of days since Jan 1, 1900.
dfga <- df2 %>%
filter(category=='S01') %>%
mutate(date=as.Date(SeriesInd, origin='1900-01-01')+1) %>%
mutate(SeriesInd.mod7=SeriesInd %% 7) %>%
mutate(Day.of.week=weekdays(date)) %>%
group_by(SeriesInd.mod7, Day.of.week) %>%
summarize(Gaps=sum(is.na(Var01)), Filled.vals=sum(!is.na(Var01)), .groups='keep') %>%
ungroup() %>%
arrange(SeriesInd.mod7) %>%
select(-SeriesInd.mod7)
dfga %>%
kbl(caption='Gap analysis') %>%
kable_classic(full_width = F)| Day.of.week | Gaps | Filled.vals |
|---|---|---|
| Tuesday | 3 | 333 |
| Wednesday | 3 | 333 |
| Thursday | 9 | 327 |
| Friday | 12 | 324 |
| Saturday | 336 | 0 |
| Sunday | 336 | 0 |
| Monday | 32 | 305 |
# Look at gaps in the SeriesInd values we need to predict;
# first fill in values with -1 so we know which ones are missing
df3 <- df_orig %>%
filter(SeriesInd > 43021) %>%
arrange(SeriesInd, category) %>%
select(SeriesInd, category, Var01) %>%
mutate(Var01=-1)
# Create a sequence of values starting at the first value of SeriesInd and ending at the last
dftmp2 <- data.frame(SeriesInd=seq(from=min(df3$SeriesInd), to=max(df3$SeriesInd)))
# Create a tmp dataframe for categories
dftmp3 <- data.frame(category=c('S01', 'S02', 'S03', 'S04', 'S05', 'S06'))
# Create dataframe joining the complete set of SeriesInd to the categories
dftmp4 <- dftmp2 %>%
merge(dftmp3, all=T) %>%
arrange(SeriesInd, category)
# Join the original dataframe with the dataframe containing the full set of SeriesInd and categories
df3 <- dftmp4 %>%
merge(df3, by=c('SeriesInd', 'category'), all.x=T) %>%
arrange(SeriesInd, category)
# Examine gaps
dfga2 <- df3 %>%
filter(category=='S01') %>%
mutate(date=as.Date(SeriesInd, origin='1900-01-01')+1) %>%
mutate(SeriesInd.mod7=SeriesInd %% 7) %>%
mutate(Day.of.week=weekdays(date)) %>%
group_by(SeriesInd.mod7, Day.of.week) %>%
summarize(Gaps=sum(is.na(Var01)), Filled.vals=sum(!is.na(Var01)), .groups='keep') %>%
ungroup() %>%
arrange(SeriesInd.mod7) %>%
select(-SeriesInd.mod7)
dfga2 %>%
kbl(caption='Gap analysis - prediction set') %>%
kable_classic(full_width = F)| Day.of.week | Gaps | Filled.vals |
|---|---|---|
| Tuesday | 0 | 29 |
| Wednesday | 0 | 29 |
| Thursday | 1 | 28 |
| Friday | 0 | 29 |
| Saturday | 28 | 0 |
| Sunday | 28 | 0 |
| Monday | 3 | 25 |
# Create full 5-day set of SeriesInds
df4 <- df2 %>%
filter(category=='S01') %>%
mutate(date=as.Date(SeriesInd, origin='1900-01-01')+1) %>%
mutate(SeriesInd.mod7=SeriesInd %% 7) %>%
mutate(Day.of.week=weekdays(date)) %>%
filter(Day.of.week != 'Saturday' & Day.of.week != 'Sunday') %>%
select(SeriesInd, category)
for (i in df4$SeriesInd) {
for (j in seq(2, 6)) {
df4 <- rbind(df4, data.frame(SeriesInd=i, category=paste0('S0', j)))
}
}
# Join the original dataframe with the dataframe containing the full set of SeriesInd and categories
df2 <- df %>%
merge(df4, by=c('SeriesInd', 'category'), all=T) %>%
arrange(SeriesInd, category)
summary(df2)## SeriesInd category Var01 Var02
## Min. :40669 Length:10086 Min. : 9.03 Min. : 1339900
## 1st Qu.:41257 Class :character 1st Qu.:23.36 1st Qu.:12521025
## Median :41845 Mode :character Median :38.55 Median :21091200
## Mean :41844 Mean : Inf Mean : Inf
## 3rd Qu.:42433 3rd Qu.:67.12 3rd Qu.:42518000
## Max. :43021 Max. : Inf Max. : Inf
## NA's :354 NA's :354
## Var03 Var05 Var07
## Min. : 8.82 Min. : 8.99 Min. : 8.92
## 1st Qu.:22.68 1st Qu.:23.03 1st Qu.:23.04
## Median :37.83 Median :38.24 Median :38.23
## Mean : Inf Mean : Inf Mean : Inf
## 3rd Qu.:66.33 3rd Qu.:66.75 3rd Qu.:66.77
## Max. : Inf Max. : Inf Max. : Inf
## NA's :354 NA's :354 NA's :354
# To work with the data more easily, initialize list to hold dataframes, one for each category
dfcat = list()
# Filter by category, selecting only those vars we're interested in for that category
for (i in seq(1, 6)) {
dfcat[[i]] <- df2 %>%
filter(category == paste0('S0', i)) %>%
select(SeriesInd, !!paste0('Var0', fcvars[[i]][1]), !!paste0('Var0', fcvars[[i]][2]))
}
# Initialize ts objects; each element in each list will correspond to a category; e.g. ts1[[1]] will be S01, etc
ts1 = list()
ts2 = list()
# Set start date to first date in series
start_date <- df2$SeriesInd[[1]]
# Iterate over categories
for (i in seq(1, 6)) {
# Create var names for the two variables we're interested in for this category
varname1 <- paste0('Var0', fcvars[[i]][1])
varname2 <- paste0('Var0', fcvars[[i]][2])
# Create time series for each variable
#ts1[[i]] <- ts(dfcat[[i]][dfcat[[i]][varname1] != Inf, varname1], frequency=the_freq, start=start_date)
#ts2[[i]] <- ts(dfcat[[i]][dfcat[[i]][varname2] != Inf, varname2], frequency=the_freq, start=start_date)
ts1[[i]] <- ts(dfcat[[i]][[varname1]], frequency=the_freq, start=start_date)
ts2[[i]] <- ts(dfcat[[i]][[varname2]], frequency=the_freq, start=start_date)
# Plot the time series
p1a <- ts1[[i]] %>%
autoplot() +
ggtitle(paste0('Category S0', i, ', ', varname1)) +
ylab(varname1)
p1b <- dfcat[[i]] %>%
filter(dfcat[[i]][!!varname1] != Inf & !is.na(dfcat[[i]][!!varname1])) %>%
ggplot() +
geom_histogram(aes(x=eval(sym(varname1))), bins=30) +
ggtitle(paste0('Category S0', i, ', ', varname1)) +
xlab(varname1)
grid.arrange(p1a, p1b, ncol=2)
p2a <- ts2[[i]] %>%
autoplot() +
ggtitle(paste0('Category S0', i, ', ', varname2)) +
ylab(varname2)
p2b <- dfcat[[i]] %>%
filter(dfcat[[i]][!!varname2] != Inf & !is.na(dfcat[[i]][!!varname2])) %>%
ggplot() +
geom_histogram(aes(x=eval(sym(varname2))), bins=30) +
ggtitle(paste0('Category S0', i, ', ', varname2)) +
xlab(varname2)
grid.arrange(p2a, p2b, ncol=2)
}# Plot time series on one graph
p1 <- dfcat[[1]] %>%
ggplot(aes(x=SeriesInd)) +
geom_line(aes(y=Var01, color='Var01')) +
geom_line(aes(y=Var02 / 500000, color='Var02')) +
scale_y_continuous(sec.axis=sec_axis(~ . * 500000, name='Var02')) +
scale_color_manual(values=c('black', 'darkred')) +
ggtitle('Category S01')
p2 <- dfcat[[2]] %>%
ggplot(aes(x=SeriesInd)) +
geom_line(aes(y=Var02, color='Var02')) +
geom_line(aes(y=Var03 * 10000000, color='Var03')) +
scale_y_continuous(sec.axis=sec_axis(~ . / 10000000, name='Var03')) +
scale_color_manual(values=c('black', 'darkred')) +
ggtitle('Category S02')
p3 <- dfcat[[3]] %>%
ggplot(aes(x=SeriesInd)) +
geom_line(aes(y=Var05, color='Var05')) +
geom_line(aes(y=Var07, color='Var07')) +
scale_y_continuous(sec.axis=sec_axis(~ ., name='Var07')) +
scale_color_manual(values=c('black', 'darkred')) +
ggtitle('Category S03')
p4 <- dfcat[[4]] %>%
ggplot(aes(x=SeriesInd)) +
geom_line(aes(y=Var01, color='Var01')) +
geom_line(aes(y=Var02 / 1000000, color='Var02')) +
scale_y_continuous(sec.axis=sec_axis(~ . * 1000000, name='Var02')) +
scale_color_manual(values=c('black', 'darkred')) +
ggtitle('Category S04')
p5 <- dfcat[[5]] %>%
ggplot(aes(x=SeriesInd)) +
geom_line(aes(y=Var02, color='Var02')) +
geom_line(aes(y=Var03 * 1000000, color='Var03')) +
scale_y_continuous(sec.axis=sec_axis(~ . / 1000000, name='Var03')) +
scale_color_manual(values=c('black', 'darkred')) +
ggtitle('Category S05')
p6 <- dfcat[[6]] %>%
ggplot(aes(x=SeriesInd)) +
geom_line(aes(y=Var05, color='Var05')) +
geom_line(aes(y=Var07, color='Var07')) +
scale_y_continuous(sec.axis=sec_axis(~ ., name='Var07')) +
scale_color_manual(values=c('black', 'darkred')) +
ggtitle('Category S06')
#grid.arrange(p1, p2, p3, p4, p5, p6, nrow=6, ncol=1)
grid.arrange(p1, p2, p3, nrow=3, ncol=1)grid.arrange(p4, p5, p6, nrow=3, ncol=1)# Look for correlation between variables
GGally::ggpairs(df2[,3:7], progress=F)## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
An additional step in exploratory data analysis that is often helpful to better understand the data set is to generate pairwise plots showing the relationship between predictors. Based on these pairwise plots (see below), we observed an extremely high degree of correlation between some predictors. Notably, variables Var01, Var03, Var05, and Var07 contain very similar values, suggesting that any missing values of one might be imputed using existing values of another. Without the context of what these variables represent, it is difficult to speculate on why they might be correlated as such, but it is possible that the data were collected by different observers using similar but slightly different methodologies or techniques.
We discovered some obvious outliers in the data. Outliers adversely affect forecasts and, as such, should be either removed or replaced. Notably,outliers were discovered in the following variables, with four other variables having questionable values.
One such example is in Category S06, variable V07, which clearly exhibits an outlying value of approximately 190 about a quarter of the way into the series.
# Create list to hold outlier counts
dfout <- data.frame()
# Look for outliers using decomposition plots
for (i in seq(1, 6)) {
# Create var names for the two variables we're interested in for this category
varname1 <- paste0('Var0', fcvars[[i]][1])
varname2 <- paste0('Var0', fcvars[[i]][2])
# Filter out infinite values
dftmp1 <- dfcat[[i]] %>%
filter(!is.infinite(eval(sym(varname1)))) %>%
filter(!is.na(eval(sym(varname1))))
dftmp2 <- dfcat[[i]] %>%
filter(!is.infinite(eval(sym(varname2)))) %>%
filter(!is.na(eval(sym(varname2))))
# Create time series for each variable using xts; SeriesInd appears to be the days since 1900-01-01
ts1[[i]] <- ts(dftmp1[[varname1]], frequency=the_freq, start=start_date)
ts2[[i]] <- ts(dftmp2[[varname2]], frequency=the_freq, start=start_date)
if (the_freq > 1) {
p1 <- ts1[[i]] %>%
decompose(type='additive') %>%
autoplot() +
ggtitle(paste0('Category S0', i, ', Var0', fcvars[[i]][1]))
p2 <- ts2[[i]] %>%
decompose(type='additive') %>%
autoplot() +
ggtitle(paste0('Category S0', i, ', Var0', fcvars[[i]][2]))
grid.arrange(p1, p2, ncol=1, nrow=2)
}
# Count of outliers > 3 SD
outct <- length(ts1[[i]][ts1[[i]] > mean(ts1[[i]]) + 3 * sd(ts1[[i]]) | ts1[[i]] < mean(ts1[[i]]) - 3 * sd(ts1[[i]])])
dfout <- rbind(dfout, data.frame(
Category=paste0('S0', i),
Variable=paste0('Var0', fcvars[[i]][1]),
Outliers=outct
))
outct <- length(ts2[[i]][ts2[[i]] > mean(ts2[[i]]) + 3 * sd(ts2[[i]]) | ts2[[i]] < mean(ts2[[i]]) - 3 * sd(ts2[[i]])])
dfout <- rbind(dfout, data.frame(
Category=paste0('S0', i),
Variable=paste0('Var0', fcvars[[i]][2]),
Outliers=outct
))
}# Show outlier count
dfout %>%
kbl(caption='Outliers beyond 3 standard deviations from the mean') %>%
kable_classic(full_width=F)| Category | Variable | Outliers |
|---|---|---|
| S01 | Var01 | 0 |
| S01 | Var02 | 19 |
| S02 | Var02 | 33 |
| S02 | Var03 | 1 |
| S03 | Var05 | 0 |
| S03 | Var07 | 0 |
| S04 | Var01 | 0 |
| S04 | Var02 | 27 |
| S05 | Var02 | 24 |
| S05 | Var03 | 0 |
| S06 | Var05 | 1 |
| S06 | Var07 | 1 |
# Create new ts objects that will have imputed values
tsnew1 <- list()
tsnew2 <- list()
for(i in seq(1, 6)) {
# Create var names for the two variables we're interested in for this category
varname1 <- paste0('Var0', fcvars[[i]][1])
varname2 <- paste0('Var0', fcvars[[i]][2])
# Convert Infs to NAs
dftmp <- dfcat[[i]]
print(paste0(" changing ", sum(is.infinite(dftmp[[varname1]])), " infinite values to NA for ", varname1))
print(paste0(" changing ", sum(is.infinite(dftmp[[varname2]])), " infinite values to NA for ", varname2))
dftmp[varname1] = ifelse(is.infinite(dftmp[[varname1]]), NA, dftmp[[varname1]])
dftmp[varname2] = ifelse(is.infinite(dftmp[[varname2]]), NA, dftmp[[varname2]])
# Create time series for each variable
tsnew1[[i]] <- ts(dftmp[[varname1]], frequency=the_freq, start=start_date)
tsnew2[[i]] <- ts(dftmp[[varname2]], frequency=the_freq, start=start_date)
# Replace missing values
tsnew1[[i]] <- tsnew1[[i]] %>%
tsclean(replace.missing=T, lambda='auto')
tsnew2[[i]] <- tsnew2[[i]] %>%
tsclean(replace.missing=T, lambda='auto')
if (the_freq > 1) {
# Decomp plots
p1 <- tsnew1[[i]] %>%
decompose(type='additive') %>%
autoplot()
p2 <- tsnew2[[i]] %>%
decompose(type='additive') %>%
autoplot()
grid.arrange(p1, p2, ncol=1, nrow=2)
}
}## [1] " changing 2 infinite values to NA for Var01"
## [1] " changing 0 infinite values to NA for Var02"
## [1] " changing 0 infinite values to NA for Var02"
## [1] " changing 4 infinite values to NA for Var03"
## [1] " changing 4 infinite values to NA for Var05"
## [1] " changing 4 infinite values to NA for Var07"
## [1] " changing 2 infinite values to NA for Var01"
## [1] " changing 0 infinite values to NA for Var02"
## [1] " changing 1 infinite values to NA for Var02"
## [1] " changing 5 infinite values to NA for Var03"
## [1] " changing 5 infinite values to NA for Var05"
## [1] " changing 5 infinite values to NA for Var07"
# Pre- and post-imputation examples
i <- 6
p1 <- ts2[[i]] %>%
autoplot() +
ggtitle(paste0('Pre-imputation, category S0', i, ', ', varname2)) +
ylab(varname2)
p2 <- tsnew2[[i]] %>%
autoplot() +
ggtitle(paste0('Post-imputation, category S0', i, ', ', varname2)) +
ylab(varname2)
grid.arrange(p1, p2, nrow=1, ncol=2)if (the_freq > 1) {
# Compare plots of pre- and post-interpolation
for (i in seq(1, 6)) {
# First var
p1 <- (ts1[[i]] %>% decompose(type='additive')) %>%
autoplot() +
ggtitle(paste0('Pre-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][1])))
p2 <- (tsnew1[[i]] %>% decompose(type='additive')) %>%
autoplot() +
ggtitle(paste0('Post-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][1])))
grid.arrange(p1, p2, ncol=1, nrow=2)
# Second var
p3 <- (ts2[[i]] %>% decompose(type='additive')) %>%
autoplot() +
ggtitle(paste0('Pre-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][2])))
p4 <- (tsnew2[[i]] %>% decompose(type='additive')) %>%
autoplot() +
ggtitle(paste0('Post-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][2])))
grid.arrange(p3, p4, ncol=1, nrow=2)
}
}# Lag plots - I don't think these would be useful
for (i in seq(1, 6)) {
p1 <- ts1[[i]] %>%
gglagplot() +
ggtitle(paste0('Lag plot - category S0', i, ', Var0', fcvars[[i]][1]))
p2 <- ts2[[i]] %>%
gglagplot() +
ggtitle(paste0('Lag plot - category S0', i, ', Var0', fcvars[[i]][2]))
grid.arrange(p1, p2, ncol=2, nrow=1)
}if (the_freq > 1) {
# Seasonal subseries plots
for (i in seq(1, 6)) {
p1 <- tsnew1[[i]] %>%
ggsubseriesplot(main=paste0('Category S0', i, ', Var0', fcvars[[i]][1]))
p2 <- tsnew2[[i]] %>%
ggsubseriesplot(main=paste0('Category S0', i, ', Var0', fcvars[[i]][2]))
grid.arrange(p1, p2, ncol=2, nrow=1)
}
}if (the_freq > 1) {
# I don't think these will be useful
for (i in seq(1, 6)) {
p1 <- tsnew1[[i]] %>%
head(100) %>%
ggseasonplot()
p2 <- tsnew2[[i]] %>%
head(100) %>%
ggseasonplot()
grid.arrange(p1, p2, nrow=2, ncol=1)
}
}if (the_freq > 1) {
# Decomposition
for (i in seq(1, 6)) {
p1 <- tsnew1[[i]] %>%
head(3000) %>%
stl(s.window='periodic') %>%
autoplot() +
ggtitle(paste0('STL decomposition - category S0', i, ', Var0', paste0(fcvars[[i]][1])))
p2 <- tsnew2[[i]] %>%
head(3000) %>%
stl(s.window='periodic') %>%
autoplot() +
ggtitle(paste0('STL decomposition - Category S0', i, ', Var0', paste0(fcvars[[i]][2])))
grid.arrange(p1, p2, ncol=1, nrow=2)
}
}Some types of models are sensitive to data that is autocorrelated, that is, data that contains values which are related to previous values in some regular or predictable way. These models require that the data be modified such that they are “stationary,” meaning that they appear to be randomly distributed and when plotted look like “white noise.”
To identify autocorrelation patterns in the data, autocorrelation function (ACF) and partial autocorrelation function (PACF) plots can be constructed. These plots illustrate the relationship between lagged time series values, i.e. comparing one value with the the next value in the series, or the values two or more positions later. Examining the patterns in the ACF and PACF plots helps the modeler determine what parameters to use as a basis when modeling.
ACF and PACF plots aid in evaluating whether the data is autocorrelated and, if so, whether it should be modified before modeling occurs. One such modification is “differencing,” which converts time series data into the change in value over time. Once the data is “differenced,” it is no longer autocorrelated, and the time series should appear to be “white noise.” Likewise, the ACF and PACF plots should exhibit no clear trend or pattern.
As shown in the figures, there is some trending to most variables that would indicate that differencing is needed prior to modeling. One possible exception is Var02 in categories
# ACF/PACF plots - needed for ARIMA modeling
for (i in seq(1, 6)) {
p1 <- tsnew1[[i]] %>%
ggtsdisplay(plot.type='partial', main=paste0('Category S0', i, ', Var0', fcvars[[i]][1]))
p2 <- tsnew2[[i]] %>%
ggtsdisplay(plot.type='partial', main=paste0('Category S0', i, ', Var0', fcvars[[i]][2]))
p1
p2
}# Init df to hold kpss test stats to determine whether differencing is needed
dfdiffs <- data.frame(matrix(nrow=0, ncol=6))
colnames(dfdiffs) = c('Category', 'Variable', 'KPSS.test.statistic', 'Differencing.required',
'Number.of.seasonal.differences', 'Seasonal.differencing.required')
# Differencing - all need differencing except Var02
for (i in seq(1, 6)) {
# First variable in this category
tmp_kpss <- summary(tsnew1[[i]] %>% ur.kpss())@teststat
if (the_freq > 1) {
tmp_nsdiffs <- tsnew1[[i]] %>% nsdiffs()
}
else {
tmp_nsdiffs <- 0
}
diff_req <- 'no'
seas_diff_req <- 'no'
if (tmp_kpss > 1) { # test statistic is given in percent, if it exceeds 1%, differencing is required
diff_req <- 'yes'
}
if (tmp_nsdiffs > 0) { # nsdiffs() gives the number of seasonal differences required
seas_diff_req <- 'yes'
}
dfdiffs <- rbind(dfdiffs, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][1]),
KPSS.test.statistic=tmp_kpss,
Differencing.required=diff_req,
Number.of.seasonal.differences=tmp_nsdiffs,
Seasonal.differencing.required=seas_diff_req
))
# Second variable in this category
tmp_kpss <- summary(tsnew2[[i]] %>% ur.kpss())@teststat
if (the_freq > 1) {
tmp_nsdiffs <- tsnew2[[i]] %>% nsdiffs()
}
else {
tmp_nsdiffs <- 0
}
diff_req <- 'no'
seas_diff_req <- 'no'
if (tmp_kpss > 1) { # test statistic is given in percent, if it exceeds 1%, differencing is required
diff_req <- 'yes'
}
if (tmp_nsdiffs > 0) { # nsdiffs() gives the number of seasonal differences required
seas_diff_req <- 'yes'
}
dfdiffs <- rbind(dfdiffs, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][1]),
KPSS.test.statistic=tmp_kpss,
Differencing.required=diff_req,
Number.of.seasonal.differences=tmp_nsdiffs,
Seasonal.differencing.required=seas_diff_req
))
}
# Show table
dfdiffs %>%
kbl(caption='Differencing requirements') %>%
kable_classic(full_width=F)| Category | Variable | KPSS.test.statistic | Differencing.required | Number.of.seasonal.differences | Seasonal.differencing.required |
|---|---|---|---|---|---|
| S01 | V01 | 16.630429 | yes | 0 | no |
| S01 | V01 | 10.706258 | yes | 0 | no |
| S02 | V02 | 10.016950 | yes | 0 | no |
| S02 | V02 | 4.219967 | yes | 0 | no |
| S03 | V05 | 15.215024 | yes | 0 | no |
| S03 | V05 | 15.231207 | yes | 0 | no |
| S04 | V01 | 15.184888 | yes | 0 | no |
| S04 | V01 | 1.291808 | yes | 0 | no |
| S05 | V02 | 8.348904 | yes | 0 | no |
| S05 | V02 | 8.311704 | yes | 0 | no |
| S06 | V05 | 17.360578 | yes | 0 | no |
| S06 | V05 | 17.359510 | yes | 0 | no |
# Create new list to hold differenced ts objects
tsdiff1 <- list()
tsdiff2 <- list()
# Now actually do the differencing
for (i in seq(1, 6)) {
# First variable in category i
tsdiff1[[i]] <- diff(tsnew1[[i]])
tmp_kpss <- summary(tsdiff1[[i]] %>% ur.kpss())@teststat
dfdiffs[2 * i - 1, 'KPSS.test.statistic'] <- tmp_kpss
diff_req <- 'no'
if (tmp_kpss > 1) { # test statistic is given in percent, if it exceeds 1%, differencing is required
diff_req <- 'yes'
}
dfdiffs[2 * i - 1, 'Differencing.required'] <- diff_req
# Second variable in category i
tsdiff2[[i]] <- diff(tsnew1[[i]])
tmp_kpss <- summary(tsdiff1[[i]] %>% ur.kpss())@teststat
dfdiffs[2 * i, 'KPSS.test.statistic'] <- tmp_kpss
diff_req <- 'no'
if (tmp_kpss > 1) { # test statistic is given in percent, if it exceeds 1%, differencing is required
diff_req <- 'yes'
}
dfdiffs[2 * i, 'Differencing.required'] <- diff_req
}
# Show updated table
dfdiffs %>%
kbl(caption='Differencing requirements') %>%
kable_classic(full_width=F)| Category | Variable | KPSS.test.statistic | Differencing.required | Number.of.seasonal.differences | Seasonal.differencing.required |
|---|---|---|---|---|---|
| S01 | V01 | 0.1001108 | no | 0 | no |
| S01 | V01 | 0.1001108 | no | 0 | no |
| S02 | V02 | 0.0039713 | no | 0 | no |
| S02 | V02 | 0.0039713 | no | 0 | no |
| S03 | V05 | 0.1252398 | no | 0 | no |
| S03 | V05 | 0.1252398 | no | 0 | no |
| S04 | V01 | 0.1280270 | no | 0 | no |
| S04 | V01 | 0.1280270 | no | 0 | no |
| S05 | V02 | 0.0047819 | no | 0 | no |
| S05 | V02 | 0.0047819 | no | 0 | no |
| S06 | V05 | 0.1050035 | no | 0 | no |
| S06 | V05 | 0.1050035 | no | 0 | no |
# Prepare data frame for results
#dfr <- data.frame(matrix(nrow=0, ncol=11))
#colnames(dfr) <- c('category', 'var', 'model', 'method', 'ME', 'RMSE', 'MAE', 'MPE', 'MAPE', 'MASE', 'ACF1')
dfr <- data.frame(matrix(nrow=0, ncol=6))
colnames(dfr) <- c('Category', 'Variable', 'Model', 'Method', 'MAPE', 'Ljung.Box')# Create list to store ETS fit
fit_ets1 <- list()
fit_ets2 <- list()
# ETS
for (i in seq(1, 6)) {
# First variable in category i
fit_ets1[[i]] <- ets(tsnew1[[i]])
dfr <- rbind(dfr, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][1]),
Model='ETS',
Method=fit_ets1[[i]]$method,
MAPE=accuracy(fit_ets1[[i]])[5],
Ljung.Box=0 # temp, will fill in later when calculating residuals
))
# Second variable in category i
fit_ets2[[i]] <- ets(tsnew2[[i]])
dfr <- rbind(dfr, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][2]),
Model='ETS',
Method=fit_ets2[[i]]$method,
MAPE=accuracy(fit_ets2[[i]])[5],
Ljung.Box=0 # temp, will fill in later when calculating residuals
))
}# Display residual plots
for (i in seq(1, 6)) {
tmp_res <- checkresiduals(fit_ets1[[i]])
dfr[i * 2 - 1, 'Ljung.Box'] <- tmp_res$p.value
tmp_res <- checkresiduals(fit_ets2[[i]])
dfr[i * 2, 'Ljung.Box'] <- tmp_res$p.value
}##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 20.285, df = 10, p-value = 0.02667
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,A,M)
## Q* = 66.04, df = 10, p-value = 2.562e-10
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 66.585, df = 10, p-value = 2.015e-10
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(A,Ad,N)
## Q* = 33.823, df = 10, p-value = 0.0001979
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,A)
## Q* = 13.536, df = 10, p-value = 0.1952
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,A)
## Q* = 8.9033, df = 10, p-value = 0.5413
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,N)
## Q* = 15.125, df = 10, p-value = 0.1276
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 54.629, df = 10, p-value = 3.703e-08
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(M,N,M)
## Q* = 65.926, df = 10, p-value = 2.695e-10
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 32.435, df = 10, p-value = 0.0003387
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 18.391, df = 10, p-value = 0.04871
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ETS(A,N,N)
## Q* = 10.804, df = 10, p-value = 0.373
##
## Model df: 0. Total lags used: 10
For the most part, residuals look like white noise and are normally distributed.
# Display results
dfr %>%
kbl(caption='ETS Modeling Results') %>%
kable_classic(full_width=F)| Category | Variable | Model | Method | MAPE | Ljung.Box |
|---|---|---|---|---|---|
| S01 | V01 | ETS | ETS(M,N,N) | 0.8894790 | 0.0266735 |
| S01 | V02 | ETS | ETS(M,A,M) | 29.4721891 | 0.0000000 |
| S02 | V02 | ETS | ETS(M,N,M) | 26.9412580 | 0.0000000 |
| S02 | V03 | ETS | ETS(A,Ad,N) | 1.3036569 | 0.0001979 |
| S03 | V05 | ETS | ETS(M,N,A) | 1.2878573 | 0.1952189 |
| S03 | V07 | ETS | ETS(M,N,A) | 1.1834222 | 0.5413057 |
| S04 | V01 | ETS | ETS(M,N,N) | 1.1894769 | 0.1275537 |
| S04 | V02 | ETS | ETS(M,N,M) | 32.8855229 | 0.0000000 |
| S05 | V02 | ETS | ETS(M,N,M) | 19.0649932 | 0.0000000 |
| S05 | V03 | ETS | ETS(A,N,N) | 0.7845041 | 0.0003387 |
| S06 | V05 | ETS | ETS(A,N,N) | 1.0584005 | 0.0487121 |
| S06 | V07 | ETS | ETS(A,N,N) | 1.0709812 | 0.3729529 |
# Create list to store ETS fit
fit_arima1 <- list()
fit_arima2 <- list()
# Function to return friendly name of ARIMA method using the fit returned by the model
ret_arima_name <- function (fit) {
tmp_name <- paste0(
'ARIMA(', fit$arma[1],
',', fit$arma[6],
',', fit$arma[2],
')(', fit$arma[3],
',', fit$arma[7],
',', fit$arma[4],
')'
)
if ('drift' %in% names(fit$coef)) {
tmp_name <- paste(tmp_name, ' with drift')
}
return(tmp_name)
}
# ARIMA modeling
for (i in seq(1, 6)) {
fit_arima1[[i]] <- auto.arima(tsnew1[[i]])
dfr <- rbind(dfr, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][1]),
Model='ARIMA',
Method=ret_arima_name(fit_arima1[[i]]),
MAPE=accuracy(fit_arima1[[i]])[5],
Ljung.Box=0 # temp, will fill in later when calculating residuals
))
fit_arima2[[i]] <- auto.arima(tsnew2[[i]])
dfr <- rbind(dfr, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][2]),
Model='ARIMA',
Method=ret_arima_name(fit_arima2[[i]]),
MAPE=accuracy(fit_arima1[[i]])[5],
Ljung.Box=0 # temp, will fill in later when calculating residuals
))
}# Display residual plots
for (i in seq(1, 6)) {
tmp_res <- checkresiduals(fit_arima1[[i]])
dfr[i * 2 - 1 + 12, 'Ljung.Box'] <- tmp_res$p.value
tmp_res <- checkresiduals(fit_arima2[[i]])
dfr[i * 2 + 12, 'Ljung.Box'] <- tmp_res$p.value
}##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,2) with drift
## Q* = 7.9375, df = 8, p-value = 0.4396
##
## Model df: 2. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,2)(0,0,2)[5]
## Q* = 11.455, df = 5, p-value = 0.04307
##
## Model df: 5. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,3)(1,0,0)[5] with drift
## Q* = 3.9806, df = 4, p-value = 0.4086
##
## Model df: 6. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,0)
## Q* = 11.977, df = 8, p-value = 0.1522
##
## Model df: 2. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)
## Q* = 11.894, df = 8, p-value = 0.156
##
## Model df: 2. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 8.5574, df = 10, p-value = 0.5746
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,0)
## Q* = 6.8036, df = 10, p-value = 0.7439
##
## Model df: 0. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,3)(1,0,1)[5]
## Q* = 9.0584, df = 4, p-value = 0.05966
##
## Model df: 6. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(2,1,2)(2,0,0)[5] with drift
## Q* = 1.8642, df = 4, p-value = 0.7607
##
## Model df: 6. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)
## Q* = 13.457, df = 9, p-value = 0.143
##
## Model df: 1. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(0,1,1)(0,0,2)[5]
## Q* = 8.1228, df = 7, p-value = 0.3219
##
## Model df: 3. Total lags used: 10
##
## Ljung-Box test
##
## data: Residuals from ARIMA(1,1,1)
## Q* = 3.9985, df = 8, p-value = 0.8573
##
## Model df: 2. Total lags used: 10
As with the ETS models, residuals look like white noise and are normally distributed.
# Display results
dfr %>%
arrange(Category, Variable, Model) %>%
kbl(caption='Modeling Results - ETS and ') %>%
kable_classic(full_width=F)| Category | Variable | Model | Method | MAPE | Ljung.Box |
|---|---|---|---|---|---|
| S01 | V01 | ARIMA | ARIMA(0,1,2)(0,0,0) with drift | 0.8842708 | 0.4395964 |
| S01 | V01 | ETS | ETS(M,N,N) | 0.8894790 | 0.0266735 |
| S01 | V02 | ARIMA | ARIMA(1,1,2)(0,0,2) | 0.8842708 | 0.0430706 |
| S01 | V02 | ETS | ETS(M,A,M) | 29.4721891 | 0.0000000 |
| S02 | V02 | ARIMA | ARIMA(2,1,3)(1,0,0) with drift | 26.4875660 | 0.4086317 |
| S02 | V02 | ETS | ETS(M,N,M) | 26.9412580 | 0.0000000 |
| S02 | V03 | ARIMA | ARIMA(2,1,0)(0,0,0) | 26.4875660 | 0.1522174 |
| S02 | V03 | ETS | ETS(A,Ad,N) | 1.3036569 | 0.0001979 |
| S03 | V05 | ARIMA | ARIMA(1,1,1)(0,0,0) | 1.2912153 | 0.1560026 |
| S03 | V05 | ETS | ETS(M,N,A) | 1.2878573 | 0.1952189 |
| S03 | V07 | ARIMA | ARIMA(0,1,0)(0,0,0) | 1.2912153 | 0.5745615 |
| S03 | V07 | ETS | ETS(M,N,A) | 1.1834222 | 0.5413057 |
| S04 | V01 | ARIMA | ARIMA(0,1,0)(0,0,0) | 1.1895139 | 0.7438504 |
| S04 | V01 | ETS | ETS(M,N,N) | 1.1894769 | 0.1275537 |
| S04 | V02 | ARIMA | ARIMA(1,1,3)(1,0,1) | 1.1895139 | 0.0596569 |
| S04 | V02 | ETS | ETS(M,N,M) | 32.8855229 | 0.0000000 |
| S05 | V02 | ARIMA | ARIMA(2,1,2)(2,0,0) with drift | 18.9684041 | 0.7607213 |
| S05 | V02 | ETS | ETS(M,N,M) | 19.0649932 | 0.0000000 |
| S05 | V03 | ARIMA | ARIMA(0,1,1)(0,0,0) | 18.9684041 | 0.1429890 |
| S05 | V03 | ETS | ETS(A,N,N) | 0.7845041 | 0.0003387 |
| S06 | V05 | ARIMA | ARIMA(0,1,1)(0,0,2) | 1.0579635 | 0.3218907 |
| S06 | V05 | ETS | ETS(A,N,N) | 1.0584005 | 0.0487121 |
| S06 | V07 | ARIMA | ARIMA(1,1,1)(0,0,0) | 1.0579635 | 0.8572600 |
| S06 | V07 | ETS | ETS(A,N,N) | 1.0709812 | 0.3729529 |
# Manually choose best model for now
Selected.model <- c(1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0)
# Choose the model with the lower MAPE for each category/var combination
#dfr2 <- dfr %>%
# #filter(Ljung.Box > 0.05) %>%
# group_by(Category, Variable) %>%
# slice_min(MAPE)
dfr2 <- dfr %>%
arrange(Category, Variable, Model) %>%
cbind(Selected.model) %>%
filter(Selected.model==1) %>%
select(-Selected.model)
colnames(dfr2) <- c('Category', 'Variable', 'Model', 'Method', 'MAPE', 'Ljung.Box')
dfr2 %>%
kbl(caption='Best-performing models') %>%
kable_classic(full_width=F)| Category | Variable | Model | Method | MAPE | Ljung.Box |
|---|---|---|---|---|---|
| S01 | V01 | ARIMA | ARIMA(0,1,2)(0,0,0) with drift | 0.8842708 | 0.4395964 |
| S01 | V02 | ARIMA | ARIMA(1,1,2)(0,0,2) | 0.8842708 | 0.0430706 |
| S02 | V02 | ARIMA | ARIMA(2,1,3)(1,0,0) with drift | 26.4875660 | 0.4086317 |
| S02 | V03 | ARIMA | ARIMA(2,1,0)(0,0,0) | 26.4875660 | 0.1522174 |
| S03 | V05 | ETS | ETS(M,N,A) | 1.2878573 | 0.1952189 |
| S03 | V07 | ETS | ETS(M,N,A) | 1.1834222 | 0.5413057 |
| S04 | V01 | ARIMA | ARIMA(0,1,0)(0,0,0) | 1.1895139 | 0.7438504 |
| S04 | V02 | ARIMA | ARIMA(1,1,3)(1,0,1) | 1.1895139 | 0.0596569 |
| S05 | V02 | ARIMA | ARIMA(2,1,2)(2,0,0) with drift | 18.9684041 | 0.7607213 |
| S05 | V03 | ARIMA | ARIMA(0,1,1)(0,0,0) | 18.9684041 | 0.1429890 |
| S06 | V05 | ARIMA | ARIMA(0,1,1)(0,0,2) | 1.0579635 | 0.3218907 |
| S06 | V07 | ARIMA | ARIMA(1,1,1)(0,0,0) | 1.0579635 | 0.8572600 |
# Create variable to store forcasts
fc1 <- list()
fc2 <- list()
# Create data frame to store forecasts
dffc <- data.frame(matrix(nrow=0, ncol=3))
colnames(dffc) <- c('Category', 'Variable', 'Forecast')
for (i in seq(1, 6)) {
# First var in category
if (dfr2[2 * i - 1, 'Model'] == 'ETS') {
fc1[[i]] <- fit_ets1[[i]] %>% forecast(h=140)
} else {
fc1[[i]] <- fit_arima1[[i]] %>% forecast(h=140)
}
p1 <- fc1[[i]] %>%
autoplot() +
ylab('Forecasted units') +
ggtitle(paste0('Forecasts - category S0', i, ', VAR0', fcvars[[i]][1]))
print(p1)
# First var in category
if (dfr2[2 * i, 'Model'] == 'ETS') {
fc2[[i]] <- fit_ets2[[i]] %>% forecast(h=140)
} else {
fc2[[i]] <- fit_arima2[[i]] %>% forecast(h=140)
}
p2 <- fc2[[i]] %>%
autoplot() +
ylab('Forecasted units') +
ggtitle(paste0('Forecasts - category S0', i, ', VAR0', fcvars[[i]][2]))
print(p2)
# Store forecasts in df
dffc <- rbind(dffc, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][1]),
Forecast=data.frame(fc1[[i]])['Point.Forecast']
))
dffc <- rbind(dffc, data.frame(
Category=paste0('S0', i),
Variable=paste0('V0', fcvars[[i]][2]),
Forecast=data.frame(fc2[[i]])['Point.Forecast']
))
}#df_orig %>%
# filter(SeriesInd > 43021) %>%
# select(SeriesInd) %>%
#dffc %>%
# merge()
# Display a few rows
dffc %>%
head(10) %>%
kbl(caption='Forecast values (first 10 values)') %>%
kable_classic(full_width=F)| Category | Variable | Point.Forecast | |
|---|---|---|---|
| 41005.20 | S01 | V01 | 62.34758 |
| 41005.40 | S01 | V01 | 62.36178 |
| 41005.60 | S01 | V01 | 62.38303 |
| 41005.80 | S01 | V01 | 62.40428 |
| 41006.00 | S01 | V01 | 62.42553 |
| 41006.20 | S01 | V01 | 62.44679 |
| 41006.40 | S01 | V01 | 62.46804 |
| 41006.60 | S01 | V01 | 62.48929 |
| 41006.80 | S01 | V01 | 62.51054 |
| 41007.00 | S01 | V01 | 62.53179 |
# Write forecasts
#reference: https://www.statology.org/r-export-to-excel-multiple-sheets/
#export each data frame to separate sheets in same Excel file
#openxlsx::write.xlsx(forecasts, file = 'mydata.xlsx')