ippolito_data624

Load data

# Load data from git
GET('https://github.com/klgriffen96/summer23_data624/raw/main/project_1/Data%20Set%20for%20Class.xls', write_disk(tmpfile <- tempfile(fileext=".xls")))

## Response [https://raw.githubusercontent.com/klgriffen96/summer23_data624/main/project_1/Data%20Set%20for%20Class.xls]
##   Date: 2023-06-18 21:53
##   Status: 200
##   Content-Type: application/octet-stream
##   Size: 1.55 MB
## <ON DISK>  C:\Users\micha\AppData\Local\Temp\RtmpAnXvtA\file364c692339f6.xls

df_orig <- readxl::read_excel(tmpfile, skip=0)

Data preparation

# Set frequency
the_freq <- 5

# Remove blank observations at end
df <- df_orig %>%
    filter(SeriesInd <= 43021) %>%
    arrange(SeriesInd, category)

# Initial summary
summary(df)

##    SeriesInd       category             Var01            Var02          
##  Min.   :40669   Length:9732        Min.   :  9.03   Min.   :  1339900  
##  1st Qu.:41253   Class :character   1st Qu.: 23.10   1st Qu.: 12520675  
##  Median :41846   Mode  :character   Median : 38.44   Median : 21086550  
##  Mean   :41843                      Mean   : 46.98   Mean   : 37035741  
##  3rd Qu.:42430                      3rd Qu.: 66.78   3rd Qu.: 42486700  
##  Max.   :43021                      Max.   :195.18   Max.   :480879500  
##                                     NA's   :14       NA's   :2          
##      Var03            Var05            Var07       
##  Min.   :  8.82   Min.   :  8.99   Min.   :  8.92  
##  1st Qu.: 22.59   1st Qu.: 22.91   1st Qu.: 22.88  
##  Median : 37.66   Median : 38.05   Median : 38.05  
##  Mean   : 46.12   Mean   : 46.55   Mean   : 46.56  
##  3rd Qu.: 65.88   3rd Qu.: 66.38   3rd Qu.: 66.31  
##  Max.   :189.36   Max.   :195.00   Max.   :189.72  
##  NA's   :26       NA's   :26       NA's   :26

# Fill in the existing NAs with Infs so we can distinguish them later
df[is.na(df)] <- Inf

# The next steps create a complete series of data, filling in gaps in SeriesInd across all categories

# Create a sequence of values starting at the first value of SeriesInd and ending at the last
dftmp2 <- data.frame(SeriesInd=seq(from=min(df$SeriesInd), to=max(df$SeriesInd)))

# Create a tmp dataframe for categories
dftmp3 <- data.frame(category=c('S01', 'S02', 'S03', 'S04', 'S05', 'S06'))

# Create dataframe joining the complete set of SeriesInd to the categories
dftmp4 <- dftmp2 %>%
    merge(dftmp3, all=T) %>%
    arrange(SeriesInd, category)

# Join the original dataframe with the dataframe containing the full set of SeriesInd and categories
df2 <- dftmp4 %>%
    merge(df, by=c('SeriesInd', 'category'), all.x=T) %>%
    arrange(SeriesInd, category)

# Summary after creating complete dataframe
summary(df2)

##    SeriesInd       category             Var01           Var02         
##  Min.   :40669   Length:14118       Min.   : 9.03   Min.   : 1339900  
##  1st Qu.:41257   Class :character   1st Qu.:23.36   1st Qu.:12521025  
##  Median :41845   Mode  :character   Median :38.55   Median :21091200  
##  Mean   :41845                      Mean   :  Inf   Mean   :     Inf  
##  3rd Qu.:42433                      3rd Qu.:67.12   3rd Qu.:42518000  
##  Max.   :43021                      Max.   :  Inf   Max.   :     Inf  
##                                     NA's   :4386    NA's   :4386      
##      Var03           Var05           Var07      
##  Min.   : 8.82   Min.   : 8.99   Min.   : 8.92  
##  1st Qu.:22.68   1st Qu.:23.03   1st Qu.:23.04  
##  Median :37.83   Median :38.24   Median :38.23  
##  Mean   :  Inf   Mean   :  Inf   Mean   :  Inf  
##  3rd Qu.:66.33   3rd Qu.:66.75   3rd Qu.:66.78  
##  Max.   :  Inf   Max.   :  Inf   Max.   :  Inf  
##  NA's   :4386    NA's   :4386    NA's   :4386

# Define which variables should be forecast for each category
fcvars <- list(c(1, 2), c(2, 3), c(5, 7), c(1, 2), c(2, 3), c(5,7))

Exploratory data analysis

Examine missing values

# Look for missing values now that we have complete values for SeriesInd
summary(df2)

##    SeriesInd       category             Var01           Var02         
##  Min.   :40669   Length:14118       Min.   : 9.03   Min.   : 1339900  
##  1st Qu.:41257   Class :character   1st Qu.:23.36   1st Qu.:12521025  
##  Median :41845   Mode  :character   Median :38.55   Median :21091200  
##  Mean   :41845                      Mean   :  Inf   Mean   :     Inf  
##  3rd Qu.:42433                      3rd Qu.:67.12   3rd Qu.:42518000  
##  Max.   :43021                      Max.   :  Inf   Max.   :     Inf  
##                                     NA's   :4386    NA's   :4386      
##      Var03           Var05           Var07      
##  Min.   : 8.82   Min.   : 8.99   Min.   : 8.92  
##  1st Qu.:22.68   1st Qu.:23.03   1st Qu.:23.04  
##  Median :37.83   Median :38.24   Median :38.23  
##  Mean   :  Inf   Mean   :  Inf   Mean   :  Inf  
##  3rd Qu.:66.33   3rd Qu.:66.75   3rd Qu.:66.78  
##  Max.   :  Inf   Max.   :  Inf   Max.   :  Inf  
##  NA's   :4386    NA's   :4386    NA's   :4386

# Create df to hold missing value summary
dfmv <- data.frame(category=c(), var=c())

# Examine missing value count
for (i in seq(1, 6)) {
    for (j in seq(1, 2)) {
        gaps <- sum(is.na(df2[df2$category==paste0('S0', i), paste0('Var0', fcvars[[i]][j])]))
        inf_vals <- sum(is.infinite(df2[df2$category==paste0('S0', i), paste0('Var0', fcvars[[i]][j])]))
        total_vals <- length(df2[df2$category==paste0('S0', i), paste0('Var0', fcvars[[i]][j])])
        dfmv <- rbind(dfmv, c(paste0('S0', i), paste0('Var0', fcvars[[i]][j]), inf_vals, gaps))
    }
}
colnames(dfmv) <- c('category', 'var', 'missing.values', 'gaps')
dfmv %>%
    kbl(caption='Time series gaps and missing values') %>%
    kable_classic(full_width=F)

Time series gaps and missing values
category	var	missing.values	gaps
S01	Var01	2	731
S01	Var02	0	731
S02	Var02	0	731
S02	Var03	4	731
S03	Var05	4	731
S03	Var07	4	731
S04	Var01	2	731
S04	Var02	0	731
S05	Var02	1	731
S05	Var03	5	731
S06	Var05	5	731
S06	Var07	5	731

Gap analysis

We looked for patterns in the data set to determine if there was any seasonality and to evaluate where there were gaps. We assumed SeriesInd was an integer representing the number of days since a certain date, a common one being January 1, 1900 (the “origin” date). Using this logic, we converted SeriesInd to a date and examined where gaps occurred. Using an origin date of January 1, 1900 placed gaps on Friday and Saturday. It seemed more reasonable to assume that the series excluded weekend days, so we shifted the weekdays such that the gaps would fall on weekends.

It is also noted that a number of gaps fall on other weekdays, with the greatest number on Monday (32). This is consistent with a US-based calendar which includes a number of national holidays that fall on Monday (e.g. Memorial Day and Labor Day).

Because an origin date of January 1, 1900 yielded a week with gaps on Fridays and Saturdays, we performed a systematic search to find a reasonable origin that would align the gaps with Saturday and Sunday instead. Going under the assumption that this was a US calendar, we looked for origin dates that would place gaps on both January 1 and on December 25. The first such origin date that met these criteria was August 31, 1915, which would put our first data point in the year 2027. Therefore, we concluded that the data set either does not likely conform to a US calendar. We further concluded that the importance of the origin date was secondary and that the key observation was that the data is likely based on a seven-day week, with regular gaps on exactly two of those days. It is possible these are stock market prices, sales figures, or another such weekly metric.

We also performed a gap analysis on the prediction set to aid in determining whether “weekend” values would need to be imputed (filled). As shown in the table, there are no weekend days in the prediction set.

# Look for patterns in gaps in SeriesInd; first make a copy of df == dfga (gap analysis).
# Filter by just a single category since this will give us a single set of SeriesInd values,
# and the NAs in each category will all be in the same positions so it doesn't matter which category we choose.
# Work from the theory that these are days of the week, and that that SeriesInd is the number of days since Jan 1, 1900.
dfga <- df2 %>%
    filter(category=='S01') %>%
    mutate(date=as.Date(SeriesInd, origin='1900-01-01')+1) %>%
    mutate(SeriesInd.mod7=SeriesInd %% 7) %>%
    mutate(Day.of.week=weekdays(date)) %>%
    group_by(SeriesInd.mod7, Day.of.week) %>%
    summarize(Gaps=sum(is.na(Var01)), Filled.vals=sum(!is.na(Var01)), .groups='keep') %>%
    ungroup() %>%
    arrange(SeriesInd.mod7) %>%
    select(-SeriesInd.mod7)
dfga %>%
    kbl(caption='Gap analysis') %>%
    kable_classic(full_width = F)

Gap analysis
Day.of.week	Gaps	Filled.vals
Tuesday	3	333
Wednesday	3	333
Thursday	9	327
Friday	12	324
Saturday	336	0
Sunday	336	0
Monday	32	305

# Look at gaps in the SeriesInd values we need to predict;
# first fill in values with -1 so we know which ones are missing
df3 <- df_orig %>%
    filter(SeriesInd > 43021) %>%
    arrange(SeriesInd, category) %>%
    select(SeriesInd, category, Var01) %>%
    mutate(Var01=-1)

# Create a sequence of values starting at the first value of SeriesInd and ending at the last
dftmp2 <- data.frame(SeriesInd=seq(from=min(df3$SeriesInd), to=max(df3$SeriesInd)))

# Create a tmp dataframe for categories
dftmp3 <- data.frame(category=c('S01', 'S02', 'S03', 'S04', 'S05', 'S06'))

# Create dataframe joining the complete set of SeriesInd to the categories
dftmp4 <- dftmp2 %>%
    merge(dftmp3, all=T) %>%
    arrange(SeriesInd, category)

# Join the original dataframe with the dataframe containing the full set of SeriesInd and categories
df3 <- dftmp4 %>%
    merge(df3, by=c('SeriesInd', 'category'), all.x=T) %>%
    arrange(SeriesInd, category)

# Examine gaps
dfga2 <- df3 %>%
    filter(category=='S01') %>%
    mutate(date=as.Date(SeriesInd, origin='1900-01-01')+1) %>%
    mutate(SeriesInd.mod7=SeriesInd %% 7) %>%
    mutate(Day.of.week=weekdays(date)) %>%
    group_by(SeriesInd.mod7, Day.of.week) %>%
    summarize(Gaps=sum(is.na(Var01)), Filled.vals=sum(!is.na(Var01)), .groups='keep') %>%
    ungroup() %>%
    arrange(SeriesInd.mod7) %>%
    select(-SeriesInd.mod7)
dfga2 %>%
    kbl(caption='Gap analysis - prediction set') %>%
    kable_classic(full_width = F)

Gap analysis - prediction set
Day.of.week	Gaps	Filled.vals
Tuesday	0	29
Wednesday	0	29
Thursday	1	28
Friday	0	29
Saturday	28	0
Sunday	28	0
Monday	3	25

Fill in NAs in 5-day cycle

# Create full 5-day set of SeriesInds
df4 <- df2 %>%
    filter(category=='S01') %>%
    mutate(date=as.Date(SeriesInd, origin='1900-01-01')+1) %>%
    mutate(SeriesInd.mod7=SeriesInd %% 7) %>%
    mutate(Day.of.week=weekdays(date)) %>%
    filter(Day.of.week != 'Saturday' & Day.of.week != 'Sunday') %>%
    select(SeriesInd, category)
for (i in df4$SeriesInd) {
    for (j in seq(2, 6)) {
       df4 <- rbind(df4, data.frame(SeriesInd=i, category=paste0('S0', j)))
    }
}

# Join the original dataframe with the dataframe containing the full set of SeriesInd and categories
df2 <- df %>%
    merge(df4, by=c('SeriesInd', 'category'), all=T) %>%
    arrange(SeriesInd, category)
summary(df2)

##    SeriesInd       category             Var01           Var02         
##  Min.   :40669   Length:10086       Min.   : 9.03   Min.   : 1339900  
##  1st Qu.:41257   Class :character   1st Qu.:23.36   1st Qu.:12521025  
##  Median :41845   Mode  :character   Median :38.55   Median :21091200  
##  Mean   :41844                      Mean   :  Inf   Mean   :     Inf  
##  3rd Qu.:42433                      3rd Qu.:67.12   3rd Qu.:42518000  
##  Max.   :43021                      Max.   :  Inf   Max.   :     Inf  
##                                     NA's   :354     NA's   :354       
##      Var03           Var05           Var07      
##  Min.   : 8.82   Min.   : 8.99   Min.   : 8.92  
##  1st Qu.:22.68   1st Qu.:23.03   1st Qu.:23.04  
##  Median :37.83   Median :38.24   Median :38.23  
##  Mean   :  Inf   Mean   :  Inf   Mean   :  Inf  
##  3rd Qu.:66.33   3rd Qu.:66.75   3rd Qu.:66.77  
##  Max.   :  Inf   Max.   :  Inf   Max.   :  Inf  
##  NA's   :354     NA's   :354     NA's   :354

Plot time series

# To work with the data more easily, initialize list to hold dataframes, one for each category
dfcat = list()

# Filter by category, selecting only those vars we're interested in for that category
for (i in seq(1, 6)) {
    dfcat[[i]] <- df2 %>%
        filter(category == paste0('S0', i)) %>%
        select(SeriesInd, !!paste0('Var0', fcvars[[i]][1]), !!paste0('Var0', fcvars[[i]][2]))
}

# Initialize ts objects; each element in each list will correspond to a category; e.g. ts1[[1]] will be S01, etc
ts1 = list()
ts2 = list()

# Set start date to first date in series
start_date <- df2$SeriesInd[[1]]

# Iterate over categories
for (i in seq(1, 6)) {

    # Create var names for the two variables we're interested in for this category
    varname1 <- paste0('Var0', fcvars[[i]][1])
    varname2 <- paste0('Var0', fcvars[[i]][2])

    # Create time series for each variable
    #ts1[[i]] <- ts(dfcat[[i]][dfcat[[i]][varname1] != Inf, varname1], frequency=the_freq, start=start_date)
    #ts2[[i]] <- ts(dfcat[[i]][dfcat[[i]][varname2] != Inf, varname2], frequency=the_freq, start=start_date)
    ts1[[i]] <- ts(dfcat[[i]][[varname1]], frequency=the_freq, start=start_date)
    ts2[[i]] <- ts(dfcat[[i]][[varname2]], frequency=the_freq, start=start_date)

    # Plot the time series
    p1a <- ts1[[i]] %>%
        autoplot() +
        ggtitle(paste0('Category S0', i, ', ', varname1)) +
        ylab(varname1)
    p1b <- dfcat[[i]] %>%
        filter(dfcat[[i]][!!varname1] != Inf & !is.na(dfcat[[i]][!!varname1])) %>%
        ggplot() +
        geom_histogram(aes(x=eval(sym(varname1))), bins=30) +
        ggtitle(paste0('Category S0', i, ', ', varname1)) +
        xlab(varname1)
    grid.arrange(p1a, p1b, ncol=2)
    p2a <- ts2[[i]] %>%
        autoplot() +
        ggtitle(paste0('Category S0', i, ', ', varname2)) +
        ylab(varname2)
    p2b <- dfcat[[i]] %>%
        filter(dfcat[[i]][!!varname2] != Inf & !is.na(dfcat[[i]][!!varname2])) %>%
        ggplot() +
        geom_histogram(aes(x=eval(sym(varname2))), bins=30) +
        ggtitle(paste0('Category S0', i, ', ', varname2)) +
        xlab(varname2)
    grid.arrange(p2a, p2b, ncol=2)

}

# Plot time series on one graph
p1 <- dfcat[[1]] %>%
    ggplot(aes(x=SeriesInd)) +
    geom_line(aes(y=Var01, color='Var01')) +
    geom_line(aes(y=Var02 / 500000, color='Var02')) +
    scale_y_continuous(sec.axis=sec_axis(~ . * 500000, name='Var02')) + 
    scale_color_manual(values=c('black', 'darkred')) +
    ggtitle('Category S01')
p2 <- dfcat[[2]] %>%
    ggplot(aes(x=SeriesInd)) +
    geom_line(aes(y=Var02, color='Var02')) +
    geom_line(aes(y=Var03 * 10000000, color='Var03')) +
    scale_y_continuous(sec.axis=sec_axis(~ . / 10000000, name='Var03')) + 
    scale_color_manual(values=c('black', 'darkred')) +
    ggtitle('Category S02')
p3 <- dfcat[[3]] %>%
    ggplot(aes(x=SeriesInd)) +
    geom_line(aes(y=Var05, color='Var05')) +
    geom_line(aes(y=Var07, color='Var07')) +
    scale_y_continuous(sec.axis=sec_axis(~ ., name='Var07')) + 
    scale_color_manual(values=c('black', 'darkred')) +
    ggtitle('Category S03')
p4 <- dfcat[[4]] %>%
    ggplot(aes(x=SeriesInd)) +
    geom_line(aes(y=Var01, color='Var01')) +
    geom_line(aes(y=Var02 / 1000000, color='Var02')) +
    scale_y_continuous(sec.axis=sec_axis(~ . * 1000000, name='Var02')) + 
    scale_color_manual(values=c('black', 'darkred')) +
    ggtitle('Category S04')
p5 <- dfcat[[5]] %>%
    ggplot(aes(x=SeriesInd)) +
    geom_line(aes(y=Var02, color='Var02')) +
    geom_line(aes(y=Var03 * 1000000, color='Var03')) +
    scale_y_continuous(sec.axis=sec_axis(~ . / 1000000, name='Var03')) + 
    scale_color_manual(values=c('black', 'darkred')) +
    ggtitle('Category S05')
p6 <- dfcat[[6]] %>%
    ggplot(aes(x=SeriesInd)) +
    geom_line(aes(y=Var05, color='Var05')) +
    geom_line(aes(y=Var07, color='Var07')) +
    scale_y_continuous(sec.axis=sec_axis(~ ., name='Var07')) + 
    scale_color_manual(values=c('black', 'darkred')) +
    ggtitle('Category S06')
#grid.arrange(p1, p2, p3, p4, p5, p6, nrow=6, ncol=1)
grid.arrange(p1, p2, p3, nrow=3, ncol=1)

grid.arrange(p4, p5, p6, nrow=3, ncol=1)

Scatterplot matrix

# Look for correlation between variables
GGally::ggpairs(df2[,3:7], progress=F)

## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

An additional step in exploratory data analysis that is often helpful to better understand the data set is to generate pairwise plots showing the relationship between predictors. Based on these pairwise plots (see below), we observed an extremely high degree of correlation between some predictors. Notably, variables Var01, Var03, Var05, and Var07 contain very similar values, suggesting that any missing values of one might be imputed using existing values of another. Without the context of what these variables represent, it is difficult to speculate on why they might be correlated as such, but it is possible that the data were collected by different observers using similar but slightly different methodologies or techniques.

Outliers

We discovered some obvious outliers in the data. Outliers adversely affect forecasts and, as such, should be either removed or replaced. Notably,outliers were discovered in the following variables, with four other variables having questionable values.

One such example is in Category S06, variable V07, which clearly exhibits an outlying value of approximately 190 about a quarter of the way into the series.

# Create list to hold outlier counts
dfout <- data.frame()

# Look for outliers using decomposition plots
for (i in seq(1, 6)) {

    # Create var names for the two variables we're interested in for this category
    varname1 <- paste0('Var0', fcvars[[i]][1])
    varname2 <- paste0('Var0', fcvars[[i]][2])
    
    # Filter out infinite values
    dftmp1 <- dfcat[[i]] %>%
        filter(!is.infinite(eval(sym(varname1)))) %>%
        filter(!is.na(eval(sym(varname1))))
    dftmp2 <- dfcat[[i]] %>%
        filter(!is.infinite(eval(sym(varname2)))) %>%
        filter(!is.na(eval(sym(varname2))))
    
    # Create time series for each variable using xts; SeriesInd appears to be the days since 1900-01-01
    ts1[[i]] <- ts(dftmp1[[varname1]], frequency=the_freq, start=start_date)
    ts2[[i]] <- ts(dftmp2[[varname2]], frequency=the_freq, start=start_date)

    if (the_freq > 1) {
        
        p1 <- ts1[[i]] %>%
            decompose(type='additive') %>%
            autoplot() +
            ggtitle(paste0('Category S0', i, ', Var0', fcvars[[i]][1]))
        p2 <- ts2[[i]] %>%
            decompose(type='additive') %>%
            autoplot() +
            ggtitle(paste0('Category S0', i, ', Var0', fcvars[[i]][2]))
        grid.arrange(p1, p2, ncol=1, nrow=2)
        
    }
    
    # Count of outliers > 3 SD
    outct <- length(ts1[[i]][ts1[[i]] > mean(ts1[[i]]) + 3 * sd(ts1[[i]]) | ts1[[i]] < mean(ts1[[i]]) - 3 * sd(ts1[[i]])])
    dfout <- rbind(dfout, data.frame(
        Category=paste0('S0', i),
        Variable=paste0('Var0', fcvars[[i]][1]),
        Outliers=outct
    ))
    outct <- length(ts2[[i]][ts2[[i]] > mean(ts2[[i]]) + 3 * sd(ts2[[i]]) | ts2[[i]] < mean(ts2[[i]]) - 3 * sd(ts2[[i]])])
    dfout <- rbind(dfout, data.frame(
        Category=paste0('S0', i),
        Variable=paste0('Var0', fcvars[[i]][2]),
        Outliers=outct
    ))
    
}

# Show outlier count
dfout %>%
    kbl(caption='Outliers beyond 3 standard deviations from the mean') %>%
    kable_classic(full_width=F)

Outliers beyond 3 standard deviations from the mean
Category	Variable	Outliers
S01	Var01	0
S01	Var02	19
S02	Var02	33
S02	Var03	1
S03	Var05	0
S03	Var07	0
S04	Var01	0
S04	Var02	27
S05	Var02	24
S05	Var03	0
S06	Var05	1
S06	Var07	1

Impute missing values and outliers

# Create new ts objects that will have imputed values
tsnew1 <- list()
tsnew2 <- list()

for(i in seq(1, 6)) {
    
    # Create var names for the two variables we're interested in for this category
    varname1 <- paste0('Var0', fcvars[[i]][1])
    varname2 <- paste0('Var0', fcvars[[i]][2])
    
    # Convert Infs to NAs
    dftmp <- dfcat[[i]]
    print(paste0("    changing ", sum(is.infinite(dftmp[[varname1]])), " infinite values to NA for ", varname1))
    print(paste0("    changing ", sum(is.infinite(dftmp[[varname2]])), " infinite values to NA for ", varname2))
    dftmp[varname1] = ifelse(is.infinite(dftmp[[varname1]]), NA, dftmp[[varname1]])
    dftmp[varname2] = ifelse(is.infinite(dftmp[[varname2]]), NA, dftmp[[varname2]])

    # Create time series for each variable
    tsnew1[[i]] <- ts(dftmp[[varname1]], frequency=the_freq, start=start_date)
    tsnew2[[i]] <- ts(dftmp[[varname2]], frequency=the_freq, start=start_date)

    # Replace missing values
    tsnew1[[i]] <- tsnew1[[i]] %>%
        tsclean(replace.missing=T, lambda='auto')
    tsnew2[[i]] <- tsnew2[[i]] %>%
        tsclean(replace.missing=T, lambda='auto')

    if (the_freq > 1) {
        
        # Decomp plots
        p1 <- tsnew1[[i]] %>%
            decompose(type='additive') %>%
            autoplot()
        p2 <- tsnew2[[i]] %>%
            decompose(type='additive') %>%
            autoplot()
        grid.arrange(p1, p2, ncol=1, nrow=2)
        
    }

}

## [1] "    changing 2 infinite values to NA for Var01"
## [1] "    changing 0 infinite values to NA for Var02"

## [1] "    changing 0 infinite values to NA for Var02"
## [1] "    changing 4 infinite values to NA for Var03"

## [1] "    changing 4 infinite values to NA for Var05"
## [1] "    changing 4 infinite values to NA for Var07"

## [1] "    changing 2 infinite values to NA for Var01"
## [1] "    changing 0 infinite values to NA for Var02"

## [1] "    changing 1 infinite values to NA for Var02"
## [1] "    changing 5 infinite values to NA for Var03"

## [1] "    changing 5 infinite values to NA for Var05"
## [1] "    changing 5 infinite values to NA for Var07"

# Pre- and post-imputation examples
i <- 6
p1 <- ts2[[i]] %>%
    autoplot() +
    ggtitle(paste0('Pre-imputation, category S0', i, ', ', varname2)) +
    ylab(varname2)
p2 <- tsnew2[[i]] %>%
    autoplot() +
    ggtitle(paste0('Post-imputation, category S0', i, ', ', varname2)) +
    ylab(varname2)
grid.arrange(p1, p2, nrow=1, ncol=2)

Compare plots pre- and post-interpolation

if (the_freq > 1) {

    # Compare plots of pre- and post-interpolation
    for (i in seq(1, 6)) {
    
        # First var
        p1 <- (ts1[[i]] %>% decompose(type='additive')) %>%
            autoplot() +
            ggtitle(paste0('Pre-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][1])))
        p2 <- (tsnew1[[i]] %>% decompose(type='additive')) %>%
            autoplot() +
            ggtitle(paste0('Post-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][1])))
        grid.arrange(p1, p2, ncol=1, nrow=2)
        
        # Second var
        p3 <- (ts2[[i]] %>% decompose(type='additive')) %>%
            autoplot() +
            ggtitle(paste0('Pre-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][2])))
        p4 <- (tsnew2[[i]] %>% decompose(type='additive')) %>%
            autoplot() +
            ggtitle(paste0('Post-interpolation - Category S0', i, ', Var0', paste0(fcvars[[i]][2])))
        grid.arrange(p3, p4, ncol=1, nrow=2)
    
    }

}

Lag plots

# Lag plots - I don't think these would be useful
for (i in seq(1, 6)) {
    
    p1 <- ts1[[i]] %>%
        gglagplot() +
        ggtitle(paste0('Lag plot - category S0', i, ', Var0', fcvars[[i]][1]))
    p2 <- ts2[[i]] %>%
        gglagplot() +
        ggtitle(paste0('Lag plot - category S0', i, ', Var0', fcvars[[i]][2]))
    grid.arrange(p1, p2, ncol=2, nrow=1)
    
}

Seasonal subseries plots

if (the_freq > 1) {
    
    # Seasonal subseries plots
    for (i in seq(1, 6)) {
        
        p1 <- tsnew1[[i]] %>%
            ggsubseriesplot(main=paste0('Category S0', i, ', Var0', fcvars[[i]][1]))
        p2 <- tsnew2[[i]] %>%
            ggsubseriesplot(main=paste0('Category S0', i, ', Var0', fcvars[[i]][2]))
        grid.arrange(p1, p2, ncol=2, nrow=1)
    
    }
    
}

Seasonal plots

if (the_freq > 1) {

    # I don't think these will be useful
    for (i in seq(1, 6)) {
        p1 <- tsnew1[[i]] %>%
            head(100) %>%
            ggseasonplot()
        p2 <- tsnew2[[i]] %>%
            head(100) %>%
            ggseasonplot()
        grid.arrange(p1, p2, nrow=2, ncol=1)
    }
    
}

Decomposition

if (the_freq > 1) {
    
    # Decomposition
    for (i in seq(1, 6)) {
    
        p1 <- tsnew1[[i]] %>% 
            head(3000) %>%
            stl(s.window='periodic') %>%
            autoplot() +
            ggtitle(paste0('STL decomposition - category S0', i, ', Var0', paste0(fcvars[[i]][1])))
        p2 <- tsnew2[[i]] %>% 
            head(3000) %>%
            stl(s.window='periodic') %>%
            autoplot() +
            ggtitle(paste0('STL decomposition - Category S0', i, ', Var0', paste0(fcvars[[i]][2])))
        grid.arrange(p1, p2, ncol=1, nrow=2)
    
    }
    
}

ACF/PACF plots

Some types of models are sensitive to data that is autocorrelated, that is, data that contains values which are related to previous values in some regular or predictable way. These models require that the data be modified such that they are “stationary,” meaning that they appear to be randomly distributed and when plotted look like “white noise.”

To identify autocorrelation patterns in the data, autocorrelation function (ACF) and partial autocorrelation function (PACF) plots can be constructed. These plots illustrate the relationship between lagged time series values, i.e. comparing one value with the the next value in the series, or the values two or more positions later. Examining the patterns in the ACF and PACF plots helps the modeler determine what parameters to use as a basis when modeling.

ACF and PACF plots aid in evaluating whether the data is autocorrelated and, if so, whether it should be modified before modeling occurs. One such modification is “differencing,” which converts time series data into the change in value over time. Once the data is “differenced,” it is no longer autocorrelated, and the time series should appear to be “white noise.” Likewise, the ACF and PACF plots should exhibit no clear trend or pattern.

As shown in the figures, there is some trending to most variables that would indicate that differencing is needed prior to modeling. One possible exception is Var02 in categories

# ACF/PACF plots - needed for ARIMA modeling
for (i in seq(1, 6)) {

    p1 <- tsnew1[[i]] %>%
        ggtsdisplay(plot.type='partial', main=paste0('Category S0', i, ', Var0', fcvars[[i]][1]))
    p2 <- tsnew2[[i]] %>%
        ggtsdisplay(plot.type='partial', main=paste0('Category S0', i, ', Var0', fcvars[[i]][2]))
    p1
    p2
    
}

# Init df to hold kpss test stats to determine whether differencing is needed
dfdiffs <- data.frame(matrix(nrow=0, ncol=6))
colnames(dfdiffs) = c('Category', 'Variable', 'KPSS.test.statistic', 'Differencing.required', 
                      'Number.of.seasonal.differences', 'Seasonal.differencing.required')

# Differencing - all need differencing except Var02
for (i in seq(1, 6)) {

    # First variable in this category
    tmp_kpss <- summary(tsnew1[[i]] %>% ur.kpss())@teststat
    if (the_freq > 1) {
        tmp_nsdiffs <- tsnew1[[i]] %>% nsdiffs()
    }
    else {
        tmp_nsdiffs <- 0
    }
    diff_req <- 'no'
    seas_diff_req <- 'no'
    if (tmp_kpss > 1) {  # test statistic is given in percent, if it exceeds 1%, differencing is required
        diff_req <- 'yes'
    }
    if (tmp_nsdiffs > 0) {  # nsdiffs() gives the number of seasonal differences required
        seas_diff_req <- 'yes'
    }
    dfdiffs <- rbind(dfdiffs, data.frame(
        Category=paste0('S0', i),
        Variable=paste0('V0', fcvars[[i]][1]),
        KPSS.test.statistic=tmp_kpss,
        Differencing.required=diff_req,
        Number.of.seasonal.differences=tmp_nsdiffs,
        Seasonal.differencing.required=seas_diff_req
    ))

    # Second variable in this category
    tmp_kpss <- summary(tsnew2[[i]] %>% ur.kpss())@teststat
    if (the_freq > 1) {
        tmp_nsdiffs <- tsnew2[[i]] %>% nsdiffs()
    }
    else {
        tmp_nsdiffs <- 0
    }
    diff_req <- 'no'
    seas_diff_req <- 'no'
    if (tmp_kpss > 1) {  # test statistic is given in percent, if it exceeds 1%, differencing is required
        diff_req <- 'yes'
    }
    if (tmp_nsdiffs > 0) {  # nsdiffs() gives the number of seasonal differences required
        seas_diff_req <- 'yes'
    }
    dfdiffs <- rbind(dfdiffs, data.frame(
        Category=paste0('S0', i),
        Variable=paste0('V0', fcvars[[i]][1]),
        KPSS.test.statistic=tmp_kpss,
        Differencing.required=diff_req,
        Number.of.seasonal.differences=tmp_nsdiffs,
        Seasonal.differencing.required=seas_diff_req
    ))

}

# Show table
dfdiffs %>%
    kbl(caption='Differencing requirements') %>%
    kable_classic(full_width=F)

Differencing requirements
Category	Variable	KPSS.test.statistic	Differencing.required	Seasonal.differencing.required
S01	V01	16.630429	yes	no
S01	V01	10.706258	yes	no
S02	V02	10.016950	yes	no
S02	V02	4.219967	yes	no
S03	V05	15.215024	yes	no
S03	V05	15.231207	yes	no
S04	V01	15.184888	yes	no
S04	V01	1.291808	yes	no
S05	V02	8.348904	yes	no
S05	V02	8.311704	yes	no
S06	V05	17.360578	yes	no
S06	V05	17.359510	yes	no

# Create new list to hold differenced ts objects
tsdiff1 <- list()
tsdiff2 <- list()

# Now actually do the differencing
for (i in seq(1, 6)) {
    
    # First variable in category i
    tsdiff1[[i]] <- diff(tsnew1[[i]])
    tmp_kpss <- summary(tsdiff1[[i]] %>% ur.kpss())@teststat
    dfdiffs[2 * i - 1, 'KPSS.test.statistic'] <- tmp_kpss
    diff_req <- 'no'
    if (tmp_kpss > 1) {  # test statistic is given in percent, if it exceeds 1%, differencing is required
        diff_req <- 'yes'
    }
    dfdiffs[2 * i - 1, 'Differencing.required'] <- diff_req

    # Second variable in category i
    tsdiff2[[i]] <- diff(tsnew1[[i]])
    tmp_kpss <- summary(tsdiff1[[i]] %>% ur.kpss())@teststat
    dfdiffs[2 * i, 'KPSS.test.statistic'] <- tmp_kpss
    diff_req <- 'no'
    if (tmp_kpss > 1) {  # test statistic is given in percent, if it exceeds 1%, differencing is required
        diff_req <- 'yes'
    }
    dfdiffs[2 * i, 'Differencing.required'] <- diff_req
}

# Show updated table
dfdiffs %>%
    kbl(caption='Differencing requirements') %>%
    kable_classic(full_width=F)

Differencing requirements
Category	Variable	KPSS.test.statistic	Differencing.required	Seasonal.differencing.required
S01	V01	0.1001108	no	no
S01	V01	0.1001108	no	no
S02	V02	0.0039713	no	no
S02	V02	0.0039713	no	no
S03	V05	0.1252398	no	no
S03	V05	0.1252398	no	no
S04	V01	0.1280270	no	no
S04	V01	0.1280270	no	no
S05	V02	0.0047819	no	no
S05	V02	0.0047819	no	no
S06	V05	0.1050035	no	no
S06	V05	0.1050035	no	no

Modeling

# Prepare data frame for results
#dfr <- data.frame(matrix(nrow=0, ncol=11))
#colnames(dfr) <- c('category', 'var', 'model', 'method', 'ME', 'RMSE', 'MAE', 'MPE', 'MAPE', 'MASE', 'ACF1')
dfr <- data.frame(matrix(nrow=0, ncol=6))
colnames(dfr) <- c('Category', 'Variable', 'Model', 'Method', 'MAPE', 'Ljung.Box')

Exponential smoothing

# Create list to store ETS fit
fit_ets1 <- list()
fit_ets2 <- list()

# ETS
for (i in seq(1, 6)) {
    
    # First variable in category i
    fit_ets1[[i]] <- ets(tsnew1[[i]])
    dfr <- rbind(dfr, data.frame(
        Category=paste0('S0', i), 
        Variable=paste0('V0', fcvars[[i]][1]), 
        Model='ETS', 
        Method=fit_ets1[[i]]$method, 
        MAPE=accuracy(fit_ets1[[i]])[5],
        Ljung.Box=0  # temp, will fill in later when calculating residuals
    ))
    
    # Second variable in category i
    fit_ets2[[i]] <- ets(tsnew2[[i]])
    dfr <- rbind(dfr, data.frame(
        Category=paste0('S0', i), 
        Variable=paste0('V0', fcvars[[i]][2]), 
        Model='ETS', 
        Method=fit_ets2[[i]]$method,
        MAPE=accuracy(fit_ets2[[i]])[5],
        Ljung.Box=0  # temp, will fill in later when calculating residuals
    ))
    
}

# Display residual plots
for (i in seq(1, 6)) {
    tmp_res <- checkresiduals(fit_ets1[[i]])
    dfr[i * 2 - 1, 'Ljung.Box'] <- tmp_res$p.value
    tmp_res <- checkresiduals(fit_ets2[[i]])
    dfr[i * 2, 'Ljung.Box'] <- tmp_res$p.value
}

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 20.285, df = 10, p-value = 0.02667
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,A,M)
## Q* = 66.04, df = 10, p-value = 2.562e-10
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 66.585, df = 10, p-value = 2.015e-10
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,Ad,N)
## Q* = 33.823, df = 10, p-value = 0.0001979
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,A)
## Q* = 13.536, df = 10, p-value = 0.1952
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,A)
## Q* = 8.9033, df = 10, p-value = 0.5413
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,N)
## Q* = 15.125, df = 10, p-value = 0.1276
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 54.629, df = 10, p-value = 3.703e-08
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(M,N,M)
## Q* = 65.926, df = 10, p-value = 2.695e-10
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 32.435, df = 10, p-value = 0.0003387
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 18.391, df = 10, p-value = 0.04871
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ETS(A,N,N)
## Q* = 10.804, df = 10, p-value = 0.373
## 
## Model df: 0.   Total lags used: 10

For the most part, residuals look like white noise and are normally distributed.

# Display results
dfr %>%
    kbl(caption='ETS Modeling Results') %>%
    kable_classic(full_width=F)

ETS Modeling Results
Category	Variable	Model	Method	MAPE	Ljung.Box
S01	V01	ETS	ETS(M,N,N)	0.8894790	0.0266735
S01	V02	ETS	ETS(M,A,M)	29.4721891	0.0000000
S02	V02	ETS	ETS(M,N,M)	26.9412580	0.0000000
S02	V03	ETS	ETS(A,Ad,N)	1.3036569	0.0001979
S03	V05	ETS	ETS(M,N,A)	1.2878573	0.1952189
S03	V07	ETS	ETS(M,N,A)	1.1834222	0.5413057
S04	V01	ETS	ETS(M,N,N)	1.1894769	0.1275537
S04	V02	ETS	ETS(M,N,M)	32.8855229	0.0000000
S05	V02	ETS	ETS(M,N,M)	19.0649932	0.0000000
S05	V03	ETS	ETS(A,N,N)	0.7845041	0.0003387
S06	V05	ETS	ETS(A,N,N)	1.0584005	0.0487121
S06	V07	ETS	ETS(A,N,N)	1.0709812	0.3729529

ARIMA modeling

# Create list to store ETS fit
fit_arima1 <- list()
fit_arima2 <- list()

# Function to return friendly name of ARIMA method using the fit returned by the model
ret_arima_name <- function (fit) {
    tmp_name <- paste0(
        'ARIMA(', fit$arma[1], 
        ',', fit$arma[6],
        ',', fit$arma[2],
        ')(', fit$arma[3],
        ',',  fit$arma[7],
        ',',  fit$arma[4],
        ')'
    )
    if ('drift' %in% names(fit$coef)) {
        tmp_name <- paste(tmp_name, ' with drift')
    }
    return(tmp_name)
}

# ARIMA modeling
for (i in seq(1, 6)) {
    
    fit_arima1[[i]] <- auto.arima(tsnew1[[i]])
    dfr <- rbind(dfr, data.frame(
        Category=paste0('S0', i), 
        Variable=paste0('V0', fcvars[[i]][1]), 
        Model='ARIMA', 
        Method=ret_arima_name(fit_arima1[[i]]), 
        MAPE=accuracy(fit_arima1[[i]])[5],
        Ljung.Box=0  # temp, will fill in later when calculating residuals
    ))
    fit_arima2[[i]] <- auto.arima(tsnew2[[i]])
    dfr <- rbind(dfr, data.frame(
        Category=paste0('S0', i), 
        Variable=paste0('V0', fcvars[[i]][2]), 
        Model='ARIMA', 
        Method=ret_arima_name(fit_arima2[[i]]),
        MAPE=accuracy(fit_arima1[[i]])[5],
        Ljung.Box=0  # temp, will fill in later when calculating residuals
    ))
    
}

# Display residual plots
for (i in seq(1, 6)) {
    tmp_res <- checkresiduals(fit_arima1[[i]])
    dfr[i * 2 - 1 + 12, 'Ljung.Box'] <- tmp_res$p.value
    tmp_res <- checkresiduals(fit_arima2[[i]])
    dfr[i * 2 + 12, 'Ljung.Box'] <- tmp_res$p.value
}

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,2) with drift
## Q* = 7.9375, df = 8, p-value = 0.4396
## 
## Model df: 2.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,2)(0,0,2)[5]
## Q* = 11.455, df = 5, p-value = 0.04307
## 
## Model df: 5.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,3)(1,0,0)[5] with drift
## Q* = 3.9806, df = 4, p-value = 0.4086
## 
## Model df: 6.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,0)
## Q* = 11.977, df = 8, p-value = 0.1522
## 
## Model df: 2.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)
## Q* = 11.894, df = 8, p-value = 0.156
## 
## Model df: 2.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 8.5574, df = 10, p-value = 0.5746
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,0)
## Q* = 6.8036, df = 10, p-value = 0.7439
## 
## Model df: 0.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,3)(1,0,1)[5]
## Q* = 9.0584, df = 4, p-value = 0.05966
## 
## Model df: 6.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(2,1,2)(2,0,0)[5] with drift
## Q* = 1.8642, df = 4, p-value = 0.7607
## 
## Model df: 6.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)
## Q* = 13.457, df = 9, p-value = 0.143
## 
## Model df: 1.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(0,1,1)(0,0,2)[5]
## Q* = 8.1228, df = 7, p-value = 0.3219
## 
## Model df: 3.   Total lags used: 10

## 
##  Ljung-Box test
## 
## data:  Residuals from ARIMA(1,1,1)
## Q* = 3.9985, df = 8, p-value = 0.8573
## 
## Model df: 2.   Total lags used: 10

As with the ETS models, residuals look like white noise and are normally distributed.

# Display results
dfr %>%
    arrange(Category, Variable, Model) %>%
    kbl(caption='Modeling Results - ETS and ') %>%
    kable_classic(full_width=F)

Modeling Results - ETS and
Category	Variable	Model	Method	MAPE	Ljung.Box
S01	V01	ARIMA	ARIMA(0,1,2)(0,0,0) with drift	0.8842708	0.4395964
S01	V01	ETS	ETS(M,N,N)	0.8894790	0.0266735
S01	V02	ARIMA	ARIMA(1,1,2)(0,0,2)	0.8842708	0.0430706
S01	V02	ETS	ETS(M,A,M)	29.4721891	0.0000000
S02	V02	ARIMA	ARIMA(2,1,3)(1,0,0) with drift	26.4875660	0.4086317
S02	V02	ETS	ETS(M,N,M)	26.9412580	0.0000000
S02	V03	ARIMA	ARIMA(2,1,0)(0,0,0)	26.4875660	0.1522174
S02	V03	ETS	ETS(A,Ad,N)	1.3036569	0.0001979
S03	V05	ARIMA	ARIMA(1,1,1)(0,0,0)	1.2912153	0.1560026
S03	V05	ETS	ETS(M,N,A)	1.2878573	0.1952189
S03	V07	ARIMA	ARIMA(0,1,0)(0,0,0)	1.2912153	0.5745615
S03	V07	ETS	ETS(M,N,A)	1.1834222	0.5413057
S04	V01	ARIMA	ARIMA(0,1,0)(0,0,0)	1.1895139	0.7438504
S04	V01	ETS	ETS(M,N,N)	1.1894769	0.1275537
S04	V02	ARIMA	ARIMA(1,1,3)(1,0,1)	1.1895139	0.0596569
S04	V02	ETS	ETS(M,N,M)	32.8855229	0.0000000
S05	V02	ARIMA	ARIMA(2,1,2)(2,0,0) with drift	18.9684041	0.7607213
S05	V02	ETS	ETS(M,N,M)	19.0649932	0.0000000
S05	V03	ARIMA	ARIMA(0,1,1)(0,0,0)	18.9684041	0.1429890
S05	V03	ETS	ETS(A,N,N)	0.7845041	0.0003387
S06	V05	ARIMA	ARIMA(0,1,1)(0,0,2)	1.0579635	0.3218907
S06	V05	ETS	ETS(A,N,N)	1.0584005	0.0487121
S06	V07	ARIMA	ARIMA(1,1,1)(0,0,0)	1.0579635	0.8572600
S06	V07	ETS	ETS(A,N,N)	1.0709812	0.3729529

Model selection

# Manually choose best model for now
Selected.model <- c(1, 0, 1, 0, 1, 0, 1, 0, 0, 1, 0, 1, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0, 1, 0)

# Choose the model with the lower MAPE for each category/var combination
#dfr2 <- dfr %>%
#    #filter(Ljung.Box > 0.05) %>%
#    group_by(Category, Variable) %>%
#    slice_min(MAPE)
dfr2 <- dfr %>%
    arrange(Category, Variable, Model) %>%
    cbind(Selected.model) %>%
    filter(Selected.model==1) %>%
    select(-Selected.model)
colnames(dfr2) <- c('Category', 'Variable', 'Model', 'Method', 'MAPE', 'Ljung.Box')
dfr2 %>%
    kbl(caption='Best-performing models') %>%
    kable_classic(full_width=F)

Best-performing models
Category	Variable	Model	Method	MAPE	Ljung.Box
S01	V01	ARIMA	ARIMA(0,1,2)(0,0,0) with drift	0.8842708	0.4395964
S01	V02	ARIMA	ARIMA(1,1,2)(0,0,2)	0.8842708	0.0430706
S02	V02	ARIMA	ARIMA(2,1,3)(1,0,0) with drift	26.4875660	0.4086317
S02	V03	ARIMA	ARIMA(2,1,0)(0,0,0)	26.4875660	0.1522174
S03	V05	ETS	ETS(M,N,A)	1.2878573	0.1952189
S03	V07	ETS	ETS(M,N,A)	1.1834222	0.5413057
S04	V01	ARIMA	ARIMA(0,1,0)(0,0,0)	1.1895139	0.7438504
S04	V02	ARIMA	ARIMA(1,1,3)(1,0,1)	1.1895139	0.0596569
S05	V02	ARIMA	ARIMA(2,1,2)(2,0,0) with drift	18.9684041	0.7607213
S05	V03	ARIMA	ARIMA(0,1,1)(0,0,0)	18.9684041	0.1429890
S06	V05	ARIMA	ARIMA(0,1,1)(0,0,2)	1.0579635	0.3218907
S06	V07	ARIMA	ARIMA(1,1,1)(0,0,0)	1.0579635	0.8572600

Forecasting

# Create variable to store forcasts
fc1 <- list()
fc2 <- list()

# Create data frame to store forecasts
dffc <- data.frame(matrix(nrow=0, ncol=3))
colnames(dffc) <- c('Category', 'Variable', 'Forecast')

for (i in seq(1, 6)) {
    
    # First var in category
    if (dfr2[2 * i - 1, 'Model'] == 'ETS') {
        fc1[[i]] <- fit_ets1[[i]] %>% forecast(h=140)
    } else {
        fc1[[i]] <- fit_arima1[[i]] %>% forecast(h=140)
    }
    p1 <- fc1[[i]] %>%
        autoplot() +
        ylab('Forecasted units') +
        ggtitle(paste0('Forecasts - category S0', i, ', VAR0', fcvars[[i]][1]))
    print(p1)

    # First var in category
    if (dfr2[2 * i, 'Model'] == 'ETS') {
        fc2[[i]] <- fit_ets2[[i]] %>% forecast(h=140)
    } else {
        fc2[[i]] <- fit_arima2[[i]] %>% forecast(h=140)
    }
    p2 <- fc2[[i]] %>%
        autoplot() +
        ylab('Forecasted units') +
        ggtitle(paste0('Forecasts - category S0', i, ', VAR0', fcvars[[i]][2]))
    print(p2)
    
    # Store forecasts in df
    dffc <- rbind(dffc, data.frame(
        Category=paste0('S0', i),
        Variable=paste0('V0', fcvars[[i]][1]),
        Forecast=data.frame(fc1[[i]])['Point.Forecast']
    ))
    dffc <- rbind(dffc, data.frame(
        Category=paste0('S0', i),
        Variable=paste0('V0', fcvars[[i]][2]),
        Forecast=data.frame(fc2[[i]])['Point.Forecast']
    ))

}

#df_orig %>%
#    filter(SeriesInd > 43021) %>%
#    select(SeriesInd) %>%

#dffc %>%
#    merge()

# Display a few rows
dffc %>%
    head(10) %>%
    kbl(caption='Forecast values (first 10 values)') %>%
    kable_classic(full_width=F)

Forecast values (first 10 values)
	Category	Variable	Point.Forecast
41005.20	S01	V01	62.34758
41005.40	S01	V01	62.36178
41005.60	S01	V01	62.38303
41005.80	S01	V01	62.40428
41006.00	S01	V01	62.42553
41006.20	S01	V01	62.44679
41006.40	S01	V01	62.46804
41006.60	S01	V01	62.48929
41006.80	S01	V01	62.51054
41007.00	S01	V01	62.53179

# Write forecasts

#reference: https://www.statology.org/r-export-to-excel-multiple-sheets/
#export each data frame to separate sheets in same Excel file
#openxlsx::write.xlsx(forecasts, file = 'mydata.xlsx')

ippolito_data624_project1

Michael Ippolito

2023-06-10