Data624 Project 1

Group Members

Subhalaxmi Rout
Kenan Sooklall
Devin Teran
Christian Thieme
Leo Yi

Getting The Data

url <- 'https://raw.githubusercontent.com/christianthieme/Predictive-Analytics/main/data624_project1_dataset.csv'
df <- read.csv(url)
glimpse(df)

## Rows: 10,572
## Columns: 7
## $ ï..SeriesInd <int> 40669, 40669, 40669, 40669, 40669, 40669, 40670, 40670, 4~
## $ group        <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "~
## $ Var01        <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.2000~
## $ Var02        <int> 123432400, 60855800, 10369300, 39335700, 27809100, 165874~
## $ Var03        <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.8800~
## $ Var05        <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.9400~
## $ Var07        <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.1000~

Let’s clean it up a bit.

# rename first column
names(df)[1] <- 'date'

# convert first column to date
df$date <- as.Date(df$date, origin="1899-12-30") 

# convert column names to all lowercase
names(df) <- lapply(names(df), tolower)

glimpse(df)

## Rows: 10,572
## Columns: 7
## $ date  <date> 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 2011-05-06, 201~
## $ group <chr> "S03", "S02", "S01", "S06", "S05", "S04", "S03", "S02", "S01", "~
## $ var01 <dbl> 30.64286, 10.28000, 26.61000, 27.48000, 69.26000, 17.20000, 30.7~
## $ var02 <int> 123432400, 60855800, 10369300, 39335700, 27809100, 16587400, 150~
## $ var03 <dbl> 30.34000, 10.05000, 25.89000, 26.82000, 68.19000, 16.88000, 30.4~
## $ var05 <dbl> 30.49000, 10.17000, 26.20000, 27.02000, 68.72000, 16.94000, 30.6~
## $ var07 <dbl> 30.57286, 10.28000, 26.01000, 27.32000, 69.15000, 17.10000, 30.6~

Separate Into Individual Sets

filter_group <- function(group_name) {
  temp_df <- df %>%
    filter(group == group_name) %>%
    dplyr::select(-group)
  return(temp_df[1:1622,])
}

s1 <- filter_group('S01')
s2 <- filter_group('S02')
s3 <- filter_group('S03')
s4 <- filter_group('S04')
s5 <- filter_group('S05')
s6 <- filter_group('S06')

Exploratory Data Analysis

Here’s an attempt to plot the two variables we need to create forecasts for, for the first set, S01.

s1 %>%
  gather(variable, value, -date) %>%
  ggplot(aes(x = date, y = value)) +
  geom_line() +
  facet_wrap(~variable, scales = 'free_y', nrow = 5) +
  theme_classic()

correlation between variables

Do variables 3+ even matter here? I don’t think so. It doesn’t look like we can use them to impute missing data.

# s1 %>%
#   select(-date) %>%
#   ggpairs()

# calculate correlation for complete rows
s1[complete.cases(s1),] %>%
  with(cor(var01, var02))

## [1] -0.5598215

# plot relationship
ggplot(s1, aes(x = var01, y = var02)) +
  geom_point(alpha = 0.2) +
  theme_classic()

# plot relationship, y log scale
ggplot(s1, aes(x = var01, y = var02)) +
  geom_point(alpha = 0.2) +
  geom_smooth(se = F, method = 'lm') +
  theme_classic() +
  scale_y_log10()

Missing Values

# incomplete cases row 1537:1538
s1[!complete.cases(s1),]

##            date var01   var02 var03 var05 var07
## 1537 2017-06-11    NA 7329600    NA    NA    NA
## 1538 2017-06-12    NA 6121400    NA    NA    NA
## 1607 2017-09-19 58.83 6337000    NA    NA    NA
## 1608 2017-09-22 59.28 3690900    NA    NA    NA

# highlight missing cases
ggplot(s1, aes(x = date, y = var01)) +
  geom_line() + 
  geom_vline(xintercept = as.Date('2017-06-11'), lty=2)

# zoom in to data, 2017
filter(s1, date >= '2017-01-01') %>%
  ggplot(aes(x = date, y = var01)) +
  geom_line() + 
  geom_vline(xintercept = as.Date('2017-06-11'), lty=2)

I think it would be safe to impute the average of the leading and trailing days.

# filter(s1, date >= '2017-06-05' & date <= '2017-06-30')
s1[1533:1542,]

##            date var01    var02 var03 var05 var07
## 1533 2017-06-05 51.62  9795800 50.01 50.46 51.26
## 1534 2017-06-06 51.41  7929800 49.51 51.20 49.88
## 1535 2017-06-09 51.23 15765600 49.10 49.26 50.90
## 1536 2017-06-10 51.63  7321700 50.09 50.53 50.68
## 1537 2017-06-11    NA  7329600    NA    NA    NA
## 1538 2017-06-12    NA  6121400    NA    NA    NA
## 1539 2017-06-13 51.40  8060600 49.99 51.06 51.14
## 1540 2017-06-17 52.77  6567400 51.14 51.14 52.54
## 1541 2017-06-18 53.72  6244700 52.73 52.96 53.69
## 1542 2017-06-19 53.82  6004200 53.00 53.64 53.06

# calculate window average, 4 before, 4 after
var1_window_avg <- mean(s1[1533:1542,2], na.rm = T)

# impute
s1[1537:1538,2] <- var1_window_avg

Split Training Data

Let’s use about 80% of the data to train the model and the last 20% to test it.

# each dataset was said to have 1622 rows
n <- 1622
test_rows <- floor(1622 * 0.80)

train_s1 <- s1[1:test_rows,]
test_s1 <- s1[test_rows + 1:1622,]

fit_s1v1 <- auto.arima(train_s1$var01)

fit_s1v1 %>%
  forecast(h = (n - test_rows) + 140) %>%
  autoplot()

fit_s1v2 <- auto.arima(train_s1$var02)

fit_s1v2 %>%
  forecast(h = (n - test_rows) + 140) %>%
  autoplot()