coe model for Cat A prices from year 2007 to year 2017.

suppressWarnings(suppressMessages(library(h2o)))
suppressWarnings(suppressMessages(library(timetk)))
suppressWarnings(suppressMessages(library(tidyquant)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(plotly)))

coe <- read.csv("coedata.csv", header=TRUE, stringsAsFactors=FALSE)

#remove $ dollar sign
coe$premium <-str_replace_all(c(coe$premium), fixed("$"), "")
coe$pqp <- str_replace_all(c(coe$pqp), fixed("$"), "")

# remove the comma in character and convert into numeric
coe$quota <- as.numeric(gsub(",", "", as.character(coe$quota)))
coe$quota_recd <- as.numeric(gsub(",", "", as.character(coe$quota_recd)))
coe$premium <- as.numeric(gsub(",", "", as.character(coe$premium)))
coe$pqp <- as.numeric(gsub(",", "", as.character(coe$pqp)))


#convert into tibble format
coe_data <- as_data_frame(coe)

# Starting point
coe_data <- subset(coe_data, select = -c(year,month,round,convert))
coe_data$date_convert <- ymd(coe_data$date_convert)

# Augment (adds data frame columns)
coe_data_aug <- coe_data %>% tk_augment_timeseries_signature()
#coe_data_aug %>% glimpse()

# prep data for h2o
coe_data_aug_clean <- coe_data_aug %>%
  select_if(~ !is.Date(.)) %>%
  select_if(~ !any(is.na(.))) %>%
  mutate_if(is.ordered, ~ as.character(.) %>% as.factor)

#coe_data_aug_clean %>% glimpse()

# Split into training, validation and test sets

train_tbl <- coe_data_aug_clean %>% filter(year <= 2014)
valid_tbl <- coe_data_aug_clean %>% filter(year == 2015)
test_tbl  <- coe_data_aug_clean %>% filter(year >= 2016)

h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         47 minutes 6 seconds 
##     H2O cluster version:        3.14.0.3 
##     H2O cluster version age:    1 month and 12 days  
##     H2O cluster name:           H2O_started_from_R_jkkli_gbh042 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   6.17 GB 
##     H2O cluster total cores:    8 
##     H2O cluster allowed cores:  8 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.2 (2017-09-28)
h2o.no_progress()  

# Convert to H2OFrame objects
train_h2o <- as.h2o(train_tbl)
valid_h2o <- as.h2o(valid_tbl)
test_h2o  <- as.h2o(test_tbl)

#Set the names that h2o will use as the target and predictor variables.
# Set names for h2o
y <- "premium"
x <- setdiff(names(train_h2o), y)


# linear regression model used, but can use any model
automl_models_h2o <- h2o.automl(
  x = x, 
  y = y, 
  training_frame = train_h2o, 
  validation_frame = valid_h2o, 
  leaderboard_frame = test_h2o, 
  max_runtime_secs = 60, 
  stopping_metric = "deviance")

#Extract leader model
automl_leader <- automl_models_h2o@leader
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o)
h2o.performance(automl_leader, newdata = test_h2o)
## H2ORegressionMetrics: gbm
## 
## MSE:  15866123
## RMSE:  3983.23
## MAE:  3263.436
## RMSLE:  0.086472
## Mean Residual Deviance :  15866123
error_tbl <- coe_data %>% 
  filter(lubridate::year(date_convert) >= 2016) %>%
  add_column(predict_coe = pred_h2o %>% as.tibble() %>% pull(predict)) %>%
    mutate(
    error     = premium - predict_coe,
    error_pct = error / premium
  ) 

## this gives a range of predicted coe and its error range in absolute dollars
error_tbl
## # A tibble: 44 x 8
##    premium   pqp quota quota_recd date_convert predict_coe      error
##      <dbl> <dbl> <dbl>      <dbl>       <date>       <dbl>      <dbl>
##  1   41617 42564  1815       2391   2017-10-28    50187.56  -8570.558
##  2   41761 42564  1829       2808   2017-10-01    49775.44  -8014.436
##  3   42902 44081  1815       3469   2017-09-28    48141.89  -5239.888
##  4   36001 44081  1820       2195   2017-09-01    47882.51 -11881.505
##  5   42900 45400  1817       2148   2017-08-28    50023.67  -7123.669
##  6   46778 45400  1817       2403   2017-08-01    49184.60  -2406.600
##  7   44002 48200  1907       2389   2017-07-28    50007.98  -6005.985
##  8   42801 48200  1904       2296   2017-07-01    49253.45  -6452.448
##  9   42801 50625  1906       2302   2017-06-28    46798.40  -3997.397
## 10   45201 50625  1898       2245   2017-06-01    48224.47  -3023.470
## # ... with 34 more rows, and 1 more variables: error_pct <dbl>
error_tbl %>%
  summarise(
    me   = mean(error),
    rmse = mean(error^2)^0.5,
    mae  = mean(abs(error)),
    mape = mean(abs(error_pct))*100,
    mpe  = mean(error_pct)
  ) %>% glimpse()
## Observations: 1
## Variables: 5
## $ me   <dbl> -657.3225
## $ rmse <dbl> 4271.427
## $ mae  <dbl> 3386.48
## $ mape <dbl> 7.362642
## $ mpe  <dbl> -0.0215807

See graphs between Actual COE and Predicted COE

# SORT DATE from oldest to newest
error_tbl <-dplyr::arrange(error_tbl, date_convert)

trace_0 <- error_tbl$premium
trace_2 <- error_tbl$predict_coe

x <- c(1:nrow(error_tbl))
data <- data.frame(x, trace_0, trace_2)

plot_ly(data, x = ~x, y = ~trace_0, name = 'Actual COE for year 2016 to 2017', type = 'scatter', mode = 'lines') %>% 
  add_trace(y = ~trace_2, name = 'Predicted COE for year 2016 to 2017', type= 'scatter', mode = 'lines')