coe model for Cat A prices from year 2007 to year 2017.
suppressWarnings(suppressMessages(library(h2o)))
suppressWarnings(suppressMessages(library(timetk)))
suppressWarnings(suppressMessages(library(tidyquant)))
suppressWarnings(suppressMessages(library(stringr)))
suppressWarnings(suppressMessages(library(plotly)))
coe <- read.csv("coedata.csv", header=TRUE, stringsAsFactors=FALSE)
#remove $ dollar sign
coe$premium <-str_replace_all(c(coe$premium), fixed("$"), "")
coe$pqp <- str_replace_all(c(coe$pqp), fixed("$"), "")
# remove the comma in character and convert into numeric
coe$quota <- as.numeric(gsub(",", "", as.character(coe$quota)))
coe$quota_recd <- as.numeric(gsub(",", "", as.character(coe$quota_recd)))
coe$premium <- as.numeric(gsub(",", "", as.character(coe$premium)))
coe$pqp <- as.numeric(gsub(",", "", as.character(coe$pqp)))
#convert into tibble format
coe_data <- as_data_frame(coe)
# Starting point
coe_data <- subset(coe_data, select = -c(year,month,round,convert))
coe_data$date_convert <- ymd(coe_data$date_convert)
# Augment (adds data frame columns)
coe_data_aug <- coe_data %>% tk_augment_timeseries_signature()
#coe_data_aug %>% glimpse()
# prep data for h2o
coe_data_aug_clean <- coe_data_aug %>%
select_if(~ !is.Date(.)) %>%
select_if(~ !any(is.na(.))) %>%
mutate_if(is.ordered, ~ as.character(.) %>% as.factor)
#coe_data_aug_clean %>% glimpse()
# Split into training, validation and test sets
train_tbl <- coe_data_aug_clean %>% filter(year <= 2014)
valid_tbl <- coe_data_aug_clean %>% filter(year == 2015)
test_tbl <- coe_data_aug_clean %>% filter(year >= 2016)
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 47 minutes 6 seconds
## H2O cluster version: 3.14.0.3
## H2O cluster version age: 1 month and 12 days
## H2O cluster name: H2O_started_from_R_jkkli_gbh042
## H2O cluster total nodes: 1
## H2O cluster total memory: 6.17 GB
## H2O cluster total cores: 8
## H2O cluster allowed cores: 8
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.2 (2017-09-28)
h2o.no_progress()
# Convert to H2OFrame objects
train_h2o <- as.h2o(train_tbl)
valid_h2o <- as.h2o(valid_tbl)
test_h2o <- as.h2o(test_tbl)
#Set the names that h2o will use as the target and predictor variables.
# Set names for h2o
y <- "premium"
x <- setdiff(names(train_h2o), y)
# linear regression model used, but can use any model
automl_models_h2o <- h2o.automl(
x = x,
y = y,
training_frame = train_h2o,
validation_frame = valid_h2o,
leaderboard_frame = test_h2o,
max_runtime_secs = 60,
stopping_metric = "deviance")
#Extract leader model
automl_leader <- automl_models_h2o@leader
pred_h2o <- h2o.predict(automl_leader, newdata = test_h2o)
h2o.performance(automl_leader, newdata = test_h2o)
## H2ORegressionMetrics: gbm
##
## MSE: 15866123
## RMSE: 3983.23
## MAE: 3263.436
## RMSLE: 0.086472
## Mean Residual Deviance : 15866123
error_tbl <- coe_data %>%
filter(lubridate::year(date_convert) >= 2016) %>%
add_column(predict_coe = pred_h2o %>% as.tibble() %>% pull(predict)) %>%
mutate(
error = premium - predict_coe,
error_pct = error / premium
)
## this gives a range of predicted coe and its error range in absolute dollars
error_tbl
## # A tibble: 44 x 8
## premium pqp quota quota_recd date_convert predict_coe error
## <dbl> <dbl> <dbl> <dbl> <date> <dbl> <dbl>
## 1 41617 42564 1815 2391 2017-10-28 50187.56 -8570.558
## 2 41761 42564 1829 2808 2017-10-01 49775.44 -8014.436
## 3 42902 44081 1815 3469 2017-09-28 48141.89 -5239.888
## 4 36001 44081 1820 2195 2017-09-01 47882.51 -11881.505
## 5 42900 45400 1817 2148 2017-08-28 50023.67 -7123.669
## 6 46778 45400 1817 2403 2017-08-01 49184.60 -2406.600
## 7 44002 48200 1907 2389 2017-07-28 50007.98 -6005.985
## 8 42801 48200 1904 2296 2017-07-01 49253.45 -6452.448
## 9 42801 50625 1906 2302 2017-06-28 46798.40 -3997.397
## 10 45201 50625 1898 2245 2017-06-01 48224.47 -3023.470
## # ... with 34 more rows, and 1 more variables: error_pct <dbl>
error_tbl %>%
summarise(
me = mean(error),
rmse = mean(error^2)^0.5,
mae = mean(abs(error)),
mape = mean(abs(error_pct))*100,
mpe = mean(error_pct)
) %>% glimpse()
## Observations: 1
## Variables: 5
## $ me <dbl> -657.3225
## $ rmse <dbl> 4271.427
## $ mae <dbl> 3386.48
## $ mape <dbl> 7.362642
## $ mpe <dbl> -0.0215807
See graphs between Actual COE and Predicted COE
# SORT DATE from oldest to newest
error_tbl <-dplyr::arrange(error_tbl, date_convert)
trace_0 <- error_tbl$premium
trace_2 <- error_tbl$predict_coe
x <- c(1:nrow(error_tbl))
data <- data.frame(x, trace_0, trace_2)
plot_ly(data, x = ~x, y = ~trace_0, name = 'Actual COE for year 2016 to 2017', type = 'scatter', mode = 'lines') %>%
add_trace(y = ~trace_2, name = 'Predicted COE for year 2016 to 2017', type= 'scatter', mode = 'lines')