Initial Approach:

H2O

Read in data

#set working directory
setwd("C:/Users/Gerhard Viljoen/Desktop/bcx-hackathon-two/r-benchmark")
#read csv
train <- read.csv('../data/train.csv')

Define a data manipulation function:

data.manip <- function(train){
  
  #select numeric variables
  numeric_vars <- train[,4:22]
  
  #select numeric variables, set NA's to 0, scale them
  numeric_vars[is.na(numeric_vars)] <- 0
  numeric_vars[,1:18] <- scale(numeric_vars[,1:18])
  categorical_vars <- train[,23:31]
  
  #attempt to fix missing categorical vars (this approach did not really succeed in production)
  categorical_vars <- data.frame(categorical_vars)
  categorical_vars[is.na(categorical_vars)] <- "O."
  
  #define target and predictor variables
  train.y <- cbind(train[,1:3],train$quantity_next_week)
  train.x <- cbind(train[,1:3],numeric_vars[,-19],categorical_vars)
  
  #bind them together into a dataframe
  dat <- cbind(train.y,train.x)
  
  #remove dfs built in the function, not necessary downstream
  rm(categorical_vars,numeric_vars,train)
  rm(train.x,train.y)
  
  #deselect the duplicated key columns and return the modelling frame
  dat[,-c(1:3)]
}

#save this function
save(data.manip,file="data_manip.RData")

#load it
load("data_manip.RData")
#clean data using the above-defined function

dat <- data.manip(train)

Run H2O RandomForest model:

https://www.h2o.ai/products/h2o/

http://docs.h2o.ai/h2o/latest-stable/h2o-r/docs/reference/h2o.randomForest.html

#initialize an h2o session, using all available cores and a decent amount of memory
require(h2o)
h2o.init(nthreads = -1, max_mem_size = "8G") #alternatively "32G"
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 14 minutes 
##     H2O cluster timezone:       Africa/Harare 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.18.0.11 
##     H2O cluster version age:    3 months and 4 days  
##     H2O cluster name:           H2O_started_from_R_Gerhard_Viljoen_qoy797 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   5.73 GB 
##     H2O cluster total cores:    4 
##     H2O cluster allowed cores:  4 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Algos, AutoML, Core V3, Core V4 
##     R Version:                  R version 3.4.4 (2018-03-15)
#suppress progress bar for Rmd
h2o.no_progress()

#push the manipulated data into an h2o frame
dat.hex <- as.h2o(dat,"dat.hex")

#generate 3 splits: training, validation used during training, and test
splitz <- h2o.splitFrame(dat.hex,ratios=c(0.7,0.15),
                         destination_frames = c("train.hex","valid.hex","test.hex"))

#assign these splits
train.hex <- splitz[[1]]
valid.hex <- splitz[[2]]
test.hex <- splitz[[3]]

#build a base random forest
rf_base <- h2o.randomForest(y=1,
                            x=5:31,
                            training_frame = train.hex,
                            validation_frame = valid.hex,
                            model_id = "rf_base",
                            ntrees = 500,
                            distribution = "poisson"
                            )


#check performance during training
h2o.performance(rf_base)
## H2ORegressionMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
## 
## MSE:  118.1001
## RMSE:  10.86739
## MAE:  3.126948
## RMSLE:  NaN
## Mean Residual Deviance :  118.1001
#check performance on the test set:
h2o.performance(rf_base,test.hex)
## H2ORegressionMetrics: drf
## 
## MSE:  76.27595
## RMSE:  8.73361
## MAE:  3.114512
## RMSLE:  NaN
## Mean Residual Deviance :  76.27595

Rethink approach:

Reasons:

  • Getting something into production on a remote machine that doesn’t have the required packages in its environment is not as easy as running locally
  • H2O in particular runs on a Java Virtual Machine (with other dependencies), and setting all of that up remotely might have taken too long
  • While H2O is really fast and handles one-hot encoding internally, it does not handle unseen factor levels automatically (see the sketch below)
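
As a rough illustration of the unseen-level problem, here is a minimal sketch (not part of the competition code; align_levels is an illustrative helper) of mapping factor levels that only appear in new data onto a placeholder level, similar in spirit to the '_unk' level used in the run-model script further down:

#map levels unseen at training time onto a placeholder level
align_levels <- function(new_col, train_levels, unseen = "_unk"){
  new_col <- as.character(new_col)
  new_col[!(new_col %in% train_levels)] <- unseen
  factor(new_col, levels = c(train_levels, unseen))
}

#hypothetical example: training only ever saw regions "LIM" and "KWA",
#so "WC" gets mapped to "_unk"
align_levels(c("LIM","WC","KWA"), train_levels = c("LIM","KWA"))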

Solution

  • Run randomForest models for days!

randomForest

rm(list=ls())
train <- read.csv('../data/train.csv')

head(train)
##   sku_key store_key week_sunday lag_13 lag_12 lag_11 lag_10 lag_9 lag_8
## 1      24        17  2017/09/10     NA     NA     NA     NA    NA    NA
## 2      28        17  2017/09/10     NA     NA     NA     NA    NA    NA
## 3      31        62  2017/07/09     NA     NA     NA     NA    NA    NA
## 4      34         3  2016/06/19     NA     NA     NA     NA    NA    NA
## 5      34         3  2016/07/17     NA     NA     NA     NA    NA    NA
## 6      34       110  2017/01/15     NA     NA     NA     NA    NA    NA
##   lag_7 lag_6 lag_5 lag_4 lag_3 lag_2 lag_1 quantity_today stock_close
## 1    NA    NA     4     6     1     2     2              1          18
## 2    NA    NA     2     4     1     1     2              7          58
## 3    NA    NA    NA    NA    NA    NA    NA              3           3
## 4    NA    NA    NA    NA    NA    NA     1              1           7
## 5    NA    NA     1     1     3     1     2              1          10
## 6    NA    NA    NA    NA    NA     2     1              1          43
##   weeks_of_stock weeks_lead_time stock_lead_ratio quantity_next_week
## 1       6.750000       2.2857143         2.953125                  1
## 2      20.470588       2.2857143         8.955882                  7
## 3       1.000000       2.0000000         0.500000                  2
## 4       7.000000       1.7142857         4.083333                  3
## 5       6.666667       0.1428571        46.666667                  1
## 6      32.250000       1.2142857        26.558824                  2
##   store_region store_grading store_area_manager sku_department
## 1          LIM             A             CH2014             TY
## 2          LIM             A             CH2014             TY
## 3          KWA             B             JC2015             TY
## 4          KWA             A             JC2015             TY
## 5          KWA             A             JC2015             TY
## 6           WC          NULL                 WA             TY
##   sku_subdepartment sku_category sku_subcategory sku_range sku_label
## 1                04          012             332     0001B      NULL
## 2                06          063             20G     0001B      DRGI
## 3                04          096             326     0001B      NULL
## 4                04          011             35A     0001B      DINO
## 5                04          011             35A     0001B      DINO
## 6                04          011             35A     0001B      DINO
##   test_weeks
## 1      FALSE
## 2      FALSE
## 3      FALSE
## 4      FALSE
## 5      FALSE
## 6      FALSE
#drop week_sunday, sku_subcategory, sku_label and test_weeks
full.set <- train[,-c(3,ncol(train),29,31)]

#set NA's to 0
full.set[is.na(full.set)] <- 0

source('functions.R')

types <- list(store_region = 'factor',
              store_grading = 'factor',
              store_area_manager = 'factor',
              sku_department = 'factor',
              sku_subdepartment = 'factor',
              sku_category = 'factor',
              #sku_subcategory = 'factor',
              sku_range = 'factor',
              #sku_label = 'factor',
              lag_13 = 'numeric',
              lag_12 = 'numeric',
              lag_11 = 'numeric',
              lag_10 = 'numeric',
              lag_9 = 'numeric',
              lag_8 = 'numeric',
              lag_7 = 'numeric',
              lag_6 = 'numeric',
              lag_5 = 'numeric',
              lag_4 = 'numeric',
              lag_3 = 'numeric',
              lag_2 = 'numeric',
              lag_1 = 'numeric',
              quantity_today = 'numeric',
              stock_close = 'numeric',
              weeks_of_stock = 'numeric',
              weeks_lead_time = 'numeric',
              stock_lead_ratio = 'numeric')

The ridiculously simple winning model was a randomForest with 25 trees:

#select data columns
data.cols <- c('quantity_next_week',names(types))


require(onehot)
#one-hot encode categorical variables
full.set.encoder <- onehot(full.set[,data.cols],max_levels=1000)
full.set.encoded <- as.data.frame(predict(full.set.encoder,full.set[,data.cols]))
names(full.set.encoded) <- as.vector(sapply(names(full.set.encoded),function(x) gsub('=','_',x)))

require(randomForest)
model.rf <- randomForest(quantity_next_week ~ ., full.set.encoded,ntree=25,do.trace=T)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##    1 |    315.7   116.76 |
##    2 |    250.3    92.57 |
##    3 |    208.8    77.23 |
##    4 |    193.2    71.47 |
##    5 |    179.6    66.42 |
##    6 |    170.4    63.02 |
##    7 |    165.7    61.29 |
##    8 |    154.7    57.23 |
##    9 |    150.9    55.80 |
##   10 |    150.5    55.65 |
##   11 |    149.2    55.17 |
##   12 |      145    53.63 |
##   13 |    142.5    52.71 |
##   14 |    141.3    52.27 |
##   15 |    142.8    52.80 |
##   16 |    141.2    52.22 |
##   17 |      140    51.78 |
##   18 |    140.9    52.13 |
##   19 |      141    52.14 |
##   20 |    138.8    51.34 |
##   21 |    137.8    50.96 |
##   22 |    137.8    50.98 |
##   23 |    136.8    50.61 |
##   24 |    138.2    51.10 |
##   25 |    137.9    50.99 |
summary(model.rf)
##                 Length Class  Mode     
## call                5  -none- call     
## type                1  -none- character
## predicted       91255  -none- numeric  
## mse                25  -none- numeric  
## rsq                25  -none- numeric  
## oob.times       91255  -none- numeric  
## importance        137  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             11  -none- list     
## coefs               0  -none- NULL     
## y               91255  -none- numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call

The more complex model that wouldn’t generalize to unseen data:

#run model:
require(randomForest)
model.rf <- randomForest(quantity_next_week ~ ., full.set.encoded,ntree=100,do.trace=10)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##   10 |    156.9    58.05 |
##   20 |    140.4    51.94 |
##   30 |    135.4    50.06 |
##   40 |    130.6    48.29 |
##   50 |    128.1    47.39 |
##   60 |    127.8    47.26 |
##   70 |    127.5    47.17 |
##   80 |    126.8    46.91 |
##   90 |    126.8    46.92 |
##  100 |    125.9    46.58 |
summary(model.rf)
##                 Length Class  Mode     
## call                5  -none- call     
## type                1  -none- character
## predicted       91255  -none- numeric  
## mse               100  -none- numeric  
## rsq               100  -none- numeric  
## oob.times       91255  -none- numeric  
## importance        137  -none- numeric  
## importanceSD        0  -none- NULL     
## localImportance     0  -none- NULL     
## proximity           0  -none- NULL     
## ntree               1  -none- numeric  
## mtry                1  -none- numeric  
## forest             11  -none- list     
## coefs               0  -none- NULL     
## y               91255  -none- numeric  
## test                0  -none- NULL     
## inbag               0  -none- NULL     
## terms               3  terms  call
save(model.rf,file='model_rf.rdata')

Why is this not generalizable?

Some randomForest theory:

Figure: a single-split tree (observations go left when the split condition is true).

Splits are partitions of the feature space, chosen by recursive binary splitting: at each split, the cut point that minimizes the residual sum of squares (RSS) of the two resulting groups is selected.
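
To make that concrete, here is a tiny hand-rolled sketch (purely illustrative, not how randomForest is implemented internally) of choosing a single split point on one numeric predictor by minimizing the RSS of the two resulting groups:

#find the split point s on predictor x that minimizes
#RSS = sum((y_left - mean(y_left))^2) + sum((y_right - mean(y_right))^2)
best_split <- function(x, y){
  stopifnot(length(unique(x)) > 1)     #needs at least two distinct values to split on
  candidates <- sort(unique(x))[-1]    #drop the smallest value to avoid an empty left group
  rss <- sapply(candidates, function(s){
    left  <- y[x <  s]
    right <- y[x >= s]
    sum((left - mean(left))^2) + sum((right - mean(right))^2)
  })
  candidates[which.min(rss)]
}

#made-up example: y jumps at x = 6, so the best split is s = 6 (x < 6 goes left)
best_split(x = 1:10, y = c(rep(0,5), rep(10,5)))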

If no stopping criterion is defined, trees are grown all the way to the bottom.

Random forests grow many trees on bootstrapped samples of the training data and decorrelate the trees’ predictions by sampling only m < p predictors as split candidates at each split.

For regression, randomForest uses m = p/3 as the default.
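
For reference, that default can be written out explicitly via the mtry argument. This is just a sketch (model.rf.mtry is an illustrative name and was not part of the submission):

require(randomForest)

#mtry controls how many of the p predictors are sampled as split candidates at each split
p <- ncol(full.set.encoded) - 1                  #number of predictors, excluding the target
model.rf.mtry <- randomForest(quantity_next_week ~ ., full.set.encoded,
                              ntree = 25,
                              mtry  = max(floor(p/3), 1))  #the regression default, made explicit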

Figure: overfitting.

We can guard against overfitting further by reducing the depth of our trees.

If we reduce the number of observations fed to each tree in the forest, tree depth drops automatically, since there is less data to split on. Smaller trees also train and predict faster.

So, we grow 5000 trees, each fitted on a sample of roughly 10% of our observations (sampsize = 9000 out of about 91,000 rows):

The Super-generalizable model, which would just not run:

model.rf3 <- randomForest(quantity_next_week ~ ., full.set.encoded,ntree=5000,do.trace=500,sampsize=9000)
##      |      Out-of-bag   |
## Tree |      MSE  %Var(y) |
##  500 |    132.9    49.16 |
## 1000 |    132.8    49.11 |
## 1500 |      133    49.19 |
## 2000 |    133.1    49.22 |
## 2500 |    132.8    49.11 |
## 3000 |    132.6    49.05 |
## 3500 |    132.7    49.07 |
## 4000 |    132.6    49.06 |
## 4500 |    132.5    49.00 |
## 5000 |    132.5    49.02 |
save(model.rf3,file='model_rf3.rdata')

Technicalities of getting model to run on remote machine:
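
The run-model script below reads its input and output paths from a parsed command-line object, opt. That setup is not shown here; a minimal sketch of how opt could be built with the optparse package (the flag names are assumptions):

require(optparse)

#declare the two options the script expects (flag names assumed for illustration)
option_list <- list(
  make_option(c("-i","--input"), type = "character", help = "path to the hidden test csv"),
  make_option(c("-s","--submission"), type = "character", help = "path to write the submission csv")
)
opt <- parse_args(OptionParser(option_list = option_list))

#invoked e.g. as: Rscript run_model.R --input hidden.csv --submission submission.csv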

RUN MODEL FILE:

#read in the hidden test set from the path passed on the command line
hidden <- read.csv(opt$input)
hidden <- as.data.frame(hidden)

Prepare the data for prediction:

#set NA's to 0, as was done for the training data
hidden[is.na(hidden)] <- 0

#declare the same column types used during training
types <- list(store_region = 'factor',
              store_grading = 'factor',
              store_area_manager = 'factor',
              sku_department = 'factor',
              sku_subdepartment = 'factor',
              sku_category = 'factor',
              #sku_subcategory = 'factor',
              sku_range = 'factor',
              #sku_label = 'factor',
              lag_13 = 'numeric',
              lag_12 = 'numeric',
              lag_11 = 'numeric',
              lag_10 = 'numeric',
              lag_9 = 'numeric',
              lag_8 = 'numeric',
              lag_7 = 'numeric',
              lag_6 = 'numeric',
              lag_5 = 'numeric',
              lag_4 = 'numeric',
              lag_3 = 'numeric',
              lag_2 = 'numeric',
              lag_1 = 'numeric',
              quantity_today = 'numeric',
              stock_close = 'numeric',
              weeks_of_stock = 'numeric',
              weeks_lead_time = 'numeric',
              stock_lead_ratio = 'numeric')

#keep only the predictor columns that are present in the hidden data
terms <- names(types)
terms <- terms[terms %in% names(hidden)]
new.test.set <- hidden[,terms]

#coerce each column to its declared type, and add an '_unk' level to factors
#so that unseen levels have somewhere to go
for(i in names(new.test.set)){
  model.class <- types[i]
  as.f <- get(paste(c('as',model.class),sep=".",collapse='.'))
  new.test.set[,i] <- as.f(new.test.set[,i])
  if(model.class == 'factor') levels(new.test.set[,i]) <- c(levels(new.test.set[,i]),'_unk')
}

require(onehot)

#one-hot encode exactly as at training time
new.encode <- onehot(new.test.set,max_levels=1000)
new.test.set <- as.data.frame(predict(new.encode,new.test.set))
names(new.test.set) <- as.vector(sapply(names(new.test.set),function(x) gsub('=','_',x)))

#encoded columns in the hidden data that the trained forest has never seen
new.terms <- names(new.test.set)[!(names(new.test.set) %in% names(model.rf$forest$xlevels))]

if(length(new.terms) > 0){
  new.frame <- as.data.frame(matrix(0,nrow=nrow(new.test.set),ncol=length(new.terms),
                                    dimnames=list(c(),new.terms)))
  new.test.set <- cbind(new.test.set,new.frame)
}

new.test.set <- as.data.frame(new.test.set)

#the set of columns the trained forest expects
check <- names(model.rf$forest$xlevels)

#drop encoded columns the model does not know about
#(guarded: subsetting with an empty negative index would drop every column)
extra.names <- which(!(names(new.test.set) %in% check))
extra.names <- as.vector(extra.names)
if(length(extra.names) > 0) new.test.set <- new.test.set[,-extra.names]

#add zero-filled columns for anything the model expects but the hidden data lacks
missing.names <- which(!(check %in% names(new.test.set)))
fill.missing <- matrix(0,nrow=nrow(new.test.set),ncol=length(missing.names))
fill.missing <- as.data.frame(fill.missing)
names(fill.missing) <- check[missing.names]
new.test.set <- as.data.frame(cbind(new.test.set,fill.missing))

#align column types with what the forest expects (setTypes is a helper from functions.R)
new.test.set <- setTypes(new.test.set,model.rf$forest$xlevels)

Make predictions on the hidden data:

require(randomForest)
hidden.pred <- predict(model.rf,newdata=new.test.set)

Re-order columns and attach the predictions:

submission <- hidden[,c('sku_key','store_key','week_sunday')]
submission[,'week_sunday'] <- as.character(format(as.Date(submission[,'week_sunday']),format='%Y-%m-%d'))
submission[,'quantity_next_week'] <- NA
submission[,'quantity_next_week'] <- hidden.pred

Create the submission:

write.csv(submission,file=opt$submission,row.names=F)