Read in data
#set working directory
setwd("C:/Users/Gerhard Viljoen/Desktop/bcx-hackathon-two/r-benchmark")
#read csv
train <- read.csv('../data/train.csv')
Define a data manipulation function:
data.manip <- function(train){
  #select the numeric variables (lag, stock and ratio columns, plus the target)
  numeric_vars <- train[,4:22]
  #set NAs to 0 and scale the numeric predictors
  numeric_vars[is.na(numeric_vars)] <- 0
  numeric_vars[,1:18] <- scale(numeric_vars[,1:18])
  #select the categorical variables
  categorical_vars <- train[,23:31]
  #attempt to fix missing categorical values (this approach did not really succeed in production)
  categorical_vars <- data.frame(categorical_vars)
  categorical_vars[is.na(categorical_vars)] <- "O."
  #define target and predictor variables
  train.y <- cbind(train[,1:3],train$quantity_next_week)
  train.x <- cbind(train[,1:3],numeric_vars[,-19],categorical_vars)
  #bind them together into a data frame
  dat <- cbind(train.y,train.x)
  #remove data frames built inside the function; they are not needed downstream
  rm(categorical_vars,numeric_vars,train)
  rm(train.x,train.y)
  #deselect the duplicated key columns and return the result
  dat <- dat[,-c(1:3)]
  dat
}
#save this function
save(data.manip,file="data_manip.RData")
#load it
load("data_manip.RData")
#clean data using the above-defined function
dat <- data.manip(train)
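A quick sanity check on the result (the column positions follow from data.manip above: column 1 is the target, columns 2-4 are the keys):
#check dimensions and the first few columns of the manipulated frame
dim(dat)
head(dat[,1:4])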
H2O references:
https://www.h2o.ai/products/h2o/
http://docs.h2o.ai/h2o/latest-stable/h2o-r/docs/reference/h2o.randomForest.html
#initialize an h2o session, using all available cores and a decent amount of memory
require(h2o)
h2o.init(nthreads = -1, max_mem_size = "8G") # alternatively "32G"
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 14 minutes
## H2O cluster timezone: Africa/Harare
## H2O data parsing timezone: UTC
## H2O cluster version: 3.18.0.11
## H2O cluster version age: 3 months and 4 days
## H2O cluster name: H2O_started_from_R_Gerhard_Viljoen_qoy797
## H2O cluster total nodes: 1
## H2O cluster total memory: 5.73 GB
## H2O cluster total cores: 4
## H2O cluster allowed cores: 4
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Algos, AutoML, Core V3, Core V4
## R Version: R version 3.4.4 (2018-03-15)
#suppress progress bar for Rmd
h2o.no_progress()
#push the manipulated data into an h2o frame
dat.hex <- as.h2o(dat,"dat.hex")
#generate 3 splits: training, validation used during training, and test
splitz <- h2o.splitFrame(dat.hex, ratios = c(0.7,0.15),
                         destination_frames = c("train.hex","valid.hex","test.hex"))
#assign these splits
train.hex <- splitz[[1]]
valid.hex <- splitz[[2]]
test.hex <- splitz[[3]]
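A quick check that the three frames land near the intended 70/15/15 split (a small sketch using the frames created above):
#row counts of the training, validation and test splits
sapply(list(train.hex, valid.hex, test.hex), h2o.nrow)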
#build a base random forest
rf_base <- h2o.randomForest(y = 1,
                            x = 5:31,
                            training_frame = train.hex,
                            validation_frame = valid.hex,
                            model_id = "rf_base",
                            ntrees = 500,
                            distribution = "poisson")
?h2o.randomForest #help page listing all available arguments
#check performance during training
h2o.performance(rf_base)
## H2ORegressionMetrics: drf
## ** Reported on training data. **
## ** Metrics reported on Out-Of-Bag training samples **
##
## MSE: 118.1001
## RMSE: 10.86739
## MAE: 3.126948
## RMSLE: NaN
## Mean Residual Deviance : 118.1001
#check performance on the test set:
h2o.performance(rf_base,test.hex)
## H2ORegressionMetrics: drf
##
## MSE: 76.27595
## RMSE: 8.73361
## MAE: 3.114512
## RMSLE: NaN
## Mean Residual Deviance : 76.27595
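To actually score the held-out frame with this model, h2o.predict can be used; a short sketch with the objects created above:
#predict quantity_next_week for the test split and pull the first rows back into R
pred_base <- h2o.predict(rf_base, test.hex)
head(as.data.frame(pred_base))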
Read in the data, select columns, and set missing numerical values to 0.
Define the variables we want to build this model with, along with their data types.
I have excluded sku_subcategory and sku_label from my model, since some of their values contained special characters (*, /, spaces, etc.) that interfered with R's variable names. Fixing these might have improved accuracy, but it would have taken too much time and I couldn't get anything to run on the autograder!
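One possible fix, had there been time, would have been to sanitize those levels before encoding; a purely hypothetical sketch (this cleaning rule was not used here):
#hypothetical clean-up: replace anything non-alphanumeric in the excluded columns with '_'
for(col in c('sku_subcategory','sku_label')){
  train[,col] <- factor(gsub('[^A-Za-z0-9]', '_', as.character(train[,col])))
}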
rm(list=ls())
train <- read.csv('../data/train.csv')
head(train)
## sku_key store_key week_sunday lag_13 lag_12 lag_11 lag_10 lag_9 lag_8
## 1 24 17 2017/09/10 NA NA NA NA NA NA
## 2 28 17 2017/09/10 NA NA NA NA NA NA
## 3 31 62 2017/07/09 NA NA NA NA NA NA
## 4 34 3 2016/06/19 NA NA NA NA NA NA
## 5 34 3 2016/07/17 NA NA NA NA NA NA
## 6 34 110 2017/01/15 NA NA NA NA NA NA
## lag_7 lag_6 lag_5 lag_4 lag_3 lag_2 lag_1 quantity_today stock_close
## 1 NA NA 4 6 1 2 2 1 18
## 2 NA NA 2 4 1 1 2 7 58
## 3 NA NA NA NA NA NA NA 3 3
## 4 NA NA NA NA NA NA 1 1 7
## 5 NA NA 1 1 3 1 2 1 10
## 6 NA NA NA NA NA 2 1 1 43
## weeks_of_stock weeks_lead_time stock_lead_ratio quantity_next_week
## 1 6.750000 2.2857143 2.953125 1
## 2 20.470588 2.2857143 8.955882 7
## 3 1.000000 2.0000000 0.500000 2
## 4 7.000000 1.7142857 4.083333 3
## 5 6.666667 0.1428571 46.666667 1
## 6 32.250000 1.2142857 26.558824 2
## store_region store_grading store_area_manager sku_department
## 1 LIM A CH2014 TY
## 2 LIM A CH2014 TY
## 3 KWA B JC2015 TY
## 4 KWA A JC2015 TY
## 5 KWA A JC2015 TY
## 6 WC NULL WA TY
## sku_subdepartment sku_category sku_subcategory sku_range sku_label
## 1 04 012 332 0001B NULL
## 2 06 063 20G 0001B DRGI
## 3 04 096 326 0001B NULL
## 4 04 011 35A 0001B DINO
## 5 04 011 35A 0001B DINO
## 6 04 011 35A 0001B DINO
## test_weeks
## 1 FALSE
## 2 FALSE
## 3 FALSE
## 4 FALSE
## 5 FALSE
## 6 FALSE
#drop week_sunday, test_weeks, sku_subcategory and sku_label
full.set <- train[,-c(3,ncol(train),29,31)]
#set remaining NAs to 0
full.set[is.na(full.set)] <- 0
source('functions.R')
types <- list(store_region = 'factor',
store_grading = 'factor',
store_area_manager = 'factor',
sku_department = 'factor',
sku_subdepartment = 'factor',
sku_category = 'factor',
#sku_subcategory = 'factor',
sku_range = 'factor',
#sku_label = 'factor',
lag_13 = 'numeric',
lag_12 = 'numeric',
lag_11 = 'numeric',
lag_10 = 'numeric',
lag_9 = 'numeric',
lag_8 = 'numeric',
lag_7 = 'numeric',
lag_6 = 'numeric',
lag_5 = 'numeric',
lag_4 = 'numeric',
lag_3 = 'numeric',
lag_2 = 'numeric',
lag_1 = 'numeric',
quantity_today = 'numeric',
stock_close = 'numeric',
weeks_of_stock = 'numeric',
weeks_lead_time = 'numeric',
stock_lead_ratio = 'numeric')
#select data columns
data.cols <- c('quantity_next_week',names(types))
require(onehot)
#one-hot encode categorical variables
full.set.encoder <- onehot(full.set[,data.cols],max_levels=1000)
full.set.encoded <- as.data.frame(predict(full.set.encoder,full.set[,data.cols]))
#replace '=' in the generated column names with '_'
names(full.set.encoded) <- as.vector(sapply(names(full.set.encoded),function(x) gsub('=','_',x)))
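For anyone unfamiliar with the onehot package, a minimal illustration of what the encoder does (the toy data frame is made up):
#toy example: one factor column becomes one indicator column per level
toy <- data.frame(region = factor(c('LIM','KWA','WC')))
toy.encoder <- onehot(toy)
predict(toy.encoder, toy)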
require(randomForest)
model.rf <- randomForest(quantity_next_week ~ ., full.set.encoded,ntree=25,do.trace=T)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 1 | 315.7 116.76 |
## 2 | 250.3 92.57 |
## 3 | 208.8 77.23 |
## 4 | 193.2 71.47 |
## 5 | 179.6 66.42 |
## 6 | 170.4 63.02 |
## 7 | 165.7 61.29 |
## 8 | 154.7 57.23 |
## 9 | 150.9 55.80 |
## 10 | 150.5 55.65 |
## 11 | 149.2 55.17 |
## 12 | 145 53.63 |
## 13 | 142.5 52.71 |
## 14 | 141.3 52.27 |
## 15 | 142.8 52.80 |
## 16 | 141.2 52.22 |
## 17 | 140 51.78 |
## 18 | 140.9 52.13 |
## 19 | 141 52.14 |
## 20 | 138.8 51.34 |
## 21 | 137.8 50.96 |
## 22 | 137.8 50.98 |
## 23 | 136.8 50.61 |
## 24 | 138.2 51.10 |
## 25 | 137.9 50.99 |
summary(model.rf)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 91255 -none- numeric
## mse 25 -none- numeric
## rsq 25 -none- numeric
## oob.times 91255 -none- numeric
## importance 137 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 91255 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
#run model:
require(randomForest)
model.rf <- randomForest(quantity_next_week ~ ., full.set.encoded,ntree=100,do.trace=10)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 10 | 156.9 58.05 |
## 20 | 140.4 51.94 |
## 30 | 135.4 50.06 |
## 40 | 130.6 48.29 |
## 50 | 128.1 47.39 |
## 60 | 127.8 47.26 |
## 70 | 127.5 47.17 |
## 80 | 126.8 46.91 |
## 90 | 126.8 46.92 |
## 100 | 125.9 46.58 |
summary(model.rf)
## Length Class Mode
## call 5 -none- call
## type 1 -none- character
## predicted 91255 -none- numeric
## mse 100 -none- numeric
## rsq 100 -none- numeric
## oob.times 91255 -none- numeric
## importance 137 -none- numeric
## importanceSD 0 -none- NULL
## localImportance 0 -none- NULL
## proximity 0 -none- NULL
## ntree 1 -none- numeric
## mtry 1 -none- numeric
## forest 11 -none- list
## coefs 0 -none- NULL
## y 91255 -none- numeric
## test 0 -none- NULL
## inbag 0 -none- NULL
## terms 3 terms call
save(model.rf,file='model_rf.rdata')
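Before moving on, it is worth glancing at which predictors the forest leans on most; a quick sketch using the fitted model above:
#top 10 predictors by node-purity importance
head(sort(model.rf$importance[,1], decreasing = TRUE), 10)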
(Figure: a single-split tree; observations go left if the split condition is true.)
Splits are partitions of the feature space, found by recursive binary splitting, which chooses the split that minimizes the residual sum of squares at each step.
If no stopping criterion is defined, trees are grown all the way to the bottom.
Random forests are many such trees, each grown on a bootstrapped sample of the training data; their predictions are decorrelated by sampling only m < p predictors as split candidates at each split.
m = p/3 is the default for randomForest in regression.
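To make the single-tree idea concrete, here is a small sketch (assuming full.set.encoded from above is still in memory; rpart and the depth limit are my choices for illustration):
#grow one shallow regression tree on two of the encoded predictors
require(rpart)
single.tree <- rpart(quantity_next_week ~ lag_1 + quantity_today,
                     data = full.set.encoded,
                     control = rpart.control(maxdepth = 2))
print(single.tree)
#for comparison, randomForest's default m for regression is floor(p/3) predictors per split
floor((ncol(full.set.encoded) - 1) / 3)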
Overfitting.
We can reduce overfitting further by limiting the depth of our trees.
If we reduce the number of observations fed to each tree in the forest, we get shallower trees, since there is less data to split on. Smaller trees also train faster…
So we grow 5000 trees, each fitted on a sample of 9000 observations (roughly 10% of the training rows):
model.rf3 <- randomForest(quantity_next_week ~ ., full.set.encoded,ntree=5000,do.trace=500,sampsize=9000)
## | Out-of-bag |
## Tree | MSE %Var(y) |
## 500 | 132.9 49.16 |
## 1000 | 132.8 49.11 |
## 1500 | 133 49.19 |
## 2000 | 133.1 49.22 |
## 2500 | 132.8 49.11 |
## 3000 | 132.6 49.05 |
## 3500 | 132.7 49.07 |
## 4000 | 132.6 49.06 |
## 4500 | 132.5 49.00 |
## 5000 | 132.5 49.02 |
save(model.rf3,file='model_rf3.rdata')
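A quick comparison of the final out-of-bag MSE of the full-sample forest and the subsampled one (a sketch using the two objects above):
#last entry of the OOB MSE trace for each forest
c(full_sample = tail(model.rf$mse, 1), subsampled = tail(model.rf3$mse, 1))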
RUN MODEL FILE:
#read the hidden test set supplied to the run-model script and set NAs to 0
hidden <- read.csv(opt$input)
hidden <- as.data.frame(hidden)
hidden[is.na(hidden)] <- 0
#same variable and type definitions as used for training
types <- list(store_region = 'factor', store_grading = 'factor', store_area_manager = 'factor',
              sku_department = 'factor', sku_subdepartment = 'factor', sku_category = 'factor',
              #sku_subcategory = 'factor',
              sku_range = 'factor',
              #sku_label = 'factor',
              lag_13 = 'numeric', lag_12 = 'numeric', lag_11 = 'numeric', lag_10 = 'numeric',
              lag_9 = 'numeric', lag_8 = 'numeric', lag_7 = 'numeric', lag_6 = 'numeric',
              lag_5 = 'numeric', lag_4 = 'numeric', lag_3 = 'numeric', lag_2 = 'numeric', lag_1 = 'numeric',
              quantity_today = 'numeric', stock_close = 'numeric', weeks_of_stock = 'numeric',
              weeks_lead_time = 'numeric', stock_lead_ratio = 'numeric')
#keep the modelling columns present in the hidden set and coerce them to their declared types
terms <- names(types)
terms <- terms[terms %in% names(hidden)]
new.test.set <- hidden[,terms]
for(i in names(new.test.set)){
  model.class <- types[[i]]
  as.f <- get(paste(c('as',model.class),sep=".",collapse='.'))
  new.test.set[,i] <- as.f(new.test.set[,i])
  #give every factor an '_unk' level for values never seen in training
  if(model.class == 'factor') levels(new.test.set[,i]) <- c(levels(new.test.set[,i]),'_unk')
}
#one-hot encode with the same naming convention as in training
require(onehot)
new.encode <- onehot(new.test.set,max_levels=1000)
new.test.set <- as.data.frame(predict(new.encode,new.test.set))
names(new.test.set) <- as.vector(sapply(names(new.test.set),function(x) gsub('=','_',x)))
#add zero columns for any encoded level the forest did not see during training
new.terms <- names(new.test.set)[!(names(new.test.set) %in% names(model.rf$forest$xlevels))]
if(length(new.terms) > 0){
  new.frame <- as.data.frame(matrix(0,nrow=nrow(new.test.set),ncol=length(new.terms),dimnames=list(c(),new.terms)))
  new.test.set <- cbind(new.test.set,new.frame)
}
new.test.set <- as.data.frame(new.test.set)
#drop columns the forest was not trained on and fill in any that are missing
check <- names(model.rf$forest$xlevels)
extra.names <- as.vector(which(!(names(new.test.set) %in% check)))
if(length(extra.names) > 0) new.test.set <- new.test.set[,-extra.names]
missing.names <- which(!(check %in% names(new.test.set)))
fill.missing <- as.data.frame(matrix(0,nrow=nrow(new.test.set),ncol=length(missing.names)))
names(fill.missing) <- check[missing.names]
new.test.set <- as.data.frame(cbind(new.test.set,fill.missing))
new.test.set <- setTypes(new.test.set,model.rf$forest$xlevels)
#predict with the saved forest and write the submission file
require(randomForest)
hidden.pred <- predict(model.rf,newdata=new.test.set)
submission <- hidden[,c('sku_key','store_key','week_sunday')]
submission[,'week_sunday'] <- as.character(format(as.Date(submission[,'week_sunday']),format='%Y-%m-%d'))
submission[,'quantity_next_week'] <- hidden.pred
write.csv(submission,file=opt$submission,row.names=F)
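The opt object used above is assumed to come from a command-line parser at the top of the run-model file; a hypothetical optparse sketch (the flag names are assumptions based on opt$input and opt$submission):
#hypothetical argument parsing for the run-model script
require(optparse)
option_list <- list(
  make_option('--input', type = 'character', help = 'path to the hidden test csv'),
  make_option('--submission', type = 'character', help = 'path to write the submission csv')
)
opt <- parse_args(OptionParser(option_list = option_list))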