0.1 Introduction

This data was collected through surveys by the Central Bureau of Statistics, as mentioned on the competition's front page. The survey is one of the largest post-disaster datasets ever collected, containing important information about household conditions and other relevant details. The task is to predict the level of damage caused by the earthquake.

0.2 Import Libraries

library(tidyverse)
library(h2o)
library(caret)

0.3 Import Dataset

# Load the raw competition files.
# stringsAsFactors = TRUE is set explicitly: R >= 4.0 defaults to FALSE, and
# the dummy-encoding step later in this script picks columns with is.factor(),
# which would select nothing if the categorical columns were read as character.
train <- read.csv("train_values.csv", stringsAsFactors = TRUE)
train_label <- read.csv("train_labels.csv", stringsAsFactors = TRUE)
test <- read.csv("test_values.csv", stringsAsFactors = TRUE)

# Attach the label columns from train_label to each training row, matching
# on building_id (left join keeps every row of train).
train <- train %>%
  left_join(train_label, by = "building_id")
str(train)
## 'data.frame':    260601 obs. of  40 variables:
##  $ building_id                           : int  802906 28830 94947 590882 201944 333020 728451 475515 441126 989500 ...
##  $ geo_level_1_id                        : int  6 8 21 22 11 8 9 20 0 26 ...
##  $ geo_level_2_id                        : int  487 900 363 418 131 558 475 323 757 886 ...
##  $ geo_level_3_id                        : int  12198 2812 8973 10694 1488 6089 12066 12236 7219 994 ...
##  $ count_floors_pre_eq                   : int  2 2 2 2 3 2 2 2 2 1 ...
##  $ age                                   : int  30 10 10 10 30 10 25 0 15 0 ...
##  $ area_percentage                       : int  6 8 5 6 8 9 3 8 8 13 ...
##  $ height_percentage                     : int  5 7 5 5 9 5 4 6 6 4 ...
##  $ land_surface_condition                : Factor w/ 3 levels "n","o","t": 3 2 3 3 3 3 1 3 3 3 ...
##  $ foundation_type                       : Factor w/ 5 levels "h","i","r","u",..: 3 3 3 3 3 3 3 5 3 2 ...
##  $ roof_type                             : Factor w/ 3 levels "n","q","x": 1 1 1 1 1 1 1 2 2 1 ...
##  $ ground_floor_type                     : Factor w/ 5 levels "f","m","v","x",..: 1 4 1 1 1 1 4 3 1 3 ...
##  $ other_floor_type                      : Factor w/ 4 levels "j","q","s","x": 2 2 4 4 4 2 2 4 2 1 ...
##  $ position                              : Factor w/ 4 levels "j","o","s","t": 4 3 4 3 3 3 3 3 3 3 ...
##  $ plan_configuration                    : Factor w/ 10 levels "a","c","d","f",..: 3 3 3 3 3 3 3 10 3 3 ...
##  $ has_superstructure_adobe_mud          : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ has_superstructure_mud_mortar_stone   : int  1 1 1 1 0 1 1 0 1 0 ...
##  $ has_superstructure_stone_flag         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_cement_mortar_stone: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_mud_mortar_brick   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_cement_mortar_brick: int  0 0 0 0 0 0 0 1 0 1 ...
##  $ has_superstructure_timber             : int  0 0 0 1 0 0 0 1 1 0 ...
##  $ has_superstructure_bamboo             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ has_superstructure_rc_non_engineered  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_rc_engineered      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_other              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ legal_ownership_status                : Factor w/ 4 levels "a","r","v","w": 3 3 3 3 3 3 3 3 3 3 ...
##  $ count_families                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ has_secondary_use                     : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ has_secondary_use_agriculture         : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ has_secondary_use_hotel               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_rental              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_institution         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_school              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_industry            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_health_post         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_gov_office          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_use_police          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_other               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ damage_grade                          : int  3 2 3 2 3 2 3 1 2 1 ...

We have 260601 rows and 40 columns

0.4 Merge Data

In order to preprocess the train and test data in the same way, I will stack them (row-wise) into a single new data frame.

# The test rows carry no label; add the column filled with NA so that both
# frames have the same columns and can be row-bound.
test[["damage_grade"]] <- NA

# Stack the training rows on top of the test rows.
data <- rbind(train, test)

# Inspect the structure of the combined frame.
str(data)
## 'data.frame':    347469 obs. of  40 variables:
##  $ building_id                           : int  802906 28830 94947 590882 201944 333020 728451 475515 441126 989500 ...
##  $ geo_level_1_id                        : int  6 8 21 22 11 8 9 20 0 26 ...
##  $ geo_level_2_id                        : int  487 900 363 418 131 558 475 323 757 886 ...
##  $ geo_level_3_id                        : int  12198 2812 8973 10694 1488 6089 12066 12236 7219 994 ...
##  $ count_floors_pre_eq                   : int  2 2 2 2 3 2 2 2 2 1 ...
##  $ age                                   : int  30 10 10 10 30 10 25 0 15 0 ...
##  $ area_percentage                       : int  6 8 5 6 8 9 3 8 8 13 ...
##  $ height_percentage                     : int  5 7 5 5 9 5 4 6 6 4 ...
##  $ land_surface_condition                : Factor w/ 3 levels "n","o","t": 3 2 3 3 3 3 1 3 3 3 ...
##  $ foundation_type                       : Factor w/ 5 levels "h","i","r","u",..: 3 3 3 3 3 3 3 5 3 2 ...
##  $ roof_type                             : Factor w/ 3 levels "n","q","x": 1 1 1 1 1 1 1 2 2 1 ...
##  $ ground_floor_type                     : Factor w/ 5 levels "f","m","v","x",..: 1 4 1 1 1 1 4 3 1 3 ...
##  $ other_floor_type                      : Factor w/ 4 levels "j","q","s","x": 2 2 4 4 4 2 2 4 2 1 ...
##  $ position                              : Factor w/ 4 levels "j","o","s","t": 4 3 4 3 3 3 3 3 3 3 ...
##  $ plan_configuration                    : Factor w/ 10 levels "a","c","d","f",..: 3 3 3 3 3 3 3 10 3 3 ...
##  $ has_superstructure_adobe_mud          : int  1 0 0 0 1 0 0 0 0 0 ...
##  $ has_superstructure_mud_mortar_stone   : int  1 1 1 1 0 1 1 0 1 0 ...
##  $ has_superstructure_stone_flag         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_cement_mortar_stone: int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_mud_mortar_brick   : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_cement_mortar_brick: int  0 0 0 0 0 0 0 1 0 1 ...
##  $ has_superstructure_timber             : int  0 0 0 1 0 0 0 1 1 0 ...
##  $ has_superstructure_bamboo             : int  0 0 0 1 0 0 0 0 0 0 ...
##  $ has_superstructure_rc_non_engineered  : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_rc_engineered      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_superstructure_other              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ legal_ownership_status                : Factor w/ 4 levels "a","r","v","w": 3 3 3 3 3 3 3 3 3 3 ...
##  $ count_families                        : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ has_secondary_use                     : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ has_secondary_use_agriculture         : int  0 0 0 0 0 1 0 0 0 0 ...
##  $ has_secondary_use_hotel               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_rental              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_institution         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_school              : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_industry            : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_health_post         : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_gov_office          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_use_police          : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ has_secondary_use_other               : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ damage_grade                          : int  3 2 3 2 3 2 3 1 2 1 ...

We have a total of 347469 rows and 40 columns.

0.5 Get Dummy Variables

The values of the categorical variables are not ordinal, so I will convert them into dummy variables.

# Pull out only the factor (categorical) columns of the combined data.
factor_variables <- select_if(data, is.factor)

# Expand the factors into indicator columns. The "- 1" removes the intercept
# from the model formula; the resulting matrix is converted to a data frame.
DFdummies <- as.data.frame(
  model.matrix(~ . - 1, data = factor_variables)
)

0.6 Get numerical and merge it with dummy Dataframe

# Keep the numeric columns (this includes damage_grade, which is an integer
# column at this point).
numericDF <- select_if(data, is.numeric)

# Place the numeric columns and the dummy columns side by side.
final_data <- cbind(numericDF, DFdummies)

0.7 Binning the age Column

I will bin the age column into 10 equal-width groups (0-100, 100-200, ..., 900-1000), which compresses the information into coarser categories. I will leave the result as a factor so that H2O can treat it as a categorical variable.

# Bin edges: 0, 100, ..., 1000 (eleven edges -> ten bins).
breaks <- seq(0, 1000, by = 100)

# One label per bin, in the same order as the bins.
tags <- c(
  "0-100", "100-200", "200-300", "300-400", "400-500",
  "500-600", "600-700", "700-800", "800-900", "900-1000"
)

# Bin the original age values. right = FALSE makes the intervals closed on
# the left; include.lowest = TRUE additionally closes the outermost edge.
final_data$age_group <- cut(
  data$age,
  breaks = breaks,
  labels = tags,
  right = FALSE,
  include.lowest = TRUE
)

# The response is converted to a factor before modelling.
final_data$damage_grade <- as.factor(final_data$damage_grade)

# Rows with a label are the training set; rows whose label is NA (added
# before the row-bind) are the test set.
train <- final_data[!is.na(final_data$damage_grade), ]
test <- final_data[is.na(final_data$damage_grade), ]

0.8 Start H2o and Convert dataframe to h2o dataframe

# Connect R to an H2O cluster using the default settings (the output below
# shows a connection to localhost:54321).
h2o.init()
##  Connection successful!
## 
## R is connected to the H2O cluster: 
##     H2O cluster uptime:         1 hours 13 minutes 
##     H2O cluster timezone:       America/Los_Angeles 
##     H2O data parsing timezone:  UTC 
##     H2O cluster version:        3.28.0.1 
##     H2O cluster version age:    23 days  
##     H2O cluster name:           H2O_started_from_R_uzair_out621 
##     H2O cluster total nodes:    1 
##     H2O cluster total memory:   8.54 GB 
##     H2O cluster total cores:    6 
##     H2O cluster allowed cores:  6 
##     H2O cluster healthy:        TRUE 
##     H2O Connection ip:          localhost 
##     H2O Connection port:        54321 
##     H2O Connection proxy:       NA 
##     H2O Internal Security:      FALSE 
##     H2O API Extensions:         Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4 
##     R Version:                  R version 3.5.2 (2018-12-20)
# Upload the labelled rows to the H2O cluster as an H2O frame.
train_h2o <- as.h2o(x = train)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Upload the unlabelled (test) rows to the H2O cluster as an H2O frame.
test_h2o <- as.h2o(x = test)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%

0.9 Get Validation set from train

I will use 75% of the trainset to train and 25% of the trainset to validate

# Split the labelled H2O frame with ratio 0.75 and a fixed seed so the split
# is reproducible; the first part becomes the new training frame and the
# second part is held out for validation.
parts <- h2o.splitFrame(train_h2o, ratios = 0.75, seed = 123)
valid_h2o <- parts[[2]]
train_h2o <- parts[[1]]

0.10 Set response and Predictors

I will remove building_id as it is irrelevant for the prediction. I will also remove age, since the same information is already captured by the binned age_group variable.

# Response column name.
y <- "damage_grade"

# Predictors: every column of the training data frame except the response,
# the building identifier and the raw age (age_group is kept instead).
x <- setdiff(names(train), c(y, "building_id", "age"))

0.11 GBM

I have chosen the hyperparameters below arbitrarily; they have not been tuned.

# Fit a multinomial gradient boosting model on the 75% training split.
#   - nfolds = 5: 5-fold cross-validation on the training frame.
#   - ntrees = 1500 is an upper bound; early stopping is configured through
#     stopping_metric / stopping_rounds / stopping_tolerance below.
#   - seed fixes the random number generator for reproducibility.
#   - balance_classes is spelled FALSE rather than F, because F is an
#     ordinary variable that can be reassigned.
# NOTE(review): valid_h2o is not passed as validation_frame here; it is only
# used afterwards for a separate confusion matrix.
model_gbm <- h2o.gbm(
  x = x,
  y = y,
  training_frame = train_h2o,
  nfolds = 5,
  ntrees = 1500,
  distribution = "multinomial",
  stopping_metric = "mean_per_class_error",
  stopping_rounds = 5,
  stopping_tolerance = 0,
  seed = 123,
  max_depth = 7,
  balance_classes = FALSE
)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |                                                                 |   1%
  |                                                                       
  |=                                                                |   1%
  |                                                                       
  |=                                                                |   2%
  |                                                                       
  |==                                                               |   2%
  |                                                                       
  |==                                                               |   3%
  |                                                                       
  |==                                                               |   4%
  |                                                                       
  |===                                                              |   4%
  |                                                                       
  |===                                                              |   5%
  |                                                                       
  |====                                                             |   6%
  |                                                                       
  |====                                                             |   7%
  |                                                                       
  |=====                                                            |   7%
  |                                                                       
  |=====                                                            |   8%
  |                                                                       
  |==============                                                   |  21%
  |                                                                       
  |======================                                           |  33%
  |                                                                       
  |======================                                           |  34%
  |                                                                       
  |======================                                           |  35%
  |                                                                       
  |=======================                                          |  35%
  |                                                                       
  |=======================                                          |  36%
  |                                                                       
  |========================                                         |  36%
  |                                                                       
  |========================                                         |  37%
  |                                                                       
  |========================                                         |  38%
  |                                                                       
  |=========================                                        |  38%
  |                                                                       
  |=========================                                        |  39%
  |                                                                       
  |==========================                                       |  39%
  |                                                                       
  |==========================                                       |  40%
  |                                                                       
  |==========================                                       |  41%
  |                                                                       
  |===========================                                      |  41%
  |                                                                       
  |===========================                                      |  42%
  |                                                                       
  |===================================                              |  54%
  |                                                                       
  |===================================                              |  55%
  |                                                                       
  |===========================================                      |  67%
  |                                                                       
  |============================================                     |  67%
  |                                                                       
  |============================================                     |  68%
  |                                                                       
  |=============================================                    |  68%
  |                                                                       
  |=============================================                    |  69%
  |                                                                       
  |=============================================                    |  70%
  |                                                                       
  |==============================================                   |  70%
  |                                                                       
  |==============================================                   |  71%
  |                                                                       
  |===============================================                  |  72%
  |                                                                       
  |===============================================                  |  73%
  |                                                                       
  |================================================                 |  73%
  |                                                                       
  |======================================================           |  83%
  |                                                                       
  |======================================================           |  84%
  |                                                                       
  |=======================================================          |  84%
  |                                                                       
  |=======================================================          |  85%
  |                                                                       
  |========================================================         |  85%
  |                                                                       
  |========================================================         |  86%
  |                                                                       
  |========================================================         |  87%
  |                                                                       
  |=========================================================        |  87%
  |                                                                       
  |=========================================================        |  88%
  |                                                                       
  |=================================================================| 100%
# Called with the model only, so the metrics printed below are computed on
# the training data (see the "Reported on training data" header).
h2o.performance(model_gbm)
## H2OMultinomialMetrics: gbm
## ** Reported on training data. **
## 
## Training Set Metrics: 
## =====================
## 
## Extract training frame with `h2o.getFrame("RTMP_sid_9327_5")`
## MSE: (Extract with `h2o.mse`) 0.195046
## RMSE: (Extract with `h2o.rmse`) 0.4416401
## Logloss: (Extract with `h2o.logloss`) 0.5918587
## Mean Per-Class Error: 0.345013
## R^2: (Extract with `h2o.r2`) 0.4793181
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##            1      2     3  Error               Rate
## 1       9253   9356   231 0.5089 =   9,587 / 18,840
## 2       2890  96222 12017 0.1341 = 14,907 / 111,129
## 3        249  25468 39882 0.3920 =  25,717 / 65,599
## Totals 12392 131046 52130 0.2567 = 50,211 / 195,568
## 
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios: 
##   k hit_ratio
## 1 1  0.743256
## 2 2  0.973666
## 3 3  1.000000

0.12 Performance on Validation set

# Confusion matrix of the fitted model on the held-out validation frame.
h2o.confusionMatrix(object = model_gbm, newdata = valid_h2o)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
##           1     2     3  Error              Rate
## 1      2779  3427    78 0.5578 =   3,505 / 6,284
## 2      1153 31585  4392 0.1493 =  5,545 / 37,130
## 3        91  8837 12691 0.4130 =  8,928 / 21,619
## Totals 4023 43849 17161 0.2764 = 17,978 / 65,033

The total error rate on the validation set is about 27.6% (17,978 of 65,033 buildings misclassified).

0.13 Make Final Prediction

# Score the test frame on the H2O cluster, then pull the predictions back
# into a regular R data frame.
pred <- h2o.predict(model_gbm, newdata = test_h2o)
pred <- as.data.frame(pred)
## 
  |                                                                       
  |                                                                 |   0%
  |                                                                       
  |=================================================================| 100%
# Keep only the first column of the prediction frame (read below as
# `predict`).
pred <- pred[1]

# Pair each test building_id with its predicted damage grade.
# NOTE(review): the name `sample` masks base::sample() in this session, and
# this frame is not written to disk in the code visible here.
sample <- data.frame(
  building_id = test$building_id,
  damage_grade = pred$predict
)

0.14 Result

center>

0.15 Reference