This data was collected through surveys by the Central Bureau of Statistics, as noted on the competition front page. It is one of the largest post-disaster datasets ever collected, containing detailed information about household conditions and other relevant attributes. The task is to predict the level of damage each building sustained in the earthquake.
library(tidyverse)
library(h2o)
library(caret)
train = read.csv("train_values.csv")
train_label = read.csv("train_labels.csv")
test = read.csv("test_values.csv")
train = train %>%
left_join(train_label, by = "building_id")
str(train)
## 'data.frame': 260601 obs. of 40 variables:
## $ building_id : int 802906 28830 94947 590882 201944 333020 728451 475515 441126 989500 ...
## $ geo_level_1_id : int 6 8 21 22 11 8 9 20 0 26 ...
## $ geo_level_2_id : int 487 900 363 418 131 558 475 323 757 886 ...
## $ geo_level_3_id : int 12198 2812 8973 10694 1488 6089 12066 12236 7219 994 ...
## $ count_floors_pre_eq : int 2 2 2 2 3 2 2 2 2 1 ...
## $ age : int 30 10 10 10 30 10 25 0 15 0 ...
## $ area_percentage : int 6 8 5 6 8 9 3 8 8 13 ...
## $ height_percentage : int 5 7 5 5 9 5 4 6 6 4 ...
## $ land_surface_condition : Factor w/ 3 levels "n","o","t": 3 2 3 3 3 3 1 3 3 3 ...
## $ foundation_type : Factor w/ 5 levels "h","i","r","u",..: 3 3 3 3 3 3 3 5 3 2 ...
## $ roof_type : Factor w/ 3 levels "n","q","x": 1 1 1 1 1 1 1 2 2 1 ...
## $ ground_floor_type : Factor w/ 5 levels "f","m","v","x",..: 1 4 1 1 1 1 4 3 1 3 ...
## $ other_floor_type : Factor w/ 4 levels "j","q","s","x": 2 2 4 4 4 2 2 4 2 1 ...
## $ position : Factor w/ 4 levels "j","o","s","t": 4 3 4 3 3 3 3 3 3 3 ...
## $ plan_configuration : Factor w/ 10 levels "a","c","d","f",..: 3 3 3 3 3 3 3 10 3 3 ...
## $ has_superstructure_adobe_mud : int 1 0 0 0 1 0 0 0 0 0 ...
## $ has_superstructure_mud_mortar_stone : int 1 1 1 1 0 1 1 0 1 0 ...
## $ has_superstructure_stone_flag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_cement_mortar_stone: int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_mud_mortar_brick : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_cement_mortar_brick: int 0 0 0 0 0 0 0 1 0 1 ...
## $ has_superstructure_timber : int 0 0 0 1 0 0 0 1 1 0 ...
## $ has_superstructure_bamboo : int 0 0 0 1 0 0 0 0 0 0 ...
## $ has_superstructure_rc_non_engineered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_rc_engineered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ legal_ownership_status : Factor w/ 4 levels "a","r","v","w": 3 3 3 3 3 3 3 3 3 3 ...
## $ count_families : int 1 1 1 1 1 1 1 1 1 1 ...
## $ has_secondary_use : int 0 0 0 0 0 1 0 0 0 0 ...
## $ has_secondary_use_agriculture : int 0 0 0 0 0 1 0 0 0 0 ...
## $ has_secondary_use_hotel : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_rental : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_institution : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_school : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_industry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_health_post : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_gov_office : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_use_police : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ damage_grade : int 3 2 3 2 3 2 3 1 2 1 ...
We have 260,601 rows and 40 columns in the training set.
In order to preprocess the train and test data together, I will merge them into a single data frame.
test$damage_grade = NA
## Merge Data
data = rbind(train, test)
str(data)
## 'data.frame': 347469 obs. of 40 variables:
## $ building_id : int 802906 28830 94947 590882 201944 333020 728451 475515 441126 989500 ...
## $ geo_level_1_id : int 6 8 21 22 11 8 9 20 0 26 ...
## $ geo_level_2_id : int 487 900 363 418 131 558 475 323 757 886 ...
## $ geo_level_3_id : int 12198 2812 8973 10694 1488 6089 12066 12236 7219 994 ...
## $ count_floors_pre_eq : int 2 2 2 2 3 2 2 2 2 1 ...
## $ age : int 30 10 10 10 30 10 25 0 15 0 ...
## $ area_percentage : int 6 8 5 6 8 9 3 8 8 13 ...
## $ height_percentage : int 5 7 5 5 9 5 4 6 6 4 ...
## $ land_surface_condition : Factor w/ 3 levels "n","o","t": 3 2 3 3 3 3 1 3 3 3 ...
## $ foundation_type : Factor w/ 5 levels "h","i","r","u",..: 3 3 3 3 3 3 3 5 3 2 ...
## $ roof_type : Factor w/ 3 levels "n","q","x": 1 1 1 1 1 1 1 2 2 1 ...
## $ ground_floor_type : Factor w/ 5 levels "f","m","v","x",..: 1 4 1 1 1 1 4 3 1 3 ...
## $ other_floor_type : Factor w/ 4 levels "j","q","s","x": 2 2 4 4 4 2 2 4 2 1 ...
## $ position : Factor w/ 4 levels "j","o","s","t": 4 3 4 3 3 3 3 3 3 3 ...
## $ plan_configuration : Factor w/ 10 levels "a","c","d","f",..: 3 3 3 3 3 3 3 10 3 3 ...
## $ has_superstructure_adobe_mud : int 1 0 0 0 1 0 0 0 0 0 ...
## $ has_superstructure_mud_mortar_stone : int 1 1 1 1 0 1 1 0 1 0 ...
## $ has_superstructure_stone_flag : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_cement_mortar_stone: int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_mud_mortar_brick : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_cement_mortar_brick: int 0 0 0 0 0 0 0 1 0 1 ...
## $ has_superstructure_timber : int 0 0 0 1 0 0 0 1 1 0 ...
## $ has_superstructure_bamboo : int 0 0 0 1 0 0 0 0 0 0 ...
## $ has_superstructure_rc_non_engineered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_rc_engineered : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_superstructure_other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ legal_ownership_status : Factor w/ 4 levels "a","r","v","w": 3 3 3 3 3 3 3 3 3 3 ...
## $ count_families : int 1 1 1 1 1 1 1 1 1 1 ...
## $ has_secondary_use : int 0 0 0 0 0 1 0 0 0 0 ...
## $ has_secondary_use_agriculture : int 0 0 0 0 0 1 0 0 0 0 ...
## $ has_secondary_use_hotel : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_rental : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_institution : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_school : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_industry : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_health_post : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_gov_office : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_use_police : int 0 0 0 0 0 0 0 0 0 0 ...
## $ has_secondary_use_other : int 0 0 0 0 0 0 0 0 0 0 ...
## $ damage_grade : int 3 2 3 2 3 2 3 1 2 1 ...
We now have a total of 347,469 rows and 40 columns.
The categorical variables are not ordinal, so I will convert them into dummy variables.
factor_variables = data %>%
select_if(is.factor)
DFdummies <- as.data.frame(model.matrix(~.-1, factor_variables))
numericDF = data %>%
select_if(is.numeric)
final_data = cbind(numericDF, DFdummies)
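To illustrate what model.matrix(~ . - 1, ...) does here, a minimal sketch on a made-up single-factor data frame:
toy = data.frame(roof_type = factor(c("n", "q", "x", "n")))
model.matrix(~ . - 1, toy)  # one 0/1 indicator column per level: roof_typen, roof_typeq, roof_typex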
I will bin the age column into 10 groups, which compresses the information into broader ranges, and leave the result as a factor for H2O to handle.
breaks <- c(0, 100, 200, 300, 400, 500, 600, 700, 800, 900, 1000)
tags <- c("0-100","100-200", "200-300",
"300-400", "400-500", "500-600","600-700", "700-800","800-900", "900-1000")
final_data$age_group <- cut(data$age,
breaks = breaks,
include.lowest = TRUE,
right = FALSE,
labels = tags)
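A quick sanity check of the binning, using the breaks and labels defined above:
table(final_data$age_group, useNA = "ifany")  # any NA would mean an age outside the 0-1000 range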
final_data$damage_grade = as.factor(final_data$damage_grade)
train = final_data[!is.na(final_data$damage_grade),]
test = final_data[is.na(final_data$damage_grade),]
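A quick check that the split recovers the original sizes, 260,601 labelled training rows and 86,868 test rows:
nrow(train)  # rows with a known damage_grade
nrow(test)   # rows where damage_grade is NA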
h2o.init()
## Connection successful!
##
## R is connected to the H2O cluster:
## H2O cluster uptime: 1 hours 13 minutes
## H2O cluster timezone: America/Los_Angeles
## H2O data parsing timezone: UTC
## H2O cluster version: 3.28.0.1
## H2O cluster version age: 23 days
## H2O cluster name: H2O_started_from_R_uzair_out621
## H2O cluster total nodes: 1
## H2O cluster total memory: 8.54 GB
## H2O cluster total cores: 6
## H2O cluster allowed cores: 6
## H2O cluster healthy: TRUE
## H2O Connection ip: localhost
## H2O Connection port: 54321
## H2O Connection proxy: NA
## H2O Internal Security: FALSE
## H2O API Extensions: Amazon S3, Algos, AutoML, Core V3, TargetEncoder, Core V4
## R Version: R version 3.5.2 (2018-12-20)
train_h2o = as.h2o(train)
test_h2o = as.h2o(test)
I will use 75% of the training set for training and hold out the remaining 25% for validation.
parts = h2o.splitFrame(train_h2o, ratios = 0.75, seed = 123)
train_h2o = parts[[1]]
valid_h2o = parts[[2]]
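The resulting frame sizes can be checked with the h2o accessors; they should match the totals in the confusion matrices further below (195,568 and 65,033):
h2o.nrow(train_h2o)  # roughly 75% of the 260,601 training rows
h2o.nrow(valid_h2o)  # the remaining ~25%, held out for validation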
I will remove building_id as it is irrelevant for prediction, and age because the binned age_group variable replaces it.
y = "damage_grade"
x = setdiff(colnames(train), c(y, "building_id", "age"))
I have selected these hyperparameters somewhat arbitrarily, without systematic tuning.
model_gbm = h2o.gbm(x = x,
y = y,
training_frame = train_h2o,
nfolds = 5,
ntrees = 1500,
distribution = "multinomial",
stopping_metric = "mean_per_class_error",
stopping_rounds = 5,
stopping_tolerance = 0,
seed = 123,
max_depth = 7,
balance_classes = FALSE
)
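As an optional check on the fitted model, the variable importances can be inspected with the built-in h2o helpers:
h2o.varimp(model_gbm)                                # table of relative importances
h2o.varimp_plot(model_gbm, num_of_features = 20)     # plot the 20 most important features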
h2o.performance(model_gbm)
## H2OMultinomialMetrics: gbm
## ** Reported on training data. **
##
## Training Set Metrics:
## =====================
##
## Extract training frame with `h2o.getFrame("RTMP_sid_9327_5")`
## MSE: (Extract with `h2o.mse`) 0.195046
## RMSE: (Extract with `h2o.rmse`) 0.4416401
## Logloss: (Extract with `h2o.logloss`) 0.5918587
## Mean Per-Class Error: 0.345013
## R^2: (Extract with `h2o.r2`) 0.4793181
## Confusion Matrix: Extract with `h2o.confusionMatrix(<model>,train = TRUE)`)
## =========================================================================
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 1 2 3 Error Rate
## 1 9253 9356 231 0.5089 = 9,587 / 18,840
## 2 2890 96222 12017 0.1341 = 14,907 / 111,129
## 3 249 25468 39882 0.3920 = 25,717 / 65,599
## Totals 12392 131046 52130 0.2567 = 50,211 / 195,568
##
## Hit Ratio Table: Extract with `h2o.hit_ratio_table(<model>,train = TRUE)`
## =======================================================================
## Top-3 Hit Ratios:
## k hit_ratio
## 1 1 0.743256
## 2 2 0.973666
## 3 3 1.000000
h2o.confusionMatrix(model_gbm, newdata = valid_h2o)
## Confusion Matrix: Row labels: Actual class; Column labels: Predicted class
## 1 2 3 Error Rate
## 1 2779 3427 78 0.5578 = 3,505 / 6,284
## 2 1153 31585 4392 0.1493 = 5,545 / 37,130
## 3 91 8837 12691 0.4130 = 8,928 / 21,619
## Totals 4023 43849 17161 0.2764 = 17,978 / 65,033
The overall error rate on the validation set is about 27.6%, compared with 25.7% on the training data.
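The same validation metrics can also be pulled out programmatically with the standard h2o accessors (a small sketch):
perf_valid = h2o.performance(model_gbm, newdata = valid_h2o)  # full metrics on the held-out 25%
h2o.logloss(perf_valid)
h2o.mean_per_class_error(perf_valid)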
pred = as.data.frame(h2o.predict(model_gbm, newdata = test_h2o))
pred = pred[1]  # keep only the predicted class column
sample = data.frame(building_id = test$building_id, damage_grade = pred$predict)
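Finally, the predictions can be written out for submission; the file name submission.csv is just a placeholder:
write.csv(sample, "submission.csv", row.names = FALSE)  # building_id / damage_grade pairs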