# ModelStu002

library("Numero")
library("DALEX")

## Welcome to DALEX (version: 1.0).
## Find examples and detailed introduction at: https://pbiecek.github.io/ema/
## Additional features will be available after installation of: ggpubr.
## Use 'install_dependencies()' to get all suggested dependencies

library("ranger")
library(data.table)

# Point the session at the project folder, read the crash table with
# data.table::fread(), and list the available columns.
# NOTE(review): setwd() with an absolute, machine-specific path makes the
# script non-portable; consider building the file path explicitly instead.
setwd("C:/Users/s-das/Syncplicity/MyProjects_IMP/MY_Papers_V2/TRB 2021/EScotter_BayesianRule/")
it01 <- fread("EScotter_Fin2.csv")
colnames(it01)

##  [1] "CRASH_NUM1"         "NARRATIVE"          "ACCESS_CNTL_CD"    
##  [4] "ALIGNMENT_CD"       "HWY_TYPE_CD"        "INVEST_AGENCY_CD"  
##  [7] "LIGHTING_CD"        "LOC_TYPE_CD"        "MAN_COLL_CD"       
## [10] "PRI_CONTRIB_FAC_CD" "ROAD_COND_CD"       "ROAD_REL_CD"       
## [13] "ROAD_TYPE_CD"       "SEC_CONTRIB_FAC_CD" "SEVERITY_CD"       
## [16] "SURF_COND_CD"       "SURF_TYPE_CD"       "WEATHER_CD"        
## [19] "CRASH_DATE"         "CRASH_TIME"         "CR_MONTH"          
## [22] "CR_HOUR"            "DAY_OF_WK"          "INTERSECTION"      
## [25] "NUM_VEH"            "LAT"                "LONG"              
## [28] "PARISH_CD"          "CITY_CD"            "TIME_AMB_ARR"      
## [31] "TIME_AMB_ARR_HOSP"  "HIT_AND_RUN"

# Keep the response (SEVERITY_CD) and the six predictors used by the model.
mn01 <- it01[, c("SEVERITY_CD", "DAY_OF_WK", "LIGHTING_CD", "HWY_TYPE_CD", "WEATHER_CD", "CR_HOUR",
               "NUM_VEH")]

# Drop rows with any missing value and convert to a plain data.frame.
# A data.table carries the class vector c("data.table", "data.frame"), which
# makes checks of the form `if (class(x) != "data.frame")` evaluate a
# length-2 condition: a warning on older R and an error on R >= 4.2.
mn02 <- as.data.frame(na.omit(mn01))

# The response must be a factor so the forest is fitted as a classifier.
mn02$SEVERITY_CD <- as.factor(mn02$SEVERITY_CD)


# Fix the RNG state so the forest (and anything random that follows) can be
# reproduced run to run; ranger takes its seed from R's RNG when no `seed`
# argument is supplied.
set.seed(2021)

# Probability forest with 50 trees: SEVERITY_CD against all remaining columns.
model_HR_ranger <- ranger(SEVERITY_CD~.,  data = mn02, probability = TRUE, num.trees = 50)
model_HR_ranger

## Ranger result
## 
## Call:
##  ranger(SEVERITY_CD ~ ., data = mn02, probability = TRUE, num.trees = 50) 
## 
## Type:                             Probability estimation 
## Number of trees:                  50 
## Sample size:                      338 
## Number of independent variables:  6 
## Mtry:                             2 
## Target node size:                 10 
## Variable importance mode:         none 
## Splitrule:                        gini 
## OOB prediction error (Brier s.):  0.4839634

# Explainer built with the factor response as `y`. The predictors are every
# column of mn02 except the first one (SEVERITY_CD).
explain_HR_ranger <- explain(
  model = model_HR_ranger,
  data = mn02[, -1],
  y = mn02$SEVERITY_CD,
  label = "Ranger Multilabel Classification",
  colorize = FALSE
)

## Preparation of a new explainer is initiated
##   -> model label       :  Ranger Multilabel Classification 
##   -> data              :  338  rows  6  cols 
##   -> target variable   :  338  values 
##   -> target variable   :  Please note that 'y' is a factor.  (  WARNING  )
##   -> target variable   :  Consider changing the 'y' to a logical or numerical vector.
##   -> target variable   :  Otherwise I will not be able to calculate residuals or loss function.
##   -> model_info        :  package ranger , ver. 0.12.1 , task classification (  default  ) 
##   -> predict function  :  yhat.ranger  will be used (  default  )
##   -> predicted values  :  predict function returns multiple columns:  5  (  WARNING  ) some of functionalities may not work 
##   -> residual function :  difference between y and yhat (  default  )

## Warning in Ops.factor(y, predict_function(model, data)): '-' not meaningful for
## factors

##   -> residuals         :  numerical, min =  NA , mean =  NA , max =  NA  
##   A new explainer has been created!

# Integer level codes of the severity factor (as.numeric() on a factor yields
# the level index, not the original label); show the first ten values.
y_HR <- as.numeric(mn02[["SEVERITY_CD"]])
y_HR[seq_len(10)]

##  [1] 3 4 4 3 2 2 3 4 2 4

# Same model and predictors, but with the integer-coded response as `y`.
explain_HR_ranger_new_y <- explain(
  model = model_HR_ranger,
  data = mn02[, -1],
  y = y_HR,
  label = "Ranger Multilabel Classification",
  colorize = FALSE
)

## Preparation of a new explainer is initiated
##   -> model label       :  Ranger Multilabel Classification 
##   -> data              :  338  rows  6  cols 
##   -> target variable   :  338  values 
##   -> model_info        :  package ranger , ver. 0.12.1 , task classification (  default  ) 
##   -> predict function  :  yhat.ranger  will be used (  default  )
##   -> predicted values  :  predict function returns multiple columns:  5  (  WARNING  ) some of functionalities may not work 
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  0.5780405 , mean =  3.711243 , max =  5  
##   A new explainer has been created!

# Variable importance for the explainer that carries the integer-coded target,
# scored with DALEX's cross-entropy loss, followed by its plot.
vi <- variable_importance(explain_HR_ranger_new_y, loss_function = loss_cross_entropy)
plot(vi)

# Partial profile of the model response with respect to NUM_VEH.
ve_p <- variable_profile(explain_HR_ranger, variables = "NUM_VEH", type = "partial")

## Warning in if (class(new_observation_ext) != "data.frame") {: the condition has
## length > 1 and only the first element will be used

# Set the `color` field to "_label_" before plotting the profile.
ve_p$color <- "_label_"
plot(ve_p)

# Same profile for CR_HOUR; ve_p is overwritten.
ve_p <- variable_profile(explain_HR_ranger, variables = "CR_HOUR", type = "partial")

## Warning in if (class(new_observation_ext) != "data.frame") {: the condition has
## length > 1 and only the first element will be used

ve_p$color <- "_label_"
plot(ve_p)

# Per-observation attributions for the second row of mn02 (the full row,
# including the SEVERITY_CD column), first with type "break_down" ...
bd <- variable_attribution(explain_HR_ranger, mn02[2,], type = "break_down")
plot(bd)

# ... then with type "shap" for the same observation.
shap <- variable_attribution(explain_HR_ranger, mn02[2,], type = "shap")
plot(shap)

#' Residuals for a probability model: one minus the probability predicted for
#' the observed class.
#'
#' @param model Fitted model. `predict(model, data, probability = TRUE)` must
#'   return a list whose `predictions` element is a matrix with one row per
#'   observation and one column per class, with class labels as column names.
#' @param data Predictor data passed on to `predict()`.
#' @param y Observed classes (factor or anything coercible to character); each
#'   value must match a column name of the prediction matrix.
#' @return Unnamed numeric vector with one residual per predicted row
#'   (`numeric(0)` when there are no rows).
residual_function <- function(model, data, y) {
  y_char <- as.character(y)
  pred <- predict(model, data, probability = TRUE)$predictions

  # seq_len() instead of 1:nrow() so zero-row predictions give an empty result
  # rather than iterating over c(1, 0).
  row_idx <- seq_len(nrow(pred))
  col_idx <- match(y_char[row_idx], colnames(pred))

  # A label without a matching probability column (or a missing label) cannot
  # be scored; fail loudly instead of returning NA residuals.
  if (anyNA(col_idx)) {
    stop("every value of 'y' must match a column name of the predicted ",
         "probability matrix", call. = FALSE)
  }

  # Two-column matrix indexing picks pred[i, col_idx[i]] for all rows at once.
  1 - pred[cbind(row_idx, col_idx)]
}


# Explainer with the factor response and the custom residual function
# (one minus the predicted probability of the observed class).
explain_HR_ranger_residual <- explain(
  model = model_HR_ranger,
  data = mn02[, -1],
  y = mn02$SEVERITY_CD,
  label = "Ranger Multilabel Classification",
  residual_function = residual_function,
  colorize = FALSE
)

## Preparation of a new explainer is initiated
##   -> model label       :  Ranger Multilabel Classification 
##   -> data              :  338  rows  6  cols 
##   -> target variable   :  338  values 
##   -> target variable   :  Please note that 'y' is a factor.  (  WARNING  )
##   -> target variable   :  Consider changing the 'y' to a logical or numerical vector.
##   -> target variable   :  Otherwise I will not be able to calculate residuals or loss function.
##   -> model_info        :  package ranger , ver. 0.12.1 , task classification (  default  ) 
##   -> predict function  :  yhat.ranger  will be used (  default  )
##   -> predicted values  :  predict function returns multiple columns:  5  (  WARNING  ) some of functionalities may not work 
##   -> residual function :  residual_function 
##   -> residuals         :  numerical, min =  0.2261979 , mean =  0.5522091 , max =  0.9241105  
##   A new explainer has been created!

# Diagnostics for the first observation of mn02, using the explainer that
# carries the custom residual function.
rd_all <- individual_diagnostics(explain_HR_ranger_residual, mn02[1,])

## Warning in ks.test(residuals_all, residuals_sel): p-value will be approximate in
## the presence of ties

plot(rd_all)

# Same observation, restricted to the NUM_VEH variable.
# NOTE(review): the name rd_salary does not reflect NUM_VEH; it looks like a
# leftover from another example -- consider renaming once all uses are known.
rd_salary <- individual_diagnostics(explain_HR_ranger_residual, mn02[1,], variables = "NUM_VEH")

## Warning in if (class(new_observation_ext) != "data.frame") {: the condition has
## length > 1 and only the first element will be used

## Warning in if (class(new_observation_ext) != "data.frame") {: the condition has
## length > 1 and only the first element will be used

plot(rd_salary)