Fair Models

facto

library(fairmodels)

data("adult")
head(adult)

##   salary age        workclass fnlwgt education education_num     marital_status
## 1  <=50K  39        State-gov  77516 Bachelors            13      Never-married
## 2  <=50K  50 Self-emp-not-inc  83311 Bachelors            13 Married-civ-spouse
## 3  <=50K  38          Private 215646   HS-grad             9           Divorced
## 4  <=50K  53          Private 234721      11th             7 Married-civ-spouse
## 5  <=50K  28          Private 338409 Bachelors            13 Married-civ-spouse
## 6  <=50K  37          Private 284582   Masters            14 Married-civ-spouse
##          occupation  relationship  race    sex capital_gain capital_loss
## 1      Adm-clerical Not-in-family White   Male         2174            0
## 2   Exec-managerial       Husband White   Male            0            0
## 3 Handlers-cleaners Not-in-family White   Male            0            0
## 4 Handlers-cleaners       Husband Black   Male            0            0
## 5    Prof-specialty          Wife Black Female            0            0
## 6   Exec-managerial          Wife White Female            0            0
##   hours_per_week native_country
## 1             40  United-States
## 2             13  United-States
## 3             40  United-States
## 4             40  United-States
## 5             40           Cuba
## 6             40  United-States

dim(adult)

## [1] 32561    15

library(gbm)

## Warning: package 'gbm' was built under R version 3.6.2

## Loaded gbm 2.1.8

library(DALEX)

## Welcome to DALEX (version: 1.0.1).
## Find examples and detailed introduction at: https://pbiecek.github.io/ema/
## Additional features will be available after installation of: iBreakDown.
## Use 'install_dependencies()' to get all suggested dependencies

adult$salary   <- as.numeric(adult$salary) -1 # 0 if bad and 1 if good risk
protected     <- adult$sex
adult <- adult[colnames(adult) != "sex"] # sex not specified

# making model
set.seed(1)
gbm_model <-gbm(salary ~. , data = adult, distribution = "bernoulli")

# making explainer
gbm_explainer <- explain(gbm_model,
                         data = adult[,-1],
                         y = adult$salary,
                         colorize = FALSE)

## Preparation of a new explainer is initiated
##   -> model label       :  gbm  (  default  )
##   -> data              :  32561  rows  13  cols 
##   -> target variable   :  32561  values 
##   -> model_info        :  package gbm , ver. 2.1.8 , task classification (  default  ) 
##   -> predict function  :  yhat.gbm  will be used (  default  )
##   -> predicted values  :  numerical, min =  0.0101498 , mean =  0.2409542 , max =  0.9864558  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.9790795 , mean =  -0.0001445991 , max =  0.9864904  
##   A new explainer has been created!

model_performance(gbm_explainer)

## Measures for:  classification
## recall   : 0.5353909 
## precision: 0.7999238 
## f1       : 0.6414547 
## accuracy : 0.8558705 
## auc      : 0.9093789
## 
## Residuals:
##          0%         10%         20%         30%         40%         50% 
## -0.97907954 -0.31512711 -0.20921106 -0.12255213 -0.06480941 -0.04122486 
##         60%         70%         80%         90%        100% 
## -0.02832005 -0.01740186  0.13344943  0.53676046  0.98649038

fobject <- fairness_check(gbm_explainer, 
                          protected  = protected, 
                          privileged = "Male", 
                          colorize = FALSE)

## Creating fairness object
## -> Privileged subgroup       : character ( Ok  )
## -> Protected variable        : factor ( Ok  ) 
## -> Cutoff values for explainers  : 0.5 ( for all subgroups ) 
## -> Fairness objects      : 0 objects 
## -> Checking explainers       : 1 in total (  compatible  )
## -> Metric calculation        : 12/12 metrics calculated for all models
##  Fairness object created succesfully

plot(fobject)

data_fixed <- disparate_impact_remover(data = adult, protected = protected, 
                                       features_to_transform = c("age", "hours_per_week",
                                                                 "capital_loss",
                                                                 "capital_gain"))

set.seed(1)
gbm_model     <- gbm(salary ~. , data = data_fixed, distribution = "bernoulli")
gbm_explainer_dir <- explain(gbm_model,
                             data = data_fixed[,-1],
                             y = adult$salary,
                             label = "gbm_dir",
                             verbose = FALSE)

fobject <- fairness_check(gbm_explainer, gbm_explainer_dir,
                          protected = protected, 
                          privileged = "Male",
                          verbose = FALSE)
plot(fobject)

weights <- reweight(protected = protected, y = adult$salary)

set.seed(1)
gbm_model     <- gbm(salary ~. ,
                     data = adult,
                     weights = weights,
                     distribution = "bernoulli")

gbm_explainer_w <- explain(gbm_model,
                           data = adult[,-1],
                           y = adult$salary,
                           label = "gbm_weighted",
                           verbose = FALSE)

fobject <- fairness_check(fobject, gbm_explainer_w, verbose = FALSE)

plot(fobject)

# to obtain probs we will use simple linear regression
probs <- glm(salary ~., data = adult, family = binomial())$fitted.values

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

uniform_indexes      <- resample(protected = protected,
                                 y = adult$salary)
preferential_indexes <- resample(protected = protected,
                                 y = adult$salary,
                                 type = "preferential",
                                 probs = probs)

set.seed(1)
gbm_model     <- gbm(salary ~. ,
                     data = adult[uniform_indexes,],
                     distribution = "bernoulli")

gbm_explainer_u <- explain(gbm_model,
                           data = adult[,-1],
                           y = adult$salary,
                           label = "gbm_uniform",
                           verbose = FALSE)

set.seed(1)
gbm_model     <- gbm(salary ~. ,
                     data = adult[preferential_indexes,],
                     distribution = "bernoulli")

gbm_explainer_p <- explain(gbm_model,
                           data = adult[,-1],
                           y = adult$salary,
                           label = "gbm_preferential",
                           verbose = FALSE)

fobject <- fairness_check(fobject, gbm_explainer_u, gbm_explainer_p, 
                          verbose = FALSE)
plot(fobject)

# we will need normal explainer 
set.seed(1)
gbm_model <-gbm(salary ~. , data = adult, distribution = "bernoulli")
gbm_explainer <- explain(gbm_model,
                         data = adult[,-1],
                         y = adult$salary,
                         verbose = FALSE)

gbm_explainer_r <- roc_pivot(gbm_explainer,
                             protected = protected,
                             privileged = "Male")


fobject <- fairness_check(fobject, gbm_explainer_r, 
                          label = "gbm_roc",  # label as vector for explainers
                          verbose = FALSE) 

plot(fobject)

set.seed(1)
gbm_model <-gbm(salary ~. , data = adult, distribution = "bernoulli")
gbm_explainer <- explain(gbm_model,
                         data = adult[,-1],
                         y = adult$salary,
                         verbose = FALSE)

# test fairness object
fobject_test <- fairness_check(gbm_explainer, 
                               protected = protected, 
                               privileged = "Male",
                               verbose = FALSE) 

plot(ceteris_paribus_cutoff(fobject_test, subgroup = "Female"))

## Warning: Removed 13 row(s) containing missing values (geom_path).

plot(ceteris_paribus_cutoff(fobject_test,
                            subgroup = "Female",
                            fairness_metrics = c("ACC","TPR","STP")))

## Warning: Removed 4 row(s) containing missing values (geom_path).

fc <- fairness_check(gbm_explainer, fobject,
                     label = "gbm_cutoff",
                     cutoff = list(Female = 0.25),
                     verbose = FALSE)

plot(fc)

data("compas")

head(compas)

##   Two_yr_Recidivism Number_of_Priors Age_Above_FourtyFive Age_Below_TwentyFive
## 1                 0                0                    1                    0
## 2                 1                0                    0                    0
## 3                 1                4                    0                    1
## 4                 0                0                    0                    0
## 5                 1               14                    0                    0
## 6                 0                3                    0                    0
##   Misdemeanor        Ethnicity  Sex
## 1           0            Other Male
## 2           0 African_American Male
## 3           0 African_American Male
## 4           1            Other Male
## 5           0        Caucasian Male
## 6           0            Other Male

library(DALEX)
library(ranger)

# train
rf_compas <- ranger(Two_yr_Recidivism ~., data = compas, probability = TRUE)

# numeric target values
y_numeric <- as.numeric(compas$Two_yr_Recidivism)-1

# explainer
rf_explainer <- explain(rf_compas, data = compas[,-1], y = y_numeric, colorize = FALSE)

## Preparation of a new explainer is initiated
##   -> model label       :  ranger  (  default  )
##   -> data              :  6172  rows  6  cols 
##   -> target variable   :  6172  values 
##   -> model_info        :  package ranger , ver. 0.11.2 , task classification (  default  ) 
##   -> predict function  :  yhat.ranger  will be used (  default  )
##   -> predicted values  :  numerical, min =  0.1339348 , mean =  0.4551456 , max =  0.8347115  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.7768296 , mean =  -2.572062e-05 , max =  0.8494836  
##   A new explainer has been created!

fobject <- fairness_check(rf_explainer,                         # explainer
                          protected = compas$Ethnicity,         # protected variable as factor
                          privileged = "Caucasian",             # level in protected variable, potentially more privileged
                          cutoff = 0.5,                         # cutoff - optional, default = 0.5
                          colorize = FALSE)

## Creating fairness object
## -> Privileged subgroup       : character ( Ok  )
## -> Protected variable        : factor ( Ok  ) 
## -> Cutoff values for explainers  : 0.5 ( for all subgroups )
## -> Fairness objects      : 0 objects 
## -> Checking explainers       : 1 in total (  compatible  )
## -> Metric calculation        : 12/12 metrics calculated for all models
##  Fairness object created succesfully

plot(fobject)

plot_density(fobject)

library(gbm)

rf_compas_1 <- ranger(Two_yr_Recidivism ~Number_of_Priors+Age_Below_TwentyFive,
                      data = compas,
                      probability = TRUE)

lr_compas_1 <- glm(Two_yr_Recidivism~.,
                   data=compas,
                   family=binomial(link="logit"))

rf_compas_2 <- ranger(Two_yr_Recidivism ~., data = compas, probability = TRUE) 
rf_compas_3 <- ranger(Two_yr_Recidivism ~ Age_Above_FourtyFive+Misdemeanor,
                      data = compas,
                      probability = TRUE)

df <- compas
df$Two_yr_Recidivism <- as.numeric(compas$Two_yr_Recidivism)-1
gbm_compas_1<- gbm(Two_yr_Recidivism~., data = df)

## Distribution not specified, assuming bernoulli ...

explainer_1 <- explain(rf_compas_1,  data = compas[,-1], y = y_numeric)

## Preparation of a new explainer is initiated
##   -> model label       :  ranger  ( [33m default [39m )
##   -> data              :  6172  rows  6  cols 
##   -> target variable   :  6172  values 
##   -> model_info        :  package ranger , ver. 0.11.2 , task classification ( [33m default [39m ) 
##   -> predict function  :  yhat.ranger  will be used ( [33m default [39m )
##   -> predicted values  :  numerical, min =  0.2967264 , mean =  0.4552762 , max =  0.7799906  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.755995 , mean =  -0.0001563086 , max =  0.7032736  
##  [32m A new explainer has been created! [39m

explainer_2 <- explain(lr_compas_1,  data = compas[,-1], y = y_numeric)

## Preparation of a new explainer is initiated
##   -> model label       :  lm  ( [33m default [39m )
##   -> data              :  6172  rows  6  cols 
##   -> target variable   :  6172  values 
##   -> model_info        :  package stats , ver. 3.6.0 , task regression ( [33m default [39m ) 
##   -> predict function  :  yhat.glm  will be used ( [33m default [39m )
##   -> predicted values  :  numerical, min =  0.1144574 , mean =  0.4551199 , max =  0.995477  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.9767658 , mean =  5.070175e-13 , max =  0.8822826  
##  [32m A new explainer has been created! [39m

explainer_3 <- explain(rf_compas_2,  data = compas[,-1], y = y_numeric, label = "ranger_2")

## Preparation of a new explainer is initiated
##   -> model label       :  ranger_2 
##   -> data              :  6172  rows  6  cols 
##   -> target variable   :  6172  values 
##   -> model_info        :  package ranger , ver. 0.11.2 , task classification ( [33m default [39m ) 
##   -> predict function  :  yhat.ranger  will be used ( [33m default [39m )
##   -> predicted values  :  numerical, min =  0.1302174 , mean =  0.454672 , max =  0.8435127  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.7800877 , mean =  0.0004479272 , max =  0.8463011  
##  [32m A new explainer has been created! [39m

explainer_4 <- explain(rf_compas_3,  data = compas[,-1], y = y_numeric, label = "ranger_3")

## Preparation of a new explainer is initiated
##   -> model label       :  ranger_3 
##   -> data              :  6172  rows  6  cols 
##   -> target variable   :  6172  values 
##   -> model_info        :  package ranger , ver. 0.11.2 , task classification ( [33m default [39m ) 
##   -> predict function  :  yhat.ranger  will be used ( [33m default [39m )
##   -> predicted values  :  numerical, min =  0.3024475 , mean =  0.4547728 , max =  0.5135181  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.5135181 , mean =  0.000347104 , max =  0.6975525  
##  [32m A new explainer has been created! [39m

explainer_5 <- explain(gbm_compas_1, data = compas[,-1], y = y_numeric)

## Preparation of a new explainer is initiated
##   -> model label       :  gbm  ( [33m default [39m )
##   -> data              :  6172  rows  6  cols 
##   -> target variable   :  6172  values 
##   -> model_info        :  package gbm , ver. 2.1.8 , task classification ( [33m default [39m ) 
##   -> predict function  :  yhat.gbm  will be used ( [33m default [39m )
##   -> predicted values  :  numerical, min =  0.1173517 , mean =  0.454823 , max =  0.9066772  
##   -> residual function :  difference between y and yhat ( [33m default [39m )
##   -> residuals         :  numerical, min =  -0.8737664 , mean =  0.000296854 , max =  0.8750579  
##  [32m A new explainer has been created! [39m

fobject <- fairness_check(explainer_1, explainer_2,
                          explainer_3, explainer_4,
                          explainer_5,
                          protected = compas$Ethnicity,
                          privileged = "Caucasian",
                          verbose = FALSE) 


sm <- stack_metrics(fobject)

## Warning in drop_metrics_with_na(parity_loss_metric_data): Found metric with NA: TPR, PPV, omiting it

plot(sm)

cm <- choose_metric(fobject, "TPR")
plot(cm)

## Warning: Removed 1 rows containing missing values (position_stack).

fair_pca <- fairness_pca(fobject)

## Warning in drop_metrics_with_na(data): Found metric with NA: TPR, PPV, TS, F1, omiting it

print(fair_pca)

## Fairness PCA : 
##             PC1        PC2        PC3        PC4           PC5
## [1,] -1.1397848  0.0782970 -0.7870143 -0.6444191  1.110223e-16
## [2,]  3.0369489  1.3967656 -0.2532461  0.1447943  1.165734e-15
## [3,]  0.1458079 -1.4931618 -0.7708397  0.4185452 -3.552714e-15
## [4,] -3.2070818  0.9050207  0.4858758  0.3041836  1.117162e-15
## [5,]  1.1641098 -0.8869215  1.3252242 -0.2231039 -7.355228e-16
## 
## Created with: 
## [1] "ranger"   "lm"       "ranger_2" "ranger_3" "gbm"     
## 
## First two components explained 87 % of variance.

plot(fair_pca)

fheatmap <- fairness_heatmap(fobject)
plot(fheatmap, text_size = 3)

## Warning: Removed 4 rows containing missing values (geom_text).

fobject2 <- fairness_check(explainer_1,explainer_2, 
                           protected = compas$Ethnicity,
                           privileged = "Caucasian", 
                           verbose = FALSE)


gm <- group_metric(fobject2, fairness_metric = "FPR")

## Performace metric not given, setting deafult ( accuracy )  
## 
## Creating object with: 
## Fairness metric:  FPR 
## Performance metric:  accuracy

plot(gm)

fradar <- fairness_radar(fobject2)

## Warning in fairness_radar(fobject2): Found metric with NA: TPR, PPV, ommiting it

plot(fradar)

ac <- all_cutoffs(fobject2)

plot(ac)

## Warning: Removed 192 row(s) containing missing values (geom_path).

cpc <- ceteris_paribus_cutoff(fobject2, subgroup = "African_American")

plot(cpc)

## Warning: Removed 252 row(s) containing missing values (geom_path).

## Warning: Removed 1 rows containing missing values (geom_segment).

## Warning: Removed 1 rows containing missing values (geom_text_repel).