facto
library(fairmodels)
data("adult")
head(adult)
## salary age workclass fnlwgt education education_num marital_status
## 1 <=50K 39 State-gov 77516 Bachelors 13 Never-married
## 2 <=50K 50 Self-emp-not-inc 83311 Bachelors 13 Married-civ-spouse
## 3 <=50K 38 Private 215646 HS-grad 9 Divorced
## 4 <=50K 53 Private 234721 11th 7 Married-civ-spouse
## 5 <=50K 28 Private 338409 Bachelors 13 Married-civ-spouse
## 6 <=50K 37 Private 284582 Masters 14 Married-civ-spouse
## occupation relationship race sex capital_gain capital_loss
## 1 Adm-clerical Not-in-family White Male 2174 0
## 2 Exec-managerial Husband White Male 0 0
## 3 Handlers-cleaners Not-in-family White Male 0 0
## 4 Handlers-cleaners Husband Black Male 0 0
## 5 Prof-specialty Wife Black Female 0 0
## 6 Exec-managerial Wife White Female 0 0
## hours_per_week native_country
## 1 40 United-States
## 2 13 United-States
## 3 40 United-States
## 4 40 United-States
## 5 40 Cuba
## 6 40 United-States
dim(adult)
## [1] 32561 15
library(gbm)
## Warning: package 'gbm' was built under R version 3.6.2
## Loaded gbm 2.1.8
library(DALEX)
## Welcome to DALEX (version: 1.0.1).
## Find examples and detailed introduction at: https://pbiecek.github.io/ema/
## Additional features will be available after installation of: iBreakDown.
## Use 'install_dependencies()' to get all suggested dependencies
adult$salary <- as.numeric(adult$salary) -1 # 0 if bad and 1 if good risk
protected <- adult$sex
adult <- adult[colnames(adult) != "sex"] # sex not specified
# making model
set.seed(1)
gbm_model <-gbm(salary ~. , data = adult, distribution = "bernoulli")
# making explainer
gbm_explainer <- explain(gbm_model,
data = adult[,-1],
y = adult$salary,
colorize = FALSE)
## Preparation of a new explainer is initiated
## -> model label : gbm ( default )
## -> data : 32561 rows 13 cols
## -> target variable : 32561 values
## -> model_info : package gbm , ver. 2.1.8 , task classification ( default )
## -> predict function : yhat.gbm will be used ( default )
## -> predicted values : numerical, min = 0.0101498 , mean = 0.2409542 , max = 0.9864558
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.9790795 , mean = -0.0001445991 , max = 0.9864904
## A new explainer has been created!
model_performance(gbm_explainer)
## Measures for: classification
## recall : 0.5353909
## precision: 0.7999238
## f1 : 0.6414547
## accuracy : 0.8558705
## auc : 0.9093789
##
## Residuals:
## 0% 10% 20% 30% 40% 50%
## -0.97907954 -0.31512711 -0.20921106 -0.12255213 -0.06480941 -0.04122486
## 60% 70% 80% 90% 100%
## -0.02832005 -0.01740186 0.13344943 0.53676046 0.98649038
fobject <- fairness_check(gbm_explainer,
protected = protected,
privileged = "Male",
colorize = FALSE)
## Creating fairness object
## -> Privileged subgroup : character ( Ok )
## -> Protected variable : factor ( Ok )
## -> Cutoff values for explainers : 0.5 ( for all subgroups )
## -> Fairness objects : 0 objects
## -> Checking explainers : 1 in total ( compatible )
## -> Metric calculation : 12/12 metrics calculated for all models
## Fairness object created succesfully
plot(fobject)

data_fixed <- disparate_impact_remover(data = adult, protected = protected,
features_to_transform = c("age", "hours_per_week",
"capital_loss",
"capital_gain"))
set.seed(1)
gbm_model <- gbm(salary ~. , data = data_fixed, distribution = "bernoulli")
gbm_explainer_dir <- explain(gbm_model,
data = data_fixed[,-1],
y = adult$salary,
label = "gbm_dir",
verbose = FALSE)
fobject <- fairness_check(gbm_explainer, gbm_explainer_dir,
protected = protected,
privileged = "Male",
verbose = FALSE)
plot(fobject)

weights <- reweight(protected = protected, y = adult$salary)
set.seed(1)
gbm_model <- gbm(salary ~. ,
data = adult,
weights = weights,
distribution = "bernoulli")
gbm_explainer_w <- explain(gbm_model,
data = adult[,-1],
y = adult$salary,
label = "gbm_weighted",
verbose = FALSE)
fobject <- fairness_check(fobject, gbm_explainer_w, verbose = FALSE)
plot(fobject)

# to obtain probs we will use simple linear regression
probs <- glm(salary ~., data = adult, family = binomial())$fitted.values
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
uniform_indexes <- resample(protected = protected,
y = adult$salary)
preferential_indexes <- resample(protected = protected,
y = adult$salary,
type = "preferential",
probs = probs)
set.seed(1)
gbm_model <- gbm(salary ~. ,
data = adult[uniform_indexes,],
distribution = "bernoulli")
gbm_explainer_u <- explain(gbm_model,
data = adult[,-1],
y = adult$salary,
label = "gbm_uniform",
verbose = FALSE)
set.seed(1)
gbm_model <- gbm(salary ~. ,
data = adult[preferential_indexes,],
distribution = "bernoulli")
gbm_explainer_p <- explain(gbm_model,
data = adult[,-1],
y = adult$salary,
label = "gbm_preferential",
verbose = FALSE)
fobject <- fairness_check(fobject, gbm_explainer_u, gbm_explainer_p,
verbose = FALSE)
plot(fobject)

# we will need normal explainer
set.seed(1)
gbm_model <-gbm(salary ~. , data = adult, distribution = "bernoulli")
gbm_explainer <- explain(gbm_model,
data = adult[,-1],
y = adult$salary,
verbose = FALSE)
gbm_explainer_r <- roc_pivot(gbm_explainer,
protected = protected,
privileged = "Male")
fobject <- fairness_check(fobject, gbm_explainer_r,
label = "gbm_roc", # label as vector for explainers
verbose = FALSE)
plot(fobject)

set.seed(1)
gbm_model <-gbm(salary ~. , data = adult, distribution = "bernoulli")
gbm_explainer <- explain(gbm_model,
data = adult[,-1],
y = adult$salary,
verbose = FALSE)
# test fairness object
fobject_test <- fairness_check(gbm_explainer,
protected = protected,
privileged = "Male",
verbose = FALSE)
plot(ceteris_paribus_cutoff(fobject_test, subgroup = "Female"))
## Warning: Removed 13 row(s) containing missing values (geom_path).

plot(ceteris_paribus_cutoff(fobject_test,
subgroup = "Female",
fairness_metrics = c("ACC","TPR","STP")))
## Warning: Removed 4 row(s) containing missing values (geom_path).

fc <- fairness_check(gbm_explainer, fobject,
label = "gbm_cutoff",
cutoff = list(Female = 0.25),
verbose = FALSE)
plot(fc)

data("compas")
head(compas)
## Two_yr_Recidivism Number_of_Priors Age_Above_FourtyFive Age_Below_TwentyFive
## 1 0 0 1 0
## 2 1 0 0 0
## 3 1 4 0 1
## 4 0 0 0 0
## 5 1 14 0 0
## 6 0 3 0 0
## Misdemeanor Ethnicity Sex
## 1 0 Other Male
## 2 0 African_American Male
## 3 0 African_American Male
## 4 1 Other Male
## 5 0 Caucasian Male
## 6 0 Other Male
library(DALEX)
library(ranger)
# train
rf_compas <- ranger(Two_yr_Recidivism ~., data = compas, probability = TRUE)
# numeric target values
y_numeric <- as.numeric(compas$Two_yr_Recidivism)-1
# explainer
rf_explainer <- explain(rf_compas, data = compas[,-1], y = y_numeric, colorize = FALSE)
## Preparation of a new explainer is initiated
## -> model label : ranger ( default )
## -> data : 6172 rows 6 cols
## -> target variable : 6172 values
## -> model_info : package ranger , ver. 0.11.2 , task classification ( default )
## -> predict function : yhat.ranger will be used ( default )
## -> predicted values : numerical, min = 0.1339348 , mean = 0.4551456 , max = 0.8347115
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.7768296 , mean = -2.572062e-05 , max = 0.8494836
## A new explainer has been created!
fobject <- fairness_check(rf_explainer, # explainer
protected = compas$Ethnicity, # protected variable as factor
privileged = "Caucasian", # level in protected variable, potentially more privileged
cutoff = 0.5, # cutoff - optional, default = 0.5
colorize = FALSE)
## Creating fairness object
## -> Privileged subgroup : character ( Ok )
## -> Protected variable : factor ( Ok )
## -> Cutoff values for explainers : 0.5 ( for all subgroups )
## -> Fairness objects : 0 objects
## -> Checking explainers : 1 in total ( compatible )
## -> Metric calculation : 12/12 metrics calculated for all models
## Fairness object created succesfully
plot(fobject)

plot_density(fobject)

library(gbm)
rf_compas_1 <- ranger(Two_yr_Recidivism ~Number_of_Priors+Age_Below_TwentyFive,
data = compas,
probability = TRUE)
lr_compas_1 <- glm(Two_yr_Recidivism~.,
data=compas,
family=binomial(link="logit"))
rf_compas_2 <- ranger(Two_yr_Recidivism ~., data = compas, probability = TRUE)
rf_compas_3 <- ranger(Two_yr_Recidivism ~ Age_Above_FourtyFive+Misdemeanor,
data = compas,
probability = TRUE)
df <- compas
df$Two_yr_Recidivism <- as.numeric(compas$Two_yr_Recidivism)-1
gbm_compas_1<- gbm(Two_yr_Recidivism~., data = df)
## Distribution not specified, assuming bernoulli ...
explainer_1 <- explain(rf_compas_1, data = compas[,-1], y = y_numeric)
## Preparation of a new explainer is initiated
## -> model label : ranger ( [33m default [39m )
## -> data : 6172 rows 6 cols
## -> target variable : 6172 values
## -> model_info : package ranger , ver. 0.11.2 , task classification ( [33m default [39m )
## -> predict function : yhat.ranger will be used ( [33m default [39m )
## -> predicted values : numerical, min = 0.2967264 , mean = 0.4552762 , max = 0.7799906
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -0.755995 , mean = -0.0001563086 , max = 0.7032736
## [32m A new explainer has been created! [39m
explainer_2 <- explain(lr_compas_1, data = compas[,-1], y = y_numeric)
## Preparation of a new explainer is initiated
## -> model label : lm ( [33m default [39m )
## -> data : 6172 rows 6 cols
## -> target variable : 6172 values
## -> model_info : package stats , ver. 3.6.0 , task regression ( [33m default [39m )
## -> predict function : yhat.glm will be used ( [33m default [39m )
## -> predicted values : numerical, min = 0.1144574 , mean = 0.4551199 , max = 0.995477
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -0.9767658 , mean = 5.070175e-13 , max = 0.8822826
## [32m A new explainer has been created! [39m
explainer_3 <- explain(rf_compas_2, data = compas[,-1], y = y_numeric, label = "ranger_2")
## Preparation of a new explainer is initiated
## -> model label : ranger_2
## -> data : 6172 rows 6 cols
## -> target variable : 6172 values
## -> model_info : package ranger , ver. 0.11.2 , task classification ( [33m default [39m )
## -> predict function : yhat.ranger will be used ( [33m default [39m )
## -> predicted values : numerical, min = 0.1302174 , mean = 0.454672 , max = 0.8435127
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -0.7800877 , mean = 0.0004479272 , max = 0.8463011
## [32m A new explainer has been created! [39m
explainer_4 <- explain(rf_compas_3, data = compas[,-1], y = y_numeric, label = "ranger_3")
## Preparation of a new explainer is initiated
## -> model label : ranger_3
## -> data : 6172 rows 6 cols
## -> target variable : 6172 values
## -> model_info : package ranger , ver. 0.11.2 , task classification ( [33m default [39m )
## -> predict function : yhat.ranger will be used ( [33m default [39m )
## -> predicted values : numerical, min = 0.3024475 , mean = 0.4547728 , max = 0.5135181
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -0.5135181 , mean = 0.000347104 , max = 0.6975525
## [32m A new explainer has been created! [39m
explainer_5 <- explain(gbm_compas_1, data = compas[,-1], y = y_numeric)
## Preparation of a new explainer is initiated
## -> model label : gbm ( [33m default [39m )
## -> data : 6172 rows 6 cols
## -> target variable : 6172 values
## -> model_info : package gbm , ver. 2.1.8 , task classification ( [33m default [39m )
## -> predict function : yhat.gbm will be used ( [33m default [39m )
## -> predicted values : numerical, min = 0.1173517 , mean = 0.454823 , max = 0.9066772
## -> residual function : difference between y and yhat ( [33m default [39m )
## -> residuals : numerical, min = -0.8737664 , mean = 0.000296854 , max = 0.8750579
## [32m A new explainer has been created! [39m
fobject <- fairness_check(explainer_1, explainer_2,
explainer_3, explainer_4,
explainer_5,
protected = compas$Ethnicity,
privileged = "Caucasian",
verbose = FALSE)
sm <- stack_metrics(fobject)
## Warning in drop_metrics_with_na(parity_loss_metric_data): Found metric with NA: TPR, PPV, omiting it
plot(sm)

cm <- choose_metric(fobject, "TPR")
plot(cm)
## Warning: Removed 1 rows containing missing values (position_stack).

fair_pca <- fairness_pca(fobject)
## Warning in drop_metrics_with_na(data): Found metric with NA: TPR, PPV, TS, F1, omiting it
print(fair_pca)
## Fairness PCA :
## PC1 PC2 PC3 PC4 PC5
## [1,] -1.1397848 0.0782970 -0.7870143 -0.6444191 1.110223e-16
## [2,] 3.0369489 1.3967656 -0.2532461 0.1447943 1.165734e-15
## [3,] 0.1458079 -1.4931618 -0.7708397 0.4185452 -3.552714e-15
## [4,] -3.2070818 0.9050207 0.4858758 0.3041836 1.117162e-15
## [5,] 1.1641098 -0.8869215 1.3252242 -0.2231039 -7.355228e-16
##
## Created with:
## [1] "ranger" "lm" "ranger_2" "ranger_3" "gbm"
##
## First two components explained 87 % of variance.
plot(fair_pca)

fheatmap <- fairness_heatmap(fobject)
plot(fheatmap, text_size = 3)
## Warning: Removed 4 rows containing missing values (geom_text).

fobject2 <- fairness_check(explainer_1,explainer_2,
protected = compas$Ethnicity,
privileged = "Caucasian",
verbose = FALSE)
gm <- group_metric(fobject2, fairness_metric = "FPR")
## Performace metric not given, setting deafult ( accuracy )
##
## Creating object with:
## Fairness metric: FPR
## Performance metric: accuracy
plot(gm)

fradar <- fairness_radar(fobject2)
## Warning in fairness_radar(fobject2): Found metric with NA: TPR, PPV, ommiting it
plot(fradar)

ac <- all_cutoffs(fobject2)
plot(ac)
## Warning: Removed 192 row(s) containing missing values (geom_path).

cpc <- ceteris_paribus_cutoff(fobject2, subgroup = "African_American")
plot(cpc)
## Warning: Removed 252 row(s) containing missing values (geom_path).
## Warning: Removed 1 rows containing missing values (geom_segment).
## Warning: Removed 1 rows containing missing values (geom_text_repel).
