library(gbm)
library(randomForest)
library(ROSE)
library(dplyr)
library(tidyverse)
library(tree)
library(smotefamily)
library(ModelMetrics)
library(ggplot2)
library(viridis)
library(hrbrthemes)
library(kableExtra)
library(knitr)
library(tune)
library(discrim)
library(klaR)
library(themis)
library(tidymodels)
library(plotly)
library(DescTools)
tidymodels_prefer()
conflicted::conflict_prefer("select", "dplyr")
I dati sono stati importati dal sito Kaggle al seguente link https://www.kaggle.com/datasets/sgpjesus/bank-account-fraud-dataset-neurips-2022 e si riferiscono alle frodi bancarie.
L’obbiettivo del progetto è sviluppare un modello di previsione delle frodi bancarie.
Il dataset a disposizione offre dati realistici basati sul mondo reale ma protetti da privacy tramite diverse tecniche come noise addittion e addesstramento di un modello generativo CTGAN.
df <- read.csv("data/Base.csv") %>%
tibble()
df %>% head(50)
Trasformo la variabile fraud_bool da integer a
factor.
df = df %>%
mutate(fraud = factor(fraud_bool, labels = c("No fraud", "Fraud"))) %>%
select(-fraud_bool)
table(df$fraud)
##
## No fraud Fraud
## 988971 11029
Controllo della presenza di NA nel dataset.
anyNA.data.frame(df)
## [1] FALSE
Breve visione del dataframe, è composto da un milione di osservazioni e 32 variabili. Sarà necessario poner attenzione nell’ottimizzazione di alcuni comandi per la riduzione del tempo macchina nell’esecuzione degli stessi.
df %>%
skimr::skim()
| Name | Piped data |
| Number of rows | 1000000 |
| Number of columns | 32 |
| _______________________ | |
| Column type frequency: | |
| character | 5 |
| factor | 1 |
| numeric | 26 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| payment_type | 0 | 1 | 2 | 2 | 0 | 5 | 0 |
| employment_status | 0 | 1 | 2 | 2 | 0 | 7 | 0 |
| housing_status | 0 | 1 | 2 | 2 | 0 | 7 | 0 |
| source | 0 | 1 | 7 | 8 | 0 | 2 | 0 |
| device_os | 0 | 1 | 3 | 9 | 0 | 5 | 0 |
Variable type: factor
| skim_variable | n_missing | complete_rate | ordered | n_unique | top_counts |
|---|---|---|---|---|---|
| fraud | 0 | 1 | FALSE | 2 | No : 988971, Fra: 11029 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| income | 0 | 1 | 0.56 | 0.29 | 0.10 | 0.30 | 0.60 | 0.80 | 0.90 | ▅▃▁▅▇ |
| name_email_similarity | 0 | 1 | 0.49 | 0.29 | 0.00 | 0.23 | 0.49 | 0.76 | 1.00 | ▇▇▇▇▇ |
| prev_address_months_count | 0 | 1 | 16.72 | 44.05 | -1.00 | -1.00 | -1.00 | 12.00 | 383.00 | ▇▁▁▁▁ |
| current_address_months_count | 0 | 1 | 86.59 | 88.41 | -1.00 | 19.00 | 52.00 | 130.00 | 428.00 | ▇▂▂▁▁ |
| customer_age | 0 | 1 | 33.69 | 12.03 | 10.00 | 20.00 | 30.00 | 40.00 | 90.00 | ▃▇▂▁▁ |
| days_since_request | 0 | 1 | 1.03 | 5.38 | 0.00 | 0.01 | 0.02 | 0.03 | 78.46 | ▇▁▁▁▁ |
| intended_balcon_amount | 0 | 1 | 8.66 | 20.24 | -15.53 | -1.18 | -0.83 | 4.98 | 112.96 | ▇▁▁▁▁ |
| zip_count_4w | 0 | 1 | 1572.69 | 1005.37 | 1.00 | 894.00 | 1263.00 | 1944.00 | 6700.00 | ▇▅▂▁▁ |
| velocity_6h | 0 | 1 | 5665.30 | 3009.38 | -170.60 | 3436.37 | 5319.77 | 7680.72 | 16715.57 | ▅▇▅▁▁ |
| velocity_24h | 0 | 1 | 4769.78 | 1479.21 | 1300.31 | 3593.18 | 4749.92 | 5752.57 | 9506.90 | ▂▇▇▃▁ |
| velocity_4w | 0 | 1 | 4856.32 | 919.84 | 2825.75 | 4268.37 | 4913.44 | 5488.08 | 6994.76 | ▃▇▇▆▂ |
| bank_branch_count_8w | 0 | 1 | 184.36 | 459.63 | 0.00 | 1.00 | 9.00 | 25.00 | 2385.00 | ▇▁▁▁▁ |
| date_of_birth_distinct_emails_4w | 0 | 1 | 9.50 | 5.03 | 0.00 | 6.00 | 9.00 | 13.00 | 39.00 | ▇▇▂▁▁ |
| credit_risk_score | 0 | 1 | 130.99 | 69.68 | -170.00 | 83.00 | 122.00 | 178.00 | 389.00 | ▁▂▇▃▁ |
| email_is_free | 0 | 1 | 0.53 | 0.50 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▇▁▁▁▇ |
| phone_home_valid | 0 | 1 | 0.42 | 0.49 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 | ▇▁▁▁▆ |
| phone_mobile_valid | 0 | 1 | 0.89 | 0.31 | 0.00 | 1.00 | 1.00 | 1.00 | 1.00 | ▁▁▁▁▇ |
| bank_months_count | 0 | 1 | 10.84 | 12.12 | -1.00 | -1.00 | 5.00 | 25.00 | 32.00 | ▇▁▁▂▃ |
| has_other_cards | 0 | 1 | 0.22 | 0.42 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▂ |
| proposed_credit_limit | 0 | 1 | 515.85 | 487.56 | 190.00 | 200.00 | 200.00 | 500.00 | 2100.00 | ▇▁▁▂▁ |
| foreign_request | 0 | 1 | 0.03 | 0.16 | 0.00 | 0.00 | 0.00 | 0.00 | 1.00 | ▇▁▁▁▁ |
| session_length_in_minutes | 0 | 1 | 7.54 | 8.03 | -1.00 | 3.10 | 5.11 | 8.87 | 85.90 | ▇▁▁▁▁ |
| keep_alive_session | 0 | 1 | 0.58 | 0.49 | 0.00 | 0.00 | 1.00 | 1.00 | 1.00 | ▆▁▁▁▇ |
| device_distinct_emails_8w | 0 | 1 | 1.02 | 0.18 | -1.00 | 1.00 | 1.00 | 1.00 | 2.00 | ▁▁▁▇▁ |
| device_fraud_count | 0 | 1 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | 0.00 | ▁▁▇▁▁ |
| month | 0 | 1 | 3.29 | 2.21 | 0.00 | 1.00 | 3.00 | 5.00 | 7.00 | ▇▃▇▃▆ |
Eseguo un’analisi esplorativa dei dati grafica per visualizzare con facilità eventuli problematiche o possibili miglioramenti che possono essere applicati al dataset.
Vengono prodotti gli istogrammi delle variabili numeriche per osservare le loro distribuzioni.
df %>%
select(where(is.numeric)) %>%
# mutate_all(scale) %>%
pivot_longer(cols = 1:9,
names_to = "Variabili",
values_to = "Valori") %>%
ggplot(aes(x = Valori, color = Variabili, fill = Variabili)) +
geom_histogram(bins = 20, alpha = 0.6) +
scale_fill_viridis(discrete=TRUE) +
scale_color_viridis(discrete=TRUE) +
theme_ipsum() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
) +
xlab("") +
ylab("") +
facet_wrap(~Variabili, ncol = 3, scales = "free")
df %>%
select(where(is.numeric)) %>%
# mutate_all(scale) %>%
pivot_longer(cols = 10:18,
names_to = "Variabili",
values_to = "Valori") %>%
ggplot(aes(x = Valori, color = Variabili, fill = Variabili)) +
geom_histogram(bins = 20, alpha = 0.6) +
scale_fill_viridis(discrete=TRUE) +
scale_color_viridis(discrete=TRUE) +
theme_ipsum() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
) +
xlab("") +
ylab("") +
facet_wrap(~Variabili, ncol = 3, scales = "free")
df %>%
select(where(is.numeric)) %>%
# mutate_all(scale) %>%
pivot_longer(cols = 19:26,
names_to = "Variabili",
values_to = "Valori") %>%
ggplot(aes(x = Valori, color = Variabili, fill = Variabili)) +
geom_histogram(bins = 20, alpha = 0.6) +
scale_fill_viridis(discrete=TRUE) +
scale_color_viridis(discrete=TRUE) +
theme_ipsum() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
) +
xlab("") +
ylab("") +
facet_wrap(~Variabili, ncol = 3, scales = "free")
Alcune variabili devono essere trasformate da character
a factor, allo stesso tempo anche alcune variabili
quantitative discrete possono essere considerate come variabili
qualitative ordinali.
Prima della rappresentazione grafiche si sottopone il dataset alle trasformazioni sopra descritte.
Viene rimossa anche la variabile device_fraud_count in quanto possiede una varianza nulla.
Alcune variabili quantitative discrete vengono trasformate in variabili qualitative poiché presentano un numero limitato di distinte osservazioni e mostrano una correlazione lineare con la variabile dipendente. Tale approccio può migliorare la precisione delle stime, sebbene comporti un aumento nel numero dei parametri da stimare.
df = df %>%
select(- device_fraud_count) %>%
mutate_at(vars(c(customer_age, income, email_is_free, proposed_credit_limit,
phone_home_valid, phone_mobile_valid, device_distinct_emails_8w,
foreign_request, has_other_cards, keep_alive_session, month)),
~ factor(.)) %>% # ordered = T
mutate_if(is.character, factor)
df
dfChar = df %>%
select(where(is.factor))
apply(matrix(1:(ncol(dfChar) - 1)),1, function(indexCol)
# apply(matrix(1:2),1, function(indexCol)
{
ggplot(dfChar, aes(x = get(colnames(dfChar)[indexCol]), group = fraud)) +
geom_bar(aes(y = ..prop.., fill = fraud), stat = "count") +
scale_y_continuous(labels=scales::percent) +
scale_fill_brewer(palette = "Dark2") +
xlab(colnames(dfChar)[indexCol]) +
ylab("Osservazioni") +
theme_ipsum() +
theme(legend.position="none")
}
)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
df %>%
group_by(fraud) %>%
summarise_if(is.numeric,mean) %>%
bind_cols(df %>%
group_by(fraud) %>%
summarise_if(is.character,DescTools::Mode)) %>%
as.matrix() %>%
t() %>%
janitor::row_to_names(row_number = 1) %>%
as.data.frame() %>%
rownames_to_column("Variabili") %>%
tibble()
## New names:
## • `fraud` -> `fraud...1`
## • `fraud` -> `fraud...16`
grafici = lapply(1:4,function(x)
{
df %>%
select(where(is.numeric)|fraud) %>%
# mutate_if(is.numeric, scale) %>%
pivot_longer(cols = 1:ifelse(x==4,2,4)+seq(0,16,by = 4)[x],
names_to = "Variabili",
values_to = "Valori") %>%
group_by(fraud) %>%
ggplot(aes(y = Valori, x = fraud, color = fraud, fill = fraud)) +
geom_boxplot(alpha = 0.6) +
scale_fill_viridis(discrete=TRUE) +
scale_color_viridis(discrete=TRUE) +
theme_ipsum() +
theme(
legend.position="none",
panel.spacing = unit(0.1, "lines"),
strip.text.x = element_text(size = 8)
) +
xlab("") +
ylab("") +
facet_wrap(~Variabili, ncol = 2, scales = "free_y")
}
)
for(i in 1:4) print(grafici[[i]])
rm(grafici)
# df[c(1:1000, 50000:51000),] %>%
df %>%
mutate(fraud = as.numeric(fraud)) %>%
select(where(is.numeric)) %>%
# select(where(~ length(levels(factor(.))) > 2)) %>%
rename_all(~ str_trunc(., width = 15)) %>%
scale() %>%
cor() %>%
corrplot::corrplot(cl.cex = 0.5,
type = "lower",
tl.pos = "l")
Il seguente grafico mostra in modo diretto la correlazione della
variabile fraud con le altre variabili del modello. Ovviamente, non è
rappresentata la multicollinearità tra le altre variabili, come mostrato
nel grafico soprastante.
df %>%
mutate(fraud = as.numeric(fraud)) %>%
select(where(is.numeric)) %>%
# select(where(~ length(levels(factor(.))) > 2)) %>%
rename_all(~ str_trunc(., width = 15)) %>%
scale() %>%
cor() %>%
data.frame() %>%
rownames_to_column("Variabili") %>%
tibble() %>%
select(Variabili,fraud) %>%
mutate(Variabili = as.factor(Variabili)) %>%
filter(fraud<1) %>%
ggplot(aes(x = Variabili, y = fraud, fill = fraud)) +
geom_col(color = "gray45") +
coord_flip() +
scale_fill_gradient2(low = "mediumspringgreen",
high = "mediumvioletred",
midpoint = 0) +
theme_bw() +
theme(legend.position = "none")
data_split = initial_split(df, prop = 3/4, strata = "fraud")
data_train = training(data_split)
data_test = testing(data_split)
step_novel(all_nominal_predictors()): Identifica
variabili categoriche che potrebbero contenere nuove categorie non
presenti nei dati di addestramento. Questo passo è utile per gestire
categorie non viste durante l’addestramento del modello che invece
potrebbero essere presenti nel dataset di testing del modello.
step_normalize(all_numeric_predictors()): Normalizza
tutte le variabili numeriche in modo che abbiano una media di 0 e una
deviazione standard di 1. Questo può essere utile per garantire che
tutte le variabili abbiano lo stesso peso nella costruzione del modello.
Inoltre, sarà comodo per capire il peso dei coefficienti nel
modello.
step_dummy(all_nominal_predictors()): Crea variabili
fittizie (dummy variables) per le variabili categoriche, consentendo al
modello di trattare correttamente le categorie senza un ordine
intrinseco.
step_rose(fraud): Utilizza il metodo di
sovracampionamento chiamato ROSE (Random Over-Sampling Examples) per
bilanciare le classi nel dataset. Infatti i casi di frode sono
nettamente inferiori (1,1%) ai casi in cui non si verifica la frode.
step_zv(all_nominal_predictors()): Rimuove le variabili
categoriche che hanno varianza zero, il che significa che sono costanti
e non portano alcuna informazione predittiva.
df_rec =
recipe(fraud ~ ., data = data_train) %>%
step_novel(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_rose(fraud) %>%
step_zv(all_nominal_predictors())
df_rec %>% prep()
df %>%
select(fraud) %>%
table() %>%
as_tibble() %>%
mutate(perc = round(n/nrow(df)*100, digits = 1))
log_mod = logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
log_wflw =
workflow() %>%
add_model(log_mod) %>%
add_recipe(df_rec)
log_fit = fit(log_wflw, data_train)
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 3031363 161.9 8088500 432 10173480 543.4
## Vcells 48861987 372.8 1241382144 9471 1132926565 8643.6
log_coef
creaGruppoVariabili = function(data, varOrig, colVar)
{
macroGruppi = matrix(nrow = nrow(data))
for(i in 1:nrow(data))
{
if(data[i,colVar] == "(Intercept)")
{
macroGruppi[i,1] = "Intercetta"
}else
{
if(sum(str_equal(data[i,colVar], varOrig))==1)
{
macroGruppi[i,1] = data[i,colVar] %>% as.character()
}else
{
data[i,colVar] = data[i,colVar] %>%
str_sub(end = data[i,colVar] %>%
str_locate_all(pattern = "_") %>%
unlist() %>%
tail(1) - 1)
macroGruppi[i,1] = data[i,colVar] %>% as.character()
}
}
}
return(macroGruppi)
}
grafico = log_coef %>%
mutate(Variabili = creaGruppoVariabili(log_coef, names(df), "term")) %>%
ggplot(aes(x = Variabili, y = OR, ymin = OR.conf.low,
ymax = OR.conf.high, color = p.value, label = term)) +
geom_pointrange(position = "jitter") +
coord_flip() +
scale_x_discrete(labels = log_coef %>%
creaGruppoVariabili(names(df), "term") %>%
str_sub(end = 15) %>%
as.factor() %>%
levels()) +
scale_color_gradient2(low = "mediumspringgreen",
mid = "mediumvioletred",
high = "mediumvioletred",
midpoint = 0.4) +
# scale_color_brewer(palette = palette(palette.colors(palette = "RdYlGn"))) +
# scale_color_gradient2(low = "springgreen",
# high = "tomato",
# mid = "orange",
# midpoint = .05,
# na.value = "gray") +
xlab("Coefficienti") +
geom_hline(aes(yintercept = 1), color = "tomato") +
ylim(c(min(log_coef$OR.conf.low),2*median(log_coef$OR.conf.high, na.rm = T))) +
theme(legend.position = "none",
legend.box = "horizontal",
legend.title = element_text(size = 9),
legend.text = element_text(size = 7),
axis.text.x = element_text(vjust = 1, size = 8))
ggplotly(grafico)
Dalle probabilità creo la predizione della classe in base al threshold impostato, per ora pongo 0.5. successivamente lo andrò ad adattare per trovare la soluzione migliore al problema.
logistic_output %>%
ggplot(aes(x = fraud, y = `.pred_No fraud`)) +
geom_violin(fill = "mediumspringgreen", alpha = .6) +
geom_hline(yintercept = 0.5, color='mediumvioletred') +
geom_hline(yintercept = 0.73, color='mediumvioletred') +
labs(y = 'Predicted Probability of Outcome', x = 'Observed Outcome') +
scale_y_percent() +
theme_classic()
Osservo le metriche delle predizioni del modello con il threshold impostato al 0.5. Le metriche scelte in questo caso sono la sensitività, specificità, accuratezza e precione.
log_metrics = metric_set(sens, yardstick::spec, accuracy)
logistic_output %>%
log_metrics(estimate = .pred_class, truth = fraud, event_level = "second")
log_roc = logistic_output %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") # con second dico che la seconda classe (fraud) è la classe che è interesse di studio
# log_bestThreshold = log_roc %>%
# filter(sensitivity > 0.5) %>%
# filter(specificity == max(specificity))
logistic_output %>%
# mutate(pred_best = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = log_bestThreshold$.threshold)) %>%
mutate(pred_best = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = 0.5)) %>%
select(fraud,pred_best) %>%
table() %>%
caret::confusionMatrix(positive = "Fraud")
## Confusion Matrix and Statistics
##
## pred_best
## fraud No fraud Fraud
## No fraud 200633 46639
## Fraud 597 2131
##
## Accuracy : 0.8111
## 95% CI : (0.8095, 0.8126)
## No Information Rate : 0.8049
## P-Value [Acc > NIR] : 3.909e-15
##
## Kappa : 0.0634
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.043695
## Specificity : 0.997033
## Pos Pred Value : 0.781158
## Neg Pred Value : 0.811386
## Prevalence : 0.195080
## Detection Rate : 0.008524
## Detection Prevalence : 0.010912
## Balanced Accuracy : 0.520364
##
## 'Positive' Class : Fraud
##
log_roc %>%
mutate(FPR = 1 - specificity,
TPR = sensitivity) %>%
ggplot(aes(x = FPR, y = TPR)) +
geom_line() +
geom_abline(aes(intercept = 0, slope = 1)) +
geom_vline(aes(xintercept = log_roc %>%
filter(.threshold > .73) %>%
filter(.threshold == min(.threshold)) %>%
select(specificity) %>%
mutate(specificity = 1 - specificity) %>%
as.numeric()),
color = "tomato",
linetype = "dashed") +
geom_hline(aes(yintercept = log_roc %>%
filter(.threshold > .73) %>%
filter(.threshold == min(.threshold)) %>%
select(sensitivity) %>%
as.numeric()),
color = "tomato",
linetype = "dashed") +
labs(title = "Curva ROC") +
theme_bw() +
theme(aspect.ratio = 1)
Per problemi computazionali, dovuti alla grande quantità di dati e di features, si è deciso di ridurre la dimensione del campione riducendo del 95% le osservazioni. L’obiettivo del lavoro è quello di creare un modello che identifichi le frodi bancarie, perciò nella riduzione del dataframe è stato scelto di conservare tutte le osservazioni nelle quali è stata commessa una frode.
# dfRid = ovun.sample(fraud ~.,
# data = df,
# method = "under",
# p = 0.5,
# seed = 1)$data %>% tibble
dfRid = df %>%
filter(fraud == "Fraud") %>%
add_row(df %>%
filter(fraud != "Fraud") %>%
slice_sample(n = nrow(df)*0.05 - nrow(filter(df, fraud == "Fraud"))))
dfRid
L’attuale dataframe contiene ancora le classi sbilanciate, si procederà quindi al ribilanciamento attraverso el diverse tecniche di ricampionamento
table(dfRid$fraud)
##
## No fraud Fraud
## 38971 11029
apply(matrix(1:(ncol(dfChar) - 1)),1, function(indexCol)
# apply(matrix(1:2),1, function(indexCol)
{
dfRid %>%
select(where(is.factor)) %>%
ggplot(aes(x = get(colnames(dfChar)[indexCol]), group = fraud)) +
geom_bar(aes(y = ..prop.., fill = fraud), stat = "count") +
scale_y_continuous(labels=scales::percent) +
scale_fill_brewer(palette = "Dark2") +
xlab(colnames(dfChar)[indexCol]) +
ylab("Osservazioni") +
theme_ipsum() +
theme(legend.position="none")
}
)
## [[1]]
##
## [[2]]
##
## [[3]]
##
## [[4]]
##
## [[5]]
##
## [[6]]
##
## [[7]]
##
## [[8]]
##
## [[9]]
##
## [[10]]
##
## [[11]]
##
## [[12]]
##
## [[13]]
##
## [[14]]
##
## [[15]]
##
## [[16]]
Nei seguenti grafici si può notare come sia aumentata di molto la correlazione della variabile fraud con le altre variabili numeriche.
dfRid %>%
mutate(fraud = as.numeric(fraud)) %>%
select(where(is.numeric)) %>%
rename_all(~ str_trunc(., width = 15)) %>%
scale() %>%
cor() %>%
corrplot::corrplot(cl.cex = 0.5,
type = "lower",
tl.pos = "l")
dfRid %>%
mutate(fraud = as.numeric(fraud)) %>%
select(where(is.numeric)) %>%
# select(where(~ length(levels(factor(.))) > 2)) %>%
rename_all(~ str_trunc(., width = 15)) %>%
scale() %>%
cor() %>%
data.frame() %>%
rownames_to_column("Variabili") %>%
tibble() %>%
select(Variabili,fraud) %>%
mutate(Variabili = as.factor(Variabili)) %>%
filter(fraud<1) %>%
ggplot(aes(x = Variabili, y = fraud, fill = fraud)) +
geom_col(color = "gray45") +
coord_flip() +
scale_fill_gradient2(low = "mediumspringgreen",
high = "mediumvioletred",
midpoint = 0) +
theme_bw() +
theme(legend.position = "none")
data_split = initial_split(dfRid, prop = 3/4, strata = "fraud")
data_train = training(data_split)
data_test = testing(data_split)
df_rec =
recipe(fraud ~ ., data = data_train) %>%
step_novel(all_nominal_predictors()) %>%
step_normalize(all_numeric_predictors()) %>%
step_dummy(all_nominal_predictors()) %>%
step_zv(all_nominal_predictors())
log_mod = logistic_reg() %>%
set_engine("glm") %>%
set_mode("classification")
Imposto il workflow, aggiungendo il modello e il preprocessamento dei dati
log_wflw =
workflow() %>%
add_model(log_mod) %>%
add_recipe(df_rec)
log_fit_rid = fit(log_wflw, data_train)
lev.p.value = c("very low confidence", "low confidence", "confidence", "high confidence", "very high confidence")
log_coef_rid = log_fit_rid %>%
tidy() %>%
mutate(OR = exp(estimate),
OR.conf.low = exp(estimate - 1.96*std.error),
OR.conf.high = exp(estimate + 1.96*std.error),
p.value.level = case_when(p.value > .1 ~ lev.p.value[1],
p.value > .05 ~ lev.p.value[2],
p.value > .01 ~ lev.p.value[3],
p.value > .001 ~ lev.p.value[4],
.default = lev.p.value[5])) %>%
mutate(p.value.level = factor(p.value.level, levels = lev.p.value, ordered = T))
log_coef_rid
creaGruppoVariabili = function(data, varOrig, colVar)
{
macroGruppi = matrix(nrow = nrow(data))
for(i in 1:nrow(data))
{
if(data[i,colVar] == "(Intercept)")
{
macroGruppi[i,1] = "Intercetta"
}else
{
if(sum(str_equal(data[i,colVar], varOrig))==1)
{
macroGruppi[i,1] = data[i,colVar] %>% as.character()
}else
{
data[i,colVar] = data[i,colVar] %>%
str_sub(end = data[i,colVar] %>%
str_locate_all(pattern = "_") %>%
unlist() %>%
tail(1) - 1)
macroGruppi[i,1] = data[i,colVar] %>% as.character()
}
}
}
return(macroGruppi)
}
grafico = log_coef_rid %>%
mutate(Variabili = creaGruppoVariabili(log_coef_rid, names(dfRid), "term")) %>%
ggplot(aes(x = Variabili, y = OR, ymin = OR.conf.low,
ymax = OR.conf.high, color = p.value.level, label = term)) +
geom_pointrange(position = "jitter") +
coord_flip() +
scale_x_discrete(labels = log_coef_rid %>%
creaGruppoVariabili(names(dfRid), "term") %>%
str_sub(end = 15) %>%
as.factor() %>%
levels()) +
scale_color_brewer(palette = "RdYlGn") +
# scale_color_gradient2(low = "springgreen",
# high = "tomato",
# mid = "orange",
# midpoint = .05,
# na.value = "gray") +
xlab("Coefficienti") +
geom_hline(aes(yintercept = 1), color = "tomato") +
ylim(c(min(log_coef_rid$OR.conf.low),2*median(log_coef_rid$OR.conf.high, na.rm = T))) +
theme(legend.position = "bottom",
legend.box = "horizontal",
legend.title = element_text(size = 9),
legend.text = element_text(size = 7),
axis.text.x = element_text(vjust = 1, size = 8))
ggplotly(grafico)
Dalle probabilità creo la predizione della classe in base al threshold impostato, per ora pongo 0.5. successivamente lo andrò ad adattare per trovare la soluzione migliore al problema.
log_output_rid = data_test %>%
bind_cols(predict(log_fit_rid, new_data = data_test, type = 'prob')) %>%
mutate(.pred_class = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = .5))
## Warning in predict.lm(object, newdata, se.fit, scale = 1, type = if (type == :
## prediction from rank-deficient fit; attr(*, "non-estim") has doubtful cases
log_output_rid %>%
ggplot(aes(x = fraud, y = `.pred_No fraud`)) +
geom_violin(fill = "mediumspringgreen", alpha = .6) +
geom_hline(yintercept = 0.5, color='mediumvioletred') +
geom_hline(yintercept = 0.6, color='mediumvioletred') +
labs(y = 'Predicted Probability of Outcome', x = 'Observed Outcome') +
scale_y_percent() +
theme_classic()
Osservo le metriche delle predizioni del modello con il threshold impostato al 0.5. Le metriche scelte in questo caso sono la sensitività, specificità, accuratezza e precione.
log_metrics = metric_set(sens, yardstick::spec, accuracy)
log_output_rid %>%
log_metrics(estimate = .pred_class, truth = fraud, event_level = "second")
log_roc = logistic_output %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") # con second dico che la seconda classe (fraud) è la classe che è interesse di studio
# log_bestThreshold = log_roc %>%
# filter(sensitivity > 0.5) %>%
# filter(specificity == max(specificity))
logistic_output %>%
# mutate(pred_best = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = log_bestThreshold$.threshold)) %>%
mutate(pred_best = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = 0.6)) %>%
select(fraud,pred_best) %>%
table() %>%
caret::confusionMatrix(positive = "Fraud")
## Confusion Matrix and Statistics
##
## pred_best
## fraud No fraud Fraud
## No fraud 171810 75462
## Fraud 340 2388
##
## Accuracy : 0.6968
## 95% CI : (0.695, 0.6986)
## No Information Rate : 0.6886
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.039
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.030674
## Specificity : 0.998025
## Pos Pred Value : 0.875367
## Neg Pred Value : 0.694822
## Prevalence : 0.311400
## Detection Rate : 0.009552
## Detection Prevalence : 0.010912
## Balanced Accuracy : 0.514350
##
## 'Positive' Class : Fraud
##
log_roc %>%
mutate(FPR = 1 - specificity,
TPR = sensitivity) %>%
ggplot(aes(x = FPR, y = TPR)) +
geom_line() +
geom_abline(aes(intercept = 0, slope = 1)) +
geom_vline(aes(xintercept = log_roc %>%
filter(.threshold > .73) %>%
filter(.threshold == min(.threshold)) %>%
select(specificity) %>%
mutate(specificity = 1 - specificity) %>%
as.numeric()),
color = "tomato",
linetype = "dashed") +
geom_hline(aes(yintercept = log_roc %>%
filter(.threshold > .73) %>%
filter(.threshold == min(.threshold)) %>%
select(sensitivity) %>%
as.numeric()),
color = "tomato",
linetype = "dashed") +
labs(title = "Curva ROC") +
theme_bw() +
theme(aspect.ratio = 1)
set.seed(1)
cv_folds = vfold_cv(data_train, strata = "fraud",repeats = 10)
cv_folds
log_metrics = metric_set(sens, yardstick::spec, accuracy)
set.seed(1)
log_res = fit_resamples(
log_wflw,
control = control_resamples(save_pred = TRUE, event_level = 'second'),
resamples = cv_folds,
metrics = log_metrics
)
saveRDS(log_res = "var/log_res.RData")
log_res = readRDS("var/log_res.RData")
log_res_metrics = lapply(1:nrow(log_res), function(i) {
log_res %>%
select(.metrics) %>%
dplyr::slice(i) %>%
unlist() %>%
t() %>%
data.frame() %>%
select_if(str_detect(names(.), ".metrics..estimate"))
}) %>%
purrr::map_dfr( ~ as.data.frame(.)) %>%
transmute(sensitivity = .metrics..estimate1,
specificity = .metrics..estimate2,
accuracy = .metrics..estimate3)
log_res_metrics
log_res_metrics %>%
mutate_all(as.numeric) %>%
# add_row(sensitivity = 0,specificity = 0,accuracy = 0, .before = 1) %>%
# add_row(sensitivity = 1,specificity = 1,accuracy = 1, .after = 100) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_point(aes(color = accuracy)) +
scale_color_gradient2(high = "springgreen",
low = "tomato",
mid = "orange",
midpoint = mean(as.numeric(log_res_metrics$accuracy)),
na.value = "gray") +
geom_vline(aes(xintercept = log_res_metrics %>%
mutate_all(as.numeric) %>%
filter(specificity > .95) %>%
filter(sensitivity == max(sensitivity)) %>%
select(specificity) %>%
mutate(specificity = 1 - specificity) %>%
as.numeric()),
color = "tomato",
linetype = "dashed") +
geom_hline(aes(yintercept = log_res_metrics %>%
mutate_all(as.numeric) %>%
filter(specificity > .95) %>%
filter(sensitivity == max(sensitivity)) %>%
select(sensitivity) %>%
as.numeric()),
color = "tomato",
linetype = "dashed") +
theme_bw()
metriche = log_res %>%
select(.metrics) %>%
map_dfr(as.data.frame) %>%
select(.metric, .estimate, .config) %>%
pivot_wider(names_from = .metric, values_from = .estimate, id_cols = .config)
metriche
Il modello
log_res %>%
collect_predictions() %>%
select(fraud, .pred_class) %>%
table() %>%
caret::confusionMatrix(positive = "Fraud")
## Confusion Matrix and Statistics
##
## .pred_class
## fraud No fraud Fraud
## No fraud 276166 16114
## Fraud 37679 45031
##
## Accuracy : 0.8565
## 95% CI : (0.8554, 0.8577)
## No Information Rate : 0.8369
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5398
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.7365
## Specificity : 0.8799
## Pos Pred Value : 0.5444
## Neg Pred Value : 0.9449
## Prevalence : 0.1631
## Detection Rate : 0.1201
## Detection Prevalence : 0.2206
## Balanced Accuracy : 0.8082
##
## 'Positive' Class : Fraud
##
rf_mod = rand_forest() %>%
set_engine(engine = 'ranger') %>%
set_args(mtry = NULL, # lo vado a inserire dopo
trees = 1000,
min_n = 2,
probability = TRUE,
importance = 'impurity') %>%
set_mode('classification')
rf_wflw_mtry2 = workflow() %>%
add_model(rf_mod %>% set_args(mtry = 2)) %>%
add_recipe(df_rec)
rf_wflw_mtry5 = workflow() %>%
add_model(rf_mod %>% set_args(mtry = floor(sqrt(ncol(dfRid))))) %>% # 5
add_recipe(df_rec)
rf_wflw_mtry12 = workflow() %>%
add_model(rf_mod %>% set_args(mtry = 12)) %>%
add_recipe(df_rec)
set.seed(1)
rf_fit_mtry2 = fit(rf_wflw_mtry2, data = data_train)
set.seed(1)
rf_fit_mtry12 = fit(rf_wflw_mtry12, data = data_train)
set.seed(1)
rf_fit_mtry5 = fit(rf_wflw_mtry5, data = data_train)
saveRDS(rf_fit_mtry2, "var/rf_fit_mtry2.RData")
saveRDS(rf_fit_mtry5, "var/rf_fit_mtry5.RData")
saveRDS(rf_fit_mtry12, "var/rf_fit_mtry12.RData")
rf_fit_mtry2 = readRDS("var/rf_fit_mtry2.RData") # importo il modello nel'enviroment
rf_fit_mtry5 = readRDS("var/rf_fit_mtry5.RData") # importo il modello nel'enviroment
rf_fit_mtry12 = readRDS("var/rf_fit_mtry12.RData") # importo il modello nel'enviroment
rf_out_mtry2 = rf_fit_mtry2 %>%
extract_fit_engine()
rf_pred_mtry2 = predict(rf_fit_mtry2,
new_data = data_test,
type = 'prob')
rf_out_mtry5 = rf_fit_mtry5 %>%
extract_fit_engine()
rf_pred_mtry5 = predict(rf_fit_mtry5,
new_data = data_test,
type = 'prob')
rf_out_mtry12 = rf_fit_mtry12 %>%
extract_fit_engine()
rf_pred_mtry12 = predict(rf_fit_mtry12,
new_data = data_test,
type = 'prob')
data_test %>%
select(fraud) %>%
add_column(rf_pred_mtry2, .name_repair = "unique") %>%
add_column(rf_pred_mtry5, .name_repair = "unique") %>%
add_column(rf_pred_mtry12, .name_repair = "unique") %>%
pivot_longer(cols = c(2,4,6), values_to = "value", names_to = "mtry") %>%
mutate(mtry = factor(mtry, labels = c("mtry2", "mtry5", "mtry12"))) %>%
ggplot(aes(x = fraud, y = value)) +
geom_violin(fill = "mediumspringgreen", alpha = .6) +
geom_hline(yintercept = 0.5, color='mediumvioletred') +
geom_hline(yintercept = 0.85, color='mediumvioletred') +
facet_grid(cols = dplyr::vars(mtry)) +
labs(y = 'Predicted Probability of Outcome', x = 'Observed Outcome') +
scale_y_percent() +
theme_classic()
## New names:
## New names:
## • `.pred_No fraud` -> `.pred_No fraud...2`
## • `.pred_Fraud` -> `.pred_Fraud...3`
## • `.pred_No fraud` -> `.pred_No fraud...4`
## • `.pred_Fraud` -> `.pred_Fraud...5`
#### Matrice di confusione
data_test %>%
select(fraud) %>%
add_column(rf_pred_mtry12 %>%
transmute(predicted = factor(`.pred_No fraud`<.85,
labels = c("No fraud", "Fraud")))) %>%
table() %>%
caret::confusionMatrix(positive = "Fraud")
## Confusion Matrix and Statistics
##
## predicted
## fraud No fraud Fraud
## No fraud 6104 3639
## Fraud 57 2701
##
## Accuracy : 0.7043
## 95% CI : (0.6963, 0.7123)
## No Information Rate : 0.5072
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4134
##
## Mcnemar's Test P-Value : < 2.2e-16
##
## Sensitivity : 0.4260
## Specificity : 0.9907
## Pos Pred Value : 0.9793
## Neg Pred Value : 0.6265
## Prevalence : 0.5072
## Detection Rate : 0.2161
## Detection Prevalence : 0.2206
## Balanced Accuracy : 0.7084
##
## 'Positive' Class : Fraud
##
Nel grafico si può vedere come mtry12 e mtry5 hanno valori sull’importanza delle variabili decisamente superiori di mtry2, inoltre le prime variabili hanno una classifica simile tra mtry12 e mtry5, per mtry2, invece, le variabili più importanti sarebbero altre.
procediamo quindi ha eseguire una selezione delle variabili per snellire il dataset. Verrannno selezionate solo alcune modalità di variabili categoriche, avendo già il dataset diviso in dummies non ci saranno problemi nella creazione di nuove categorie.
rf_varImportance = rf_out_mtry2 %>%
vip::vi() %>%
right_join(rf_out_mtry5 %>% vip::vi(), by = "Variable") %>%
right_join(rf_out_mtry12 %>% vip::vi(), by = "Variable")
colnames(rf_varImportance) = c("Variable", "mtry2", "mtry5", "mtry12")
rf_varImportance %>%
# mutate(Macrovariabili = creaGruppoVariabili(rf_varImportance, names(dfRid), "Variable")) %>%
head(25) %>%
pivot_longer(cols = 2:4,
names_to = "mtry",
values_to = "Importance") %>%
mutate(mtry = mtry %>%
factor(levels = c("mtry2", "mtry5", "mtry12"), ordered = T)) %>%
mutate(Variable = Variable %>%
factor() %>%
reorder(Importance)) %>%
# mutate(mtry = mtry %>%
# factor() %>%
# reorder(c("mtry2", "mtry5", "mtry12"))) %>%
ggplot(aes(x = Variable, y = Importance)) +
geom_col(aes(fill = mtry), position = "dodge") +
scale_fill_brewer(palette = "Dark2") +
coord_flip()
Seleziono le 25 variabili più importanti per il modello random forest con il parametro mtry12.
var_Importance = rf_varImportance %>%
dplyr::arrange(across(mtry12)) %>%
tail(25) %>%
select(Variable)
df_rec_25var = df_rec %>%
step_select(c(var_Importance$Variable, "fraud")) %>%
step_nzv(all_factor_predictors()) %>%
step_corr(all_predictors())
df_rec_25var
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 30
##
## ── Operations
## • Novel factor level assignment for: all_nominal_predictors()
## • Centering and scaling for: all_numeric_predictors()
## • Dummy variables from: all_nominal_predictors()
## • Zero variance filter on: all_nominal_predictors()
## • Variables selected: c(var_Importance$Variable, "fraud")
## • Sparse, unbalanced variable filter on: all_factor_predictors()
## • Correlation filter on: all_predictors()
data_train_25var = df_rec_25var %>%
prep(training = data_train) %>%
bake(new_data = NULL)
data_test_25var = df_rec_25var %>%
prep(training = data_test) %>%
bake(new_data = NULL)
rm(rf_fit_mtry2) # elimino il modello per liberare spazio
rm(rf_fit_mtry5) # elimino il modello per liberare spazio
rm(rf_fit_mtry12) # elimino il modello per liberare spazio
gc()
## used (Mb) gc trigger (Mb) max used (Mb)
## Ncells 22186212 1184.9 33405662 1784.1 31833864 1700.2
## Vcells 238688033 1821.1 794484573 6061.5 1132926565 8643.6
Svolgo il lavoro su differenti core della CPU.
cores = parallel::detectCores() - 1
cores
## [1] 19
rf_mod_tun =
rand_forest(mtry = tune(), min_n = tune(), trees = 1000) %>%
set_engine("ranger", num.threads = cores) %>%
set_args(probability = TRUE,
importance = 'impurity') %>%
set_mode("classification")
rf_workflow =
workflow() %>%
add_model(rf_mod_tun) %>%
add_recipe(df_rec_25var)
rf_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## • step_novel()
## • step_normalize()
## • step_dummy()
## • step_zv()
## • step_select()
## • step_nzv()
## • step_corr()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = tune()
## trees = 1000
## min_n = tune()
##
## Engine-Specific Arguments:
## num.threads = cores
## probability = TRUE
## importance = impurity
##
## Computational engine: ranger
set.seed(1)
rf_res =
rf_workflow %>%
tune_grid(data_folds,
grid = 25,
control = control_grid(save_pred = TRUE,
event_level = "second"),
metrics = metric_set(roc_auc,
yardstick::specificity,
yardstick::sensitivity))
saveRDS(rf_res, "var/rf_res.RData")
Alleno 250 modelli totali utilizzando la cross validation (k-fold) con il train set diviso in 10 parti uguali. Per ogni modello k-1 gruppi del train set verranno utilizzati per allenare il modello, mentre il restante sarà usato per testare il modello.
rf_res = readRDS("var/rf_res.RData")
metriche = rf_res %>%
select(.metrics) %>%
map_dfr(as.data.frame) %>%
select(mtry, min_n, .metric, .estimate, .config)
metriche_pivot = metriche %>%
pivot_wider(names_from = .metric, values_from = .estimate, id_cols = .config) %>%
left_join(metriche %>%
filter(.metric == "specificity") %>%
select(mtry, min_n, .config),
by = ".config")
metriche_pivot
autoplot(rf_res) +
geom_vline(aes(xintercept = 5), color = "tomato") +
theme_bw()
rf_best = metriche_pivot %>%
filter(specificity > .95) %>%
filter(sensitivity == max(sensitivity))
rf_best
rf_auc =
rf_res %>%
collect_predictions(parameters = rf_best) %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
mutate(model = "Random Forest")
rf_auc %>%
ggplot(aes(x = 1 - specificity, y = sensitivity)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal()
tree_mod =
decision_tree(
cost_complexity = tune(),
tree_depth = tune()
) %>%
set_engine("rpart") %>%
set_mode("classification")
tree_grid = grid_regular(cost_complexity(),
tree_depth(),
levels = 5)
tree_wflw = workflow() %>%
add_recipe(df_rec_25var) %>%
# add_recipe(recipe(fraud ~ ., data = data_train_25var)) %>%
add_model(tree_mod)
set.seed(1)
data_folds = vfold_cv(data_train)
tree_wflw
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: decision_tree()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## • step_novel()
## • step_normalize()
## • step_dummy()
## • step_zv()
## • step_select()
## • step_nzv()
## • step_corr()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Decision Tree Model Specification (classification)
##
## Main Arguments:
## cost_complexity = tune()
## tree_depth = tune()
##
## Computational engine: rpart
tree_res =
tree_wflw %>%
tune_grid(
resamples = data_folds,
grid = tree_grid
)
saveRDS(tree_res, "var/tree_res.RData")
tree_res = readRDS("var/tree_res.RData")
tree_res %>%
collect_metrics() %>%
mutate(tree_depth = factor(tree_depth)) %>%
ggplot(aes(cost_complexity, mean, color = tree_depth)) +
geom_line(linewidth = 1.5, alpha = 0.6) +
geom_point(size = 2) +
facet_wrap(~ .metric, scales = "free", nrow = 2) +
scale_x_log10(labels = scales::label_number()) +
scale_color_viridis_d(option = "plasma", begin = .9, end = 0)
tree_res %>%
show_best("accuracy")
best_tree = tree_res %>%
select_best("accuracy")
best_tree
tree_final_wflw =
tree_wflw %>%
finalize_workflow(best_tree)
tree_fit =
tree_final_wflw %>%
last_fit(data_split)
tree_fit %>%
collect_metrics()
tree_fit %>%
collect_predictions() %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
autoplot()
tree_fit %>%
extract_fit_engine() %>%
rpart.plot::rpart.plot()
## Warning: Cannot retrieve the data used to build the model (so cannot determine roundint and is.binary for the variables).
## To silence this warning:
## Call rpart.plot with roundint=FALSE,
## or rebuild the rpart model with model=TRUE.
## Warning: labs do not fit even at cex 0.15, there may be some overplotting
# plot()
# text()
tree_fit %>%
collect_predictions() %>%
mutate(pred_best = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = 0.75)) %>%
select(fraud,pred_best) %>%
table() %>%
caret::confusionMatrix(positive = "Fraud")
## Confusion Matrix and Statistics
##
## pred_best
## fraud No fraud Fraud
## No fraud 8359 1384
## Fraud 1090 1668
##
## Accuracy : 0.8021
## 95% CI : (0.795, 0.809)
## No Information Rate : 0.7559
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.4457
##
## Mcnemar's Test P-Value : 3.845e-09
##
## Sensitivity : 0.5465
## Specificity : 0.8846
## Pos Pred Value : 0.6048
## Neg Pred Value : 0.8579
## Prevalence : 0.2441
## Detection Rate : 0.1334
## Detection Prevalence : 0.2206
## Balanced Accuracy : 0.7156
##
## 'Positive' Class : Fraud
##
lr_mod =
logistic_reg(penalty = tune(), mixture = 1) %>%
set_engine("glmnet") %>%
set_mode("classification") %>%
set_args(family = "binomial")
lr_workflow =
workflow() %>%
add_model(lr_mod) %>%
add_recipe(df_rec_25var)
lr_reg_grid = tibble(penalty = 10^ seq(-4, -1, length.out = 30))
lr_workflow
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: logistic_reg()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## • step_novel()
## • step_normalize()
## • step_dummy()
## • step_zv()
## • step_select()
## • step_nzv()
## • step_corr()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Logistic Regression Model Specification (classification)
##
## Main Arguments:
## penalty = tune()
## mixture = 1
##
## Engine-Specific Arguments:
## family = binomial
##
## Computational engine: glmnet
lr_res =
lr_workflow %>%
tune_grid(data_folds,
grid = lr_reg_grid,
control = control_grid(save_pred = TRUE),
metrics = metric_set(roc_auc))
saveRDS(lr_res, "var/lr_res.RData")
lr_res = readRDS("var/lr_res.RData")
lr_res %>%
collect_metrics() %>%
ggplot(aes(x = penalty, y = mean)) +
geom_point() +
geom_line() +
ylab("Area under the ROC Curve") +
geom_vline(aes(xintercept = lr_res %>%
collect_metrics() %>%
select(penalty) %>%
slice(20) %>%
as.numeric()),
color = "tomato") +
scale_x_log10(labels = scales::label_number())
top_models =
lr_res %>%
show_best("roc_auc", n = 20) %>%
arrange(penalty)
top_models
Scelgo il 20esimo modello migliore considerando l’AUC per massimizzare la penalità e di conseguenza migliorare il modello.
lr_best =
lr_res %>%
collect_metrics() %>%
arrange(penalty) %>%
slice(20)
lr_best
lr_auc = lr_res %>%
collect_predictions(parameters = lr_best) %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
mutate(model = "Regressione logistica")
lr_auc %>% autoplot(lr_auc)
lr_res %>%
collect_predictions(parameters = lr_best) %>%
mutate(pred_best = probably::make_two_class_pred(`.pred_No fraud`, levels(fraud), threshold = 0.680)) %>%
select(fraud,pred_best) %>%
table() %>%
caret::confusionMatrix(positive = "Fraud")
## Confusion Matrix and Statistics
##
## pred_best
## fraud No fraud Fraud
## No fraud 25717 3511
## Fraud 2855 5416
##
## Accuracy : 0.8302
## 95% CI : (0.8264, 0.834)
## No Information Rate : 0.7619
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5199
##
## Mcnemar's Test P-Value : 2.224e-16
##
## Sensitivity : 0.6067
## Specificity : 0.9001
## Pos Pred Value : 0.6548
## Neg Pred Value : 0.8799
## Prevalence : 0.2381
## Detection Rate : 0.1444
## Detection Prevalence : 0.2206
## Balanced Accuracy : 0.7534
##
## 'Positive' Class : Fraud
##
lr_final_wflw =
lr_workflow %>%
finalize_workflow(lr_best)
lr_fit =
lr_final_wflw %>%
last_fit(data_split)
In questa fase si procederà ad allenare i modelli sul dataset originale con i parametri trovati nei capitoli precedenti. Ci si ritroverà in una situazione di imbilanciamento delle classe, sarà quindi opportuno utilizzare una tecnica di ribilanciamente. In questo caso è stata utilizzata ROSE.
set.seed(1)
df_split = initial_split(df, prop = 3/4, strata = "fraud")
df_train = training(data_split)
df_test = testing(data_split)
Sys.time()
lr_fit_finale = lr_final_wflw %>% last_fit(df_split)
Sys.time()
saveRDS(lr_fit_finale, "var/lr_fit_finale.RData")
lr_fit_finale = readRDS("var/lr_fit_finale.RData")
lr_auc_finale = lr_fit_finale %>%
collect_predictions() %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
mutate(model = "Regressione logistica finale senza ROSE")
bind_rows(rf_auc, lr_auc, lr_auc_finale) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1.5, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
scale_color_viridis_d(option = "plasma", end = .6)
lr_final_wflw_rose = lr_final_wflw %>%
update_recipe(df_rec %>% step_rose())
Sys.time()
lr_fit_finale_rose = lr_final_wflw_rose %>% last_fit(df_split)
Sys.time()
saveRDS(lr_fit_finale_rose, "var/lr_fit_finale_rose.RData")
lr_fit_finale_rose = readRDS("var/lr_fit_finale_rose.RData")
lr_auc_finale_rose = lr_fit_finale_rose %>%
collect_predictions() %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
mutate(model = "Regressione logistica finale con ROSE")
bind_rows(rf_auc, lr_auc, lr_auc_finale, lr_auc_finale_rose) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1, alpha = 0.6) +
geom_abline(lty = 3) +
coord_equal() +
scale_color_viridis_d(option = "plasma", end = 1)
rf_wflw_finale = rf_workflow %>%
finalize_workflow(rf_best)
rf_wflw_finale
## ══ Workflow ════════════════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: rand_forest()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 7 Recipe Steps
##
## • step_novel()
## • step_normalize()
## • step_dummy()
## • step_zv()
## • step_select()
## • step_nzv()
## • step_corr()
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Random Forest Model Specification (classification)
##
## Main Arguments:
## mtry = 9
## trees = 1000
## min_n = 7
##
## Engine-Specific Arguments:
## num.threads = cores
## probability = TRUE
## importance = impurity
##
## Computational engine: ranger
Sys.time()
rf_fit_finale = rf_wflw_finale %>% last_fit(df_split)
Sys.time()
saveRDS(rf_fit_finale, "var/rf_fit_finale.RData")
rf_fit_finale = readRDS("var/rf_fit_finale.RData")
rf_auc_finale = rf_fit_finale %>%
collect_predictions() %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
mutate(model = "Random forest finale")
bind_rows(rf_auc, lr_auc, lr_auc_finale, rf_auc_finale) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
scale_color_viridis_d(option = "plasma", end = .6)
rf_fit_finale %>%
extract_fit_engine() %>%
vip::vi() %>%
head(25) %>%
arrange(Importance) %>%
ggplot(aes(x = Variable, y = Importance)) +
geom_col() +
coord_flip()
rf_wflw_finale_rose = rf_wflw_finale %>%
update_recipe(df_rec %>% step_rose)
Sys.time()
rf_fit_finale_rose = rf_wflw_finale_rose %>% last_fit(df_split)
Sys.time()
saveRDS(rf_fit_finale_rose, "var/rf_fit_finale_rose.RData")
rf_fit_finale_rose = readRDS("var/rf_fit_finale_rose.RData")
rf_auc_finale_rose = rf_fit_finale_rose %>%
collect_predictions() %>%
roc_curve(fraud, .pred_Fraud, event_level = "second") %>%
mutate(model = "Random forest finale con ROSE")
bind_rows(
rf_auc,
lr_auc,
lr_auc_finale,
lr_auc_finale_rose,
rf_auc_finale,
rf_auc_finale_rose
) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
theme_bw() +
# theme(legend.position = c(.8,.23)) +
scale_color_viridis_d(option = "plasma", end = 1)
bind_rows(
rf_auc,
lr_auc,
lr_auc_finale,
lr_auc_finale_rose,
rf_auc_finale,
rf_auc_finale_rose
) %>%
ggplot(aes(x = 1 - specificity, y = sensitivity, col = model)) +
geom_path(lwd = 1, alpha = 0.8) +
geom_abline(lty = 3) +
coord_equal() +
theme_bw() +
# theme(legend.position = c(.8,.23)) +
scale_color_viridis_d(option = "plasma", end = 1)