credit data
customers <- read_csv("lab08-segmentation/customers.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## Seniority = col_double(),
## Home = col_character(),
## Age = col_double(),
## Marital = col_character(),
## Records = col_character(),
## Job = col_character(),
## Expenses = col_double(),
## Income = col_double(),
## Assets = col_double(),
## Debt = col_double()
## )
customers$Home = as.factor(customers$Home)
customers$Records = as.factor(customers$Records)
credit_data <- read_csv("lab08-segmentation/credit-data.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## Status = col_character(),
## Time = col_double(),
## Amount = col_double(),
## Price = col_double()
## )
credit_data = left_join(customers, credit_data)
## Joining, by = "id"
credit_data$Status = factor(credit_data$Status, levels = c("good", "bad"))
credit_split <- initial_split(credit_data %>% select(-id))
car_tr <- training(credit_split)
car_te <- testing(credit_split)
library("rpart")
##
## Attaching package: 'rpart'
## The following object is masked from 'package:dials':
##
## prune
(rp <- rpart(Status ~ ., data = car_tr, model = TRUE))
## n= 2735
##
## node), split, n, loss, yval, (yprob)
## * denotes terminal node
##
## 1) root 2735 774 good (0.7170018 0.2829982)
## 2) Records=no_rec 2251 509 good (0.7738783 0.2261217)
## 4) Job=fixed,freelance,others 2013 374 good (0.8142077 0.1857923) *
## 5) Job=partime 238 103 bad (0.4327731 0.5672269)
## 10) Assets>=7250 15 2 good (0.8666667 0.1333333) *
## 11) Assets< 7250 223 90 bad (0.4035874 0.5964126)
## 22) Time< 27 32 12 good (0.6250000 0.3750000) *
## 23) Time>=27 191 70 bad (0.3664921 0.6335079) *
## 3) Records=yes_rec 484 219 bad (0.4524793 0.5475207)
## 6) Seniority>=6.5 207 75 good (0.6376812 0.3623188) *
## 7) Seniority< 6.5 277 87 bad (0.3140794 0.6859206) *
rp_2 <- rpart(Status ~ Income + Job + Marital + Home, data = car_tr, model = TRUE)
p1 = predict(rp, car_te, type = "class")
p2 = predict(rp_2, car_te, type = "class")
car_te$p1 = p1 %>% unname()
car_te$p2 = p2 %>% unname()
Confusion matrix classification
conf_mat_class <- function(data, truth, estimate,
bclass_name = "bclass",
true_class = T, false_class = F){
require(tidyverse)
data %>%
mutate("{ bclass_name }" := case_when(
{{ truth }} == true_class & {{ estimate }} == true_class ~ "TP",
{{ truth }} == false_class & {{ estimate }} == false_class ~ "TN",
{{ truth }} == true_class & {{ estimate }} == false_class ~ "FN",
{{ truth }} == false_class & {{ estimate }} == true_class ~ "FP"
))
}
car_te = car_te %>%
conf_mat_class(Status, p1, true_class = "good", false_class = "bad", bclass_name = "pred_results")
car_te = car_te %>%
conf_mat_class(Status, p2, true_class = "good", false_class = "bad", bclass_name = "pred_results_2")
Conf matrix but ggalluvial
cmc_flows <- function(data,
# colors = c("TP" = "#1a9641", "TN" = "#a6d96a",
# "FP" = "#fdae61", "FN" = "#d7191c"),
truth, estimate, bclass,
count_name = "cases"){
require(tidyverse)
data %>%
group_by({{ truth }}, {{ estimate }}, {{ bclass }}) %>%
summarise("{ count_name }" := n())
}
car_te %>% cmc_flows(Status, pred_results, p1) -> flows
## `summarise()` regrouping output by 'Status', 'pred_results' (override with `.groups` argument)
# group_by(Status, pred_results, p1, p2, pred_results_2) %>%
# summarise(cases = n())
cmc_flows_plot <- function(flows_data,
truth, estimate, bclass,
cases = cases,
cols = c("TP" = "#1a9641", "TN" = "#a6d96a",
"FP" = "#fdae61", "FN" = "#d7191c")
){
require(ggalluvial)
require(tidyverse)
cols = cols
flows_data %>%
ggplot(aes(y = {{ cases }}, axis1 = {{ truth }}, axis2 = {{ estimate }})) +
geom_alluvium(aes(fill = {{bclass}}), width = 1/12) +
geom_stratum(alpha = 1, fill = "white", color ="grey") +
geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
scale_x_discrete(limits = c("True classes", "Predicted classes"), expand = c(.05, .05)) +
scale_fill_manual(values = cols) +
ggtitle("Truth vs Estimate")+
theme_minimal()+
guides(fill = guide_legend(reverse=T))
}
# ?pivot_wider()
# ?enquo
#
# flows %>%
# cmc_flows_plot(Status, p1, pred_results, cases)
car_te %>%
cmc_flows(Status, p1, pred_results) %>%
cmc_flows_plot(Status, p1, pred_results, cases)
## `summarise()` regrouping output by 'Status', 'p1' (override with `.groups` argument)
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.
## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Conf matrix 2
conf_mat_class2 <- function(data, estimate_1, estimate_2,
change_name = "from_to",
true_class = T, false_class = F){
require(tidyverse)
data %>%
mutate("{ change_name }" := case_when(
# same:
{{ estimate_1 }} == "TP" &{{ estimate_2 }} == "TP" ~ "TP",
{{ estimate_1 }} == "TN" &{{ estimate_2 }} == "TN" ~ "TN",
{{ estimate_1 }} == "FP" &{{ estimate_2 }} == "FP" ~ "FP",
{{ estimate_1 }} == "FN" &{{ estimate_2 }} == "FN" ~ "FN",
# changed for the worse:
{{ estimate_1 }} == "TP" &{{ estimate_2 }} == "FN" ~ "TP to FN",
{{ estimate_1 }} == "TN" &{{ estimate_2 }} == "FP" ~ "TN to FP",
# chaned for the better:
{{ estimate_1 }} == "FP" &{{ estimate_2 }} == "TN" ~ "FP to TN",
{{ estimate_1 }} == "FN" &{{ estimate_2 }} == "TP" ~ "FN to TP"
))
}
car_te %>%
conf_mat_class2(pred_results, pred_results_2)
## # A tibble: 911 x 19
## Seniority Home Age Marital Records Job Expenses Income Assets Debt
## <dbl> <fct> <dbl> <chr> <fct> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 17 rent 58 widow no_rec fixed 52 131 0 0
## 2 29 owner 44 married no_rec fixed 68 125 10000 0
## 3 7 owner 29 married no_rec fixed 74 121 3000 0
## 4 0 other 21 single yes_rec part… 30 50 0 0
## 5 15 priv 52 single no_rec free… 36 330 16500 0
## 6 1 pare… 31 single no_rec fixed 32 137 0 0
## 7 15 owner 43 married no_rec fixed 79 251 4000 0
## 8 10 owner 39 married no_rec free… 72 150 3000 0
## 9 24 owner 42 married no_rec fixed 73 137 4000 0
## 10 2 owner 43 married no_rec part… 75 71 3000 0
## # … with 901 more rows, and 9 more variables: Status <fct>, Time <dbl>,
## # Amount <dbl>, Price <dbl>, p1 <fct>, p2 <fct>, pred_results <chr>,
## # pred_results_2 <chr>, from_to <chr>
describe_group_factors
describe_group_factors <- function(data) {
##select only meaningful variables
data %>%
mutate_if(is.factor, as.character) %>%
select_if(is.character) %>%
# mutate(id = row_number()) %>%
pivot_longer(everything(),
names_to = "variable",
values_to = "value") %>%
group_by(variable, value) %>%
summarise(val_n = n()) %>%
mutate(perc = val_n/sum(val_n))
}
# t3_a %>% describe_group_factors() %>%
# full_join(t3_b %>% describe_group_factors(), by = c("variable", "value")) %>%
# mutate_at(vars(val_n.x:perc.y), replace_na, 0) %>%
# mutate(perc_diff = (perc.y - perc.x) / perc.x) %>% ungroup() %>%
# mutate(full_name = str_c(variable, value, sep = " ")) %>%
# rowwise() %>%
# mutate(
# test_stat = chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2),
# p_val = chisq.test(c(val_n.y, val_n.x))$p.value
# ) %>%
# select(variable, value, val_n.x, val_n.y, test_stat, p_val) -> t3_c
describe_group_numeric
describe_group_numeric <- function(data) {
##select only meaningful variables
data %>%
select_if(is.numeric) %>%
# mutate(id = row_number()) %>%
pivot_longer(everything(),
names_to = "variable",
values_to = "value") %>%
group_by(variable) %>%
summarise(mean = mean(value), sd = sd(value), val_n = n())
}
# t3_a %>%
# describe_group_numeric () %>%
# full_join(
# t3_b %>% describe_group_numeric(),
# by = c("variable")
# ) %>%
# rowwise %>%
# mutate(
# p_val = t.test(c(mean.y, mean.x))$p.value,
# test_stat = t.test(c(mean.y, mean.x))$statistic %>% round(2)
# ) %>%
# select(variable, mean.x, sd.x,mean.y, sd.y, test_stat, p_val) -> t3_n
# t3_c %>%
# full_join(t3_n) %>%
# group_by(variable) %>%
# mutate(min_p = min(p_val)) %>%
# ungroup() %>%
# arrange(min_p) %>%
# select(-min_p) -> t3_max
#
#
# options(knitr.kable.NA = '')
#
# t3_max %>%
# mutate(p_val = case_when(
# p_val %>% round(3) == 0 ~ "<0.01 **",
# p_val < 0.05 ~ str_c(p_val %>% round(3), " *"),
# TRUE ~ p_val %>% round(3) %>% as.character()
# )) %>%
# mutate_if(is.numeric, . %>% round()) %>%
# mutate(sd.x = str_c("(±", sd.x, ")"),
# sd.y = str_c("(±", sd.y, ")")) %>%
# kbl(escape = F, booktabs = T) %>%
# kable_styling() %>%
# # kable_paper("hover", full_width = F) %>%
# column_spec(c(8:10), italic = T) %>%
# row_spec(which(t3_max$p_val > 0.05),
# background = "#D3D3D3") %>%
# column_spec(1, background = "white") %>%
# collapse_rows(columns = 1, valign = "top")
dplyr compare groups
car_te %>%
conf_mat_class2(pred_results, pred_results_2) %>%
select(-p1, -p2, -pred_results_2) -> car_te_cmc2
compare_groups <- function(data,
estimate1, from_to = from_to,
cmc1 = "TP", from_to_flow = "TP to FN"){
data %>%
filter({{ estimate1 }} == cmc1) %>%
select(-{{ estimate1 }}, -{{ from_to }})-> cg_1
# mutate(flow_group = cmc1)-> cg_1
data %>%
filter({{ from_to }} == from_to_flow) %>%
select(-{{ estimate1 }}, -{{ from_to }}) -> cg_2
# mutate(flow_group = from_to_flow)
cg_1 %>%
describe_group_factors() %>%
full_join(cg_2 %>%
describe_group_factors(),
by = c("variable", "value")) %>%
mutate_at(vars(val_n.x:perc.y), replace_na, 0) %>%
mutate(perc_diff = (perc.y - perc.x) / perc.x) %>%
ungroup() %>%
mutate(full_name = str_c(variable, value, sep = " ")) %>%
rowwise() %>%
mutate(
test_stat = chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2),
p_val = chisq.test(c(val_n.y, val_n.x))$p.value
) %>%
select(variable, value, val_n.x, val_n.y, test_stat, p_val) -> cg_c
cg_1 %>%
describe_group_numeric () %>%
full_join(
cg_2 %>%
describe_group_numeric(),
by = c("variable")
) %>%
ungroup() %>%
rowwise() %>%
mutate(
p_val = t.test(c(mean.y, mean.x))$p.value,
test_stat = t.test(c(mean.y, mean.x))$statistic %>% round(2)
) %>%
select(variable, mean.x, sd.x,mean.y, sd.y, test_stat, p_val) -> cg_n
cg_c %>%
full_join(cg_n) %>%
group_by(variable) %>%
mutate(min_p = min(p_val)) %>%
ungroup() %>%
arrange(min_p) %>%
select(-min_p) -> t3_max
}
car_te_cmc2 %>%
compare_groups(pred_results, from_to,
cmc1 = "TP", from_to_flow = "TP to FN") -> car_te_max
## `summarise()` regrouping output by 'variable' (override with `.groups` argument)
## `summarise()` regrouping output by 'variable' (override with `.groups` argument)
## Warning: Problem with `mutate()` input `test_stat`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `test_stat` is `chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2)`.
## ℹ The error occurred in row 1.
## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect
## Warning: Problem with `mutate()` input `test_stat`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `test_stat` is `chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2)`.
## ℹ The error occurred in row 11.
## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect
## Warning: Problem with `mutate()` input `p_val`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `p_val` is `chisq.test(c(val_n.y, val_n.x))$p.value`.
## ℹ The error occurred in row 1.
## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect
## Warning: Problem with `mutate()` input `p_val`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `p_val` is `chisq.test(c(val_n.y, val_n.x))$p.value`.
## ℹ The error occurred in row 11.
## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect
## `summarise()` ungrouping output (override with `.groups` argument)
## `summarise()` ungrouping output (override with `.groups` argument)
## Joining, by = c("variable", "test_stat", "p_val")
compare_groups_kable <- function(data){
require(tidyverse)
data %>%
mutate(p_val = case_when(
p_val %>% round(3) == 0 ~ "<0.01 **",
p_val < 0.05 ~ str_c(p_val %>% round(3), " *"),
TRUE ~ p_val %>% round(3) %>% as.character()
)) %>%
mutate_if(is.numeric, . %>% round()) %>%
mutate(sd.x = str_c("(±", sd.x, ")"),
sd.y = str_c("(±", sd.y, ")"))
}
options(knitr.kable.NA = '')
car_te_max %>%
compare_groups_kable() %>%
kbl(escape = F, booktabs = T) %>%
kable_styling() %>%
# kable_paper("hover", full_width = F) %>%
column_spec(c(8:10), italic = T) %>%
row_spec(which(car_te_max$p_val > 0.05),
background = "#D3D3D3") %>%
column_spec(1, background = "white") %>%
collapse_rows(columns = 1, valign = "top")
variable
|
value
|
val_n.x
|
val_n.y
|
test_stat
|
p_val
|
mean.x
|
sd.x
|
mean.y
|
sd.y
|
Status
|
good
|
607
|
12
|
572
|
<0.01 **
|
|
|
|
|
Records
|
no_rec
|
575
|
12
|
540
|
<0.01 **
|
|
|
|
|
yes_rec
|
32
|
0
|
32
|
<0.01 **
|
|
|
|
|
Marital
|
divorced
|
6
|
0
|
6
|
0.014 *
|
|
|
|
|
married
|
479
|
8
|
456
|
<0.01 **
|
|
|
|
|
separated
|
17
|
0
|
17
|
<0.01 **
|
|
|
|
|
single
|
94
|
4
|
83
|
<0.01 **
|
|
|
|
|
widow
|
11
|
0
|
11
|
0.001 *
|
|
|
|
|
Job
|
fixed
|
429
|
0
|
429
|
<0.01 **
|
|
|
|
|
freelance
|
140
|
0
|
140
|
<0.01 **
|
|
|
|
|
others
|
26
|
4
|
16
|
<0.01 **
|
|
|
|
|
partime
|
12
|
8
|
1
|
0.371
|
|
|
|
|
Home
|
ignore
|
3
|
0
|
3
|
0.083
|
|
|
|
|
other
|
37
|
2
|
31
|
<0.01 **
|
|
|
|
|
owner
|
349
|
4
|
337
|
<0.01 **
|
|
|
|
|
parents
|
81
|
2
|
75
|
<0.01 **
|
|
|
|
|
priv
|
33
|
1
|
30
|
<0.01 **
|
|
|
|
|
rent
|
104
|
3
|
95
|
<0.01 **
|
|
|
|
|
Age
|
|
|
|
53
|
0.012 *
|
39
|
(±11)
|
37
|
(±15)
|
Expenses
|
|
|
|
17
|
0.037 *
|
57
|
(±20)
|
51
|
(±16)
|
Debt
|
|
|
|
12
|
0.053
|
257
|
(±830)
|
217
|
(±720)
|
Price
|
|
|
|
11
|
0.057
|
1497
|
(±675)
|
1250
|
(±663)
|
Amount
|
|
|
|
8
|
0.076
|
1003
|
(±458)
|
788
|
(±454)
|
Assets
|
|
|
|
6
|
0.097
|
6016
|
(±10728)
|
8208
|
(±8038)
|
Time
|
|
|
|
5
|
0.118
|
46
|
(±15)
|
32
|
(±15)
|
Income
|
|
|
|
4
|
0.15
|
152
|
(±82)
|
93
|
(±42)
|
Seniority
|
|
|
|
1
|
0.386
|
10
|
(±9)
|
2
|
(±2)
|
draw roc_curve
# car_te %>%
# mutate(random = random = runif(nrow(car_te)))
# pivot_longer(cols = c(random, part_model,full_model),
# names_to = "model", values_to = "fitted") %>%
# group_by(model) %>% # group to get individual ROC curve for each model
# roc_curve(truth = status, fitted) %>% # get values to plot an ROC curve
# ggplot(
# aes(
# x = 1 - specificity,
# y = sensitivity,
# color = model
# )
# ) + # plot with 2 ROC curves for each model
# geom_line(size = 1.1) +
# geom_abline(slope = 1, intercept = 0, size = 0.4) +
# scale_color_manual(values = c("random" = "black", "full_model" = "cyan4", "part_model" = "blue")) +
# coord_fixed()+
# theme_minimal()+
# ggtitle(label = "roc_curve")