proto_funcs

credit data

customers <- read_csv("lab08-segmentation/customers.csv")

## Parsed with column specification:
## cols(
##   id = col_double(),
##   Seniority = col_double(),
##   Home = col_character(),
##   Age = col_double(),
##   Marital = col_character(),
##   Records = col_character(),
##   Job = col_character(),
##   Expenses = col_double(),
##   Income = col_double(),
##   Assets = col_double(),
##   Debt = col_double()
## )

customers$Home = as.factor(customers$Home)
customers$Records = as.factor(customers$Records)
credit_data <- read_csv("lab08-segmentation/credit-data.csv")

## Parsed with column specification:
## cols(
##   id = col_double(),
##   Status = col_character(),
##   Time = col_double(),
##   Amount = col_double(),
##   Price = col_double()
## )

credit_data = left_join(customers, credit_data)

## Joining, by = "id"

credit_data$Status = factor(credit_data$Status, levels = c("good", "bad"))


credit_split <- initial_split(credit_data %>% select(-id))
car_tr <- training(credit_split)
car_te <-  testing(credit_split)

library("rpart")

## 
## Attaching package: 'rpart'

## The following object is masked from 'package:dials':
## 
##     prune

(rp <- rpart(Status ~ ., data = car_tr, model = TRUE))

## n= 2735 
## 
## node), split, n, loss, yval, (yprob)
##       * denotes terminal node
## 
##  1) root 2735 774 good (0.7170018 0.2829982)  
##    2) Records=no_rec 2251 509 good (0.7738783 0.2261217)  
##      4) Job=fixed,freelance,others 2013 374 good (0.8142077 0.1857923) *
##      5) Job=partime 238 103 bad (0.4327731 0.5672269)  
##       10) Assets>=7250 15   2 good (0.8666667 0.1333333) *
##       11) Assets< 7250 223  90 bad (0.4035874 0.5964126)  
##         22) Time< 27 32  12 good (0.6250000 0.3750000) *
##         23) Time>=27 191  70 bad (0.3664921 0.6335079) *
##    3) Records=yes_rec 484 219 bad (0.4524793 0.5475207)  
##      6) Seniority>=6.5 207  75 good (0.6376812 0.3623188) *
##      7) Seniority< 6.5 277  87 bad (0.3140794 0.6859206) *

rp_2 <- rpart(Status ~ Income + Job + Marital + Home, data = car_tr, model = TRUE)

p1 = predict(rp, car_te, type = "class")
p2 = predict(rp_2, car_te, type = "class")

car_te$p1 = p1 %>% unname()
car_te$p2 = p2 %>% unname()

Confusion matrix classification

conf_mat_class <- function(data, truth, estimate, 
                           bclass_name = "bclass",
                           true_class = T, false_class = F){
  require(tidyverse)
  
  data %>% 
    mutate("{ bclass_name }" := case_when(
      {{ truth }} == true_class & {{ estimate }} == true_class ~ "TP",
      {{ truth }} == false_class & {{ estimate }} == false_class ~ "TN",
      {{ truth }} == true_class & {{ estimate }} == false_class ~ "FN",
      {{ truth }} == false_class & {{ estimate }} == true_class ~ "FP"
    ))
}

car_te = car_te %>% 
  conf_mat_class(Status, p1, true_class = "good",  false_class = "bad", bclass_name = "pred_results")

car_te = car_te %>% 
  conf_mat_class(Status, p2, true_class = "good",  false_class = "bad", bclass_name = "pred_results_2")

Conf matrix but ggalluvial

cmc_flows <- function(data,
      # colors = c("TP" = "#1a9641", "TN" = "#a6d96a", 
      #            "FP" = "#fdae61", "FN" = "#d7191c"),
      truth, estimate, bclass,
      count_name = "cases"){
  require(tidyverse)
  
  data %>% 
    group_by({{ truth }}, {{ estimate }}, {{ bclass }}) %>% 
    summarise("{ count_name  }" := n())
}



car_te %>% cmc_flows(Status, pred_results, p1) -> flows

## `summarise()` regrouping output by 'Status', 'pred_results' (override with `.groups` argument)

  # group_by(Status, pred_results, p1, p2, pred_results_2) %>% 
  # summarise(cases = n())


cmc_flows_plot <- function(flows_data,
            truth, estimate, bclass, 
            cases = cases,
            cols = c("TP" = "#1a9641", "TN" = "#a6d96a", 
            "FP" = "#fdae61", "FN" = "#d7191c")
                           ){
  
  require(ggalluvial)
  require(tidyverse)
  
  
  cols = cols
  
  flows_data %>% 
  ggplot(aes(y = {{ cases }}, axis1 = {{ truth }}, axis2 = {{ estimate }})) +
    geom_alluvium(aes(fill = {{bclass}}), width = 1/12) +
    geom_stratum(alpha = 1, fill = "white", color ="grey") +
    geom_label(stat = "stratum", aes(label = after_stat(stratum))) +
    scale_x_discrete(limits = c("True classes", "Predicted classes"), expand = c(.05, .05)) +
    scale_fill_manual(values = cols) +
    ggtitle("Truth vs Estimate")+
    theme_minimal()+
    guides(fill = guide_legend(reverse=T))
}

# ?pivot_wider()
# ?enquo
# 
# flows %>% 
#   cmc_flows_plot(Status, p1, pred_results, cases)

car_te %>% 
  cmc_flows(Status, p1, pred_results) %>% 
  cmc_flows_plot(Status, p1, pred_results, cases)

## `summarise()` regrouping output by 'Status', 'p1' (override with `.groups` argument)

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

## Warning in to_lodes_form(data = data, axes = axis_ind, discern =
## params$discern): Some strata appear at multiple axes.

Conf matrix 2

conf_mat_class2 <- function(data, estimate_1, estimate_2, 
                           change_name = "from_to",
                           true_class = T, false_class = F){
  require(tidyverse)
  
data %>% 
    mutate("{ change_name }" := case_when(
      
        # same:
        {{ estimate_1 }} == "TP" &{{ estimate_2 }} == "TP" ~ "TP",
        {{ estimate_1 }} == "TN" &{{ estimate_2 }} == "TN" ~ "TN",
        {{ estimate_1 }} == "FP" &{{ estimate_2 }} == "FP" ~ "FP",
        {{ estimate_1 }} == "FN" &{{ estimate_2 }} == "FN" ~ "FN",
      
        # changed for the worse:
        {{ estimate_1 }} == "TP" &{{ estimate_2 }} == "FN" ~ "TP to FN",
        {{ estimate_1 }} == "TN" &{{ estimate_2 }} == "FP" ~ "TN to FP",
      
        # chaned for the better:
        {{ estimate_1 }} == "FP" &{{ estimate_2 }} == "TN" ~ "FP to TN",
        {{ estimate_1 }} == "FN" &{{ estimate_2 }} == "TP" ~ "FN to TP"
    ))
}

car_te %>% 
  conf_mat_class2(pred_results, pred_results_2)

## # A tibble: 911 x 19
##    Seniority Home    Age Marital Records Job   Expenses Income Assets  Debt
##        <dbl> <fct> <dbl> <chr>   <fct>   <chr>    <dbl>  <dbl>  <dbl> <dbl>
##  1        17 rent     58 widow   no_rec  fixed       52    131      0     0
##  2        29 owner    44 married no_rec  fixed       68    125  10000     0
##  3         7 owner    29 married no_rec  fixed       74    121   3000     0
##  4         0 other    21 single  yes_rec part…       30     50      0     0
##  5        15 priv     52 single  no_rec  free…       36    330  16500     0
##  6         1 pare…    31 single  no_rec  fixed       32    137      0     0
##  7        15 owner    43 married no_rec  fixed       79    251   4000     0
##  8        10 owner    39 married no_rec  free…       72    150   3000     0
##  9        24 owner    42 married no_rec  fixed       73    137   4000     0
## 10         2 owner    43 married no_rec  part…       75     71   3000     0
## # … with 901 more rows, and 9 more variables: Status <fct>, Time <dbl>,
## #   Amount <dbl>, Price <dbl>, p1 <fct>, p2 <fct>, pred_results <chr>,
## #   pred_results_2 <chr>, from_to <chr>

describe_group_factors

describe_group_factors <- function(data) {
  ##select only meaningful variables
  
  data %>% 
    mutate_if(is.factor, as.character) %>% 
    select_if(is.character) %>% 
    # mutate(id = row_number()) %>% 
    pivot_longer(everything(),
               names_to = "variable",
               values_to = "value") %>% 
    group_by(variable, value) %>% 
    summarise(val_n = n()) %>% 
    mutate(perc = val_n/sum(val_n))
}


# t3_a %>% describe_group_factors() %>% 
#   full_join(t3_b %>% describe_group_factors(), by = c("variable", "value")) %>% 
#   mutate_at(vars(val_n.x:perc.y),  replace_na, 0) %>% 
#   mutate(perc_diff = (perc.y - perc.x) / perc.x) %>% ungroup() %>% 
#   mutate(full_name = str_c(variable, value, sep = " ")) %>% 
#   rowwise() %>% 
#   mutate(
#     test_stat = chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2),
#     p_val = chisq.test(c(val_n.y, val_n.x))$p.value
#     ) %>% 
#   select(variable, value, val_n.x, val_n.y,  test_stat, p_val) -> t3_c

describe_group_numeric

describe_group_numeric <- function(data) {
  ##select only meaningful variables
  
  data %>% 
    select_if(is.numeric) %>% 
    # mutate(id = row_number()) %>% 
    pivot_longer(everything(),
               names_to = "variable",
               values_to = "value") %>% 
    group_by(variable) %>% 
    summarise(mean = mean(value), sd = sd(value), val_n = n())
}


# t3_a %>% 
#   describe_group_numeric () %>% 
#   full_join(
#     t3_b %>% describe_group_numeric(),
#     by = c("variable")
#   ) %>% 
#   rowwise %>%
#   mutate(
#     p_val = t.test(c(mean.y, mean.x))$p.value,
#     test_stat = t.test(c(mean.y, mean.x))$statistic %>% round(2)
#   ) %>%
#   select(variable, mean.x, sd.x,mean.y, sd.y, test_stat, p_val) -> t3_n

# t3_c %>% 
#   full_join(t3_n) %>% 
#   group_by(variable) %>% 
#   mutate(min_p = min(p_val)) %>% 
#   ungroup() %>% 
#   arrange(min_p) %>% 
#   select(-min_p)  -> t3_max
# 
# 
# options(knitr.kable.NA = '')
# 
# t3_max %>% 
#   mutate(p_val = case_when(
#     p_val %>% round(3) == 0 ~ "<0.01 **",
#     p_val  < 0.05 ~ str_c(p_val %>% round(3), " *"),
#     TRUE ~ p_val %>% round(3) %>% as.character()
#   )) %>%  
#   mutate_if(is.numeric, . %>% round()) %>%
#   mutate(sd.x = str_c("(±",  sd.x, ")"),
#          sd.y = str_c("(±",  sd.y, ")")) %>% 
#   kbl(escape = F, booktabs = T) %>%
#   kable_styling() %>% 
#   # kable_paper("hover", full_width = F) %>%
#   column_spec(c(8:10), italic  = T) %>% 
#   row_spec(which(t3_max$p_val > 0.05), 
#            background = "#D3D3D3") %>% 
#   column_spec(1, background = "white") %>% 
#   collapse_rows(columns = 1, valign = "top")

dplyr compare groups

car_te %>% 
  conf_mat_class2(pred_results, pred_results_2) %>% 
  select(-p1, -p2, -pred_results_2) -> car_te_cmc2


compare_groups <- function(data, 
                           estimate1, from_to = from_to, 
                           cmc1 = "TP", from_to_flow = "TP to FN"){
  data %>% 
    filter({{ estimate1 }} == cmc1) %>% 
    select(-{{ estimate1 }}, -{{ from_to }})-> cg_1
    # mutate(flow_group = cmc1)-> cg_1
  
  data %>% 
    filter({{ from_to }} == from_to_flow) %>% 
    select(-{{ estimate1 }}, -{{ from_to }}) -> cg_2
    # mutate(flow_group = from_to_flow)

  cg_1 %>% 
    describe_group_factors() %>% 
    full_join(cg_2 %>% 
                describe_group_factors(), 
            by = c("variable", "value")) %>% 
    mutate_at(vars(val_n.x:perc.y),  replace_na, 0) %>% 
    mutate(perc_diff = (perc.y - perc.x) / perc.x) %>% 
    ungroup() %>% 
    mutate(full_name = str_c(variable, value, sep = " ")) %>% 
    rowwise() %>% 
    mutate(
      test_stat = chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2),
      p_val = chisq.test(c(val_n.y, val_n.x))$p.value
      ) %>% 
    select(variable, value, val_n.x, val_n.y,  test_stat, p_val) -> cg_c
  

  cg_1 %>%
    describe_group_numeric () %>%
    full_join(
      cg_2 %>% 
        describe_group_numeric(),
      by = c("variable")
    ) %>%
    ungroup() %>% 
    rowwise() %>%
    mutate(
      p_val = t.test(c(mean.y, mean.x))$p.value,
      test_stat = t.test(c(mean.y, mean.x))$statistic %>% round(2)
    ) %>%
    select(variable, mean.x, sd.x,mean.y, sd.y, test_stat, p_val) -> cg_n


  cg_c %>%
    full_join(cg_n) %>%
    group_by(variable) %>%
    mutate(min_p = min(p_val)) %>%
    ungroup() %>%
    arrange(min_p) %>%
    select(-min_p)  -> t3_max
}

car_te_cmc2 %>% 
  compare_groups(pred_results, from_to, 
                           cmc1 = "TP", from_to_flow = "TP to FN") -> car_te_max

## `summarise()` regrouping output by 'variable' (override with `.groups` argument)
## `summarise()` regrouping output by 'variable' (override with `.groups` argument)

## Warning: Problem with `mutate()` input `test_stat`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `test_stat` is `chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2)`.
## ℹ The error occurred in row 1.

## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect

## Warning: Problem with `mutate()` input `test_stat`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `test_stat` is `chisq.test(c(val_n.y, val_n.x))$statistic %>% round(2)`.
## ℹ The error occurred in row 11.

## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect

## Warning: Problem with `mutate()` input `p_val`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `p_val` is `chisq.test(c(val_n.y, val_n.x))$p.value`.
## ℹ The error occurred in row 1.

## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect

## Warning: Problem with `mutate()` input `p_val`.
## ℹ Chi-squared approximation may be incorrect
## ℹ Input `p_val` is `chisq.test(c(val_n.y, val_n.x))$p.value`.
## ℹ The error occurred in row 11.

## Warning in chisq.test(c(val_n.y, val_n.x)): Chi-squared approximation may be
## incorrect

## `summarise()` ungrouping output (override with `.groups` argument)

## `summarise()` ungrouping output (override with `.groups` argument)

## Joining, by = c("variable", "test_stat", "p_val")

compare_groups_kable <- function(data){
  require(tidyverse)
  
  data %>% 
    mutate(p_val = case_when(
      p_val %>% round(3) == 0 ~ "<0.01 **",
      p_val  < 0.05 ~ str_c(p_val %>% round(3), " *"),
      TRUE ~ p_val %>% round(3) %>% as.character()
    )) %>%  
    mutate_if(is.numeric, . %>% round()) %>%
    mutate(sd.x = str_c("(±",  sd.x, ")"),
           sd.y = str_c("(±",  sd.y, ")"))
  
}

options(knitr.kable.NA = '')

car_te_max %>%   
   compare_groups_kable() %>% 
    kbl(escape = F, booktabs = T) %>%
    kable_styling() %>% 
    # kable_paper("hover", full_width = F) %>%
    column_spec(c(8:10), italic  = T) %>% 
    row_spec(which(car_te_max$p_val > 0.05), 
             background = "#D3D3D3") %>% 
    column_spec(1, background = "white") %>% 
    collapse_rows(columns = 1, valign = "top")

variable	value	val_n.x	val_n.y	test_stat	p_val	mean.x	sd.x	mean.y	sd.y
Status	good	607	12	572	<0.01 **
Records	no_rec	575	12	540	<0.01 **
Records	yes_rec	32	0	32	<0.01 **
Marital	divorced	6	0	6	0.014 *
	married	479	8	456	<0.01 **
	separated	17	0	17	<0.01 **
	single	94	4	83	<0.01 **
	widow	11	0	11	0.001 *
Job	fixed	429	0	429	<0.01 **
	freelance	140	0	140	<0.01 **
	others	26	4	16	<0.01 **
	partime	12	8	1	0.371
Home	ignore	3	0	3	0.083
	other	37	2	31	<0.01 **
	owner	349	4	337	<0.01 **
	parents	81	2	75	<0.01 **
	priv	33	1	30	<0.01 **
	rent	104	3	95	<0.01 **
Age				53	0.012 *	39	(±11)	37	(±15)
Expenses				17	0.037 *	57	(±20)	51	(±16)
Debt				12	0.053	257	(±830)	217	(±720)
Price				11	0.057	1497	(±675)	1250	(±663)
Amount				8	0.076	1003	(±458)	788	(±454)
Assets				6	0.097	6016	(±10728)	8208	(±8038)
Time				5	0.118	46	(±15)	32	(±15)
Income				4	0.15	152	(±82)	93	(±42)
Seniority				1	0.386	10	(±9)	2	(±2)

draw roc_curve

# car_te %>% 
#   mutate(random = random = runif(nrow(car_te)))
#   pivot_longer(cols = c(random, part_model,full_model), 
#                names_to = "model", values_to = "fitted") %>% 
#   group_by(model) %>% # group to get individual ROC curve for each model
#   roc_curve(truth = status, fitted) %>% # get values to plot an ROC curve
#   ggplot(
#     aes(
#       x = 1 - specificity, 
#       y = sensitivity, 
#       color = model
#     )
#   ) + # plot with 2 ROC curves for each model
#   geom_line(size = 1.1) +
#   geom_abline(slope = 1, intercept = 0, size = 0.4) +
#   scale_color_manual(values = c("random" = "black", "full_model" = "cyan4", "part_model" = "blue")) +
#   coord_fixed()+
#     theme_minimal()+
#   ggtitle(label = "roc_curve")

proto_funcs

Suschevskiy Vsevolod

11/8/2020

credit data

Confusion matrix classification

Conf matrix but ggalluvial

Conf matrix 2

describe_group_factors

describe_group_numeric

dplyr compare groups

draw roc_curve