\(~\)

\(~\)

\(~\)

1 Load Sample Data

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3     v purrr   0.3.4
## v tibble  3.0.4     v dplyr   0.8.5
## v tidyr   1.0.2     v stringr 1.4.0
## v readr   1.3.1     v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
diab_pop_org <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

# riagendr 
# ridreth1                   
# dmdeduc2 
# dmdmartl        
# indhhin2 

diab_pop %>%
  group_by(riagendr) %>%
  summarise(cnt = n_distinct(seqn))
## # A tibble: 2 x 2
##   riagendr   cnt
##   <fct>    <int>
## 1 Male      2747
## 2 Female    2972
diab_pop %>%
  group_by(ridreth1) %>%
  summarise(cnt = n_distinct(seqn))
## # A tibble: 5 x 2
##   ridreth1             cnt
##   <fct>              <int>
## 1 MexicanAmerican      995
## 2 Other Hispanic       768
## 3 Non-Hispanic White  1863
## 4 Non-Hispanic Black  1198
## 5 Other                895
diab_pop %>%
  group_by(dmdeduc2) %>%
  summarise(cnt = n_distinct(seqn))
## Warning: Factor `dmdeduc2` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 2
##   dmdeduc2                     cnt
##   <fct>                      <int>
## 1 Less than 9th grade          688
## 2 Grades 9-11th                676
## 3 High school graduate/GED    1236
## 4 Some college or AA degrees  1692
## 5 College grad or above       1422
## 6 <NA>                           5
diab_pop %>%
  group_by(dmdmartl) %>%
  summarise(cnt = n_distinct(seqn))
## Warning: Factor `dmdmartl` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 7 x 2
##   dmdmartl              cnt
##   <fct>               <int>
## 1 Married              2886
## 2 Widowed               421
## 3 Divorced              614
## 4 Separated             192
## 5 Never married        1048
## 6 Living with partner   555
## 7 <NA>                    3
diab_pop %>%
  group_by(indhhin2) %>%
  summarise(cnt = n_distinct(seqn))
## Warning: Factor `indhhin2` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 13 x 2
##    indhhin2            cnt
##    <fct>             <int>
##  1 $0-$4,999           122
##  2 $5,000-$9,999       211
##  3 $10,000-$14,999     347
##  4 $15,000-$19,999     348
##  5 $20,000-$24,999     350
##  6 $25,000-$34,999     596
##  7 $45,000-$54,999     451
##  8 $65,000-$74,999     282
##  9 20,000+             195
## 10 less than $20,000    85
## 11 $75,000-$99,999     524
## 12 $100,000+           910
## 13 <NA>               1298
diab_pop <- diab_pop %>%
  mutate_if(is.factor,
            fct_explicit_na)

\(~\)

\(~\)

2 Helper function 1 - feature_sum_function

feature_sum_function <- function(data,
                                 id,
                                 feature){
  
  result <- data %>%
    group_by(!!! syms(feature)) %>%
    summarise(cnt = n_distinct(!!! syms(id))) %>%
    rename(factor = all_of(feature)) %>%
    mutate(feature = as.character(feature)) %>%
    mutate(factor = as.character(factor))
    
  
  return(result)
  
}

2.1 Test function

feature_sum_function(data = diab_pop,
                     id = 'seqn',
                     feature = 'riagendr')
## # A tibble: 2 x 3
##   factor   cnt feature 
##   <chr>  <int> <chr>   
## 1 Male    2747 riagendr
## 2 Female  2972 riagendr
feature_sum_function(data = diab_pop,
                     id = 'seqn',
                     feature = 'ridreth1')
## # A tibble: 5 x 3
##   factor               cnt feature 
##   <chr>              <int> <chr>   
## 1 MexicanAmerican      995 ridreth1
## 2 Other Hispanic       768 ridreth1
## 3 Non-Hispanic White  1863 ridreth1
## 4 Non-Hispanic Black  1198 ridreth1
## 5 Other                895 ridreth1
purrr::map_dfr(c('riagendr','ridreth1'), 
               feature_sum_function,
               data = diab_pop,
               id = 'seqn')
## # A tibble: 7 x 3
##   factor               cnt feature 
##   <chr>              <int> <chr>   
## 1 Male                2747 riagendr
## 2 Female              2972 riagendr
## 3 MexicanAmerican      995 ridreth1
## 4 Other Hispanic       768 ridreth1
## 5 Non-Hispanic White  1863 ridreth1
## 6 Non-Hispanic Black  1198 ridreth1
## 7 Other                895 ridreth1

\(~\)

\(~\)

3 Helper function 2 - two_feature_sum_function

3.1 Helper function inner - two_feature_sum_function_inner

two_feature_sum_function_inner <- function(data,
                                           id,
                                           feature1,
                                           feature2){
  result <- data %>%
    group_by(!!! syms(feature1), !!! syms(feature2)) %>%
    summarise(cnt = n_distinct(!!! syms(id))) %>%
    ungroup() %>%
    rename(factor1 = all_of(feature1), 
           factor2 = all_of(feature2)) %>%
    mutate(feature1 = feature1) %>%
    mutate(feature2 = feature2) %>%
    mutate(factor1 = as.character(factor1),
           factor2 = as.character(factor2))
  
  return(result)
}

3.1.1 Test

two_feature_sum_function_inner(data = diab_pop,
                         id = 'seqn',
                         feature1 = 'riagendr',
                         feature2 = 'ridreth1')
## # A tibble: 10 x 5
##    factor1 factor2              cnt feature1 feature2
##    <chr>   <chr>              <int> <chr>    <chr>   
##  1 Male    MexicanAmerican      459 riagendr ridreth1
##  2 Male    Other Hispanic       334 riagendr ridreth1
##  3 Male    Non-Hispanic White   946 riagendr ridreth1
##  4 Male    Non-Hispanic Black   563 riagendr ridreth1
##  5 Male    Other                445 riagendr ridreth1
##  6 Female  MexicanAmerican      536 riagendr ridreth1
##  7 Female  Other Hispanic       434 riagendr ridreth1
##  8 Female  Non-Hispanic White   917 riagendr ridreth1
##  9 Female  Non-Hispanic Black   635 riagendr ridreth1
## 10 Female  Other                450 riagendr ridreth1

3.2 Main

two_feature_sum_function <- function(data,
                                     id,
                                     combination.tibble,
                                     record){
  
  current_rec_combo <- combination.tibble %>%
    filter(rn == record)
  
  feature1 <- current_rec_combo$feature1
  feature2 <- current_rec_combo$feature2
  
  result <- two_feature_sum_function_inner(data = data,
                                 id = id,
                                 feature1 = feature1,
                                 feature2 = feature2)
  
  return(result)
}

3.2.1 Test

t.combination_tibble <- tibble(feature1 = "riagendr",
                               feature2 = "ridreth1",
                               rn =1)

t.combination_tibble
## # A tibble: 1 x 3
##   feature1 feature2    rn
##   <chr>    <chr>    <dbl>
## 1 riagendr ridreth1     1
two_feature_sum_function(data = diab_pop,
                         id = 'seqn',
                         combination.tibble = t.combination_tibble,
                         record =1)
## # A tibble: 10 x 5
##    factor1 factor2              cnt feature1 feature2
##    <chr>   <chr>              <int> <chr>    <chr>   
##  1 Male    MexicanAmerican      459 riagendr ridreth1
##  2 Male    Other Hispanic       334 riagendr ridreth1
##  3 Male    Non-Hispanic White   946 riagendr ridreth1
##  4 Male    Non-Hispanic Black   563 riagendr ridreth1
##  5 Male    Other                445 riagendr ridreth1
##  6 Female  MexicanAmerican      536 riagendr ridreth1
##  7 Female  Other Hispanic       434 riagendr ridreth1
##  8 Female  Non-Hispanic White   917 riagendr ridreth1
##  9 Female  Non-Hispanic Black   635 riagendr ridreth1
## 10 Female  Other                450 riagendr ridreth1
purrr::map_dfr(t.combination_tibble$rn,
               two_feature_sum_function,
               data = diab_pop,
               id = 'seqn',
               combination.tibble = t.combination_tibble)
## # A tibble: 10 x 5
##    factor1 factor2              cnt feature1 feature2
##    <chr>   <chr>              <int> <chr>    <chr>   
##  1 Male    MexicanAmerican      459 riagendr ridreth1
##  2 Male    Other Hispanic       334 riagendr ridreth1
##  3 Male    Non-Hispanic White   946 riagendr ridreth1
##  4 Male    Non-Hispanic Black   563 riagendr ridreth1
##  5 Male    Other                445 riagendr ridreth1
##  6 Female  MexicanAmerican      536 riagendr ridreth1
##  7 Female  Other Hispanic       434 riagendr ridreth1
##  8 Female  Non-Hispanic White   917 riagendr ridreth1
##  9 Female  Non-Hispanic Black   635 riagendr ridreth1
## 10 Female  Other                450 riagendr ridreth1

\(~\)

\(~\)

\(~\)

4 Main Function - make_sankey_data_function

make_sankey_data_from_features <- function(data,
                                           id,
                                           target,
                                           features){
  n_features <- length(features)
  
  if (n_features < 2){
    print('Error please provide at least 2 features')
    return(NULL)
  }
  
  by.target <- data %>%
    group_by(!!! syms(target)) %>%
    summarise(cnt = n_distinct(!!! syms(id))) %>%
    rename(target_fact = target) %>%
    mutate(target = target)
  
  feature_tibble <- enframe(features) %>%
    rename(feature_index = name, 
           feature_name = value)
  
  feature_tibble.target <- feature_tibble %>%
    mutate(target_var = target)
  
  feature_tibble.feature <- as_tibble( gtools::combinations(n_features, 
                                                       2, 
                                                       v=features, 
                                                       set=FALSE, 
                                                       repeats.allowed=FALSE),
                                       .name_repair = "universal") %>%
    rename(feature1 ='...1',
           feature2 ='...2')
                                       
  # for every feature in the list I need a summary table

  featre_summary <- purrr::map_dfr(features, 
                                   feature_sum_function,
                                   data = data,
                                   id = id)

  
  # now I need a summary table for every combination
  combination_tibble <- feature_tibble.target %>%
    mutate(feature1 = feature_name, 
           feature2 = target_var) %>%
    select(feature1, feature2) %>% 
    bind_rows(feature_tibble.feature) %>%
    mutate(rn = row_number())
  
  two_feature_sum <- purrr::map_dfr(combination_tibble$rn,
                                    two_feature_sum_function,
                                    data = data,
                                    id = id,
                                    combination.tibble = combination_tibble)
 
  # Now work with output tables
  
  map_final_output <- function(feature){
    #print(feature)
    
    feature_summary.f1 <- featre_summary %>%
      filter(feature == !!syms(feature))
    
    two_featue_sum.f1 <- two_feature_sum %>%
      filter(feature1 == !!syms(feature))
    
    two_featue_sum.f2 <- two_feature_sum %>%
      filter(feature2 == !!syms(feature))
    
    if(nrow(two_featue_sum.f1) > 0){
      join1 <- feature_summary.f1 %>%
        left_join(two_featue_sum.f1, 
                  by=c('factor'='factor1')) %>%
        mutate(diff = cnt.x - cnt.y) %>%
        mutate(source = factor,
               target = factor2) %>%
        select(source, target, diff)
    }
    
    if(nrow(two_featue_sum.f2) > 0 ){
      join2 <- feature_summary.f1 %>%
        left_join(two_featue_sum.f2, 
                  by=c('factor'='factor2')) %>%
        mutate(diff = cnt.x - cnt.y) %>%
        mutate(source = factor,
               target = factor1) %>%
        select(source, target, diff)
    }
    if(nrow(two_featue_sum.f2) <= 0){
      join2 <- tibble(source = NA,
                      target = NA,
                      diff = NA)
    }
    
    join1 <- join1 %>%
      mutate(source = as.character(source), 
             target = as.character(target))
    
    join2 <- join2 %>%
      mutate(source = as.character(source), 
             target = as.character(target))
    
    join12 <- bind_rows(join1,join2)
    
    two_featue_sum.f1.target <- two_featue_sum.f1 %>%
      left_join(by.target, 
                by =c('factor2'='target_fact')) %>%
      mutate(diff = cnt.y - cnt.x) %>%
      filter(diff >= 0) %>%
      mutate(source = factor1) %>%
      mutate(target = factor2) %>%
      select(source, target, diff) %>%
      mutate(source = as.character(source), 
             target = as.character(target))
    
    join3 <- bind_rows(join12,
                       two_featue_sum.f1.target) %>%
      filter(diff > 0)
    
    return(join3)
    
  }
  
  FINAL <- purrr::map_dfr(features,
                      map_final_output)
  
  
  OUT <- list(
    by.target = by.target,
    feature_tibble = feature_tibble,
    feature_tibble.target = feature_tibble.target,
    feature_tibble.feature = feature_tibble.feature,
    featre_summary =featre_summary,
    combination_tibble = combination_tibble,
    two_feature_sum = two_feature_sum
    )
  
  return(FINAL)
  
}

4.1 Test

# riagendr 
# ridreth1                   
# dmdeduc2 
# dmdmartl        
# indhhin2 

t <- make_sankey_data_from_features(data = diab_pop,
                               id = 'seqn',
                               target = 'diq010',
                               features = c('riagendr',
                                            'ridreth1',
                                            'dmdeduc2',
                                            'dmdmartl'))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(target)` instead of `target` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
t
## # A tibble: 343 x 3
##    source target                    diff
##    <chr>  <chr>                    <int>
##  1 Male   Diabetes                  2291
##  2 Male   No Diabetes                456
##  3 Male   MexicanAmerican           2288
##  4 Male   Other Hispanic            2413
##  5 Male   Non-Hispanic White        1801
##  6 Male   Non-Hispanic Black        2184
##  7 Male   Other                     2302
##  8 Male   Less than 9th grade       2422
##  9 Male   Grades 9-11th             2375
## 10 Male   High school graduate/GED  2123
## # ... with 333 more rows

5 Final Function

make_sankey_graph_from_features <- function(data,
                                            id,
                                            target,
                                            features){
  
library(networkD3)

  t <- make_sankey_data_from_features(data = data,
                                      id = id,
                                      target = target,
                                      features = features)  
  
nodes <- data.frame(
  name=c( as.character(t$source),
          as.character(t$target)) 
  %>% unique()
)

t$IDsource <- match(t$source, nodes$name)-1 
t$IDtarget <- match(t$target, nodes$name)-1

p <- sankeyNetwork(Links = t, 
                   Nodes = nodes,
                   Source = "IDsource", 
                   Target = "IDtarget",
                   Value = "diff", 
                   NodeID = "name", 
                   sinksRight=FALSE)
return(p)
}

5.1 Test

# riagendr 
# ridreth1                   
# dmdeduc2 
# dmdmartl        
# indhhin2 

make_sankey_graph_from_features(data = diab_pop,
                                id = 'seqn',
                                target = 'diq010',
                                features = c('riagendr','ridreth1'))
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Links is a tbl_df. Converting to a plain data frame.
make_sankey_graph_from_features(data = diab_pop,
                                id = 'seqn',
                                target = 'diq010',
                                features = c('riagendr','dmdeduc2'))
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Links is a tbl_df. Converting to a plain data frame.
make_sankey_graph_from_features(data = diab_pop,
                                id = 'seqn',
                                target = 'diq010',
                                features = c('riagendr','ridreth1','dmdeduc2','dmdmartl','indhhin2'))
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector

## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Links is a tbl_df. Converting to a plain data frame.

\(~\)

\(~\)

6 Code Appendix

\(~\)

\(~\)

library(tidyverse)

diab_pop_org <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')

# riagendr 
# ridreth1                   
# dmdeduc2 
# dmdmartl        
# indhhin2 

diab_pop %>%
  group_by(riagendr) %>%
  summarise(cnt = n_distinct(seqn))

diab_pop %>%
  group_by(ridreth1) %>%
  summarise(cnt = n_distinct(seqn))

diab_pop %>%
  group_by(dmdeduc2) %>%
  summarise(cnt = n_distinct(seqn))

diab_pop %>%
  group_by(dmdmartl) %>%
  summarise(cnt = n_distinct(seqn))

diab_pop %>%
  group_by(indhhin2) %>%
  summarise(cnt = n_distinct(seqn))


diab_pop <- diab_pop %>%
  mutate_if(is.factor,
            fct_explicit_na)
feature_sum_function <- function(data,
                                 id,
                                 feature){
  
  result <- data %>%
    group_by(!!! syms(feature)) %>%
    summarise(cnt = n_distinct(!!! syms(id))) %>%
    rename(factor = all_of(feature)) %>%
    mutate(feature = as.character(feature)) %>%
    mutate(factor = as.character(factor))
    
  
  return(result)
  
}

feature_sum_function(data = diab_pop,
                     id = 'seqn',
                     feature = 'riagendr')
feature_sum_function(data = diab_pop,
                     id = 'seqn',
                     feature = 'ridreth1')
purrr::map_dfr(c('riagendr','ridreth1'), 
               feature_sum_function,
               data = diab_pop,
               id = 'seqn')
two_feature_sum_function_inner <- function(data,
                                           id,
                                           feature1,
                                           feature2){
  result <- data %>%
    group_by(!!! syms(feature1), !!! syms(feature2)) %>%
    summarise(cnt = n_distinct(!!! syms(id))) %>%
    ungroup() %>%
    rename(factor1 = all_of(feature1), 
           factor2 = all_of(feature2)) %>%
    mutate(feature1 = feature1) %>%
    mutate(feature2 = feature2) %>%
    mutate(factor1 = as.character(factor1),
           factor2 = as.character(factor2))
  
  return(result)
}
two_feature_sum_function_inner(data = diab_pop,
                         id = 'seqn',
                         feature1 = 'riagendr',
                         feature2 = 'ridreth1')
two_feature_sum_function <- function(data,
                                     id,
                                     combination.tibble,
                                     record){
  
  current_rec_combo <- combination.tibble %>%
    filter(rn == record)
  
  feature1 <- current_rec_combo$feature1
  feature2 <- current_rec_combo$feature2
  
  result <- two_feature_sum_function_inner(data = data,
                                 id = id,
                                 feature1 = feature1,
                                 feature2 = feature2)
  
  return(result)
}

t.combination_tibble <- tibble(feature1 = "riagendr",
                               feature2 = "ridreth1",
                               rn =1)

t.combination_tibble

two_feature_sum_function(data = diab_pop,
                         id = 'seqn',
                         combination.tibble = t.combination_tibble,
                         record =1)
purrr::map_dfr(t.combination_tibble$rn,
               two_feature_sum_function,
               data = diab_pop,
               id = 'seqn',
               combination.tibble = t.combination_tibble)
make_sankey_data_from_features <- function(data,
                                           id,
                                           target,
                                           features){
  n_features <- length(features)
  
  if (n_features < 2){
    print('Error please provide at least 2 features')
    return(NULL)
  }
  
  by.target <- data %>%
    group_by(!!! syms(target)) %>%
    summarise(cnt = n_distinct(!!! syms(id))) %>%
    rename(target_fact = target) %>%
    mutate(target = target)
  
  feature_tibble <- enframe(features) %>%
    rename(feature_index = name, 
           feature_name = value)
  
  feature_tibble.target <- feature_tibble %>%
    mutate(target_var = target)
  
  feature_tibble.feature <- as_tibble( gtools::combinations(n_features, 
                                                       2, 
                                                       v=features, 
                                                       set=FALSE, 
                                                       repeats.allowed=FALSE),
                                       .name_repair = "universal") %>%
    rename(feature1 ='...1',
           feature2 ='...2')
                                       
  # for every feature in the list I need a summary table

  featre_summary <- purrr::map_dfr(features, 
                                   feature_sum_function,
                                   data = data,
                                   id = id)

  
  # now I need a summary table for every combination
  combination_tibble <- feature_tibble.target %>%
    mutate(feature1 = feature_name, 
           feature2 = target_var) %>%
    select(feature1, feature2) %>% 
    bind_rows(feature_tibble.feature) %>%
    mutate(rn = row_number())
  
  two_feature_sum <- purrr::map_dfr(combination_tibble$rn,
                                    two_feature_sum_function,
                                    data = data,
                                    id = id,
                                    combination.tibble = combination_tibble)
 
  # Now work with output tables
  
  map_final_output <- function(feature){
    #print(feature)
    
    feature_summary.f1 <- featre_summary %>%
      filter(feature == !!syms(feature))
    
    two_featue_sum.f1 <- two_feature_sum %>%
      filter(feature1 == !!syms(feature))
    
    two_featue_sum.f2 <- two_feature_sum %>%
      filter(feature2 == !!syms(feature))
    
    if(nrow(two_featue_sum.f1) > 0){
      join1 <- feature_summary.f1 %>%
        left_join(two_featue_sum.f1, 
                  by=c('factor'='factor1')) %>%
        mutate(diff = cnt.x - cnt.y) %>%
        mutate(source = factor,
               target = factor2) %>%
        select(source, target, diff)
    }
    
    if(nrow(two_featue_sum.f2) > 0 ){
      join2 <- feature_summary.f1 %>%
        left_join(two_featue_sum.f2, 
                  by=c('factor'='factor2')) %>%
        mutate(diff = cnt.x - cnt.y) %>%
        mutate(source = factor,
               target = factor1) %>%
        select(source, target, diff)
    }
    if(nrow(two_featue_sum.f2) <= 0){
      join2 <- tibble(source = NA,
                      target = NA,
                      diff = NA)
    }
    
    join1 <- join1 %>%
      mutate(source = as.character(source), 
             target = as.character(target))
    
    join2 <- join2 %>%
      mutate(source = as.character(source), 
             target = as.character(target))
    
    join12 <- bind_rows(join1,join2)
    
    two_featue_sum.f1.target <- two_featue_sum.f1 %>%
      left_join(by.target, 
                by =c('factor2'='target_fact')) %>%
      mutate(diff = cnt.y - cnt.x) %>%
      filter(diff >= 0) %>%
      mutate(source = factor1) %>%
      mutate(target = factor2) %>%
      select(source, target, diff) %>%
      mutate(source = as.character(source), 
             target = as.character(target))
    
    join3 <- bind_rows(join12,
                       two_featue_sum.f1.target) %>%
      filter(diff > 0)
    
    return(join3)
    
  }
  
  FINAL <- purrr::map_dfr(features,
                      map_final_output)
  
  
  OUT <- list(
    by.target = by.target,
    feature_tibble = feature_tibble,
    feature_tibble.target = feature_tibble.target,
    feature_tibble.feature = feature_tibble.feature,
    featre_summary =featre_summary,
    combination_tibble = combination_tibble,
    two_feature_sum = two_feature_sum
    )
  
  return(FINAL)
  
}
# riagendr 
# ridreth1                   
# dmdeduc2 
# dmdmartl        
# indhhin2 

t <- make_sankey_data_from_features(data = diab_pop,
                               id = 'seqn',
                               target = 'diq010',
                               features = c('riagendr',
                                            'ridreth1',
                                            'dmdeduc2',
                                            'dmdmartl'))

t
make_sankey_graph_from_features <- function(data,
                                            id,
                                            target,
                                            features){
  
library(networkD3)

  t <- make_sankey_data_from_features(data = data,
                                      id = id,
                                      target = target,
                                      features = features)  
  
nodes <- data.frame(
  name=c( as.character(t$source),
          as.character(t$target)) 
  %>% unique()
)

t$IDsource <- match(t$source, nodes$name)-1 
t$IDtarget <- match(t$target, nodes$name)-1

p <- sankeyNetwork(Links = t, 
                   Nodes = nodes,
                   Source = "IDsource", 
                   Target = "IDtarget",
                   Value = "diff", 
                   NodeID = "name", 
                   sinksRight=FALSE)
return(p)
}
# riagendr 
# ridreth1                   
# dmdeduc2 
# dmdmartl        
# indhhin2 

make_sankey_graph_from_features(data = diab_pop,
                                id = 'seqn',
                                target = 'diq010',
                                features = c('riagendr','ridreth1'))


make_sankey_graph_from_features(data = diab_pop,
                                id = 'seqn',
                                target = 'diq010',
                                features = c('riagendr','dmdeduc2'))

make_sankey_graph_from_features(data = diab_pop,
                                id = 'seqn',
                                target = 'diq010',
                                features = c('riagendr','ridreth1','dmdeduc2','dmdmartl','indhhin2'))