\(~\)
\(~\)
\(~\)
Load Sample Data
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## v ggplot2 3.3.3 v purrr 0.3.4
## v tibble 3.0.4 v dplyr 0.8.5
## v tidyr 1.0.2 v stringr 1.4.0
## v readr 1.3.1 v forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
diab_pop_org <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
# riagendr
# ridreth1
# dmdeduc2
# dmdmartl
# indhhin2
diab_pop %>%
group_by(riagendr) %>%
summarise(cnt = n_distinct(seqn))
## # A tibble: 2 x 2
## riagendr cnt
## <fct> <int>
## 1 Male 2747
## 2 Female 2972
diab_pop %>%
group_by(ridreth1) %>%
summarise(cnt = n_distinct(seqn))
## # A tibble: 5 x 2
## ridreth1 cnt
## <fct> <int>
## 1 MexicanAmerican 995
## 2 Other Hispanic 768
## 3 Non-Hispanic White 1863
## 4 Non-Hispanic Black 1198
## 5 Other 895
diab_pop %>%
group_by(dmdeduc2) %>%
summarise(cnt = n_distinct(seqn))
## Warning: Factor `dmdeduc2` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 6 x 2
## dmdeduc2 cnt
## <fct> <int>
## 1 Less than 9th grade 688
## 2 Grades 9-11th 676
## 3 High school graduate/GED 1236
## 4 Some college or AA degrees 1692
## 5 College grad or above 1422
## 6 <NA> 5
diab_pop %>%
group_by(dmdmartl) %>%
summarise(cnt = n_distinct(seqn))
## Warning: Factor `dmdmartl` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 7 x 2
## dmdmartl cnt
## <fct> <int>
## 1 Married 2886
## 2 Widowed 421
## 3 Divorced 614
## 4 Separated 192
## 5 Never married 1048
## 6 Living with partner 555
## 7 <NA> 3
diab_pop %>%
group_by(indhhin2) %>%
summarise(cnt = n_distinct(seqn))
## Warning: Factor `indhhin2` contains implicit NA, consider using
## `forcats::fct_explicit_na`
## # A tibble: 13 x 2
## indhhin2 cnt
## <fct> <int>
## 1 $0-$4,999 122
## 2 $5,000-$9,999 211
## 3 $10,000-$14,999 347
## 4 $15,000-$19,999 348
## 5 $20,000-$24,999 350
## 6 $25,000-$34,999 596
## 7 $45,000-$54,999 451
## 8 $65,000-$74,999 282
## 9 20,000+ 195
## 10 less than $20,000 85
## 11 $75,000-$99,999 524
## 12 $100,000+ 910
## 13 <NA> 1298
diab_pop <- diab_pop %>%
mutate_if(is.factor,
fct_explicit_na)
\(~\)
\(~\)
Helper function 1 - feature_sum_function
feature_sum_function <- function(data,
id,
feature){
result <- data %>%
group_by(!!! syms(feature)) %>%
summarise(cnt = n_distinct(!!! syms(id))) %>%
rename(factor = all_of(feature)) %>%
mutate(feature = as.character(feature)) %>%
mutate(factor = as.character(factor))
return(result)
}
Test function
feature_sum_function(data = diab_pop,
id = 'seqn',
feature = 'riagendr')
## # A tibble: 2 x 3
## factor cnt feature
## <chr> <int> <chr>
## 1 Male 2747 riagendr
## 2 Female 2972 riagendr
feature_sum_function(data = diab_pop,
id = 'seqn',
feature = 'ridreth1')
## # A tibble: 5 x 3
## factor cnt feature
## <chr> <int> <chr>
## 1 MexicanAmerican 995 ridreth1
## 2 Other Hispanic 768 ridreth1
## 3 Non-Hispanic White 1863 ridreth1
## 4 Non-Hispanic Black 1198 ridreth1
## 5 Other 895 ridreth1
purrr::map_dfr(c('riagendr','ridreth1'),
feature_sum_function,
data = diab_pop,
id = 'seqn')
## # A tibble: 7 x 3
## factor cnt feature
## <chr> <int> <chr>
## 1 Male 2747 riagendr
## 2 Female 2972 riagendr
## 3 MexicanAmerican 995 ridreth1
## 4 Other Hispanic 768 ridreth1
## 5 Non-Hispanic White 1863 ridreth1
## 6 Non-Hispanic Black 1198 ridreth1
## 7 Other 895 ridreth1
\(~\)
\(~\)
Helper function 2 - two_feature_sum_function
Helper function inner - two_feature_sum_function_inner
two_feature_sum_function_inner <- function(data,
id,
feature1,
feature2){
result <- data %>%
group_by(!!! syms(feature1), !!! syms(feature2)) %>%
summarise(cnt = n_distinct(!!! syms(id))) %>%
ungroup() %>%
rename(factor1 = all_of(feature1),
factor2 = all_of(feature2)) %>%
mutate(feature1 = feature1) %>%
mutate(feature2 = feature2) %>%
mutate(factor1 = as.character(factor1),
factor2 = as.character(factor2))
return(result)
}
Test
two_feature_sum_function_inner(data = diab_pop,
id = 'seqn',
feature1 = 'riagendr',
feature2 = 'ridreth1')
## # A tibble: 10 x 5
## factor1 factor2 cnt feature1 feature2
## <chr> <chr> <int> <chr> <chr>
## 1 Male MexicanAmerican 459 riagendr ridreth1
## 2 Male Other Hispanic 334 riagendr ridreth1
## 3 Male Non-Hispanic White 946 riagendr ridreth1
## 4 Male Non-Hispanic Black 563 riagendr ridreth1
## 5 Male Other 445 riagendr ridreth1
## 6 Female MexicanAmerican 536 riagendr ridreth1
## 7 Female Other Hispanic 434 riagendr ridreth1
## 8 Female Non-Hispanic White 917 riagendr ridreth1
## 9 Female Non-Hispanic Black 635 riagendr ridreth1
## 10 Female Other 450 riagendr ridreth1
Main
two_feature_sum_function <- function(data,
id,
combination.tibble,
record){
current_rec_combo <- combination.tibble %>%
filter(rn == record)
feature1 <- current_rec_combo$feature1
feature2 <- current_rec_combo$feature2
result <- two_feature_sum_function_inner(data = data,
id = id,
feature1 = feature1,
feature2 = feature2)
return(result)
}
Test
t.combination_tibble <- tibble(feature1 = "riagendr",
feature2 = "ridreth1",
rn =1)
t.combination_tibble
## # A tibble: 1 x 3
## feature1 feature2 rn
## <chr> <chr> <dbl>
## 1 riagendr ridreth1 1
two_feature_sum_function(data = diab_pop,
id = 'seqn',
combination.tibble = t.combination_tibble,
record =1)
## # A tibble: 10 x 5
## factor1 factor2 cnt feature1 feature2
## <chr> <chr> <int> <chr> <chr>
## 1 Male MexicanAmerican 459 riagendr ridreth1
## 2 Male Other Hispanic 334 riagendr ridreth1
## 3 Male Non-Hispanic White 946 riagendr ridreth1
## 4 Male Non-Hispanic Black 563 riagendr ridreth1
## 5 Male Other 445 riagendr ridreth1
## 6 Female MexicanAmerican 536 riagendr ridreth1
## 7 Female Other Hispanic 434 riagendr ridreth1
## 8 Female Non-Hispanic White 917 riagendr ridreth1
## 9 Female Non-Hispanic Black 635 riagendr ridreth1
## 10 Female Other 450 riagendr ridreth1
purrr::map_dfr(t.combination_tibble$rn,
two_feature_sum_function,
data = diab_pop,
id = 'seqn',
combination.tibble = t.combination_tibble)
## # A tibble: 10 x 5
## factor1 factor2 cnt feature1 feature2
## <chr> <chr> <int> <chr> <chr>
## 1 Male MexicanAmerican 459 riagendr ridreth1
## 2 Male Other Hispanic 334 riagendr ridreth1
## 3 Male Non-Hispanic White 946 riagendr ridreth1
## 4 Male Non-Hispanic Black 563 riagendr ridreth1
## 5 Male Other 445 riagendr ridreth1
## 6 Female MexicanAmerican 536 riagendr ridreth1
## 7 Female Other Hispanic 434 riagendr ridreth1
## 8 Female Non-Hispanic White 917 riagendr ridreth1
## 9 Female Non-Hispanic Black 635 riagendr ridreth1
## 10 Female Other 450 riagendr ridreth1
\(~\)
\(~\)
\(~\)
Main Function - make_sankey_data_function
make_sankey_data_from_features <- function(data,
id,
target,
features){
n_features <- length(features)
if (n_features < 2){
print('Error please provide at least 2 features')
return(NULL)
}
by.target <- data %>%
group_by(!!! syms(target)) %>%
summarise(cnt = n_distinct(!!! syms(id))) %>%
rename(target_fact = target) %>%
mutate(target = target)
feature_tibble <- enframe(features) %>%
rename(feature_index = name,
feature_name = value)
feature_tibble.target <- feature_tibble %>%
mutate(target_var = target)
feature_tibble.feature <- as_tibble( gtools::combinations(n_features,
2,
v=features,
set=FALSE,
repeats.allowed=FALSE),
.name_repair = "universal") %>%
rename(feature1 ='...1',
feature2 ='...2')
# for every feature in the list I need a summary table
featre_summary <- purrr::map_dfr(features,
feature_sum_function,
data = data,
id = id)
# now I need a summary table for every combination
combination_tibble <- feature_tibble.target %>%
mutate(feature1 = feature_name,
feature2 = target_var) %>%
select(feature1, feature2) %>%
bind_rows(feature_tibble.feature) %>%
mutate(rn = row_number())
two_feature_sum <- purrr::map_dfr(combination_tibble$rn,
two_feature_sum_function,
data = data,
id = id,
combination.tibble = combination_tibble)
# Now work with output tables
map_final_output <- function(feature){
#print(feature)
feature_summary.f1 <- featre_summary %>%
filter(feature == !!syms(feature))
two_featue_sum.f1 <- two_feature_sum %>%
filter(feature1 == !!syms(feature))
two_featue_sum.f2 <- two_feature_sum %>%
filter(feature2 == !!syms(feature))
if(nrow(two_featue_sum.f1) > 0){
join1 <- feature_summary.f1 %>%
left_join(two_featue_sum.f1,
by=c('factor'='factor1')) %>%
mutate(diff = cnt.x - cnt.y) %>%
mutate(source = factor,
target = factor2) %>%
select(source, target, diff)
}
if(nrow(two_featue_sum.f2) > 0 ){
join2 <- feature_summary.f1 %>%
left_join(two_featue_sum.f2,
by=c('factor'='factor2')) %>%
mutate(diff = cnt.x - cnt.y) %>%
mutate(source = factor,
target = factor1) %>%
select(source, target, diff)
}
if(nrow(two_featue_sum.f2) <= 0){
join2 <- tibble(source = NA,
target = NA,
diff = NA)
}
join1 <- join1 %>%
mutate(source = as.character(source),
target = as.character(target))
join2 <- join2 %>%
mutate(source = as.character(source),
target = as.character(target))
join12 <- bind_rows(join1,join2)
two_featue_sum.f1.target <- two_featue_sum.f1 %>%
left_join(by.target,
by =c('factor2'='target_fact')) %>%
mutate(diff = cnt.y - cnt.x) %>%
filter(diff >= 0) %>%
mutate(source = factor1) %>%
mutate(target = factor2) %>%
select(source, target, diff) %>%
mutate(source = as.character(source),
target = as.character(target))
join3 <- bind_rows(join12,
two_featue_sum.f1.target) %>%
filter(diff > 0)
return(join3)
}
FINAL <- purrr::map_dfr(features,
map_final_output)
OUT <- list(
by.target = by.target,
feature_tibble = feature_tibble,
feature_tibble.target = feature_tibble.target,
feature_tibble.feature = feature_tibble.feature,
featre_summary =featre_summary,
combination_tibble = combination_tibble,
two_feature_sum = two_feature_sum
)
return(FINAL)
}
Test
# riagendr
# ridreth1
# dmdeduc2
# dmdmartl
# indhhin2
t <- make_sankey_data_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr',
'ridreth1',
'dmdeduc2',
'dmdmartl'))
## Note: Using an external vector in selections is ambiguous.
## i Use `all_of(target)` instead of `target` to silence this message.
## i See <https://tidyselect.r-lib.org/reference/faq-external-vector.html>.
## This message is displayed once per session.
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
t
## # A tibble: 343 x 3
## source target diff
## <chr> <chr> <int>
## 1 Male Diabetes 2291
## 2 Male No Diabetes 456
## 3 Male MexicanAmerican 2288
## 4 Male Other Hispanic 2413
## 5 Male Non-Hispanic White 1801
## 6 Male Non-Hispanic Black 2184
## 7 Male Other 2302
## 8 Male Less than 9th grade 2422
## 9 Male Grades 9-11th 2375
## 10 Male High school graduate/GED 2123
## # ... with 333 more rows
Final Function
make_sankey_graph_from_features <- function(data,
id,
target,
features){
library(networkD3)
t <- make_sankey_data_from_features(data = data,
id = id,
target = target,
features = features)
nodes <- data.frame(
name=c( as.character(t$source),
as.character(t$target))
%>% unique()
)
t$IDsource <- match(t$source, nodes$name)-1
t$IDtarget <- match(t$target, nodes$name)-1
p <- sankeyNetwork(Links = t,
Nodes = nodes,
Source = "IDsource",
Target = "IDtarget",
Value = "diff",
NodeID = "name",
sinksRight=FALSE)
return(p)
}
Test
# riagendr
# ridreth1
# dmdeduc2
# dmdmartl
# indhhin2
make_sankey_graph_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr','ridreth1'))
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Links is a tbl_df. Converting to a plain data frame.
make_sankey_graph_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr','dmdeduc2'))
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Links is a tbl_df. Converting to a plain data frame.
make_sankey_graph_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr','ridreth1','dmdeduc2','dmdmartl','indhhin2'))
## New names:
## * `` -> ...1
## * `` -> ...2
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Warning: Column `factor2`/`target_fact` joining character vector and factor,
## coercing into character vector
## Links is a tbl_df. Converting to a plain data frame.
\(~\)
\(~\)
Code Appendix
\(~\)
\(~\)
library(tidyverse)
diab_pop_org <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
diab_pop <- readRDS('C:/Users/jkyle/Documents/GitHub/Intro_Jeff_Data_Science/DATA/diab_pop.RDS')
# riagendr
# ridreth1
# dmdeduc2
# dmdmartl
# indhhin2
diab_pop %>%
group_by(riagendr) %>%
summarise(cnt = n_distinct(seqn))
diab_pop %>%
group_by(ridreth1) %>%
summarise(cnt = n_distinct(seqn))
diab_pop %>%
group_by(dmdeduc2) %>%
summarise(cnt = n_distinct(seqn))
diab_pop %>%
group_by(dmdmartl) %>%
summarise(cnt = n_distinct(seqn))
diab_pop %>%
group_by(indhhin2) %>%
summarise(cnt = n_distinct(seqn))
diab_pop <- diab_pop %>%
mutate_if(is.factor,
fct_explicit_na)
feature_sum_function <- function(data,
id,
feature){
result <- data %>%
group_by(!!! syms(feature)) %>%
summarise(cnt = n_distinct(!!! syms(id))) %>%
rename(factor = all_of(feature)) %>%
mutate(feature = as.character(feature)) %>%
mutate(factor = as.character(factor))
return(result)
}
feature_sum_function(data = diab_pop,
id = 'seqn',
feature = 'riagendr')
feature_sum_function(data = diab_pop,
id = 'seqn',
feature = 'ridreth1')
purrr::map_dfr(c('riagendr','ridreth1'),
feature_sum_function,
data = diab_pop,
id = 'seqn')
two_feature_sum_function_inner <- function(data,
id,
feature1,
feature2){
result <- data %>%
group_by(!!! syms(feature1), !!! syms(feature2)) %>%
summarise(cnt = n_distinct(!!! syms(id))) %>%
ungroup() %>%
rename(factor1 = all_of(feature1),
factor2 = all_of(feature2)) %>%
mutate(feature1 = feature1) %>%
mutate(feature2 = feature2) %>%
mutate(factor1 = as.character(factor1),
factor2 = as.character(factor2))
return(result)
}
two_feature_sum_function_inner(data = diab_pop,
id = 'seqn',
feature1 = 'riagendr',
feature2 = 'ridreth1')
two_feature_sum_function <- function(data,
id,
combination.tibble,
record){
current_rec_combo <- combination.tibble %>%
filter(rn == record)
feature1 <- current_rec_combo$feature1
feature2 <- current_rec_combo$feature2
result <- two_feature_sum_function_inner(data = data,
id = id,
feature1 = feature1,
feature2 = feature2)
return(result)
}
t.combination_tibble <- tibble(feature1 = "riagendr",
feature2 = "ridreth1",
rn =1)
t.combination_tibble
two_feature_sum_function(data = diab_pop,
id = 'seqn',
combination.tibble = t.combination_tibble,
record =1)
purrr::map_dfr(t.combination_tibble$rn,
two_feature_sum_function,
data = diab_pop,
id = 'seqn',
combination.tibble = t.combination_tibble)
make_sankey_data_from_features <- function(data,
id,
target,
features){
n_features <- length(features)
if (n_features < 2){
print('Error please provide at least 2 features')
return(NULL)
}
by.target <- data %>%
group_by(!!! syms(target)) %>%
summarise(cnt = n_distinct(!!! syms(id))) %>%
rename(target_fact = target) %>%
mutate(target = target)
feature_tibble <- enframe(features) %>%
rename(feature_index = name,
feature_name = value)
feature_tibble.target <- feature_tibble %>%
mutate(target_var = target)
feature_tibble.feature <- as_tibble( gtools::combinations(n_features,
2,
v=features,
set=FALSE,
repeats.allowed=FALSE),
.name_repair = "universal") %>%
rename(feature1 ='...1',
feature2 ='...2')
# for every feature in the list I need a summary table
featre_summary <- purrr::map_dfr(features,
feature_sum_function,
data = data,
id = id)
# now I need a summary table for every combination
combination_tibble <- feature_tibble.target %>%
mutate(feature1 = feature_name,
feature2 = target_var) %>%
select(feature1, feature2) %>%
bind_rows(feature_tibble.feature) %>%
mutate(rn = row_number())
two_feature_sum <- purrr::map_dfr(combination_tibble$rn,
two_feature_sum_function,
data = data,
id = id,
combination.tibble = combination_tibble)
# Now work with output tables
map_final_output <- function(feature){
#print(feature)
feature_summary.f1 <- featre_summary %>%
filter(feature == !!syms(feature))
two_featue_sum.f1 <- two_feature_sum %>%
filter(feature1 == !!syms(feature))
two_featue_sum.f2 <- two_feature_sum %>%
filter(feature2 == !!syms(feature))
if(nrow(two_featue_sum.f1) > 0){
join1 <- feature_summary.f1 %>%
left_join(two_featue_sum.f1,
by=c('factor'='factor1')) %>%
mutate(diff = cnt.x - cnt.y) %>%
mutate(source = factor,
target = factor2) %>%
select(source, target, diff)
}
if(nrow(two_featue_sum.f2) > 0 ){
join2 <- feature_summary.f1 %>%
left_join(two_featue_sum.f2,
by=c('factor'='factor2')) %>%
mutate(diff = cnt.x - cnt.y) %>%
mutate(source = factor,
target = factor1) %>%
select(source, target, diff)
}
if(nrow(two_featue_sum.f2) <= 0){
join2 <- tibble(source = NA,
target = NA,
diff = NA)
}
join1 <- join1 %>%
mutate(source = as.character(source),
target = as.character(target))
join2 <- join2 %>%
mutate(source = as.character(source),
target = as.character(target))
join12 <- bind_rows(join1,join2)
two_featue_sum.f1.target <- two_featue_sum.f1 %>%
left_join(by.target,
by =c('factor2'='target_fact')) %>%
mutate(diff = cnt.y - cnt.x) %>%
filter(diff >= 0) %>%
mutate(source = factor1) %>%
mutate(target = factor2) %>%
select(source, target, diff) %>%
mutate(source = as.character(source),
target = as.character(target))
join3 <- bind_rows(join12,
two_featue_sum.f1.target) %>%
filter(diff > 0)
return(join3)
}
FINAL <- purrr::map_dfr(features,
map_final_output)
OUT <- list(
by.target = by.target,
feature_tibble = feature_tibble,
feature_tibble.target = feature_tibble.target,
feature_tibble.feature = feature_tibble.feature,
featre_summary =featre_summary,
combination_tibble = combination_tibble,
two_feature_sum = two_feature_sum
)
return(FINAL)
}
# riagendr
# ridreth1
# dmdeduc2
# dmdmartl
# indhhin2
t <- make_sankey_data_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr',
'ridreth1',
'dmdeduc2',
'dmdmartl'))
t
make_sankey_graph_from_features <- function(data,
id,
target,
features){
library(networkD3)
t <- make_sankey_data_from_features(data = data,
id = id,
target = target,
features = features)
nodes <- data.frame(
name=c( as.character(t$source),
as.character(t$target))
%>% unique()
)
t$IDsource <- match(t$source, nodes$name)-1
t$IDtarget <- match(t$target, nodes$name)-1
p <- sankeyNetwork(Links = t,
Nodes = nodes,
Source = "IDsource",
Target = "IDtarget",
Value = "diff",
NodeID = "name",
sinksRight=FALSE)
return(p)
}
# riagendr
# ridreth1
# dmdeduc2
# dmdmartl
# indhhin2
make_sankey_graph_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr','ridreth1'))
make_sankey_graph_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr','dmdeduc2'))
make_sankey_graph_from_features(data = diab_pop,
id = 'seqn',
target = 'diq010',
features = c('riagendr','ridreth1','dmdeduc2','dmdmartl','indhhin2'))