Init
library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.2 ✔ tibble 3.2.1
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.0.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
##
##
## Attaching package: 'magrittr'
##
##
## The following object is masked from 'package:purrr':
##
## set_names
##
##
## The following object is masked from 'package:tidyr':
##
## extract
##
##
## Loading required package: weights
##
## Loading required package: Hmisc
##
##
## Attaching package: 'Hmisc'
##
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
##
## Loading required package: assertthat
##
##
## Attaching package: 'assertthat'
##
##
## The following object is masked from 'package:tibble':
##
## has_name
##
##
## Loading required package: psych
##
##
## Attaching package: 'psych'
##
##
## The following object is masked from 'package:Hmisc':
##
## describe
##
##
## The following objects are masked from 'package:ggplot2':
##
## %+%, alpha
##
##
##
## Attaching package: 'kirkegaard'
##
##
## The following object is masked from 'package:psych':
##
## rescale
##
##
## The following object is masked from 'package:assertthat':
##
## are_equal
##
##
## The following object is masked from 'package:purrr':
##
## is_logical
##
##
## The following object is masked from 'package:base':
##
## +
load_packages(
googlesheets4,
ggeffects,
broom
)
theme_set(theme_bw())
options(
digits = 3
)
#multithreading
#library(future)
#plan(multisession(workers = 8))
Functions
#recode cancer types
recode_map <- c(
"Anus" = "Gastrointestinal Cancer",
"Bladder" = "Other Cancer Types", # Sometimes grouped with urinary tract cancers
"Bone and Joint" = "Bone Cancer",
"Brain and Other Nervous System" = "Brain Tumor",
"Breast" = "Breast Cancer",
"Cervix Uteri" = "Cervical Cancer",
"Colon and Rectum" = "Colon and Rectal Cancer",
"Esophagus" = "Gastrointestinal Cancer",
"Hodgkin Lymphoma" = "Blood Cancer", # Could also map to Lymphoma specifically
"Kidney and Renal Pelvis" = "Kidney Cancer",
"Larynx" = "Head and Neck Cancer",
"Leukemia" = "Leukemia/Leukaemia",
"Liver and Intrahepatic Bile Duct" = "Liver Cancer",
"Lung and Bronchus" = "Lung Cancer",
"Melanoma of the Skin" = "Melanoma",
"Myeloma" = "Blood Cancer", # Often considered a type of blood cancer
"Non-Hodgkin Lymphoma" = "Non-Hodgkin's Lymphoma",
"Oral Cavity and Pharynx" = "Head and Neck Cancer",
"Ovary" = "Ovarian Cancer",
"Pancreas" = "Pancreatic Cancer",
"Prostate" = "Prostate Cancer",
"Small Intestine" = "Gastrointestinal Cancer",
"Stomach" = "Gastrointestinal Cancer",
"Testis" = "Other Cancer Types", # Could be its own category or under reproductive cancers
"Thyroid" = "Thyroid Cancer",
"Uterus" = "Other Cancer Types", # Could also be grouped under gynecologic cancers
"Vulva" = "Other Cancer Types" # Often grouped under gynecologic cancers
)
Data
#cancer data
gs4_deauth()
d_cancer = read_sheet(
"https://docs.google.com/spreadsheets/d/1dLFZ7CJNbA8mqzkpZiOuxtWYRQoIaJkH49I9tuSCVfI/edit?gid=0#gid=0",
sheet = 1
) %>% df_legalize_names()
## ✔ Reading from "Cancer types, data, funding".
## ✔ Range ''cancer data''.
d_funding = read_sheet(
"https://docs.google.com/spreadsheets/d/1dLFZ7CJNbA8mqzkpZiOuxtWYRQoIaJkH49I9tuSCVfI/edit?gid=0#gid=0",
sheet = 2
) %>% df_legalize_names()
## ✔ Reading from "Cancer types, data, funding".
## ✔ Range ''funding''.
#fix list columns into numeric
d_cancer %<>% map_df(
function(x) {
if (is.list(x)) {
x = as.numeric(x)
}
return(x)
}
)
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
#recode cancer types
d_cancer %<>% mutate(
type2 = recode(Type, !!!recode_map)
)
#summarize by type2
d_cancer2 = d_cancer %>%
filter(sex == "both") %>%
group_by(type2) %>%
summarise(
n = n(),
new_cases_per_year = sum(new_cases_per_year, na.rm = TRUE),
deaths_per_year = sum(deaths, na.rm = TRUE),
)
#compute male% of new cases and deaths
d_cancer_sex = d_cancer %>%
filter(sex != "both") %>%
#impute 0s for new cases, deaths
mutate(
new_cases_per_year = ifelse(is.na(new_cases_per_year), 0, new_cases_per_year),
deaths = ifelse(is.na(deaths), 0, deaths)
) %>%
#recode into type2
mutate(
type2 = recode(Type, !!!recode_map)
) %>%
group_by(type2, sex) %>%
summarise(
new_cases_per_year = sum(new_cases_per_year, na.rm = TRUE),
deaths_per_year = sum(deaths, na.rm = TRUE),
) %>%
ungroup() %>%
group_by(type2) %>%
summarise(
new_cases_male_pct = new_cases_per_year[1] / sum(new_cases_per_year, na.rm = TRUE),
deaths_male_pct = deaths_per_year[1] / sum(deaths_per_year, na.rm = TRUE)
)
## `summarise()` has grouped output by 'type2'. You can override using the
## `.groups` argument.
#join by type 2
d = full_join(
d_cancer2,
d_funding,
by = c("type2" = "Cancer_Type")
) %>%
#join with sex prevalence
full_join(
d_cancer_sex
)
## Joining with `by = join_by(type2)`
Analysis
d %>%
select(
-type2,
) %>%
GG_heatmap(
cross_out_nonsig = T,
p_sig = 0.05
) +
labs(
title = "Cancer funding, cases, deaths, sex bias correlations"
)

GG_save("figs/heatmap.png")
d %>%
select(
-type2,
) %>%
cor_matrix(
p_val = T,
asterisks_only = F
)
## n new_cases_per_year deaths_per_year
## n "1" "0.03 [0.901]" "-0.01 [0.983]"
## new_cases_per_year "0.03 [0.901]" "1" "0.58 [0.01*]"
## deaths_per_year "-0.01 [0.983]" "0.58 [0.01*]" "1"
## Number_of_Grants "0.01 [0.959]" "0.70 [<0.001***]" "0.52 [0.024]"
## Funded_Amount "-0.02 [0.949]" "0.77 [<0.001***]" "0.56 [0.013]"
## Specific_Amount "-0.12 [0.623]" "0.72 [<0.001***]" "0.52 [0.022]"
## new_cases_male_pct "0.10 [0.679]" "0.12 [0.637]" "0.09 [0.705]"
## deaths_male_pct "0.10 [0.672]" "0.10 [0.696]" "0.07 [0.79]"
## Number_of_Grants Funded_Amount Specific_Amount
## n "0.01 [0.959]" "-0.02 [0.949]" "-0.12 [0.623]"
## new_cases_per_year "0.70 [<0.001***]" "0.77 [<0.001***]" "0.72 [<0.001***]"
## deaths_per_year "0.52 [0.024]" "0.56 [0.013]" "0.52 [0.022]"
## Number_of_Grants "1" "0.97 [<0.001***]" "0.98 [<0.001***]"
## Funded_Amount "0.97 [<0.001***]" "1" "0.97 [<0.001***]"
## Specific_Amount "0.98 [<0.001***]" "0.97 [<0.001***]" "1"
## new_cases_male_pct "-0.33 [0.163]" "-0.33 [0.166]" "-0.32 [0.178]"
## deaths_male_pct "-0.40 [0.094]" "-0.39 [0.097]" "-0.37 [0.119]"
## new_cases_male_pct deaths_male_pct
## n "0.10 [0.679]" "0.10 [0.672]"
## new_cases_per_year "0.12 [0.637]" "0.10 [0.696]"
## deaths_per_year "0.09 [0.705]" "0.07 [0.79]"
## Number_of_Grants "-0.33 [0.163]" "-0.40 [0.094]"
## Funded_Amount "-0.33 [0.166]" "-0.39 [0.097]"
## Specific_Amount "-0.32 [0.178]" "-0.37 [0.119]"
## new_cases_male_pct "1" "0.98 [<0.001***]"
## deaths_male_pct "0.98 [<0.001***]" "1"
d %>%
GG_scatter(
"new_cases_per_year",
"Funded_Amount",
case_names = "type2"
) +
labs(
title = "Cancer funding vs new cases per year",
x = "New cases per year",
y = "Funding amount"
)
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter_cases_funding.png")
## `geom_smooth()` using formula = 'y ~ x'
d %>%
GG_scatter(
"new_cases_male_pct",
"Funded_Amount",
case_names = "type2"
) +
labs(
title = "Cancer funding vs. male % of new cases",
x = "Male % of new cases",
y = "Funding amount"
)
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter_male_pct_funding.png")
## `geom_smooth()` using formula = 'y ~ x'
#regression
model_1 = lm(
Funded_Amount ~ new_cases_per_year * new_cases_male_pct,
data = d
)
model_1 %>%
summary()
##
## Call:
## lm(formula = Funded_Amount ~ new_cases_per_year * new_cases_male_pct,
## data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -23124915 -4849742 714945 7596443 22665955
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.71e+07 9.26e+06 1.84 0.0850 .
## new_cases_per_year 3.87e+02 5.07e+01 7.63 1.5e-06 ***
## new_cases_male_pct -1.28e+07 1.80e+07 -0.71 0.4897
## new_cases_per_year:new_cases_male_pct -2.60e+02 8.66e+01 -3.01 0.0088 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13200000 on 15 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.853, Adjusted R-squared: 0.824
## F-statistic: 29 on 3 and 15 DF, p-value: 1.72e-06
model_1 %>% tidy() %>%
{
.$estimate[2] / (.$estimate[2] + .$estimate[4])
}
## [1] 3.06
model_1 %>%
ggeffects::ggpredict(
terms = c("new_cases_per_year", "new_cases_male_pct[0, 0.5, 1]")
) %>%
plot(
show_data = T
)
## Data points may overlap. Use the `jitter` argument to add some amount of
## random variation to the location of data points and avoid overplotting.

GG_save("figs/model_1_cases.png")
#using deaths
model_2 = lm(
Funded_Amount ~ deaths_per_year * deaths_male_pct,
data = d
)
model_2 %>%
summary()
##
## Call:
## lm(formula = Funded_Amount ~ deaths_per_year * deaths_male_pct,
## data = d)
##
## Residuals:
## Min 1Q Median 3Q Max
## -21734581 -15492470 -4102264 4817719 44924312
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 7188300 17454038 0.41 0.686
## deaths_per_year 2353 629 3.74 0.002 **
## deaths_male_pct 26581131 33886666 0.78 0.445
## deaths_per_year:deaths_male_pct -3433 1229 -2.79 0.014 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 19900000 on 15 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.668, Adjusted R-squared: 0.601
## F-statistic: 10.1 on 3 and 15 DF, p-value: 0.000702
model_2 %>% tidy() %>%
{
.$estimate[2] / (.$estimate[2] + .$estimate[4])
}
## [1] -2.18
model_2 %>%
ggeffects::ggpredict(
terms = c("deaths_per_year", "deaths_male_pct[0, 0.5, 1]")
) %>%
plot(
show_data = T
)
## Data points may overlap. Use the `jitter` argument to add some amount of
## random variation to the location of data points and avoid overplotting.

GG_save("figs/model_2_deaths.png")
#cases, but without prostate cancer
model_3 = lm(
Funded_Amount ~ new_cases_per_year * new_cases_male_pct,
data = d %>% filter(
type2 != "Prostate Cancer")
)
model_3 %>%
summary()
##
## Call:
## lm(formula = Funded_Amount ~ new_cases_per_year * new_cases_male_pct,
## data = d %>% filter(type2 != "Prostate Cancer"))
##
## Residuals:
## Min 1Q Median 3Q Max
## -22338283 -5840787 87785 8135523 19226533
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 1.71e+07 9.40e+06 1.82 0.09 .
## new_cases_per_year 3.74e+02 5.43e+01 6.89 7.5e-06 ***
## new_cases_male_pct -1.81e+07 1.96e+07 -0.92 0.37
## new_cases_per_year:new_cases_male_pct -1.61e+02 1.60e+02 -1.00 0.33
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 13400000 on 14 degrees of freedom
## (3 observations deleted due to missingness)
## Multiple R-squared: 0.858, Adjusted R-squared: 0.828
## F-statistic: 28.3 on 3 and 14 DF, p-value: 3.34e-06
model_3 %>%
ggeffects::ggpredict(
terms = c("new_cases_per_year", "new_cases_male_pct[0, 0.5, 1]")
) %>%
plot(
show_data = T
)
## Data points may overlap. Use the `jitter` argument to add some amount of
## random variation to the location of data points and avoid overplotting.
