Init

library(kirkegaard)
## Loading required package: tidyverse
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.2     ✔ tibble    3.2.1
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.0.4     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
## Loading required package: magrittr
## 
## 
## Attaching package: 'magrittr'
## 
## 
## The following object is masked from 'package:purrr':
## 
##     set_names
## 
## 
## The following object is masked from 'package:tidyr':
## 
##     extract
## 
## 
## Loading required package: weights
## 
## Loading required package: Hmisc
## 
## 
## Attaching package: 'Hmisc'
## 
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## 
## Loading required package: assertthat
## 
## 
## Attaching package: 'assertthat'
## 
## 
## The following object is masked from 'package:tibble':
## 
##     has_name
## 
## 
## Loading required package: psych
## 
## 
## Attaching package: 'psych'
## 
## 
## The following object is masked from 'package:Hmisc':
## 
##     describe
## 
## 
## The following objects are masked from 'package:ggplot2':
## 
##     %+%, alpha
## 
## 
## 
## Attaching package: 'kirkegaard'
## 
## 
## The following object is masked from 'package:psych':
## 
##     rescale
## 
## 
## The following object is masked from 'package:assertthat':
## 
##     are_equal
## 
## 
## The following object is masked from 'package:purrr':
## 
##     is_logical
## 
## 
## The following object is masked from 'package:base':
## 
##     +
load_packages(
  googlesheets4,
  ggeffects,
  broom
)

theme_set(theme_bw())

options(
    digits = 3
)

#multithreading
#library(future)
#plan(multisession(workers = 8))

Functions

#recode cancer types
recode_map <- c(
  "Anus" = "Gastrointestinal Cancer",
  "Bladder" = "Other Cancer Types",  # Sometimes grouped with urinary tract cancers
  "Bone and Joint" = "Bone Cancer",
  "Brain and Other Nervous System" = "Brain Tumor",
  "Breast" = "Breast Cancer",
  "Cervix Uteri" = "Cervical Cancer",
  "Colon and Rectum" = "Colon and Rectal Cancer",
  "Esophagus" = "Gastrointestinal Cancer",
  "Hodgkin Lymphoma" = "Blood Cancer",  # Could also map to Lymphoma specifically
  "Kidney and Renal Pelvis" = "Kidney Cancer",
  "Larynx" = "Head and Neck Cancer",
  "Leukemia" = "Leukemia/Leukaemia",
  "Liver and Intrahepatic Bile Duct" = "Liver Cancer",
  "Lung and Bronchus" = "Lung Cancer",
  "Melanoma of the Skin" = "Melanoma",
  "Myeloma" = "Blood Cancer",  # Often considered a type of blood cancer
  "Non-Hodgkin Lymphoma" = "Non-Hodgkin's Lymphoma",
  "Oral Cavity and Pharynx" = "Head and Neck Cancer",
  "Ovary" = "Ovarian Cancer",
  "Pancreas" = "Pancreatic Cancer",
  "Prostate" = "Prostate Cancer",
  "Small Intestine" = "Gastrointestinal Cancer",
  "Stomach" = "Gastrointestinal Cancer",
  "Testis" = "Other Cancer Types",  # Could be its own category or under reproductive cancers
  "Thyroid" = "Thyroid Cancer",
  "Uterus" = "Other Cancer Types",  # Could also be grouped under gynecologic cancers
  "Vulva" = "Other Cancer Types"   # Often grouped under gynecologic cancers
)

Data

#cancer data
gs4_deauth()
d_cancer = read_sheet(
  "https://docs.google.com/spreadsheets/d/1dLFZ7CJNbA8mqzkpZiOuxtWYRQoIaJkH49I9tuSCVfI/edit?gid=0#gid=0",
  sheet = 1
) %>% df_legalize_names()
## ✔ Reading from "Cancer types, data, funding".
## ✔ Range ''cancer data''.
d_funding = read_sheet(
  "https://docs.google.com/spreadsheets/d/1dLFZ7CJNbA8mqzkpZiOuxtWYRQoIaJkH49I9tuSCVfI/edit?gid=0#gid=0",
  sheet = 2
) %>% df_legalize_names()
## ✔ Reading from "Cancer types, data, funding".
## ✔ Range ''funding''.
#fix list columns into numeric
d_cancer %<>% map_df(
  function(x) {
    if (is.list(x)) {
      x = as.numeric(x)
    }
    return(x)
  }
)
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
## Warning in .f(.x[[i]], ...): NAs introduced by coercion
#recode cancer types
d_cancer %<>% mutate(
  type2 = recode(Type, !!!recode_map)
)

#summarize by type2
d_cancer2 = d_cancer %>% 
  filter(sex == "both") %>% 
  group_by(type2) %>% 
  summarise(
    n = n(),
    
    new_cases_per_year = sum(new_cases_per_year, na.rm = TRUE),
    deaths_per_year = sum(deaths, na.rm = TRUE),
  )

#compute male% of new cases and deaths
d_cancer_sex = d_cancer %>% 
  filter(sex != "both") %>% 
  #impute 0s for new cases, deaths
  mutate(
    new_cases_per_year = ifelse(is.na(new_cases_per_year), 0, new_cases_per_year),
    deaths = ifelse(is.na(deaths), 0, deaths)
  ) %>%
  #recode into type2
    mutate(
    type2 = recode(Type, !!!recode_map)
  ) %>%
  group_by(type2, sex) %>%
    summarise(
    new_cases_per_year = sum(new_cases_per_year, na.rm = TRUE),
    deaths_per_year = sum(deaths, na.rm = TRUE),
  ) %>% 
  ungroup() %>% 
  group_by(type2) %>%
  summarise(
    new_cases_male_pct = new_cases_per_year[1] / sum(new_cases_per_year, na.rm = TRUE),
    deaths_male_pct = deaths_per_year[1] / sum(deaths_per_year, na.rm = TRUE)
  )
## `summarise()` has grouped output by 'type2'. You can override using the
## `.groups` argument.
#join by type 2
d = full_join(
  d_cancer2,
  d_funding,
  by = c("type2" = "Cancer_Type")
) %>% 
  #join with sex prevalence
  full_join(
    d_cancer_sex
  )
## Joining with `by = join_by(type2)`

Analysis

d %>% 
  select(
    -type2,
  ) %>% 
  GG_heatmap(
    cross_out_nonsig = T,
    p_sig = 0.05
  ) +
  labs(
    title = "Cancer funding, cases, deaths, sex bias correlations"
  )

GG_save("figs/heatmap.png")

d %>% 
  select(
    -type2,
  ) %>% 
  cor_matrix(
    p_val = T,
    asterisks_only = F
  )
##                    n               new_cases_per_year deaths_per_year
## n                  "1"             "0.03 [0.901]"     "-0.01 [0.983]"
## new_cases_per_year "0.03 [0.901]"  "1"                "0.58 [0.01*]" 
## deaths_per_year    "-0.01 [0.983]" "0.58 [0.01*]"     "1"            
## Number_of_Grants   "0.01 [0.959]"  "0.70 [<0.001***]" "0.52 [0.024]" 
## Funded_Amount      "-0.02 [0.949]" "0.77 [<0.001***]" "0.56 [0.013]" 
## Specific_Amount    "-0.12 [0.623]" "0.72 [<0.001***]" "0.52 [0.022]" 
## new_cases_male_pct "0.10 [0.679]"  "0.12 [0.637]"     "0.09 [0.705]" 
## deaths_male_pct    "0.10 [0.672]"  "0.10 [0.696]"     "0.07 [0.79]"  
##                    Number_of_Grants   Funded_Amount      Specific_Amount   
## n                  "0.01 [0.959]"     "-0.02 [0.949]"    "-0.12 [0.623]"   
## new_cases_per_year "0.70 [<0.001***]" "0.77 [<0.001***]" "0.72 [<0.001***]"
## deaths_per_year    "0.52 [0.024]"     "0.56 [0.013]"     "0.52 [0.022]"    
## Number_of_Grants   "1"                "0.97 [<0.001***]" "0.98 [<0.001***]"
## Funded_Amount      "0.97 [<0.001***]" "1"                "0.97 [<0.001***]"
## Specific_Amount    "0.98 [<0.001***]" "0.97 [<0.001***]" "1"               
## new_cases_male_pct "-0.33 [0.163]"    "-0.33 [0.166]"    "-0.32 [0.178]"   
## deaths_male_pct    "-0.40 [0.094]"    "-0.39 [0.097]"    "-0.37 [0.119]"   
##                    new_cases_male_pct deaths_male_pct   
## n                  "0.10 [0.679]"     "0.10 [0.672]"    
## new_cases_per_year "0.12 [0.637]"     "0.10 [0.696]"    
## deaths_per_year    "0.09 [0.705]"     "0.07 [0.79]"     
## Number_of_Grants   "-0.33 [0.163]"    "-0.40 [0.094]"   
## Funded_Amount      "-0.33 [0.166]"    "-0.39 [0.097]"   
## Specific_Amount    "-0.32 [0.178]"    "-0.37 [0.119]"   
## new_cases_male_pct "1"                "0.98 [<0.001***]"
## deaths_male_pct    "0.98 [<0.001***]" "1"
d %>% 
  GG_scatter(
    "new_cases_per_year",
    "Funded_Amount",
    case_names = "type2"
  ) +
  labs(
    title = "Cancer funding vs new cases per year",
    x = "New cases per year",
    y = "Funding amount"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter_cases_funding.png")
## `geom_smooth()` using formula = 'y ~ x'
d %>% 
  GG_scatter(
    "new_cases_male_pct",
    "Funded_Amount",
    case_names = "type2"
  ) +
  labs(
    title = "Cancer funding vs. male % of new cases",
    x = "Male % of new cases",
    y = "Funding amount"
  )
## `geom_smooth()` using formula = 'y ~ x'

GG_save("figs/scatter_male_pct_funding.png")
## `geom_smooth()` using formula = 'y ~ x'
#regression
model_1 = lm(
  Funded_Amount ~ new_cases_per_year * new_cases_male_pct,
  data = d
  )

model_1 %>% 
  summary()
## 
## Call:
## lm(formula = Funded_Amount ~ new_cases_per_year * new_cases_male_pct, 
##     data = d)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -23124915  -4849742    714945   7596443  22665955 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            1.71e+07   9.26e+06    1.84   0.0850 .  
## new_cases_per_year                     3.87e+02   5.07e+01    7.63  1.5e-06 ***
## new_cases_male_pct                    -1.28e+07   1.80e+07   -0.71   0.4897    
## new_cases_per_year:new_cases_male_pct -2.60e+02   8.66e+01   -3.01   0.0088 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13200000 on 15 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.853,  Adjusted R-squared:  0.824 
## F-statistic:   29 on 3 and 15 DF,  p-value: 1.72e-06
model_1 %>% tidy() %>% 
  {
    .$estimate[2] / (.$estimate[2] + .$estimate[4])
  }
## [1] 3.06
model_1 %>% 
  ggeffects::ggpredict(
    terms = c("new_cases_per_year", "new_cases_male_pct[0, 0.5, 1]")
  ) %>%
  plot(
    show_data = T
  )
## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

GG_save("figs/model_1_cases.png")

#using deaths
model_2 = lm(
  Funded_Amount ~ deaths_per_year * deaths_male_pct,
  data = d
  )

model_2 %>%
  summary()
## 
## Call:
## lm(formula = Funded_Amount ~ deaths_per_year * deaths_male_pct, 
##     data = d)
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -21734581 -15492470  -4102264   4817719  44924312 
## 
## Coefficients:
##                                 Estimate Std. Error t value Pr(>|t|)   
## (Intercept)                      7188300   17454038    0.41    0.686   
## deaths_per_year                     2353        629    3.74    0.002 **
## deaths_male_pct                 26581131   33886666    0.78    0.445   
## deaths_per_year:deaths_male_pct    -3433       1229   -2.79    0.014 * 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 19900000 on 15 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.668,  Adjusted R-squared:  0.601 
## F-statistic: 10.1 on 3 and 15 DF,  p-value: 0.000702
model_2 %>% tidy() %>% 
  {
    .$estimate[2] / (.$estimate[2] + .$estimate[4])
  }
## [1] -2.18
model_2 %>%
  ggeffects::ggpredict(
    terms = c("deaths_per_year", "deaths_male_pct[0, 0.5, 1]")
  ) %>%
  plot(
    show_data = T
  )
## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

GG_save("figs/model_2_deaths.png")

#cases, but without prostate cancer
model_3 = lm(
  Funded_Amount ~ new_cases_per_year * new_cases_male_pct,
  data = d %>% filter(
    type2 != "Prostate Cancer")
  )

model_3 %>% 
  summary()
## 
## Call:
## lm(formula = Funded_Amount ~ new_cases_per_year * new_cases_male_pct, 
##     data = d %>% filter(type2 != "Prostate Cancer"))
## 
## Residuals:
##       Min        1Q    Median        3Q       Max 
## -22338283  -5840787     87785   8135523  19226533 
## 
## Coefficients:
##                                        Estimate Std. Error t value Pr(>|t|)    
## (Intercept)                            1.71e+07   9.40e+06    1.82     0.09 .  
## new_cases_per_year                     3.74e+02   5.43e+01    6.89  7.5e-06 ***
## new_cases_male_pct                    -1.81e+07   1.96e+07   -0.92     0.37    
## new_cases_per_year:new_cases_male_pct -1.61e+02   1.60e+02   -1.00     0.33    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 13400000 on 14 degrees of freedom
##   (3 observations deleted due to missingness)
## Multiple R-squared:  0.858,  Adjusted R-squared:  0.828 
## F-statistic: 28.3 on 3 and 14 DF,  p-value: 3.34e-06
model_3 %>% 
  ggeffects::ggpredict(
    terms = c("new_cases_per_year", "new_cases_male_pct[0, 0.5, 1]")
  ) %>%
  plot(
    show_data = T
  )
## Data points may overlap. Use the `jitter` argument to add some amount of
##   random variation to the location of data points and avoid overplotting.

Meta

#versions
write_sessioninfo()
## R version 4.5.0 (2025-04-11)
## Platform: x86_64-pc-linux-gnu
## Running under: Linux Mint 21.1
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0 
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0  LAPACK version 3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## time zone: Europe/Brussels
## tzcode source: system (glibc)
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] broom_1.0.8           ggeffects_2.2.1       googlesheets4_1.1.1  
##  [4] kirkegaard_2025-05-09 psych_2.5.3           assertthat_0.2.1     
##  [7] weights_1.0.4         Hmisc_5.2-3           magrittr_2.0.3       
## [10] lubridate_1.9.4       forcats_1.0.0         stringr_1.5.1        
## [13] dplyr_1.1.4           purrr_1.0.4           readr_2.1.5          
## [16] tidyr_1.3.1           tibble_3.2.1          ggplot2_3.5.2        
## [19] tidyverse_2.0.0      
## 
## loaded via a namespace (and not attached):
##  [1] Rdpack_2.6.4      mnormt_2.1.1      gridExtra_2.3     rlang_1.1.6      
##  [5] compiler_4.5.0    mgcv_1.9-1        gdata_3.0.1       systemfonts_1.2.2
##  [9] vctrs_0.6.5       pkgconfig_2.0.3   shape_1.4.6.1     fastmap_1.2.0    
## [13] backports_1.5.0   labeling_0.4.3    rmarkdown_2.29    tzdb_0.5.0       
## [17] haven_2.5.4       nloptr_2.2.1      ragg_1.4.0        xfun_0.52        
## [21] glmnet_4.1-8      jomo_2.7-6        cachem_1.1.0      jsonlite_2.0.0   
## [25] pan_1.9           parallel_4.5.0    cluster_2.1.8.1   R6_2.6.1         
## [29] bslib_0.9.0       stringi_1.8.7     boot_1.3-31       rpart_4.1.24     
## [33] jquerylib_0.1.4   cellranger_1.1.0  Rcpp_1.0.14       iterators_1.0.14 
## [37] knitr_1.50        base64enc_0.1-3   Matrix_1.7-3      splines_4.5.0    
## [41] nnet_7.3-20       timechange_0.3.0  tidyselect_1.2.1  rstudioapi_0.17.1
## [45] yaml_2.3.10       codetools_0.2-19  curl_6.2.2        lattice_0.22-5   
## [49] withr_3.0.2       evaluate_1.0.3    foreign_0.8-90    survival_3.8-3   
## [53] pillar_1.10.2     mice_3.17.0       checkmate_2.3.2   foreach_1.5.2    
## [57] reformulas_0.4.0  insight_1.2.0     generics_0.1.3    hms_1.1.3        
## [61] munsell_0.5.1     scales_1.3.0      minqa_1.2.8       gtools_3.9.5     
## [65] glue_1.8.0        tools_4.5.0       data.table_1.17.0 lme4_1.1-37      
## [69] fs_1.6.6          grid_4.5.0        rbibutils_2.3     datawizard_1.0.2 
## [73] colorspace_2.1-1  nlme_3.1-168      htmlTable_2.4.3   googledrive_2.1.1
## [77] Formula_1.2-5     cli_3.6.4         textshaping_1.0.0 gargle_1.5.2     
## [81] gtable_0.3.6      sass_0.4.10       digest_0.6.37     htmlwidgets_1.6.4
## [85] farver_2.1.2      htmltools_0.5.8.1 lifecycle_1.0.4   httr_1.4.7       
## [89] mitml_0.4-5       MASS_7.3-65
#write data to file for reuse
d %>% write_rds("data/data_for_reuse.rds")
d_cancer %>% write_rds("data/data_for_reuse2.rds")
d_funding %>% write_rds("data/data_for_reuse3.rds")

#OSF
if (F) {
  library(osfr)
  
  #login
  osf_auth(readr::read_lines("~/.config/osf_token"))
  
  #the project we will use
  osf_proj = osf_retrieve_node("https://osf.io/XXX/")
  
  #upload all files in project
  #overwrite existing (versioning)
  osf_upload(
    osf_proj,
    path = c("data", "figures", "papers", "notebook.Rmd", "notebook.html", "sessions_info.txt"), 
    conflicts = "overwrite"
    )
}