Objective

Create and validate a predictive model to understand a medical student’s chances of matching into obstetrics and gynecology residency.

# Run garbage collection; invisible() suppresses printing of gc()'s return value.
invisible(gc())

Read in split data and train models (all train_match models end with .fit e.g. ‘logit.fit’)

# Read one split of the match data and tidy it for modelling.
#
# Args:
#   path: path to a CSV file containing the columns X, Year, Location and
#         Type_of_medical_school (dplyr::select() errors if X, Year or
#         Location is absent).
# Returns:
#   A data frame without the X, Year and Location columns, in which the
#   combined school type "Osteopathic School,International School" has been
#   recoded to "Osteopathic School".
read_match_split <- function(path) {
  combined_label <- "Osteopathic School,International School"

  dat <- read.csv(file = path)
  dat <- dplyr::select(dat, -X, -Year, -Location)

  # Recode the "Type_of_medical_school" variable.
  # %in% never yields NA, so rows with a missing school type are left as NA.
  is_combined <- dat$Type_of_medical_school %in% combined_label
  dat$Type_of_medical_school[is_combined] <- "Osteopathic School"

  dat
}

# Both splits go through the same helper so their preprocessing cannot drift.
train_match <- read_match_split("output/csv/train_dat_2017_2018_years.csv")

test_match <- read_match_split("output/csv/test_dat_2019_2020_years.csv")

Labels for Hmisc Nomogram

We generated the nomogram to provide a personalized, pre-Match estimate of an applicant's chance of matching into an OBGYN residency at any institution. Points in the nomogram were assigned in proportion to the effect sizes in the multivariable logistic regression model. The nomogram was based on pre-Match variables, including educational preparation, research accomplishments, and applicant demographics.
Points were allocated for each variable, summed, and then used to calculate a medical student-specific, pre-application chance of matching. The nomogram illustrates the strength of association between each predictor and the outcome, as well as the nonlinear associations of age and count of poster presentations with matching.

Model made with ‘rms::lrm’

# Summarise the distribution of every column of the training data; rms uses
# these summaries when it needs default ranges and reference values.
t.data <- rms::datadist(train_match)
# Register the summaries in the global options so rms functions can find them.
# NOTE(review): the rms documentation shows this option set to the object's
# name as a string (options(datadist = "t.data")) -- confirm the installed rms
# version also accepts the object itself.
options(datadist = t.data)

# Logistic regression of Match_Status on all remaining columns of train_match.
fit <- rms::lrm(formula = Match_Status ~ ., 
           data = train_match)

Preparing the Model for Nomogram

# Build the nomogram object from the fitted logistic regression model.
# Each `predictor = c(...)` entry gives the values to mark on that predictor's
# axis; the remaining arguments control the probability axis and the labelling.
rms_nomo <- rms::nomogram(
  fit = fit,
  # Axis values for the continuous / count predictors
  Age = c(25, 30, 35, 40, 45, 50),
  number_of_applicant_first_author_publications = c(0, 4, 8, 10),
  Count_of_Poster_Presentation = c(0, 10, 20),
  Count_of_Oral_Presentation = c(0, 10, 20, 30),
  Count_of_Peer_Reviewed_Journal_Articles_Abstracts_Other_than_Published = c(0, 10, 20, 40),
  total_OBGYN_letter_writers = c(0, 2, 4),
  reco_count = c(2, 4),
  Count_of_Peer_Reviewed_Journal_Articles_Abstracts = c(0, 10, 20),
  Volunteer_exp_count = c(0, 10),
  work_exp_count = c(0, 10),
  Research_exp_count = c(0, 5, 10, 15),
  # plogis maps the linear predictor onto the probability scale
  fun = plogis,
  # Probabilities to label on the transformed axis
  fun.at = c(0.001, 0.01, 0.05, seq(0.2, 0.8, by = 0.2), 0.95, 0.99, 0.999),
  funlabel = "Chance of Matching in OBGYN Residency",
  # Do not draw the raw linear-predictor axis
  lp = FALSE,
  # Spell out FALSE: `F` is an ordinary variable that can be reassigned
  abbrev = FALSE,
  varname.label.sep = ": ",
  # Draw an axis for every predictor, not only those listed above
  est.all = TRUE,
  # NOTE(review): minlength presumably has no effect while abbrev = FALSE -- confirm
  minlength = 1,
  conf.int = FALSE,
  verbose = FALSE,
  # Points assigned to the most influential predictor
  maxscale = 100
)

Creates nomogram plot

# Looks good on the screen

## Render the nomogram to a high-resolution TIFF file.
tiff("output/fig/nomogram_from_04_nomogram.tiff", units="in", width=8, height=6, res=800)

##global settings
## Graphical parameters belong to the active device, so they must be set after
## tiff() has opened the file device; set earlier they would only affect
## whichever device was active before and never reach the TIFF.
par(mfrow = c(1,1), mar = c(0,0,0,0),font = 7,font.axis = 7) #, cex.axis = .75)

plot(rms_nomo, 
     font.lab = 7,
     lplabel = "Linear Predictor",
     cex.sub = 0.3, cex.axis = 0.4, cex.main = 1, 
     cex.var = 0.5, #Size of variable names
     cex.lab = 0.4, 
     ps = 10, 
     xfrac = 0.2,
     conf.space = c(0.1, 0.5),
     label.every = 1,
     col.grid = gray(c(0.8, 0.95)),
     total.sep.page = FALSE,
     cap.labels = TRUE,
     total.points.label="Sum of all points",
     which = "Match_Status")
## Close the TIFF device so the file is flushed to disk.
dev.off()
#> svg 
#>   2
# Looks good on the TIFF: Not that great.  

## Screen version: single-panel layout with all four margins set to zero
par(mfrow = c(1, 1), mar = c(0, 0, 0, 0))


## Re-enable the tiff()/dev.off() pair below to write this version to file.
#tiff("output/fig/nomogram_from_04_nomogram.tiff", units="in", width=8, height=6, res=800)
plot(
  rms_nomo,
  # -- plot.nomogram options --
  lplabel = "Linear Predictor",    # title of the linear-predictor axis
  xfrac = 0.5,                     # share of the plot width reserved for axis titles
  label.every = 1,                 # label every tick mark
  col.grid = gray(c(0.8, 0.95)),   # two grey shades for the vertical reference lines
  cex.axis = 0.4,                  # character size of tick-mark labels
  cex.var = 0.5,                   # character size of the variable names
  # -- graphical parameters forwarded through `...` --
  font.lab = 7,                    # font face (not size) for axis labels
  ps = 10,                         # point size of text
  cex.main = 1,                    # magnification of the main title
  cex.sub = 0.3,                   # magnification of subtitles
  cex.lab = 0.3,                   # magnification of axis labels
  # NOTE(review): `which` is not a documented plot.nomogram argument; it is
  # forwarded via `...` -- confirm it has any effect.
  which = "Match_Status"
)

#dev.off()
#https://www.kaggle.com/pjmcintyre/titanic-first-kernel#final-checks
# Build a nomogram from a fitted model and draw it on the active graphics device.
#
# Args:
#   df: despite its name, a fitted model object accepted by rms::nomogram()
#       (this script passes the rms::lrm fit), not a data frame.
# Returns:
#   Whatever plot() returns for the nomogram (NULL in the recorded run),
#   returned visibly.
# Side effects:
#   Prints a status line and draws the nomogram. Nothing random is drawn, so
#   the global RNG state is left untouched, and rms is used through its
#   namespace rather than being attached to the search path.
tm_nomogram_prep <- function(df) {
  if (!requireNamespace("rms", quietly = TRUE)) {
    stop("Package 'rms' is required to build the nomogram.", call. = FALSE)
  }

  print("Function Sanity Check: Creation of Nomogram")

  nomo <- rms::nomogram(
    df,
    # plogis maps the linear predictor onto the probability scale
    fun = plogis,
    fun.at = c(0.001, 0.01, 0.05, seq(0.2, 0.8, by = 0.2), 0.95, 0.99, 0.999),
    funlabel = "Chance of Matching in OBGYN",
    lp = FALSE,
    abbrev = FALSE,
    minlength = 9
  )

  tm_plot <- plot(
    nomo,
    lplabel = "Linear Predictor",
    cex.sub = 0.3, cex.axis = 0.4, cex.main = 1, cex.lab = 0.2,
    ps = 10, xfrac = 1,
    label.every = 1,
    col.grid = gray(c(0.8, 0.95)),
    which = "Match_Status"
  )

  # Return the stored value by name so the result stays visible to the caller.
  tm_plot
}

# Draw the nomogram for the fitted lrm model using the helper defined above.
tm_nomogram_prep(fit)
#> [1] "Function Sanity Check: Creation of Nomogram"

#> NULL

Closing Time Procedures

# Optional clean-up steps, currently disabled:
# parallel::stopCluster(cl)
# #grDevices::dev.off()
# invisible(gc())
# sessioninfo::session_info()
# beepr::beep(sound = 4)
# if (!interactive())
#   q("no")
# List the packages in use (and R itself) with their versions and citations.
report::report_packages(include_R = TRUE)
#>   - rmarkdown (version 2.16; Allaire J et al., 2022)
#>   - visNetwork (version 2.1.0; Almende B.and Contributors et al., 2021)
#>   - doMC (version 1.3.8; Analytics R, Weston S, 2022)
#>   - iterators (version 1.0.14; Analytics R, Weston S, 2022)
#>   - DescTools (version 0.99.46; Andri et mult. al. S, 2022)
#>   - countrycode (version 1.4.0; Arel-Bundock V et al., 2018)
#>   - PASWR (version 1.3; Arnholt A, 2022)
#>   - ggthemes (version 4.2.4; Arnold J, 2021)
#>   - RANN (version 2.6.1; Arya S et al., 2019)
#>   - rsconnect (version 0.8.27; Atkins A et al., 2022)
#>   - ezknitr (version 0.6; Attali D, 2016)
#>   - shinyjs (version 2.1.0; Attali D, 2021)
#>   - beepr (version 1.3; Bååth R, 2018)
#>   - magrittr (version 2.0.3; Bache S, Wickham H, 2022)
#>   - Matrix (version 1.5.1; Bates D et al., 2022)
#>   - R.methodsS3 (version 1.8.2; Bengtsson H, 2003)
#>   - quanteda (version 3.2.3; Benoit K et al., 2018)
#>   - rgdal (version 1.5.32; Bivand R et al., 2022)
#>   - grpreg (version 3.4.0; Breheny P, Huang J, 2015)
#>   - rmda (version 1.6; Brown M, 2018)
#>   - funModeling (version 1.9.4; Casas P, 2020)
#>   - shiny (version 1.7.2; Chang W et al., 2022)
#>   - xgboost (version 1.6.0.1; Chen T et al., 2022)
#>   - epiDisplay (version 3.5.0.2; Chongsuvivatwong V, 2022)
#>   - summarytools (version 1.0.1; Comtois D, 2022)
#>   - doParallel (version 1.0.17; Corporation M, Weston S, 2022)
#>   - infer (version 1.0.3; Couch SP et al., 2021)
#>   - gitcreds (version 0.1.2; Csárdi G, 2022)
#>   - callr (version 3.7.3.9000; Csárdi G, Chang W, 2023)
#>   - progress (version 1.2.2; Csárdi G, FitzJohn R, 2019)
#>   - remotes (version 2.4.2; Csárdi G et al., 2021)
#>   - DataExplorer (version 0.8.2; Cui B, 2020)
#>   - qlcMatrix (version 0.9.7; Cysouw M, 2018)
#>   - correlationfunnel (version 0.2.0; Dancho M, 2020)
#>   - tidyquant (version 1.0.5; Dancho M, Vaughan D, 2022)
#>   - pander (version 0.6.5; Daróczi G, Tsegelskyi R, 2022)
#>   - caretEnsemble (version 2.0.1; Deane-Mayer ZA, Knowles JE, 2019)
#>   - data.table (version 1.14.2; Dowle M, Srinivasan A, 2021)
#>   - BH (version 1.78.0.0; Eddelbuettel D et al., 2021)
#>   - tidylog (version 1.0.2; Elbers B, 2020)
#>   - janitor (version 2.1.0; Firke S, 2021)
#>   - english (version 1.2.6; Fox J et al., 2021)
#>   - car (version 3.1.0; Fox J, Weisberg S, 2019)
#>   - carData (version 3.0.5; Fox J et al., 2022)
#>   - glmnet (version 4.1.4; Friedman J et al., 2010)
#>   - viridis (version 0.6.2; Garnier et al., 2021)
#>   - viridisLite (version 0.4.1; Garnier et al., 2022)
#>   - fansi (version 1.0.3; Gaslam B, 2022)
#>   - perturbR (version 0.1.3; Gates K et al., 2019)
#>   - dtw (version 1.23.1; Giorgino T, 2009)
#>   - mltools (version 0.3.5; Gorman B, 2018)
#>   - gbm (version 2.1.8.1; Greenwell B et al., 2022)
#>   - vip (version 0.3.2; Greenwell BM, Boehmke BC, 2020)
#>   - lubridate (version 1.8.0; Grolemund G, Wickham H, 2011)
#>   - gss (version 2.2.3; Gu C, 2014)
#>   - Metrics (version 0.1.4; Hamner B, Frasco M, 2018)
#>   - Hmisc (version 4.7.1; Harrell Jr F, 2022)
#>   - rms (version 6.3.0; Harrell Jr FE, 2022)
#>   - RSelenium (version 1.7.9; Harrison J, 2022)
#>   - earth (version 5.3.1; Hastie SMDfmbT, wrapper. RTUAMFuwTLl, 2021)
#>   - exploratory (version 6.12.3.5; Hayashi H et al., 2023)
#>   - arsenal (version 3.6.3; Heinzen E et al., 2021)
#>   - anonymizer (version 0.2.2; Hendricks P, 2022)
#>   - rlang (version 1.0.6.9000; Henry L, Wickham H, 2023)
#>   - glue (version 1.6.2.9000; Hester J, Bryan J, 2023)
#>   - odbc (version 1.3.3; Hester J, Wickham H, 2021)
#>   - fs (version 1.5.2; Hester J et al., 2021)
#>   - stargazer (version 5.2.3; Hlavac M, 2022)
#>   - MatchIt (version 4.4.0; Ho DE et al., 2011)
#>   - Rmisc (version 1.5.1; Hope RM, 2022)
#>   - slam (version 0.1.50; Hornik K et al., 2022)
#>   - discrim (version 1.0.0; Hvitfeldt E, Kuhn M, 2022)
#>   - lime (version 0.5.3; Hvitfeldt E et al., 2022)
#>   - DiagrammeRsvg (version 0.1; Iannone R, 2016)
#>   - DiagrammeR (version 1.0.9; Iannone R, 2022)
#>   - mctest (version 1.3.1; Imdad MU, Aslam M, 2020)
#>   - plotrix (version 3.8.2; J L, 2006)
#>   - pscl (version 1.5.5; Jackman S, 2020)
#>   - DynNom (version 5.0.2; Jalali A et al., 2022)
#>   - ggformula (version 0.10.2; Kaplan D, Pruim R, 2022)
#>   - fastDummies (version 1.6.3; Kaplan J, 2020)
#>   - kernlab (version 0.9.31; Karatzoglou A et al., 2022)
#>   - factoextra (version 1.0.7; Kassambara A, Mundt F, 2020)
#>   - humaniformat (version 0.6.0; Keyes O, 2016)
#>   - urltools (version 1.7.3; Keyes O et al., 2019)
#>   - ppcor (version 1.1; Kim S, 2015)
#>   - SparseM (version 1.81; Koenker R, 2021)
#>   - moments (version 0.14.1; Komsta L, Novomestky F, 2022)
#>   - caret (version 6.0.93; Kuhn M, 2022)
#>   - modeldata (version 1.0.1; Kuhn M, 2022)
#>   - tune (version 1.0.0; Kuhn M, 2022)
#>   - workflowsets (version 1.0.0; Kuhn M, Couch S, 2022)
#>   - dials (version 1.0.0; Kuhn M, Frick H, 2022)
#>   - AppliedPredictiveModeling (version 1.1.7; Kuhn M, Johnson K, 2018)
#>   - parsnip (version 1.0.1; Kuhn M, Vaughan D, 2022)
#>   - yardstick (version 1.1.0; Kuhn M et al., 2022)
#>   - tidymodels (version 1.0.0; Kuhn M, Wickham H, 2020)
#>   - recipes (version 1.0.1; Kuhn M, Wickham H, 2022)
#>   - Boruta (version 7.0.0; Kursa MB, Rudnicki WR, 2010)
#>   - coefplot (version 1.2.8; Lander JP, 2022)
#>   - ezkable (version 0.0.0.9000; Lang G, 2022)
#>   - backports (version 1.4.1; Lang M, R Core Team, 2021)
#>   - ezplot (version 1.0.0; Lang' ', 2022)
#>   - mlbench (version 2.1.3; Leisch F, Dimitriadou E, 2021)
#>   - ResourceSelection (version 0.3.5; Lele SR et al., 2019)
#>   - randomForest (version 4.7.1.1; Liaw A, Wiener M, 2002)
#>   - corrmorant (version 0.0.0.9007; Link R, 2020)
#>   - sjmisc (version 2.8.9; Lüdecke D, 2018)
#>   - naivebayes (version 0.9.7; Majka M, 2019)
#>   - robotstxt (version 0.7.13; Meissner P, Ren K, 2020)
#>   - scoring (version 0.6; Merkle EC, Steyvers M, 2013)
#>   - proxy (version 0.4.27; Meyer D, Buchta C, 2022)
#>   - e1071 (version 1.7.11; Meyer D et al., 2022)
#>   - foreach (version 1.5.2; Microsoft, Weston S, 2022)
#>   - plotmo (version 3.6.2; Milborrow S, 2022)
#>   - rpart.plot (version 3.1.1; Milborrow S, 2022)
#>   - leaps (version 3.1; Miller TLboFcbA, 2020)
#>   - here (version 1.0.1; Müller K, 2020)
#>   - hms (version 1.1.2; Müller K, 2022)
#>   - tibble (version 3.1.8; Müller K, Wickham H, 2022)
#>   - RSQLite (version 2.2.17; Müller K et al., 2022)
#>   - RColorBrewer (version 1.1.3; Neuwirth E, 2022)
#>   - bit (version 4.0.4; Oehlschlägel J, Ripley B, 2020)
#>   - bit64 (version 4.0.5; Oehlschlägel J, Silvestri L, 2020)
#>   - magick (version 2.7.3; Ooms J, 2021)
#>   - sp (version 1.5.0; Pebesma EJ, Bivand RS, 2005)
#>   - ggforce (version 0.3.4; Pedersen T, 2022)
#>   - shinyWidgets (version 0.7.3; Perrier V et al., 2022)
#>   - utf8 (version 1.2.2; Perry PO, 2021)
#>   - ipred (version 0.9.13; Peters A, Hothorn T, 2022)
#>   - PerformanceAnalytics (version 2.0.4; Peterson BG, Carl P, 2020)
#>   - bitops (version 1.0.7; port SobSDiR et al., 2021)
#>   - InformationValue (version 1.2.3; Prabhakaran S, 2016)
#>   - mosaicData (version 0.20.3; Pruim R et al., 2022)
#>   - mosaic (version 1.8.4; Pruim R et al., 2017)
#>   - foreign (version 0.8.83; R Core Team, 2022)
#>   - R (version 4.2.2; R Core Team, 2022)
#>   - psych (version 2.2.5; Revelle W, 2022)
#>   - textshape (version 1.7.3; Rinker TW, 2021)
#>   - pROC (version 1.18.0; Robin X et al., 2011)
#>   - broom (version 1.0.1; Robinson D et al., 2022)
#>   - sparsesvd (version 0.2.1; Rohde D et al., 2022)
#>   - inspectdf (version 0.0.12; Rushworth A, 2022)
#>   - xts (version 0.12.1; Ryan JA, Ulrich JM, 2020)
#>   - quantmod (version 0.4.20; Ryan JA, Ulrich JM, 2022)
#>   - plotROC (version 2.3.0; Sachs MC, 2017)
#>   - lattice (version 0.20.45; Sarkar D, 2008)
#>   - corpcor (version 1.6.10; Schafer J et al., 2021)
#>   - openxlsx (version 4.2.5; Schauberger P, Walker A, 2021)
#>   - plotly (version 4.10.0; Sievert C, 2020)
#>   - flexdashboard (version 0.6.0; Sievert C et al., 2022)
#>   - rsample (version 1.1.0; Silge J et al., 2022)
#>   - tidytext (version 0.3.4; Silge J, Robinson D, 2016)
#>   - ROCR (version 1.0.11; Sing T et al., 2005)
#>   - TeachingDemos (version 2.12; Snow G, 2020)
#>   - compareGroups (version 4.5.1; Subirana I et al., 2014)
#>   - labeling (version 0.4.2; Talbot, J, 2020)
#>   - XML (version 3.99.0.10; Temple Lang D, 2022)
#>   - survival (version 3.4.0; Therneau T, 2022)
#>   - rpart (version 4.1.19; Therneau T, Atkinson B, 2022)
#>   - naniar (version 0.6.1; Tierney N et al., 2021)
#>   - caTools (version 1.18.2; Tuszynski J, 2021)
#>   - TTR (version 0.24.3; Ulrich J, 2021)
#>   - RcppRoll (version 0.3.0; Ushey K, 2018)
#>   - renv (version 0.15.5; Ushey K, 2022)
#>   - rstudioapi (version 0.14; Ushey K et al., 2022)
#>   - mice (version 3.14.0; van Buuren S, Groothuis-Oudshoorn K, 2011)
#>   - workflows (version 1.0.0; Vaughan D, 2022)
#>   - MASS (version 7.3.58.1; Venables WN, Ripley BD, 2002)
#>   - nnet (version 7.3.18; Venables WN, Ripley BD, 2002)
#>   - tigris (version 1.6.1; Walker K, 2022)
#>   - skimr (version 2.1.4; Waring E et al., 2022)
#>   - corrplot (version 0.92; Wei T, Simko V, 2021)
#>   - klaR (version 1.7.1; Weihs C et al., 2005)
#>   - munsell (version 0.5.0; Wickham C, 2018)
#>   - reshape2 (version 1.4.4; Wickham H, 2007)
#>   - plyr (version 1.8.7; Wickham H, 2011)
#>   - ggplot2 (version 3.4.0; Wickham H, 2016)
#>   - forcats (version 0.5.2; Wickham H, 2022)
#>   - stringr (version 1.4.1; Wickham H, 2022)
#>   - tidyverse (version 1.3.2; Wickham H et al., 2019)
#>   - readxl (version 1.4.1; Wickham H, Bryan J, 2022)
#>   - usethis (version 2.1.6; Wickham H et al., 2022)
#>   - dplyr (version 1.0.10; Wickham H et al., 2022)
#>   - tidyr (version 1.2.1; Wickham H, Girlich M, 2022)
#>   - purrr (version 1.0.1; Wickham H, Henry L, 2023)
#>   - vctrs (version 0.5.1; Wickham H et al., 2022)
#>   - readr (version 2.1.2; Wickham H et al., 2022)
#>   - devtools (version 2.4.4; Wickham H et al., 2022)
#>   - scales (version 1.2.1; Wickham H, Seidel D, 2022)
#>   - cowplot (version 1.1.1; Wilke C, 2020)
#>   - rattle (version 5.5.1; Williams GJ, 2011)
#>   - corrgram (version 1.14; Wright K, 2021)
#>   - ranger (version 0.14.1; Wright MN, Ziegler A, 2017)
#>   - timeDate (version 4021.104; Wuertz D et al., 2022)
#>   - knitr (version 1.40; Xie Y, 2022)
#>   - tinytex (version 0.41; Xie Y, 2022)
#>   - highr (version 0.9; Xie Y, Qiu Y, 2021)
#>   - MLmetrics (version 1.1.1; Yan Y, 2016)
#>   - tableone (version 0.13.2; Yoshida K, Bartel A, 2022)
#>   - Formula (version 1.2.4; Zeileis A, Croissant Y, 2010)
#>   - zoo (version 1.8.11; Zeileis A, Grothendieck G, 2005)
#>   - lmtest (version 0.9.40; Zeileis A, Hothorn T, 2002)
#>   - kableExtra (version 1.3.4; Zhu H, 2021)