Init

#logistic regression example data with snp-like data

library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0      ✔ purrr   0.3.5 
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.3      ✔ forcats 0.5.2 
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()

library(rms)

## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## 
## Attaching package: 'Hmisc'
## 
## The following objects are masked from 'package:dplyr':
## 
##     src, summarize
## 
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## 
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## 
## The following object is masked from 'package:base':
## 
##     backsolve

#make up some data
n = 10000
noise_sd = 5

set.seed(1)
d = tibble(
  snp1 = sample(c(0, 1, 2), size = n, replace = T),
  snp2 = sample(c(0, 1, 2), size = n, replace = T),
  snp3 = sample(c(0, 1, 2), size = n, replace = T),
  #make latent y, normalize
  y = (snp1*1 + snp2*2 + snp3*3 + rnorm(n = n, sd = noise_sd)) %>% scale(),
  #logistic prob
  y_prob = plogis(y),
  #sample from probabilities
  y_binary = rbernoulli(n = n, y_prob)
) %>% map_df(as.numeric)

#fit logistic model
(logis_fit = lrm(y_binary ~ snp1 + snp2 + snp3, data = d))

## Logistic Regression Model
##  
##  lrm(formula = y_binary ~ snp1 + snp2 + snp3, data = d)
##  
##                         Model Likelihood      Discrimination    Rank Discrim.    
##                               Ratio Test             Indexes          Indexes    
##  Obs         10000    LR chi2     521.12      R2       0.068    C       0.629    
##   0           5005    d.f.             3    R2(3,10000)0.050    Dxy     0.257    
##   1           4995    Pr(> chi2) <0.0001     R2(3,7500)0.067    gamma   0.267    
##  max |deriv| 5e-12                            Brier    0.237    tau-a   0.129    
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -0.9039 0.0492 -18.36 <0.0001 
##  snp1       0.1496 0.0251   5.96 <0.0001 
##  snp2       0.2627 0.0250  10.49 <0.0001 
##  snp3       0.4861 0.0253  19.21 <0.0001 
##

#scores
#functions ready made
d$pred_logits = rms:::predict.lrm(logis_fit, type = "lp")
d$pred_prob = rms:::predict.lrm(logis_fit, type = "fitted")

#manually
#converting logits to probability is just a plogis call
d$pred_prob_manual = rms:::predict.lrm(logis_fit, type = "lp") %>% plogis()

#computing logits is just a matrix multiplication call, then add the intercept at the end
d$pred_logits_manual = (((d %>% select(snp1:snp3) %>% as.matrix()) %*% logis_fit$coefficients[-1]) + logis_fit$coefficients[1]) %>% as.numeric()


#check if we did it right
d %>% head()

#everything matches

#display using this package

#refit with base R
(logis_fit_base = glm(y_binary ~ snp1 + snp2 + snp3, data = d, family = "binomial"))

## 
## Call:  glm(formula = y_binary ~ snp1 + snp2 + snp3, family = "binomial", 
##     data = d)
## 
## Coefficients:
## (Intercept)         snp1         snp2         snp3  
##     -0.9039       0.1496       0.2627       0.4861  
## 
## Degrees of Freedom: 9999 Total (i.e. Null);  9996 Residual
## Null Deviance:       13860 
## Residual Deviance: 13340     AIC: 13350

#diplay
epiDisplay::logistic.display(logis_fit_base)

## Registered S3 method overwritten by 'epiDisplay':
##   method       from
##   print.lrtest rms

## 
## Logistic regression predicting y_binary 
##  
##                   crude OR(95%CI)   adj. OR(95%CI)    P(Wald's test) P(LR-test)
## snp1 (cont. var.) 1.15 (1.1,1.21)   1.16 (1.11,1.22)  < 0.001        < 0.001   
##                                                                                
## snp2 (cont. var.) 1.29 (1.23,1.35)  1.3 (1.24,1.37)   < 0.001        < 0.001   
##                                                                                
## snp3 (cont. var.) 1.61 (1.54,1.69)  1.63 (1.55,1.71)  < 0.001        < 0.001   
##                                                                                
## Log-likelihood = -6670.9062
## No. of observations = 10000
## AIC value = 13349.8125

#manually
logis_fit$coefficients %>% exp()

## Intercept      snp1      snp2      snp3 
## 0.4050001 1.1613252 1.3003962 1.6260293

#matches

#so if we start with ORs
(coefs = logis_fit_base$coefficients %>% exp())

## (Intercept)        snp1        snp2        snp3 
##   0.4050001   1.1613252   1.3003962   1.6260293

#we convert back to logits
(coefs_logits = coefs %>% log())

## (Intercept)        snp1        snp2        snp3 
##  -0.9038679   0.1495617   0.2626690   0.4861410

#to get scores, dot product + intercept
d$pred_logits_manual2 = (((d %>% select(snp1:snp3) %>% as.matrix()) %*% coefs_logits[-1]) + coefs_logits[1]) %>% as.numeric()

#and then finally probabilities
d$pred_prob_manual2 = d$pred_logits_manual2 %>% plogis()

#check if we did it right
d %>% select(-matches("snp")) %>% head()

#everything matches

Meta

#versions
sessionInfo()

## R version 4.2.2 Patched (2022-11-10 r83330)
## Platform: x86_64-pc-linux-gnu (64-bit)
## Running under: Linux Mint 21
## 
## Matrix products: default
## BLAS:   /usr/lib/x86_64-linux-gnu/blas/libblas.so.3.10.0
## LAPACK: /usr/lib/x86_64-linux-gnu/lapack/liblapack.so.3.10.0
## 
## locale:
##  [1] LC_CTYPE=en_US.UTF-8       LC_NUMERIC=C              
##  [3] LC_TIME=en_US.UTF-8        LC_COLLATE=en_US.UTF-8    
##  [5] LC_MONETARY=en_DK.UTF-8    LC_MESSAGES=en_US.UTF-8   
##  [7] LC_PAPER=en_DK.UTF-8       LC_NAME=C                 
##  [9] LC_ADDRESS=C               LC_TELEPHONE=C            
## [11] LC_MEASUREMENT=en_DK.UTF-8 LC_IDENTIFICATION=C       
## 
## attached base packages:
## [1] stats     graphics  grDevices utils     datasets  methods   base     
## 
## other attached packages:
##  [1] rms_6.3-0       SparseM_1.81    Hmisc_4.7-1     Formula_1.2-4  
##  [5] survival_3.4-0  lattice_0.20-45 forcats_0.5.2   stringr_1.4.1  
##  [9] dplyr_1.0.10    purrr_0.3.5     readr_2.1.3     tidyr_1.2.1    
## [13] tibble_3.1.8    ggplot2_3.4.0   tidyverse_1.3.2
## 
## loaded via a namespace (and not attached):
##  [1] nlme_3.1-160        fs_1.5.2            lubridate_1.9.0    
##  [4] RColorBrewer_1.1-3  httr_1.4.4          tools_4.2.2        
##  [7] backports_1.4.1     bslib_0.4.1         utf8_1.2.2         
## [10] R6_2.5.1            rpart_4.1.19        DBI_1.1.3          
## [13] colorspace_2.0-3    nnet_7.3-18         withr_2.5.0        
## [16] tidyselect_1.2.0    gridExtra_2.3       compiler_4.2.2     
## [19] quantreg_5.94       cli_3.4.1           rvest_1.0.3        
## [22] htmlTable_2.4.1     xml2_1.3.3          sandwich_3.0-2     
## [25] sass_0.4.2          scales_1.2.1        checkmate_2.1.0    
## [28] mvtnorm_1.1-3       polspline_1.1.20    digest_0.6.30      
## [31] foreign_0.8-82      rmarkdown_2.18      base64enc_0.1-3    
## [34] jpeg_0.1-9          pkgconfig_2.0.3     htmltools_0.5.4    
## [37] dbplyr_2.2.1        fastmap_1.1.0       htmlwidgets_1.6.1  
## [40] rlang_1.0.6         readxl_1.4.1        epiDisplay_3.5.0.2 
## [43] rstudioapi_0.14     jquerylib_0.1.4     generics_0.1.3     
## [46] zoo_1.8-11          jsonlite_1.8.3      googlesheets4_1.0.1
## [49] magrittr_2.0.3      interp_1.1-3        Matrix_1.5-3       
## [52] Rcpp_1.0.9          munsell_0.5.0       fansi_1.0.3        
## [55] lifecycle_1.0.3     multcomp_1.4-20     stringi_1.7.8      
## [58] yaml_2.3.6          MASS_7.3-58         grid_4.2.2         
## [61] crayon_1.5.2        deldir_1.0-6        haven_2.5.1        
## [64] splines_4.2.2       hms_1.1.2           knitr_1.40         
## [67] pillar_1.8.1        codetools_0.2-18    reprex_2.0.2       
## [70] glue_1.6.2          evaluate_0.18       latticeExtra_0.6-30
## [73] data.table_1.14.6   modelr_0.1.10       png_0.1-7          
## [76] vctrs_0.5.1         tzdb_0.3.0          MatrixModels_0.5-1 
## [79] cellranger_1.1.0    gtable_0.3.1        assertthat_0.2.1   
## [82] cachem_1.0.6        xfun_0.35           broom_1.0.1        
## [85] googledrive_2.0.0   gargle_1.2.1        cluster_2.1.4      
## [88] timechange_0.1.1    TH.data_1.1-1       ellipsis_0.3.2

Logistic regression scoring example

17 February, 2023

Init

Meta