Init
#logistic regression example data with snp-like data
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ ggplot2 3.4.0 ✔ purrr 0.3.5
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.3 ✔ forcats 0.5.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(rms)
## Loading required package: Hmisc
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
##
## Attaching package: 'Hmisc'
##
## The following objects are masked from 'package:dplyr':
##
## src, summarize
##
## The following objects are masked from 'package:base':
##
## format.pval, units
##
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
##
## The following object is masked from 'package:base':
##
## backsolve
#make up some data
n = 10000
noise_sd = 5
set.seed(1)
d = tibble(
snp1 = sample(c(0, 1, 2), size = n, replace = T),
snp2 = sample(c(0, 1, 2), size = n, replace = T),
snp3 = sample(c(0, 1, 2), size = n, replace = T),
#make latent y, normalize
y = (snp1*1 + snp2*2 + snp3*3 + rnorm(n = n, sd = noise_sd)) %>% scale(),
#logistic prob
y_prob = plogis(y),
#sample from probabilities
y_binary = rbernoulli(n = n, y_prob)
) %>% map_df(as.numeric)
#fit logistic model
(logis_fit = lrm(y_binary ~ snp1 + snp2 + snp3, data = d))
## Logistic Regression Model
##
## lrm(formula = y_binary ~ snp1 + snp2 + snp3, data = d)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 10000 LR chi2 521.12 R2 0.068 C 0.629
## 0 5005 d.f. 3 R2(3,10000)0.050 Dxy 0.257
## 1 4995 Pr(> chi2) <0.0001 R2(3,7500)0.067 gamma 0.267
## max |deriv| 5e-12 Brier 0.237 tau-a 0.129
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -0.9039 0.0492 -18.36 <0.0001
## snp1 0.1496 0.0251 5.96 <0.0001
## snp2 0.2627 0.0250 10.49 <0.0001
## snp3 0.4861 0.0253 19.21 <0.0001
##
#scores
#functions ready made
d$pred_logits = rms:::predict.lrm(logis_fit, type = "lp")
d$pred_prob = rms:::predict.lrm(logis_fit, type = "fitted")
#manually
#converting logits to probability is just a plogis call
d$pred_prob_manual = rms:::predict.lrm(logis_fit, type = "lp") %>% plogis()
#computing logits is just a matrix multiplication call, then add the intercept at the end
d$pred_logits_manual = (((d %>% select(snp1:snp3) %>% as.matrix()) %*% logis_fit$coefficients[-1]) + logis_fit$coefficients[1]) %>% as.numeric()
#check if we did it right
d %>% head()
#everything matches
#display using this package
#refit with base R
(logis_fit_base = glm(y_binary ~ snp1 + snp2 + snp3, data = d, family = "binomial"))
##
## Call: glm(formula = y_binary ~ snp1 + snp2 + snp3, family = "binomial",
## data = d)
##
## Coefficients:
## (Intercept) snp1 snp2 snp3
## -0.9039 0.1496 0.2627 0.4861
##
## Degrees of Freedom: 9999 Total (i.e. Null); 9996 Residual
## Null Deviance: 13860
## Residual Deviance: 13340 AIC: 13350
#diplay
epiDisplay::logistic.display(logis_fit_base)
## Registered S3 method overwritten by 'epiDisplay':
## method from
## print.lrtest rms
##
## Logistic regression predicting y_binary
##
## crude OR(95%CI) adj. OR(95%CI) P(Wald's test) P(LR-test)
## snp1 (cont. var.) 1.15 (1.1,1.21) 1.16 (1.11,1.22) < 0.001 < 0.001
##
## snp2 (cont. var.) 1.29 (1.23,1.35) 1.3 (1.24,1.37) < 0.001 < 0.001
##
## snp3 (cont. var.) 1.61 (1.54,1.69) 1.63 (1.55,1.71) < 0.001 < 0.001
##
## Log-likelihood = -6670.9062
## No. of observations = 10000
## AIC value = 13349.8125
#manually
logis_fit$coefficients %>% exp()
## Intercept snp1 snp2 snp3
## 0.4050001 1.1613252 1.3003962 1.6260293
#matches
#so if we start with ORs
(coefs = logis_fit_base$coefficients %>% exp())
## (Intercept) snp1 snp2 snp3
## 0.4050001 1.1613252 1.3003962 1.6260293
#we convert back to logits
(coefs_logits = coefs %>% log())
## (Intercept) snp1 snp2 snp3
## -0.9038679 0.1495617 0.2626690 0.4861410
#to get scores, dot product + intercept
d$pred_logits_manual2 = (((d %>% select(snp1:snp3) %>% as.matrix()) %*% coefs_logits[-1]) + coefs_logits[1]) %>% as.numeric()
#and then finally probabilities
d$pred_prob_manual2 = d$pred_logits_manual2 %>% plogis()
#check if we did it right
d %>% select(-matches("snp")) %>% head()
#everything matches