# Data management
library(tidyverse) # data management & other
library(tidyr) # organize tabular data
library(janitor) # data cleaning like clean_names
library(data.table) # rbindlist function "makes one data.table from a list of many"
# Data summary
library(psych) # numeric data summary
library(crosstable) # cross-tabulation
library(summarytools) # data summary tools
library(epitools) # epidemiology tools
library(ggsci) # themes for plots
# Data analysis
library(Hmisc) # many functions useful for analysis, graphics, computing sample size, simulation, importing and annotating, imputation
library(stats) # statistical tests
library(pROC) # ROC analysis
# Power
library(pwr) # Power calculations
library(WebPower) # basic and advanced statistical power analysis
library(rpact) # adaptive trial design
# Data sets
library(medicaldata)
Logistic Regression and Analysis of Binary Outcomes
1 Introduction
Logistic regression is a generalized linear model used to analyze binary outcomes by predicting the likelihood of an event occurring based on one or more explanatory variables. The core of the model is expressed by the formula:
\[ \text{logit} = \alpha + \beta x \]
The logit is the log of the odds \[ \text{logit} = \ln \left( \frac{p}{q} \right) = \ln \left( \frac{p}{1 - p} \right) \]
In this equation, the logit function represents the log of the odds, the odds value is defined by \(\frac{p}{q}\) , where p is the probability of the event occurring and q is the probability of the event not occurring (i.e., q = 1 - p ). This can also be reformulated in terms of odds:
\[ \text{Odds} = \exp(\alpha + \beta x) \]
Then, the probability formula could be expressed as the following:
\[ p = \frac {\exp( \alpha + \beta x)}{1 + \exp(\alpha + \beta x)} \]
And, the odds ratio is the exponential of ( \(\beta\) ).
Logistic regression is particularly valuable for analyzing categorical outcomes, including cases with more than two categories, such as multinomial and ordinal outcomes. However, in this example, we will focus specifically on the analysis of binary outcomes, which represent two distinct categories or states.
By estimating the parameters \(\alpha\) (the intercept) and \(\beta\) (the regression coefficient associated with the predictor variable x), logistic regression enables researchers to determine how changes in the explanatory variables affect the probability of the target event. This makes it a powerful tool in various fields, including healthcare and sciences, where understanding the factors influencing a binary outcome can drive important decisions and strategies.
In the following sections, I will show examples of performing logistic regression and binary data analysis in R.
2 Library
In my program, first, I load the packages that I frequently use.
3 Data set
For this analysis, we will use the Indomethacin RCT data.
Check the data details here. We will fit a logistic regression model LoRM using the glm function.
# load the data
data(indo_rct)
colnames(indo_rct)
[1] "id" "site" "age" "risk" "gender"
[6] "outcome" "sod" "pep" "recpanc" "psphinc"
[11] "precut" "difcan" "pneudil" "amp" "paninj"
[16] "acinar" "brush" "asa81" "asa325" "asa"
[21] "prophystent" "therastent" "pdstent" "sodsom" "bsphinc"
[26] "bstent" "chole" "pbmal" "train" "status"
[31] "type" "rx" "bleed"
# we need to code the outcome variable to be 0 (no event) and 1 (event)
= indo_rct %>%
indo_rct_ds mutate( bleed01 = bleed - 1 )
# run the LoRM using the glm function and binomial family
glm ( bleed01 ~ age , data = indo_rct_ds , family = "binomial")
Call: glm(formula = bleed01 ~ age, family = "binomial", data = indo_rct_ds)
Coefficients:
(Intercept) age
3.86242 -0.07634
Degrees of Freedom: 26 Total (i.e. Null); 25 Residual
(575 observations deleted due to missingness)
Null Deviance: 36.5
Residual Deviance: 31.48 AIC: 35.48
# we can run multi-variable LoRM
= glm( bleed01 ~ age +
model1 + rx +
gender +
asa81 as.factor(risk) ,
data = indo_rct_ds ,
family = "binomial")
# Create data frames of the results
model1
Call: glm(formula = bleed01 ~ age + gender + rx + asa81 + as.factor(risk),
family = "binomial", data = indo_rct_ds)
Coefficients:
(Intercept) age gender2_male rx1_indomethacin
2.48241 -0.05856 0.68632 0.17248
asa811_yes as.factor(risk)1.5 as.factor(risk)2 as.factor(risk)2.5
-17.37706 0.13455 0.75729 1.17684
as.factor(risk)3 as.factor(risk)3.5 as.factor(risk)4
0.62155 16.62569 -0.75659
Degrees of Freedom: 26 Total (i.e. Null); 16 Residual
(575 observations deleted due to missingness)
Null Deviance: 36.5
Residual Deviance: 27.01 AIC: 49.01
= summary(model1)
mod1.sum = mod1.sum$coefficients
mod1.res= exp(cbind(OR = coef(model1), confint(model1)))
mod1.OR cbind(mod1.res,mod1.OR)
Estimate Std. Error z value Pr(>|z|)
(Intercept) 2.48241281 3.023324e+00 0.821087368 0.4115965
age -0.05855696 4.671372e-02 -1.253528103 0.2100136
gender2_male 0.68632026 1.387346e+00 0.494700312 0.6208117
rx1_indomethacin 0.17247789 1.192324e+00 0.144656859 0.8849818
asa811_yes -17.37705733 2.781711e+03 -0.006246896 0.9950157
as.factor(risk)1.5 0.13454828 1.828436e+00 0.073586552 0.9413394
as.factor(risk)2 0.75729460 2.484426e+00 0.304816703 0.7605058
as.factor(risk)2.5 1.17684181 2.048063e+00 0.574612051 0.5655537
as.factor(risk)3 0.62155296 2.160398e+00 0.287703047 0.7735741
as.factor(risk)3.5 16.62569308 3.956181e+03 0.004202461 0.9966469
as.factor(risk)4 -0.75658944 2.307401e+00 -0.327896746 0.7429897
OR 2.5 % 97.5 %
(Intercept) 1.197011e+01 3.882192e-02 9.945300e+03
age 9.431245e-01 8.482220e-01 1.027569e+00
gender2_male 1.986393e+00 1.467657e-01 5.346190e+01
rx1_indomethacin 1.188246e+00 1.119168e-01 1.484420e+01
asa811_yes 2.839487e-08 NA 1.292643e+153
as.factor(risk)1.5 1.144020e+00 2.377635e-02 5.410012e+01
as.factor(risk)2 2.132499e+00 1.426971e-02 4.655808e+02
as.factor(risk)2.5 3.244112e+00 5.006875e-02 2.741075e+02
as.factor(risk)3 1.861817e+00 2.476409e-02 2.009965e+02
as.factor(risk)3.5 1.661295e+07 7.434702e-300 NA
as.factor(risk)4 4.692642e-01 3.316091e-03 4.486680e+01
# N of the model
length(model1$fitted.values)
[1] 27
length(model1$residuals)
[1] 27
4 LoRM Function
We can create a function to apply the LoRM adjusted for aspirin use, and apply the function (the model) to different predictors
## Function for LoM ================================================================
= function( x ) {
my_logit_func = indo_rct_ds
temp.df <- glm( bleed01 ~ temp.df[[x]] + asa81 ,
model.temp data = temp.df ,
family = "binomial")
= summary (model.temp)
model.summary = model.summary$coefficients
model.est = exp(cbind(OR = coef(model.temp), confint(model.temp)))
model.OR = as.data.frame ( cbind(model.est,model.OR) ) %>%
model.results mutate ( VARNAME = {{x}} ) %>%
rownames_to_column()
return(model.results)
}
## Results ==========================================================================
= lapply( c("age","gender","risk") , my_logit_func)
lom_results
# check the first model
1]] lom_results[[
rowname Estimate Std. Error z value Pr(>|z|) OR
1 (Intercept) 3.18798442 1.892441e+00 1.684588102 0.09206807 2.423952e+01
2 temp.df[[x]] -0.05940911 4.121207e-02 -1.441546280 0.14943041 9.423212e-01
3 asa811_yes -16.70169009 2.748504e+03 -0.006076648 0.99515157 5.578895e-08
2.5 % 97.5 % VARNAME
1 0.8158017 1.790201e+03 age
2 0.8589091 1.015887e+00 age
3 NA 3.392271e+174 age
# combine the results
= rbindlist(lom_results)
OR.Table OR.Table
rowname Estimate Std. Error z value Pr(>|z|)
<char> <num> <num> <num> <num>
1: (Intercept) 3.18798442 1.892441e+00 1.684588102 0.09206807
2: temp.df[[x]] -0.05940911 4.121207e-02 -1.441546280 0.14943041
3: asa811_yes -16.70169009 2.748504e+03 -0.006076648 0.99515157
4: (Intercept) 0.13353139 5.175492e-01 0.258007163 0.79640138
5: temp.df[[x]]2_male 1.25276297 9.449112e-01 1.325799721 0.18490605
6: asa811_yes -18.40246022 2.650294e+03 -0.006943555 0.99445989
7: (Intercept) 0.87597268 1.105093e+00 0.792668895 0.42797076
8: temp.df[[x]] -0.13210740 4.467326e-01 -0.295719219 0.76744452
9: asa811_yes -18.07885390 2.796831e+03 -0.006464051 0.99484247
OR 2.5 % 97.5 % VARNAME
<num> <num> <num> <char>
1: 2.423952e+01 0.8158017 1.790201e+03 age
2: 9.423212e-01 0.8589091 1.015887e+00 age
3: 5.578895e-08 NA 3.392271e+174 age
4: 1.142857e+00 0.4101988 3.259594e+00 gender
5: 3.500000e+00 0.6120457 2.871215e+01 gender
6: 1.018388e-08 NA 1.920274e+172 gender
7: 2.401210e+00 0.2835250 2.399003e+01 risk
8: 8.762469e-01 0.3558699 2.160714e+00 risk
9: 1.407517e-08 NA 2.290332e+182 risk
5 ROC and AUC
We can utilize the pROC package to perform Receiver Operating Characteristic (ROC) analysis and calculate the Area Under the Curve (AUC). The ROC curve is a graphical representation that illustrates the diagnostic ability of a predictor to discriminate the outcome. By plotting the true positive rate (sensitivity) against specificity or against the false positive rate (1-specificity), the ROC curve helps visualize the trade-offs between sensitivity and specificity at different cutoff points.The AUC quantifies the overall performance of the model, with values ranging from 0 to 1. An AUC of 0.5 indicates no discrimination (i.e., the model performs no better than chance), while an AUC of 1.0 represents perfect discrimination.
# obtain predicted and observed values from the model
<- predict(model1, type = "response")
predicted_probs <- model.frame(model1)
model_data <- model_data$bleed01
observed_values
# Create the ROC curve
<- roc(observed_values, predicted_probs)
roc_curve roc_curve
Call:
roc.default(response = observed_values, predictor = predicted_probs)
Data: predicted_probs in 11 controls (observed_values 0) < 16 cases (observed_values 1).
Area under the curve: 0.8239
# Plot the ROC curve
plot(roc_curve,
main = "ROC Curve",
xlim=c(1,0),
ylim=c(0,1))
# Calculate the AUC
<- auc(roc_curve)
auc_value =ci.auc(roc_curve, conf.level=0.95, method=c("delong") )
auc_ci_value
print(paste("AUC:", auc_value) )
[1] "AUC: 0.823863636363636"
auc_ci_value
95% CI: 0.6648-0.9829 (DeLong)
6 Sensitivity & specificity
The pROC package can also be used to calculate sensitivity and specificity for various cutoff points, allowing users to identify the optimal threshold that minimizes false results (false positive and false negative). We can find this by summing sensitivity and specificity and finding the maximum sum.
# loda the data
data("aSAH")
colnames(aSAH)
[1] "gos6" "outcome" "gender" "age" "wfns" "s100b" "ndka"
attach(aSAH)
# check the outcome
table(outcome,exclude = F)
outcome
Good Poor
72 41
# change the outcome to 0 and 1
= aSAH %>%
sah mutate( poor = case_when(
=="Poor"~1 ,
outcome~0 ))
T
# calculate
attach(sah)
= roc (poor,s100b)
myroc
# Combine results into a data frame for easier inspection
<- data.frame(
roc_df Point = c( sort ( unique( myroc$predictor ) ) , NA) ,
Threshold = myroc$thresholds,
Sensitivity = myroc$sensitivities,
Specificity = myroc$specificities ) %>%
mutate(sum=Sensitivity+Specificity)
# Print the optimal cutoff threshold
%>%
roc_df filter(sum==max(sum)) %>%
::select ( Threshold,Sensitivity,Specificity) dplyr
Threshold Sensitivity Specificity
1 0.205 0.6341463 0.8055556
# # to compate to two ROC curves, use roc.test
= roc (poor,age)
myroc2 roc.test (myroc,myroc2)
DeLong's test for two correlated ROC curves
data: myroc and myroc2
Z = 1.7604, p-value = 0.07835
alternative hypothesis: true difference in AUC is not equal to 0
95 percent confidence interval:
-0.01319368 0.24591726
sample estimates:
AUC of roc1 AUC of roc2
0.7313686 0.6150068