
Simple logistic regression
df <- read.csv("LogisticRegression.csv")
names(df)
## [1] "CRP" "Triglyceride" "Cholesterol" "Improvement" "Group"
table(df$Improvement)
##
## No Yes
## 156 144
levels(df$Improvement)
## [1] "No" "Yes"
crp_outcome <- glm(Improvement ~ CRP, # Formula
data = df, # Data object
family = "binomial") # Binomial for dichotomous outcome
summary(crp_outcome)
##
## Call:
## glm(formula = Improvement ~ CRP, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.248 -1.145 -1.075 1.208 1.308
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.31252 0.56380 0.554 0.579
## CRP -0.03983 0.05601 -0.711 0.477
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 415.41 on 299 degrees of freedom
## Residual deviance: 414.90 on 298 degrees of freedom
## AIC: 418.9
##
## Number of Fisher Scoring iterations: 3
exp(-0.03983)
## [1] 0.9609528
exp(confint(crp_outcome))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.4530342 4.159610
## CRP 0.8602580 1.072253
Multivariable logistic regression
- Using
CRP and Cholesterol
crp_chol_outcome <- glm(Improvement ~ CRP + Cholesterol,
data = df,
family = "binomial")
summary(crp_chol_outcome)
##
## Call:
## glm(formula = Improvement ~ CRP + Cholesterol, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.237 -1.151 -1.014 1.182 1.408
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.28794 0.56547 0.509 0.611
## CRP -0.00745 0.06351 -0.117 0.907
## Cholesterol -0.05298 0.04891 -1.083 0.279
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 415.41 on 299 degrees of freedom
## Residual deviance: 413.72 on 297 degrees of freedom
## AIC: 419.72
##
## Number of Fisher Scoring iterations: 3
Adding a dichotomous nominal variable to the model
table(df$Group)
##
## A C
## 150 150
df$Group <- factor(df$Group,
levels = c("C", "A")) # Listed in reverse order
table(df$Group)
##
## C A
## 150 150
crp_chol_group_outcome <- glm(Improvement ~ CRP + Cholesterol + Group,
data = df,
family = "binomial")
summary(crp_chol_group_outcome)
##
## Call:
## glm(formula = Improvement ~ CRP + Cholesterol + Group, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3802 -1.0748 -0.8885 1.1521 1.4931
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.037206 0.583715 -0.064 0.94918
## CRP -0.003009 0.064343 -0.047 0.96270
## Cholesterol -0.062970 0.049741 -1.266 0.20553
## GroupA 0.669787 0.235858 2.840 0.00451 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 415.41 on 299 degrees of freedom
## Residual deviance: 405.53 on 296 degrees of freedom
## AIC: 413.53
##
## Number of Fisher Scoring iterations: 4
exp(0.669787) # Exponent of the coefficient of the Group variable
## [1] 1.953821
exp(confint(crp_chol_group_outcome))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.3058754 3.037067
## CRP 0.8783664 1.131252
## Cholesterol 0.8509512 1.034644
## GroupA 1.2338960 3.113735
Adding a multi-level nominal categorical variable to the model
set.seed(1) # For reproducible results
df <- mutate(df,
Stage = sample(c("I", "II", "III", "IV"),
size = 300,
replace = TRUE))
names(df) # List the variables
## [1] "CRP" "Triglyceride" "Cholesterol" "Improvement" "Group"
## [6] "Stage"
table(df$Stage)
##
## I II III IV
## 79 81 67 73
df$Stage <- factor(df$Stage,
levels = c("I", "II", "III", "IV"))
levels(df$Stage)
## [1] "I" "II" "III" "IV"
crp_chol_group_stage_outcome <- glm(Improvement ~ CRP + Cholesterol + Group + Stage,
data = df,
family = "binomial")
summary(crp_chol_group_stage_outcome)
##
## Call:
## glm(formula = Improvement ~ CRP + Cholesterol + Group + Stage,
## family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4592 -1.1262 -0.8634 1.1537 1.5693
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 0.107254 0.610740 0.176 0.86060
## CRP 0.003433 0.064995 0.053 0.95788
## Cholesterol -0.065287 0.050026 -1.305 0.19188
## GroupA 0.664812 0.239848 2.772 0.00557 **
## StageII -0.115640 0.324237 -0.357 0.72135
## StageIII -0.316224 0.340689 -0.928 0.35331
## StageIV -0.375277 0.332166 -1.130 0.25857
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 415.41 on 299 degrees of freedom
## Residual deviance: 403.89 on 293 degrees of freedom
## AIC: 417.89
##
## Number of Fisher Scoring iterations: 4
- Note that our AIC has increased
Forward and backward stepwise methods
simple_model <- glm(Improvement ~ 1,
data = df,
family = "binomial")
summary(simple_model)
##
## Call:
## glm(formula = Improvement ~ 1, family = "binomial", data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.144 -1.144 -1.144 1.212 1.212
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.08004 0.11556 -0.693 0.489
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 415.41 on 299 degrees of freedom
## Residual deviance: 415.41 on 299 degrees of freedom
## AIC: 417.41
##
## Number of Fisher Scoring iterations: 3
step(simple_model,
direction = "forward",
scope = formula(crp_chol_group_stage_outcome))
## Start: AIC=417.41
## Improvement ~ 1
##
## Df Deviance AIC
## + Group 1 407.68 411.68
## <none> 415.41 417.41
## + Cholesterol 1 413.74 417.74
## + CRP 1 414.90 418.90
## + Stage 3 413.34 421.34
##
## Step: AIC=411.68
## Improvement ~ Group
##
## Df Deviance AIC
## + Cholesterol 1 405.53 411.53
## <none> 407.68 411.68
## + CRP 1 407.15 413.15
## + Stage 3 406.02 416.02
##
## Step: AIC=411.53
## Improvement ~ Group + Cholesterol
##
## Df Deviance AIC
## <none> 405.53 411.53
## + CRP 1 405.53 413.53
## + Stage 3 403.89 415.89
##
## Call: glm(formula = Improvement ~ Group + Cholesterol, family = "binomial",
## data = df)
##
## Coefficients:
## (Intercept) GroupA Cholesterol
## -0.06095 0.67006 -0.06406
##
## Degrees of Freedom: 299 Total (i.e. Null); 297 Residual
## Null Deviance: 415.4
## Residual Deviance: 405.5 AIC: 411.5
- The result of the best model is given
best_model_forward <- glm(Improvement ~ Group + Cholesterol,
data = df,
family = "binomial")
summary(best_model_forward)
##
## Call:
## glm(formula = Improvement ~ Group + Cholesterol, family = "binomial",
## data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3844 -1.0753 -0.8903 1.1535 1.4918
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.06095 0.28786 -0.212 0.83231
## GroupA 0.67006 0.23579 2.842 0.00449 **
## Cholesterol -0.06406 0.04399 -1.456 0.14534
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 415.41 on 299 degrees of freedom
## Residual deviance: 405.53 on 297 degrees of freedom
## AIC: 411.53
##
## Number of Fisher Scoring iterations: 4
step(crp_chol_group_stage_outcome,
direction = "backward")
## Start: AIC=417.89
## Improvement ~ CRP + Cholesterol + Group + Stage
##
## Df Deviance AIC
## - Stage 3 405.53 413.53
## - CRP 1 403.89 415.89
## - Cholesterol 1 405.60 417.60
## <none> 403.89 417.89
## - Group 1 411.69 423.69
##
## Step: AIC=413.53
## Improvement ~ CRP + Cholesterol + Group
##
## Df Deviance AIC
## - CRP 1 405.53 411.53
## - Cholesterol 1 407.15 413.15
## <none> 405.53 413.53
## - Group 1 413.72 419.72
##
## Step: AIC=411.53
## Improvement ~ Cholesterol + Group
##
## Df Deviance AIC
## <none> 405.53 411.53
## - Cholesterol 1 407.68 411.68
## - Group 1 413.74 417.74
##
## Call: glm(formula = Improvement ~ Cholesterol + Group, family = "binomial",
## data = df)
##
## Coefficients:
## (Intercept) Cholesterol GroupA
## -0.06095 -0.06406 0.67006
##
## Degrees of Freedom: 299 Total (i.e. Null); 297 Residual
## Null Deviance: 415.4
## Residual Deviance: 405.5 AIC: 411.5