Thực hành ngày 6

Mô hình hồi qui logistic

Học viên Đặng Bảo Đăng - Mã số R1F019

Việc 1: Đọc dữ liệu nghiên cứu tim mạch “Framingham dataset.csv”

library(table1); library(ggplot2); library(ggthemes)
## Warning: package 'table1' was built under R version 4.0.3
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
## Warning: package 'ggplot2' was built under R version 4.0.5
fmh = read.csv("D:/Downloads/tailieu/R course/Seminar TDT 2022/Tai lieu/Data set/Framingham dataset.csv")

fmh$gender = ifelse(fmh$sex == 1, "Male", "Female")

dat = fmh[, c('stroke', 'age', 'gender', 'sysbp', 'tot.chol', 'bmi', 'smoker', 'diabetes')]

table1(~ . | stroke, data=dat)
## Warning in table1.formula(~. | stroke, data = dat): Terms to the right of '|' in
## formula 'x' define table columns and are expected to be factors with meaningful
## labels.
0
(N=10566)
1
(N=1061)
Overall
(N=11627)
stroke
Mean (SD) 0 (0) 1.00 (0) 0.0913 (0.288)
Median [Min, Max] 0 [0, 0] 1.00 [1.00, 1.00] 0 [0, 1.00]
age
Mean (SD) 54.2 (9.45) 60.4 (8.90) 54.8 (9.56)
Median [Min, Max] 54.0 [32.0, 81.0] 61.0 [34.0, 81.0] 54.0 [32.0, 81.0]
gender
Female 6032 (57.1%) 573 (54.0%) 6605 (56.8%)
Male 4534 (42.9%) 488 (46.0%) 5022 (43.2%)
sysbp
Mean (SD) 135 (21.8) 151 (26.9) 136 (22.8)
Median [Min, Max] 131 [83.5, 244] 147 [94.0, 295] 132 [83.5, 295]
tot.chol
Mean (SD) 241 (44.9) 243 (49.8) 241 (45.4)
Median [Min, Max] 238 [113, 638] 240 [107, 696] 238 [107, 696]
Missing 376 (3.6%) 33 (3.1%) 409 (3.5%)
bmi
Mean (SD) 25.8 (4.03) 26.8 (4.68) 25.9 (4.10)
Median [Min, Max] 25.4 [14.4, 56.8] 26.4 [16.9, 55.3] 25.5 [14.4, 56.8]
Missing 42 (0.4%) 10 (0.9%) 52 (0.4%)
smoker
Mean (SD) 0.434 (0.496) 0.418 (0.494) 0.433 (0.495)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
diabetes
Mean (SD) 0.0380 (0.191) 0.121 (0.326) 0.0456 (0.209)
Median [Min, Max] 0 [0, 1.00] 0 [0, 1.00] 0 [0, 1.00]
attach(dat)

vars = data.frame(age, gender, sysbp, tot.chol, bmi, smoker, diabetes)
factor = c('Age', 'Male', 'Systolic', 'Total Cholesterol', 'BMI', 'Smoker', 'Diabetes')
odds.ratio = numeric(length(vars))
lower.or = numeric(length(vars))
upper.or = numeric(length(vars))

for (i in 1:length(vars)) {
  m = glm(stroke ~ vars[,i], family=binomial)
  odds.ratio[i] = exp(summary(m)[["coefficients"]][2])
  lower.or[i] = exp(summary(m)[["coefficients"]][2] - 1.96*summary(m)[["coefficients"]][4])
  upper.or[i] = exp(summary(m)[["coefficients"]][2] + 1.96*summary(m)[["coefficients"]][4])
}

effect = data.frame(factor, odds.ratio, lower.or, upper.or)
effect
##              factor odds.ratio  lower.or upper.or
## 1               Age  1.0711984 1.0637608 1.078688
## 2              Male  1.1330394 0.9981748 1.286126
## 3          Systolic  1.0266984 1.0241559 1.029247
## 4 Total Cholesterol  1.0008544 0.9994589 1.002252
## 5               BMI  1.0541061 1.0392176 1.069208
## 6            Smoker  0.9387118 0.8260105 1.066790
## 7          Diabetes  3.4687015 2.8119149 4.278896
ggplot(effect, aes(x=factor, y=odds.ratio)) + geom_point(color="black", size=2) + geom_errorbar(aes(ymin=lower.or, ymax=upper.or), width=0.16, color="black", size=0.6) + theme(legend.position="none") + coord_flip() +geom_hline(yintercept = 1, linetype="dashed", size=0.5) + scale_y_continuous(breaks = seq(0.8, 4.4, 0.2)) + ylab("Odds Ratio") + xlab("Factor") + theme_igray()

t = epiDisplay::logistic.display(glm(fmh$stroke ~ fmh$age + fmh$gender + fmh$sysbp + fmh$tot.chol + fmh$bmi + factor(fmh$smoker) + factor(fmh$diabetes), family=binomial))

t <- plyr::ldply (t, data.frame)
colnames(t)[2] = "OddsRatio"
t[1,1] = "Age"
t[2,1] = "Male"
t[3,1] = "Systolic"
t[4,1] = "Total Cholesterol"
t[5,1] = "BMI"
t[6,1] = "Smoker"
t[7,1] = "Diabetes"

t
##                 .id OddsRatio lower95ci upper95ci     Pr...Z..
## 1               Age 1.0570879 1.0486064  1.065638 1.415878e-41
## 2              Male 1.1702586 1.0171387  1.346429 2.798646e-02
## 3          Systolic 1.0192726 1.0163326  1.022221 2.267641e-38
## 4 Total Cholesterol 0.9987897 0.9972766  1.000305 1.174701e-01
## 5               BMI 1.0244120 1.0083778  1.040701 2.731204e-03
## 6            Smoker 1.4724605 1.2730545  1.703101 1.871578e-07
## 7          Diabetes 1.9611207 1.5546215  2.473910 1.323606e-08
ggplot(data= t, aes(y= .id, x= OddsRatio, label= .id)) + geom_point(size= 2, shape= 19) + geom_errorbarh (aes(xmin= lower95ci, xmax= upper95ci), height=.2, size = 1) + coord_fixed (ratio= 0.3) + geom_vline(xintercept=1, linetype='longdash')

ggplot(t, aes(x=.id, y=OddsRatio)) + geom_point(color="black", size=2) + geom_errorbar(aes(ymin=lower95ci, ymax=upper95ci), width=0.16, color="black", size=0.6) + theme(legend.position="none") + coord_flip() +geom_hline(yintercept = 1, linetype="dashed", size=0.5) + scale_y_continuous(breaks = seq(0.8, 2.6, 0.2)) + ylab("Odds Ratio") + xlab("Factor") + theme_igray()