1. Import library
library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble 3.1.8 ✔ dplyr 1.0.10
## ✔ tidyr 1.2.1 ✔ stringr 1.4.1
## ✔ readr 2.1.2 ✔ forcats 0.5.2
## ✔ purrr 0.3.4
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
library(epiDisplay) # logistic.display()
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
##
## The following object is masked from 'package:ggplot2':
##
## alpha
library(GGally) # Forest plot
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
2. Import data
df <- read.csv("~/Desktop/R-dir/R studying/dataset/Framingham dataset.csv")
dim(df)
## [1] 11627 39
head(df, 10)
## id sex tot.chol age sysbp diasbp smoker cigs.day bmi diabetes bpmed
## 1 2448 1 195 39 106.0 70.0 0 0 26.97 0 0
## 2 2448 1 209 52 121.0 66.0 0 0 NA 0 0
## 3 6238 2 250 46 121.0 81.0 0 0 28.73 0 0
## 4 6238 2 260 52 105.0 69.5 0 0 29.43 0 0
## 5 6238 2 237 58 108.0 66.0 0 0 28.50 0 0
## 6 9428 1 245 48 127.5 80.0 1 20 25.34 0 0
## 7 9428 1 283 54 141.0 89.0 1 30 25.34 0 0
## 8 10552 2 225 61 150.0 95.0 1 30 28.58 0 0
## 9 10552 2 232 67 183.0 109.0 1 20 30.18 0 0
## 10 11252 2 285 46 130.0 84.0 1 23 23.10 0 0
## heart.rate glucose educ prev.chd prev.ap prev.mi prev.stroke prev.hyp time
## 1 80 77 4 0 0 0 0 0 0
## 2 69 92 4 0 0 0 0 0 4628
## 3 95 76 2 0 0 0 0 0 0
## 4 80 86 2 0 0 0 0 0 2156
## 5 80 71 2 0 0 0 0 0 4344
## 6 75 70 1 0 0 0 0 0 0
## 7 75 87 1 0 0 0 0 0 2199
## 8 65 103 3 0 0 0 0 1 0
## 9 60 89 3 0 0 0 0 1 1977
## 10 85 85 3 0 0 0 0 0 0
## period hdlc ldlc death angina hosp.mi mi.fchd any.chd stroke cvd
## 1 1 NA NA 0 0 1 1 1 0 1
## 2 3 31 178 0 0 1 1 1 0 1
## 3 1 NA NA 0 0 0 0 0 0 0
## 4 2 NA NA 0 0 0 0 0 0 0
## 5 3 54 141 0 0 0 0 0 0 0
## 6 1 NA NA 0 0 0 0 0 0 0
## 7 2 NA NA 0 0 0 0 0 0 0
## 8 1 NA NA 1 0 0 0 0 1 1
## 9 2 NA NA 1 0 0 0 0 1 1
## 10 1 NA NA 0 0 0 0 0 0 0
## hypertension time.ap time.mi time.mi.1 time.chd time.stroke time.cvd
## 1 0 8766 6438 6438 6438 8766 6438
## 2 0 8766 6438 6438 6438 8766 6438
## 3 0 8766 8766 8766 8766 8766 8766
## 4 0 8766 8766 8766 8766 8766 8766
## 5 0 8766 8766 8766 8766 8766 8766
## 6 0 8766 8766 8766 8766 8766 8766
## 7 0 8766 8766 8766 8766 8766 8766
## 8 1 2956 2956 2956 2956 2089 2089
## 9 1 2956 2956 2956 2956 2089 2089
## 10 1 8766 8766 8766 8766 8766 8766
## time.dth time.hyp
## 1 8766 8766
## 2 8766 8766
## 3 8766 8766
## 4 8766 8766
## 5 8766 8766
## 6 8766 8766
## 7 8766 8766
## 8 2956 0
## 9 2956 0
## 10 8766 4285
tail(df, 10)
## id sex tot.chol age sysbp diasbp smoker cigs.day bmi diabetes
## 11618 9993179 2 NA 50 131.0 80 1 25 21.22 0
## 11619 9993179 2 251 56 145.0 92 1 35 21.97 0
## 11620 9995546 2 269 52 133.5 83 0 0 21.47 0
## 11621 9995546 2 265 58 140.0 83 0 0 22.55 0
## 11622 9998212 1 185 40 141.0 98 0 0 25.60 0
## 11623 9998212 1 173 46 126.0 82 0 0 19.17 0
## 11624 9998212 1 153 52 143.0 89 0 0 25.74 0
## 11625 9999312 2 196 39 133.0 86 1 30 20.91 0
## 11626 9999312 2 240 46 138.0 79 1 20 26.39 0
## 11627 9999312 2 NA 50 147.0 96 1 10 24.19 0
## bpmed heart.rate glucose educ prev.chd prev.ap prev.mi prev.stroke
## 11618 0 75 NA 1 0 0 0 0
## 11619 1 95 90 1 0 0 0 0
## 11620 0 80 107 2 0 0 0 0
## 11621 1 80 79 2 0 0 0 0
## 11622 0 67 72 3 0 0 0 0
## 11623 0 70 NA 3 0 0 0 0
## 11624 0 65 72 3 0 0 0 0
## 11625 0 85 80 3 0 0 0 0
## 11626 0 90 83 3 0 0 0 0
## 11627 0 94 NA 3 0 0 0 0
## prev.hyp time period hdlc ldlc death angina hosp.mi mi.fchd any.chd
## 11618 0 2226 2 NA NA 1 0 0 0 0
## 11619 1 4396 3 70 181 1 0 0 0 0
## 11620 0 0 1 NA NA 0 1 0 1 1
## 11621 1 2186 2 NA NA 0 1 0 1 1
## 11622 1 0 1 NA NA 0 0 0 0 0
## 11623 1 2333 2 NA NA 0 0 0 0 0
## 11624 1 4538 3 30 123 0 0 0 0 0
## 11625 0 0 1 NA NA 0 0 0 0 0
## 11626 0 2390 2 NA NA 0 0 0 0 0
## 11627 1 4201 3 NA NA 0 0 0 0 0
## stroke cvd hypertension time.ap time.mi time.mi.1 time.chd time.stroke
## 11618 0 0 1 6729 6729 6729 6729 6729
## 11619 0 0 1 6729 6729 6729 6729 6729
## 11620 0 1 1 5939 8766 5209 5209 8766
## 11621 0 1 1 5939 8766 5209 5209 8766
## 11622 0 0 1 8766 8766 8766 8766 8766
## 11623 0 0 1 8766 8766 8766 8766 8766
## 11624 0 0 1 8766 8766 8766 8766 8766
## 11625 0 0 1 8766 8766 8766 8766 8766
## 11626 0 0 1 8766 8766 8766 8766 8766
## 11627 0 0 1 8766 8766 8766 8766 8766
## time.cvd time.dth time.hyp
## 11618 6729 6729 4396
## 11619 6729 6729 4396
## 11620 5209 8766 735
## 11621 5209 8766 735
## 11622 8766 8766 0
## 11623 8766 8766 0
## 11624 8766 8766 0
## 11625 8766 8766 4201
## 11626 8766 8766 4201
## 11627 8766 8766 4201
str(df)
## 'data.frame': 11627 obs. of 39 variables:
## $ id : int 2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
## $ sex : int 1 1 2 2 2 1 1 2 2 2 ...
## $ tot.chol : int 195 209 250 260 237 245 283 225 232 285 ...
## $ age : int 39 52 46 52 58 48 54 61 67 46 ...
## $ sysbp : num 106 121 121 105 108 ...
## $ diasbp : num 70 66 81 69.5 66 80 89 95 109 84 ...
## $ smoker : int 0 0 0 0 0 1 1 1 1 1 ...
## $ cigs.day : int 0 0 0 0 0 20 30 30 20 23 ...
## $ bmi : num 27 NA 28.7 29.4 28.5 ...
## $ diabetes : int 0 0 0 0 0 0 0 0 0 0 ...
## $ bpmed : int 0 0 0 0 0 0 0 0 0 0 ...
## $ heart.rate : int 80 69 95 80 80 75 75 65 60 85 ...
## $ glucose : int 77 92 76 86 71 70 87 103 89 85 ...
## $ educ : int 4 4 2 2 2 1 1 3 3 3 ...
## $ prev.chd : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.ap : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.mi : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.stroke : int 0 0 0 0 0 0 0 0 0 0 ...
## $ prev.hyp : int 0 0 0 0 0 0 0 1 1 0 ...
## $ time : int 0 4628 0 2156 4344 0 2199 0 1977 0 ...
## $ period : int 1 3 1 2 3 1 2 1 2 1 ...
## $ hdlc : int NA 31 NA NA 54 NA NA NA NA NA ...
## $ ldlc : int NA 178 NA NA 141 NA NA NA NA NA ...
## $ death : int 0 0 0 0 0 0 0 1 1 0 ...
## $ angina : int 0 0 0 0 0 0 0 0 0 0 ...
## $ hosp.mi : int 1 1 0 0 0 0 0 0 0 0 ...
## $ mi.fchd : int 1 1 0 0 0 0 0 0 0 0 ...
## $ any.chd : int 1 1 0 0 0 0 0 0 0 0 ...
## $ stroke : int 0 0 0 0 0 0 0 1 1 0 ...
## $ cvd : int 1 1 0 0 0 0 0 1 1 0 ...
## $ hypertension: int 0 0 0 0 0 0 0 1 1 1 ...
## $ time.ap : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.mi : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.mi.1 : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.chd : int 6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.stroke : int 8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ time.cvd : int 6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
## $ time.dth : int 8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
## $ time.hyp : int 8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...
3. Convert the categorical variables to factor
df3 = df
df3$sex <- as.factor(df3$sex)
df3$smoker <- as.factor(df3$smoker)
df3$diabetes <- as.factor(df3$diabetes)
df3$bpmed <- as.factor(df3$bpmed)
df3$educ <- as.factor(df3$educ)
df3$stroke <- as.factor(df3$stroke)
df3$cvd <- as.factor(df3$cvd)
df3$hypertension <- as.factor(df3$hypertension)
4. Logistic regression
4.1 Simple logistic Regession model about the relationship between
age and stroke
m1 <- glm(stroke ~ age, family = binomial, data = df3)
summary(m1)
##
## Call:
## glm(formula = stroke ~ age, family = binomial, data = df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9092 -0.4775 -0.3669 -0.2903 2.8008
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.240795 0.214800 -29.05 <2e-16 ***
## age 0.068778 0.003555 19.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7102.4 on 11626 degrees of freedom
## Residual deviance: 6702.1 on 11625 degrees of freedom
## AIC: 6706.1
##
## Number of Fisher Scoring iterations: 5
logistic.display(m1)
##
## Logistic regression predicting stroke : 1 vs 0
##
## OR(95%CI) P(Wald's test) P(LR-test)
## age (cont. var.) 1.07 (1.06,1.08) < 0.001 < 0.001
##
## Log-likelihood = -3351.0549
## No. of observations = 11627
## AIC value = 6706.1098
4.2 Model2: m1 + sex + tot.chol + sysbp + diasbp + glucose +
bmi
m2 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi, family = binomial, data = df3)
summary(m2)
##
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp +
## glucose + bmi, family = binomial, data = df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2557 -0.4728 -0.3445 -0.2500 3.0162
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.1143066 0.4262665 -21.382 < 2e-16 ***
## age 0.0593876 0.0044973 13.205 < 2e-16 ***
## sex2 -0.2188370 0.0747738 -2.927 0.003426 **
## tot.chol -0.0015023 0.0008214 -1.829 0.067411 .
## sysbp 0.0132716 0.0022606 5.871 4.33e-09 ***
## diasbp 0.0165213 0.0042840 3.856 0.000115 ***
## glucose 0.0024509 0.0011344 2.161 0.030730 *
## bmi 0.0156037 0.0086435 1.805 0.071037 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6137.7 on 10020 degrees of freedom
## Residual deviance: 5563.7 on 10013 degrees of freedom
## (1606 observations deleted due to missingness)
## AIC: 5579.7
##
## Number of Fisher Scoring iterations: 6
logistic.display(m2)
##
## Logistic regression predicting stroke : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## age (cont. var.) 1.07 (1.06,1.08) 1.06 (1.05,1.07)
##
## sex: 2 vs 1 0.89 (0.78,1.02) 0.8 (0.69,0.93)
##
## tot.chol (cont. var.) 1.0009 (0.9994,1.0024) 0.9985 (0.9969,1.0001)
##
## sysbp (cont. var.) 1.03 (1.02,1.03) 1.01 (1.01,1.02)
##
## diasbp (cont. var.) 1.04 (1.03,1.04) 1.02 (1.01,1.03)
##
## glucose (cont. var.) 1.0066 (1.0046,1.0086) 1.0025 (1.0002,1.0047)
##
## bmi (cont. var.) 1.05 (1.04,1.07) 1.02 (1,1.03)
##
## P(Wald's test) P(LR-test)
## age (cont. var.) < 0.001 < 0.001
##
## sex: 2 vs 1 0.003 0.003
##
## tot.chol (cont. var.) 0.067 0.066
##
## sysbp (cont. var.) < 0.001 < 0.001
##
## diasbp (cont. var.) < 0.001 < 0.001
##
## glucose (cont. var.) 0.031 0.036
##
## bmi (cont. var.) 0.071 0.073
##
## Log-likelihood = -2781.8252
## No. of observations = 10021
## AIC value = 5579.6504
4.3 Model3: Model2 + educ + smoker + diabetes + hypertension
m3 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi +
educ + smoker + diabetes + hypertension, family = binomial, data = df3)
summary(m3)
##
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp +
## glucose + bmi + educ + smoker + diabetes + hypertension,
## family = binomial, data = df3)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4213 -0.4725 -0.3371 -0.2302 3.0361
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.043959 0.478029 -18.919 < 2e-16 ***
## age 0.062828 0.004800 13.090 < 2e-16 ***
## sex2 -0.104921 0.077956 -1.346 0.1783
## tot.chol -0.002012 0.000840 -2.395 0.0166 *
## sysbp 0.009417 0.002354 4.001 6.31e-05 ***
## diasbp 0.017406 0.004356 3.996 6.44e-05 ***
## glucose -0.001237 0.001356 -0.912 0.3615
## bmi 0.014258 0.008983 1.587 0.1125
## educ2 -0.094009 0.090442 -1.039 0.2986
## educ3 -0.195882 0.113176 -1.731 0.0835 .
## educ4 -0.176164 0.127799 -1.378 0.1681
## smoker1 0.374200 0.079604 4.701 2.59e-06 ***
## diabetes1 0.860508 0.150958 5.700 1.20e-08 ***
## hypertension1 0.573486 0.130092 4.408 1.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5973.0 on 9760 degrees of freedom
## Residual deviance: 5330.3 on 9747 degrees of freedom
## (1866 observations deleted due to missingness)
## AIC: 5358.3
##
## Number of Fisher Scoring iterations: 6
logistic.display(m3)
##
## Logistic regression predicting stroke : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## age (cont. var.) 1.07 (1.06,1.08) 1.06 (1.05,1.07)
##
## sex: 2 vs 1 0.91 (0.79,1.05) 0.9 (0.77,1.05)
##
## tot.chol (cont. var.) 1.0005 (0.999,1.002) 0.998 (0.9963,0.9996)
##
## sysbp (cont. var.) 1.0269 (1.0241,1.0297) 1.0095 (1.0048,1.0141)
##
## diasbp (cont. var.) 1.04 (1.03,1.04) 1.02 (1.01,1.03)
##
## glucose (cont. var.) 1.0067 (1.0047,1.0087) 0.9988 (0.9961,1.0014)
##
## bmi (cont. var.) 1.06 (1.04,1.07) 1.01 (1,1.03)
##
## educ: ref.=1
## 2 0.65 (0.55,0.77) 0.91 (0.76,1.09)
## 3 0.6 (0.49,0.74) 0.82 (0.66,1.03)
## 4 0.63 (0.5,0.8) 0.84 (0.65,1.08)
##
## smoker: 1 vs 0 0.91 (0.79,1.05) 1.45 (1.24,1.7)
##
## diabetes: 1 vs 0 3.55 (2.81,4.49) 2.36 (1.76,3.18)
##
## hypertension: 1 vs 0 3.71 (2.95,4.68) 1.77 (1.38,2.29)
##
## P(Wald's test) P(LR-test)
## age (cont. var.) < 0.001 < 0.001
##
## sex: 2 vs 1 0.178 0.179
##
## tot.chol (cont. var.) 0.017 0.016
##
## sysbp (cont. var.) < 0.001 < 0.001
##
## diasbp (cont. var.) < 0.001 < 0.001
##
## glucose (cont. var.) 0.362 0.357
##
## bmi (cont. var.) 0.112 0.114
##
## educ: ref.=1 0.235
## 2 0.299
## 3 0.083
## 4 0.168
##
## smoker: 1 vs 0 < 0.001 < 0.001
##
## diabetes: 1 vs 0 < 0.001 < 0.001
##
## hypertension: 1 vs 0 < 0.001 < 0.001
##
## Log-likelihood = -2665.1373
## No. of observations = 9761
## AIC value = 5358.2747