1. Import library

library(ggplot2)
library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.2 ──
## ✔ tibble  3.1.8      ✔ dplyr   1.0.10
## ✔ tidyr   1.2.1      ✔ stringr 1.4.1 
## ✔ readr   2.1.2      ✔ forcats 0.5.2 
## ✔ purrr   0.3.4      
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
library(epiDisplay) # logistic.display()
## Loading required package: foreign
## Loading required package: survival
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: nnet
## 
## Attaching package: 'epiDisplay'
## 
## The following object is masked from 'package:ggplot2':
## 
##     alpha
library(GGally) # Forest plot
## Registered S3 method overwritten by 'GGally':
##   method from   
##   +.gg   ggplot2

2. Import data

df <- read.csv("~/Desktop/R-dir/R studying/dataset/Framingham dataset.csv")
dim(df)
## [1] 11627    39
head(df, 10)
##       id sex tot.chol age sysbp diasbp smoker cigs.day   bmi diabetes bpmed
## 1   2448   1      195  39 106.0   70.0      0        0 26.97        0     0
## 2   2448   1      209  52 121.0   66.0      0        0    NA        0     0
## 3   6238   2      250  46 121.0   81.0      0        0 28.73        0     0
## 4   6238   2      260  52 105.0   69.5      0        0 29.43        0     0
## 5   6238   2      237  58 108.0   66.0      0        0 28.50        0     0
## 6   9428   1      245  48 127.5   80.0      1       20 25.34        0     0
## 7   9428   1      283  54 141.0   89.0      1       30 25.34        0     0
## 8  10552   2      225  61 150.0   95.0      1       30 28.58        0     0
## 9  10552   2      232  67 183.0  109.0      1       20 30.18        0     0
## 10 11252   2      285  46 130.0   84.0      1       23 23.10        0     0
##    heart.rate glucose educ prev.chd prev.ap prev.mi prev.stroke prev.hyp time
## 1          80      77    4        0       0       0           0        0    0
## 2          69      92    4        0       0       0           0        0 4628
## 3          95      76    2        0       0       0           0        0    0
## 4          80      86    2        0       0       0           0        0 2156
## 5          80      71    2        0       0       0           0        0 4344
## 6          75      70    1        0       0       0           0        0    0
## 7          75      87    1        0       0       0           0        0 2199
## 8          65     103    3        0       0       0           0        1    0
## 9          60      89    3        0       0       0           0        1 1977
## 10         85      85    3        0       0       0           0        0    0
##    period hdlc ldlc death angina hosp.mi mi.fchd any.chd stroke cvd
## 1       1   NA   NA     0      0       1       1       1      0   1
## 2       3   31  178     0      0       1       1       1      0   1
## 3       1   NA   NA     0      0       0       0       0      0   0
## 4       2   NA   NA     0      0       0       0       0      0   0
## 5       3   54  141     0      0       0       0       0      0   0
## 6       1   NA   NA     0      0       0       0       0      0   0
## 7       2   NA   NA     0      0       0       0       0      0   0
## 8       1   NA   NA     1      0       0       0       0      1   1
## 9       2   NA   NA     1      0       0       0       0      1   1
## 10      1   NA   NA     0      0       0       0       0      0   0
##    hypertension time.ap time.mi time.mi.1 time.chd time.stroke time.cvd
## 1             0    8766    6438      6438     6438        8766     6438
## 2             0    8766    6438      6438     6438        8766     6438
## 3             0    8766    8766      8766     8766        8766     8766
## 4             0    8766    8766      8766     8766        8766     8766
## 5             0    8766    8766      8766     8766        8766     8766
## 6             0    8766    8766      8766     8766        8766     8766
## 7             0    8766    8766      8766     8766        8766     8766
## 8             1    2956    2956      2956     2956        2089     2089
## 9             1    2956    2956      2956     2956        2089     2089
## 10            1    8766    8766      8766     8766        8766     8766
##    time.dth time.hyp
## 1      8766     8766
## 2      8766     8766
## 3      8766     8766
## 4      8766     8766
## 5      8766     8766
## 6      8766     8766
## 7      8766     8766
## 8      2956        0
## 9      2956        0
## 10     8766     4285
tail(df, 10)
##            id sex tot.chol age sysbp diasbp smoker cigs.day   bmi diabetes
## 11618 9993179   2       NA  50 131.0     80      1       25 21.22        0
## 11619 9993179   2      251  56 145.0     92      1       35 21.97        0
## 11620 9995546   2      269  52 133.5     83      0        0 21.47        0
## 11621 9995546   2      265  58 140.0     83      0        0 22.55        0
## 11622 9998212   1      185  40 141.0     98      0        0 25.60        0
## 11623 9998212   1      173  46 126.0     82      0        0 19.17        0
## 11624 9998212   1      153  52 143.0     89      0        0 25.74        0
## 11625 9999312   2      196  39 133.0     86      1       30 20.91        0
## 11626 9999312   2      240  46 138.0     79      1       20 26.39        0
## 11627 9999312   2       NA  50 147.0     96      1       10 24.19        0
##       bpmed heart.rate glucose educ prev.chd prev.ap prev.mi prev.stroke
## 11618     0         75      NA    1        0       0       0           0
## 11619     1         95      90    1        0       0       0           0
## 11620     0         80     107    2        0       0       0           0
## 11621     1         80      79    2        0       0       0           0
## 11622     0         67      72    3        0       0       0           0
## 11623     0         70      NA    3        0       0       0           0
## 11624     0         65      72    3        0       0       0           0
## 11625     0         85      80    3        0       0       0           0
## 11626     0         90      83    3        0       0       0           0
## 11627     0         94      NA    3        0       0       0           0
##       prev.hyp time period hdlc ldlc death angina hosp.mi mi.fchd any.chd
## 11618        0 2226      2   NA   NA     1      0       0       0       0
## 11619        1 4396      3   70  181     1      0       0       0       0
## 11620        0    0      1   NA   NA     0      1       0       1       1
## 11621        1 2186      2   NA   NA     0      1       0       1       1
## 11622        1    0      1   NA   NA     0      0       0       0       0
## 11623        1 2333      2   NA   NA     0      0       0       0       0
## 11624        1 4538      3   30  123     0      0       0       0       0
## 11625        0    0      1   NA   NA     0      0       0       0       0
## 11626        0 2390      2   NA   NA     0      0       0       0       0
## 11627        1 4201      3   NA   NA     0      0       0       0       0
##       stroke cvd hypertension time.ap time.mi time.mi.1 time.chd time.stroke
## 11618      0   0            1    6729    6729      6729     6729        6729
## 11619      0   0            1    6729    6729      6729     6729        6729
## 11620      0   1            1    5939    8766      5209     5209        8766
## 11621      0   1            1    5939    8766      5209     5209        8766
## 11622      0   0            1    8766    8766      8766     8766        8766
## 11623      0   0            1    8766    8766      8766     8766        8766
## 11624      0   0            1    8766    8766      8766     8766        8766
## 11625      0   0            1    8766    8766      8766     8766        8766
## 11626      0   0            1    8766    8766      8766     8766        8766
## 11627      0   0            1    8766    8766      8766     8766        8766
##       time.cvd time.dth time.hyp
## 11618     6729     6729     4396
## 11619     6729     6729     4396
## 11620     5209     8766      735
## 11621     5209     8766      735
## 11622     8766     8766        0
## 11623     8766     8766        0
## 11624     8766     8766        0
## 11625     8766     8766     4201
## 11626     8766     8766     4201
## 11627     8766     8766     4201
str(df)
## 'data.frame':    11627 obs. of  39 variables:
##  $ id          : int  2448 2448 6238 6238 6238 9428 9428 10552 10552 11252 ...
##  $ sex         : int  1 1 2 2 2 1 1 2 2 2 ...
##  $ tot.chol    : int  195 209 250 260 237 245 283 225 232 285 ...
##  $ age         : int  39 52 46 52 58 48 54 61 67 46 ...
##  $ sysbp       : num  106 121 121 105 108 ...
##  $ diasbp      : num  70 66 81 69.5 66 80 89 95 109 84 ...
##  $ smoker      : int  0 0 0 0 0 1 1 1 1 1 ...
##  $ cigs.day    : int  0 0 0 0 0 20 30 30 20 23 ...
##  $ bmi         : num  27 NA 28.7 29.4 28.5 ...
##  $ diabetes    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ bpmed       : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ heart.rate  : int  80 69 95 80 80 75 75 65 60 85 ...
##  $ glucose     : int  77 92 76 86 71 70 87 103 89 85 ...
##  $ educ        : int  4 4 2 2 2 1 1 3 3 3 ...
##  $ prev.chd    : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.ap     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.mi     : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.stroke : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ prev.hyp    : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ time        : int  0 4628 0 2156 4344 0 2199 0 1977 0 ...
##  $ period      : int  1 3 1 2 3 1 2 1 2 1 ...
##  $ hdlc        : int  NA 31 NA NA 54 NA NA NA NA NA ...
##  $ ldlc        : int  NA 178 NA NA 141 NA NA NA NA NA ...
##  $ death       : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ angina      : int  0 0 0 0 0 0 0 0 0 0 ...
##  $ hosp.mi     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ mi.fchd     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ any.chd     : int  1 1 0 0 0 0 0 0 0 0 ...
##  $ stroke      : int  0 0 0 0 0 0 0 1 1 0 ...
##  $ cvd         : int  1 1 0 0 0 0 0 1 1 0 ...
##  $ hypertension: int  0 0 0 0 0 0 0 1 1 1 ...
##  $ time.ap     : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.mi     : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.mi.1   : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.chd    : int  6438 6438 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.stroke : int  8766 8766 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ time.cvd    : int  6438 6438 8766 8766 8766 8766 8766 2089 2089 8766 ...
##  $ time.dth    : int  8766 8766 8766 8766 8766 8766 8766 2956 2956 8766 ...
##  $ time.hyp    : int  8766 8766 8766 8766 8766 8766 8766 0 0 4285 ...

3. Convert the categorical variables to factor

df3 = df
df3$sex <- as.factor(df3$sex)
df3$smoker <- as.factor(df3$smoker)
df3$diabetes <- as.factor(df3$diabetes)
df3$bpmed <- as.factor(df3$bpmed)
df3$educ <- as.factor(df3$educ)
df3$stroke <- as.factor(df3$stroke)
df3$cvd <- as.factor(df3$cvd)
df3$hypertension <- as.factor(df3$hypertension)

4. Logistic regression

4.1 Simple logistic Regession model about the relationship between age and stroke

m1 <- glm(stroke ~ age, family = binomial, data = df3)
summary(m1)
## 
## Call:
## glm(formula = stroke ~ age, family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9092  -0.4775  -0.3669  -0.2903   2.8008  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.240795   0.214800  -29.05   <2e-16 ***
## age          0.068778   0.003555   19.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7102.4  on 11626  degrees of freedom
## Residual deviance: 6702.1  on 11625  degrees of freedom
## AIC: 6706.1
## 
## Number of Fisher Scoring iterations: 5
logistic.display(m1)
## 
## Logistic regression predicting stroke : 1 vs 0 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## age (cont. var.) 1.07 (1.06,1.08)  < 0.001        < 0.001   
##                                                             
## Log-likelihood = -3351.0549
## No. of observations = 11627
## AIC value = 6706.1098

4.2 Model2: m1 + sex + tot.chol + sysbp + diasbp + glucose + bmi

m2 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi, family = binomial, data = df3)
summary(m2)
## 
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp + 
##     glucose + bmi, family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2557  -0.4728  -0.3445  -0.2500   3.0162  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.1143066  0.4262665 -21.382  < 2e-16 ***
## age          0.0593876  0.0044973  13.205  < 2e-16 ***
## sex2        -0.2188370  0.0747738  -2.927 0.003426 ** 
## tot.chol    -0.0015023  0.0008214  -1.829 0.067411 .  
## sysbp        0.0132716  0.0022606   5.871 4.33e-09 ***
## diasbp       0.0165213  0.0042840   3.856 0.000115 ***
## glucose      0.0024509  0.0011344   2.161 0.030730 *  
## bmi          0.0156037  0.0086435   1.805 0.071037 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6137.7  on 10020  degrees of freedom
## Residual deviance: 5563.7  on 10013  degrees of freedom
##   (1606 observations deleted due to missingness)
## AIC: 5579.7
## 
## Number of Fisher Scoring iterations: 6
logistic.display(m2)
## 
## Logistic regression predicting stroke : 1 vs 0 
##  
##                       crude OR(95%CI)         adj. OR(95%CI)         
## age (cont. var.)      1.07 (1.06,1.08)        1.06 (1.05,1.07)       
##                                                                      
## sex: 2 vs 1           0.89 (0.78,1.02)        0.8 (0.69,0.93)        
##                                                                      
## tot.chol (cont. var.) 1.0009 (0.9994,1.0024)  0.9985 (0.9969,1.0001) 
##                                                                      
## sysbp (cont. var.)    1.03 (1.02,1.03)        1.01 (1.01,1.02)       
##                                                                      
## diasbp (cont. var.)   1.04 (1.03,1.04)        1.02 (1.01,1.03)       
##                                                                      
## glucose (cont. var.)  1.0066 (1.0046,1.0086)  1.0025 (1.0002,1.0047) 
##                                                                      
## bmi (cont. var.)      1.05 (1.04,1.07)        1.02 (1,1.03)          
##                                                                      
##                       P(Wald's test) P(LR-test)
## age (cont. var.)      < 0.001        < 0.001   
##                                                
## sex: 2 vs 1           0.003          0.003     
##                                                
## tot.chol (cont. var.) 0.067          0.066     
##                                                
## sysbp (cont. var.)    < 0.001        < 0.001   
##                                                
## diasbp (cont. var.)   < 0.001        < 0.001   
##                                                
## glucose (cont. var.)  0.031          0.036     
##                                                
## bmi (cont. var.)      0.071          0.073     
##                                                
## Log-likelihood = -2781.8252
## No. of observations = 10021
## AIC value = 5579.6504

4.3 Model3: Model2 + educ + smoker + diabetes + hypertension

m3 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi + 
            educ + smoker + diabetes + hypertension, family = binomial, data = df3)
summary(m3)
## 
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp + 
##     glucose + bmi + educ + smoker + diabetes + hypertension, 
##     family = binomial, data = df3)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4213  -0.4725  -0.3371  -0.2302   3.0361  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -9.043959   0.478029 -18.919  < 2e-16 ***
## age            0.062828   0.004800  13.090  < 2e-16 ***
## sex2          -0.104921   0.077956  -1.346   0.1783    
## tot.chol      -0.002012   0.000840  -2.395   0.0166 *  
## sysbp          0.009417   0.002354   4.001 6.31e-05 ***
## diasbp         0.017406   0.004356   3.996 6.44e-05 ***
## glucose       -0.001237   0.001356  -0.912   0.3615    
## bmi            0.014258   0.008983   1.587   0.1125    
## educ2         -0.094009   0.090442  -1.039   0.2986    
## educ3         -0.195882   0.113176  -1.731   0.0835 .  
## educ4         -0.176164   0.127799  -1.378   0.1681    
## smoker1        0.374200   0.079604   4.701 2.59e-06 ***
## diabetes1      0.860508   0.150958   5.700 1.20e-08 ***
## hypertension1  0.573486   0.130092   4.408 1.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5973.0  on 9760  degrees of freedom
## Residual deviance: 5330.3  on 9747  degrees of freedom
##   (1866 observations deleted due to missingness)
## AIC: 5358.3
## 
## Number of Fisher Scoring iterations: 6
logistic.display(m3)
## 
## Logistic regression predicting stroke : 1 vs 0 
##  
##                       crude OR(95%CI)         adj. OR(95%CI)         
## age (cont. var.)      1.07 (1.06,1.08)        1.06 (1.05,1.07)       
##                                                                      
## sex: 2 vs 1           0.91 (0.79,1.05)        0.9 (0.77,1.05)        
##                                                                      
## tot.chol (cont. var.) 1.0005 (0.999,1.002)    0.998 (0.9963,0.9996)  
##                                                                      
## sysbp (cont. var.)    1.0269 (1.0241,1.0297)  1.0095 (1.0048,1.0141) 
##                                                                      
## diasbp (cont. var.)   1.04 (1.03,1.04)        1.02 (1.01,1.03)       
##                                                                      
## glucose (cont. var.)  1.0067 (1.0047,1.0087)  0.9988 (0.9961,1.0014) 
##                                                                      
## bmi (cont. var.)      1.06 (1.04,1.07)        1.01 (1,1.03)          
##                                                                      
## educ: ref.=1                                                         
##    2                  0.65 (0.55,0.77)        0.91 (0.76,1.09)       
##    3                  0.6 (0.49,0.74)         0.82 (0.66,1.03)       
##    4                  0.63 (0.5,0.8)          0.84 (0.65,1.08)       
##                                                                      
## smoker: 1 vs 0        0.91 (0.79,1.05)        1.45 (1.24,1.7)        
##                                                                      
## diabetes: 1 vs 0      3.55 (2.81,4.49)        2.36 (1.76,3.18)       
##                                                                      
## hypertension: 1 vs 0  3.71 (2.95,4.68)        1.77 (1.38,2.29)       
##                                                                      
##                       P(Wald's test) P(LR-test)
## age (cont. var.)      < 0.001        < 0.001   
##                                                
## sex: 2 vs 1           0.178          0.179     
##                                                
## tot.chol (cont. var.) 0.017          0.016     
##                                                
## sysbp (cont. var.)    < 0.001        < 0.001   
##                                                
## diasbp (cont. var.)   < 0.001        < 0.001   
##                                                
## glucose (cont. var.)  0.362          0.357     
##                                                
## bmi (cont. var.)      0.112          0.114     
##                                                
## educ: ref.=1                         0.235     
##    2                  0.299                    
##    3                  0.083                    
##    4                  0.168                    
##                                                
## smoker: 1 vs 0        < 0.001        < 0.001   
##                                                
## diabetes: 1 vs 0      < 0.001        < 0.001   
##                                                
## hypertension: 1 vs 0  < 0.001        < 0.001   
##                                                
## Log-likelihood = -2665.1373
## No. of observations = 9761
## AIC value = 5358.2747

5. Figure of OR (95% CI) of developing Stroke;

ggcoef(m3, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))