suppressMessages(library(ggplot2))
suppressMessages(library(tidyverse))
suppressMessages(library(epiDisplay)) # logistic.display()
suppressMessages(library(rms))        # for lrm
suppressMessages(library(caret))      # for prediction, varImp
df <- read.csv("~/Desktop/R-dir/R studying/dataset/Framingham dataset.csv")
dim(df)
## [1] 11627    39

Convert the categorical variables to factor

df$sex <- as.factor(df$sex)
df$smoker <- as.factor(df$smoker)
df$diabetes <- as.factor(df$diabetes)
df$bpmed <- as.factor(df$bpmed)
df$educ <- as.factor(df$educ)
df$stroke <- as.factor(df$stroke)
df$cvd <- as.factor(df$cvd)
df$hypertension <- as.factor(df$hypertension)

Logistic Regression

# Simple logistic Regession  
m1 <- glm(stroke ~ age, family = binomial, data = df)
summary(m1)
## 
## Call:
## glm(formula = stroke ~ age, family = binomial, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.9092  -0.4775  -0.3669  -0.2903   2.8008  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.240795   0.214800  -29.05   <2e-16 ***
## age          0.068778   0.003555   19.35   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 7102.4  on 11626  degrees of freedom
## Residual deviance: 6702.1  on 11625  degrees of freedom
## AIC: 6706.1
## 
## Number of Fisher Scoring iterations: 5
logistic.display(m1)
## 
## Logistic regression predicting stroke : 1 vs 0 
##  
##                  OR(95%CI)         P(Wald's test) P(LR-test)
## age (cont. var.) 1.07 (1.06,1.08)  < 0.001        < 0.001   
##                                                             
## Log-likelihood = -3351.0549
## No. of observations = 11627
## AIC value = 6706.1098
# Model 2: m1 + sex + tot.chol + sysbp + diasbp + glucose + bmi
m2 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi, family = binomial, data = df)
summary(m2)
## 
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp + 
##     glucose + bmi, family = binomial, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.2557  -0.4728  -0.3445  -0.2500   3.0162  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.1143066  0.4262665 -21.382  < 2e-16 ***
## age          0.0593876  0.0044973  13.205  < 2e-16 ***
## sex2        -0.2188370  0.0747738  -2.927 0.003426 ** 
## tot.chol    -0.0015023  0.0008214  -1.829 0.067411 .  
## sysbp        0.0132716  0.0022606   5.871 4.33e-09 ***
## diasbp       0.0165213  0.0042840   3.856 0.000115 ***
## glucose      0.0024509  0.0011344   2.161 0.030730 *  
## bmi          0.0156037  0.0086435   1.805 0.071037 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 6137.7  on 10020  degrees of freedom
## Residual deviance: 5563.7  on 10013  degrees of freedom
##   (1606 observations deleted due to missingness)
## AIC: 5579.7
## 
## Number of Fisher Scoring iterations: 6
logistic.display(m2)
## 
## Logistic regression predicting stroke : 1 vs 0 
##  
##                       crude OR(95%CI)         adj. OR(95%CI)         
## age (cont. var.)      1.07 (1.06,1.08)        1.06 (1.05,1.07)       
##                                                                      
## sex: 2 vs 1           0.89 (0.78,1.02)        0.8 (0.69,0.93)        
##                                                                      
## tot.chol (cont. var.) 1.0009 (0.9994,1.0024)  0.9985 (0.9969,1.0001) 
##                                                                      
## sysbp (cont. var.)    1.03 (1.02,1.03)        1.01 (1.01,1.02)       
##                                                                      
## diasbp (cont. var.)   1.04 (1.03,1.04)        1.02 (1.01,1.03)       
##                                                                      
## glucose (cont. var.)  1.0066 (1.0046,1.0086)  1.0025 (1.0002,1.0047) 
##                                                                      
## bmi (cont. var.)      1.05 (1.04,1.07)        1.02 (1,1.03)          
##                                                                      
##                       P(Wald's test) P(LR-test)
## age (cont. var.)      < 0.001        < 0.001   
##                                                
## sex: 2 vs 1           0.003          0.003     
##                                                
## tot.chol (cont. var.) 0.067          0.066     
##                                                
## sysbp (cont. var.)    < 0.001        < 0.001   
##                                                
## diasbp (cont. var.)   < 0.001        < 0.001   
##                                                
## glucose (cont. var.)  0.031          0.036     
##                                                
## bmi (cont. var.)      0.071          0.073     
##                                                
## Log-likelihood = -2781.8252
## No. of observations = 10021
## AIC value = 5579.6504
# Model 3: Model2 + educ + smoker + diabetes + hypertension
m3 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi + 
            educ + smoker + diabetes + hypertension, family = binomial, data = df)
summary(m3)
## 
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp + 
##     glucose + bmi + educ + smoker + diabetes + hypertension, 
##     family = binomial, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4213  -0.4725  -0.3371  -0.2302   3.0361  
## 
## Coefficients:
##                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -9.043959   0.478029 -18.919  < 2e-16 ***
## age            0.062828   0.004800  13.090  < 2e-16 ***
## sex2          -0.104921   0.077956  -1.346   0.1783    
## tot.chol      -0.002012   0.000840  -2.395   0.0166 *  
## sysbp          0.009417   0.002354   4.001 6.31e-05 ***
## diasbp         0.017406   0.004356   3.996 6.44e-05 ***
## glucose       -0.001237   0.001356  -0.912   0.3615    
## bmi            0.014258   0.008983   1.587   0.1125    
## educ2         -0.094009   0.090442  -1.039   0.2986    
## educ3         -0.195882   0.113176  -1.731   0.0835 .  
## educ4         -0.176164   0.127799  -1.378   0.1681    
## smoker1        0.374200   0.079604   4.701 2.59e-06 ***
## diabetes1      0.860508   0.150958   5.700 1.20e-08 ***
## hypertension1  0.573486   0.130092   4.408 1.04e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 5973.0  on 9760  degrees of freedom
## Residual deviance: 5330.3  on 9747  degrees of freedom
##   (1866 observations deleted due to missingness)
## AIC: 5358.3
## 
## Number of Fisher Scoring iterations: 6
logistic.display(m3)
## 
## Logistic regression predicting stroke : 1 vs 0 
##  
##                       crude OR(95%CI)         adj. OR(95%CI)         
## age (cont. var.)      1.07 (1.06,1.08)        1.06 (1.05,1.07)       
##                                                                      
## sex: 2 vs 1           0.91 (0.79,1.05)        0.9 (0.77,1.05)        
##                                                                      
## tot.chol (cont. var.) 1.0005 (0.999,1.002)    0.998 (0.9963,0.9996)  
##                                                                      
## sysbp (cont. var.)    1.0269 (1.0241,1.0297)  1.0095 (1.0048,1.0141) 
##                                                                      
## diasbp (cont. var.)   1.04 (1.03,1.04)        1.02 (1.01,1.03)       
##                                                                      
## glucose (cont. var.)  1.0067 (1.0047,1.0087)  0.9988 (0.9961,1.0014) 
##                                                                      
## bmi (cont. var.)      1.06 (1.04,1.07)        1.01 (1,1.03)          
##                                                                      
## educ: ref.=1                                                         
##    2                  0.65 (0.55,0.77)        0.91 (0.76,1.09)       
##    3                  0.6 (0.49,0.74)         0.82 (0.66,1.03)       
##    4                  0.63 (0.5,0.8)          0.84 (0.65,1.08)       
##                                                                      
## smoker: 1 vs 0        0.91 (0.79,1.05)        1.45 (1.24,1.7)        
##                                                                      
## diabetes: 1 vs 0      3.55 (2.81,4.49)        2.36 (1.76,3.18)       
##                                                                      
## hypertension: 1 vs 0  3.71 (2.95,4.68)        1.77 (1.38,2.29)       
##                                                                      
##                       P(Wald's test) P(LR-test)
## age (cont. var.)      < 0.001        < 0.001   
##                                                
## sex: 2 vs 1           0.178          0.179     
##                                                
## tot.chol (cont. var.) 0.017          0.016     
##                                                
## sysbp (cont. var.)    < 0.001        < 0.001   
##                                                
## diasbp (cont. var.)   < 0.001        < 0.001   
##                                                
## glucose (cont. var.)  0.362          0.357     
##                                                
## bmi (cont. var.)      0.112          0.114     
##                                                
## educ: ref.=1                         0.235     
##    2                  0.299                    
##    3                  0.083                    
##    4                  0.168                    
##                                                
## smoker: 1 vs 0        < 0.001        < 0.001   
##                                                
## diabetes: 1 vs 0      < 0.001        < 0.001   
##                                                
## hypertension: 1 vs 0  < 0.001        < 0.001   
##                                                
## Log-likelihood = -2665.1373
## No. of observations = 9761
## AIC value = 5358.2747

Figure of OR (95% CI) of developing Stroke;

suppressMessages(library(GGally))

ggcoef(m1, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.05) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))

ggcoef(m2, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.04) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))

ggcoef(m3, exponentiate=T, exclude_intercept=T, vline_color = "red", errorbar_color = "blue", errorbar_height = 0.10) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))

Evaluate the performance of model

library(Epi)
ROC(form = stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi + 
      educ + smoker + diabetes + hypertension, data = df, plot = "ROC")

Find the important variables

varImp(m3, scale = F)
##                 Overall
## age           13.089668
## sex2           1.345902
## tot.chol       2.395128
## sysbp          4.000939
## diasbp         3.995962
## glucose        0.912440
## bmi            1.587195
## educ2          1.039436
## educ3          1.730778
## educ4          1.378443
## smoker1        4.700788
## diabetes1      5.700322
## hypertension1  4.408311

Visualizing data

p1 = ggplot(df, aes(x = age, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1) # what do you think about this?
p1

p2 = ggplot(df, aes(x = sysbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.5)
p2

p3 = ggplot(df, aes(x = diasbp, fill = stroke, col = stroke)) +
  geom_density(alpha = 0.9)
p3

p4 = ggplot(df, aes(x = tot.chol, fill = stroke, col = stroke)) +
  geom_density(alpha = 0.3)
p4
## Warning: Removed 409 rows containing non-finite values (`stat_density()`).

gridExtra::grid.arrange(p1, p2, p3, nrow = 3)

gridExtra::grid.arrange(p1, p2, p3, ncol = 3)

Cross-validation

library(caret)
sample = createDataPartition(df$stroke, p = 0.7, list = FALSE)
train = df[sample,]
test = df[-sample,]

model_train = glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi + 
                    educ + smoker + diabetes + hypertension, family = binomial, data = train)
summary(model_train)
## 
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp + 
##     glucose + bmi + educ + smoker + diabetes + hypertension, 
##     family = binomial, data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4006  -0.4773  -0.3380  -0.2285   3.0095  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)   -9.0386261  0.5730294 -15.773  < 2e-16 ***
## age            0.0652052  0.0057384  11.363  < 2e-16 ***
## sex2          -0.1181606  0.0927831  -1.274 0.202836    
## tot.chol      -0.0024481  0.0009947  -2.461 0.013852 *  
## sysbp          0.0082090  0.0028276   2.903 0.003694 ** 
## diasbp         0.0190762  0.0052085   3.662 0.000250 ***
## glucose       -0.0005289  0.0016948  -0.312 0.754994    
## bmi            0.0106454  0.0108306   0.983 0.325657    
## educ2         -0.0100678  0.1068246  -0.094 0.924914    
## educ3         -0.1748099  0.1351494  -1.293 0.195853    
## educ4         -0.2283044  0.1536197  -1.486 0.137235    
## smoker1        0.3354771  0.0950815   3.528 0.000418 ***
## diabetes1      0.6679007  0.1868044   3.575 0.000350 ***
## hypertension1  0.6306131  0.1581472   3.988 6.68e-05 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4183.6  on 6823  degrees of freedom
## Residual deviance: 3741.3  on 6810  degrees of freedom
##   (1316 observations deleted due to missingness)
## AIC: 3769.3
## 
## Number of Fisher Scoring iterations: 6
# Make a predictions on the testing data
prob <- predict(model_train, newdata = test, type = "response")
predictions <- ifelse(prob >0.5, 1, 0)

# Confusion matrix
confusionMatrix(factor(predictions), test$stroke, positive = "1")
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2670  263
##          1    0    4
##                                           
##                Accuracy : 0.9105          
##                  95% CI : (0.8995, 0.9205)
##     No Information Rate : 0.9091          
##     P-Value [Acc > NIR] : 0.4144          
##                                           
##                   Kappa : 0.0269          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.014981        
##             Specificity : 1.000000        
##          Pos Pred Value : 1.000000        
##          Neg Pred Value : 0.910331        
##              Prevalence : 0.090909        
##          Detection Rate : 0.001362        
##    Detection Prevalence : 0.001362        
##       Balanced Accuracy : 0.507491        
##                                           
##        'Positive' Class : 1               
## 
##############
library(rms)
ddist = datadist(df)
options(datadist = "ddist")

model_train2 = lrm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi + 
                     educ + smoker + diabetes + hypertension, data = train)

prob2<-predict(model_train2, test)             ## apply the model to the test data
predictions2<-ifelse(prob2 > 0.5, 1,0)   ## convert the probability value to with or without stroke.
confusionMatrix(factor(predictions2), test$stroke, positive = "1")   ## compare the model's output to the actual data
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction    0    1
##          0 2670  266
##          1    0    1
##                                           
##                Accuracy : 0.9094          
##                  95% CI : (0.8985, 0.9196)
##     No Information Rate : 0.9091          
##     P-Value [Acc > NIR] : 0.4907          
##                                           
##                   Kappa : 0.0068          
##                                           
##  Mcnemar's Test P-Value : <2e-16          
##                                           
##             Sensitivity : 0.0037453       
##             Specificity : 1.0000000       
##          Pos Pred Value : 1.0000000       
##          Neg Pred Value : 0.9094005       
##              Prevalence : 0.0909091       
##          Detection Rate : 0.0003405       
##    Detection Prevalence : 0.0003405       
##       Balanced Accuracy : 0.5018727       
##                                           
##        'Positive' Class : 1               
## 

Nomogram for Stroke prediction (Probability of Stroke)

library(rms)
ddist = datadist(df)
options(datadist = "ddist")

m3 = lrm(stroke ~ age + sex + sysbp + diasbp +  
           smoker + diabetes + hypertension, data = df)

p = nomogram(m3, fun = function(x)1/(1+exp(-x)), fun.at = c(0.001, 0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99, 0.999), funlabel = "Probability of Stroke")

plot(p, cex.axis = 0.6, lmgp = 0.2)