suppressMessages(library(ggplot2))
suppressMessages(library(tidyverse))
suppressMessages(library(epiDisplay)) # logistic.display()
suppressMessages(library(rms)) # for lrm
suppressMessages(library(caret)) # for prediction, varImp
df <- read.csv("~/Desktop/R-dir/R studying/dataset/Framingham dataset.csv")
dim(df)
## [1] 11627 39
df$sex <- as.factor(df$sex)
df$smoker <- as.factor(df$smoker)
df$diabetes <- as.factor(df$diabetes)
df$bpmed <- as.factor(df$bpmed)
df$educ <- as.factor(df$educ)
df$stroke <- as.factor(df$stroke)
df$cvd <- as.factor(df$cvd)
df$hypertension <- as.factor(df$hypertension)
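These eight as.factor() calls can be written more compactly with dplyr's across() (tidyverse is already attached); a minimal equivalent sketch:
df <- df %>%
  mutate(across(c(sex, smoker, diabetes, bpmed, educ, stroke, cvd, hypertension),
                as.factor)) # same conversion as the individual calls above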
# Simple logistic regression
m1 <- glm(stroke ~ age, family = binomial, data = df)
summary(m1)
##
## Call:
## glm(formula = stroke ~ age, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.9092 -0.4775 -0.3669 -0.2903 2.8008
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.240795 0.214800 -29.05 <2e-16 ***
## age 0.068778 0.003555 19.35 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 7102.4 on 11626 degrees of freedom
## Residual deviance: 6702.1 on 11625 degrees of freedom
## AIC: 6706.1
##
## Number of Fisher Scoring iterations: 5
logistic.display(m1)
##
## Logistic regression predicting stroke : 1 vs 0
##
## OR(95%CI) P(Wald's test) P(LR-test)
## age (cont. var.) 1.07 (1.06,1.08) < 0.001 < 0.001
##
## Log-likelihood = -3351.0549
## No. of observations = 11627
## AIC value = 6706.1098
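The OR and CI that logistic.display() reports are just the exponentiated coefficient and its Wald interval; as a cross-check sketch:
exp(coef(m1)["age"]) # OR per one-year increase in age, ~1.07
exp(confint.default(m1)["age", ]) # Wald 95% CI, matching the table above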
# Model 2: Model 1 + sex + tot.chol + sysbp + diasbp + glucose + bmi
m2 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi, family = binomial, data = df)
summary(m2)
##
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp +
## glucose + bmi, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.2557 -0.4728 -0.3445 -0.2500 3.0162
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.1143066 0.4262665 -21.382 < 2e-16 ***
## age 0.0593876 0.0044973 13.205 < 2e-16 ***
## sex2 -0.2188370 0.0747738 -2.927 0.003426 **
## tot.chol -0.0015023 0.0008214 -1.829 0.067411 .
## sysbp 0.0132716 0.0022606 5.871 4.33e-09 ***
## diasbp 0.0165213 0.0042840 3.856 0.000115 ***
## glucose 0.0024509 0.0011344 2.161 0.030730 *
## bmi 0.0156037 0.0086435 1.805 0.071037 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 6137.7 on 10020 degrees of freedom
## Residual deviance: 5563.7 on 10013 degrees of freedom
## (1606 observations deleted due to missingness)
## AIC: 5579.7
##
## Number of Fisher Scoring iterations: 6
logistic.display(m2)
##
## Logistic regression predicting stroke : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## age (cont. var.) 1.07 (1.06,1.08) 1.06 (1.05,1.07)
##
## sex: 2 vs 1 0.89 (0.78,1.02) 0.8 (0.69,0.93)
##
## tot.chol (cont. var.) 1.0009 (0.9994,1.0024) 0.9985 (0.9969,1.0001)
##
## sysbp (cont. var.) 1.03 (1.02,1.03) 1.01 (1.01,1.02)
##
## diasbp (cont. var.) 1.04 (1.03,1.04) 1.02 (1.01,1.03)
##
## glucose (cont. var.) 1.0066 (1.0046,1.0086) 1.0025 (1.0002,1.0047)
##
## bmi (cont. var.) 1.05 (1.04,1.07) 1.02 (1,1.03)
##
## P(Wald's test) P(LR-test)
## age (cont. var.) < 0.001 < 0.001
##
## sex: 2 vs 1 0.003 0.003
##
## tot.chol (cont. var.) 0.067 0.066
##
## sysbp (cont. var.) < 0.001 < 0.001
##
## diasbp (cont. var.) < 0.001 < 0.001
##
## glucose (cont. var.) 0.031 0.036
##
## bmi (cont. var.) 0.071 0.073
##
## Log-likelihood = -2781.8252
## No. of observations = 10021
## AIC value = 5579.6504
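Note that m2 was fit on 10,021 complete cases while m1 used all 11,627 rows, so their deviances are not directly comparable. A sketch of a valid likelihood-ratio test refits both models on the same complete cases:
vars <- c("stroke", "age", "sex", "tot.chol", "sysbp", "diasbp", "glucose", "bmi")
cc <- df[complete.cases(df[, vars]), ] # rows with no missing values among the m2 covariates
m1_cc <- glm(stroke ~ age, family = binomial, data = cc)
m2_cc <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi,
             family = binomial, data = cc)
anova(m1_cc, m2_cc, test = "Chisq") # LR test: do the added covariates improve fit?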
# Model 3: Model 2 + educ + smoker + diabetes + hypertension
m3 <- glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi +
educ + smoker + diabetes + hypertension, family = binomial, data = df)
summary(m3)
##
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp +
## glucose + bmi + educ + smoker + diabetes + hypertension,
## family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4213 -0.4725 -0.3371 -0.2302 3.0361
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.043959 0.478029 -18.919 < 2e-16 ***
## age 0.062828 0.004800 13.090 < 2e-16 ***
## sex2 -0.104921 0.077956 -1.346 0.1783
## tot.chol -0.002012 0.000840 -2.395 0.0166 *
## sysbp 0.009417 0.002354 4.001 6.31e-05 ***
## diasbp 0.017406 0.004356 3.996 6.44e-05 ***
## glucose -0.001237 0.001356 -0.912 0.3615
## bmi 0.014258 0.008983 1.587 0.1125
## educ2 -0.094009 0.090442 -1.039 0.2986
## educ3 -0.195882 0.113176 -1.731 0.0835 .
## educ4 -0.176164 0.127799 -1.378 0.1681
## smoker1 0.374200 0.079604 4.701 2.59e-06 ***
## diabetes1 0.860508 0.150958 5.700 1.20e-08 ***
## hypertension1 0.573486 0.130092 4.408 1.04e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 5973.0 on 9760 degrees of freedom
## Residual deviance: 5330.3 on 9747 degrees of freedom
## (1866 observations deleted due to missingness)
## AIC: 5358.3
##
## Number of Fisher Scoring iterations: 6
logistic.display(m3)
##
## Logistic regression predicting stroke : 1 vs 0
##
## crude OR(95%CI) adj. OR(95%CI)
## age (cont. var.) 1.07 (1.06,1.08) 1.06 (1.05,1.07)
##
## sex: 2 vs 1 0.91 (0.79,1.05) 0.9 (0.77,1.05)
##
## tot.chol (cont. var.) 1.0005 (0.999,1.002) 0.998 (0.9963,0.9996)
##
## sysbp (cont. var.) 1.0269 (1.0241,1.0297) 1.0095 (1.0048,1.0141)
##
## diasbp (cont. var.) 1.04 (1.03,1.04) 1.02 (1.01,1.03)
##
## glucose (cont. var.) 1.0067 (1.0047,1.0087) 0.9988 (0.9961,1.0014)
##
## bmi (cont. var.) 1.06 (1.04,1.07) 1.01 (1,1.03)
##
## educ: ref.=1
## 2 0.65 (0.55,0.77) 0.91 (0.76,1.09)
## 3 0.6 (0.49,0.74) 0.82 (0.66,1.03)
## 4 0.63 (0.5,0.8) 0.84 (0.65,1.08)
##
## smoker: 1 vs 0 0.91 (0.79,1.05) 1.45 (1.24,1.7)
##
## diabetes: 1 vs 0 3.55 (2.81,4.49) 2.36 (1.76,3.18)
##
## hypertension: 1 vs 0 3.71 (2.95,4.68) 1.77 (1.38,2.29)
##
## P(Wald's test) P(LR-test)
## age (cont. var.) < 0.001 < 0.001
##
## sex: 2 vs 1 0.178 0.179
##
## tot.chol (cont. var.) 0.017 0.016
##
## sysbp (cont. var.) < 0.001 < 0.001
##
## diasbp (cont. var.) < 0.001 < 0.001
##
## glucose (cont. var.) 0.362 0.357
##
## bmi (cont. var.) 0.112 0.114
##
## educ: ref.=1 0.235
## 2 0.299
## 3 0.083
## 4 0.168
##
## smoker: 1 vs 0 < 0.001 < 0.001
##
## diabetes: 1 vs 0 < 0.001 < 0.001
##
## hypertension: 1 vs 0 < 0.001 < 0.001
##
## Log-likelihood = -2665.1373
## No. of observations = 9761
## AIC value = 5358.2747
suppressMessages(library(GGally))
ggcoef(m1, exponentiate = TRUE, exclude_intercept = TRUE, vline_color = "red",
       errorbar_color = "blue", errorbar_height = 0.05) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))
ggcoef(m2, exponentiate = TRUE, exclude_intercept = TRUE, vline_color = "red",
       errorbar_color = "blue", errorbar_height = 0.04) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))
ggcoef(m3, exponentiate = TRUE, exclude_intercept = TRUE, vline_color = "red",
       errorbar_color = "blue", errorbar_height = 0.10) +
  ggtitle("ORs and 95% CI for having stroke") +
  xlab("ORs (95% CI)") +
  ylab("Predictor Variables") +
  theme(plot.title = element_text(hjust = 0.5))
library(Epi)
ROC(form = stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi +
educ + smoker + diabetes + hypertension, data = df, plot = "ROC")
The AUC is the area under this ROC curve; it ranges from 0 to 1, with 0.5 corresponding to random guessing. A common rule of thumb for interpreting it:
AUC 0.5-0.6: weak predictive performance. The model does better than chance, but its discriminatory power is limited.
AUC 0.6-0.7: moderate predictive performance. The model shows some ability to separate positive and negative cases, with clear room for improvement.
AUC 0.7-0.8: good predictive performance. The model discriminates between positive and negative cases with a reasonable margin.
AUC 0.8-0.9: very good predictive performance. The model makes accurate predictions with a high degree of discriminatory power.
AUC 0.9-1: excellent predictive performance. The model separates positive and negative cases with a very large margin.
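Epi::ROC() prints the AUC on the plot itself; to get the number programmatically, one option is the pROC package (a sketch, assuming pROC is installed):
library(pROC)
prob_m3 <- predict(m3, type = "response") # fitted probabilities on the rows m3 actually used
roc_m3 <- roc(m3$model$stroke, prob_m3) # the model frame holds the same rows as the fit
auc(roc_m3) # numeric AUC for the full model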
varImp(m3, scale = FALSE)
## Overall
## age 13.089668
## sex2 1.345902
## tot.chol 2.395128
## sysbp 4.000939
## diasbp 3.995962
## glucose 0.912440
## bmi 1.587195
## educ2 1.039436
## educ3 1.730778
## educ4 1.378443
## smoker1 4.700788
## diabetes1 5.700322
## hypertension1 4.408311
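These importance scores are the absolute z statistics from summary(m3) (compare the z value column above). A quick sketch to visualize them:
vi <- varImp(m3, scale = FALSE)
vi$predictor <- rownames(vi)
ggplot(vi, aes(x = Overall, y = reorder(predictor, Overall))) +
  geom_col() +
  labs(x = "|z| statistic", y = NULL, title = "Predictor importance, model m3")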
p1 <- ggplot(df, aes(x = age, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.1) # alpha sets fill transparency; low values keep overlapping densities readable
p1
p2 <- ggplot(df, aes(x = sysbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.5)
p2
p3 <- ggplot(df, aes(x = diasbp, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.9)
p3
p4 <- ggplot(df, aes(x = tot.chol, fill = stroke, color = stroke)) +
  geom_density(alpha = 0.3)
p4
## Warning: Removed 409 rows containing non-finite values (`stat_density()`).
gridExtra::grid.arrange(p1, p2, p3, nrow = 3)
gridExtra::grid.arrange(p1, p2, p3, ncol = 3)
library(caret)
set.seed(42) # hypothetical seed; any fixed value makes the split reproducible
train_idx <- createDataPartition(df$stroke, p = 0.7, list = FALSE) # stratified 70/30 split
train <- df[train_idx, ]
test <- df[-train_idx, ]
model_train = glm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi +
educ + smoker + diabetes + hypertension, family = binomial, data = train)
summary(model_train)
##
## Call:
## glm(formula = stroke ~ age + sex + tot.chol + sysbp + diasbp +
## glucose + bmi + educ + smoker + diabetes + hypertension,
## family = binomial, data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4006 -0.4773 -0.3380 -0.2285 3.0095
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.0386261 0.5730294 -15.773 < 2e-16 ***
## age 0.0652052 0.0057384 11.363 < 2e-16 ***
## sex2 -0.1181606 0.0927831 -1.274 0.202836
## tot.chol -0.0024481 0.0009947 -2.461 0.013852 *
## sysbp 0.0082090 0.0028276 2.903 0.003694 **
## diasbp 0.0190762 0.0052085 3.662 0.000250 ***
## glucose -0.0005289 0.0016948 -0.312 0.754994
## bmi 0.0106454 0.0108306 0.983 0.325657
## educ2 -0.0100678 0.1068246 -0.094 0.924914
## educ3 -0.1748099 0.1351494 -1.293 0.195853
## educ4 -0.2283044 0.1536197 -1.486 0.137235
## smoker1 0.3354771 0.0950815 3.528 0.000418 ***
## diabetes1 0.6679007 0.1868044 3.575 0.000350 ***
## hypertension1 0.6306131 0.1581472 3.988 6.68e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 4183.6 on 6823 degrees of freedom
## Residual deviance: 3741.3 on 6810 degrees of freedom
## (1316 observations deleted due to missingness)
## AIC: 3769.3
##
## Number of Fisher Scoring iterations: 6
# Make predictions on the test data
prob <- predict(model_train, newdata = test, type = "response")
predictions <- ifelse(prob > 0.5, 1, 0)
# Confusion matrix
confusionMatrix(factor(predictions), test$stroke, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2670 263
## 1 0 4
##
## Accuracy : 0.9105
## 95% CI : (0.8995, 0.9205)
## No Information Rate : 0.9091
## P-Value [Acc > NIR] : 0.4144
##
## Kappa : 0.0269
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.014981
## Specificity : 1.000000
## Pos Pred Value : 1.000000
## Neg Pred Value : 0.910331
## Prevalence : 0.090909
## Detection Rate : 0.001362
## Detection Prevalence : 0.001362
## Balanced Accuracy : 0.507491
##
## 'Positive' Class : 1
##
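With only ~9% stroke cases, the default 0.5 cutoff almost never predicts the positive class (sensitivity ~0.015 above). A common workaround, sketched here, is to lower the cutoff, e.g. to the observed prevalence:
cutoff <- mean(train$stroke == "1", na.rm = TRUE) # observed prevalence, roughly 0.09
predictions_lo <- ifelse(prob > cutoff, 1, 0)
confusionMatrix(factor(predictions_lo, levels = c(0, 1)), test$stroke,
                positive = "1") # more true positives, at the cost of false positives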
##############
library(rms)
ddist <- datadist(df) # rms needs a datadist object registered via options()
options(datadist = "ddist")
model_train2 <- lrm(stroke ~ age + sex + tot.chol + sysbp + diasbp + glucose + bmi +
                      educ + smoker + diabetes + hypertension, data = train)
prob2 <- predict(model_train2, test) ## NB: predict.lrm returns the linear predictor (log-odds) by default
predictions2 <- ifelse(prob2 > 0.5, 1, 0) ## so this cutoff is 0.5 on the log-odds scale (probability ~0.62)
confusionMatrix(factor(predictions2), test$stroke, positive = "1") ## compare the model's output to the actual labels
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 2670 266
## 1 0 1
##
## Accuracy : 0.9094
## 95% CI : (0.8985, 0.9196)
## No Information Rate : 0.9091
## P-Value [Acc > NIR] : 0.4907
##
## Kappa : 0.0068
##
## Mcnemar's Test P-Value : <2e-16
##
## Sensitivity : 0.0037453
## Specificity : 1.0000000
## Pos Pred Value : 1.0000000
## Neg Pred Value : 0.9094005
## Prevalence : 0.0909091
## Detection Rate : 0.0003405
## Detection Prevalence : 0.0003405
## Balanced Accuracy : 0.5018727
##
## 'Positive' Class : 1
##
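A caveat on the block above: because predict() on an lrm fit returns the linear predictor, the 0.5 cutoff was effectively a probability cutoff of about 0.62 (plogis(0.5)), which is why this matrix finds even fewer positives than the glm version. To threshold genuine probabilities, request fitted values explicitly (a sketch):
prob2_fit <- predict(model_train2, test, type = "fitted") # probabilities, i.e. plogis(linear predictor)
predictions2_fit <- ifelse(prob2_fit > 0.5, 1, 0)
confusionMatrix(factor(predictions2_fit, levels = c(0, 1)), test$stroke, positive = "1")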
m3_lrm <- lrm(stroke ~ age + sex + sysbp + diasbp +
                smoker + diabetes + hypertension, data = df) # reduced model; distinct name keeps the glm m3 intact
nom <- nomogram(m3_lrm, fun = function(x) 1 / (1 + exp(-x)),
                fun.at = c(0.001, 0.01, 0.05, seq(0.1, 0.9, by = 0.1), 0.95, 0.99, 0.999),
                funlabel = "Probability of Stroke")
plot(nom, cex.axis = 0.6, lmgp = 0.2)
Explanation:
The fun argument specifies the function used to transform the model's linear predictor (the total-points scale of the nomogram, in log-odds) into predicted probabilities. Here it is the inverse-logit (sigmoid) function 1/(1+exp(-x)), which maps log-odds to probabilities between 0 and 1.
The fun.at argument specifies which probability values are labeled on the nomogram's probability axis; here it marks values from 0.001 up to 0.999.
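A nomogram reading can be sanity-checked against the model itself; a sketch for one hypothetical patient (all covariate values invented for illustration):
new_patient <- data.frame(age = 65, sex = factor("2", levels = levels(df$sex)),
                          sysbp = 160, diasbp = 95,
                          smoker = factor("1", levels = levels(df$smoker)),
                          diabetes = factor("0", levels = levels(df$diabetes)),
                          hypertension = factor("1", levels = levels(df$hypertension)))
predict(m3_lrm, new_patient, type = "fitted") # predicted probability of stroke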