load packages:
library(gmodels)
## Warning: package 'gmodels' was built under R version 4.0.5
library(mctest)
library(DescTools)
## Warning: package 'DescTools' was built under R version 4.0.5
## Registered S3 method overwritten by 'DescTools':
## method from
## reorder.factor gdata
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.0.5
## ResourceSelection 0.3-5 2019-07-22
load data:
filename <- "DMdata.csv"
fileURL <- "https://d3c33hcgiwev3.cloudfront.net/NVJ7Cw98Eem6Pgo4-YwqLg_35bb00c00f7c11e9903947c521ebe81a_final-diabetes-data-for-R-_csv_-_2_.csv?Expires=1626307200&Signature=f2iyh3U7CoX33SC~FZEIBHtEogsSVacsuCP7jUlymcUid7vKzHYi~rq~RlDsOpAID4g8ufWH~f4RdiHXQFqmf31gJJuh6-kFEQjy55wCWF2bRFHNVHlyUd-5WmKkM44ynS9DbQGZMB5jitjlsP~4N9sV-ieS7yGkBKA2ZF8nMMM_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A"
if (!file.exists(filename)){
download.file(fileURL, filename)
}
dmdata <- read.csv("DMdata.csv")
define variables:
chol <- dmdata[,"chol"]
gender <- as.factor(dmdata[,"gender"])
dm <- as.factor(dmdata[,"dm"])
dm2 <- factor(dm, exclude=NULL)
hdl <- dmdata[,"hdl"]
insurance <- as.factor(dmdata[,"insurance"])
smoking <- as.factor(dmdata[,"smoking"])
location <- as.factor(dmdata[,"location"])
frame <- as.factor(dmdata[,"frame"])
sbp <- dmdata[,"bp.1s"]
dbp <- dmdata[,"bp.1d"]
age <- dmdata[,"age"]
#bmi:
height <- dmdata[,'height']
weight <- dmdata[,'weight']
height.si <- height*0.0254
weight.si <- weight*0.453592
bmi <- weight.si/height.si^2
bmi_categorised <- ifelse(bmi < 18.5, "underweight",
ifelse(bmi >= 18.5 & bmi <= 25, "normal",
ifelse(bmi > 25 & bmi <= 30, "overweight",
ifelse(bmi > 30, "obese", NA))))
table(bmi_categorised, exclude = NULL)
## bmi_categorised
## normal obese overweight underweight <NA>
## 113 152 123 9 6
dm_by_bmi_category <- table(bmi_categorised, dm2, exclude = NULL)
dm_by_bmi_category
## dm2
## bmi_categorised no yes <NA>
## normal 100 9 4
## obese 118 29 5
## overweight 99 20 4
## underweight 9 0 0
## <NA> 4 2 0
#age categorisation:
age_grouped <- ifelse(age < 45, "under 45",
ifelse(age >= 45 & age < 65, "45 - 64",
ifelse(age >= 65 & age < 75, "65 - 74",
ifelse(age >= 75, "75 or over", NA))))
# cross tabulate age by gender
age_group_by_gender <- table(age_grouped, gender, exclude = NULL)
Explore data:
#age
summary(age)
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 19.00 34.00 45.00 46.85 60.00 92.00
hist(age)

d <- density(age)
plot(d,main = "")

age_group_by_gender
## gender
## age_grouped female male
## 45 - 64 75 64
## 65 - 74 21 20
## 75 or over 12 11
## under 45 126 74
round(100 * prop.table(age_group_by_gender, margin = 2), digits = 1)
## gender
## age_grouped female male
## 45 - 64 32.1 37.9
## 65 - 74 9.0 11.8
## 75 or over 5.1 6.5
## under 45 53.8 43.8
#bmi
summary(bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.20 24.13 27.80 28.79 32.24 55.79 6
hist(bmi)

round(100 * prop.table(dm_by_bmi_category, margin = 1), digits = 1)
## dm2
## bmi_categorised no yes <NA>
## normal 88.5 8.0 3.5
## obese 77.6 19.1 3.3
## overweight 80.5 16.3 3.3
## underweight 100.0 0.0 0.0
## <NA> 66.7 33.3 0.0
#chol
summary(chol)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 78.0 179.0 204.0 207.8 230.0 443.0 1
hist(chol)

#hdl
summary(hdl)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 12.00 38.00 46.00 50.45 59.00 120.00 1
hist(hdl)

#gender
table_gender <- table(gender)
table(gender)
## gender
## female male
## 234 169
round(100 * prop.table(table_gender), digits = 1) #show percentage
## gender
## female male
## 58.1 41.9
Some Simple Logistic Regressions:
#null
mnull <- glm(dm ~ 1, family = binomial(link = logit))
summary(mnull)
##
## Call:
## glm(formula = dm ~ 1, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.578 -0.578 -0.578 -0.578 1.935
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7047 0.1403 -12.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.87 on 389 degrees of freedom
## Residual deviance: 334.87 on 389 degrees of freedom
## (13 observations deleted due to missingness)
## AIC: 336.87
##
## Number of Fisher Scoring iterations: 3
#age
m_age <- glm(dm ~ age, family=binomial (link=logit))
summary(m_age)
##
## Call:
## glm(formula = dm ~ age, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.3612 -0.5963 -0.4199 -0.3056 2.4848
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.404530 0.542828 -8.114 4.90e-16 ***
## age 0.052465 0.009388 5.589 2.29e-08 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.87 on 389 degrees of freedom
## Residual deviance: 299.41 on 388 degrees of freedom
## (13 observations deleted due to missingness)
## AIC: 303.41
##
## Number of Fisher Scoring iterations: 5
## The odds ratio for a year increase in age:
exp(m_age$coefficients["age"])
## age
## 1.053866
#gender
m_gen <- glm(dm ~ gender, family=binomial (link=logit))
summary(m_gen)
##
## Call:
## glm(formula = dm ~ gender, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5915 -0.5915 -0.5683 -0.5683 1.9509
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74150 0.18592 -9.367 <2e-16 ***
## gendermale 0.08694 0.28352 0.307 0.759
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.87 on 389 degrees of freedom
## Residual deviance: 334.78 on 388 degrees of freedom
## (13 observations deleted due to missingness)
## AIC: 338.78
##
## Number of Fisher Scoring iterations: 4
## odds of having diabetes when male:
exp(m_gen$coefficients["gendermale"])
## gendermale
## 1.09083
#chol
m_chol <- glm(dm ~ chol, family=binomial (link=logit))
summary(m_chol)
##
## Call:
## glm(formula = dm ~ chol, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.1002 -0.5946 -0.5095 -0.4258 2.4106
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -4.169970 0.682741 -6.108 1.01e-09 ***
## chol 0.011486 0.003012 3.813 0.000137 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.54 on 388 degrees of freedom
## Residual deviance: 319.62 on 387 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 323.62
##
## Number of Fisher Scoring iterations: 4
## The odds ratio for a unit increase in chol:
exp(m_chol$coefficients["chol"])
## chol
## 1.011552
#hdl
m_hdl <- glm(dm ~ hdl, family=binomial (link=logit))
summary(m_hdl)
##
## Call:
## glm(formula = dm ~ hdl, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.8443 -0.6245 -0.5602 -0.4535 2.5703
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -0.563695 0.474331 -1.188 0.2347
## hdl -0.023704 0.009847 -2.407 0.0161 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.54 on 388 degrees of freedom
## Residual deviance: 327.91 on 387 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 331.91
##
## Number of Fisher Scoring iterations: 5
## The odds ratio for a unit increase in hdl:
exp(m_hdl$coefficients["hdl"])
## hdl
## 0.9765751
#bmi
m_bmi <- glm(dm ~ bmi, family=binomial (link=logit))
summary(m_bmi)
##
## Call:
## glm(formula = dm ~ bmi, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.0562 -0.5877 -0.5247 -0.4621 2.1349
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.35108 0.63279 -5.296 1.19e-07 ***
## bmi 0.05484 0.02023 2.710 0.00673 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 326.02 on 383 degrees of freedom
## Residual deviance: 318.86 on 382 degrees of freedom
## (19 observations deleted due to missingness)
## AIC: 322.86
##
## Number of Fisher Scoring iterations: 4
## The odds ratio for a unit increase in bmi:
exp(m_bmi$coefficients["bmi"])
## bmi
## 1.056369
Check Linear Relationships between predictors and log odds of the outcome (having diabetes):
##gender:
### cross tabulation
dm_by_gender <- table(gender, dm)
### proportion of diabetes status by gender
dm_by_gender_prop <- prop.table(dm_by_gender, margin = 1)
### calculate the odds of having diabetes by gender
odds_gender <- dm_by_gender_prop[, "yes"]/dm_by_gender_prop[, "no"]
### calculate the log odds
logodds_gender <- log(odds_gender)
### plot the log odds of having diabetes by gender
dotchart(logodds_gender)

##bmi:
dm_by_bmi <- table(bmi, dm)
freq_table_bmi <- prop.table(dm_by_bmi, margin = 1)
odds_bmi <- freq_table_bmi[, "yes"]/freq_table_bmi[, "no"]
logodds_bmi <- log(odds_bmi)
plot(rownames(freq_table_bmi), logodds_bmi)

## categorised bmi:
bmi_categorised <- factor(bmi_categorised, levels = c("underweight", "normal", "overweight","obese"))
dm_by_bmi_categorised <- table(bmi_categorised, dm)
dm_by_bmi_categorised_prop <- prop.table(dm_by_bmi_categorised, margin = 1)
odds_bmi_categorised <- dm_by_bmi_categorised_prop[, "yes"]/dm_by_bmi_categorised_prop[, "no"]
logodds_bmi_categorised <- log(odds_bmi_categorised)
dotchart(logodds_bmi_categorised)

## chol:
dm_by_chol <- table(chol, dm)
freq_table_chol <- prop.table(dm_by_chol, margin = 1)
odds_chol <- freq_table_chol[, "yes"]/freq_table_chol[, "no"]
logodds_chol <- log(odds_chol)
plot(rownames(freq_table_chol), logodds_chol)

## categorising chol into an ordinal variable
# https://www.medicalnewstoday.com/articles/315900.php
chol_categorised <- ifelse(chol < 200, "healthy",
ifelse(chol < 240, "borderline high",
ifelse(chol >= 240, "high", NA)))
chol_categorised <- factor(chol_categorised, levels = c("healthy", "borderline high", "high"))
dm_by_chol_categorised <- table(chol_categorised, dm)
dm_by_chol_categorised_prop <- prop.table(dm_by_chol_categorised, margin = 1)
odds_chol_categorised <- dm_by_chol_categorised_prop[, "yes"]/dm_by_chol_categorised_prop[, "no"]
logodds_chol_categorised <- log(odds_chol_categorised)
dotchart(logodds_chol_categorised)

## hdl:
dm_by_hdl <- table(hdl, dm)
freq_table_hdl <- prop.table(dm_by_hdl, margin = 1)
odds_hdl <- freq_table_hdl[, "yes"]/freq_table_hdl[, "no"]
logodds_hdl <- log(odds_hdl)
plot(rownames(freq_table_hdl), logodds_hdl)

# categorising hdl into an ordinal variable
#https://www.healthline.com/health/high-cholesterol/levels-by-age#adults
hdl_categorised <- ifelse(hdl < 40, "low",
ifelse(hdl < 60, "acceptable",
ifelse(hdl >= 60, "good", NA)))
hdl_categorised <- factor(hdl_categorised, levels = c("low", "acceptable", "good"))
dm_by_hdl_categorised <- table(hdl_categorised, dm)
dm_by_hdl_categorised_prop <- prop.table(dm_by_hdl_categorised, margin = 1)
odds_hdl_categorised <- dm_by_hdl_categorised_prop[, "yes"]/dm_by_hdl_categorised_prop[, "no"]
logodds_hdl_categorised <- log(odds_hdl_categorised)
dotchart(logodds_hdl_categorised)

Relationship Between Predictors:
cor.test(chol, hdl, method = "pearson")
##
## Pearson's product-moment correlation
##
## data: chol and hdl
## t = 3.7983, df = 400, p-value = 0.0001683
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.09042379 0.27929136
## sample estimates:
## cor
## 0.1865809
#(corrolated (low p value) but weak corrolation, we can include both in our model)
Multiple Logistic Regression:
mm <- glm(dm ~ age + gender + bmi, family = binomial(link = "logit"))
summary(mm)
##
## Call:
## glm(formula = dm ~ age + gender + bmi, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6843 -0.5763 -0.3885 -0.2575 2.6991
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.892669 1.027388 -6.709 1.96e-11 ***
## age 0.055454 0.009884 5.611 2.02e-08 ***
## gendermale 0.244852 0.322817 0.758 0.44816
## bmi 0.073879 0.023310 3.169 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 326.02 on 383 degrees of freedom
## Residual deviance: 281.62 on 380 degrees of freedom
## (19 observations deleted due to missingness)
## AIC: 289.62
##
## Number of Fisher Scoring iterations: 5
exp(confint(mm))
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) 0.0001226733 0.006986056
## age 1.0373532463 1.078493131
## gendermale 0.6762611041 2.409679250
## bmi 1.0287121260 1.127738696
## The odds ratio for this model:
exp(mm$coefficients)
## (Intercept) age gendermale bmi
## 0.001015201 1.057020598 1.277432283 1.076676391
## age and bmi are significant factors for diabetes risk, but there is no evidence that gender affects the risk.
#calculate McFadden's R-square
R2 <- 1-logLik(mm)/logLik(mnull)
R2
## 'log Lik.' 0.1590084 (df=4)
#calculate c statistic:
Cstat(mm)
## [1] 0.7787709
#perform Hosmer-Lemeshow test
HL <- hoslem.test(x = mm$y, y = fitted(mm), g = 10)
HL
##
## Hosmer and Lemeshow goodness of fit (GOF) test
##
## data: mm$y, fitted(mm)
## X-squared = 15.826, df = 8, p-value = 0.04494
# plot the observed vs expected number of cases for each of the 10 groups
plot(HL$observed[,"y1"], HL$expected[,"yhat1"])

# condensed model:
mm2 <- glm(dm ~ age + gender + bmi + chol + hdl + sbp + dbp + location + frame + insurance + smoking , family = binomial(link = "logit"))
summary(mm)
##
## Call:
## glm(formula = dm ~ age + gender + bmi, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6843 -0.5763 -0.3885 -0.2575 2.6991
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.892669 1.027388 -6.709 1.96e-11 ***
## age 0.055454 0.009884 5.611 2.02e-08 ***
## gendermale 0.244852 0.322817 0.758 0.44816
## bmi 0.073879 0.023310 3.169 0.00153 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 326.02 on 383 degrees of freedom
## Residual deviance: 281.62 on 380 degrees of freedom
## (19 observations deleted due to missingness)
## AIC: 289.62
##
## Number of Fisher Scoring iterations: 5
# test significance of each variable
anova(mm2, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: dm
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 378 324.38
## age 1 33.737 377 290.64 6.309e-09 ***
## gender 1 0.016 376 290.62 0.900103
## bmi 1 9.893 375 280.73 0.001659 **
## chol 1 8.701 374 272.03 0.003180 **
## hdl 1 7.794 373 264.24 0.005241 **
## sbp 1 0.579 372 263.66 0.446733
## dbp 1 0.005 371 263.65 0.944325
## location 1 0.456 370 263.20 0.499712
## frame 3 0.404 367 262.79 0.939510
## insurance 2 1.647 365 261.15 0.438976
## smoking 2 0.213 363 260.93 0.899082
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# cor between age and systolic blood pressure
cor.test(sbp, age) # extremely significant
##
## Pearson's product-moment correlation
##
## data: sbp and age
## t = 9.8342, df = 396, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3604404 0.5187477
## sample estimates:
## cor
## 0.4430412
# model without age:
mm3 <- glm(dm ~ gender + bmi + chol + hdl + sbp + dbp + location + frame + insurance + smoking , family = binomial(link = "logit"))
anova(mm3, test = "Chisq")
## Analysis of Deviance Table
##
## Model: binomial, link: logit
##
## Response: dm
##
## Terms added sequentially (first to last)
##
##
## Df Deviance Resid. Df Resid. Dev Pr(>Chi)
## NULL 378 324.38
## gender 1 0.0562 377 324.32 0.812532
## bmi 1 7.9434 376 316.38 0.004826 **
## chol 1 16.1365 375 300.24 5.894e-05 ***
## hdl 1 6.6824 374 293.56 0.009737 **
## sbp 1 8.5647 373 284.99 0.003427 **
## dbp 1 2.6605 372 282.33 0.102867
## location 1 0.7944 371 281.54 0.372763
## frame 3 0.6076 368 280.93 0.894682
## insurance 2 0.6933 366 280.24 0.707066
## smoking 2 0.8738 364 279.36 0.646024
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## now systolic BP is significant!
# backward elimination:
summary(mm3)
##
## Call:
## glm(formula = dm ~ gender + bmi + chol + hdl + sbp + dbp + location +
## frame + insurance + smoking, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7267 -0.5881 -0.4156 -0.2756 2.4311
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.676068 1.835830 -3.092 0.001989 **
## gendermale 0.129166 0.362884 0.356 0.721883
## bmi 0.037725 0.027704 1.362 0.173278
## chol 0.012871 0.003523 3.654 0.000259 ***
## hdl -0.025611 0.010805 -2.370 0.017768 *
## sbp 0.024860 0.008092 3.072 0.002125 **
## dbp -0.019343 0.015048 -1.285 0.198648
## locationLouisa -0.263597 0.318295 -0.828 0.407584
## framelarge 0.052141 0.892634 0.058 0.953420
## framemedium -0.216548 0.890252 -0.243 0.807817
## framesmall -0.045113 0.947902 -0.048 0.962041
## insurance1 -0.255968 0.380091 -0.673 0.500669
## insurance2 -0.328115 0.386914 -0.848 0.396422
## smoking2 -0.329779 0.350842 -0.940 0.347236
## smoking3 -0.204765 0.494104 -0.414 0.678569
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 279.36 on 364 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 309.36
##
## Number of Fisher Scoring iterations: 5
##remove frame as it has the highest p value:
mm33 <- glm(dm ~ gender + bmi + chol + hdl + sbp + dbp + location + insurance + smoking , family = binomial(link = "logit"))
summary(mm33)
##
## Call:
## glm(formula = dm ~ gender + bmi + chol + hdl + sbp + dbp + location +
## insurance + smoking, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.7895 -0.5849 -0.4151 -0.2748 2.3992
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.830798 1.505890 -3.872 0.000108 ***
## gendermale 0.197591 0.338090 0.584 0.558928
## bmi 0.042416 0.024977 1.698 0.089469 .
## chol 0.012660 0.003501 3.617 0.000299 ***
## hdl -0.025298 0.010686 -2.367 0.017914 *
## sbp 0.025475 0.007918 3.217 0.001294 **
## dbp -0.020917 0.014850 -1.409 0.158974
## locationLouisa -0.298118 0.312322 -0.955 0.339820
## insurance1 -0.254163 0.377454 -0.673 0.500717
## insurance2 -0.320476 0.386498 -0.829 0.407003
## smoking2 -0.347780 0.348633 -0.998 0.318496
## smoking3 -0.228472 0.491680 -0.465 0.642164
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 279.90 on 367 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 303.9
##
## Number of Fisher Scoring iterations: 5
##remove gender:
mm333 <- glm(dm ~ bmi + chol + hdl + sbp + dbp + location + insurance + smoking , family = binomial(link = "logit"))
summary(mm333)
##
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp + location +
## insurance + smoking, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.8042 -0.5894 -0.4218 -0.2771 2.3719
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.577511 1.433297 -3.891 9.97e-05 ***
## bmi 0.037421 0.023398 1.599 0.109743
## chol 0.012713 0.003507 3.625 0.000289 ***
## hdl -0.026752 0.010455 -2.559 0.010501 *
## sbp 0.024955 0.007847 3.180 0.001471 **
## dbp -0.019556 0.014643 -1.336 0.181710
## locationLouisa -0.298173 0.312273 -0.955 0.339655
## insurance1 -0.262997 0.376939 -0.698 0.485354
## insurance2 -0.336179 0.385132 -0.873 0.382721
## smoking2 -0.340588 0.348362 -0.978 0.328230
## smoking3 -0.206453 0.489167 -0.422 0.672988
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 280.24 on 368 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 302.24
##
## Number of Fisher Scoring iterations: 5
##remove smoking:
mm3333 <- glm(dm ~ bmi + chol + hdl + sbp + dbp + location + insurance , family = binomial(link = "logit"))
summary(mm3333)
##
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp + location +
## insurance, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6815 -0.5820 -0.4215 -0.2938 2.4488
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.719167 1.424424 -4.015 5.94e-05 ***
## bmi 0.037048 0.023280 1.591 0.111520
## chol 0.012715 0.003506 3.627 0.000287 ***
## hdl -0.026605 0.010506 -2.532 0.011329 *
## sbp 0.024341 0.007722 3.152 0.001621 **
## dbp -0.019623 0.014609 -1.343 0.179219
## locationLouisa -0.281530 0.310177 -0.908 0.364067
## insurance1 -0.244305 0.375255 -0.651 0.515022
## insurance2 -0.312822 0.384587 -0.813 0.415991
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 281.19 on 370 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 299.19
##
## Number of Fisher Scoring iterations: 5
##remove insurance
mm33333 <- glm(dm ~ bmi + chol + hdl + sbp + dbp + location , family = binomial(link = "logit"))
summary(mm33333)
##
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp + location, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6031 -0.5905 -0.4209 -0.2986 2.4435
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.873089 1.403795 -4.184 2.87e-05 ***
## bmi 0.038040 0.023253 1.636 0.101864
## chol 0.012955 0.003486 3.717 0.000202 ***
## hdl -0.026586 0.010491 -2.534 0.011271 *
## sbp 0.024556 0.007649 3.211 0.001325 **
## dbp -0.021413 0.014430 -1.484 0.137831
## locationLouisa -0.275159 0.308014 -0.893 0.371677
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 281.92 on 372 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 295.92
##
## Number of Fisher Scoring iterations: 5
##remove location
mmf <- glm(dm ~ bmi + chol + hdl + sbp + dbp , family = binomial(link = "logit"))
summary(mmf)
##
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp, family = binomial(link = "logit"))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.6686 -0.5932 -0.4237 -0.2890 2.4963
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.961353 1.403370 -4.248 2.16e-05 ***
## bmi 0.038721 0.023238 1.666 0.095657 .
## chol 0.012763 0.003459 3.690 0.000224 ***
## hdl -0.026745 0.010498 -2.548 0.010848 *
## sbp 0.024917 0.007649 3.258 0.001123 **
## dbp -0.022280 0.014391 -1.548 0.121578
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 282.72 on 373 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 294.72
##
## Number of Fisher Scoring iterations: 5
## The odds ratio for this model:
exp(mmf$coefficients)
## (Intercept) bmi chol hdl sbp dbp
## 0.002576424 1.039480213 1.012844829 0.973609765 1.025230246 0.977966217
#calculate McFadden's R-square
R22 <- 1-logLik(mmf)/logLik(mnull)
R22
## 'log Lik.' 0.155731 (df=6)
#calculate c statistic:
Cstat(mmf)
## [1] 0.7502417