load packages:

library(gmodels)

## Warning: package 'gmodels' was built under R version 4.0.5

library(mctest)
library(DescTools)

## Warning: package 'DescTools' was built under R version 4.0.5

## Registered S3 method overwritten by 'DescTools':
##   method         from 
##   reorder.factor gdata

library(ResourceSelection)

## Warning: package 'ResourceSelection' was built under R version 4.0.5

## ResourceSelection 0.3-5   2019-07-22

load data:

filename <- "DMdata.csv"
fileURL <- "https://d3c33hcgiwev3.cloudfront.net/NVJ7Cw98Eem6Pgo4-YwqLg_35bb00c00f7c11e9903947c521ebe81a_final-diabetes-data-for-R-_csv_-_2_.csv?Expires=1626307200&Signature=f2iyh3U7CoX33SC~FZEIBHtEogsSVacsuCP7jUlymcUid7vKzHYi~rq~RlDsOpAID4g8ufWH~f4RdiHXQFqmf31gJJuh6-kFEQjy55wCWF2bRFHNVHlyUd-5WmKkM44ynS9DbQGZMB5jitjlsP~4N9sV-ieS7yGkBKA2ZF8nMMM_&Key-Pair-Id=APKAJLTNE6QMUY6HBC5A"
if (!file.exists(filename)){
download.file(fileURL, filename)
}

dmdata <- read.csv("DMdata.csv")

define variables:

chol <- dmdata[,"chol"]
gender <- as.factor(dmdata[,"gender"]) 
dm <- as.factor(dmdata[,"dm"])
dm2 <- factor(dm, exclude=NULL)
hdl <- dmdata[,"hdl"]
insurance <- as.factor(dmdata[,"insurance"])
smoking <- as.factor(dmdata[,"smoking"])
location <- as.factor(dmdata[,"location"]) 
frame <- as.factor(dmdata[,"frame"]) 
sbp <- dmdata[,"bp.1s"] 
dbp <- dmdata[,"bp.1d"]
age <- dmdata[,"age"]
#bmi:
height <- dmdata[,'height']
weight <- dmdata[,'weight']
height.si <- height*0.0254
weight.si <- weight*0.453592
bmi <- weight.si/height.si^2
bmi_categorised <- ifelse(bmi < 18.5, "underweight", 
                          ifelse(bmi >= 18.5 & bmi <= 25, "normal", 
                                 ifelse(bmi > 25 & bmi <= 30, "overweight", 
                                        ifelse(bmi > 30, "obese", NA)))) 
table(bmi_categorised, exclude = NULL)

## bmi_categorised
##      normal       obese  overweight underweight        <NA> 
##         113         152         123           9           6

dm_by_bmi_category <- table(bmi_categorised, dm2, exclude = NULL)
dm_by_bmi_category

##                dm2
## bmi_categorised  no yes <NA>
##     normal      100   9    4
##     obese       118  29    5
##     overweight   99  20    4
##     underweight   9   0    0
##     <NA>          4   2    0

#age categorisation:
age_grouped <- ifelse(age < 45, "under 45", 
                      ifelse(age >= 45 & age < 65, "45 - 64",  
                             ifelse(age >= 65 & age < 75, "65 - 74",  
                                    ifelse(age >= 75, "75 or over", NA)))) 

# cross tabulate age by gender 
age_group_by_gender <- table(age_grouped, gender, exclude = NULL)

Explore data:

#age
summary(age)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   19.00   34.00   45.00   46.85   60.00   92.00

hist(age)

d <- density(age) 
plot(d,main = "")

age_group_by_gender

##             gender
## age_grouped  female male
##   45 - 64        75   64
##   65 - 74        21   20
##   75 or over     12   11
##   under 45      126   74

round(100 * prop.table(age_group_by_gender, margin = 2), digits = 1)

##             gender
## age_grouped  female male
##   45 - 64      32.1 37.9
##   65 - 74       9.0 11.8
##   75 or over    5.1  6.5
##   under 45     53.8 43.8

#bmi
summary(bmi)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.20   24.13   27.80   28.79   32.24   55.79       6

hist(bmi)

round(100 * prop.table(dm_by_bmi_category, margin = 1), digits = 1)

##                dm2
## bmi_categorised    no   yes  <NA>
##     normal       88.5   8.0   3.5
##     obese        77.6  19.1   3.3
##     overweight   80.5  16.3   3.3
##     underweight 100.0   0.0   0.0
##     <NA>         66.7  33.3   0.0

#chol
summary(chol)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    78.0   179.0   204.0   207.8   230.0   443.0       1

hist(chol)

#hdl
summary(hdl)

##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   12.00   38.00   46.00   50.45   59.00  120.00       1

hist(hdl)

#gender
table_gender <- table(gender)
table(gender)

## gender
## female   male 
##    234    169

round(100 * prop.table(table_gender), digits = 1) #show percentage

## gender
## female   male 
##   58.1   41.9

Some Simple Logistic Regressions:

#null
mnull <- glm(dm ~ 1, family = binomial(link = logit)) 
summary(mnull)

## 
## Call:
## glm(formula = dm ~ 1, family = binomial(link = logit))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.578  -0.578  -0.578  -0.578   1.935  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.7047     0.1403  -12.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.87  on 389  degrees of freedom
## Residual deviance: 334.87  on 389  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 336.87
## 
## Number of Fisher Scoring iterations: 3

#age
m_age <- glm(dm ~ age, family=binomial (link=logit))
summary(m_age)

## 
## Call:
## glm(formula = dm ~ age, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.3612  -0.5963  -0.4199  -0.3056   2.4848  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.404530   0.542828  -8.114 4.90e-16 ***
## age          0.052465   0.009388   5.589 2.29e-08 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.87  on 389  degrees of freedom
## Residual deviance: 299.41  on 388  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 303.41
## 
## Number of Fisher Scoring iterations: 5

## The odds ratio for a year increase in age:
exp(m_age$coefficients["age"])

##      age 
## 1.053866

#gender
m_gen <- glm(dm ~ gender, family=binomial (link=logit))
summary(m_gen)

## 
## Call:
## glm(formula = dm ~ gender, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5915  -0.5915  -0.5683  -0.5683   1.9509  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.74150    0.18592  -9.367   <2e-16 ***
## gendermale   0.08694    0.28352   0.307    0.759    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.87  on 389  degrees of freedom
## Residual deviance: 334.78  on 388  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 338.78
## 
## Number of Fisher Scoring iterations: 4

## odds of having diabetes when male: 
exp(m_gen$coefficients["gendermale"])

## gendermale 
##    1.09083

#chol
m_chol <- glm(dm ~ chol, family=binomial (link=logit))
summary(m_chol)

## 
## Call:
## glm(formula = dm ~ chol, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.1002  -0.5946  -0.5095  -0.4258   2.4106  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -4.169970   0.682741  -6.108 1.01e-09 ***
## chol         0.011486   0.003012   3.813 0.000137 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.54  on 388  degrees of freedom
## Residual deviance: 319.62  on 387  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 323.62
## 
## Number of Fisher Scoring iterations: 4

## The odds ratio for a unit increase in chol:
exp(m_chol$coefficients["chol"])

##     chol 
## 1.011552

#hdl
m_hdl <- glm(dm ~ hdl, family=binomial (link=logit))
summary(m_hdl)

## 
## Call:
## glm(formula = dm ~ hdl, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.8443  -0.6245  -0.5602  -0.4535   2.5703  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)  
## (Intercept) -0.563695   0.474331  -1.188   0.2347  
## hdl         -0.023704   0.009847  -2.407   0.0161 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.54  on 388  degrees of freedom
## Residual deviance: 327.91  on 387  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 331.91
## 
## Number of Fisher Scoring iterations: 5

## The odds ratio for a unit increase in hdl:
exp(m_hdl$coefficients["hdl"])

##       hdl 
## 0.9765751

#bmi
m_bmi <- glm(dm ~ bmi, family=binomial (link=logit))
summary(m_bmi)

## 
## Call:
## glm(formula = dm ~ bmi, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.0562  -0.5877  -0.5247  -0.4621   2.1349  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -3.35108    0.63279  -5.296 1.19e-07 ***
## bmi          0.05484    0.02023   2.710  0.00673 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 326.02  on 383  degrees of freedom
## Residual deviance: 318.86  on 382  degrees of freedom
##   (19 observations deleted due to missingness)
## AIC: 322.86
## 
## Number of Fisher Scoring iterations: 4

## The odds ratio for a unit increase in bmi:
exp(m_bmi$coefficients["bmi"])

##      bmi 
## 1.056369

Check Linear Relationships between predictors and log odds of the outcome (having diabetes):

##gender:

### cross tabulation 
dm_by_gender <- table(gender, dm) 

### proportion of diabetes status by gender 
dm_by_gender_prop <- prop.table(dm_by_gender, margin = 1) 

### calculate the odds of having diabetes by gender 
odds_gender <- dm_by_gender_prop[, "yes"]/dm_by_gender_prop[, "no"] 

### calculate the log odds 
logodds_gender <- log(odds_gender) 

### plot the log odds of having diabetes by gender 
dotchart(logodds_gender)

##bmi:
dm_by_bmi <- table(bmi, dm) 

freq_table_bmi <- prop.table(dm_by_bmi, margin = 1) 
odds_bmi <- freq_table_bmi[, "yes"]/freq_table_bmi[, "no"] 
logodds_bmi <- log(odds_bmi) 
plot(rownames(freq_table_bmi), logodds_bmi)

## categorised bmi:
bmi_categorised <- factor(bmi_categorised, levels = c("underweight", "normal", "overweight","obese")) 
dm_by_bmi_categorised <- table(bmi_categorised, dm)

dm_by_bmi_categorised_prop <- prop.table(dm_by_bmi_categorised, margin = 1) 

odds_bmi_categorised <- dm_by_bmi_categorised_prop[, "yes"]/dm_by_bmi_categorised_prop[, "no"] 

logodds_bmi_categorised <- log(odds_bmi_categorised) 

dotchart(logodds_bmi_categorised)

## chol:
dm_by_chol <- table(chol, dm) 

freq_table_chol <- prop.table(dm_by_chol, margin = 1) 

odds_chol <- freq_table_chol[, "yes"]/freq_table_chol[, "no"] 

logodds_chol <- log(odds_chol) 
plot(rownames(freq_table_chol), logodds_chol)

## categorising chol into an ordinal variable 

# https://www.medicalnewstoday.com/articles/315900.php 
chol_categorised <- ifelse(chol < 200, "healthy",  
                           ifelse(chol < 240, "borderline high", 
                                  ifelse(chol >= 240, "high", NA))) 

chol_categorised <- factor(chol_categorised, levels = c("healthy", "borderline high", "high")) 

dm_by_chol_categorised <- table(chol_categorised, dm) 

dm_by_chol_categorised_prop <- prop.table(dm_by_chol_categorised, margin = 1) 

odds_chol_categorised <- dm_by_chol_categorised_prop[, "yes"]/dm_by_chol_categorised_prop[, "no"] 

logodds_chol_categorised <- log(odds_chol_categorised) 

dotchart(logodds_chol_categorised)

## hdl:
dm_by_hdl <- table(hdl, dm) 
freq_table_hdl <- prop.table(dm_by_hdl, margin = 1) 
odds_hdl <- freq_table_hdl[, "yes"]/freq_table_hdl[, "no"] 
logodds_hdl <- log(odds_hdl) 
plot(rownames(freq_table_hdl), logodds_hdl)

# categorising hdl into an ordinal variable 

#https://www.healthline.com/health/high-cholesterol/levels-by-age#adults 
hdl_categorised <- ifelse(hdl < 40, "low",  
                           ifelse(hdl < 60, "acceptable", 
                                  ifelse(hdl >= 60, "good", NA))) 

hdl_categorised <- factor(hdl_categorised, levels = c("low", "acceptable", "good")) 

dm_by_hdl_categorised <- table(hdl_categorised, dm) 

dm_by_hdl_categorised_prop <- prop.table(dm_by_hdl_categorised, margin = 1) 

odds_hdl_categorised <- dm_by_hdl_categorised_prop[, "yes"]/dm_by_hdl_categorised_prop[, "no"] 

logodds_hdl_categorised <- log(odds_hdl_categorised) 

dotchart(logodds_hdl_categorised)

Relationship Between Predictors:

cor.test(chol, hdl, method = "pearson")

## 
##  Pearson's product-moment correlation
## 
## data:  chol and hdl
## t = 3.7983, df = 400, p-value = 0.0001683
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.09042379 0.27929136
## sample estimates:
##       cor 
## 0.1865809

#(corrolated (low p value) but weak corrolation, we can include both in our model)

Multiple Logistic Regression:

mm <- glm(dm ~ age + gender + bmi, family = binomial(link = "logit"))
summary(mm)

## 
## Call:
## glm(formula = dm ~ age + gender + bmi, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6843  -0.5763  -0.3885  -0.2575   2.6991  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.892669   1.027388  -6.709 1.96e-11 ***
## age          0.055454   0.009884   5.611 2.02e-08 ***
## gendermale   0.244852   0.322817   0.758  0.44816    
## bmi          0.073879   0.023310   3.169  0.00153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 326.02  on 383  degrees of freedom
## Residual deviance: 281.62  on 380  degrees of freedom
##   (19 observations deleted due to missingness)
## AIC: 289.62
## 
## Number of Fisher Scoring iterations: 5

exp(confint(mm))

## Waiting for profiling to be done...

##                    2.5 %      97.5 %
## (Intercept) 0.0001226733 0.006986056
## age         1.0373532463 1.078493131
## gendermale  0.6762611041 2.409679250
## bmi         1.0287121260 1.127738696

## The odds ratio for this model:
exp(mm$coefficients)

## (Intercept)         age  gendermale         bmi 
## 0.001015201 1.057020598 1.277432283 1.076676391

## age and bmi are significant factors for diabetes risk, but there is no evidence that gender affects the risk.

#calculate McFadden's R-square 
R2 <- 1-logLik(mm)/logLik(mnull) 
R2

## 'log Lik.' 0.1590084 (df=4)

#calculate c statistic:
Cstat(mm)

## [1] 0.7787709

#perform Hosmer-Lemeshow test
HL <- hoslem.test(x = mm$y, y = fitted(mm), g = 10) 
HL

## 
##  Hosmer and Lemeshow goodness of fit (GOF) test
## 
## data:  mm$y, fitted(mm)
## X-squared = 15.826, df = 8, p-value = 0.04494

#  plot the observed vs expected number of cases for each of the 10 groups 
plot(HL$observed[,"y1"], HL$expected[,"yhat1"])

# condensed model:
mm2 <- glm(dm ~ age + gender + bmi + chol + hdl + sbp + dbp + location + frame + insurance + smoking , family = binomial(link = "logit"))
summary(mm)

## 
## Call:
## glm(formula = dm ~ age + gender + bmi, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6843  -0.5763  -0.3885  -0.2575   2.6991  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -6.892669   1.027388  -6.709 1.96e-11 ***
## age          0.055454   0.009884   5.611 2.02e-08 ***
## gendermale   0.244852   0.322817   0.758  0.44816    
## bmi          0.073879   0.023310   3.169  0.00153 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 326.02  on 383  degrees of freedom
## Residual deviance: 281.62  on 380  degrees of freedom
##   (19 observations deleted due to missingness)
## AIC: 289.62
## 
## Number of Fisher Scoring iterations: 5

#  test significance of each variable 
anova(mm2, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: dm
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                        378     324.38              
## age        1   33.737       377     290.64 6.309e-09 ***
## gender     1    0.016       376     290.62  0.900103    
## bmi        1    9.893       375     280.73  0.001659 ** 
## chol       1    8.701       374     272.03  0.003180 ** 
## hdl        1    7.794       373     264.24  0.005241 ** 
## sbp        1    0.579       372     263.66  0.446733    
## dbp        1    0.005       371     263.65  0.944325    
## location   1    0.456       370     263.20  0.499712    
## frame      3    0.404       367     262.79  0.939510    
## insurance  2    1.647       365     261.15  0.438976    
## smoking    2    0.213       363     260.93  0.899082    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

# cor between age and systolic blood pressure
cor.test(sbp, age) # extremely significant

## 
##  Pearson's product-moment correlation
## 
## data:  sbp and age
## t = 9.8342, df = 396, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3604404 0.5187477
## sample estimates:
##       cor 
## 0.4430412

# model without age:
mm3 <- glm(dm ~  gender + bmi + chol + hdl + sbp + dbp + location + frame + insurance + smoking , family = binomial(link = "logit"))

anova(mm3, test = "Chisq")

## Analysis of Deviance Table
## 
## Model: binomial, link: logit
## 
## Response: dm
## 
## Terms added sequentially (first to last)
## 
## 
##           Df Deviance Resid. Df Resid. Dev  Pr(>Chi)    
## NULL                        378     324.38              
## gender     1   0.0562       377     324.32  0.812532    
## bmi        1   7.9434       376     316.38  0.004826 ** 
## chol       1  16.1365       375     300.24 5.894e-05 ***
## hdl        1   6.6824       374     293.56  0.009737 ** 
## sbp        1   8.5647       373     284.99  0.003427 ** 
## dbp        1   2.6605       372     282.33  0.102867    
## location   1   0.7944       371     281.54  0.372763    
## frame      3   0.6076       368     280.93  0.894682    
## insurance  2   0.6933       366     280.24  0.707066    
## smoking    2   0.8738       364     279.36  0.646024    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

## now systolic BP is significant!

# backward elimination:

summary(mm3)

## 
## Call:
## glm(formula = dm ~ gender + bmi + chol + hdl + sbp + dbp + location + 
##     frame + insurance + smoking, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7267  -0.5881  -0.4156  -0.2756   2.4311  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.676068   1.835830  -3.092 0.001989 ** 
## gendermale      0.129166   0.362884   0.356 0.721883    
## bmi             0.037725   0.027704   1.362 0.173278    
## chol            0.012871   0.003523   3.654 0.000259 ***
## hdl            -0.025611   0.010805  -2.370 0.017768 *  
## sbp             0.024860   0.008092   3.072 0.002125 ** 
## dbp            -0.019343   0.015048  -1.285 0.198648    
## locationLouisa -0.263597   0.318295  -0.828 0.407584    
## framelarge      0.052141   0.892634   0.058 0.953420    
## framemedium    -0.216548   0.890252  -0.243 0.807817    
## framesmall     -0.045113   0.947902  -0.048 0.962041    
## insurance1     -0.255968   0.380091  -0.673 0.500669    
## insurance2     -0.328115   0.386914  -0.848 0.396422    
## smoking2       -0.329779   0.350842  -0.940 0.347236    
## smoking3       -0.204765   0.494104  -0.414 0.678569    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 279.36  on 364  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 309.36
## 
## Number of Fisher Scoring iterations: 5

##remove frame as it has the highest p value:
mm33 <- glm(dm ~  gender + bmi + chol + hdl + sbp + dbp + location +  insurance + smoking , family = binomial(link = "logit"))
summary(mm33)

## 
## Call:
## glm(formula = dm ~ gender + bmi + chol + hdl + sbp + dbp + location + 
##     insurance + smoking, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.7895  -0.5849  -0.4151  -0.2748   2.3992  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.830798   1.505890  -3.872 0.000108 ***
## gendermale      0.197591   0.338090   0.584 0.558928    
## bmi             0.042416   0.024977   1.698 0.089469 .  
## chol            0.012660   0.003501   3.617 0.000299 ***
## hdl            -0.025298   0.010686  -2.367 0.017914 *  
## sbp             0.025475   0.007918   3.217 0.001294 ** 
## dbp            -0.020917   0.014850  -1.409 0.158974    
## locationLouisa -0.298118   0.312322  -0.955 0.339820    
## insurance1     -0.254163   0.377454  -0.673 0.500717    
## insurance2     -0.320476   0.386498  -0.829 0.407003    
## smoking2       -0.347780   0.348633  -0.998 0.318496    
## smoking3       -0.228472   0.491680  -0.465 0.642164    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 279.90  on 367  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 303.9
## 
## Number of Fisher Scoring iterations: 5

##remove gender:
mm333 <- glm(dm ~   bmi + chol + hdl + sbp + dbp + location +  insurance + smoking , family = binomial(link = "logit"))
summary(mm333)

## 
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp + location + 
##     insurance + smoking, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.8042  -0.5894  -0.4218  -0.2771   2.3719  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.577511   1.433297  -3.891 9.97e-05 ***
## bmi             0.037421   0.023398   1.599 0.109743    
## chol            0.012713   0.003507   3.625 0.000289 ***
## hdl            -0.026752   0.010455  -2.559 0.010501 *  
## sbp             0.024955   0.007847   3.180 0.001471 ** 
## dbp            -0.019556   0.014643  -1.336 0.181710    
## locationLouisa -0.298173   0.312273  -0.955 0.339655    
## insurance1     -0.262997   0.376939  -0.698 0.485354    
## insurance2     -0.336179   0.385132  -0.873 0.382721    
## smoking2       -0.340588   0.348362  -0.978 0.328230    
## smoking3       -0.206453   0.489167  -0.422 0.672988    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 280.24  on 368  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 302.24
## 
## Number of Fisher Scoring iterations: 5

##remove smoking:
mm3333 <- glm(dm ~   bmi + chol + hdl + sbp + dbp + location +  insurance , family = binomial(link = "logit"))
summary(mm3333)

## 
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp + location + 
##     insurance, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6815  -0.5820  -0.4215  -0.2938   2.4488  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.719167   1.424424  -4.015 5.94e-05 ***
## bmi             0.037048   0.023280   1.591 0.111520    
## chol            0.012715   0.003506   3.627 0.000287 ***
## hdl            -0.026605   0.010506  -2.532 0.011329 *  
## sbp             0.024341   0.007722   3.152 0.001621 ** 
## dbp            -0.019623   0.014609  -1.343 0.179219    
## locationLouisa -0.281530   0.310177  -0.908 0.364067    
## insurance1     -0.244305   0.375255  -0.651 0.515022    
## insurance2     -0.312822   0.384587  -0.813 0.415991    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 281.19  on 370  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 299.19
## 
## Number of Fisher Scoring iterations: 5

##remove insurance
mm33333 <- glm(dm ~   bmi + chol + hdl + sbp + dbp + location  , family = binomial(link = "logit"))
summary(mm33333)

## 
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp + location, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6031  -0.5905  -0.4209  -0.2986   2.4435  
## 
## Coefficients:
##                 Estimate Std. Error z value Pr(>|z|)    
## (Intercept)    -5.873089   1.403795  -4.184 2.87e-05 ***
## bmi             0.038040   0.023253   1.636 0.101864    
## chol            0.012955   0.003486   3.717 0.000202 ***
## hdl            -0.026586   0.010491  -2.534 0.011271 *  
## sbp             0.024556   0.007649   3.211 0.001325 ** 
## dbp            -0.021413   0.014430  -1.484 0.137831    
## locationLouisa -0.275159   0.308014  -0.893 0.371677    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 281.92  on 372  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 295.92
## 
## Number of Fisher Scoring iterations: 5

##remove location
mmf <- glm(dm ~   bmi + chol + hdl + sbp + dbp   , family = binomial(link = "logit"))
summary(mmf)

## 
## Call:
## glm(formula = dm ~ bmi + chol + hdl + sbp + dbp, family = binomial(link = "logit"))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.6686  -0.5932  -0.4237  -0.2890   2.4963  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.961353   1.403370  -4.248 2.16e-05 ***
## bmi          0.038721   0.023238   1.666 0.095657 .  
## chol         0.012763   0.003459   3.690 0.000224 ***
## hdl         -0.026745   0.010498  -2.548 0.010848 *  
## sbp          0.024917   0.007649   3.258 0.001123 ** 
## dbp         -0.022280   0.014391  -1.548 0.121578    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 282.72  on 373  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 294.72
## 
## Number of Fisher Scoring iterations: 5

## The odds ratio for this model:
exp(mmf$coefficients)

## (Intercept)         bmi        chol         hdl         sbp         dbp 
## 0.002576424 1.039480213 1.012844829 0.973609765 1.025230246 0.977966217

#calculate McFadden's R-square 
R22 <- 1-logLik(mmf)/logLik(mnull) 
R22

## 'log Lik.' 0.155731 (df=6)

#calculate c statistic:
Cstat(mmf)

## [1] 0.7502417

Logistic Regression Tutorial

Somaya AlGabry

13/07/2021

load packages:

load data:

define variables:

Explore data:

Some Simple Logistic Regressions:

Check Linear Relationships between predictors and log odds of the outcome (having diabetes):

Relationship Between Predictors:

Multiple Logistic Regression: