g=read.csv("data.csv")

##dimension data

dim(g)
## [1] 403  24

##dim names

dimnames(g)
## [[1]]
##   [1] "1"   "2"   "3"   "4"   "5"   "6"   "7"   "8"   "9"   "10"  "11"  "12" 
##  [13] "13"  "14"  "15"  "16"  "17"  "18"  "19"  "20"  "21"  "22"  "23"  "24" 
##  [25] "25"  "26"  "27"  "28"  "29"  "30"  "31"  "32"  "33"  "34"  "35"  "36" 
##  [37] "37"  "38"  "39"  "40"  "41"  "42"  "43"  "44"  "45"  "46"  "47"  "48" 
##  [49] "49"  "50"  "51"  "52"  "53"  "54"  "55"  "56"  "57"  "58"  "59"  "60" 
##  [61] "61"  "62"  "63"  "64"  "65"  "66"  "67"  "68"  "69"  "70"  "71"  "72" 
##  [73] "73"  "74"  "75"  "76"  "77"  "78"  "79"  "80"  "81"  "82"  "83"  "84" 
##  [85] "85"  "86"  "87"  "88"  "89"  "90"  "91"  "92"  "93"  "94"  "95"  "96" 
##  [97] "97"  "98"  "99"  "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150" "151" "152" "153" "154" "155" "156"
## [157] "157" "158" "159" "160" "161" "162" "163" "164" "165" "166" "167" "168"
## [169] "169" "170" "171" "172" "173" "174" "175" "176" "177" "178" "179" "180"
## [181] "181" "182" "183" "184" "185" "186" "187" "188" "189" "190" "191" "192"
## [193] "193" "194" "195" "196" "197" "198" "199" "200" "201" "202" "203" "204"
## [205] "205" "206" "207" "208" "209" "210" "211" "212" "213" "214" "215" "216"
## [217] "217" "218" "219" "220" "221" "222" "223" "224" "225" "226" "227" "228"
## [229] "229" "230" "231" "232" "233" "234" "235" "236" "237" "238" "239" "240"
## [241] "241" "242" "243" "244" "245" "246" "247" "248" "249" "250" "251" "252"
## [253] "253" "254" "255" "256" "257" "258" "259" "260" "261" "262" "263" "264"
## [265] "265" "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276"
## [277] "277" "278" "279" "280" "281" "282" "283" "284" "285" "286" "287" "288"
## [289] "289" "290" "291" "292" "293" "294" "295" "296" "297" "298" "299" "300"
## [301] "301" "302" "303" "304" "305" "306" "307" "308" "309" "310" "311" "312"
## [313] "313" "314" "315" "316" "317" "318" "319" "320" "321" "322" "323" "324"
## [325] "325" "326" "327" "328" "329" "330" "331" "332" "333" "334" "335" "336"
## [337] "337" "338" "339" "340" "341" "342" "343" "344" "345" "346" "347" "348"
## [349] "349" "350" "351" "352" "353" "354" "355" "356" "357" "358" "359" "360"
## [361] "361" "362" "363" "364" "365" "366" "367" "368" "369" "370" "371" "372"
## [373] "373" "374" "375" "376" "377" "378" "379" "380" "381" "382" "383" "384"
## [385] "385" "386" "387" "388" "389" "390" "391" "392" "393" "394" "395" "396"
## [397] "397" "398" "399" "400" "401" "402" "403"
## 
## [[2]]
##  [1] "X"         "id"        "chol"      "stab.glu"  "hdl"       "ratio"    
##  [7] "glyhb"     "location"  "age"       "gender"    "height"    "weight"   
## [13] "frame"     "bp.1s"     "bp.1d"     "bp.2s"     "bp.2d"     "waist"    
## [19] "hip"       "time.ppn"  "insurance" "fh"        "smoking"   "dm"

data object creation

chol=g[,"chol"]
gender=as.factor(g[,"gender"])
dm= as.factor(g[,"dm"])
height=g[,"height"]
weight=g[,"weight"]
age=g[,"age"]
hdl=g[,"hdl"]
insurance=g[,"insurance"]
frame=as.factor(g[,"frame"])
location=as.factor(g[,"location"])
systolic=g[,"bp.1s"]
diastolic=g[,"bp.1d"]
ratio=g[,"ratio"]
fh=as.factor(g[,"fh"])
smoking=as.factor(g[,"smoking"])

Gender table creation

t=table(gender)

round gender table and percentage

round(prop.table(t), digits = 3)
## gender
## female   male 
##  0.581  0.419
round(100*prop.table(t), digits = 1)
## gender
## female   male 
##   58.1   41.9

Diabetes Mellitus table creation

dm2=factor(dm, exclude = NULL)
table(dm2)
## dm2
##   no  yes <NA> 
##  330   60   13

Continuous variable

summary(chol)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    78.0   179.0   204.0   207.8   230.0   443.0       1
summary(height)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   52.00   63.00   66.00   66.02   69.00   76.00       5
summary(weight)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##    99.0   151.0   172.5   177.6   200.0   325.0       1

BMI calculattion

height.si=height*0.0254
weight.si=weight*0.453592
bmi=weight.si/height.si^2
summary(bmi)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max.    NA's 
##   15.20   24.13   27.80   28.79   32.24   55.79       6

BMI categorized

bmi_categorized=ifelse(bmi<18.5,"underweight" ,ifelse(bmi>18.5 & bmi<25,"Normal" ,ifelse(bmi>25 & bmi<30,"overweight" ,ifelse(bmi>30,"obese",NA))))
table(bmi_categorized, exclude = NULL)
## bmi_categorized
##      Normal       obese  overweight underweight        <NA> 
##         113         152         123           9           6

frequencies of diabetes by BMI category

dm_bmi_categorized=table(bmi_categorized,dm2,exclude = NULL)

Proportion

round(100*prop.table(dm_bmi_categorized,margin = 1),digits = 1)
##                dm2
## bmi_categorized    no   yes  <NA>
##     Normal       88.5   8.0   3.5
##     obese        77.6  19.1   3.3
##     overweight   80.5  16.3   3.3
##     underweight 100.0   0.0   0.0
##     <NA>         66.7  33.3   0.0
round(prop.table(dm_bmi_categorized, margin = 1),digits = 3)
##                dm2
## bmi_categorized    no   yes  <NA>
##     Normal      0.885 0.080 0.035
##     obese       0.776 0.191 0.033
##     overweight  0.805 0.163 0.033
##     underweight 1.000 0.000 0.000
##     <NA>        0.667 0.333 0.000

Age categorized

age_grouped=ifelse(age<45,"under45" ,ifelse(age>=45 & age<65,"45-64" ,ifelse(age>=65 & age<75,"65-74" ,ifelse(age>=75,"75 & over",NA))))
table(age_grouped,exclude=NULL)
## age_grouped
##     45-64     65-74 75 & over   under45 
##       139        41        23       200

Cross tabluation age categorized and gender

age_group_by_Newgender=table(age_grouped,gender,exclude = NULL)
age_group_by_Newgender
##            gender
## age_grouped female male
##   45-64         75   64
##   65-74         21   20
##   75 & over     12   11
##   under45      126   74

Proportion cross tabluation age categorized and gender

round(prop.table(age_group_by_Newgender),digits =3)
##            gender
## age_grouped female  male
##   45-64      0.186 0.159
##   65-74      0.052 0.050
##   75 & over  0.030 0.027
##   under45    0.313 0.184
round(100*prop.table(age_group_by_Newgender , margin = 2),digits = 1)
##            gender
## age_grouped female male
##   45-64       32.1 37.9
##   65-74        9.0 11.8
##   75 & over    5.1  6.5
##   under45     53.8 43.8

Logesitc regression

m=glm(dm~gender, family = binomial(link = logit))
summary(m)
## 
## Call:
## glm(formula = dm ~ gender, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -0.5915  -0.5915  -0.5683  -0.5683   1.9509  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -1.74150    0.18592  -9.367   <2e-16 ***
## gendermale   0.08694    0.28352   0.307    0.759    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.87  on 389  degrees of freedom
## Residual deviance: 334.78  on 388  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 338.78
## 
## Number of Fisher Scoring iterations: 4

Odds ratio-simple logestic regression

exp(m$coefficients[-1])
## gendermale 
##    1.09083

Variable gender value check

contrasts(gender)
##        male
## female    0
## male      1

Histogram

hist(age)

hist(bmi)

hist(chol)

hist(hdl)

## Multiple logestic regression

m2=glm(dm~age+chol+insurance, family = binomial(link = logit))
summary(m2)
## 
## Call:
## glm(formula = dm ~ age + chol + insurance, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5639  -0.5966  -0.3996  -0.2623   2.4452  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -5.782785   0.859767  -6.726 1.74e-11 ***
## age          0.049720   0.009757   5.096 3.47e-07 ***
## chol         0.008392   0.003150   2.665  0.00771 ** 
## insurance   -0.294207   0.188047  -1.565  0.11769    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.54  on 388  degrees of freedom
## Residual deviance: 289.29  on 385  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 297.29
## 
## Number of Fisher Scoring iterations: 5

ODDs ratio-multiple logestic regression

exp(m2$coefficients[-1])
##       age      chol insurance 
## 1.0509765 1.0084275 0.7451224

Insurance data conversion

g$insurance=as.factor(g$insurance)

gender data structure

contrasts(gender)
##        male
## female    0
## male      1

Insurance data structure

table(insurance)
## insurance
##   0   1   2 
## 117 147 139

Insurance value adding

insurance=factor(insurance,labels =c("others","government","private")  )
table(insurance)
## insurance
##     others government    private 
##        117        147        139

Age, chol, Insurance new model check

Full_model=glm(formula = dm~age+chol+insurance, family = binomial(link = logit))
summary(Full_model)
## 
## Call:
## glm(formula = dm ~ age + chol + insurance, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5714  -0.5945  -0.3992  -0.2619   2.4399  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -5.794252   0.874555  -6.625 3.46e-11 ***
## age                  0.049753   0.009770   5.092 3.54e-07 ***
## chol                 0.008402   0.003153   2.665   0.0077 ** 
## insurancegovernment -0.271955   0.359445  -0.757   0.4493    
## insuranceprivate    -0.589803   0.377434  -1.563   0.1181    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.54  on 388  degrees of freedom
## Residual deviance: 289.28  on 384  degrees of freedom
##   (14 observations deleted due to missingness)
## AIC: 299.28
## 
## Number of Fisher Scoring iterations: 5

Null model

null_model=glm(dm~1,family = binomial(link = logit))
summary(null_model)
## 
## Call:
## glm(formula = dm ~ 1, family = binomial(link = logit))
## 
## Deviance Residuals: 
##    Min      1Q  Median      3Q     Max  
## -0.578  -0.578  -0.578  -0.578   1.935  
## 
## Coefficients:
##             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)  -1.7047     0.1403  -12.15   <2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 334.87  on 389  degrees of freedom
## Residual deviance: 334.87  on 389  degrees of freedom
##   (13 observations deleted due to missingness)
## AIC: 336.87
## 
## Number of Fisher Scoring iterations: 3

Mc Faddens R-square

R2=1- logLik(Full_model)/logLik(null_model)
R2
## 'log Lik.' 0.1361385 (df=5)

Library loading

library(DescTools)

Cstats models

Cstat(Full_model)
## [1] 0.7644124
Cstat(null_model)
## [1] 0.5

Library loading 2

library(ResourceSelection)
## ResourceSelection 0.3-5   2019-07-22
Full_model$y
##   1   2   3   4   5   6   7   8   9  10  11  12  13  14  15  16  17  18  19  20 
##   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1 
##  21  22  23  24  25  26  27  29  30  31  32  33  34  35  36  37  38  39  40  41 
##   0   0   1   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   1   0 
##  42  43  45  46  47  48  49  50  51  52  53  54  55  56  57  58  59  61  62  63 
##   0   0   0   0   0   1   0   0   0   0   0   0   1   0   0   0   1   1   0   1 
##  64  66  67  68  69  70  71  72  73  74  75  76  77  78  79  80  81  82  83  84 
##   0   0   0   1   0   1   0   0   0   0   0   1   0   0   0   0   0   0   0   0 
##  85  86  87  88  89  90  91  92  93  94  95  96  97  98  99 100 101 102 103 104 
##   0   0   1   0   0   0   0   1   0   0   0   0   0   1   0   1   0   0   0   0 
## 105 106 107 108 109 111 112 113 114 115 116 119 120 121 122 123 124 125 126 127 
##   1   0   0   0   1   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0 
## 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147 
##   0   0   1   0   0   0   0   0   0   0   0   1   0   0   0   0   1   0   0   0 
## 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167 
##   0   0   0   1   0   0   0   0   1   0   0   0   0   0   1   0   0   0   0   1 
## 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187 
##   0   0   0   0   1   0   1   0   0   1   1   0   0   1   0   0   0   0   0   0 
## 188 189 190 191 192 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208 
##   0   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0 
## 209 210 211 212 213 214 215 216 217 219 220 221 222 223 224 225 226 227 228 229 
##   0   1   0   0   1   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249 
##   0   0   0   0   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   1 
## 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269 
##   0   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0   0   0   0   0 
## 270 271 272 273 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290 
##   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310 
##   0   0   0   0   1   0   0   0   0   0   0   1   0   0   0   1   0   0   0   0 
## 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 329 330 331 332 
##   0   0   1   0   1   0   0   0   0   0   0   0   0   0   1   0   1   0   0   0 
## 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352 
##   0   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   1   0   0 
## 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372 
##   0   0   0   0   0   0   1   0   0   0   1   1   1   0   0   0   1   0   0   0 
## 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392 
##   0   0   0   0   0   0   0   0   0   1   0   0   0   0   0   1   0   0   1   0 
## 393 394 395 396 398 399 400 401 402 
##   0   0   0   0   0   1   0   1   0

Hosmer-Lemeshow statistic and test:

HL=hoslem.test(x=Full_model$y , y=fitted(Full_model),g=10)

Fh data structure

table(fh)
## fh
##   0   1 
## 327  76

Fh value adding

fh=factor(fh,labels = c("No","fh"))
table(fh)
## fh
##  No  fh 
## 327  76

New multiple logestic regression

model=glm(dm~age+bmi+chol+hdl+systolic+diastolic,family = binomial(link=logit))
summary(model)
## 
## Call:
## glm(formula = dm ~ age + bmi + chol + hdl + systolic + diastolic, 
##     family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4658  -0.5453  -0.3625  -0.1989   2.8155  
## 
## Coefficients:
##               Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.4479066  1.5287185  -4.872 1.10e-06 ***
## age          0.0505742  0.0121218   4.172 3.02e-05 ***
## bmi          0.0496011  0.0242735   2.043  0.04101 *  
## chol         0.0106330  0.0035028   3.036  0.00240 ** 
## hdl         -0.0290599  0.0103633  -2.804  0.00505 ** 
## systolic     0.0053365  0.0089493   0.596  0.55097    
## diastolic   -0.0002951  0.0158481  -0.019  0.98515    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 263.80  on 372  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 277.8
## 
## Number of Fisher Scoring iterations: 5

model wihout systolic and diastolic bp

model_updated=glm(dm~age+bmi+chol+hdl,family = binomial(link=logit))
summary(model_updated)
## 
## Call:
## glm(formula = dm ~ age + bmi + chol + hdl, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4243  -0.5554  -0.3585  -0.1969   2.8492  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -7.112566   1.279586  -5.558 2.72e-08 ***
## age          0.054510   0.010577   5.153 2.56e-07 ***
## bmi          0.052218   0.024013   2.175  0.02966 *  
## chol         0.011010   0.003452   3.190  0.00142 ** 
## hdl         -0.028668   0.010310  -2.781  0.00543 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 325.70  on 382  degrees of freedom
## Residual deviance: 265.11  on 378  degrees of freedom
##   (20 observations deleted due to missingness)
## AIC: 275.11
## 
## Number of Fisher Scoring iterations: 5

model significant test

cor.test(systolic,hdl)
## 
##  Pearson's product-moment correlation
## 
## data:  systolic and hdl
## t = 0.39368, df = 395, p-value = 0.694
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  -0.07877132  0.11799603
## sample estimates:
##        cor 
## 0.01980412
cor.test(systolic,bmi)
## 
##  Pearson's product-moment correlation
## 
## data:  systolic and bmi
## t = 2.2863, df = 391, p-value = 0.02277
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.01611718 0.21137656
## sample estimates:
##       cor 
## 0.1148561
cor.test(systolic,chol)
## 
##  Pearson's product-moment correlation
## 
## data:  systolic and chol
## t = 4.1276, df = 395, p-value = 4.474e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.1070653 0.2958454
## sample estimates:
##       cor 
## 0.2033444
cor.test(systolic,age)
## 
##  Pearson's product-moment correlation
## 
## data:  systolic and age
## t = 9.8342, df = 396, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
##  0.3604404 0.5187477
## sample estimates:
##       cor 
## 0.4430412

Multiple logestic regression model +gender, location, frame, insurance, and smoking.

model_new=glm(dm~age+gender+bmi+location+frame+insurance+smoking+bmi+chol+hdl+systolic+diastolic, family = binomial(link = logit))
summary(model_new)
## 
## Call:
## glm(formula = dm ~ age + gender + bmi + location + frame + insurance + 
##     smoking + bmi + chol + hdl + systolic + diastolic, family = binomial(link = logit))
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.4845  -0.5506  -0.3577  -0.1948   2.6399  
## 
## Coefficients:
##                      Estimate Std. Error z value Pr(>|z|)    
## (Intercept)         -7.953055   2.094979  -3.796 0.000147 ***
## age                  0.052432   0.012882   4.070  4.7e-05 ***
## gendermale           0.159954   0.381967   0.419 0.675388    
## bmi                  0.054568   0.028911   1.887 0.059097 .  
## locationLouisa      -0.255176   0.330092  -0.773 0.439496    
## framelarge           0.262753   0.969952   0.271 0.786474    
## framemedium          0.275534   0.967371   0.285 0.775776    
## framesmall           0.550051   1.027476   0.535 0.592414    
## insurancegovernment -0.273335   0.391855  -0.698 0.485465    
## insuranceprivate    -0.529986   0.406492  -1.304 0.192300    
## smoking2            -0.158872   0.369045  -0.430 0.666836    
## smoking3            -0.179607   0.508335  -0.353 0.723846    
## chol                 0.010785   0.003589   3.005 0.002652 ** 
## hdl                 -0.028312   0.010874  -2.604 0.009224 ** 
## systolic             0.005573   0.009455   0.589 0.555597    
## diastolic            0.002992   0.016633   0.180 0.857229    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 324.38  on 378  degrees of freedom
## Residual deviance: 260.93  on 363  degrees of freedom
##   (24 observations deleted due to missingness)
## AIC: 292.93
## 
## Number of Fisher Scoring iterations: 5