g=read.csv("data.csv")
##dimension data
dim(g)
## [1] 403 24
##dim names
dimnames(g)
## [[1]]
## [1] "1" "2" "3" "4" "5" "6" "7" "8" "9" "10" "11" "12"
## [13] "13" "14" "15" "16" "17" "18" "19" "20" "21" "22" "23" "24"
## [25] "25" "26" "27" "28" "29" "30" "31" "32" "33" "34" "35" "36"
## [37] "37" "38" "39" "40" "41" "42" "43" "44" "45" "46" "47" "48"
## [49] "49" "50" "51" "52" "53" "54" "55" "56" "57" "58" "59" "60"
## [61] "61" "62" "63" "64" "65" "66" "67" "68" "69" "70" "71" "72"
## [73] "73" "74" "75" "76" "77" "78" "79" "80" "81" "82" "83" "84"
## [85] "85" "86" "87" "88" "89" "90" "91" "92" "93" "94" "95" "96"
## [97] "97" "98" "99" "100" "101" "102" "103" "104" "105" "106" "107" "108"
## [109] "109" "110" "111" "112" "113" "114" "115" "116" "117" "118" "119" "120"
## [121] "121" "122" "123" "124" "125" "126" "127" "128" "129" "130" "131" "132"
## [133] "133" "134" "135" "136" "137" "138" "139" "140" "141" "142" "143" "144"
## [145] "145" "146" "147" "148" "149" "150" "151" "152" "153" "154" "155" "156"
## [157] "157" "158" "159" "160" "161" "162" "163" "164" "165" "166" "167" "168"
## [169] "169" "170" "171" "172" "173" "174" "175" "176" "177" "178" "179" "180"
## [181] "181" "182" "183" "184" "185" "186" "187" "188" "189" "190" "191" "192"
## [193] "193" "194" "195" "196" "197" "198" "199" "200" "201" "202" "203" "204"
## [205] "205" "206" "207" "208" "209" "210" "211" "212" "213" "214" "215" "216"
## [217] "217" "218" "219" "220" "221" "222" "223" "224" "225" "226" "227" "228"
## [229] "229" "230" "231" "232" "233" "234" "235" "236" "237" "238" "239" "240"
## [241] "241" "242" "243" "244" "245" "246" "247" "248" "249" "250" "251" "252"
## [253] "253" "254" "255" "256" "257" "258" "259" "260" "261" "262" "263" "264"
## [265] "265" "266" "267" "268" "269" "270" "271" "272" "273" "274" "275" "276"
## [277] "277" "278" "279" "280" "281" "282" "283" "284" "285" "286" "287" "288"
## [289] "289" "290" "291" "292" "293" "294" "295" "296" "297" "298" "299" "300"
## [301] "301" "302" "303" "304" "305" "306" "307" "308" "309" "310" "311" "312"
## [313] "313" "314" "315" "316" "317" "318" "319" "320" "321" "322" "323" "324"
## [325] "325" "326" "327" "328" "329" "330" "331" "332" "333" "334" "335" "336"
## [337] "337" "338" "339" "340" "341" "342" "343" "344" "345" "346" "347" "348"
## [349] "349" "350" "351" "352" "353" "354" "355" "356" "357" "358" "359" "360"
## [361] "361" "362" "363" "364" "365" "366" "367" "368" "369" "370" "371" "372"
## [373] "373" "374" "375" "376" "377" "378" "379" "380" "381" "382" "383" "384"
## [385] "385" "386" "387" "388" "389" "390" "391" "392" "393" "394" "395" "396"
## [397] "397" "398" "399" "400" "401" "402" "403"
##
## [[2]]
## [1] "X" "id" "chol" "stab.glu" "hdl" "ratio"
## [7] "glyhb" "location" "age" "gender" "height" "weight"
## [13] "frame" "bp.1s" "bp.1d" "bp.2s" "bp.2d" "waist"
## [19] "hip" "time.ppn" "insurance" "fh" "smoking" "dm"
chol=g[,"chol"]
gender=as.factor(g[,"gender"])
dm= as.factor(g[,"dm"])
height=g[,"height"]
weight=g[,"weight"]
age=g[,"age"]
hdl=g[,"hdl"]
insurance=g[,"insurance"]
frame=as.factor(g[,"frame"])
location=as.factor(g[,"location"])
systolic=g[,"bp.1s"]
diastolic=g[,"bp.1d"]
ratio=g[,"ratio"]
fh=as.factor(g[,"fh"])
smoking=as.factor(g[,"smoking"])
t=table(gender)
round(prop.table(t), digits = 3)
## gender
## female male
## 0.581 0.419
round(100*prop.table(t), digits = 1)
## gender
## female male
## 58.1 41.9
dm2=factor(dm, exclude = NULL)
table(dm2)
## dm2
## no yes <NA>
## 330 60 13
summary(chol)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 78.0 179.0 204.0 207.8 230.0 443.0 1
summary(height)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 52.00 63.00 66.00 66.02 69.00 76.00 5
summary(weight)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 99.0 151.0 172.5 177.6 200.0 325.0 1
height.si=height*0.0254
weight.si=weight*0.453592
bmi=weight.si/height.si^2
summary(bmi)
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## 15.20 24.13 27.80 28.79 32.24 55.79 6
bmi_categorized=ifelse(bmi<18.5,"underweight" ,ifelse(bmi>18.5 & bmi<25,"Normal" ,ifelse(bmi>25 & bmi<30,"overweight" ,ifelse(bmi>30,"obese",NA))))
table(bmi_categorized, exclude = NULL)
## bmi_categorized
## Normal obese overweight underweight <NA>
## 113 152 123 9 6
dm_bmi_categorized=table(bmi_categorized,dm2,exclude = NULL)
round(100*prop.table(dm_bmi_categorized,margin = 1),digits = 1)
## dm2
## bmi_categorized no yes <NA>
## Normal 88.5 8.0 3.5
## obese 77.6 19.1 3.3
## overweight 80.5 16.3 3.3
## underweight 100.0 0.0 0.0
## <NA> 66.7 33.3 0.0
round(prop.table(dm_bmi_categorized, margin = 1),digits = 3)
## dm2
## bmi_categorized no yes <NA>
## Normal 0.885 0.080 0.035
## obese 0.776 0.191 0.033
## overweight 0.805 0.163 0.033
## underweight 1.000 0.000 0.000
## <NA> 0.667 0.333 0.000
age_grouped=ifelse(age<45,"under45" ,ifelse(age>=45 & age<65,"45-64" ,ifelse(age>=65 & age<75,"65-74" ,ifelse(age>=75,"75 & over",NA))))
table(age_grouped,exclude=NULL)
## age_grouped
## 45-64 65-74 75 & over under45
## 139 41 23 200
age_group_by_Newgender=table(age_grouped,gender,exclude = NULL)
age_group_by_Newgender
## gender
## age_grouped female male
## 45-64 75 64
## 65-74 21 20
## 75 & over 12 11
## under45 126 74
round(prop.table(age_group_by_Newgender),digits =3)
## gender
## age_grouped female male
## 45-64 0.186 0.159
## 65-74 0.052 0.050
## 75 & over 0.030 0.027
## under45 0.313 0.184
round(100*prop.table(age_group_by_Newgender , margin = 2),digits = 1)
## gender
## age_grouped female male
## 45-64 32.1 37.9
## 65-74 9.0 11.8
## 75 & over 5.1 6.5
## under45 53.8 43.8
m=glm(dm~gender, family = binomial(link = logit))
summary(m)
##
## Call:
## glm(formula = dm ~ gender, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.5915 -0.5915 -0.5683 -0.5683 1.9509
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.74150 0.18592 -9.367 <2e-16 ***
## gendermale 0.08694 0.28352 0.307 0.759
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.87 on 389 degrees of freedom
## Residual deviance: 334.78 on 388 degrees of freedom
## (13 observations deleted due to missingness)
## AIC: 338.78
##
## Number of Fisher Scoring iterations: 4
exp(m$coefficients[-1])
## gendermale
## 1.09083
contrasts(gender)
## male
## female 0
## male 1
hist(age)
hist(bmi)
hist(chol)
hist(hdl)
## Multiple logestic regression
m2=glm(dm~age+chol+insurance, family = binomial(link = logit))
summary(m2)
##
## Call:
## glm(formula = dm ~ age + chol + insurance, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5639 -0.5966 -0.3996 -0.2623 2.4452
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.782785 0.859767 -6.726 1.74e-11 ***
## age 0.049720 0.009757 5.096 3.47e-07 ***
## chol 0.008392 0.003150 2.665 0.00771 **
## insurance -0.294207 0.188047 -1.565 0.11769
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.54 on 388 degrees of freedom
## Residual deviance: 289.29 on 385 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 297.29
##
## Number of Fisher Scoring iterations: 5
exp(m2$coefficients[-1])
## age chol insurance
## 1.0509765 1.0084275 0.7451224
g$insurance=as.factor(g$insurance)
contrasts(gender)
## male
## female 0
## male 1
table(insurance)
## insurance
## 0 1 2
## 117 147 139
insurance=factor(insurance,labels =c("others","government","private") )
table(insurance)
## insurance
## others government private
## 117 147 139
Full_model=glm(formula = dm~age+chol+insurance, family = binomial(link = logit))
summary(Full_model)
##
## Call:
## glm(formula = dm ~ age + chol + insurance, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5714 -0.5945 -0.3992 -0.2619 2.4399
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.794252 0.874555 -6.625 3.46e-11 ***
## age 0.049753 0.009770 5.092 3.54e-07 ***
## chol 0.008402 0.003153 2.665 0.0077 **
## insurancegovernment -0.271955 0.359445 -0.757 0.4493
## insuranceprivate -0.589803 0.377434 -1.563 0.1181
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.54 on 388 degrees of freedom
## Residual deviance: 289.28 on 384 degrees of freedom
## (14 observations deleted due to missingness)
## AIC: 299.28
##
## Number of Fisher Scoring iterations: 5
null_model=glm(dm~1,family = binomial(link = logit))
summary(null_model)
##
## Call:
## glm(formula = dm ~ 1, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -0.578 -0.578 -0.578 -0.578 1.935
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.7047 0.1403 -12.15 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 334.87 on 389 degrees of freedom
## Residual deviance: 334.87 on 389 degrees of freedom
## (13 observations deleted due to missingness)
## AIC: 336.87
##
## Number of Fisher Scoring iterations: 3
R2=1- logLik(Full_model)/logLik(null_model)
R2
## 'log Lik.' 0.1361385 (df=5)
library(DescTools)
Cstat(Full_model)
## [1] 0.7644124
Cstat(null_model)
## [1] 0.5
library(ResourceSelection)
## ResourceSelection 0.3-5 2019-07-22
Full_model$y
## 1 2 3 4 5 6 7 8 9 10 11 12 13 14 15 16 17 18 19 20
## 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 21 22 23 24 25 26 27 29 30 31 32 33 34 35 36 37 38 39 40 41
## 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0
## 42 43 45 46 47 48 49 50 51 52 53 54 55 56 57 58 59 61 62 63
## 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 1 0 1
## 64 66 67 68 69 70 71 72 73 74 75 76 77 78 79 80 81 82 83 84
## 0 0 0 1 0 1 0 0 0 0 0 1 0 0 0 0 0 0 0 0
## 85 86 87 88 89 90 91 92 93 94 95 96 97 98 99 100 101 102 103 104
## 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 1 0 0 0 0
## 105 106 107 108 109 111 112 113 114 115 116 119 120 121 122 123 124 125 126 127
## 1 0 0 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## 128 129 130 131 132 133 134 135 136 137 138 139 140 141 142 143 144 145 146 147
## 0 0 1 0 0 0 0 0 0 0 0 1 0 0 0 0 1 0 0 0
## 148 149 150 151 152 153 154 155 156 157 158 159 160 161 162 163 164 165 166 167
## 0 0 0 1 0 0 0 0 1 0 0 0 0 0 1 0 0 0 0 1
## 168 169 170 171 172 173 174 175 176 177 178 179 180 181 182 183 184 185 186 187
## 0 0 0 0 1 0 1 0 0 1 1 0 0 1 0 0 0 0 0 0
## 188 189 190 191 192 194 195 196 197 198 199 200 201 202 203 204 205 206 207 208
## 0 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0
## 209 210 211 212 213 214 215 216 217 219 220 221 222 223 224 225 226 227 228 229
## 0 1 0 0 1 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 230 231 232 233 234 235 236 237 238 239 240 241 242 243 244 245 246 247 248 249
## 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 1
## 250 251 252 253 254 255 256 257 258 259 260 261 262 263 264 265 266 267 268 269
## 0 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0 0 0 0 0
## 270 271 272 273 275 276 277 278 279 280 281 282 283 284 285 286 287 288 289 290
## 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 291 292 293 294 295 296 297 298 299 300 301 302 303 304 305 306 307 308 309 310
## 0 0 0 0 1 0 0 0 0 0 0 1 0 0 0 1 0 0 0 0
## 311 312 313 314 315 316 317 318 319 320 321 322 323 324 325 326 329 330 331 332
## 0 0 1 0 1 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0
## 333 334 335 336 337 338 339 340 341 342 343 344 345 346 347 348 349 350 351 352
## 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0
## 353 354 355 356 357 358 359 360 361 362 363 364 365 366 367 368 369 370 371 372
## 0 0 0 0 0 0 1 0 0 0 1 1 1 0 0 0 1 0 0 0
## 373 374 375 376 377 378 379 380 381 382 383 384 385 386 387 388 389 390 391 392
## 0 0 0 0 0 0 0 0 0 1 0 0 0 0 0 1 0 0 1 0
## 393 394 395 396 398 399 400 401 402
## 0 0 0 0 0 1 0 1 0
HL=hoslem.test(x=Full_model$y , y=fitted(Full_model),g=10)
table(fh)
## fh
## 0 1
## 327 76
fh=factor(fh,labels = c("No","fh"))
table(fh)
## fh
## No fh
## 327 76
model=glm(dm~age+bmi+chol+hdl+systolic+diastolic,family = binomial(link=logit))
summary(model)
##
## Call:
## glm(formula = dm ~ age + bmi + chol + hdl + systolic + diastolic,
## family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4658 -0.5453 -0.3625 -0.1989 2.8155
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.4479066 1.5287185 -4.872 1.10e-06 ***
## age 0.0505742 0.0121218 4.172 3.02e-05 ***
## bmi 0.0496011 0.0242735 2.043 0.04101 *
## chol 0.0106330 0.0035028 3.036 0.00240 **
## hdl -0.0290599 0.0103633 -2.804 0.00505 **
## systolic 0.0053365 0.0089493 0.596 0.55097
## diastolic -0.0002951 0.0158481 -0.019 0.98515
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 263.80 on 372 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 277.8
##
## Number of Fisher Scoring iterations: 5
model_updated=glm(dm~age+bmi+chol+hdl,family = binomial(link=logit))
summary(model_updated)
##
## Call:
## glm(formula = dm ~ age + bmi + chol + hdl, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4243 -0.5554 -0.3585 -0.1969 2.8492
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.112566 1.279586 -5.558 2.72e-08 ***
## age 0.054510 0.010577 5.153 2.56e-07 ***
## bmi 0.052218 0.024013 2.175 0.02966 *
## chol 0.011010 0.003452 3.190 0.00142 **
## hdl -0.028668 0.010310 -2.781 0.00543 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 325.70 on 382 degrees of freedom
## Residual deviance: 265.11 on 378 degrees of freedom
## (20 observations deleted due to missingness)
## AIC: 275.11
##
## Number of Fisher Scoring iterations: 5
cor.test(systolic,hdl)
##
## Pearson's product-moment correlation
##
## data: systolic and hdl
## t = 0.39368, df = 395, p-value = 0.694
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.07877132 0.11799603
## sample estimates:
## cor
## 0.01980412
cor.test(systolic,bmi)
##
## Pearson's product-moment correlation
##
## data: systolic and bmi
## t = 2.2863, df = 391, p-value = 0.02277
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.01611718 0.21137656
## sample estimates:
## cor
## 0.1148561
cor.test(systolic,chol)
##
## Pearson's product-moment correlation
##
## data: systolic and chol
## t = 4.1276, df = 395, p-value = 4.474e-05
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1070653 0.2958454
## sample estimates:
## cor
## 0.2033444
cor.test(systolic,age)
##
## Pearson's product-moment correlation
##
## data: systolic and age
## t = 9.8342, df = 396, p-value < 2.2e-16
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.3604404 0.5187477
## sample estimates:
## cor
## 0.4430412
model_new=glm(dm~age+gender+bmi+location+frame+insurance+smoking+bmi+chol+hdl+systolic+diastolic, family = binomial(link = logit))
summary(model_new)
##
## Call:
## glm(formula = dm ~ age + gender + bmi + location + frame + insurance +
## smoking + bmi + chol + hdl + systolic + diastolic, family = binomial(link = logit))
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.4845 -0.5506 -0.3577 -0.1948 2.6399
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -7.953055 2.094979 -3.796 0.000147 ***
## age 0.052432 0.012882 4.070 4.7e-05 ***
## gendermale 0.159954 0.381967 0.419 0.675388
## bmi 0.054568 0.028911 1.887 0.059097 .
## locationLouisa -0.255176 0.330092 -0.773 0.439496
## framelarge 0.262753 0.969952 0.271 0.786474
## framemedium 0.275534 0.967371 0.285 0.775776
## framesmall 0.550051 1.027476 0.535 0.592414
## insurancegovernment -0.273335 0.391855 -0.698 0.485465
## insuranceprivate -0.529986 0.406492 -1.304 0.192300
## smoking2 -0.158872 0.369045 -0.430 0.666836
## smoking3 -0.179607 0.508335 -0.353 0.723846
## chol 0.010785 0.003589 3.005 0.002652 **
## hdl -0.028312 0.010874 -2.604 0.009224 **
## systolic 0.005573 0.009455 0.589 0.555597
## diastolic 0.002992 0.016633 0.180 0.857229
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 324.38 on 378 degrees of freedom
## Residual deviance: 260.93 on 363 degrees of freedom
## (24 observations deleted due to missingness)
## AIC: 292.93
##
## Number of Fisher Scoring iterations: 5