#Logistic Regression on the Diabetes Dataset
#Here you want to predict and explain the basis of group membership for each object
#Depending upon the characteristics of the object (the independent variables), you find the probability that an object will belong to the group coded 1 (i.e. that the event is likely to happen)
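# As a quick illustration (hypothetical numbers, not from this dataset): the model
# turns a linear score b0 + b1*x1 + ... into a probability via the logistic function
# p = 1/(1 + exp(-score)), which R exposes as plogis().
plogis(c(-2, 0, 2)) # ~0.12, 0.50, 0.88 - a score of 0 maps to a 50% probability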
#Prerequisites: setting up the data
setwd("C:\\Program Files\\R\\R-3.4.1")
library(caTools) # provides sample.split - though the split below is done manually
## Warning: package 'caTools' was built under R version 3.4.4
library(MASS) #Pima.te - the diabetes dataset is in the MASS package
## Warning: package 'MASS' was built under R version 3.4.3
View(Pima.te)
data <- Pima.te
#Splitting the data into an analysis (train) and a hold-out (test) sample
#taking an 80/20 ratio for train and test: 80% of 332 is ~265
set.seed(1234) #for a reproducible sample
training <- sample(nrow(data), 265, replace = FALSE)
train <- data[training, ]
test <- data[-training, ]
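# Aside: since caTools is loaded, the same 80/20 split could be done with
# sample.split, which stratifies on the outcome. A minimal sketch (idx, train2,
# and test2 are illustrative names, not used below):
idx <- sample.split(data$type, SplitRatio = 0.8)
train2 <- data[idx, ]
test2 <- data[!idx, ]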
#Modeling
#Fitting a logistic regression model
#All independent variables
model.fit <- glm(type ~ ., data = train, family = "binomial")
summary(model.fit)
##
## Call:
## glm(formula = type ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9001 -0.6313 -0.3812 0.5980 2.5294
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.198965 1.368669 -6.721 1.80e-11 ***
## npreg 0.114456 0.066089 1.732 0.0833 .
## glu 0.039947 0.006324 6.317 2.67e-10 ***
## bp -0.003921 0.014880 -0.263 0.7922
## skin 0.007842 0.022457 0.349 0.7269
## bmi 0.063189 0.033492 1.887 0.0592 .
## ped 1.000011 0.489263 2.044 0.0410 *
## age 0.012332 0.021592 0.571 0.5679
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 331.02 on 264 degrees of freedom
## Residual deviance: 227.74 on 257 degrees of freedom
## AIC: 243.74
##
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 227.74
#AIC = 243.74
#Variables not significant (in descending order of p-value): bp, skin, age
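# The manual variable removal below mirrors automated backward elimination by
# AIC; a minimal sketch (model.step is an illustrative name, not used below):
model.step <- step(model.fit, direction = "backward", trace = FALSE)
formula(model.step) # the AIC-selected formula - plausibly dropping bp, skin, age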
library(rms)
## Warning: package 'rms' was built under R version 3.4.4
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.2
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Loading required package: SparseM
##
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
##
## backsolve
#Using Nagelkerke's pseudo-R2 as a measure of overall model fit
mod1b <- lrm(type ~ ., data = train)
mod1b #Nagelkerke pseudo-R2 is 0.453, where 1 would indicate a perfect fit
## Logistic Regression Model
##
## lrm(formula = type ~ ., data = train)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 265 LR chi2 103.29 R2 0.453 C 0.865
## No 181 d.f. 7 g 1.924 Dxy 0.730
## Yes 84 Pr(> chi2) <0.0001 gr 6.846 gamma 0.730
## max |deriv| 7e-08 gp 0.307 tau-a 0.317
## Brier 0.136
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -9.1990 1.3687 -6.72 <0.0001
## npreg 0.1145 0.0661 1.73 0.0833
## glu 0.0399 0.0063 6.32 <0.0001
## bp -0.0039 0.0149 -0.26 0.7922
## skin 0.0078 0.0225 0.35 0.7269
## bmi 0.0632 0.0335 1.89 0.0592
## ped 1.0000 0.4893 2.04 0.0410
## age 0.0123 0.0216 0.57 0.5679
##
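# The same statistics can be pulled programmatically from the lrm fit, assuming
# the usual layout of the stats vector in rms:
mod1b$stats["R2"] # Nagelkerke R2, ~0.453
mod1b$stats["C"] # concordance (C) statistic, ~0.865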
library(pscl)
## Warning: package 'pscl' was built under R version 3.4.4
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(model.fit) #r2CU 0.4525472
## llh llhNull G2 McFadden r2ML
## -113.8688477 -165.5118301 103.2859648 0.3120199 0.3227795
## r2CU
## 0.4525472
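# A minimal sketch of where pR2's numbers come from, computed directly from the
# log-likelihoods of the fitted and intercept-only (null) models:
llh <- as.numeric(logLik(model.fit)) # -113.87
llh0 <- as.numeric(logLik(update(model.fit, . ~ 1))) # -165.51 (null model)
n <- nrow(train)
1 - llh/llh0 # McFadden, ~0.312
r2ML <- 1 - exp((2/n) * (llh0 - llh)) # Cox & Snell, ~0.323
r2ML / (1 - exp((2/n) * llh0)) # Nagelkerke (r2CU), ~0.453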
#Removing bp from the model
model.fit1 <- glm(type ~ . - bp, data = train, family = "binomial")
summary(model.fit1)
##
## Call:
## glm(formula = type ~ . - bp, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8982 -0.6189 -0.3836 0.5986 2.5558
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.340874 1.262439 -7.399 1.37e-13 ***
## npreg 0.115604 0.065857 1.755 0.0792 .
## glu 0.039915 0.006317 6.318 2.64e-10 ***
## skin 0.007974 0.022488 0.355 0.7229
## bmi 0.060462 0.031867 1.897 0.0578 .
## ped 1.005796 0.489376 2.055 0.0399 *
## age 0.010563 0.020447 0.517 0.6054
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 331.02 on 264 degrees of freedom
## Residual deviance: 227.81 on 258 degrees of freedom
## AIC: 241.81
##
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 227.81
#AIC = 241.81
#Variables not significant: skin, age
library(rms)
#Using Nagelkerke's pseudo-R2 as a measure of overall model fit
mod2b <- lrm(type ~ npreg + glu + skin + bmi + ped + age, data = train)
mod2b #Nagelkerke pseudo-R2 is 0.452, where 1 would indicate a perfect fit
## Logistic Regression Model
##
## lrm(formula = type ~ npreg + glu + skin + bmi + ped + age, data = train)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 265 LR chi2 103.22 R2 0.452 C 0.866
## No 181 d.f. 6 g 1.922 Dxy 0.732
## Yes 84 Pr(> chi2) <0.0001 gr 6.831 gamma 0.732
## max |deriv| 7e-08 gp 0.307 tau-a 0.318
## Brier 0.136
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -9.3409 1.2624 -7.40 <0.0001
## npreg 0.1156 0.0659 1.76 0.0792
## glu 0.0399 0.0063 6.32 <0.0001
## skin 0.0080 0.0225 0.35 0.7229
## bmi 0.0605 0.0319 1.90 0.0578
## ped 1.0058 0.4894 2.06 0.0399
## age 0.0106 0.0204 0.52 0.6054
##
library(pscl)
pR2(model.fit1) #r2CU 0.4522985
## llh llhNull G2 McFadden r2ML
## -113.9035599 -165.5118301 103.2165405 0.3118102 0.3226021
## r2CU
## 0.4522985
#Removing both skin and bp from the full model; using this as the best-fit model, since residual deviance barely increases while AIC keeps dropping
model.fit2 <- glm(type ~ npreg + glu + bmi + ped + age, data = train, family = "binomial")
summary(model.fit2)
##
## Call:
## glm(formula = type ~ npreg + glu + bmi + ped + age, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.8840 -0.6227 -0.3853 0.5963 2.5690
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.350104 1.262915 -7.404 1.33e-13 ***
## npreg 0.116930 0.065869 1.775 0.07587 .
## glu 0.039842 0.006304 6.320 2.62e-10 ***
## bmi 0.067569 0.024927 2.711 0.00671 **
## ped 1.012741 0.488232 2.074 0.03805 *
## age 0.010908 0.020413 0.534 0.59309
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 331.02 on 264 degrees of freedom
## Residual deviance: 227.93 on 259 degrees of freedom
## AIC: 239.93
##
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 227.93
#AIC = 239.93
#Variables not significant: age
library(rms)
#Using Nagelkerke's pseudo-R2 as a measure of overall model fit
mod3b <- lrm(type ~ npreg + glu + bmi + ped + age, data = train)
mod3b #Nagelkerke pseudo-R2 is 0.452, where 1 would indicate a perfect fit
## Logistic Regression Model
##
## lrm(formula = type ~ npreg + glu + bmi + ped + age, data = train)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 265 LR chi2 103.09 R2 0.452 C 0.867
## No 181 d.f. 5 g 1.914 Dxy 0.734
## Yes 84 Pr(> chi2) <0.0001 gr 6.780 gamma 0.734
## max |deriv| 4e-08 gp 0.306 tau-a 0.319
## Brier 0.136
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -9.3501 1.2629 -7.40 <0.0001
## npreg 0.1169 0.0659 1.78 0.0759
## glu 0.0398 0.0063 6.32 <0.0001
## bmi 0.0676 0.0249 2.71 0.0067
## ped 1.0127 0.4882 2.07 0.0381
## age 0.0109 0.0204 0.53 0.5931
##
library(pscl)
pR2(model.fit2) #r2CU 0.4518485
## llh llhNull G2 McFadden r2ML
## -113.9663216 -165.5118301 103.0910171 0.3114310 0.3222811
## r2CU
## 0.4518485
#Removing skin, bp, and age from the full model
model.fit3 <- glm(type ~ npreg + glu + bmi + ped, data = train, family = "binomial")
summary(model.fit3)
##
## Call:
## glm(formula = type ~ npreg + glu + bmi + ped, family = "binomial",
## data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.9306 -0.6259 -0.3851 0.5855 2.5616
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.155551 1.201558 -7.620 2.54e-14 ***
## npreg 0.140218 0.049799 2.816 0.00487 **
## glu 0.040408 0.006228 6.488 8.72e-11 ***
## bmi 0.067073 0.024979 2.685 0.00725 **
## ped 1.053231 0.483929 2.176 0.02952 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 331.02 on 264 degrees of freedom
## Residual deviance: 228.22 on 260 degrees of freedom
## AIC: 238.22
##
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 228.22 - it has gone up only slightly (~0.5 above the full model's 227.74)
#AIC = 238.22
#All variables are now significant at the 0.05 level
library(rms)
#Using Nagelkerke's pseudo-R2 as a measure of overall model fit
mod4b <- lrm(type ~ npreg + glu + bmi + ped, data = train)
mod4b #Nagelkerke pseudo-R2 is 0.451, where 1 would indicate a perfect fit
## Logistic Regression Model
##
## lrm(formula = type ~ npreg + glu + bmi + ped, data = train)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 265 LR chi2 102.81 R2 0.451 C 0.865
## No 181 d.f. 4 g 1.908 Dxy 0.730
## Yes 84 Pr(> chi2) <0.0001 gr 6.739 gamma 0.730
## max |deriv| 3e-08 gp 0.305 tau-a 0.317
## Brier 0.136
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -9.1556 1.2016 -7.62 <0.0001
## npreg 0.1402 0.0498 2.82 0.0049
## glu 0.0404 0.0062 6.49 <0.0001
## bmi 0.0671 0.0250 2.69 0.0072
## ped 1.0532 0.4839 2.18 0.0295
##
library(pscl)
pR2(model.fit3) #r2CU 0.4508327
## llh llhNull G2 McFadden r2ML
## -114.1079010 -165.5118301 102.8078582 0.3105756 0.3215566
## r2CU
## 0.4508327
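# A formal check that dropping bp, skin, and age does not significantly hurt the
# fit: a likelihood-ratio test between the nested models (a minimal sketch). The
# deviance difference is 228.22 - 227.74 = 0.48 on 3 df, so the p-value should
# sit far above 0.05, supporting the smaller model.
anova(model.fit3, model.fit, test = "Chisq")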
#model.fit2 is the best model... now using it to find the probability of each object being diabetic (i.e. belonging to the group coded 1)
#To see how well the model predicts probabilities on the analysis sample,
#find the predicted probability of each object belonging to the group coded 1
#Note: predict.glm takes newdata, not data; with no newdata it scores the fitting sample anyway
predtrain <- predict(model.fit2, newdata = train, type = "response")
predtrain
## 38 206 202 331 283 210
## 0.06901744 0.06124997 0.08549420 0.16813219 0.17925659 0.22507407
## 4 76 216 167 224 175
## 0.03688948 0.21311764 0.45680825 0.03434500 0.09921710 0.42761609
## 91 295 93 266 320 85
## 0.13814949 0.29689563 0.06319383 0.02961849 0.93235636 0.11697329
## 59 73 99 95 50 13
## 0.67756735 0.45784144 0.02940916 0.70904125 0.22555234 0.03532698
## 68 249 161 279 253 14
## 0.86960757 0.43715156 0.03343644 0.03002979 0.08116688 0.39035635
## 138 80 92 152 54 226
## 0.16836997 0.43206836 0.60660592 0.52367027 0.08138639 0.20721704
## 60 77 292 237 162 189
## 0.68841180 0.03271597 0.71179837 0.31598054 0.94906385 0.14045179
## 316 180 311 145 194 139
## 0.11804943 0.81771138 0.29636347 0.16819442 0.03750465 0.46189810
## 70 217 21 88 201 141
## 0.83709936 0.92350274 0.94794869 0.51004192 0.02058264 0.77663271
## 43 140 137 207 48 232
## 0.72174653 0.20008143 0.39881798 0.09535924 0.41411098 0.28076104
## 236 12 86 326 65 291
## 0.08420246 0.31372147 0.95940362 0.78506747 0.11351767 0.09350662
## 82 135 303 149 32 234
## 0.27129128 0.37564568 0.64651654 0.03455584 0.02894758 0.03288360
## 269 203 24 134 312 18
## 0.25042610 0.53506867 0.04388120 0.68988169 0.85057059 0.51635011
## 317 170 261 119 36 136
## 0.70969489 0.07475763 0.40988335 0.19198158 0.14630337 0.32312499
## 49 222 96 319 40 218
## 0.02508081 0.04615230 0.98437388 0.07565304 0.03977665 0.08135914
## 41 243 33 262 26 122
## 0.38636350 0.96289789 0.34823476 0.04681343 0.05035015 0.12221404
## 71 7 313 173 9 131
## 0.03997268 0.34120366 0.09431059 0.16611248 0.42157975 0.18736151
## 268 47 31 74 250 30
## 0.75940626 0.08374025 0.17785388 0.22154242 0.11565988 0.09054748
## 98 273 159 23 327 27
## 0.63524204 0.11054124 0.51241510 0.02759609 0.45187745 0.72603717
## 274 199 205 61 219 304
## 0.11241375 0.10158300 0.12545178 0.16912805 0.07223162 0.16220348
## 158 286 209 197 102 314
## 0.26989825 0.12587570 0.55349715 0.18499387 0.03875253 0.15084203
## 52 104 208 230 195 128
## 0.04173576 0.40577243 0.10730602 0.55154729 0.07967402 0.17155387
## 238 315 182 318 179 117
## 0.29737879 0.06933461 0.16258957 0.10288101 0.06940480 0.61995359
## 123 168 97 188 62 290
## 0.40746377 0.08358757 0.21711498 0.05640706 0.07935677 0.79725494
## 308 118 302 105 181 106
## 0.13593470 0.05998015 0.29641481 0.48283816 0.23425691 0.94391545
## 301 42 15 153 184 321
## 0.12899712 0.12798273 0.13958403 0.29867416 0.69142778 0.28095984
## 183 177 66 192 227 256
## 0.65899621 0.82708256 0.26048553 0.08484452 0.12165811 0.17963633
## 172 332 289 107 72 271
## 0.36063114 0.05196178 0.17622208 0.80860777 0.96215168 0.01058470
## 132 298 211 288 114 191
## 0.36731298 0.58996740 0.34982783 0.21063584 0.11437085 0.73671840
## 55 120 67 87 255 229
## 0.85072789 0.12991882 0.03357622 0.03073911 0.17696802 0.05178646
## 252 53 267 165 142 260
## 0.12919131 0.71603055 0.51947202 0.07600540 0.13437850 0.12673410
## 294 300 45 323 150 277
## 0.12554830 0.21186239 0.19364977 0.84767828 0.16998852 0.12799211
## 154 146 112 113 160 171
## 0.04671688 0.13506996 0.16087847 0.90789522 0.15562256 0.28676803
## 164 270 281 284 178 100
## 0.03916273 0.72771023 0.73816797 0.54733823 0.42296227 0.90510467
## 247 94 39 51 200 293
## 0.04615408 0.04365586 0.11150207 0.38754973 0.12608909 0.75174617
## 231 34 221 148 156 110
## 0.70798460 0.15115532 0.42556267 0.04744032 0.90418152 0.03331611
## 75 81 212 233 241 147
## 0.20457588 0.23139951 0.60457346 0.01398493 0.05320648 0.46110013
## 29 37 193 163 133 297
## 0.12483579 0.08378247 0.08115401 0.05714052 0.24625994 0.44490385
## 151 125 254 58 11 198
## 0.06761583 0.43138129 0.41393663 0.19843640 0.04445797 0.98963156
## 56 282 166 225 196 220
## 0.12754800 0.49706647 0.21475002 0.07154446 0.79985464 0.09841452
## 244 157 20 190 155 130
## 0.27705735 0.85564320 0.11784785 0.80462087 0.32668536 0.32086935
## 187 248 204 176 44 328
## 0.34659351 0.06845068 0.30040690 0.10677569 0.26891901 0.06443671
## 22 251 6 84 90 1
## 0.94527450 0.11564365 0.67750970 0.11340016 0.30528492 0.66791526
## 245 169 215 3 257 19
## 0.08911717 0.10769515 0.38181755 0.03259797 0.88067208 0.73645505
## 223
## 0.67743080
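# Sanity check (a minimal sketch): type = "response" is simply the inverse logit
# (plogis) of the linear predictor, so these should agree:
all.equal(predtrain, plogis(predict(model.fit2, newdata = train)))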
#Classification matrix on the analysis sample, using a 0.3 probability cutoff
(table(ActualValue = train$type, PredictedValue = predtrain > 0.3))
## PredictedValue
## ActualValue FALSE TRUE
## No 144 37
## Yes 17 67
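# Reading the matrix (a minimal sketch; cm is an illustrative name): accuracy is
# (144 + 67)/265 ~ 0.796, sensitivity 67/84 ~ 0.798, specificity 144/181 ~ 0.796.
cm <- table(ActualValue = train$type, PredictedValue = predtrain > 0.3)
sum(diag(cm))/sum(cm) # accuracy on the analysis sample
cm["Yes", "TRUE"]/sum(cm["Yes", ]) # sensitivity (true positive rate)
cm["No", "FALSE"]/sum(cm["No", ]) # specificity (true negative rate)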
#Validating the model on the hold-out sample
predtest <- predict(model.fit2, newdata = test, type = "response")
predtest
## 2 5 8 10 16 17
## 0.03371019 0.82223760 0.15997311 0.26612557 0.30246088 0.10476051
## 25 28 35 46 57 63
## 0.05965493 0.01351424 0.29187718 0.05106873 0.22708220 0.05084321
## 64 69 78 79 83 89
## 0.23701881 0.63473144 0.88568254 0.71708686 0.16258318 0.22961777
## 101 103 108 109 111 115
## 0.84644752 0.08878344 0.42337444 0.08461789 0.15686536 0.23297762
## 116 121 124 126 127 129
## 0.51576851 0.06802276 0.46387192 0.04634477 0.04391703 0.68406923
## 143 144 174 185 186 213
## 0.29694639 0.27433462 0.14313593 0.60369910 0.17527670 0.17432406
## 214 228 235 239 240 242
## 0.05043125 0.14296502 0.21825390 0.31205629 0.08069340 0.89650989
## 246 258 259 263 264 265
## 0.06748902 0.72080311 0.17469469 0.08074651 0.24523383 0.89979650
## 272 275 276 278 280 285
## 0.47844529 0.04354904 0.07927202 0.33800314 0.11953055 0.03833643
## 287 296 299 305 306 307
## 0.52303851 0.07187371 0.09218476 0.91810053 0.81722555 0.15957354
## 309 310 322 324 325 329
## 0.32089177 0.19825160 0.64835373 0.23016697 0.08009477 0.91097472
## 330
## 0.25476076
#Classification matrix on the hold-out sample, same 0.3 cutoff
(table(ActualValue = test$type, PredictedValue = predtest > 0.3))
## PredictedValue
## ActualValue FALSE TRUE
## No 36 6
## Yes 8 17
table(predtest > 0.5, test$type) #same cross-tab, now with a 0.5 cutoff
##
## No Yes
## FALSE 39 12
## TRUE 3 13
(39+13)/(39+12+3+13) #accuracy at the 0.5 cutoff
## [1] 0.7761194
#A 0.5 cutoff gives ~77.6% hold-out accuracy
table(predtest > 0.3, test[, "type"]) #cross-tab of predicted class (TRUE when the predicted probability exceeds the 0.3 cutoff) against actual type in the test data
##
## No Yes
## FALSE 36 8
## TRUE 6 17
accuracy <- table(predtest > 0.3, test[, "type"])
sum(diag(accuracy))/sum(accuracy) #proportion correctly classified
## [1] 0.7910448
#The hold-out sample also shows ~79% accuracy at the 0.3 cutoff
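# The 0.3 cutoff above was chosen by inspection; an ROC curve gives a more
# principled view. A minimal sketch, assuming the pROC package (an extra
# dependency not used elsewhere in this script):
library(pROC)
roc_test <- roc(test$type, predtest) # ROC on the hold-out sample
auc(roc_test) # area under the curve
coords(roc_test, "best") # cutoff maximizing sensitivity + specificity (Youden)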