#Logistic Regression on Diabetes Dataset

#Here you want to predict and explain group membership for each object
#Based on the characteristics of an object (the independent variables), you estimate the probability that it belongs to the group coded 1 (i.e. that the event is likely to happen)

#Prerequisites: setting up the data
setwd("C:\\Program Files\\R\\R-3.4.1")
library(caTools) # for sample.split -- though the split below is done manually with sample()
## Warning: package 'caTools' was built under R version 3.4.4
library(MASS) #the Pima.te diabetes dataset lives in MASS
## Warning: package 'MASS' was built under R version 3.4.3
View(Pima.te)
data <- Pima.te
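#a quick look at the data before modeling (a minimal sketch; output not shown): str() shows the seven predictors plus the binary outcome type (No/Yes), and table() gives the class balance
str(data)
table(data$type)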


#Splitting the data into analysis and hold-out sample
#taking an 80/20 split for train and test.... 80% of 332 is ~265
set.seed(1234)  #fix the seed so the random sample is reproducible
training = sample(nrow(data),265,replace = FALSE)
train = data[training, ]
test = data[-training, ]
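#caTools was loaded above for splitting; for reference, a minimal sketch of the sample.split alternative (train2/test2 are illustrative names -- the rest of the script keeps the manual split; output not shown). Unlike plain sample(), sample.split stratifies on the outcome, so the No/Yes ratio is preserved in both samples
split_flag <- sample.split(data$type, SplitRatio = 0.8)
train2 <- subset(data, split_flag == TRUE)
test2 <- subset(data, split_flag == FALSE)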


#Modeling

#Fitting a logistic regression model


#All independent variables
model.fit <- glm(type ~.,data = train,family = "binomial")
summary(model.fit)
## 
## Call:
## glm(formula = type ~ ., family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9001  -0.6313  -0.3812   0.5980   2.5294  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.198965   1.368669  -6.721 1.80e-11 ***
## npreg        0.114456   0.066089   1.732   0.0833 .  
## glu          0.039947   0.006324   6.317 2.67e-10 ***
## bp          -0.003921   0.014880  -0.263   0.7922    
## skin         0.007842   0.022457   0.349   0.7269    
## bmi          0.063189   0.033492   1.887   0.0592 .  
## ped          1.000011   0.489263   2.044   0.0410 *  
## age          0.012332   0.021592   0.571   0.5679    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 331.02  on 264  degrees of freedom
## Residual deviance: 227.74  on 257  degrees of freedom
## AIC: 243.74
## 
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 227.74
#AIC = 243.74
#Variables not significant (in descending order of p-value): bp, skin, age
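#the coefficients above are on the log-odds scale; exponentiating turns them into odds ratios, which are easier to read (a quick sketch; output not shown) -- e.g. exp(0.0399) for glu is about 1.04, so each extra unit of glucose multiplies the odds of diabetes by ~1.04
exp(coef(model.fit))    #odds ratios
exp(confint(model.fit)) #profile-likelihood confidence intervals for the odds ratios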
library(rms)
## Warning: package 'rms' was built under R version 3.4.4
## Loading required package: Hmisc
## Warning: package 'Hmisc' was built under R version 3.4.4
## Loading required package: lattice
## Loading required package: survival
## Loading required package: Formula
## Loading required package: ggplot2
## Warning: package 'ggplot2' was built under R version 3.4.2
## 
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
## 
##     format.pval, units
## Loading required package: SparseM
## 
## Attaching package: 'SparseM'
## The following object is masked from 'package:base':
## 
##     backsolve
#To measure Nagelkerke's pseudo R2 as a measure of overall model fit
mod1b <- lrm(type ~.,data = train)
mod1b #pseudo nagelkerke R2 value is 0.453 where 1 indicates a perfect model fit
## Logistic Regression Model
##  
##  lrm(formula = type ~ ., data = train)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           265    LR chi2     103.29    R2       0.453    C       0.865    
##   No           181    d.f.             7    g        1.924    Dxy     0.730    
##   Yes           84    Pr(> chi2) <0.0001    gr       6.846    gamma   0.730    
##  max |deriv| 7e-08                          gp       0.307    tau-a   0.317    
##                                             Brier    0.136                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -9.1990 1.3687 -6.72  <0.0001 
##  npreg      0.1145 0.0661  1.73  0.0833  
##  glu        0.0399 0.0063  6.32  <0.0001 
##  bp        -0.0039 0.0149 -0.26  0.7922  
##  skin       0.0078 0.0225  0.35  0.7269  
##  bmi        0.0632 0.0335  1.89  0.0592  
##  ped        1.0000 0.4893  2.04  0.0410  
##  age        0.0123 0.0216  0.57  0.5679  
## 
library(pscl)
## Warning: package 'pscl' was built under R version 3.4.4
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
pR2(model.fit) #r2CU 0.4525472
##          llh      llhNull           G2     McFadden         r2ML 
## -113.8688477 -165.5118301  103.2859648    0.3120199    0.3227795 
##         r2CU 
##    0.4525472
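#the r2CU value above (Nagelkerke) can be reproduced by hand from the deviances (a sketch): Cox & Snell R2 = 1 - exp((residual deviance - null deviance)/n), rescaled by its maximum, 1 - exp(-null deviance/n)
n <- nrow(train)
r2_cs <- 1 - exp((model.fit$deviance - model.fit$null.deviance) / n) #matches r2ML ~ 0.3228
r2_cs / (1 - exp(-model.fit$null.deviance / n))                      #matches r2CU ~ 0.4525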
#Removing bp from the model
model.fit1 <- glm(type ~.-bp,data = train,family = "binomial")
summary(model.fit1)
## 
## Call:
## glm(formula = type ~ . - bp, family = "binomial", data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8982  -0.6189  -0.3836   0.5986   2.5558  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.340874   1.262439  -7.399 1.37e-13 ***
## npreg        0.115604   0.065857   1.755   0.0792 .  
## glu          0.039915   0.006317   6.318 2.64e-10 ***
## skin         0.007974   0.022488   0.355   0.7229    
## bmi          0.060462   0.031867   1.897   0.0578 .  
## ped          1.005796   0.489376   2.055   0.0399 *  
## age          0.010563   0.020447   0.517   0.6054    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 331.02  on 264  degrees of freedom
## Residual deviance: 227.81  on 258  degrees of freedom
## AIC: 241.81
## 
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 227.81
#AIC = 241.81
#Variables not significant: skin, age
library(rms)
#To measure Nagelkerke's pseudo R2 as a measure of overall model fit
mod2b <- lrm(type ~ npreg+glu+skin+bmi+ped+age,data = train)
mod2b #pseudo nagelkerke R2 value is 0.452 where 1 indicates a perfect model fit
## Logistic Regression Model
##  
##  lrm(formula = type ~ npreg + glu + skin + bmi + ped + age, data = train)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           265    LR chi2     103.22    R2       0.452    C       0.866    
##   No           181    d.f.             6    g        1.922    Dxy     0.732    
##   Yes           84    Pr(> chi2) <0.0001    gr       6.831    gamma   0.732    
##  max |deriv| 7e-08                          gp       0.307    tau-a   0.318    
##                                             Brier    0.136                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -9.3409 1.2624 -7.40  <0.0001 
##  npreg      0.1156 0.0659  1.76  0.0792  
##  glu        0.0399 0.0063  6.32  <0.0001 
##  skin       0.0080 0.0225  0.35  0.7229  
##  bmi        0.0605 0.0319  1.90  0.0578  
##  ped        1.0058 0.4894  2.06  0.0399  
##  age        0.0106 0.0204  0.52  0.6054  
## 
library(pscl)
pR2(model.fit1) #r2CU 0.4522985
##          llh      llhNull           G2     McFadden         r2ML 
## -113.9035599 -165.5118301  103.2165405    0.3118102    0.3226021 
##         r2CU 
##    0.4522985
#Removing only skin and bp from the first model -- using this as the best-fit model, since residual deviance barely increases while AIC decreases
model.fit2 <- glm(type ~ npreg + glu + bmi + ped + age,data = train,family = "binomial")
summary(model.fit2)
## 
## Call:
## glm(formula = type ~ npreg + glu + bmi + ped + age, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.8840  -0.6227  -0.3853   0.5963   2.5690  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.350104   1.262915  -7.404 1.33e-13 ***
## npreg        0.116930   0.065869   1.775  0.07587 .  
## glu          0.039842   0.006304   6.320 2.62e-10 ***
## bmi          0.067569   0.024927   2.711  0.00671 ** 
## ped          1.012741   0.488232   2.074  0.03805 *  
## age          0.010908   0.020413   0.534  0.59309    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 331.02  on 264  degrees of freedom
## Residual deviance: 227.93  on 259  degrees of freedom
## AIC: 239.93
## 
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 227.93
#AIC = 239.93
#Variable not significant: age
library(rms)
#To measure Nagelkerke's pseudo R2 as a measure of overall model fit
mod3b <- lrm(type ~ npreg + glu + bmi + ped + age,data = train)
mod3b #pseudo nagelkerke R2 value is 0.452 where 1 indicates a perfect model fit
## Logistic Regression Model
##  
##  lrm(formula = type ~ npreg + glu + bmi + ped + age, data = train)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           265    LR chi2     103.09    R2       0.452    C       0.867    
##   No           181    d.f.             5    g        1.914    Dxy     0.734    
##   Yes           84    Pr(> chi2) <0.0001    gr       6.780    gamma   0.734    
##  max |deriv| 4e-08                          gp       0.306    tau-a   0.319    
##                                             Brier    0.136                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -9.3501 1.2629 -7.40  <0.0001 
##  npreg      0.1169 0.0659  1.78  0.0759  
##  glu        0.0398 0.0063  6.32  <0.0001 
##  bmi        0.0676 0.0249  2.71  0.0067  
##  ped        1.0127 0.4882  2.07  0.0381  
##  age        0.0109 0.0204  0.53  0.5931  
## 
library(pscl)
pR2(model.fit2) #r2CU 0.4518485
##          llh      llhNull           G2     McFadden         r2ML 
## -113.9663216 -165.5118301  103.0910171    0.3114310    0.3222811 
##         r2CU 
##    0.4518485
#Removing skin, bp and age from the first model
model.fit3 <- glm(type ~ npreg + glu + bmi + ped ,data = train,family = "binomial")
summary(model.fit3)
## 
## Call:
## glm(formula = type ~ npreg + glu + bmi + ped, family = "binomial", 
##     data = train)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.9306  -0.6259  -0.3851   0.5855   2.5616  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -9.155551   1.201558  -7.620 2.54e-14 ***
## npreg        0.140218   0.049799   2.816  0.00487 ** 
## glu          0.040408   0.006228   6.488 8.72e-11 ***
## bmi          0.067073   0.024979   2.685  0.00725 ** 
## ped          1.053231   0.483929   2.176  0.02952 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 331.02  on 264  degrees of freedom
## Residual deviance: 228.22  on 260  degrees of freedom
## AIC: 238.22
## 
## Number of Fisher Scoring iterations: 5
#Residual Deviance = 228.22 -- residual deviance has gone up, but only by about half a unit from the full model
#AIC = 238.22
#all variables significant
library(rms)
#To measure Nagelkerke's pseudo R2 as a measure of overall model fit
mod4b <- lrm(type ~ npreg + glu + bmi + ped, data = train)
mod4b #pseudo nagelkerke R2 value is 0.451 where 1 indicates a perfect model fit
## Logistic Regression Model
##  
##  lrm(formula = type ~ npreg + glu + bmi + ped, data = train)
##  
##                        Model Likelihood     Discrimination    Rank Discrim.    
##                           Ratio Test           Indexes           Indexes       
##  Obs           265    LR chi2     102.81    R2       0.451    C       0.865    
##   No           181    d.f.             4    g        1.908    Dxy     0.730    
##   Yes           84    Pr(> chi2) <0.0001    gr       6.739    gamma   0.730    
##  max |deriv| 3e-08                          gp       0.305    tau-a   0.317    
##                                             Brier    0.136                     
##  
##            Coef    S.E.   Wald Z Pr(>|Z|)
##  Intercept -9.1556 1.2016 -7.62  <0.0001 
##  npreg      0.1402 0.0498  2.82  0.0049  
##  glu        0.0404 0.0062  6.49  <0.0001 
##  bmi        0.0671 0.0250  2.69  0.0072  
##  ped        1.0532 0.4839  2.18  0.0295  
## 
library(pscl)
pR2(model.fit3) #r2CU 0.4508327
##          llh      llhNull           G2     McFadden         r2ML 
## -114.1079010 -165.5118301  102.8078582    0.3105756    0.3215566 
##         r2CU 
##    0.4508327
#model.fit2 is the best model... now using it to find the probability of objects being diabetic (i.e. belonging to the group coded 1)
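#a quick sanity check on the model choice (a sketch; output not shown): compare the four candidates by AIC, and test the dropped terms with a likelihood-ratio test against the full model -- step(model.fit) would automate this backward selection by AIC
AIC(model.fit, model.fit1, model.fit2, model.fit3)
anova(model.fit3, model.fit, test = "Chisq") #deviance rises only ~0.5 on 3 df, so dropping bp, skin and age loses little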

#To see how well the model predicts probabilities on the analysis (training) sample
#i.e. to find the probability of each object belonging to the group coded 1

predtrain <- predict(model.fit2, newdata = train, type = "response") #note: predict.glm takes newdata, not data
predtrain
##         38        206        202        331        283        210 
## 0.06901744 0.06124997 0.08549420 0.16813219 0.17925659 0.22507407 
##          4         76        216        167        224        175 
## 0.03688948 0.21311764 0.45680825 0.03434500 0.09921710 0.42761609 
##         91        295         93        266        320         85 
## 0.13814949 0.29689563 0.06319383 0.02961849 0.93235636 0.11697329 
##         59         73         99         95         50         13 
## 0.67756735 0.45784144 0.02940916 0.70904125 0.22555234 0.03532698 
##         68        249        161        279        253         14 
## 0.86960757 0.43715156 0.03343644 0.03002979 0.08116688 0.39035635 
##        138         80         92        152         54        226 
## 0.16836997 0.43206836 0.60660592 0.52367027 0.08138639 0.20721704 
##         60         77        292        237        162        189 
## 0.68841180 0.03271597 0.71179837 0.31598054 0.94906385 0.14045179 
##        316        180        311        145        194        139 
## 0.11804943 0.81771138 0.29636347 0.16819442 0.03750465 0.46189810 
##         70        217         21         88        201        141 
## 0.83709936 0.92350274 0.94794869 0.51004192 0.02058264 0.77663271 
##         43        140        137        207         48        232 
## 0.72174653 0.20008143 0.39881798 0.09535924 0.41411098 0.28076104 
##        236         12         86        326         65        291 
## 0.08420246 0.31372147 0.95940362 0.78506747 0.11351767 0.09350662 
##         82        135        303        149         32        234 
## 0.27129128 0.37564568 0.64651654 0.03455584 0.02894758 0.03288360 
##        269        203         24        134        312         18 
## 0.25042610 0.53506867 0.04388120 0.68988169 0.85057059 0.51635011 
##        317        170        261        119         36        136 
## 0.70969489 0.07475763 0.40988335 0.19198158 0.14630337 0.32312499 
##         49        222         96        319         40        218 
## 0.02508081 0.04615230 0.98437388 0.07565304 0.03977665 0.08135914 
##         41        243         33        262         26        122 
## 0.38636350 0.96289789 0.34823476 0.04681343 0.05035015 0.12221404 
##         71          7        313        173          9        131 
## 0.03997268 0.34120366 0.09431059 0.16611248 0.42157975 0.18736151 
##        268         47         31         74        250         30 
## 0.75940626 0.08374025 0.17785388 0.22154242 0.11565988 0.09054748 
##         98        273        159         23        327         27 
## 0.63524204 0.11054124 0.51241510 0.02759609 0.45187745 0.72603717 
##        274        199        205         61        219        304 
## 0.11241375 0.10158300 0.12545178 0.16912805 0.07223162 0.16220348 
##        158        286        209        197        102        314 
## 0.26989825 0.12587570 0.55349715 0.18499387 0.03875253 0.15084203 
##         52        104        208        230        195        128 
## 0.04173576 0.40577243 0.10730602 0.55154729 0.07967402 0.17155387 
##        238        315        182        318        179        117 
## 0.29737879 0.06933461 0.16258957 0.10288101 0.06940480 0.61995359 
##        123        168         97        188         62        290 
## 0.40746377 0.08358757 0.21711498 0.05640706 0.07935677 0.79725494 
##        308        118        302        105        181        106 
## 0.13593470 0.05998015 0.29641481 0.48283816 0.23425691 0.94391545 
##        301         42         15        153        184        321 
## 0.12899712 0.12798273 0.13958403 0.29867416 0.69142778 0.28095984 
##        183        177         66        192        227        256 
## 0.65899621 0.82708256 0.26048553 0.08484452 0.12165811 0.17963633 
##        172        332        289        107         72        271 
## 0.36063114 0.05196178 0.17622208 0.80860777 0.96215168 0.01058470 
##        132        298        211        288        114        191 
## 0.36731298 0.58996740 0.34982783 0.21063584 0.11437085 0.73671840 
##         55        120         67         87        255        229 
## 0.85072789 0.12991882 0.03357622 0.03073911 0.17696802 0.05178646 
##        252         53        267        165        142        260 
## 0.12919131 0.71603055 0.51947202 0.07600540 0.13437850 0.12673410 
##        294        300         45        323        150        277 
## 0.12554830 0.21186239 0.19364977 0.84767828 0.16998852 0.12799211 
##        154        146        112        113        160        171 
## 0.04671688 0.13506996 0.16087847 0.90789522 0.15562256 0.28676803 
##        164        270        281        284        178        100 
## 0.03916273 0.72771023 0.73816797 0.54733823 0.42296227 0.90510467 
##        247         94         39         51        200        293 
## 0.04615408 0.04365586 0.11150207 0.38754973 0.12608909 0.75174617 
##        231         34        221        148        156        110 
## 0.70798460 0.15115532 0.42556267 0.04744032 0.90418152 0.03331611 
##         75         81        212        233        241        147 
## 0.20457588 0.23139951 0.60457346 0.01398493 0.05320648 0.46110013 
##         29         37        193        163        133        297 
## 0.12483579 0.08378247 0.08115401 0.05714052 0.24625994 0.44490385 
##        151        125        254         58         11        198 
## 0.06761583 0.43138129 0.41393663 0.19843640 0.04445797 0.98963156 
##         56        282        166        225        196        220 
## 0.12754800 0.49706647 0.21475002 0.07154446 0.79985464 0.09841452 
##        244        157         20        190        155        130 
## 0.27705735 0.85564320 0.11784785 0.80462087 0.32668536 0.32086935 
##        187        248        204        176         44        328 
## 0.34659351 0.06845068 0.30040690 0.10677569 0.26891901 0.06443671 
##         22        251          6         84         90          1 
## 0.94527450 0.11564365 0.67750970 0.11340016 0.30528492 0.66791526 
##        245        169        215          3        257         19 
## 0.08911717 0.10769515 0.38181755 0.03259797 0.88067208 0.73645505 
##        223 
## 0.67743080
#classification matrix on the training sample, using a 0.3 probability cutoff
(table(ActualValue = train$type, PredictedValue = predtrain>0.3))
##            PredictedValue
## ActualValue FALSE TRUE
##         No    144   37
##         Yes    17   67
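#overall training accuracy at the 0.3 cutoff, read off the matrix above: (144 + 67) / 265 ~ 0.796 (a quick check; output not shown)
cm_train <- table(ActualValue = train$type, PredictedValue = predtrain > 0.3)
sum(diag(cm_train)) / sum(cm_train)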
#validate the model on the hold-out sample
predtest <- predict(model.fit2, newdata = test, type = "response")
predtest
##          2          5          8         10         16         17 
## 0.03371019 0.82223760 0.15997311 0.26612557 0.30246088 0.10476051 
##         25         28         35         46         57         63 
## 0.05965493 0.01351424 0.29187718 0.05106873 0.22708220 0.05084321 
##         64         69         78         79         83         89 
## 0.23701881 0.63473144 0.88568254 0.71708686 0.16258318 0.22961777 
##        101        103        108        109        111        115 
## 0.84644752 0.08878344 0.42337444 0.08461789 0.15686536 0.23297762 
##        116        121        124        126        127        129 
## 0.51576851 0.06802276 0.46387192 0.04634477 0.04391703 0.68406923 
##        143        144        174        185        186        213 
## 0.29694639 0.27433462 0.14313593 0.60369910 0.17527670 0.17432406 
##        214        228        235        239        240        242 
## 0.05043125 0.14296502 0.21825390 0.31205629 0.08069340 0.89650989 
##        246        258        259        263        264        265 
## 0.06748902 0.72080311 0.17469469 0.08074651 0.24523383 0.89979650 
##        272        275        276        278        280        285 
## 0.47844529 0.04354904 0.07927202 0.33800314 0.11953055 0.03833643 
##        287        296        299        305        306        307 
## 0.52303851 0.07187371 0.09218476 0.91810053 0.81722555 0.15957354 
##        309        310        322        324        325        329 
## 0.32089177 0.19825160 0.64835373 0.23016697 0.08009477 0.91097472 
##        330 
## 0.25476076
#classification matrix on the hold-out sample, again with the 0.3 cutoff
(table(ActualValue = test$type, PredictedValue = predtest>0.3))
##            PredictedValue
## ActualValue FALSE TRUE
##         No     36    6
##         Yes     8   17
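#the same matrix also gives sensitivity and specificity (a sketch; output not shown): sensitivity = 17/(8+17) = 0.68, specificity = 36/(36+6) ~ 0.86
cm_test <- table(ActualValue = test$type, PredictedValue = predtest > 0.3)
cm_test[2, 2] / sum(cm_test[2, ]) #sensitivity: share of actual Yes correctly flagged
cm_test[1, 1] / sum(cm_test[1, ]) #specificity: share of actual No correctly passed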
#for comparison, the classification matrix with a 0.5 cutoff
table(predtest > 0.5, test$type)
##        
##         No Yes
##   FALSE 39  12
##   TRUE   3  13
(39+13)/(39+12+3+13) #accuracy at the 0.5 cutoff
## [1] 0.7761194
#back at the 0.3 cutoff
table(predtest>0.3, test[,"type"]) #the cross-tab compares predicted membership in the group coded 1 (probability > cutoff) against the actual groups in the test data
##        
##         No Yes
##   FALSE 36   8
##   TRUE   6  17
accuracy <- table(predtest>0.3, test[,"type"])
sum(diag(accuracy))/sum(accuracy)
## [1] 0.7910448
#the hold-out sample also gives ~79% accuracy at the 0.3 cutoff
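#beyond a single cutoff, an ROC curve summarizes performance across all cutoffs; a sketch using the pROC package (assumed installed -- it is not loaded elsewhere in this script; output not shown). The resulting test AUC can be compared with the C statistic (~0.87) that lrm reported on the training sample
library(pROC)
roc_test <- roc(test$type, predtest) #actual classes vs predicted probabilities
auc(roc_test)
plot(roc_test) #the sensitivity vs specificity trade-off helps pick a cutoff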