Problem Defination
Refer given dataset. Check if a logistic model can be created to predict Diabetese status or “Outcome” (Diabetic / Healthy) based on features of the dataset given.

Setup

Set

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
#library(corrgram)
#library(gridExtra) 
#library(Deducer)
library(caret)
## Loading required package: lattice
library(pscl)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Functions

Dataset

dfrModel <- read.csv("D:/R-BA/R-Scripts/data/Train Data Diabetes.csv", header=T, stringsAsFactors=F)
head(dfrModel)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           0     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0

Observation Dropping column SkinThickness due to lack of clarity Drop Columns
Drop all unrequired / aplha (descriptive only)

#dfrModel <- dplyr::select(dfrModel, -c(Name))
dfrModel <- dplyr::select(dfrModel, -c(SkinThickness))
head(dfrModel)
##   Pregnancies Glucose BloodPressure Insulin  BMI DiabetesPedigreeFunction
## 1           0     148            72       0 33.6                    0.627
## 2           1      85            66       0 26.6                    0.351
## 3           8     183            64       0 23.3                    0.672
## 4           1      89            66      94 28.1                    0.167
## 5           0     137            40     168 43.1                    2.288
## 6           5     116            74       0 25.6                    0.201
##   Age Outcome
## 1  50       1
## 2  31       0
## 3  32       1
## 4  21       0
## 5  33       1
## 6  30       0

Missing Data

#sum(is.na(dfrModel$Age))
lapply(dfrModel, FUN=detect_na)
## $Pregnancies
## [1] 0
## 
## $Glucose
## [1] 0
## 
## $BloodPressure
## [1] 0
## 
## $Insulin
## [1] 0
## 
## $BMI
## [1] 0
## 
## $DiabetesPedigreeFunction
## [1] 0
## 
## $Age
## [1] 0
## 
## $Outcome
## [1] 0

Observation No missing values Outliers Data

#detect_outliers(dfrModel$Age)
lapply(dfrModel, FUN=detect_outliers)
## $Pregnancies
## integer(0)
## 
## $Glucose
## integer(0)
## 
## $BloodPressure
##  [1]   0   0   0   0   0   0 122   0   0   0   0   0   0   0   0   0   0
## [18]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 
## $Insulin
##  [1] 543 846 495 485 495 478 744 680 545 465 579 474 480 600 540 480
## 
## $BMI
##  [1]  0.0  0.0  0.0  0.0  0.0 67.1  0.0  0.0  0.0  0.0  0.0
## 
## $DiabetesPedigreeFunction
## [1] 2.288 1.893 1.781 2.329 2.137 1.731 2.420 1.699 1.698
## 
## $Age
## [1] 81
## 
## $Outcome
## integer(0)

Outliers Graph

plotgraph <- function(inp, na.rm=TRUE) {
ModelPlot <- ggplot(dfrModel, aes(x="", y=inp)) +
            geom_boxplot(aes(fill=inp), color="blue") +
            labs(title="Model Outliers")
ModelPlot
}

Correlation

vctCorr = numeric(0)
for (i in names(dfrModel)){
    cor.result <- cor(as.numeric(dfrModel$Outcome), as.numeric(dfrModel[,i]))
    vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr
##              Pregnancies                  Glucose            BloodPressure 
##               0.22408577               0.45928020               0.06019258 
##                  Insulin                      BMI DiabetesPedigreeFunction 
##               0.14592233               0.30659734               0.17053194 
##                      Age                  Outcome 
##               0.22699018               1.00000000

Data For Visualization

dfrGraph <- gather(dfrModel, variable, value, -Outcome)
head(dfrGraph)
##   Outcome    variable value
## 1       1 Pregnancies     0
## 2       0 Pregnancies     1
## 3       1 Pregnancies     8
## 4       0 Pregnancies     1
## 5       1 Pregnancies     0
## 6       0 Pregnancies     5

Data Visualization

ggplot(dfrGraph) +
    geom_jitter(aes(value,Outcome, colour=variable)) + 
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of Outcome With Other Features")

Summary

lapply(dfrModel, FUN=summary)
## $Pregnancies
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.819   6.000  17.000 
## 
## $Glucose
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    99.0   116.5   120.5   140.2   199.0 
## 
## $BloodPressure
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   63.50   72.00   68.88   80.00  122.00 
## 
## $Insulin
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   36.50   79.88  126.50  846.00 
## 
## $BMI
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   27.00   32.00   31.89   36.50   67.10 
## 
## $DiabetesPedigreeFunction
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0780  0.2400  0.3755  0.4760  0.6370  2.4200 
## 
## $Age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   24.00   29.00   33.12   40.00   81.00 
## 
## $Outcome
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3443  1.0000  1.0000

Find Best Multi Logistic Model
Choose the best logistic model by using step().

stpModel=step(glm(data=dfrModel, formula=Outcome~., family=binomial), trace=0, steps=100)
summary(stpModel)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure + 
##     BMI + DiabetesPedigreeFunction, family = binomial, data = dfrModel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7483  -0.7330  -0.4109   0.7159   2.8968  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.001307   0.712745 -11.226  < 2e-16 ***
## Pregnancies               0.154737   0.029182   5.302 1.14e-07 ***
## Glucose                   0.033491   0.003518   9.520  < 2e-16 ***
## BloodPressure            -0.012340   0.005243  -2.353  0.01860 *  
## BMI                       0.090916   0.014952   6.080 1.20e-09 ***
## DiabetesPedigreeFunction  0.886659   0.304068   2.916  0.00355 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 901.37  on 699  degrees of freedom
## Residual deviance: 660.13  on 694  degrees of freedom
## AIC: 672.13
## 
## Number of Fisher Scoring iterations: 5
pR2(stpModel)
##          llh      llhNull           G2     McFadden         r2ML 
## -330.0664745 -450.6861271  241.2393052    0.2676356    0.2915158 
##         r2CU 
##    0.4025970

Make Final Multi Linear Model

# make model
mgmModel <- glm(data=dfrModel, formula=Outcome~Pregnancies+Glucose+BMI, family=binomial(link="logit"))
# print summary
summary(mgmModel)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI, family = binomial(link = "logit"), 
##     data = dfrModel)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1642  -0.7279  -0.4250   0.7684   2.8207  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.177774   0.670401 -12.198  < 2e-16 ***
## Pregnancies  0.136706   0.028022   4.879 1.07e-06 ***
## Glucose      0.033024   0.003431   9.624  < 2e-16 ***
## BMI          0.087133   0.014476   6.019 1.75e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 901.37  on 699  degrees of freedom
## Residual deviance: 674.54  on 696  degrees of freedom
## AIC: 682.54
## 
## Number of Fisher Scoring iterations: 5
pR2(mgmModel)
##          llh      llhNull           G2     McFadden         r2ML 
## -337.2698365 -450.6861271  226.8325814    0.2516525    0.2767834 
##         r2CU 
##    0.3822509

Confusion Matrix

prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrModel$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 403 107
##   1  56 134
##                                         
##                Accuracy : 0.7671        
##                  95% CI : (0.734, 0.798)
##     No Information Rate : 0.6557        
##     P-Value [Acc > NIR] : 9.981e-11     
##                                         
##                   Kappa : 0.457         
##  Mcnemar's Test P-Value : 8.992e-05     
##                                         
##             Sensitivity : 0.8780        
##             Specificity : 0.5560        
##          Pos Pred Value : 0.7902        
##          Neg Pred Value : 0.7053        
##              Prevalence : 0.6557        
##          Detection Rate : 0.5757        
##    Detection Prevalence : 0.7286        
##       Balanced Accuracy : 0.7170        
##                                         
##        'Positive' Class : 0             
## 

Regression Data

dfrPlot <- mutate(dfrModel, PrdVal=prdVal, POutcome=prdBln)
head(dfrPlot)
##   Pregnancies Glucose BloodPressure Insulin  BMI DiabetesPedigreeFunction
## 1           0     148            72       0 33.6                    0.627
## 2           1      85            66       0 26.6                    0.351
## 3           8     183            64       0 23.3                    0.672
## 4           1      89            66      94 28.1                    0.167
## 5           0     137            40     168 43.1                    2.288
## 6           5     116            74       0 25.6                    0.201
##   Age Outcome     PrdVal POutcome
## 1  50       1 0.41034201        0
## 2  31       0 0.05135422        0
## 3  32       1 0.72898218        1
## 4  21       0 0.06577407        0
## 5  33       1 0.52546528        1
## 6  30       0 0.19265441        0

Regression Visulaization

#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome))  + 
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
    labs(title="Binomial Regression Curve") +
    labs(x="") +
    labs(y="")

Test Data

dfrTests <- read.csv("D:/R-BA/R-Scripts/data/Test Data Diabetes.csv", header=T, stringsAsFactors=F)
dfrTests <- dplyr::select(dfrTests, -c(SkinThickness))
head(dfrTests)
##   Pregnancies Glucose BloodPressure Insulin  BMI DiabetesPedigreeFunction
## 1           0     122            76     200 35.9                    0.483
## 2           6     125            78       0 27.6                    0.565
## 3           1     168            88       0 35.0                    0.905
## 4           2     129             0       0 38.5                    0.304
## 5           4     110            76     100 28.4                    0.118
## 6           6      80            80       0 39.8                    0.177
##   Age
## 1  26
## 2  49
## 3  52
## 4  41
## 5  27
## 6  28

Observation
Test Data successfully created.

Predict

resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("Diabetic", "Healthy")
dfrTests<- mutate(dfrTests, Result=resVal, Outcome=prdSur)
dfrTests 
##    Pregnancies Glucose BloodPressure Insulin  BMI DiabetesPedigreeFunction
## 1            0     122            76     200 35.9                    0.483
## 2            6     125            78       0 27.6                    0.565
## 3            1     168            88       0 35.0                    0.905
## 4            2     129             0       0 38.5                    0.304
## 5            4     110            76     100 28.4                    0.118
## 6            6      80            80       0 39.8                    0.177
## 7           10     115             0       0  0.0                    0.261
## 8            2     127            46     335 34.4                    0.176
## 9            9     164            78       0 32.8                    0.148
## 10           2      93            64     160 38.0                    0.674
## 11           3     158            64     387 31.2                    0.295
## 12           5     126            78      22 29.6                    0.439
## 13          10     129            62       0 41.2                    0.441
## 14           0     134            58     291 26.4                    0.352
## 15           3     102            74       0 29.5                    0.121
## 16           7     187            50     392 33.9                    0.826
## 17           3     173            78     185 33.8                    0.970
## 18          10      94            72       0 23.1                    0.595
## 19           1     108            60     178 35.5                    0.415
## 20           5      97            76       0 35.6                    0.378
## 21           4      83            86       0 29.3                    0.317
## 22           1     114            66     200 38.1                    0.289
## 23           1     149            68     127 29.3                    0.349
## 24           5     117            86     105 39.1                    0.251
## 25           1     111            94       0 32.8                    0.265
## 26           4     112            78       0 39.4                    0.236
## 27           1     116            78     180 36.1                    0.496
## 28           0     141            84       0 32.4                    0.433
## 29           2     175            88       0 22.9                    0.326
## 30           2      92            52       0 30.1                    0.141
## 31           3     130            78      79 28.4                    0.323
## 32           8     120            86       0 28.4                    0.259
## 33           2     174            88     120 44.5                    0.646
## 34           2     106            56     165 29.0                    0.426
## 35           2     105            75       0 23.3                    0.560
## 36           4      95            60       0 35.4                    0.284
## 37           0     126            86     120 27.4                    0.515
## 38           8      65            72       0 32.0                    0.600
## 39           2      99            60     160 36.6                    0.453
## 40           1     102            74       0 39.5                    0.293
## 41          11     120            80     150 42.3                    0.785
## 42           3     102            44      94 30.8                    0.400
## 43           1     109            58     116 28.5                    0.219
## 44           9     140            94       0 32.7                    0.734
## 45          13     153            88     140 40.6                    1.174
## 46          12     100            84     105 30.0                    0.488
## 47           1     147            94       0 49.3                    0.358
## 48           1      81            74      57 46.3                    1.096
## 49           3     187            70     200 36.4                    0.408
## 50           6     162            62       0 24.3                    0.178
## 51           4     136            70       0 31.2                    1.182
## 52           1     121            78      74 39.0                    0.261
## 53           3     108            62       0 26.0                    0.223
## 54           0     181            88     510 43.3                    0.222
## 55           8     154            78       0 32.4                    0.443
## 56           1     128            88     110 36.5                    1.057
## 57           7     137            90       0 32.0                    0.391
## 58           0     123            72       0 36.3                    0.258
## 59           1     106            76       0 37.5                    0.197
## 60           6     190            92       0 35.5                    0.278
## 61           2      88            58      16 28.4                    0.766
## 62           9     170            74       0 44.0                    0.403
## 63           9      89            62       0 22.5                    0.142
## 64          10     101            76     180 32.9                    0.171
## 65           2     122            70       0 36.8                    0.340
## 66           5     121            72     112 26.2                    0.245
## 67           1     126            60       0 30.1                    0.349
## 68           1      93            70       0 30.4                    0.315
##    Age     Result  Outcome
## 1   26 0.26487765 Diabetic
## 2   49 0.30477416 Diabetic
## 3   52 0.63567019  Healthy
## 4   41 0.42809131 Diabetic
## 5   27 0.17891277 Diabetic
## 6   28 0.22309499 Diabetic
## 7   30 0.04684399 Diabetic
## 8   22 0.32895027 Diabetic
## 9   45 0.79025778  Healthy
## 10  23 0.17915652 Diabetic
## 11  24 0.54207196  Healthy
## 12  40 0.31993261 Diabetic
## 13  38 0.73870819  Healthy
## 14  21 0.18965614 Diabetic
## 15  32 0.13838338 Diabetic
## 16  34 0.87084669  Healthy
## 17  31 0.70901244  Healthy
## 18  56 0.15529198 Diabetic
## 19  24 0.20080802 Diabetic
## 20  52 0.23344176 Diabetic
## 21  34 0.08810797 Diabetic
## 22  21 0.27756710 Diabetic
## 23  42 0.36181523 Diabetic
## 24  42 0.44434841 Diabetic
## 25  45 0.17983889 Diabetic
## 26  38 0.37772434 Diabetic
## 27  25 0.25639702 Diabetic
## 28  22 0.33219383 Diabetic
## 29  22 0.46759131 Diabetic
## 30  22 0.09591609 Diabetic
## 31  34 0.26894978 Diabetic
## 32  22 0.34374102 Diabetic
## 33  24 0.84803028  Healthy
## 34  22 0.13273854 Diabetic
## 35  53 0.08266761 Diabetic
## 36  28 0.19636678 Diabetic
## 37  21 0.16392563 Diabetic
## 38  42 0.10439604 Diabetic
## 39  21 0.19063102 Diabetic
## 40  42 0.22603039 Diabetic
## 41  48 0.72603579  Healthy
## 42  26 0.15245049 Diabetic
## 43  22 0.12366633 Diabetic
## 44  45 0.62836083  Healthy
## 45  39 0.89931884  Healthy
## 46  46 0.34957467 Diabetic
## 47  27 0.75196514  Healthy
## 48  32 0.20885164 Diabetic
## 49  36 0.82913007  Healthy
## 50  50 0.52739308  Healthy
## 51  22 0.39624524 Diabetic
## 52  28 0.34367173 Diabetic
## 53  25 0.12613154 Diabetic
## 54  26 0.82812990  Healthy
## 55  45 0.69522905  Healthy
## 56  37 0.34668583 Diabetic
## 57  39 0.52291242  Healthy
## 58  52 0.27830365 Diabetic
## 59  26 0.21873840 Diabetic
## 60  66 0.88186852  Healthy
## 61  22 0.07421535 Diabetic
## 62  43 0.92417792  Healthy
## 63  33 0.11427590 Diabetic
## 64  63 0.35237604 Diabetic
## 65  27 0.33873490 Diabetic
## 66  30 0.22873744 Diabetic
## 67  47 0.22143228 Diabetic
## 68  23 0.08939939 Diabetic