Problem Definition :

The objective is to predict whether a patient has diabetes based on diagnostic measurements.

Data Description:

This dataset is from the National Institute of Diabetes and Digestive and Kidney Diseases. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.

Attributes: Patient ID: serial number for the patient Pregnancies: Number of times pregnant Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test BloodPressure: Diastolic blood pressure (mm Hg) SkinThickness: Triceps skin fold thickness (mm) Insulin: 2-Hour serum insulin (mu U/ml) BMI: Body mass index (weight in kg/(height in m)^2) DiabetesPedigreeFunction: Diabetes pedigree function Age: Age (years) Outcome: Class variable (0 or 1)

Working Directory

setwd("D:/R-BA/R-Scripts")

set-up

Loading Libraries

library(tidyr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra)
## 
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
## 
##     combine
library(caret)
## Loading required package: lattice
library(pscl)
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
## 
##     select
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis

Functions

reading the file Train Data Diabetes.csv

dfrTrainDiabetesData <- read.csv("./data/Train Data Diabetes.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfrTrainDiabetesData)
print(intRowCount)
## [1] 700
head(dfrTrainDiabetesData,20)
##    Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1            0     148            72            35       0 33.6
## 2            1      85            66            29       0 26.6
## 3            8     183            64             0       0 23.3
## 4            1      89            66            23      94 28.1
## 5            0     137            40            35     168 43.1
## 6            5     116            74             0       0 25.6
## 7            3      78            50            32      88 31.0
## 8           10     115             0             0       0 35.3
## 9            2     197            70            45     543 30.5
## 10           8     125            96             0       0  0.0
## 11           4     110            92             0       0 37.6
## 12          10     168            74             0       0 38.0
## 13          10     139            80             0       0 27.1
## 14           1     189            60            23     846 30.1
## 15           5     166            72            19     175 25.8
## 16           7     100             0             0       0 30.0
## 17           0     118            84            47     230 45.8
## 18           7     107            74             0       0 29.6
## 19           1     103            30            38      83 43.3
## 20           1     115            70            30      96 34.6
##    DiabetesPedigreeFunction Age Outcome
## 1                     0.627  50       1
## 2                     0.351  31       0
## 3                     0.672  32       1
## 4                     0.167  21       0
## 5                     2.288  33       1
## 6                     0.201  30       0
## 7                     0.248  26       1
## 8                     0.134  29       0
## 9                     0.158  53       1
## 10                    0.232  54       1
## 11                    0.191  30       0
## 12                    0.537  34       1
## 13                    1.441  57       0
## 14                    0.398  59       1
## 15                    0.587  51       1
## 16                    0.484  32       1
## 17                    0.551  31       1
## 18                    0.254  31       1
## 19                    0.183  33       0
## 20                    0.529  32       1

Observation Only numeric data is seen

Missing Data

#sum(is.na(dfrModel$Age))
lapply(dfrTrainDiabetesData, FUN=detect_na)
## $Pregnancies
## [1] 0
## 
## $Glucose
## [1] 0
## 
## $BloodPressure
## [1] 0
## 
## $SkinThickness
## [1] 0
## 
## $Insulin
## [1] 0
## 
## $BMI
## [1] 0
## 
## $DiabetesPedigreeFunction
## [1] 0
## 
## $Age
## [1] 0
## 
## $Outcome
## [1] 0

Observation No missing data is seen in any of the columns

Outliers Data

#detect_outliers(dfrModel$Age)
lapply(dfrTrainDiabetesData, FUN=detect_outliers)
## $Pregnancies
## integer(0)
## 
## $Glucose
## integer(0)
## 
## $BloodPressure
##  [1]   0   0   0   0   0   0 122   0   0   0   0   0   0   0   0   0   0
## [18]   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0
## 
## $SkinThickness
## integer(0)
## 
## $Insulin
##  [1] 543 846 495 485 495 478 744 680 545 465 579 474 480 600 540 480
## 
## $BMI
##  [1]  0.0  0.0  0.0  0.0  0.0 67.1  0.0  0.0  0.0  0.0  0.0
## 
## $DiabetesPedigreeFunction
## [1] 2.288 1.893 1.781 2.329 2.137 1.731 2.420 1.699 1.698
## 
## $Age
## [1] 81
## 
## $Outcome
## integer(0)

Visualizing the outliers

displaygraph <- function(inp, na.rm=TRUE) {
TrainDiabetesPlot <- ggplot(dfrTrainDiabetesData, aes(x="", y=inp)) +
            geom_boxplot(aes(fill=inp), color="blue") +
            labs(title="Outliers in TrainDiabetesData")
TrainDiabetesPlot
}
lapply(dfrTrainDiabetesData, FUN= displaygraph)
## $Pregnancies

## 
## $Glucose

## 
## $BloodPressure

## 
## $SkinThickness

## 
## $Insulin

## 
## $BMI

## 
## $DiabetesPedigreeFunction

## 
## $Age

## 
## $Outcome

Correlation

vctCorr = numeric(0)
for (i in names(dfrTrainDiabetesData)){
    cor.result <- cor(as.numeric(dfrTrainDiabetesData$Outcome), as.numeric(dfrTrainDiabetesData[,i]))
    vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrTrainDiabetesData)
dfrCorr
##              Pregnancies                  Glucose            BloodPressure 
##               0.22408577               0.45928020               0.06019258 
##            SkinThickness                  Insulin                      BMI 
##               0.08740524               0.14592233               0.30659734 
## DiabetesPedigreeFunction                      Age                  Outcome 
##               0.17053194               0.22699018               1.00000000

Data For Visualization

dfrGraph <- gather(dfrTrainDiabetesData, variable, value, -Outcome)
head(dfrGraph)
##   Outcome    variable value
## 1       1 Pregnancies     0
## 2       0 Pregnancies     1
## 3       1 Pregnancies     8
## 4       0 Pregnancies     1
## 5       1 Pregnancies     0
## 6       0 Pregnancies     5

Data Visualization

ggplot(dfrGraph) +
    geom_jitter(aes(value,Outcome, colour=variable)) + 
    facet_wrap(~variable, scales="free_x") +
    labs(title="Relation Of Outcome With Other Features")

Summary

lapply(dfrTrainDiabetesData, FUN=summary)
## $Pregnancies
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.000   3.000   3.819   6.000  17.000 
## 
## $Glucose
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##     0.0    99.0   116.5   120.5   140.2   199.0 
## 
## $BloodPressure
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   63.50   72.00   68.88   80.00  122.00 
## 
## $SkinThickness
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   23.00   20.38   32.00   99.00 
## 
## $Insulin
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00    0.00   36.50   79.88  126.50  846.00 
## 
## $BMI
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   27.00   32.00   31.89   36.50   67.10 
## 
## $DiabetesPedigreeFunction
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0780  0.2400  0.3755  0.4760  0.6370  2.4200 
## 
## $Age
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   21.00   24.00   29.00   33.12   40.00   81.00 
## 
## $Outcome
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##  0.0000  0.0000  0.0000  0.3443  1.0000  1.0000

Observation

Find Best Multi Logistic Model with R Squared values
Choose the best logistic model by using step().

stpModel=step(glm(data=dfrTrainDiabetesData, formula=Outcome~., family=binomial), trace=0, steps=100)
summary(stpModel)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure + 
##     BMI + DiabetesPedigreeFunction, family = binomial, data = dfrTrainDiabetesData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.7483  -0.7330  -0.4109   0.7159   2.8968  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -8.001307   0.712745 -11.226  < 2e-16 ***
## Pregnancies               0.154737   0.029182   5.302 1.14e-07 ***
## Glucose                   0.033491   0.003518   9.520  < 2e-16 ***
## BloodPressure            -0.012340   0.005243  -2.353  0.01860 *  
## BMI                       0.090916   0.014952   6.080 1.20e-09 ***
## DiabetesPedigreeFunction  0.886659   0.304068   2.916  0.00355 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 901.37  on 699  degrees of freedom
## Residual deviance: 660.13  on 694  degrees of freedom
## AIC: 672.13
## 
## Number of Fisher Scoring iterations: 5
pR2(stpModel)
##          llh      llhNull           G2     McFadden         r2ML 
## -330.0664745 -450.6861271  241.2393052    0.2676356    0.2915158 
##         r2CU 
##    0.4025970

Observation
Best results given by Outcome ~ Pregnancies + Glucose + BMI

Make Final Multi Linear Model

# make model
mgmModel <- glm(data=dfrTrainDiabetesData, formula=Outcome~Pregnancies+Glucose+BMI, family=binomial(link="logit"))
# print summary
summary(mgmModel)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI, family = binomial(link = "logit"), 
##     data = dfrTrainDiabetesData)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.1642  -0.7279  -0.4250   0.7684   2.8207  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)    
## (Intercept) -8.177774   0.670401 -12.198  < 2e-16 ***
## Pregnancies  0.136706   0.028022   4.879 1.07e-06 ***
## Glucose      0.033024   0.003431   9.624  < 2e-16 ***
## BMI          0.087133   0.014476   6.019 1.75e-09 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 901.37  on 699  degrees of freedom
## Residual deviance: 674.54  on 696  degrees of freedom
## AIC: 682.54
## 
## Number of Fisher Scoring iterations: 5
pR2(mgmModel)
##          llh      llhNull           G2     McFadden         r2ML 
## -337.2698365 -450.6861271  226.8325814    0.2516525    0.2767834 
##         r2CU 
##    0.3822509

Confusion Matrix

prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrTrainDiabetesData$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 403 107
##   1  56 134
##                                         
##                Accuracy : 0.7671        
##                  95% CI : (0.734, 0.798)
##     No Information Rate : 0.6557        
##     P-Value [Acc > NIR] : 9.981e-11     
##                                         
##                   Kappa : 0.457         
##  Mcnemar's Test P-Value : 8.992e-05     
##                                         
##             Sensitivity : 0.8780        
##             Specificity : 0.5560        
##          Pos Pred Value : 0.7902        
##          Neg Pred Value : 0.7053        
##              Prevalence : 0.6557        
##          Detection Rate : 0.5757        
##    Detection Prevalence : 0.7286        
##       Balanced Accuracy : 0.7170        
##                                         
##        'Positive' Class : 0             
## 

Regression Data

dfrPlot <- mutate(dfrTrainDiabetesData, PrdVal=prdVal, POutcome=prdBln)
head(dfrPlot)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           0     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome     PrdVal POutcome
## 1                    0.627  50       1 0.41034201        0
## 2                    0.351  31       0 0.05135422        0
## 3                    0.672  32       1 0.72898218        1
## 4                    0.167  21       0 0.06577407        0
## 5                    2.288  33       1 0.52546528        1
## 6                    0.201  30       0 0.19265441        0

Regression Visulaization

#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome))  + 
    geom_point(shape=19, colour="blue", fill="blue") +
    geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
    labs(title="Binomial Regression Curve") +
    labs(x="") +
    labs(y="")

ROC Visulaization

#rocplot(logistic.model,diag=TRUE,pred.prob.labels=FALSE,prob.label.digits=3,AUC=TRUE)
#rocplot(mgmModel)

Test Data

dfrTests <- read.csv("./data/Test Data Diabetes.csv", header=T, stringsAsFactors=F)
head(dfrTests)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           0     122            76            27     200 35.9
## 2           6     125            78            31       0 27.6
## 3           1     168            88            29       0 35.0
## 4           2     129             0             0       0 38.5
## 5           4     110            76            20     100 28.4
## 6           6      80            80            36       0 39.8
##   DiabetesPedigreeFunction Age
## 1                    0.483  26
## 2                    0.565  49
## 3                    0.905  52
## 4                    0.304  41
## 5                    0.118  27
## 6                    0.177  28

Observation
Test Data successfully created.

Predict

resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("Diabetic", "Non-Diabetic")
dfrTests <- mutate(dfrTests, Result=resVal, Outcome=prdSur)
dfrTests 
##    Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1            0     122            76            27     200 35.9
## 2            6     125            78            31       0 27.6
## 3            1     168            88            29       0 35.0
## 4            2     129             0             0       0 38.5
## 5            4     110            76            20     100 28.4
## 6            6      80            80            36       0 39.8
## 7           10     115             0             0       0  0.0
## 8            2     127            46            21     335 34.4
## 9            9     164            78             0       0 32.8
## 10           2      93            64            32     160 38.0
## 11           3     158            64            13     387 31.2
## 12           5     126            78            27      22 29.6
## 13          10     129            62            36       0 41.2
## 14           0     134            58            20     291 26.4
## 15           3     102            74             0       0 29.5
## 16           7     187            50            33     392 33.9
## 17           3     173            78            39     185 33.8
## 18          10      94            72            18       0 23.1
## 19           1     108            60            46     178 35.5
## 20           5      97            76            27       0 35.6
## 21           4      83            86            19       0 29.3
## 22           1     114            66            36     200 38.1
## 23           1     149            68            29     127 29.3
## 24           5     117            86            30     105 39.1
## 25           1     111            94             0       0 32.8
## 26           4     112            78            40       0 39.4
## 27           1     116            78            29     180 36.1
## 28           0     141            84            26       0 32.4
## 29           2     175            88             0       0 22.9
## 30           2      92            52             0       0 30.1
## 31           3     130            78            23      79 28.4
## 32           8     120            86             0       0 28.4
## 33           2     174            88            37     120 44.5
## 34           2     106            56            27     165 29.0
## 35           2     105            75             0       0 23.3
## 36           4      95            60            32       0 35.4
## 37           0     126            86            27     120 27.4
## 38           8      65            72            23       0 32.0
## 39           2      99            60            17     160 36.6
## 40           1     102            74             0       0 39.5
## 41          11     120            80            37     150 42.3
## 42           3     102            44            20      94 30.8
## 43           1     109            58            18     116 28.5
## 44           9     140            94             0       0 32.7
## 45          13     153            88            37     140 40.6
## 46          12     100            84            33     105 30.0
## 47           1     147            94            41       0 49.3
## 48           1      81            74            41      57 46.3
## 49           3     187            70            22     200 36.4
## 50           6     162            62             0       0 24.3
## 51           4     136            70             0       0 31.2
## 52           1     121            78            39      74 39.0
## 53           3     108            62            24       0 26.0
## 54           0     181            88            44     510 43.3
## 55           8     154            78            32       0 32.4
## 56           1     128            88            39     110 36.5
## 57           7     137            90            41       0 32.0
## 58           0     123            72             0       0 36.3
## 59           1     106            76             0       0 37.5
## 60           6     190            92             0       0 35.5
## 61           2      88            58            26      16 28.4
## 62           9     170            74            31       0 44.0
## 63           9      89            62             0       0 22.5
## 64          10     101            76            48     180 32.9
## 65           2     122            70            27       0 36.8
## 66           5     121            72            23     112 26.2
## 67           1     126            60             0       0 30.1
## 68           1      93            70            31       0 30.4
##    DiabetesPedigreeFunction Age     Result      Outcome
## 1                     0.483  26 0.26487765     Diabetic
## 2                     0.565  49 0.30477416     Diabetic
## 3                     0.905  52 0.63567019 Non-Diabetic
## 4                     0.304  41 0.42809131     Diabetic
## 5                     0.118  27 0.17891277     Diabetic
## 6                     0.177  28 0.22309499     Diabetic
## 7                     0.261  30 0.04684399     Diabetic
## 8                     0.176  22 0.32895027     Diabetic
## 9                     0.148  45 0.79025778 Non-Diabetic
## 10                    0.674  23 0.17915652     Diabetic
## 11                    0.295  24 0.54207196 Non-Diabetic
## 12                    0.439  40 0.31993261     Diabetic
## 13                    0.441  38 0.73870819 Non-Diabetic
## 14                    0.352  21 0.18965614     Diabetic
## 15                    0.121  32 0.13838338     Diabetic
## 16                    0.826  34 0.87084669 Non-Diabetic
## 17                    0.970  31 0.70901244 Non-Diabetic
## 18                    0.595  56 0.15529198     Diabetic
## 19                    0.415  24 0.20080802     Diabetic
## 20                    0.378  52 0.23344176     Diabetic
## 21                    0.317  34 0.08810797     Diabetic
## 22                    0.289  21 0.27756710     Diabetic
## 23                    0.349  42 0.36181523     Diabetic
## 24                    0.251  42 0.44434841     Diabetic
## 25                    0.265  45 0.17983889     Diabetic
## 26                    0.236  38 0.37772434     Diabetic
## 27                    0.496  25 0.25639702     Diabetic
## 28                    0.433  22 0.33219383     Diabetic
## 29                    0.326  22 0.46759131     Diabetic
## 30                    0.141  22 0.09591609     Diabetic
## 31                    0.323  34 0.26894978     Diabetic
## 32                    0.259  22 0.34374102     Diabetic
## 33                    0.646  24 0.84803028 Non-Diabetic
## 34                    0.426  22 0.13273854     Diabetic
## 35                    0.560  53 0.08266761     Diabetic
## 36                    0.284  28 0.19636678     Diabetic
## 37                    0.515  21 0.16392563     Diabetic
## 38                    0.600  42 0.10439604     Diabetic
## 39                    0.453  21 0.19063102     Diabetic
## 40                    0.293  42 0.22603039     Diabetic
## 41                    0.785  48 0.72603579 Non-Diabetic
## 42                    0.400  26 0.15245049     Diabetic
## 43                    0.219  22 0.12366633     Diabetic
## 44                    0.734  45 0.62836083 Non-Diabetic
## 45                    1.174  39 0.89931884 Non-Diabetic
## 46                    0.488  46 0.34957467     Diabetic
## 47                    0.358  27 0.75196514 Non-Diabetic
## 48                    1.096  32 0.20885164     Diabetic
## 49                    0.408  36 0.82913007 Non-Diabetic
## 50                    0.178  50 0.52739308 Non-Diabetic
## 51                    1.182  22 0.39624524     Diabetic
## 52                    0.261  28 0.34367173     Diabetic
## 53                    0.223  25 0.12613154     Diabetic
## 54                    0.222  26 0.82812990 Non-Diabetic
## 55                    0.443  45 0.69522905 Non-Diabetic
## 56                    1.057  37 0.34668583     Diabetic
## 57                    0.391  39 0.52291242 Non-Diabetic
## 58                    0.258  52 0.27830365     Diabetic
## 59                    0.197  26 0.21873840     Diabetic
## 60                    0.278  66 0.88186852 Non-Diabetic
## 61                    0.766  22 0.07421535     Diabetic
## 62                    0.403  43 0.92417792 Non-Diabetic
## 63                    0.142  33 0.11427590     Diabetic
## 64                    0.171  63 0.35237604     Diabetic
## 65                    0.340  27 0.33873490     Diabetic
## 66                    0.245  30 0.22873744     Diabetic
## 67                    0.349  47 0.22143228     Diabetic
## 68                    0.315  23 0.08939939     Diabetic