The objective is to predict whether a patient has diabetes based on diagnostic measurements.
This dataset is from the National Institute of Diabetes and Digestive and Kidney Diseases. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
Attributes: Patient ID: serial number for the patient Pregnancies: Number of times pregnant Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test BloodPressure: Diastolic blood pressure (mm Hg) SkinThickness: Triceps skin fold thickness (mm) Insulin: 2-Hour serum insulin (mu U/ml) BMI: Body mass index (weight in kg/(height in m)^2) DiabetesPedigreeFunction: Diabetes pedigree function Age: Age (years) Outcome: Class variable (0 or 1)
setwd("D:/R-BA/R-Scripts")
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(caret)
## Loading required package: lattice
library(pscl)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
Functions
dfrTrainDiabetesData <- read.csv("./data/Train Data Diabetes.csv", header=T, stringsAsFactors=F)
intRowCount <- nrow(dfrTrainDiabetesData)
print(intRowCount)
## [1] 700
head(dfrTrainDiabetesData,20)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 0 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## 7 3 78 50 32 88 31.0
## 8 10 115 0 0 0 35.3
## 9 2 197 70 45 543 30.5
## 10 8 125 96 0 0 0.0
## 11 4 110 92 0 0 37.6
## 12 10 168 74 0 0 38.0
## 13 10 139 80 0 0 27.1
## 14 1 189 60 23 846 30.1
## 15 5 166 72 19 175 25.8
## 16 7 100 0 0 0 30.0
## 17 0 118 84 47 230 45.8
## 18 7 107 74 0 0 29.6
## 19 1 103 30 38 83 43.3
## 20 1 115 70 30 96 34.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
## 7 0.248 26 1
## 8 0.134 29 0
## 9 0.158 53 1
## 10 0.232 54 1
## 11 0.191 30 0
## 12 0.537 34 1
## 13 1.441 57 0
## 14 0.398 59 1
## 15 0.587 51 1
## 16 0.484 32 1
## 17 0.551 31 1
## 18 0.254 31 1
## 19 0.183 33 0
## 20 0.529 32 1
Observation Only numeric data is seen
Missing Data
#sum(is.na(dfrModel$Age))
lapply(dfrTrainDiabetesData, FUN=detect_na)
## $Pregnancies
## [1] 0
##
## $Glucose
## [1] 0
##
## $BloodPressure
## [1] 0
##
## $SkinThickness
## [1] 0
##
## $Insulin
## [1] 0
##
## $BMI
## [1] 0
##
## $DiabetesPedigreeFunction
## [1] 0
##
## $Age
## [1] 0
##
## $Outcome
## [1] 0
Observation No missing data is seen in any of the columns
Outliers Data
#detect_outliers(dfrModel$Age)
lapply(dfrTrainDiabetesData, FUN=detect_outliers)
## $Pregnancies
## integer(0)
##
## $Glucose
## integer(0)
##
## $BloodPressure
## [1] 0 0 0 0 0 0 122 0 0 0 0 0 0 0 0 0 0
## [18] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## $SkinThickness
## integer(0)
##
## $Insulin
## [1] 543 846 495 485 495 478 744 680 545 465 579 474 480 600 540 480
##
## $BMI
## [1] 0.0 0.0 0.0 0.0 0.0 67.1 0.0 0.0 0.0 0.0 0.0
##
## $DiabetesPedigreeFunction
## [1] 2.288 1.893 1.781 2.329 2.137 1.731 2.420 1.699 1.698
##
## $Age
## [1] 81
##
## $Outcome
## integer(0)
Visualizing the outliers
displaygraph <- function(inp, na.rm=TRUE) {
TrainDiabetesPlot <- ggplot(dfrTrainDiabetesData, aes(x="", y=inp)) +
geom_boxplot(aes(fill=inp), color="blue") +
labs(title="Outliers in TrainDiabetesData")
TrainDiabetesPlot
}
lapply(dfrTrainDiabetesData, FUN= displaygraph)
## $Pregnancies
##
## $Glucose
##
## $BloodPressure
##
## $SkinThickness
##
## $Insulin
##
## $BMI
##
## $DiabetesPedigreeFunction
##
## $Age
##
## $Outcome
Correlation
vctCorr = numeric(0)
for (i in names(dfrTrainDiabetesData)){
cor.result <- cor(as.numeric(dfrTrainDiabetesData$Outcome), as.numeric(dfrTrainDiabetesData[,i]))
vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrTrainDiabetesData)
dfrCorr
## Pregnancies Glucose BloodPressure
## 0.22408577 0.45928020 0.06019258
## SkinThickness Insulin BMI
## 0.08740524 0.14592233 0.30659734
## DiabetesPedigreeFunction Age Outcome
## 0.17053194 0.22699018 1.00000000
Data For Visualization
dfrGraph <- gather(dfrTrainDiabetesData, variable, value, -Outcome)
head(dfrGraph)
## Outcome variable value
## 1 1 Pregnancies 0
## 2 0 Pregnancies 1
## 3 1 Pregnancies 8
## 4 0 Pregnancies 1
## 5 1 Pregnancies 0
## 6 0 Pregnancies 5
Data Visualization
ggplot(dfrGraph) +
geom_jitter(aes(value,Outcome, colour=variable)) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of Outcome With Other Features")
Summary
lapply(dfrTrainDiabetesData, FUN=summary)
## $Pregnancies
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.819 6.000 17.000
##
## $Glucose
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 99.0 116.5 120.5 140.2 199.0
##
## $BloodPressure
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 63.50 72.00 68.88 80.00 122.00
##
## $SkinThickness
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 23.00 20.38 32.00 99.00
##
## $Insulin
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 36.50 79.88 126.50 846.00
##
## $BMI
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.00 32.00 31.89 36.50 67.10
##
## $DiabetesPedigreeFunction
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0780 0.2400 0.3755 0.4760 0.6370 2.4200
##
## $Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 24.00 29.00 33.12 40.00 81.00
##
## $Outcome
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3443 1.0000 1.0000
Observation
Find Best Multi Logistic Model with R Squared values
Choose the best logistic model by using step().
stpModel=step(glm(data=dfrTrainDiabetesData, formula=Outcome~., family=binomial), trace=0, steps=100)
summary(stpModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction, family = binomial, data = dfrTrainDiabetesData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7483 -0.7330 -0.4109 0.7159 2.8968
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.001307 0.712745 -11.226 < 2e-16 ***
## Pregnancies 0.154737 0.029182 5.302 1.14e-07 ***
## Glucose 0.033491 0.003518 9.520 < 2e-16 ***
## BloodPressure -0.012340 0.005243 -2.353 0.01860 *
## BMI 0.090916 0.014952 6.080 1.20e-09 ***
## DiabetesPedigreeFunction 0.886659 0.304068 2.916 0.00355 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 901.37 on 699 degrees of freedom
## Residual deviance: 660.13 on 694 degrees of freedom
## AIC: 672.13
##
## Number of Fisher Scoring iterations: 5
pR2(stpModel)
## llh llhNull G2 McFadden r2ML
## -330.0664745 -450.6861271 241.2393052 0.2676356 0.2915158
## r2CU
## 0.4025970
Observation
Best results given by Outcome ~ Pregnancies + Glucose + BMI
Make Final Multi Linear Model
# make model
mgmModel <- glm(data=dfrTrainDiabetesData, formula=Outcome~Pregnancies+Glucose+BMI, family=binomial(link="logit"))
# print summary
summary(mgmModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI, family = binomial(link = "logit"),
## data = dfrTrainDiabetesData)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1642 -0.7279 -0.4250 0.7684 2.8207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.177774 0.670401 -12.198 < 2e-16 ***
## Pregnancies 0.136706 0.028022 4.879 1.07e-06 ***
## Glucose 0.033024 0.003431 9.624 < 2e-16 ***
## BMI 0.087133 0.014476 6.019 1.75e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 901.37 on 699 degrees of freedom
## Residual deviance: 674.54 on 696 degrees of freedom
## AIC: 682.54
##
## Number of Fisher Scoring iterations: 5
pR2(mgmModel)
## llh llhNull G2 McFadden r2ML
## -337.2698365 -450.6861271 226.8325814 0.2516525 0.2767834
## r2CU
## 0.3822509
Confusion Matrix
prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrTrainDiabetesData$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 403 107
## 1 56 134
##
## Accuracy : 0.7671
## 95% CI : (0.734, 0.798)
## No Information Rate : 0.6557
## P-Value [Acc > NIR] : 9.981e-11
##
## Kappa : 0.457
## Mcnemar's Test P-Value : 8.992e-05
##
## Sensitivity : 0.8780
## Specificity : 0.5560
## Pos Pred Value : 0.7902
## Neg Pred Value : 0.7053
## Prevalence : 0.6557
## Detection Rate : 0.5757
## Detection Prevalence : 0.7286
## Balanced Accuracy : 0.7170
##
## 'Positive' Class : 0
##
Regression Data
dfrPlot <- mutate(dfrTrainDiabetesData, PrdVal=prdVal, POutcome=prdBln)
head(dfrPlot)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 0 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome PrdVal POutcome
## 1 0.627 50 1 0.41034201 0
## 2 0.351 31 0 0.05135422 0
## 3 0.672 32 1 0.72898218 1
## 4 0.167 21 0 0.06577407 0
## 5 2.288 33 1 0.52546528 1
## 6 0.201 30 0 0.19265441 0
Regression Visulaization
#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome)) +
geom_point(shape=19, colour="blue", fill="blue") +
geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
labs(title="Binomial Regression Curve") +
labs(x="") +
labs(y="")
ROC Visulaization
#rocplot(logistic.model,diag=TRUE,pred.prob.labels=FALSE,prob.label.digits=3,AUC=TRUE)
#rocplot(mgmModel)
Test Data
dfrTests <- read.csv("./data/Test Data Diabetes.csv", header=T, stringsAsFactors=F)
head(dfrTests)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 0 122 76 27 200 35.9
## 2 6 125 78 31 0 27.6
## 3 1 168 88 29 0 35.0
## 4 2 129 0 0 0 38.5
## 5 4 110 76 20 100 28.4
## 6 6 80 80 36 0 39.8
## DiabetesPedigreeFunction Age
## 1 0.483 26
## 2 0.565 49
## 3 0.905 52
## 4 0.304 41
## 5 0.118 27
## 6 0.177 28
Observation
Test Data successfully created.
Predict
resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("Diabetic", "Non-Diabetic")
dfrTests <- mutate(dfrTests, Result=resVal, Outcome=prdSur)
dfrTests
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 0 122 76 27 200 35.9
## 2 6 125 78 31 0 27.6
## 3 1 168 88 29 0 35.0
## 4 2 129 0 0 0 38.5
## 5 4 110 76 20 100 28.4
## 6 6 80 80 36 0 39.8
## 7 10 115 0 0 0 0.0
## 8 2 127 46 21 335 34.4
## 9 9 164 78 0 0 32.8
## 10 2 93 64 32 160 38.0
## 11 3 158 64 13 387 31.2
## 12 5 126 78 27 22 29.6
## 13 10 129 62 36 0 41.2
## 14 0 134 58 20 291 26.4
## 15 3 102 74 0 0 29.5
## 16 7 187 50 33 392 33.9
## 17 3 173 78 39 185 33.8
## 18 10 94 72 18 0 23.1
## 19 1 108 60 46 178 35.5
## 20 5 97 76 27 0 35.6
## 21 4 83 86 19 0 29.3
## 22 1 114 66 36 200 38.1
## 23 1 149 68 29 127 29.3
## 24 5 117 86 30 105 39.1
## 25 1 111 94 0 0 32.8
## 26 4 112 78 40 0 39.4
## 27 1 116 78 29 180 36.1
## 28 0 141 84 26 0 32.4
## 29 2 175 88 0 0 22.9
## 30 2 92 52 0 0 30.1
## 31 3 130 78 23 79 28.4
## 32 8 120 86 0 0 28.4
## 33 2 174 88 37 120 44.5
## 34 2 106 56 27 165 29.0
## 35 2 105 75 0 0 23.3
## 36 4 95 60 32 0 35.4
## 37 0 126 86 27 120 27.4
## 38 8 65 72 23 0 32.0
## 39 2 99 60 17 160 36.6
## 40 1 102 74 0 0 39.5
## 41 11 120 80 37 150 42.3
## 42 3 102 44 20 94 30.8
## 43 1 109 58 18 116 28.5
## 44 9 140 94 0 0 32.7
## 45 13 153 88 37 140 40.6
## 46 12 100 84 33 105 30.0
## 47 1 147 94 41 0 49.3
## 48 1 81 74 41 57 46.3
## 49 3 187 70 22 200 36.4
## 50 6 162 62 0 0 24.3
## 51 4 136 70 0 0 31.2
## 52 1 121 78 39 74 39.0
## 53 3 108 62 24 0 26.0
## 54 0 181 88 44 510 43.3
## 55 8 154 78 32 0 32.4
## 56 1 128 88 39 110 36.5
## 57 7 137 90 41 0 32.0
## 58 0 123 72 0 0 36.3
## 59 1 106 76 0 0 37.5
## 60 6 190 92 0 0 35.5
## 61 2 88 58 26 16 28.4
## 62 9 170 74 31 0 44.0
## 63 9 89 62 0 0 22.5
## 64 10 101 76 48 180 32.9
## 65 2 122 70 27 0 36.8
## 66 5 121 72 23 112 26.2
## 67 1 126 60 0 0 30.1
## 68 1 93 70 31 0 30.4
## DiabetesPedigreeFunction Age Result Outcome
## 1 0.483 26 0.26487765 Diabetic
## 2 0.565 49 0.30477416 Diabetic
## 3 0.905 52 0.63567019 Non-Diabetic
## 4 0.304 41 0.42809131 Diabetic
## 5 0.118 27 0.17891277 Diabetic
## 6 0.177 28 0.22309499 Diabetic
## 7 0.261 30 0.04684399 Diabetic
## 8 0.176 22 0.32895027 Diabetic
## 9 0.148 45 0.79025778 Non-Diabetic
## 10 0.674 23 0.17915652 Diabetic
## 11 0.295 24 0.54207196 Non-Diabetic
## 12 0.439 40 0.31993261 Diabetic
## 13 0.441 38 0.73870819 Non-Diabetic
## 14 0.352 21 0.18965614 Diabetic
## 15 0.121 32 0.13838338 Diabetic
## 16 0.826 34 0.87084669 Non-Diabetic
## 17 0.970 31 0.70901244 Non-Diabetic
## 18 0.595 56 0.15529198 Diabetic
## 19 0.415 24 0.20080802 Diabetic
## 20 0.378 52 0.23344176 Diabetic
## 21 0.317 34 0.08810797 Diabetic
## 22 0.289 21 0.27756710 Diabetic
## 23 0.349 42 0.36181523 Diabetic
## 24 0.251 42 0.44434841 Diabetic
## 25 0.265 45 0.17983889 Diabetic
## 26 0.236 38 0.37772434 Diabetic
## 27 0.496 25 0.25639702 Diabetic
## 28 0.433 22 0.33219383 Diabetic
## 29 0.326 22 0.46759131 Diabetic
## 30 0.141 22 0.09591609 Diabetic
## 31 0.323 34 0.26894978 Diabetic
## 32 0.259 22 0.34374102 Diabetic
## 33 0.646 24 0.84803028 Non-Diabetic
## 34 0.426 22 0.13273854 Diabetic
## 35 0.560 53 0.08266761 Diabetic
## 36 0.284 28 0.19636678 Diabetic
## 37 0.515 21 0.16392563 Diabetic
## 38 0.600 42 0.10439604 Diabetic
## 39 0.453 21 0.19063102 Diabetic
## 40 0.293 42 0.22603039 Diabetic
## 41 0.785 48 0.72603579 Non-Diabetic
## 42 0.400 26 0.15245049 Diabetic
## 43 0.219 22 0.12366633 Diabetic
## 44 0.734 45 0.62836083 Non-Diabetic
## 45 1.174 39 0.89931884 Non-Diabetic
## 46 0.488 46 0.34957467 Diabetic
## 47 0.358 27 0.75196514 Non-Diabetic
## 48 1.096 32 0.20885164 Diabetic
## 49 0.408 36 0.82913007 Non-Diabetic
## 50 0.178 50 0.52739308 Non-Diabetic
## 51 1.182 22 0.39624524 Diabetic
## 52 0.261 28 0.34367173 Diabetic
## 53 0.223 25 0.12613154 Diabetic
## 54 0.222 26 0.82812990 Non-Diabetic
## 55 0.443 45 0.69522905 Non-Diabetic
## 56 1.057 37 0.34668583 Diabetic
## 57 0.391 39 0.52291242 Non-Diabetic
## 58 0.258 52 0.27830365 Diabetic
## 59 0.197 26 0.21873840 Diabetic
## 60 0.278 66 0.88186852 Non-Diabetic
## 61 0.766 22 0.07421535 Diabetic
## 62 0.403 43 0.92417792 Non-Diabetic
## 63 0.142 33 0.11427590 Diabetic
## 64 0.171 63 0.35237604 Diabetic
## 65 0.340 27 0.33873490 Diabetic
## 66 0.245 30 0.22873744 Diabetic
## 67 0.349 47 0.22143228 Diabetic
## 68 0.315 23 0.08939939 Diabetic