Problem Defination
Refer given dataset. Check if a logistic model can be created to predict Diabetese status or “Outcome” (Diabetic / Healthy) based on features of the dataset given.
Setup
Set
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
#library(corrgram)
#library(gridExtra)
#library(Deducer)
library(caret)
## Loading required package: lattice
library(pscl)
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
Functions
Dataset
dfrModel <- read.csv("D:/R-BA/R-Scripts/data/Train Data Diabetes.csv", header=T, stringsAsFactors=F)
head(dfrModel)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 0 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Observation Dropping column SkinThickness due to lack of clarity Drop Columns
Drop all unrequired / aplha (descriptive only)
#dfrModel <- dplyr::select(dfrModel, -c(Name))
dfrModel <- dplyr::select(dfrModel, -c(SkinThickness))
head(dfrModel)
## Pregnancies Glucose BloodPressure Insulin BMI DiabetesPedigreeFunction
## 1 0 148 72 0 33.6 0.627
## 2 1 85 66 0 26.6 0.351
## 3 8 183 64 0 23.3 0.672
## 4 1 89 66 94 28.1 0.167
## 5 0 137 40 168 43.1 2.288
## 6 5 116 74 0 25.6 0.201
## Age Outcome
## 1 50 1
## 2 31 0
## 3 32 1
## 4 21 0
## 5 33 1
## 6 30 0
Missing Data
#sum(is.na(dfrModel$Age))
lapply(dfrModel, FUN=detect_na)
## $Pregnancies
## [1] 0
##
## $Glucose
## [1] 0
##
## $BloodPressure
## [1] 0
##
## $Insulin
## [1] 0
##
## $BMI
## [1] 0
##
## $DiabetesPedigreeFunction
## [1] 0
##
## $Age
## [1] 0
##
## $Outcome
## [1] 0
Observation No missing values Outliers Data
#detect_outliers(dfrModel$Age)
lapply(dfrModel, FUN=detect_outliers)
## $Pregnancies
## integer(0)
##
## $Glucose
## integer(0)
##
## $BloodPressure
## [1] 0 0 0 0 0 0 122 0 0 0 0 0 0 0 0 0 0
## [18] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## $Insulin
## [1] 543 846 495 485 495 478 744 680 545 465 579 474 480 600 540 480
##
## $BMI
## [1] 0.0 0.0 0.0 0.0 0.0 67.1 0.0 0.0 0.0 0.0 0.0
##
## $DiabetesPedigreeFunction
## [1] 2.288 1.893 1.781 2.329 2.137 1.731 2.420 1.699 1.698
##
## $Age
## [1] 81
##
## $Outcome
## integer(0)
Outliers Graph
plotgraph <- function(inp, na.rm=TRUE) {
ModelPlot <- ggplot(dfrModel, aes(x="", y=inp)) +
geom_boxplot(aes(fill=inp), color="blue") +
labs(title="Model Outliers")
ModelPlot
}
Correlation
vctCorr = numeric(0)
for (i in names(dfrModel)){
cor.result <- cor(as.numeric(dfrModel$Outcome), as.numeric(dfrModel[,i]))
vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr
## Pregnancies Glucose BloodPressure
## 0.22408577 0.45928020 0.06019258
## Insulin BMI DiabetesPedigreeFunction
## 0.14592233 0.30659734 0.17053194
## Age Outcome
## 0.22699018 1.00000000
Data For Visualization
dfrGraph <- gather(dfrModel, variable, value, -Outcome)
head(dfrGraph)
## Outcome variable value
## 1 1 Pregnancies 0
## 2 0 Pregnancies 1
## 3 1 Pregnancies 8
## 4 0 Pregnancies 1
## 5 1 Pregnancies 0
## 6 0 Pregnancies 5
Data Visualization
ggplot(dfrGraph) +
geom_jitter(aes(value,Outcome, colour=variable)) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of Outcome With Other Features")
Summary
lapply(dfrModel, FUN=summary)
## $Pregnancies
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.819 6.000 17.000
##
## $Glucose
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 99.0 116.5 120.5 140.2 199.0
##
## $BloodPressure
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 63.50 72.00 68.88 80.00 122.00
##
## $Insulin
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 36.50 79.88 126.50 846.00
##
## $BMI
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.00 32.00 31.89 36.50 67.10
##
## $DiabetesPedigreeFunction
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0780 0.2400 0.3755 0.4760 0.6370 2.4200
##
## $Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 24.00 29.00 33.12 40.00 81.00
##
## $Outcome
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3443 1.0000 1.0000
Find Best Multi Logistic Model
Choose the best logistic model by using step().
stpModel=step(glm(data=dfrModel, formula=Outcome~., family=binomial), trace=0, steps=100)
summary(stpModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction, family = binomial, data = dfrModel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7483 -0.7330 -0.4109 0.7159 2.8968
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.001307 0.712745 -11.226 < 2e-16 ***
## Pregnancies 0.154737 0.029182 5.302 1.14e-07 ***
## Glucose 0.033491 0.003518 9.520 < 2e-16 ***
## BloodPressure -0.012340 0.005243 -2.353 0.01860 *
## BMI 0.090916 0.014952 6.080 1.20e-09 ***
## DiabetesPedigreeFunction 0.886659 0.304068 2.916 0.00355 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 901.37 on 699 degrees of freedom
## Residual deviance: 660.13 on 694 degrees of freedom
## AIC: 672.13
##
## Number of Fisher Scoring iterations: 5
pR2(stpModel)
## llh llhNull G2 McFadden r2ML
## -330.0664745 -450.6861271 241.2393052 0.2676356 0.2915158
## r2CU
## 0.4025970
Make Final Multi Linear Model
# make model
mgmModel <- glm(data=dfrModel, formula=Outcome~Pregnancies+Glucose+BMI, family=binomial(link="logit"))
# print summary
summary(mgmModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI, family = binomial(link = "logit"),
## data = dfrModel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.1642 -0.7279 -0.4250 0.7684 2.8207
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.177774 0.670401 -12.198 < 2e-16 ***
## Pregnancies 0.136706 0.028022 4.879 1.07e-06 ***
## Glucose 0.033024 0.003431 9.624 < 2e-16 ***
## BMI 0.087133 0.014476 6.019 1.75e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 901.37 on 699 degrees of freedom
## Residual deviance: 674.54 on 696 degrees of freedom
## AIC: 682.54
##
## Number of Fisher Scoring iterations: 5
pR2(mgmModel)
## llh llhNull G2 McFadden r2ML
## -337.2698365 -450.6861271 226.8325814 0.2516525 0.2767834
## r2CU
## 0.3822509
Confusion Matrix
prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrModel$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 403 107
## 1 56 134
##
## Accuracy : 0.7671
## 95% CI : (0.734, 0.798)
## No Information Rate : 0.6557
## P-Value [Acc > NIR] : 9.981e-11
##
## Kappa : 0.457
## Mcnemar's Test P-Value : 8.992e-05
##
## Sensitivity : 0.8780
## Specificity : 0.5560
## Pos Pred Value : 0.7902
## Neg Pred Value : 0.7053
## Prevalence : 0.6557
## Detection Rate : 0.5757
## Detection Prevalence : 0.7286
## Balanced Accuracy : 0.7170
##
## 'Positive' Class : 0
##
Regression Data
dfrPlot <- mutate(dfrModel, PrdVal=prdVal, POutcome=prdBln)
head(dfrPlot)
## Pregnancies Glucose BloodPressure Insulin BMI DiabetesPedigreeFunction
## 1 0 148 72 0 33.6 0.627
## 2 1 85 66 0 26.6 0.351
## 3 8 183 64 0 23.3 0.672
## 4 1 89 66 94 28.1 0.167
## 5 0 137 40 168 43.1 2.288
## 6 5 116 74 0 25.6 0.201
## Age Outcome PrdVal POutcome
## 1 50 1 0.41034201 0
## 2 31 0 0.05135422 0
## 3 32 1 0.72898218 1
## 4 21 0 0.06577407 0
## 5 33 1 0.52546528 1
## 6 30 0 0.19265441 0
Regression Visulaization
#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome)) +
geom_point(shape=19, colour="blue", fill="blue") +
geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
labs(title="Binomial Regression Curve") +
labs(x="") +
labs(y="")
Test Data
dfrTests <- read.csv("D:/R-BA/R-Scripts/data/Test Data Diabetes.csv", header=T, stringsAsFactors=F)
dfrTests <- dplyr::select(dfrTests, -c(SkinThickness))
head(dfrTests)
## Pregnancies Glucose BloodPressure Insulin BMI DiabetesPedigreeFunction
## 1 0 122 76 200 35.9 0.483
## 2 6 125 78 0 27.6 0.565
## 3 1 168 88 0 35.0 0.905
## 4 2 129 0 0 38.5 0.304
## 5 4 110 76 100 28.4 0.118
## 6 6 80 80 0 39.8 0.177
## Age
## 1 26
## 2 49
## 3 52
## 4 41
## 5 27
## 6 28
Observation
Test Data successfully created.
Predict
resVal <- predict(mgmModel, dfrTests, type="response")
prdSur <- ifelse(resVal > 0.5, 1, 0)
prdSur <- as.factor(prdSur)
levels(prdSur) <- c("Diabetic", "Healthy")
dfrTests<- mutate(dfrTests, Result=resVal, Outcome=prdSur)
dfrTests
## Pregnancies Glucose BloodPressure Insulin BMI DiabetesPedigreeFunction
## 1 0 122 76 200 35.9 0.483
## 2 6 125 78 0 27.6 0.565
## 3 1 168 88 0 35.0 0.905
## 4 2 129 0 0 38.5 0.304
## 5 4 110 76 100 28.4 0.118
## 6 6 80 80 0 39.8 0.177
## 7 10 115 0 0 0.0 0.261
## 8 2 127 46 335 34.4 0.176
## 9 9 164 78 0 32.8 0.148
## 10 2 93 64 160 38.0 0.674
## 11 3 158 64 387 31.2 0.295
## 12 5 126 78 22 29.6 0.439
## 13 10 129 62 0 41.2 0.441
## 14 0 134 58 291 26.4 0.352
## 15 3 102 74 0 29.5 0.121
## 16 7 187 50 392 33.9 0.826
## 17 3 173 78 185 33.8 0.970
## 18 10 94 72 0 23.1 0.595
## 19 1 108 60 178 35.5 0.415
## 20 5 97 76 0 35.6 0.378
## 21 4 83 86 0 29.3 0.317
## 22 1 114 66 200 38.1 0.289
## 23 1 149 68 127 29.3 0.349
## 24 5 117 86 105 39.1 0.251
## 25 1 111 94 0 32.8 0.265
## 26 4 112 78 0 39.4 0.236
## 27 1 116 78 180 36.1 0.496
## 28 0 141 84 0 32.4 0.433
## 29 2 175 88 0 22.9 0.326
## 30 2 92 52 0 30.1 0.141
## 31 3 130 78 79 28.4 0.323
## 32 8 120 86 0 28.4 0.259
## 33 2 174 88 120 44.5 0.646
## 34 2 106 56 165 29.0 0.426
## 35 2 105 75 0 23.3 0.560
## 36 4 95 60 0 35.4 0.284
## 37 0 126 86 120 27.4 0.515
## 38 8 65 72 0 32.0 0.600
## 39 2 99 60 160 36.6 0.453
## 40 1 102 74 0 39.5 0.293
## 41 11 120 80 150 42.3 0.785
## 42 3 102 44 94 30.8 0.400
## 43 1 109 58 116 28.5 0.219
## 44 9 140 94 0 32.7 0.734
## 45 13 153 88 140 40.6 1.174
## 46 12 100 84 105 30.0 0.488
## 47 1 147 94 0 49.3 0.358
## 48 1 81 74 57 46.3 1.096
## 49 3 187 70 200 36.4 0.408
## 50 6 162 62 0 24.3 0.178
## 51 4 136 70 0 31.2 1.182
## 52 1 121 78 74 39.0 0.261
## 53 3 108 62 0 26.0 0.223
## 54 0 181 88 510 43.3 0.222
## 55 8 154 78 0 32.4 0.443
## 56 1 128 88 110 36.5 1.057
## 57 7 137 90 0 32.0 0.391
## 58 0 123 72 0 36.3 0.258
## 59 1 106 76 0 37.5 0.197
## 60 6 190 92 0 35.5 0.278
## 61 2 88 58 16 28.4 0.766
## 62 9 170 74 0 44.0 0.403
## 63 9 89 62 0 22.5 0.142
## 64 10 101 76 180 32.9 0.171
## 65 2 122 70 0 36.8 0.340
## 66 5 121 72 112 26.2 0.245
## 67 1 126 60 0 30.1 0.349
## 68 1 93 70 0 30.4 0.315
## Age Result Outcome
## 1 26 0.26487765 Diabetic
## 2 49 0.30477416 Diabetic
## 3 52 0.63567019 Healthy
## 4 41 0.42809131 Diabetic
## 5 27 0.17891277 Diabetic
## 6 28 0.22309499 Diabetic
## 7 30 0.04684399 Diabetic
## 8 22 0.32895027 Diabetic
## 9 45 0.79025778 Healthy
## 10 23 0.17915652 Diabetic
## 11 24 0.54207196 Healthy
## 12 40 0.31993261 Diabetic
## 13 38 0.73870819 Healthy
## 14 21 0.18965614 Diabetic
## 15 32 0.13838338 Diabetic
## 16 34 0.87084669 Healthy
## 17 31 0.70901244 Healthy
## 18 56 0.15529198 Diabetic
## 19 24 0.20080802 Diabetic
## 20 52 0.23344176 Diabetic
## 21 34 0.08810797 Diabetic
## 22 21 0.27756710 Diabetic
## 23 42 0.36181523 Diabetic
## 24 42 0.44434841 Diabetic
## 25 45 0.17983889 Diabetic
## 26 38 0.37772434 Diabetic
## 27 25 0.25639702 Diabetic
## 28 22 0.33219383 Diabetic
## 29 22 0.46759131 Diabetic
## 30 22 0.09591609 Diabetic
## 31 34 0.26894978 Diabetic
## 32 22 0.34374102 Diabetic
## 33 24 0.84803028 Healthy
## 34 22 0.13273854 Diabetic
## 35 53 0.08266761 Diabetic
## 36 28 0.19636678 Diabetic
## 37 21 0.16392563 Diabetic
## 38 42 0.10439604 Diabetic
## 39 21 0.19063102 Diabetic
## 40 42 0.22603039 Diabetic
## 41 48 0.72603579 Healthy
## 42 26 0.15245049 Diabetic
## 43 22 0.12366633 Diabetic
## 44 45 0.62836083 Healthy
## 45 39 0.89931884 Healthy
## 46 46 0.34957467 Diabetic
## 47 27 0.75196514 Healthy
## 48 32 0.20885164 Diabetic
## 49 36 0.82913007 Healthy
## 50 50 0.52739308 Healthy
## 51 22 0.39624524 Diabetic
## 52 28 0.34367173 Diabetic
## 53 25 0.12613154 Diabetic
## 54 26 0.82812990 Healthy
## 55 45 0.69522905 Healthy
## 56 37 0.34668583 Diabetic
## 57 39 0.52291242 Healthy
## 58 52 0.27830365 Diabetic
## 59 26 0.21873840 Diabetic
## 60 66 0.88186852 Healthy
## 61 22 0.07421535 Diabetic
## 62 43 0.92417792 Healthy
## 63 33 0.11427590 Diabetic
## 64 63 0.35237604 Diabetic
## 65 27 0.33873490 Diabetic
## 66 30 0.22873744 Diabetic
## 67 47 0.22143228 Diabetic
## 68 23 0.08939939 Diabetic