Problem Definition
Use diabetes-train.csv to create a logistic model Use diabetes-test.csv and to predict based on diagnostic measurements whether a patient has diabetes.
This dataset is originally from the National Institute of Diabetes and Digestive and Kidney Diseases. Several constraints were placed on the selection of these instances from a larger database. In particular, all patients here are females at least 21 years old of Pima Indian heritage.
Attributes: Patient ID: serial number for the patient Pregnancies: Number of times pregnant Glucose: Plasma glucose concentration a 2 hours in an oral glucose tolerance test BloodPressure: Diastolic blood pressure (mm Hg) SkinThickness: Triceps skin fold thickness (mm) Insulin: 2-Hour serum insulin (mu U/ml) BMI: Body mass index (weight in kg/(height in m)^2) DiabetesPedigreeFunction: Diabetes pedigree function Age: Age (years) Outcome: Class variable (0 or 1)
Setup
library(tidyr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(ggplot2)
library(corrgram)
library(gridExtra)
##
## Attaching package: 'gridExtra'
## The following object is masked from 'package:dplyr':
##
## combine
library(Deducer)
## Loading required package: JGR
## Loading required package: rJava
## Loading required package: JavaGD
## Loading required package: iplots
## Note: On Mac OS X we strongly recommend using iplots from within JGR.
## Proceed at your own risk as iplots cannot resolve potential ev.loop deadlocks.
## 'Yes' is assumed for all dialogs as they cannot be shown without a deadlock,
## also ievent.wait() is disabled.
## More recent OS X version do not allow signle-threaded GUIs and will fail.
##
## Please type JGR() to launch console. Platform specific launchers (.exe and .app) can also be obtained at http://www.rforge.net/JGR/files/.
## Loading required package: car
##
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
##
## recode
## Loading required package: MASS
##
## Attaching package: 'MASS'
## The following object is masked from 'package:dplyr':
##
## select
##
##
## Note Non-JGR console detected:
## Deducer is best used from within JGR (http://jgr.markushelbig.org/).
## To Bring up GUI dialogs, type deducer().
library(caret)
## Loading required package: lattice
library(pscl)
## Classes and Methods for R developed in the
## Political Science Computational Laboratory
## Department of Political Science
## Stanford University
## Simon Jackman
## hurdle and zeroinfl functions by Achim Zeileis
Functions
Dataset
dfrModel <- read.csv("/Users/Charu/Desktop/Machine Learning/diabetes-train.csv", header=T, stringsAsFactors=F)
head(dfrModel)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
Missing Data
#sum(is.na(dfrModel))
lapply(dfrModel, FUN=detect_na)
## $Pregnancies
## [1] 0
##
## $Glucose
## [1] 0
##
## $BloodPressure
## [1] 0
##
## $SkinThickness
## [1] 0
##
## $Insulin
## [1] 0
##
## $BMI
## [1] 0
##
## $DiabetesPedigreeFunction
## [1] 0
##
## $Age
## [1] 0
##
## $Outcome
## [1] 0
Outliers Data
#detect_outliers(dfrModel$BMI)
lapply(dfrModel, FUN=detect_outliers)
## $Pregnancies
## integer(0)
##
## $Glucose
## integer(0)
##
## $BloodPressure
## [1] 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
##
## $SkinThickness
## integer(0)
##
## $Insulin
## [1] 543 846 495 485 495 478 744 680 545 465 579 474 480 600 540 480
##
## $BMI
## [1] 0.0 0.0 0.0 0.0 0.0 67.1 0.0 0.0 0.0 0.0 0.0
##
## $DiabetesPedigreeFunction
## [1] 2.288 1.893 1.781 2.329 2.137 1.731 2.420 1.699 1.698
##
## $Age
## [1] 81
##
## $Outcome
## integer(0)
Functions
lapply(dfrModel, FUN=graph_outliers )
## $Pregnancies
##
## $Glucose
##
## $BloodPressure
##
## $SkinThickness
##
## $Insulin
##
## $BMI
##
## $DiabetesPedigreeFunction
##
## $Age
##
## $Outcome
Observation
Outliers present in many features.
But Outlier count is Moderate.
For this model we will work with the outliers.
Correlation
vctCorr = numeric(0)
for (i in names(dfrModel)){
cor.result <- cor(as.numeric(dfrModel$Outcome), as.numeric(dfrModel[,i]))
vctCorr <- c(vctCorr, cor.result)
}
dfrCorr <- vctCorr
names(dfrCorr) <- names(dfrModel)
dfrCorr
## Pregnancies Glucose BloodPressure
## 0.22788336 0.45937296 0.06027482
## SkinThickness Insulin BMI
## 0.08621860 0.14530972 0.30891558
## DiabetesPedigreeFunction Age Outcome
## 0.17211110 0.22650741 1.00000000
Data For Visualization
dfrGraph <- gather(dfrModel, variable, value, -Outcome)
head(dfrGraph)
## Outcome variable value
## 1 1 Pregnancies 6
## 2 0 Pregnancies 1
## 3 1 Pregnancies 8
## 4 0 Pregnancies 1
## 5 1 Pregnancies 0
## 6 0 Pregnancies 5
Data Visualization
ggplot(dfrGraph) +
geom_jitter(aes(value,Outcome, colour=variable)) +
facet_wrap(~variable, scales="free_x") +
labs(title="Relation Of Outcome With Other Features")
Summary
lapply(dfrModel, FUN=summary)
## $Pregnancies
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 1.000 3.000 3.827 6.000 17.000
##
## $Glucose
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0 99.0 116.0 120.5 140.5 199.0
##
## $BloodPressure
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 63.00 72.00 68.88 80.00 122.00
##
## $SkinThickness
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 23.00 20.41 32.00 99.00
##
## $Insulin
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 0.00 37.00 79.99 127.00 846.00
##
## $BMI
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.00 27.00 32.00 31.87 36.50 67.10
##
## $DiabetesPedigreeFunction
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0780 0.2400 0.3750 0.4754 0.6340 2.4200
##
## $Age
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 21.00 24.00 29.00 33.13 40.00 81.00
##
## $Outcome
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.0000 0.0000 0.0000 0.3448 1.0000 1.0000
Find Best Multi Logistic Model
Choose the best logistic model by using step().
stpModel=step(glm(data=dfrModel, formula=Outcome~., family=binomial), trace=0, steps=100)
summary(stpModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction, family = binomial, data = dfrModel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7640 -0.7318 -0.4090 0.7146 2.8871
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.053940 0.716823 -11.236 < 2e-16 ***
## Pregnancies 0.157926 0.029314 5.387 7.15e-08 ***
## Glucose 0.033384 0.003518 9.488 < 2e-16 ***
## BloodPressure -0.012549 0.005256 -2.388 0.01696 *
## BMI 0.092717 0.015076 6.150 7.75e-10 ***
## DiabetesPedigreeFunction 0.909844 0.305380 2.979 0.00289 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 900.53 on 698 degrees of freedom
## Residual deviance: 657.22 on 693 degrees of freedom
## AIC: 669.22
##
## Number of Fisher Scoring iterations: 5
Observation
Best Results give Outcome ~ Pregnancies + Glucose + BloodPressure + BMI + DiabetesPedigreeFunction
Make Final Multi Linear Model
# make model
mgmModel <- glm(data=dfrModel, formula=Outcome ~ Pregnancies + Glucose + BloodPressure + BMI + DiabetesPedigreeFunction, family=binomial(link="logit"))
# print summary
summary(mgmModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BloodPressure +
## BMI + DiabetesPedigreeFunction, family = binomial(link = "logit"),
## data = dfrModel)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7640 -0.7318 -0.4090 0.7146 2.8871
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.053940 0.716823 -11.236 < 2e-16 ***
## Pregnancies 0.157926 0.029314 5.387 7.15e-08 ***
## Glucose 0.033384 0.003518 9.488 < 2e-16 ***
## BloodPressure -0.012549 0.005256 -2.388 0.01696 *
## BMI 0.092717 0.015076 6.150 7.75e-10 ***
## DiabetesPedigreeFunction 0.909844 0.305380 2.979 0.00289 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 900.53 on 698 degrees of freedom
## Residual deviance: 657.22 on 693 degrees of freedom
## AIC: 669.22
##
## Number of Fisher Scoring iterations: 5
Confusion Matrix
prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd=prdBln, act=dfrModel$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 403 103
## 1 55 138
##
## Accuracy : 0.774
## 95% CI : (0.7411, 0.8045)
## No Information Rate : 0.6552
## P-Value [Acc > NIR] : 5.519e-12
##
## Kappa : 0.4749
## Mcnemar's Test P-Value : 0.0001847
##
## Sensitivity : 0.8799
## Specificity : 0.5726
## Pos Pred Value : 0.7964
## Neg Pred Value : 0.7150
## Prevalence : 0.6552
## Detection Rate : 0.5765
## Detection Prevalence : 0.7239
## Balanced Accuracy : 0.7263
##
## 'Positive' Class : 0
##
Regression Data
dfrPlot <- mutate(dfrModel, PrdVal=prdVal, POutcome=prdBln)
head(dfrPlot)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome PrdVal POutcome
## 1 0.627 50 1 0.64944982 1
## 2 0.351 31 0 0.04306549 0
## 3 0.672 32 1 0.78368126 1
## 4 0.167 21 0 0.04761529 0
## 5 2.288 33 1 0.89046208 1
## 6 0.201 30 0 0.14628967 0
Regression Visulaization
#dfrPlot
ggplot(dfrPlot, aes(x=PrdVal, y=POutcome)) +
geom_point(shape=19, colour="blue", fill="blue") +
geom_smooth(method="gam", formula=y~s(log(x)), se=FALSE) +
labs(title="Binomial Regression Curve") +
labs(x="") +
labs(y="")
ROC Visulaization
#rocplot(logistic.model,diag=TRUE,pred.prob.labels=FALSE,prob.label.digits=3,AUC=TRUE)
rocplot(mgmModel)
Test Data
dfrTests <- read.csv("/Users/Charu/Desktop/Machine Learning/diabetes-test.csv", header=T, stringsAsFactors=F)
head(dfrTests)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 4 118 70 0 0 44.5
## 2 2 122 76 27 200 35.9
## 3 6 125 78 31 0 27.6
## 4 1 168 88 29 0 35.0
## 5 2 129 0 0 0 38.5
## 6 4 110 76 20 100 28.4
## DiabetesPedigreeFunction Age
## 1 0.904 26
## 2 0.483 26
## 3 0.565 49
## 4 0.905 52
## 5 0.304 41
## 6 0.118 27
Predict
resVal <- predict(mgmModel, dfrTests, type="response")
prdDia <- ifelse(resVal > 0.5, 1, 0)
prdDia <- as.factor(prdDia)
levels(prdDia) <- c("NotDiabetic", "Diabetic")
dfrTests <- mutate(dfrTests, Result=resVal, Outcome=prdDia)
dfrTests
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 4 118 70 0 0 44.5
## 2 2 122 76 27 200 35.9
## 3 6 125 78 31 0 27.6
## 4 1 168 88 29 0 35.0
## 5 2 129 0 0 0 38.5
## 6 4 110 76 20 100 28.4
## 7 6 80 80 36 0 39.8
## 8 10 115 0 0 0 0.0
## 9 2 127 46 21 335 34.4
## 10 9 164 78 0 0 32.8
## 11 2 93 64 32 160 38.0
## 12 3 158 64 13 387 31.2
## 13 5 126 78 27 22 29.6
## 14 10 129 62 36 0 41.2
## 15 0 134 58 20 291 26.4
## 16 3 102 74 0 0 29.5
## 17 7 187 50 33 392 33.9
## 18 3 173 78 39 185 33.8
## 19 10 94 72 18 0 23.1
## 20 1 108 60 46 178 35.5
## 21 5 97 76 27 0 35.6
## 22 4 83 86 19 0 29.3
## 23 1 114 66 36 200 38.1
## 24 1 149 68 29 127 29.3
## 25 5 117 86 30 105 39.1
## 26 1 111 94 0 0 32.8
## 27 4 112 78 40 0 39.4
## 28 1 116 78 29 180 36.1
## 29 0 141 84 26 0 32.4
## 30 2 175 88 0 0 22.9
## 31 2 92 52 0 0 30.1
## 32 3 130 78 23 79 28.4
## 33 8 120 86 0 0 28.4
## 34 2 174 88 37 120 44.5
## 35 2 106 56 27 165 29.0
## 36 2 105 75 0 0 23.3
## 37 4 95 60 32 0 35.4
## 38 0 126 86 27 120 27.4
## 39 8 65 72 23 0 32.0
## 40 2 99 60 17 160 36.6
## 41 1 102 74 0 0 39.5
## 42 11 120 80 37 150 42.3
## 43 3 102 44 20 94 30.8
## 44 1 109 58 18 116 28.5
## 45 9 140 94 0 0 32.7
## 46 13 153 88 37 140 40.6
## 47 12 100 84 33 105 30.0
## 48 1 147 94 41 0 49.3
## 49 1 81 74 41 57 46.3
## 50 3 187 70 22 200 36.4
## 51 6 162 62 0 0 24.3
## 52 4 136 70 0 0 31.2
## 53 1 121 78 39 74 39.0
## 54 3 108 62 24 0 26.0
## 55 0 181 88 44 510 43.3
## 56 8 154 78 32 0 32.4
## 57 1 128 88 39 110 36.5
## 58 7 137 90 41 0 32.0
## 59 0 123 72 0 0 36.3
## 60 1 106 76 0 0 37.5
## 61 6 190 92 0 0 35.5
## 62 2 88 58 26 16 28.4
## 63 9 170 74 31 0 44.0
## 64 9 89 62 0 0 22.5
## 65 10 101 76 48 180 32.9
## 66 2 122 70 27 0 36.8
## 67 5 121 72 23 112 26.2
## 68 1 126 60 0 0 30.1
## 69 1 93 70 31 0 30.4
## DiabetesPedigreeFunction Age Result Outcome
## 1 0.904 26 0.64267647 Diabetic
## 2 0.483 26 0.29921925 NotDiabetic
## 3 0.565 49 0.30170247 NotDiabetic
## 4 0.905 52 0.66298462 Diabetic
## 5 0.304 41 0.60218508 Diabetic
## 6 0.118 27 0.12311702 NotDiabetic
## 7 0.177 28 0.16960382 NotDiabetic
## 8 0.261 30 0.08331746 NotDiabetic
## 9 0.176 22 0.32606579 NotDiabetic
## 10 0.148 45 0.73870615 Diabetic
## 11 0.674 23 0.21415309 NotDiabetic
## 12 0.295 24 0.51311756 Diabetic
## 13 0.439 40 0.29050080 NotDiabetic
## 14 0.441 38 0.78158743 Diabetic
## 15 0.352 21 0.17647903 NotDiabetic
## 16 0.121 32 0.09462590 NotDiabetic
## 17 0.826 34 0.92833929 Diabetic
## 18 0.970 31 0.77430137 Diabetic
## 19 0.595 56 0.17407344 NotDiabetic
## 20 0.415 24 0.20189551 NotDiabetic
## 21 0.378 52 0.20830398 NotDiabetic
## 22 0.317 34 0.06148078 NotDiabetic
## 23 0.289 21 0.24544457 NotDiabetic
## 24 0.349 42 0.32277527 NotDiabetic
## 25 0.251 42 0.35800410 NotDiabetic
## 26 0.265 45 0.11028728 NotDiabetic
## 27 0.236 38 0.31124115 NotDiabetic
## 28 0.496 25 0.23076938 NotDiabetic
## 29 0.433 22 0.26835720 NotDiabetic
## 30 0.326 22 0.35883181 NotDiabetic
## 31 0.141 22 0.08315153 NotDiabetic
## 32 0.323 34 0.21550226 NotDiabetic
## 33 0.259 22 0.26994538 NotDiabetic
## 34 0.646 24 0.84290582 Diabetic
## 35 0.426 22 0.13874107 NotDiabetic
## 36 0.560 53 0.07556685 NotDiabetic
## 37 0.284 28 0.18798455 NotDiabetic
## 38 0.515 21 0.12810698 NotDiabetic
## 39 0.600 42 0.11802413 NotDiabetic
## 40 0.453 21 0.20093624 NotDiabetic
## 41 0.293 42 0.18383046 NotDiabetic
## 42 0.785 48 0.78943297 Diabetic
## 43 0.400 26 0.18129856 NotDiabetic
## 44 0.219 22 0.10495385 NotDiabetic
## 45 0.734 45 0.63672200 Diabetic
## 46 1.174 39 0.94453500 Diabetic
## 47 0.488 46 0.34319929 NotDiabetic
## 48 0.358 27 0.67446282 Diabetic
## 49 1.096 32 0.30352146 NotDiabetic
## 50 0.408 36 0.82204322 Diabetic
## 51 0.178 50 0.48467839 NotDiabetic
## 52 1.182 22 0.55174117 Diabetic
## 53 0.261 28 0.27249731 NotDiabetic
## 54 0.223 25 0.10534153 NotDiabetic
## 55 0.222 26 0.75041171 Diabetic
## 56 0.443 45 0.68542367 Diabetic
## 57 1.057 37 0.40580178 NotDiabetic
## 58 0.391 39 0.45471929 NotDiabetic
## 59 0.258 52 0.22254103 NotDiabetic
## 60 0.197 26 0.16044107 NotDiabetic
## 61 0.278 66 0.83567334 Diabetic
## 62 0.766 22 0.09992387 NotDiabetic
## 63 0.403 43 0.92825503 Diabetic
## 64 0.142 33 0.09760062 NotDiabetic
## 65 0.171 63 0.29928838 NotDiabetic
## 66 0.340 27 0.30525694 NotDiabetic
## 67 0.245 30 0.18598321 NotDiabetic
## 68 0.349 47 0.20844816 NotDiabetic
## 69 0.315 23 0.07145369 NotDiabetic
Confusion Matrix
prdVal <- predict(mgmModel, type='response')
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx1 <- table(prdDia=prdBln, act=dfrModel$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 403 103
## 1 55 138
##
## Accuracy : 0.774
## 95% CI : (0.7411, 0.8045)
## No Information Rate : 0.6552
## P-Value [Acc > NIR] : 5.519e-12
##
## Kappa : 0.4749
## Mcnemar's Test P-Value : 0.0001847
##
## Sensitivity : 0.8799
## Specificity : 0.5726
## Pos Pred Value : 0.7964
## Neg Pred Value : 0.7150
## Prevalence : 0.6552
## Detection Rate : 0.5765
## Detection Prevalence : 0.7239
## Balanced Accuracy : 0.7263
##
## 'Positive' Class : 0
##