library(caTools)
## Warning: package 'caTools' was built under R version 3.6.1
library(ROCR)
## Warning: package 'ROCR' was built under R version 3.6.1
## Loading required package: gplots
## Warning: package 'gplots' was built under R version 3.6.1
##
## Attaching package: 'gplots'
## The following object is masked from 'package:stats':
##
## lowess
# Read db.csv from the working directory and preview its first six rows.
pimaData <- read.csv(file = "db.csv")
head(pimaData, n = 6L)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Per-column summary statistics (min, quartiles, mean, max).
summary(pimaData)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Show the structure of the data frame: dimensions and each column's type.
str(pimaData)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Replace the CSV headers with shorter names, in the original column order.
names(pimaData) <- c(
  "pregnancies", "glucose", "bp", "skinThickness", "insulin",
  "bmi", "dpf", "age", "outcome"
)
# Tabulate the NA flag of every cell; a lone FALSE count means no NA values.
table(is.na(pimaData))
##
## FALSE
## 6912
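There are no missing (NA) values in the dataset. Note, however, that the summary above shows minimum values of 0 for glucose, blood pressure, skin thickness, insulin and BMI, which may be unrecorded measurements stored as zero.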
# Hold out roughly 20% of the rows for evaluation.
# sample.split() expects the label vector, not the whole data frame. Given a
# data frame it returns one flag per *column* (9 here), which subset() then
# recycles across the rows instead of drawing a random split that preserves
# the ratio of outcome labels.
# NOTE: the printed output that follows in this document was produced before
# this fix (and without a seed), so re-running will give different numbers.
set.seed(123)  # make the random split reproducible
splitData <- sample.split(pimaData$outcome, SplitRatio = 0.8)
train <- subset(pimaData, splitData)
test <- subset(pimaData, !splitData)

# Logistic regression of outcome on all other columns, fitted on the
# training rows only.
model <- glm(outcome ~ ., data = train, family = "binomial")
summary(model)
##
## Call:
## glm(formula = outcome ~ ., family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.7035 -0.7241 -0.4041 0.7044 2.8671
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -8.8335216 0.8520401 -10.367 < 2e-16 ***
## pregnancies 0.1385463 0.0361898 3.828 0.000129 ***
## glucose 0.0342450 0.0043732 7.831 4.86e-15 ***
## bp -0.0111904 0.0061003 -1.834 0.066597 .
## skinThickness -0.0009522 0.0080535 -0.118 0.905879
## insulin -0.0010204 0.0010980 -0.929 0.352740
## bmi 0.0961369 0.0171581 5.603 2.11e-08 ***
## dpf 1.1910481 0.3549444 3.356 0.000792 ***
## age 0.0167324 0.0110002 1.521 0.128233
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 768.89 on 597 degrees of freedom
## Residual deviance: 560.13 on 589 degrees of freedom
## AIC: 578.13
##
## Number of Fisher Scoring iterations: 5
# Predicted probabilities for the held-out rows.
pred <- predict(model, newdata = test, type = "response")

# ROC curve: true-positive rate against false-positive rate.
rocPred <- prediction(pred, test$outcome)
rocPref <- performance(rocPred, "tpr", "fpr")
# ROCR's argument is spelled print.cutoffs.at; the previous spelling
# (print.cuttoffs.at) is not a ROCR plot argument, so the cutoff labels were
# not being requested. The sequence covers 0.1, 0.2, ..., 1.
plot(rocPref, colorize = TRUE, print.cutoffs.at = seq(0.1, 1, by = 0.1))

# Confusion matrix when predicting the positive class at probability > 0.3.
table(ActualValue = test$outcome, PredictedValue = pred > 0.3)
## PredictedValue
## ActualValue FALSE TRUE
## 0 73 34
## 1 11 52