Problem Definition
Study the given dataset. Given the features of the dataset, decide the best possible features to use for predicting if a patient has Diabetes.
Dataset
Use niddkd-diabetes.csv. Refer to niddkd-diabetes.txt for the data dictionary. Use the niddkd-diabetes.prd dataset to predict whether a patient has diabetes.
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: grid
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.5.3
##
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
##
## panel.fill
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
# Load the diabetes dataset. `stringsAsFactors = FALSE` keeps any text columns
# as character vectors; the literal FALSE is used because `F` is an ordinary
# variable that can be reassigned.
df <- read.csv("D:/ISME/R class/data/niddkd-diabetes.csv", stringsAsFactors = FALSE)
# Preview the first six rows.
head(df)
## Pregnancies Glucose BloodPressure SkinThickness Insulin BMI
## 1 6 148 72 35 0 33.6
## 2 1 85 66 29 0 26.6
## 3 8 183 64 0 0 23.3
## 4 1 89 66 23 94 28.1
## 5 0 137 40 35 168 43.1
## 6 5 116 74 0 0 25.6
## DiabetesPedigreeFunction Age Outcome
## 1 0.627 50 1
## 2 0.351 31 0
## 3 0.672 32 1
## 4 0.167 21 0
## 5 2.288 33 1
## 6 0.201 30 0
# Show the storage type and the first few values of every column.
str(df)
## 'data.frame': 768 obs. of 9 variables:
## $ Pregnancies : int 6 1 8 1 0 5 3 10 2 8 ...
## $ Glucose : int 148 85 183 89 137 116 78 115 197 125 ...
## $ BloodPressure : int 72 66 64 66 40 74 50 0 70 96 ...
## $ SkinThickness : int 35 29 0 23 35 0 32 0 45 0 ...
## $ Insulin : int 0 0 0 94 168 0 88 0 543 0 ...
## $ BMI : num 33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
## $ DiabetesPedigreeFunction: num 0.627 0.351 0.672 0.167 2.288 ...
## $ Age : int 50 31 32 21 33 30 26 29 53 54 ...
## $ Outcome : int 1 0 1 0 1 0 1 0 1 1 ...
# Per-column min / quartiles / mean / max, used to spot implausible values.
summary(df)
## Pregnancies Glucose BloodPressure SkinThickness
## Min. : 0.000 Min. : 0.0 Min. : 0.00 Min. : 0.00
## 1st Qu.: 1.000 1st Qu.: 99.0 1st Qu.: 62.00 1st Qu.: 0.00
## Median : 3.000 Median :117.0 Median : 72.00 Median :23.00
## Mean : 3.845 Mean :120.9 Mean : 69.11 Mean :20.54
## 3rd Qu.: 6.000 3rd Qu.:140.2 3rd Qu.: 80.00 3rd Qu.:32.00
## Max. :17.000 Max. :199.0 Max. :122.00 Max. :99.00
## Insulin BMI DiabetesPedigreeFunction Age
## Min. : 0.0 Min. : 0.00 Min. :0.0780 Min. :21.00
## 1st Qu.: 0.0 1st Qu.:27.30 1st Qu.:0.2437 1st Qu.:24.00
## Median : 30.5 Median :32.00 Median :0.3725 Median :29.00
## Mean : 79.8 Mean :31.99 Mean :0.4719 Mean :33.24
## 3rd Qu.:127.2 3rd Qu.:36.60 3rd Qu.:0.6262 3rd Qu.:41.00
## Max. :846.0 Max. :67.10 Max. :2.4200 Max. :81.00
## Outcome
## Min. :0.000
## 1st Qu.:0.000
## Median :0.000
## Mean :0.349
## 3rd Qu.:1.000
## Max. :1.000
# Count the missing (NA) entries in `inp`.
#
# inp: a vector (or any object accepted by is.na()).
#
# Returns a single integer: the number of NA entries.
detect_na <- function(inp) {
  missing_flags <- is.na(inp)
  length(which(missing_flags))
}
# Detect outliers with the 1.5 * IQR rule.
#
# inp:   numeric vector to inspect.
# na.rm: forwarded to quantile() and IQR(); when TRUE (the default) missing
#        values are ignored while the fences are computed.
#
# Returns the elements of `inp` that lie below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR, in their original order. Missing values already present in
# `inp` are never reported as outliers.
detect_outliers <- function(inp, na.rm=TRUE) {
  i.qnt <- quantile(inp, probs = c(0.25, 0.75), na.rm = na.rm)
  i.max <- 1.5 * IQR(inp, na.rm = na.rm)
  lower <- i.qnt[[1]] - i.max
  upper <- i.qnt[[2]] + i.max
  # Comparisons against NA yield NA; which() drops those, so only values that
  # really fall outside the fences are selected.
  inp[which(inp < lower | inp > upper)]
}
# List, for every column of df, the values that detect_outliers() flags.
lapply(df,detect_outliers)
## $Pregnancies
## [1] 15 17 14 14
##
## $Glucose
## [1] 0 0 0 0 0
##
## $BloodPressure
## [1] 0 0 30 110 0 0 0 0 108 122 30 0 110 0 0 0 0
## [18] 0 0 0 0 0 0 108 0 0 0 0 0 0 0 0 0 0
## [35] 110 0 24 0 0 0 0 114 0 0 0
##
## $SkinThickness
## [1] 99
##
## $Insulin
## [1] 543 846 342 495 325 485 495 478 744 370 680 402 375 545 360 325 465
## [18] 325 415 579 474 328 480 326 330 600 321 440 540 480 335 387 392 510
##
## $BMI
## [1] 0.0 0.0 0.0 0.0 53.2 55.0 0.0 67.1 52.3 52.3 52.9 0.0 0.0 59.4
## [15] 0.0 0.0 57.3 0.0 0.0
##
## $DiabetesPedigreeFunction
## [1] 2.288 1.441 1.390 1.893 1.781 1.222 1.400 1.321 1.224 2.329 1.318
## [12] 1.213 1.353 1.224 1.391 1.476 2.137 1.731 1.268 1.600 2.420 1.251
## [23] 1.699 1.258 1.282 1.698 1.461 1.292 1.394
##
## $Age
## [1] 69 67 72 81 67 67 70 68 69
##
## $Outcome
## integer(0)
# Blank out outliers using the 1.5 * IQR rule.
#
# inp:   numeric vector to clean.
# na.rm: forwarded to quantile() and IQR() when the fences are computed.
#
# Returns `inp` with every value below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR replaced by NA; all other entries are left untouched.
Replace_outliers <- function(inp, na.rm=TRUE) {
  quartiles <- quantile(inp, probs = c(0.25, 0.75), na.rm = na.rm)
  fence <- 1.5 * IQR(inp, na.rm = na.rm)
  lower <- quartiles[[1]] - fence
  upper <- quartiles[[2]] + fence
  outside <- which(inp < lower | inp > upper)
  replace(inp, outside, NA)
}
# Blank out the outliers of every column. vapply() states the expected shape
# of each result (one numeric vector of nrow(df) values per column), so an
# unexpected column type fails loudly instead of silently changing what
# sapply() would have returned.
df <- vapply(df, Replace_outliers, numeric(nrow(df)))
# The step above yields a numeric matrix; turn it back into a data frame.
df <- data.frame(df, stringsAsFactors = FALSE)
# Count how many values per column were blanked out (now NA).
lapply(df, detect_na)
## $Pregnancies
## [1] 4
##
## $Glucose
## [1] 5
##
## $BloodPressure
## [1] 45
##
## $SkinThickness
## [1] 1
##
## $Insulin
## [1] 34
##
## $BMI
## [1] 19
##
## $DiabetesPedigreeFunction
## [1] 29
##
## $Age
## [1] 9
##
## $Outcome
## [1] 0
# Mean-impute the NAs left behind by the outlier removal.
# Whole-number measurements: fill with the column mean rounded to 0 digits.
df[c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "Age")] <- lapply(
  df[c("Pregnancies", "Glucose", "BloodPressure", "SkinThickness", "Insulin", "Age")],
  function(values) {
    values[is.na(values)] <- round(mean(values, na.rm = TRUE), digits = 0)
    values
  }
)
# Continuous measurements: fill with the unrounded column mean.
df[c("BMI", "DiabetesPedigreeFunction")] <- lapply(
  df[c("BMI", "DiabetesPedigreeFunction")],
  function(values) {
    values[is.na(values)] <- mean(values, na.rm = TRUE)
    values
  }
)
# Re-run the outlier check on the imputed data.
lapply(df, detect_outliers)
## $Pregnancies
## numeric(0)
##
## $Glucose
## numeric(0)
##
## $BloodPressure
## [1] 38 106 106 106
##
## $SkinThickness
## numeric(0)
##
## $Insulin
## [1] 300 304 284 285 318 280 278 293 285 310 277 293 291
##
## $BMI
## [1] 49.7 50.0 49.6
##
## $DiabetesPedigreeFunction
## [1] 1.114 1.189 1.101 1.136 1.127 1.191 1.095 1.138 1.159 1.144 1.154
## [12] 1.162 1.174 1.096 1.182
##
## $Age
## [1] 65 66 65 65 66 66 66
##
## $Outcome
## numeric(0)
# Visual overview of the pairwise correlations between all columns.
corrgram(df)
# Fix the RNG seed so the train/test partition is reproducible.
set.seed(1)
# Put 75% of the rows in the training set; with list = FALSE the selected row
# indices come back in a form that can index df directly.
split <- createDataPartition(df$Outcome, p = 0.75, list = FALSE)
dfTrn <- df[split, ]
dfTst <- df[-split, ]
# Stepwise selection starting from the model with all predictors. It is fitted
# on the training rows only, so the held-out rows cannot influence which
# predictors are chosen and the later test-set evaluation stays unbiased.
stpModel <- step(
  glm(Outcome ~ ., family = binomial, data = dfTrn),
  trace = 0,
  steps = 100
)
summary(stpModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction +
## Age, family = binomial, data = df)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.5654 -0.7181 -0.3956 0.7227 2.4595
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.691031 0.760722 -12.739 < 2e-16 ***
## Pregnancies 0.094680 0.032864 2.881 0.00396 **
## Glucose 0.036203 0.003526 10.267 < 2e-16 ***
## BMI 0.090531 0.015560 5.818 5.94e-09 ***
## DiabetesPedigreeFunction 1.125394 0.371879 3.026 0.00248 **
## Age 0.020111 0.009814 2.049 0.04045 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 993.48 on 767 degrees of freedom
## Residual deviance: 720.08 on 762 degrees of freedom
## AIC: 732.08
##
## Number of Fisher Scoring iterations: 5
# Logistic regression (logit link) of Outcome on four predictors, fitted on
# the training partition only.
mgmModel <- glm(
  Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
  family = binomial(link = "logit"),
  data = dfTrn
)
# Coefficients, significance and deviance of the fitted model.
summary(mgmModel)
##
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
## family = binomial(link = "logit"), data = dfTrn)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.6606 -0.7169 -0.3817 0.6731 2.4596
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -9.711252 0.857336 -11.327 < 2e-16 ***
## Pregnancies 0.142634 0.033328 4.280 1.87e-05 ***
## Glucose 0.038492 0.004039 9.529 < 2e-16 ***
## BMI 0.095828 0.018016 5.319 1.04e-07 ***
## DiabetesPedigreeFunction 1.351958 0.438001 3.087 0.00202 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 746.35 on 575 degrees of freedom
## Residual deviance: 527.23 on 571 degrees of freedom
## AIC: 537.23
##
## Number of Fisher Scoring iterations: 5
# Predicted probabilities of Outcome == 1 for the held-out rows.
prdVal <- predict(mgmModel, newdata = dfTst, type = "response")
# Classify with a 0.5 probability cut-off.
prdBln <- ifelse(prdVal > 0.5, 1, 0)
# Pin both dimensions to the levels 0 and 1 so the table is always 2 x 2,
# even when one class is never predicted or never observed.
cnfmtrx <- table(
  prd = factor(prdBln, levels = c(0, 1)),
  act = factor(dfTst$Outcome, levels = c(0, 1))
)
# Treat Outcome == 1 (presumably "has diabetes" - confirm against the data
# dictionary) as the positive class; otherwise the first level, 0, is used and
# sensitivity/specificity describe the wrong class.
confusionMatrix(cnfmtrx, positive = "1")
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 102 27
## 1 24 39
##
## Accuracy : 0.7344
## 95% CI : (0.666, 0.7954)
## No Information Rate : 0.6562
## P-Value [Acc > NIR] : 0.01256
##
## Kappa : 0.4048
##
## Mcnemar's Test P-Value : 0.77943
##
## Sensitivity : 0.8095
## Specificity : 0.5909
## Pos Pred Value : 0.7907
## Neg Pred Value : 0.6190
## Prevalence : 0.6562
## Detection Rate : 0.5312
## Detection Prevalence : 0.6719
## Balanced Accuracy : 0.7002
##
## 'Positive' Class : 0
##
# Refit the chosen logistic regression on the complete dataset.
Model <- glm(
  Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
  family = binomial(link = "logit"),
  data = df
)
# Load the separate file of patients to score.
prd <- read.csv("D:/ISME/R class/data/niddkd-diabetes1.csv", stringsAsFactors = FALSE)
prdDf <- prd
# Predicted probabilities of Outcome == 1, classified with a 0.5 cut-off.
prdVal <- predict(Model, newdata = prd, type = "response")
prdBln <- ifelse(prdVal > 0.5, 1, 0)
# Pin both dimensions to the levels 0 and 1 so the table is always 2 x 2,
# even when one class is never predicted or never observed.
cnfmtrx <- table(
  prd = factor(prdBln, levels = c(0, 1)),
  act = factor(prdDf$Outcome, levels = c(0, 1))
)
# Treat Outcome == 1 (presumably "has diabetes" - confirm against the data
# dictionary) as the positive class; otherwise the first level, 0, is used and
# sensitivity/specificity describe the wrong class.
confusionMatrix(cnfmtrx, positive = "1")
## Confusion Matrix and Statistics
##
## act
## prd 0 1
## 0 6 2
## 1 0 1
##
## Accuracy : 0.7778
## 95% CI : (0.3999, 0.9719)
## No Information Rate : 0.6667
## P-Value [Acc > NIR] : 0.3772
##
## Kappa : 0.4
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 1.0000
## Specificity : 0.3333
## Pos Pred Value : 0.7500
## Neg Pred Value : 1.0000
## Prevalence : 0.6667
## Detection Rate : 0.6667
## Detection Prevalence : 0.8889
## Balanced Accuracy : 0.6667
##
## 'Positive' Class : 0
##