Problem Definition
Study the given dataset. Given the features of the dataset, decide the best possible features to use for predicting if a patient has Diabetes.

Dataset
Use niddkd-diabetes.csv. Refer to niddkd-diabetes.txt for the data dictionary. Use the niddkd-diabetes.prd dataset to predict if a patient has Diabetes.

library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(DMwR)
## Warning: package 'DMwR' was built under R version 3.5.3
## Loading required package: lattice
## Loading required package: grid
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 3.5.3
library(corrgram)
## Warning: package 'corrgram' was built under R version 3.5.3
## 
## Attaching package: 'corrgram'
## The following object is masked from 'package:lattice':
## 
##     panel.fill
library(caret)
## Warning: package 'caret' was built under R version 3.5.3
# Load the diabetes dataset into `df`.
# `FALSE` is spelled out because `F` is an ordinary variable that can be
# reassigned, whereas `FALSE` is a reserved constant.
df <- read.csv(
  "D:/ISME/R class/data/niddkd-diabetes.csv",
  stringsAsFactors = FALSE
)
# Preview the first rows of the loaded data.
head(df)
##   Pregnancies Glucose BloodPressure SkinThickness Insulin  BMI
## 1           6     148            72            35       0 33.6
## 2           1      85            66            29       0 26.6
## 3           8     183            64             0       0 23.3
## 4           1      89            66            23      94 28.1
## 5           0     137            40            35     168 43.1
## 6           5     116            74             0       0 25.6
##   DiabetesPedigreeFunction Age Outcome
## 1                    0.627  50       1
## 2                    0.351  31       0
## 3                    0.672  32       1
## 4                    0.167  21       0
## 5                    2.288  33       1
## 6                    0.201  30       0
# Show the compact structure of `df`: dimensions, column types, first values.
str(df)
## 'data.frame':    768 obs. of  9 variables:
##  $ Pregnancies             : int  6 1 8 1 0 5 3 10 2 8 ...
##  $ Glucose                 : int  148 85 183 89 137 116 78 115 197 125 ...
##  $ BloodPressure           : int  72 66 64 66 40 74 50 0 70 96 ...
##  $ SkinThickness           : int  35 29 0 23 35 0 32 0 45 0 ...
##  $ Insulin                 : int  0 0 0 94 168 0 88 0 543 0 ...
##  $ BMI                     : num  33.6 26.6 23.3 28.1 43.1 25.6 31 35.3 30.5 0 ...
##  $ DiabetesPedigreeFunction: num  0.627 0.351 0.672 0.167 2.288 ...
##  $ Age                     : int  50 31 32 21 33 30 26 29 53 54 ...
##  $ Outcome                 : int  1 0 1 0 1 0 1 0 1 1 ...
# Per-column summary statistics (min, quartiles, mean, max).
summary(df)
##   Pregnancies        Glucose      BloodPressure    SkinThickness  
##  Min.   : 0.000   Min.   :  0.0   Min.   :  0.00   Min.   : 0.00  
##  1st Qu.: 1.000   1st Qu.: 99.0   1st Qu.: 62.00   1st Qu.: 0.00  
##  Median : 3.000   Median :117.0   Median : 72.00   Median :23.00  
##  Mean   : 3.845   Mean   :120.9   Mean   : 69.11   Mean   :20.54  
##  3rd Qu.: 6.000   3rd Qu.:140.2   3rd Qu.: 80.00   3rd Qu.:32.00  
##  Max.   :17.000   Max.   :199.0   Max.   :122.00   Max.   :99.00  
##     Insulin           BMI        DiabetesPedigreeFunction      Age       
##  Min.   :  0.0   Min.   : 0.00   Min.   :0.0780           Min.   :21.00  
##  1st Qu.:  0.0   1st Qu.:27.30   1st Qu.:0.2437           1st Qu.:24.00  
##  Median : 30.5   Median :32.00   Median :0.3725           Median :29.00  
##  Mean   : 79.8   Mean   :31.99   Mean   :0.4719           Mean   :33.24  
##  3rd Qu.:127.2   3rd Qu.:36.60   3rd Qu.:0.6262           3rd Qu.:41.00  
##  Max.   :846.0   Max.   :67.10   Max.   :2.4200           Max.   :81.00  
##     Outcome     
##  Min.   :0.000  
##  1st Qu.:0.000  
##  Median :0.000  
##  Mean   :0.349  
##  3rd Qu.:1.000  
##  Max.   :1.000
# Count the missing values in a vector.
#   inp: the vector (e.g. a data-frame column) to inspect.
# Returns the number of NA entries in `inp` as an integer.
detect_na <- function(inp) {
  missing_idx <- which(is.na(inp))
  length(missing_idx)
}
# Detect outliers with the 1.5 * IQR rule.
#   inp:   numeric vector to inspect.
#   na.rm: passed to quantile() and IQR(); when TRUE (default) missing values
#          are ignored while computing the quartiles and the IQR.
# Returns the elements of `inp`, in their original order, that lie below
# Q1 - 1.5 * IQR or above Q3 + 1.5 * IQR. Missing values already present in
# `inp` are never reported as outliers.
detect_outliers <- function(inp, na.rm = TRUE) {
  quartiles <- quantile(
    inp,
    probs = c(0.25, 0.75),
    na.rm = na.rm,
    names = FALSE
  )
  fence <- 1.5 * IQR(inp, na.rm = na.rm)

  is_outlier <- inp < (quartiles[1] - fence) | inp > (quartiles[2] + fence)

  # which() drops the NA comparisons produced by missing inputs, so only
  # genuinely out-of-range values are returned.
  inp[which(is_outlier)]
}
# List the 1.5 * IQR outliers found in each column of the raw data.
lapply(X = df, FUN = detect_outliers)
## $Pregnancies
## [1] 15 17 14 14
## 
## $Glucose
## [1] 0 0 0 0 0
## 
## $BloodPressure
##  [1]   0   0  30 110   0   0   0   0 108 122  30   0 110   0   0   0   0
## [18]   0   0   0   0   0   0 108   0   0   0   0   0   0   0   0   0   0
## [35] 110   0  24   0   0   0   0 114   0   0   0
## 
## $SkinThickness
## [1] 99
## 
## $Insulin
##  [1] 543 846 342 495 325 485 495 478 744 370 680 402 375 545 360 325 465
## [18] 325 415 579 474 328 480 326 330 600 321 440 540 480 335 387 392 510
## 
## $BMI
##  [1]  0.0  0.0  0.0  0.0 53.2 55.0  0.0 67.1 52.3 52.3 52.9  0.0  0.0 59.4
## [15]  0.0  0.0 57.3  0.0  0.0
## 
## $DiabetesPedigreeFunction
##  [1] 2.288 1.441 1.390 1.893 1.781 1.222 1.400 1.321 1.224 2.329 1.318
## [12] 1.213 1.353 1.224 1.391 1.476 2.137 1.731 1.268 1.600 2.420 1.251
## [23] 1.699 1.258 1.282 1.698 1.461 1.292 1.394
## 
## $Age
## [1] 69 67 72 81 67 67 70 68 69
## 
## $Outcome
## integer(0)
# Blank out outliers using the 1.5 * IQR rule.
#   inp:   numeric vector to clean.
#   na.rm: passed to quantile() and IQR() when computing the fences.
# Returns `inp` with every value below Q1 - 1.5 * IQR or above
# Q3 + 1.5 * IQR replaced by NA; all other elements are left untouched.
Replace_outliers <- function(inp, na.rm = TRUE) {
  quartiles <- quantile(inp, probs = c(0.25, 0.75), na.rm = na.rm)
  fence <- 1.5 * IQR(inp, na.rm = na.rm)
  lower_bound <- quartiles[1] - fence
  upper_bound <- quartiles[2] + fence

  out_of_range <- which(inp < lower_bound | inp > upper_bound)
  inp[out_of_range] <- NA
  inp
}

# Replace the outliers in every column with NA.
# vapply() pins the result to one numeric vector of nrow(df) values per
# column, so the outcome is always a numeric matrix (sapply()'s return shape
# depends on its input). The matrix is then turned back into a data frame.
df <- vapply(df, Replace_outliers, FUN.VALUE = numeric(nrow(df)))
df <- data.frame(df, stringsAsFactors = FALSE)
# Count the NAs (i.e. removed outliers) per column.
lapply(df, detect_na)
## $Pregnancies
## [1] 4
## 
## $Glucose
## [1] 5
## 
## $BloodPressure
## [1] 45
## 
## $SkinThickness
## [1] 1
## 
## $Insulin
## [1] 34
## 
## $BMI
## [1] 19
## 
## $DiabetesPedigreeFunction
## [1] 29
## 
## $Age
## [1] 9
## 
## $Outcome
## [1] 0
# Fill the NAs left by outlier removal with the mean of the remaining values.
# Whole-number measurements use the mean rounded to 0 digits.
df$Pregnancies[is.na(df$Pregnancies)] <-
  round(mean(df$Pregnancies, na.rm = TRUE), digits = 0)
df$Glucose[is.na(df$Glucose)] <-
  round(mean(df$Glucose, na.rm = TRUE), digits = 0)
df$BloodPressure[is.na(df$BloodPressure)] <-
  round(mean(df$BloodPressure, na.rm = TRUE), digits = 0)
df$SkinThickness[is.na(df$SkinThickness)] <-
  round(mean(df$SkinThickness, na.rm = TRUE), digits = 0)
df$Insulin[is.na(df$Insulin)] <-
  round(mean(df$Insulin, na.rm = TRUE), digits = 0)

# Fractional measurements keep the unrounded mean.
df$BMI[is.na(df$BMI)] <-
  mean(df$BMI, na.rm = TRUE)
df$DiabetesPedigreeFunction[is.na(df$DiabetesPedigreeFunction)] <-
  mean(df$DiabetesPedigreeFunction, na.rm = TRUE)

df$Age[is.na(df$Age)] <-
  round(mean(df$Age, na.rm = TRUE), digits = 0)

# Re-run outlier detection on the imputed data.
lapply(X = df, FUN = detect_outliers)
## $Pregnancies
## numeric(0)
## 
## $Glucose
## numeric(0)
## 
## $BloodPressure
## [1]  38 106 106 106
## 
## $SkinThickness
## numeric(0)
## 
## $Insulin
##  [1] 300 304 284 285 318 280 278 293 285 310 277 293 291
## 
## $BMI
## [1] 49.7 50.0 49.6
## 
## $DiabetesPedigreeFunction
##  [1] 1.114 1.189 1.101 1.136 1.127 1.191 1.095 1.138 1.159 1.144 1.154
## [12] 1.162 1.174 1.096 1.182
## 
## $Age
## [1] 65 66 65 65 66 66 66
## 
## $Outcome
## numeric(0)
# Plot a correlogram of the columns of the cleaned data frame.
corrgram(df)

# Reproducible 75/25 train/test split on the outcome column.
set.seed(1)
split <- createDataPartition(y = df$Outcome, p = 0.75, list = FALSE)
dfTrn <- df[split, ]
dfTst <- df[-split, ]

# Stepwise selection starting from the logistic model with all predictors.
# NOTE(review): the model passed to step() is fitted on the full `df`, not on
# `dfTrn`, so feature selection also sees the held-out rows - confirm whether
# that is intended.
stpModel <- step(
  glm(formula = Outcome ~ ., family = binomial, data = df),
  trace = 0,
  steps = 100
)
summary(stpModel)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction + 
##     Age, family = binomial, data = df)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.5654  -0.7181  -0.3956   0.7227   2.4595  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.691031   0.760722 -12.739  < 2e-16 ***
## Pregnancies               0.094680   0.032864   2.881  0.00396 ** 
## Glucose                   0.036203   0.003526  10.267  < 2e-16 ***
## BMI                       0.090531   0.015560   5.818 5.94e-09 ***
## DiabetesPedigreeFunction  1.125394   0.371879   3.026  0.00248 ** 
## Age                       0.020111   0.009814   2.049  0.04045 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 993.48  on 767  degrees of freedom
## Residual deviance: 720.08  on 762  degrees of freedom
## AIC: 732.08
## 
## Number of Fisher Scoring iterations: 5
# Logistic regression on the training split using four predictors:
# Pregnancies, Glucose, BMI and DiabetesPedigreeFunction.
mgmModel <- glm(
  formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
  family = binomial(link = "logit"),
  data = dfTrn
)
summary(mgmModel)
## 
## Call:
## glm(formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction, 
##     family = binomial(link = "logit"), data = dfTrn)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -2.6606  -0.7169  -0.3817   0.6731   2.4596  
## 
## Coefficients:
##                           Estimate Std. Error z value Pr(>|z|)    
## (Intercept)              -9.711252   0.857336 -11.327  < 2e-16 ***
## Pregnancies               0.142634   0.033328   4.280 1.87e-05 ***
## Glucose                   0.038492   0.004039   9.529  < 2e-16 ***
## BMI                       0.095828   0.018016   5.319 1.04e-07 ***
## DiabetesPedigreeFunction  1.351958   0.438001   3.087  0.00202 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 746.35  on 575  degrees of freedom
## Residual deviance: 527.23  on 571  degrees of freedom
## AIC: 537.23
## 
## Number of Fisher Scoring iterations: 5
# Score the held-out split: predicted probabilities, then a 0.5 cut-off.
prdVal <- predict(mgmModel, newdata = dfTst, type = "response")
prdBln <- ifelse(prdVal > 0.5, 1, 0)

# Cross-tabulate predictions (rows) against actual outcomes (columns).
# NOTE(review): the printed statistics report 'Positive' Class : 0, so
# Sensitivity/Specificity describe the Outcome = 0 class; pass
# positive = "1" to confusionMatrix() if Outcome = 1 should be the positive
# class.
cnfmtrx <- table(prd = prdBln, act = dfTst$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd   0   1
##   0 102  27
##   1  24  39
##                                          
##                Accuracy : 0.7344         
##                  95% CI : (0.666, 0.7954)
##     No Information Rate : 0.6562         
##     P-Value [Acc > NIR] : 0.01256        
##                                          
##                   Kappa : 0.4048         
##                                          
##  Mcnemar's Test P-Value : 0.77943        
##                                          
##             Sensitivity : 0.8095         
##             Specificity : 0.5909         
##          Pos Pred Value : 0.7907         
##          Neg Pred Value : 0.6190         
##              Prevalence : 0.6562         
##          Detection Rate : 0.5312         
##    Detection Prevalence : 0.6719         
##       Balanced Accuracy : 0.7002         
##                                          
##        'Positive' Class : 0              
## 
# Refit the four-predictor logistic model on the full cleaned dataset.
Model <- glm(
  formula = Outcome ~ Pregnancies + Glucose + BMI + DiabetesPedigreeFunction,
  family = binomial(link = "logit"),
  data = df
)

# Load the prediction dataset and keep a copy for the comparison below.
# NOTE(review): these rows are scored as read; they do not go through the
# outlier-replacement / imputation steps applied to `df` - confirm intended.
prd <- read.csv(
  "D:/ISME/R class/data/niddkd-diabetes1.csv",
  stringsAsFactors = FALSE
)
prdDf <- prd

# Predicted probabilities, thresholded at 0.5, compared with actual outcomes.
# NOTE(review): the printed statistics report 'Positive' Class : 0; pass
# positive = "1" to confusionMatrix() if Outcome = 1 should be the positive
# class.
prdVal <- predict(Model, newdata = prd, type = "response")
prdBln <- ifelse(prdVal > 0.5, 1, 0)
cnfmtrx <- table(prd = prdBln, act = prdDf$Outcome)
confusionMatrix(cnfmtrx)
## Confusion Matrix and Statistics
## 
##    act
## prd 0 1
##   0 6 2
##   1 0 1
##                                           
##                Accuracy : 0.7778          
##                  95% CI : (0.3999, 0.9719)
##     No Information Rate : 0.6667          
##     P-Value [Acc > NIR] : 0.3772          
##                                           
##                   Kappa : 0.4             
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 1.0000          
##             Specificity : 0.3333          
##          Pos Pred Value : 0.7500          
##          Neg Pred Value : 1.0000          
##              Prevalence : 0.6667          
##          Detection Rate : 0.6667          
##    Detection Prevalence : 0.8889          
##       Balanced Accuracy : 0.6667          
##                                           
##        'Positive' Class : 0               
##