DIabetes_Final

Data Exploration

# Load the diabetes data; the first row of diab.csv supplies the column names.
dataset <- read.csv("diab.csv", header = TRUE)
# print(dataset)
# Preview the first rows to check how the columns were parsed.
print(head(dataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 1       82 Buckingham  46 female medium         1  0       3  203  56   3.6
## 2       97 Buckingham  29 female  large         0  0       2  165  24   6.9
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
##   glyhb height weight bp.1s bp.1d   bp.2s    bp.2d waist hip time.ppn  dm
## 1  4.31     62    121   118    59 152.383 92.52482    29  38      720  no
## 2  4.44     64    218   112    68 152.383 92.52482    46  48      360  no
## 3  4.64     61    256   190    92 185.000 92.00000    49  57      180  no
## 4  4.63     67    119   110    50 152.383 92.52482    33  38      480  no
## 5  7.72     68    183   138    80 152.383 92.52482    44  41      300 yes
## 6  4.81     71    190   132    86 152.383 92.52482    36  42      195  no

# building contingency table

# table(dataset$quit,dataset$promotion_last_5years)
# table(dataset$quit,dataset$department)
# table(dataset$quit,dataset$salary)

Data reshuffling

# Shuffle the rows: draw one uniform random key per row and sort by that key.
# The seed is fixed so the resulting order is reproducible.
# NOTE(review): the next section re-reads diab.csv into `dataset`, which
# discards this shuffled order - confirm whether the shuffle is still needed.
set.seed(1)
index <- runif(nrow(dataset))
dataset <- dataset[order(index), ]
print(head(dataset))

##     stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 116       80 Buckingham  63   male medium         2  0       3  194  34   5.7
## 27        87 Buckingham  37   male  large         1  1       2  232  30   7.7
## 47       193 Buckingham  54 female medium         1  0       1  148  14  10.6
## 281       98     Louisa  78 female  large         2  0       1  224  44   5.1
## 133      101     Louisa  44 female  small         0  1       1  168  59   2.8
## 228       82 Buckingham  51 female  small         0  0       1  222  87   2.6
##     glyhb height weight bp.1s bp.1d   bp.2s    bp.2d waist hip time.ppn dm
## 116  4.61     73    175   131    88 152.383 92.52482    34  39       30 no
## 27   5.10     68    252   140    95 152.383 92.52482    43  47      420 no
## 47   6.14     67    165   140    65 152.383 92.52482    42  42      150 no
## 281  5.05     63    160   150    81 152.383 92.52482    36  45      300 no
## 133  5.09     64    160   130    88 152.383 92.52482    40  43       60 no
## 228  4.64     66    110   150   110 150.000 90.00000    28  37      270 no

Data visualization

Here we are analyzing the Diabetes dataset.

# Re-read the raw file so the plots below use the data in its original order.
dataset <- read.csv("diab.csv", header = TRUE)
print(head(dataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 1       82 Buckingham  46 female medium         1  0       3  203  56   3.6
## 2       97 Buckingham  29 female  large         0  0       2  165  24   6.9
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
##   glyhb height weight bp.1s bp.1d   bp.2s    bp.2d waist hip time.ppn  dm
## 1  4.31     62    121   118    59 152.383 92.52482    29  38      720  no
## 2  4.44     64    218   112    68 152.383 92.52482    46  48      360  no
## 3  4.64     61    256   190    92 185.000 92.00000    49  57      180  no
## 4  4.63     67    119   110    50 152.383 92.52482    33  38      480  no
## 5  7.72     68    183   138    80 152.383 92.52482    44  41      300 yes
## 6  4.81     71    190   132    86 152.383 92.52482    36  42      195  no

# Association plot of each categorical predictor against diabetes status (dm).
for (predictor in c("gender", "location", "frame")) {
  assocplot(
    table(dataset[[predictor]], dataset$dm),
    xlab = predictor,
    ylab = "Diabetic",
    col = c("green", "red")
  )
}

# Box plots of every numeric measurement split by diabetes status (dm).
# Each plot is built with ggplot2 and then rendered through ggplotly().

# Glucose, age and lipid measurements
g <- ggplot(dataset, aes(x = dm, y = stab.glu)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = age)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = chol)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = hdl)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = ratio)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = glyhb)) +
  geom_boxplot()
ggplotly(g)

# Height and weight
g <- ggplot(dataset, aes(x = dm, y = height)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = weight)) +
  geom_boxplot()
ggplotly(g)

# Blood-pressure columns
g <- ggplot(dataset, aes(x = dm, y = bp.1s)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = bp.1d)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = bp.2s)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = bp.2d)) +
  geom_boxplot()
ggplotly(g)

# Waist, hip and time.ppn
g <- ggplot(dataset, aes(x = dm, y = waist)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = hip)) +
  geom_boxplot()
ggplotly(g)

g <- ggplot(dataset, aes(x = dm, y = time.ppn)) +
  geom_boxplot()
ggplotly(g)

Data Preparation

# Convert the qualitative variables (including the response dm) to factors.
categorical_cols <- c("dm", "location", "gender", "frame")
dataset[categorical_cols] <- lapply(dataset[categorical_cols], as.factor)
# The commented-out conversions below name columns that do not appear in the
# printed header of this dataset.
# dataset$promotion_last_5years = as.factor(dataset$promotion_last_5years)
# dataset$department = as.factor(dataset$department)
# dataset$salary = as.factor(dataset$salary)

# Divide the dataset into training (70%) and validation (remaining) sets.
# Rows are sampled by row name with a fixed seed so the split is reproducible.
set.seed(1)
n_train <- floor(nrow(dataset) * 0.7)
trainrows <- sample(row.names(dataset), n_train)
Validrows <- setdiff(row.names(dataset), trainrows)
traindataset <- dataset[trainrows, ]
Validdataset <- dataset[Validrows, ]
print(head(traindataset))

##     stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 324       92 Buckingham  63   male  small         1  0       2  180  34   5.3
## 167      121 Buckingham  67   male  large         2  1       2  254  39   6.5
## 129      115 Buckingham  71 female  large         0  0       2  228  61   3.7
## 299       74     Louisa  43 female medium         1  0       1  243  42   5.8
## 270       90 Buckingham  38 female medium         2  0       2  206  38   5.4
## 187       90     Louisa  34   male medium         0  0       2  174  36   4.8
##     glyhb height weight bp.1s bp.1d   bp.2s     bp.2d waist hip time.ppn  dm
## 324  3.59     69    169   145    72 142.000  70.00000    35  39       30  no
## 167  9.25     68    167   161   118 151.000 111.00000    36  39       60 yes
## 129  6.39     63    244   170    92 152.383  92.52482    48  51      660  no
## 299  3.85     64    239   128    90 138.000  90.00000    48  53      330  no
## 270  4.07     69    167   138    90 152.383  92.52482    36  47       90  no
## 187  5.35     71    210   142    92 148.000  98.00000    37  43       90  no

# Preview the validation rows (the rows not drawn into the training sample).
print(head(Validdataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
## 7       92 Buckingham  30   male medium         2  1       2  195  41   4.8
## 8       75 Buckingham  37   male medium         0  0       2  227  44   5.2
##   glyhb height weight    bp.1s     bp.1d   bp.2s     bp.2d waist hip time.ppn
## 3  4.64     61    256 190.0000  92.00000 185.000  92.00000    49  57      180
## 4  4.63     67    119 110.0000  50.00000 152.383  92.52482    33  38      480
## 5  7.72     68    183 138.0000  80.00000 152.383  92.52482    44  41      300
## 6  4.81     71    190 132.0000  86.00000 152.383  92.52482    36  42      195
## 7  4.84     69    191 161.0000 112.00000 161.000 112.00000    46  49      720
## 8  3.94     59    170 136.9045  83.32161 152.383  92.52482    34  39     1020
##    dm
## 3  no
## 4  no
## 5 yes
## 6  no
## 7  no
## 8  no

Model Fitting C50

# Fit a C5.0 decision tree: columns 1-21 are the predictors and column 22
# (dm, the last column in the printed header) is the class label.
# NOTE(review): the summary below shows a single split on glyhb with zero
# training errors, so dm appears to be fully determined by glyhb - confirm
# whether glyhb should be offered as a predictor (possible target leakage).
modelC5 = C5.0(x=traindataset[,1:21],y=traindataset[,22])
print(summary(modelC5))

## 
## Call:
## C5.0.default(x = traindataset[, 1:21], y = traindataset[, 22])
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Thu Sep 03 15:22:30 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 282 cases (22 attributes) from undefined.data
## 
## Decision tree:
## 
## glyhb <= 6.97: no (242)
## glyhb > 6.97: yes (40)
## 
## 
## Evaluation on training data (282 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     242          (a): class no
##            40    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% glyhb
## 
## 
## Time: 0.0 secs

# Draw the fitted C5.0 tree.
plot(modelC5)

Model Prediction C50

# Predict a class label (no / yes) for each validation row with the C5.0 tree.
pred <- predict(modelC5, Validdataset)
print(head(pred))

## [1] no  no  yes no  no  no 
## Levels: no yes

# Attach the C5.0 predictions next to the observed dm values for comparison.
Validdataset[["predictDm"]] <- pred
print(head(Validdataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
## 7       92 Buckingham  30   male medium         2  1       2  195  41   4.8
## 8       75 Buckingham  37   male medium         0  0       2  227  44   5.2
##   glyhb height weight    bp.1s     bp.1d   bp.2s     bp.2d waist hip time.ppn
## 3  4.64     61    256 190.0000  92.00000 185.000  92.00000    49  57      180
## 4  4.63     67    119 110.0000  50.00000 152.383  92.52482    33  38      480
## 5  7.72     68    183 138.0000  80.00000 152.383  92.52482    44  41      300
## 6  4.81     71    190 132.0000  86.00000 152.383  92.52482    36  42      195
## 7  4.84     69    191 161.0000 112.00000 161.000 112.00000    46  49      720
## 8  3.94     59    170 136.9045  83.32161 152.383  92.52482    34  39     1020
##    dm predictDm
## 3  no        no
## 4  no        no
## 5 yes       yes
## 6  no        no
## 7  no        no
## 8  no        no

Model Evaluation C50

# Cross-tabulate predictions (first argument) against the observed dm values
# (second argument) and report the accuracy statistics shown below.
confusionMatrix(Validdataset$predictDm, Validdataset$dm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  101   0
##        yes   0  20
##                                    
##                Accuracy : 1        
##                  95% CI : (0.97, 1)
##     No Information Rate : 0.8347   
##     P-Value [Acc > NIR] : 3.205e-10
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.8347   
##          Detection Rate : 0.8347   
##    Detection Prevalence : 0.8347   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : no       
##

Model Fitting RPART

# Fit an rpart classification tree of dm on all other training columns,
# then draw it.
modelRpart <- rpart(dm ~ ., data = traindataset)
rpart.plot(modelRpart)

Model Prediction RPART

# Predict on the validation rows; the result has one column per class
# (no / yes), wrapped in a data frame so the columns can be accessed by name.
rpart_scores <- predict(modelRpart, Validdataset)
pred <- data.frame(rpart_scores)
print(head(pred))

##   no yes
## 3  1   0
## 4  1   0
## 5  0   1
## 6  1   0
## 7  1   0
## 8  1   0

# Show the "yes" column of the rpart predictions for every validation row.
print(pred$yes)

##   [1] 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
##  [38] 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
##  [75] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1
## [112] 0 0 0 0 0 0 0 0 0 0

# Convert the rpart "yes" scores into class labels. Threshold at 0.5 instead
# of testing `== 1`: an exact equality test would label every row whose "yes"
# score is high but not exactly 1 as "no".
predictionNew <- ifelse(pred$yes > 0.5, "yes", "no")
# Store the labels as a factor with both classes declared, so the evaluation
# step still sees the levels "no" and "yes" when only one class is predicted.
Validdataset$predictDm <- factor(predictionNew, levels = c("no", "yes"))
print(head(Validdataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
## 7       92 Buckingham  30   male medium         2  1       2  195  41   4.8
## 8       75 Buckingham  37   male medium         0  0       2  227  44   5.2
##   glyhb height weight    bp.1s     bp.1d   bp.2s     bp.2d waist hip time.ppn
## 3  4.64     61    256 190.0000  92.00000 185.000  92.00000    49  57      180
## 4  4.63     67    119 110.0000  50.00000 152.383  92.52482    33  38      480
## 5  7.72     68    183 138.0000  80.00000 152.383  92.52482    44  41      300
## 6  4.81     71    190 132.0000  86.00000 152.383  92.52482    36  42      195
## 7  4.84     69    191 161.0000 112.00000 161.000 112.00000    46  49      720
## 8  3.94     59    170 136.9045  83.32161 152.383  92.52482    34  39     1020
##    dm predictDm
## 3  no        no
## 4  no        no
## 5 yes       yes
## 6  no        no
## 7  no        no
## 8  no        no

Model Evaluation RPART

# Make sure both the predicted and the observed labels are factors, then
# cross-tabulate predictions against the observed dm values.
label_cols <- c("predictDm", "dm")
Validdataset[label_cols] <- lapply(Validdataset[label_cols], as.factor)
confusionMatrix(
  data = Validdataset$predictDm,
  reference = Validdataset$dm
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  101   0
##        yes   0  20
##                                    
##                Accuracy : 1        
##                  95% CI : (0.97, 1)
##     No Information Rate : 0.8347   
##     P-Value [Acc > NIR] : 3.205e-10
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.8347   
##          Detection Rate : 0.8347   
##    Detection Prevalence : 0.8347   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : no       
##

Model Fitting CTREE

# Fit a conditional inference tree of dm on all other training columns,
# then draw it.
modelCTREE <- ctree(dm ~ ., data = traindataset)
plot(modelCTREE)

Model Prediction CTREE

# Predict a class label (no / yes) for each validation row with the ctree model.
pred <- predict(modelCTREE, Validdataset)
print(head(pred))

## [1] no  no  yes no  no  no 
## Levels: no yes

# Overwrite the stored predictions with the ctree labels and preview the rows.
Validdataset[["predictDm"]] <- pred
print(head(Validdataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
## 7       92 Buckingham  30   male medium         2  1       2  195  41   4.8
## 8       75 Buckingham  37   male medium         0  0       2  227  44   5.2
##   glyhb height weight    bp.1s     bp.1d   bp.2s     bp.2d waist hip time.ppn
## 3  4.64     61    256 190.0000  92.00000 185.000  92.00000    49  57      180
## 4  4.63     67    119 110.0000  50.00000 152.383  92.52482    33  38      480
## 5  7.72     68    183 138.0000  80.00000 152.383  92.52482    44  41      300
## 6  4.81     71    190 132.0000  86.00000 152.383  92.52482    36  42      195
## 7  4.84     69    191 161.0000 112.00000 161.000 112.00000    46  49      720
## 8  3.94     59    170 136.9045  83.32161 152.383  92.52482    34  39     1020
##    dm predictDm
## 3  no        no
## 4  no        no
## 5 yes       yes
## 6  no        no
## 7  no        no
## 8  no        no

Model Evaluation CTREE

# Cross-tabulate the ctree predictions (first argument) against the observed
# dm values (second argument).
confusionMatrix(Validdataset$predictDm, Validdataset$dm)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction  no yes
##        no  101   0
##        yes   0  20
##                                    
##                Accuracy : 1        
##                  95% CI : (0.97, 1)
##     No Information Rate : 0.8347   
##     P-Value [Acc > NIR] : 3.205e-10
##                                    
##                   Kappa : 1        
##                                    
##  Mcnemar's Test P-Value : NA       
##                                    
##             Sensitivity : 1.0000   
##             Specificity : 1.0000   
##          Pos Pred Value : 1.0000   
##          Neg Pred Value : 1.0000   
##              Prevalence : 0.8347   
##          Detection Rate : 0.8347   
##    Detection Prevalence : 0.8347   
##       Balanced Accuracy : 1.0000   
##                                    
##        'Positive' Class : no       
##

Data Preparation for Binary Logistic Regression

#creating dummy for dependent variable
# Build indicator (dummy) columns for the response. The `~ 0 + dm` formula
# omits the intercept, so each level of dm gets its own 0/1 column
# (dmno and dmyes in the output below).
dm_design <- model.matrix(~ 0 + dm, data = dataset)
df <- data.frame(dm_design)
print(head(df, 25))

##    dmno dmyes
## 1     1     0
## 2     1     0
## 3     1     0
## 4     1     0
## 5     0     1
## 6     1     0
## 7     1     0
## 8     1     0
## 9     1     0
## 10    1     0
## 11    1     0
## 12    1     0
## 13    1     0
## 14    1     0
## 15    1     0
## 16    1     0
## 17    1     0
## 18    1     0
## 19    1     0
## 20    0     1
## 21    1     0
## 22    1     0
## 23    0     1
## 24    1     0
## 25    1     0

# Replace the no/yes response with its 0/1 indicator (1 = "yes").
dataset[["dm"]] <- df[["dmyes"]]
print(head(dataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 1       82 Buckingham  46 female medium         1  0       3  203  56   3.6
## 2       97 Buckingham  29 female  large         0  0       2  165  24   6.9
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
##   glyhb height weight bp.1s bp.1d   bp.2s    bp.2d waist hip time.ppn dm
## 1  4.31     62    121   118    59 152.383 92.52482    29  38      720  0
## 2  4.44     64    218   112    68 152.383 92.52482    46  48      360  0
## 3  4.64     61    256   190    92 185.000 92.00000    49  57      180  0
## 4  4.63     67    119   110    50 152.383 92.52482    33  38      480  0
## 5  7.72     68    183   138    80 152.383 92.52482    44  41      300  1
## 6  4.81     71    190   132    86 152.383 92.52482    36  42      195  0

# Split the recoded data 60/40 into training and validation sets, sampling
# row names with a fixed seed so the split is reproducible.
set.seed(1)
n_train <- floor(nrow(dataset) * 0.6)
trainrows <- sample(row.names(dataset), n_train)
validrows <- setdiff(row.names(dataset), trainrows)
trainingdataset <- dataset[trainrows, ]
validationdataset <- dataset[validrows, ]
print("Training Dataset")

## [1] "Training Dataset"

# Preview the first training rows.
print(head(trainingdataset))

##     stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 324       92 Buckingham  63   male  small         1  0       2  180  34   5.3
## 167      121 Buckingham  67   male  large         2  1       2  254  39   6.5
## 129      115 Buckingham  71 female  large         0  0       2  228  61   3.7
## 299       74     Louisa  43 female medium         1  0       1  243  42   5.8
## 270       90 Buckingham  38 female medium         2  0       2  206  38   5.4
## 187       90     Louisa  34   male medium         0  0       2  174  36   4.8
##     glyhb height weight bp.1s bp.1d   bp.2s     bp.2d waist hip time.ppn dm
## 324  3.59     69    169   145    72 142.000  70.00000    35  39       30  0
## 167  9.25     68    167   161   118 151.000 111.00000    36  39       60  1
## 129  6.39     63    244   170    92 152.383  92.52482    48  51      660  0
## 299  3.85     64    239   128    90 138.000  90.00000    48  53      330  0
## 270  4.07     69    167   138    90 152.383  92.52482    36  47       90  0
## 187  5.35     71    210   142    92 148.000  98.00000    37  43       90  0

# Report how many rows ended up in the training set.
print(nrow(trainingdataset))

## [1] 241

# Label the validation-set output that follows.
print("Validation Dataset")

## [1] "Validation Dataset"

# Preview the first validation rows.
print(head(validationdataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 2       97 Buckingham  29 female  large         0  0       2  165  24   6.9
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
## 7       92 Buckingham  30   male medium         2  1       2  195  41   4.8
##   glyhb height weight bp.1s bp.1d   bp.2s     bp.2d waist hip time.ppn dm
## 2  4.44     64    218   112    68 152.383  92.52482    46  48      360  0
## 3  4.64     61    256   190    92 185.000  92.00000    49  57      180  0
## 4  4.63     67    119   110    50 152.383  92.52482    33  38      480  0
## 5  7.72     68    183   138    80 152.383  92.52482    44  41      300  1
## 6  4.81     71    190   132    86 152.383  92.52482    36  42      195  0
## 7  4.84     69    191   161   112 161.000 112.00000    46  49      720  0

# Report how many rows ended up in the validation set.
print(nrow(validationdataset))

## [1] 162

Model Fitting: Binary Logistic Regression

# Fit a logistic regression of the 0/1 response dm on every other column of
# the training data.
# NOTE(review): the warnings below (no convergence, fitted probabilities of
# 0 or 1) and the very large standard errors in the summary look like
# (quasi-)complete separation. The C5.0 tree earlier classified the training
# data perfectly from glyhb alone, so glyhb is the likely cause - confirm
# whether it should stay in the formula.
model = glm(dm~.,data = trainingdataset,family = "binomial")

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

# Print the coefficient table and deviance statistics of the fitted model.
print(summary(model))

## 
## Call:
## glm(formula = dm ~ ., family = "binomial", data = trainingdataset)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -3.206e-05  -2.100e-08  -2.100e-08  -2.100e-08   2.641e-05  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -5.709e+02  1.103e+06  -0.001    1.000
## stab.glu       -1.796e-02  4.938e+02   0.000    1.000
## locationLouisa  1.806e+01  4.946e+04   0.000    1.000
## age            -5.161e-02  1.450e+03   0.000    1.000
## gendermale     -2.292e+01  7.663e+04   0.000    1.000
## framemedium    -1.681e+01  5.359e+04   0.000    1.000
## framesmall     -1.983e+01  1.559e+05   0.000    1.000
## insurance       4.457e+00  2.867e+04   0.000    1.000
## fh             -2.528e+01  6.673e+04   0.000    1.000
## smoking         5.421e+00  4.605e+04   0.000    1.000
## chol           -9.104e-02  1.058e+03   0.000    1.000
## hdl             5.315e-01  3.611e+03   0.000    1.000
## ratio           8.373e+00  2.918e+04   0.000    1.000
## glyhb           2.917e+01  1.765e+04   0.002    0.999
## height          2.464e+00  1.400e+04   0.000    1.000
## weight         -1.294e-01  2.539e+03   0.000    1.000
## bp.1s          -1.994e-01  2.549e+03   0.000    1.000
## bp.1d          -2.186e-01  3.162e+03   0.000    1.000
## bp.2s           3.349e-01  2.584e+03   0.000    1.000
## bp.2d           1.363e+00  3.175e+03   0.000    1.000
## waist          -7.988e-01  9.234e+03   0.000    1.000
## hip             1.815e+00  1.422e+04   0.000    1.000
## time.ppn        6.914e-04  8.483e+01   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.9428e+02  on 234  degrees of freedom
## Residual deviance: 1.0123e-08  on 212  degrees of freedom
##   (6 observations deleted due to missingness)
## AIC: 46
## 
## Number of Fisher Scoring iterations: 25

# Plot dm against location on the training data, overlay a binomial GLM
# smoother without a confidence band, and render the plot through ggplotly().
plot4 <- ggplot(data = trainingdataset, aes(x = location, y = dm)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  )

ggplotly(plot4)

## `geom_smooth()` using formula 'y ~ x'

# Plot dm against gender on the training data, overlay a binomial GLM
# smoother without a confidence band, and render the plot through ggplotly().
plot5 <- ggplot(data = trainingdataset, aes(x = gender, y = dm)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    se = FALSE,
    method.args = list(family = "binomial")
  )

ggplotly(plot5)

## `geom_smooth()` using formula 'y ~ x'

Model Prediction : Apply the model to the validation dataset

# Score the validation rows; type = "response" returns values on the
# probability scale rather than the link (log-odds) scale.
probprediction <- predict(
  model,
  newdata = validationdataset,
  type = "response"
)

print(head(probprediction))

##            2            3            4            5            6            7 
## 2.220446e-16 2.220446e-16 2.220446e-16 9.996811e-01 2.220446e-16 2.220446e-16

# Turn the predicted probabilities into 0/1 labels: values strictly above
# the cut-off become 1, everything else 0.
cut_off <- 0.5

prediction <- ifelse(probprediction > cut_off, 1, 0)

# Keep the predicted label beside the observed dm for evaluation.
validationdataset[["predicteddm"]] <- prediction

print(head(validationdataset))

##   stab.glu   location age gender  frame insurance fh smoking chol hdl ratio
## 2       97 Buckingham  29 female  large         0  0       2  165  24   6.9
## 3       92 Buckingham  58 female  large         2  0       2  228  37   6.2
## 4       93 Buckingham  67   male  large         1  0       3   78  12   6.5
## 5       90 Buckingham  64   male medium         0  0       3  249  28   8.9
## 6       94 Buckingham  34   male  large         1  0       1  248  69   3.6
## 7       92 Buckingham  30   male medium         2  1       2  195  41   4.8
##   glyhb height weight bp.1s bp.1d   bp.2s     bp.2d waist hip time.ppn dm
## 2  4.44     64    218   112    68 152.383  92.52482    46  48      360  0
## 3  4.64     61    256   190    92 185.000  92.00000    49  57      180  0
## 4  4.63     67    119   110    50 152.383  92.52482    33  38      480  0
## 5  7.72     68    183   138    80 152.383  92.52482    44  41      300  1
## 6  4.81     71    190   132    86 152.383  92.52482    36  42      195  0
## 7  4.84     69    191   161   112 161.000 112.00000    46  49      720  0
##   predicteddm
## 2           0
## 3           0
## 4           0
## 5           1
## 6           0
## 7           0

Model Evaluation: Binary Logistic Regression

# Evaluate the thresholded predictions against the observed 0/1 outcome.
# Both factors get the explicit levels 0 and 1 so they always share the same
# level set, even if one class never occurs among the predictions (as.factor()
# would derive the levels from whatever values happen to be present).
# `positive = "1"` reports sensitivity etc. for the dm = 1 ("yes") class.
# NOTE(review): the matrix below totals 156 of the 162 validation rows, so
# rows with a missing prediction appear to be dropped - confirm this is
# intended.
confusionMatrix(
  factor(validationdataset$predicteddm, levels = c(0, 1)),
  factor(validationdataset$dm, levels = c(0, 1)),
  positive = "1"
)

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 132   2
##          1   0  22
##                                           
##                Accuracy : 0.9872          
##                  95% CI : (0.9545, 0.9984)
##     No Information Rate : 0.8462          
##     P-Value [Acc > NIR] : 2.063e-09       
##                                           
##                   Kappa : 0.949           
##                                           
##  Mcnemar's Test P-Value : 0.4795          
##                                           
##             Sensitivity : 0.9167          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9851          
##              Prevalence : 0.1538          
##          Detection Rate : 0.1410          
##    Detection Prevalence : 0.1410          
##       Balanced Accuracy : 0.9583          
##                                           
##        'Positive' Class : 1               
##

DIabetes_Final

Krishna Sharma

01/09/2020