Data Exploration

dataset=read.csv("diabetes_imputed.csv",header=TRUE)

print(head(dataset))
##      id  dm   location gender  frame insurance fh smoking   X chol stab.glu hdl
## 1 20782  no Buckingham   male  small         1  0       2 324  180       92  34
## 2 15252 yes Buckingham   male  large         2  1       2 167  254      121  39
## 3 10016  no Buckingham female  large         0  0       2 129  228      115  61
## 4 20325  no     Louisa female medium         1  0       1 299  243       74  42
## 5 17841  no Buckingham female medium         2  0       2 270  206       90  38
## 6 15520  no     Louisa   male medium         0  0       2 187  174       90  36
##   ratio glyhb age height weight bp.1s bp.1d   bp.2s     bp.2d waist hip
## 1   5.3  3.59  63     69    169   145    72 142.000  70.00000    35  39
## 2   6.5  9.25  67     68    167   161   118 151.000 111.00000    36  39
## 3   3.7  6.39  71     63    244   170    92 152.383  92.52482    48  51
## 4   5.8  3.85  43     64    239   128    90 138.000  90.00000    48  53
## 5   5.4  4.07  38     69    167   138    90 152.383  92.52482    36  47
## 6   4.8  5.35  34     71    210   142    92 148.000  98.00000    37  43
##   time.ppn
## 1       30
## 2       60
## 3      660
## 4      330
## 5       90
## 6       90
#Rattle was used and data was imputed and changed to fill in the missing values

str(dataset) #to see types of these data
## 'data.frame':    282 obs. of  24 variables:
##  $ id       : int  20782 15252 10016 20325 17841 15520 20350 4758 20267 40773 ...
##  $ dm       : chr  "no" "yes" "no" "no" ...
##  $ location : chr  "Buckingham" "Buckingham" "Buckingham" "Louisa" ...
##  $ gender   : chr  "male" "male" "female" "female" ...
##  $ frame    : chr  "small" "large" "large" "medium" ...
##  $ insurance: int  1 2 0 1 2 0 2 1 2 2 ...
##  $ fh       : int  0 1 0 0 0 0 0 0 0 1 ...
##  $ smoking  : int  2 2 2 1 2 2 2 1 2 1 ...
##  $ X        : int  324 167 129 299 270 187 307 85 277 362 ...
##  $ chol     : num  180 254 228 243 206 174 164 185 191 212 ...
##  $ stab.glu : int  92 121 115 74 90 90 91 84 74 88 ...
##  $ hdl      : num  34 39 61 42 38 36 67 52 33 36 ...
##  $ ratio    : num  5.3 6.5 3.7 5.8 5.4 ...
##  $ glyhb    : num  3.59 9.25 6.39 3.85 4.07 ...
##  $ age      : int  63 67 71 43 38 34 20 53 40 37 ...
##  $ height   : num  69 68 63 64 69 71 70 61 72 64 ...
##  $ weight   : num  169 167 244 239 167 210 141 145 270 160 ...
##  $ bp.1s    : num  145 161 170 128 138 142 122 147 136 124 ...
##  $ bp.1d    : num  72 118 92 90 90 92 86 72 70 82 ...
##  $ bp.2s    : num  142 151 152 138 152 ...
##  $ bp.2d    : num  70 111 92.5 90 92.5 ...
##  $ waist    : num  35 36 48 48 36 37 32 37 45 37 ...
##  $ hip      : num  39 39 51 53 47 43 39 40 49 45 ...
##  $ time.ppn : num  30 60 660 330 90 90 390 420 150 15 ...
#Convert the datatype of qualitative variables into factor datatype

dataset$location = as.factor(dataset$location)
dataset$gender = as.factor(dataset$gender)
dataset$frame = as.factor(dataset$frame)
dataset$dm = as.factor(dataset$dm)
dataset$insurance = as.factor(dataset$insurance)
dataset$smoking = as.factor(dataset$smoking)
dataset$fh = as.factor(dataset$fh)

# can also use to convert all but here we have continuous too so its better to induvidually convert dataset=data.frame(lapply(dataset,as.factor))  

#Grouping age would be helpful in visualizing later
age=dataset$age
age_grouped = ifelse(age < 45, "under 45", 
            ifelse(age >= 45 & age < 65, "45 - 64",                   ifelse(age >= 65 & age < 75, "65 - 74",  ifelse(age >= 75, "75 or over", NA)))) 
#as done in the course we can also calculate BMI

# Divide dataset into training and validation dataset
set.seed(1)
trainrows = sample(row.names(dataset), nrow(dataset)*0.7)
traindataset = dataset[trainrows,]
Validrows = setdiff(row.names(dataset), trainrows)
Validdataset = dataset[Validrows,]
print(head(traindataset))
##        id  dm   location gender  frame insurance fh smoking   X chol stab.glu
## 167 12005  no     Louisa female  small         0  1       1 133  168      101
## 129  4500  no Buckingham female medium         1  0       2  77  242      108
## 270  2791 yes Buckingham female medium         0  0       2  68  213      203
## 187 40251  no     Louisa   male medium         2  0       2 351  150       80
## 85  21320  no     Louisa   male medium         2  0       1 336  216      155
## 277  4760 yes     Louisa female  large         1  0       2  87  218      182
##     hdl ratio glyhb age  height weight bp.1s bp.1d   bp.2s    bp.2d waist hip
## 167  59   2.8  5.09  44 64.0000    160   130    88 152.383 92.52482    40  43
## 129  53   4.6  5.47  46 62.0000    183   130    86 152.383 92.52482    37  45
## 270  75   2.8 11.41  71 63.0000    165   150    80 145.000 80.00000    34  42
## 187  38   3.9  3.97  35 73.0000    179   138    92 135.000 88.00000    32  37
## 85   30   7.2  5.91  38 68.0000    145   110    60 152.383 92.52482    34  37
## 277  54   4.0 10.55  51 66.0201    215   139    69 152.383 92.52482    42  53
##     time.ppn
## 167       60
## 129      180
## 270      960
## 187      450
## 85        20
## 277      720
print(head(Validdataset))
##       id dm   location gender  frame insurance fh smoking   X chol stab.glu hdl
## 3  10016 no Buckingham female  large         0  0       2 129  228      115  61
## 4  20325 no     Louisa female medium         1  0       1 299  243       74  42
## 6  15520 no     Louisa   male medium         0  0       2 187  174       90  36
## 7  20350 no     Louisa female  small         2  0       2 307  164       91  67
## 8   4758 no     Louisa female medium         1  0       1  85  185       84  52
## 10 40773 no     Louisa female  small         2  1       1 362  212       88  36
##    ratio glyhb age height weight bp.1s bp.1d   bp.2s    bp.2d waist hip
## 3    3.7  6.39  71     63    244   170    92 152.383 92.52482    48  51
## 4    5.8  3.85  43     64    239   128    90 138.000 90.00000    48  53
## 6    4.8  5.35  34     71    210   142    92 148.000 98.00000    37  43
## 7    2.4  3.97  20     70    141   122    86 152.383 92.52482    32  39
## 8    3.6  5.28  53     61    145   147    72 152.383 92.52482    37  40
## 10   5.9  5.22  37     64    160   124    82 152.383 92.52482    37  45
##    time.ppn
## 3       660
## 4       330
## 6        90
## 7       390
## 8       420
## 10       15

Data Exploration and visualization

#Data visualization

assocplot(table(dataset$dm,age_grouped),xlab = "prone to diabetes", ylab = "age group", col = c("green","red"))

boxplot(age ~ dm,data=dataset)

g= ggplot(dataset, aes(x= dm, y= age)) + geom_boxplot() 
ggplotly(g)
assocplot(table(dataset$dm,dataset$gender),xlab = "prone to diabetes", ylab = "gender", col = c("green","red"))

assocplot(table(dataset$dm,dataset$location),xlab = "prone to diabetes", ylab = "location", col = c("green","red"))

assocplot(table(dataset$dm,dataset$insurance),xlab = "prone to diabetes", ylab = "insurance(0=none,1=govt,2=private)", col = c("green","red"))

assocplot(table(dataset$dm,dataset$frame),xlab = "prone to diabetes", ylab = "frame", col = c("green","red"))

assocplot(table(dataset$dm,dataset$smoking),xlab = "prone to diabetes", ylab = "smoking", col = c("green","red"))

assocplot(table(dataset$dm,dataset$fh),xlab = "prone to diabetes", ylab = "family history (0=no, 1=yes)", col = c("green","red"))

g= ggplot(dataset, aes(x= dm, y= chol)) + geom_boxplot() 
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= stab.glu)) + geom_boxplot() 
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= hdl)) + geom_boxplot() 
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= ratio)) + geom_boxplot() 
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= glyhb)) + geom_boxplot() 
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= time.ppn)) + geom_boxplot() 
ggplotly(g)

Model Fitting

1.BLR

m = glm(dm~age+gender+chol+hdl+bp.1s+bp.1d+weight+height+location+frame+insurance+smoking,data=traindataset,family = "binomial")

summary(m) 
## 
## Call:
## glm(formula = dm ~ age + gender + chol + hdl + bp.1s + bp.1d + 
##     weight + height + location + frame + insurance + smoking, 
##     family = "binomial", data = traindataset)
## 
## Deviance Residuals: 
##     Min       1Q   Median       3Q      Max  
## -1.5325  -0.5426  -0.3289  -0.1708   2.8638  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)   
## (Intercept)    -13.649385   6.426155  -2.124  0.03367 * 
## age              0.063835   0.021085   3.028  0.00247 **
## gendermale      -0.944296   0.727032  -1.299  0.19400   
## chol             0.006568   0.005479   1.199  0.23059   
## hdl             -0.011127   0.014402  -0.773  0.43976   
## bp.1s            0.014793   0.013692   1.080  0.27994   
## bp.1d            0.006475   0.023291   0.278  0.78101   
## weight           0.003373   0.007754   0.435  0.66351   
## height           0.091367   0.092115   0.992  0.32125   
## locationLouisa   0.216891   0.476877   0.455  0.64924   
## framemedium     -0.638790   0.573199  -1.114  0.26509   
## framesmall      -0.544748   0.817559  -0.666  0.50521   
## insurance1      -0.799112   0.587902  -1.359  0.17406   
## insurance2      -0.661964   0.561805  -1.178  0.23869   
## smoking2        -0.355577   0.552521  -0.644  0.51986   
## smoking3        -0.895669   0.830791  -1.078  0.28099   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 168.10  on 196  degrees of freedom
## Residual deviance: 130.61  on 181  degrees of freedom
## AIC: 162.61
## 
## Number of Fisher Scoring iterations: 6
confint(m)
## Waiting for profiling to be done...
##                        2.5 %      97.5 %
## (Intercept)    -27.087976132 -1.77075876
## age              0.024099054  0.10743089
## gendermale      -2.437681236  0.43180036
## chol            -0.004308266  0.01740155
## hdl             -0.040196116  0.01687274
## bp.1s           -0.012052360  0.04217983
## bp.1d           -0.040011049  0.05215005
## weight          -0.012300799  0.01851679
## height          -0.081149397  0.28088390
## locationLouisa  -0.709258326  1.17513108
## framemedium     -1.771159664  0.49301716
## framesmall      -2.197564279  1.04015724
## insurance1      -1.991812022  0.34000281
## insurance2      -1.790693544  0.43277951
## smoking2        -1.435334073  0.75466712
## smoking3        -2.636100985  0.66970170
#full data included in model
model = glm(dm~., data=traindataset,family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model)
## 
## Call:  glm(formula = dm ~ ., family = "binomial", data = traindataset)
## 
## Coefficients:
##    (Intercept)              id  locationLouisa      gendermale     framemedium  
##     -2.397e+02      -2.361e-04      -6.058e+00      -9.029e+00      -6.722e+00  
##     framesmall      insurance1      insurance2             fh1        smoking2  
##     -4.828e+00      -1.154e+01      -8.058e+00      -1.129e+01       8.381e+00  
##       smoking3               X            chol        stab.glu             hdl  
##     -1.635e+01       1.029e-01       5.252e-02       1.003e-01      -1.942e-01  
##          ratio           glyhb             age          height          weight  
##      2.670e-01       2.183e+01       1.096e-01       1.164e+00      -6.525e-03  
##          bp.1s           bp.1d           bp.2s           bp.2d           waist  
##     -2.145e-01       7.178e-01       3.928e-01      -8.166e-01      -7.662e-01  
##            hip        time.ppn  
##      1.483e-01      -1.089e-02  
## 
## Degrees of Freedom: 196 Total (i.e. Null);  170 Residual
## Null Deviance:       168.1 
## Residual Deviance: 5.968e-09     AIC: 54
print(summary(model))
## 
## Call:
## glm(formula = dm ~ ., family = "binomial", data = traindataset)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.437e-05  -2.110e-08  -2.110e-08  -2.110e-08   2.537e-05  
## 
## Coefficients:
##                  Estimate Std. Error z value Pr(>|z|)
## (Intercept)    -2.397e+02  1.380e+06   0.000        1
## id             -2.361e-04  1.020e+01   0.000        1
## locationLouisa -6.058e+00  5.807e+04   0.000        1
## gendermale     -9.029e+00  1.389e+05   0.000        1
## framemedium    -6.722e+00  8.774e+04   0.000        1
## framesmall     -4.828e+00  2.011e+05   0.000        1
## insurance1     -1.154e+01  9.386e+04   0.000        1
## insurance2     -8.058e+00  1.423e+05   0.000        1
## fh1            -1.129e+01  1.928e+05   0.000        1
## smoking2        8.381e+00  1.285e+05   0.000        1
## smoking3       -1.635e+01  1.622e+05   0.000        1
## X               1.029e-01  1.139e+03   0.000        1
## chol            5.252e-02  1.148e+03   0.000        1
## stab.glu        1.003e-01  1.278e+03   0.000        1
## hdl            -1.942e-01  5.710e+03   0.000        1
## ratio           2.670e-01  5.063e+04   0.000        1
## glyhb           2.183e+01  4.167e+04   0.001        1
## age             1.096e-01  3.418e+03   0.000        1
## height          1.164e+00  1.327e+04   0.000        1
## weight         -6.525e-03  2.232e+03   0.000        1
## bp.1s          -2.145e-01  3.923e+03   0.000        1
## bp.1d           7.178e-01  4.824e+03   0.000        1
## bp.2s           3.928e-01  3.310e+03   0.000        1
## bp.2d          -8.166e-01  5.528e+03   0.000        1
## waist          -7.662e-01  1.325e+04   0.000        1
## hip             1.483e-01  2.392e+04   0.000        1
## time.ppn       -1.089e-02  1.284e+02   0.000        1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.6810e+02  on 196  degrees of freedom
## Residual deviance: 5.9683e-09  on 170  degrees of freedom
## AIC: 54
## 
## Number of Fisher Scoring iterations: 25

Model Fitting C50

modelC5 = C5.0(dm~.,data=traindataset)
print(summary(modelC5))
## 
## Call:
## C5.0.formula(formula = dm ~ ., data = traindataset)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sat Sep 05 16:31:38 2020
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 197 cases (24 attributes) from undefined.data
## 
## Decision tree:
## 
## glyhb <= 6.97: no (167)
## glyhb > 6.97: yes (30)
## 
## 
## Evaluation on training data (197 cases):
## 
##      Decision Tree   
##    ----------------  
##    Size      Errors  
## 
##       2    0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     167          (a): class no
##            30    (b): class yes
## 
## 
##  Attribute usage:
## 
##  100.00% glyhb
## 
## 
## Time: 0.0 secs
plot(modelC5)

Model Prediction C50

pred = predict(modelC5,Validdataset)
print(head(pred))
## [1] no no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
##       id dm   location gender  frame insurance fh smoking   X chol stab.glu hdl
## 3  10016 no Buckingham female  large         0  0       2 129  228      115  61
## 4  20325 no     Louisa female medium         1  0       1 299  243       74  42
## 6  15520 no     Louisa   male medium         0  0       2 187  174       90  36
## 7  20350 no     Louisa female  small         2  0       2 307  164       91  67
## 8   4758 no     Louisa female medium         1  0       1  85  185       84  52
## 10 40773 no     Louisa female  small         2  1       1 362  212       88  36
##    ratio glyhb age height weight bp.1s bp.1d   bp.2s    bp.2d waist hip
## 3    3.7  6.39  71     63    244   170    92 152.383 92.52482    48  51
## 4    5.8  3.85  43     64    239   128    90 138.000 90.00000    48  53
## 6    4.8  5.35  34     71    210   142    92 148.000 98.00000    37  43
## 7    2.4  3.97  20     70    141   122    86 152.383 92.52482    32  39
## 8    3.6  5.28  53     61    145   147    72 152.383 92.52482    37  40
## 10   5.9  5.22  37     64    160   124    82 152.383 92.52482    37  45
##    time.ppn predictdm
## 3       660        no
## 4       330        no
## 6        90        no
## 7       390        no
## 8       420        no
## 10       15        no

Model Evaluation C50

confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  75   0
##        yes  0  10
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9575, 1)
##     No Information Rate : 0.8824     
##     P-Value [Acc > NIR] : 2.397e-05  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8824     
##          Detection Rate : 0.8824     
##    Detection Prevalence : 0.8824     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : no         
## 

Model Fitting CTREE

modelCTREE = ctree(dm~.,data=traindataset)
plot(modelCTREE)

Model Prediction CTREE

pred = predict(modelCTREE,Validdataset)
print(head(pred))
## [1] no no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
##       id dm   location gender  frame insurance fh smoking   X chol stab.glu hdl
## 3  10016 no Buckingham female  large         0  0       2 129  228      115  61
## 4  20325 no     Louisa female medium         1  0       1 299  243       74  42
## 6  15520 no     Louisa   male medium         0  0       2 187  174       90  36
## 7  20350 no     Louisa female  small         2  0       2 307  164       91  67
## 8   4758 no     Louisa female medium         1  0       1  85  185       84  52
## 10 40773 no     Louisa female  small         2  1       1 362  212       88  36
##    ratio glyhb age height weight bp.1s bp.1d   bp.2s    bp.2d waist hip
## 3    3.7  6.39  71     63    244   170    92 152.383 92.52482    48  51
## 4    5.8  3.85  43     64    239   128    90 138.000 90.00000    48  53
## 6    4.8  5.35  34     71    210   142    92 148.000 98.00000    37  43
## 7    2.4  3.97  20     70    141   122    86 152.383 92.52482    32  39
## 8    3.6  5.28  53     61    145   147    72 152.383 92.52482    37  40
## 10   5.9  5.22  37     64    160   124    82 152.383 92.52482    37  45
##    time.ppn predictdm
## 3       660        no
## 4       330        no
## 6        90        no
## 7       390        no
## 8       420        no
## 10       15        no

Model Evaluation CTREE

confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction no yes
##        no  75   0
##        yes  0  10
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9575, 1)
##     No Information Rate : 0.8824     
##     P-Value [Acc > NIR] : 2.397e-05  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.8824     
##          Detection Rate : 0.8824     
##    Detection Prevalence : 0.8824     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : no         
## 

Model Fitting RPART

modelRpart = rpart(dm~.,data=traindataset)
rpart.plot(modelRpart)

Model Prediction RPART

pred = predict(modelRpart,Validdataset)
print(head(pred))
##    no yes
## 3   1   0
## 4   1   0
## 6   1   0
## 7   1   0
## 8   1   0
## 10  1   0
Validdataset$predictdm = pred
print(head(Validdataset))
##       id dm   location gender  frame insurance fh smoking   X chol stab.glu hdl
## 3  10016 no Buckingham female  large         0  0       2 129  228      115  61
## 4  20325 no     Louisa female medium         1  0       1 299  243       74  42
## 6  15520 no     Louisa   male medium         0  0       2 187  174       90  36
## 7  20350 no     Louisa female  small         2  0       2 307  164       91  67
## 8   4758 no     Louisa female medium         1  0       1  85  185       84  52
## 10 40773 no     Louisa female  small         2  1       1 362  212       88  36
##    ratio glyhb age height weight bp.1s bp.1d   bp.2s    bp.2d waist hip
## 3    3.7  6.39  71     63    244   170    92 152.383 92.52482    48  51
## 4    5.8  3.85  43     64    239   128    90 138.000 90.00000    48  53
## 6    4.8  5.35  34     71    210   142    92 148.000 98.00000    37  43
## 7    2.4  3.97  20     70    141   122    86 152.383 92.52482    32  39
## 8    3.6  5.28  53     61    145   147    72 152.383 92.52482    37  40
## 10   5.9  5.22  37     64    160   124    82 152.383 92.52482    37  45
##    time.ppn predictdm.no predictdm.yes
## 3       660            1             0
## 4       330            1             0
## 6        90            1             0
## 7       390            1             0
## 8       420            1             0
## 10       15            1             0