Data Exploration
dataset=read.csv("diabetes_imputed.csv",header=TRUE)
print(head(dataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 1 20782 no Buckingham male small 1 0 2 324 180 92 34
## 2 15252 yes Buckingham male large 2 1 2 167 254 121 39
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 5 17841 no Buckingham female medium 2 0 2 270 206 90 38
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 1 5.3 3.59 63 69 169 145 72 142.000 70.00000 35 39
## 2 6.5 9.25 67 68 167 161 118 151.000 111.00000 36 39
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 5 5.4 4.07 38 69 167 138 90 152.383 92.52482 36 47
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## time.ppn
## 1 30
## 2 60
## 3 660
## 4 330
## 5 90
## 6 90
#Rattle was used and data was imputed and changed to fill in the missing values
str(dataset) #to see types of these data
## 'data.frame': 282 obs. of 24 variables:
## $ id : int 20782 15252 10016 20325 17841 15520 20350 4758 20267 40773 ...
## $ dm : chr "no" "yes" "no" "no" ...
## $ location : chr "Buckingham" "Buckingham" "Buckingham" "Louisa" ...
## $ gender : chr "male" "male" "female" "female" ...
## $ frame : chr "small" "large" "large" "medium" ...
## $ insurance: int 1 2 0 1 2 0 2 1 2 2 ...
## $ fh : int 0 1 0 0 0 0 0 0 0 1 ...
## $ smoking : int 2 2 2 1 2 2 2 1 2 1 ...
## $ X : int 324 167 129 299 270 187 307 85 277 362 ...
## $ chol : num 180 254 228 243 206 174 164 185 191 212 ...
## $ stab.glu : int 92 121 115 74 90 90 91 84 74 88 ...
## $ hdl : num 34 39 61 42 38 36 67 52 33 36 ...
## $ ratio : num 5.3 6.5 3.7 5.8 5.4 ...
## $ glyhb : num 3.59 9.25 6.39 3.85 4.07 ...
## $ age : int 63 67 71 43 38 34 20 53 40 37 ...
## $ height : num 69 68 63 64 69 71 70 61 72 64 ...
## $ weight : num 169 167 244 239 167 210 141 145 270 160 ...
## $ bp.1s : num 145 161 170 128 138 142 122 147 136 124 ...
## $ bp.1d : num 72 118 92 90 90 92 86 72 70 82 ...
## $ bp.2s : num 142 151 152 138 152 ...
## $ bp.2d : num 70 111 92.5 90 92.5 ...
## $ waist : num 35 36 48 48 36 37 32 37 45 37 ...
## $ hip : num 39 39 51 53 47 43 39 40 49 45 ...
## $ time.ppn : num 30 60 660 330 90 90 390 420 150 15 ...
#Convert the datatype of qualitative variables into factor datatype
dataset$location = as.factor(dataset$location)
dataset$gender = as.factor(dataset$gender)
dataset$frame = as.factor(dataset$frame)
dataset$dm = as.factor(dataset$dm)
dataset$insurance = as.factor(dataset$insurance)
dataset$smoking = as.factor(dataset$smoking)
dataset$fh = as.factor(dataset$fh)
# can also use to convert all but here we have continuous too so its better to induvidually convert dataset=data.frame(lapply(dataset,as.factor))
#Grouping age would be helpful in visualizing later
age=dataset$age
age_grouped = ifelse(age < 45, "under 45",
ifelse(age >= 45 & age < 65, "45 - 64", ifelse(age >= 65 & age < 75, "65 - 74", ifelse(age >= 75, "75 or over", NA))))
#as done in the course we can also calculate BMI
# Divide dataset into training and validation dataset
set.seed(1)
trainrows = sample(row.names(dataset), nrow(dataset)*0.7)
traindataset = dataset[trainrows,]
Validrows = setdiff(row.names(dataset), trainrows)
Validdataset = dataset[Validrows,]
print(head(traindataset))
## id dm location gender frame insurance fh smoking X chol stab.glu
## 167 12005 no Louisa female small 0 1 1 133 168 101
## 129 4500 no Buckingham female medium 1 0 2 77 242 108
## 270 2791 yes Buckingham female medium 0 0 2 68 213 203
## 187 40251 no Louisa male medium 2 0 2 351 150 80
## 85 21320 no Louisa male medium 2 0 1 336 216 155
## 277 4760 yes Louisa female large 1 0 2 87 218 182
## hdl ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 167 59 2.8 5.09 44 64.0000 160 130 88 152.383 92.52482 40 43
## 129 53 4.6 5.47 46 62.0000 183 130 86 152.383 92.52482 37 45
## 270 75 2.8 11.41 71 63.0000 165 150 80 145.000 80.00000 34 42
## 187 38 3.9 3.97 35 73.0000 179 138 92 135.000 88.00000 32 37
## 85 30 7.2 5.91 38 68.0000 145 110 60 152.383 92.52482 34 37
## 277 54 4.0 10.55 51 66.0201 215 139 69 152.383 92.52482 42 53
## time.ppn
## 167 60
## 129 180
## 270 960
## 187 450
## 85 20
## 277 720
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn
## 3 660
## 4 330
## 6 90
## 7 390
## 8 420
## 10 15
Data Exploration and visualization
#Data visualization
assocplot(table(dataset$dm,age_grouped),xlab = "prone to diabetes", ylab = "age group", col = c("green","red"))
boxplot(age ~ dm,data=dataset)
g= ggplot(dataset, aes(x= dm, y= age)) + geom_boxplot()
ggplotly(g)
assocplot(table(dataset$dm,dataset$gender),xlab = "prone to diabetes", ylab = "gender", col = c("green","red"))
assocplot(table(dataset$dm,dataset$location),xlab = "prone to diabetes", ylab = "location", col = c("green","red"))
assocplot(table(dataset$dm,dataset$insurance),xlab = "prone to diabetes", ylab = "insurance(0=none,1=govt,2=private)", col = c("green","red"))
assocplot(table(dataset$dm,dataset$frame),xlab = "prone to diabetes", ylab = "frame", col = c("green","red"))
assocplot(table(dataset$dm,dataset$smoking),xlab = "prone to diabetes", ylab = "smoking", col = c("green","red"))
assocplot(table(dataset$dm,dataset$fh),xlab = "prone to diabetes", ylab = "family history (0=no, 1=yes)", col = c("green","red"))
g= ggplot(dataset, aes(x= dm, y= chol)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= stab.glu)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= hdl)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= ratio)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= glyhb)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= time.ppn)) + geom_boxplot()
ggplotly(g)
Model Fitting
1.BLR
m = glm(dm~age+gender+chol+hdl+bp.1s+bp.1d+weight+height+location+frame+insurance+smoking,data=traindataset,family = "binomial")
summary(m)
##
## Call:
## glm(formula = dm ~ age + gender + chol + hdl + bp.1s + bp.1d +
## weight + height + location + frame + insurance + smoking,
## family = "binomial", data = traindataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5325 -0.5426 -0.3289 -0.1708 2.8638
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.649385 6.426155 -2.124 0.03367 *
## age 0.063835 0.021085 3.028 0.00247 **
## gendermale -0.944296 0.727032 -1.299 0.19400
## chol 0.006568 0.005479 1.199 0.23059
## hdl -0.011127 0.014402 -0.773 0.43976
## bp.1s 0.014793 0.013692 1.080 0.27994
## bp.1d 0.006475 0.023291 0.278 0.78101
## weight 0.003373 0.007754 0.435 0.66351
## height 0.091367 0.092115 0.992 0.32125
## locationLouisa 0.216891 0.476877 0.455 0.64924
## framemedium -0.638790 0.573199 -1.114 0.26509
## framesmall -0.544748 0.817559 -0.666 0.50521
## insurance1 -0.799112 0.587902 -1.359 0.17406
## insurance2 -0.661964 0.561805 -1.178 0.23869
## smoking2 -0.355577 0.552521 -0.644 0.51986
## smoking3 -0.895669 0.830791 -1.078 0.28099
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 168.10 on 196 degrees of freedom
## Residual deviance: 130.61 on 181 degrees of freedom
## AIC: 162.61
##
## Number of Fisher Scoring iterations: 6
confint(m)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -27.087976132 -1.77075876
## age 0.024099054 0.10743089
## gendermale -2.437681236 0.43180036
## chol -0.004308266 0.01740155
## hdl -0.040196116 0.01687274
## bp.1s -0.012052360 0.04217983
## bp.1d -0.040011049 0.05215005
## weight -0.012300799 0.01851679
## height -0.081149397 0.28088390
## locationLouisa -0.709258326 1.17513108
## framemedium -1.771159664 0.49301716
## framesmall -2.197564279 1.04015724
## insurance1 -1.991812022 0.34000281
## insurance2 -1.790693544 0.43277951
## smoking2 -1.435334073 0.75466712
## smoking3 -2.636100985 0.66970170
#full data included in model
model = glm(dm~., data=traindataset,family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model)
##
## Call: glm(formula = dm ~ ., family = "binomial", data = traindataset)
##
## Coefficients:
## (Intercept) id locationLouisa gendermale framemedium
## -2.397e+02 -2.361e-04 -6.058e+00 -9.029e+00 -6.722e+00
## framesmall insurance1 insurance2 fh1 smoking2
## -4.828e+00 -1.154e+01 -8.058e+00 -1.129e+01 8.381e+00
## smoking3 X chol stab.glu hdl
## -1.635e+01 1.029e-01 5.252e-02 1.003e-01 -1.942e-01
## ratio glyhb age height weight
## 2.670e-01 2.183e+01 1.096e-01 1.164e+00 -6.525e-03
## bp.1s bp.1d bp.2s bp.2d waist
## -2.145e-01 7.178e-01 3.928e-01 -8.166e-01 -7.662e-01
## hip time.ppn
## 1.483e-01 -1.089e-02
##
## Degrees of Freedom: 196 Total (i.e. Null); 170 Residual
## Null Deviance: 168.1
## Residual Deviance: 5.968e-09 AIC: 54
print(summary(model))
##
## Call:
## glm(formula = dm ~ ., family = "binomial", data = traindataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.437e-05 -2.110e-08 -2.110e-08 -2.110e-08 2.537e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.397e+02 1.380e+06 0.000 1
## id -2.361e-04 1.020e+01 0.000 1
## locationLouisa -6.058e+00 5.807e+04 0.000 1
## gendermale -9.029e+00 1.389e+05 0.000 1
## framemedium -6.722e+00 8.774e+04 0.000 1
## framesmall -4.828e+00 2.011e+05 0.000 1
## insurance1 -1.154e+01 9.386e+04 0.000 1
## insurance2 -8.058e+00 1.423e+05 0.000 1
## fh1 -1.129e+01 1.928e+05 0.000 1
## smoking2 8.381e+00 1.285e+05 0.000 1
## smoking3 -1.635e+01 1.622e+05 0.000 1
## X 1.029e-01 1.139e+03 0.000 1
## chol 5.252e-02 1.148e+03 0.000 1
## stab.glu 1.003e-01 1.278e+03 0.000 1
## hdl -1.942e-01 5.710e+03 0.000 1
## ratio 2.670e-01 5.063e+04 0.000 1
## glyhb 2.183e+01 4.167e+04 0.001 1
## age 1.096e-01 3.418e+03 0.000 1
## height 1.164e+00 1.327e+04 0.000 1
## weight -6.525e-03 2.232e+03 0.000 1
## bp.1s -2.145e-01 3.923e+03 0.000 1
## bp.1d 7.178e-01 4.824e+03 0.000 1
## bp.2s 3.928e-01 3.310e+03 0.000 1
## bp.2d -8.166e-01 5.528e+03 0.000 1
## waist -7.662e-01 1.325e+04 0.000 1
## hip 1.483e-01 2.392e+04 0.000 1
## time.ppn -1.089e-02 1.284e+02 0.000 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.6810e+02 on 196 degrees of freedom
## Residual deviance: 5.9683e-09 on 170 degrees of freedom
## AIC: 54
##
## Number of Fisher Scoring iterations: 25
Model Fitting C50
modelC5 = C5.0(dm~.,data=traindataset)
print(summary(modelC5))
##
## Call:
## C5.0.formula(formula = dm ~ ., data = traindataset)
##
##
## C5.0 [Release 2.07 GPL Edition] Sat Sep 05 16:31:38 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 197 cases (24 attributes) from undefined.data
##
## Decision tree:
##
## glyhb <= 6.97: no (167)
## glyhb > 6.97: yes (30)
##
##
## Evaluation on training data (197 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 167 (a): class no
## 30 (b): class yes
##
##
## Attribute usage:
##
## 100.00% glyhb
##
##
## Time: 0.0 secs
plot(modelC5)
Model Prediction C50
pred = predict(modelC5,Validdataset)
print(head(pred))
## [1] no no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm
## 3 660 no
## 4 330 no
## 6 90 no
## 7 390 no
## 8 420 no
## 10 15 no
Model Evaluation C50
confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 75 0
## yes 0 10
##
## Accuracy : 1
## 95% CI : (0.9575, 1)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 2.397e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8824
## Detection Rate : 0.8824
## Detection Prevalence : 0.8824
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
Model Fitting CTREE
modelCTREE = ctree(dm~.,data=traindataset)
plot(modelCTREE)
Model Prediction CTREE
pred = predict(modelCTREE,Validdataset)
print(head(pred))
## [1] no no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm
## 3 660 no
## 4 330 no
## 6 90 no
## 7 390 no
## 8 420 no
## 10 15 no
Model Evaluation CTREE
confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 75 0
## yes 0 10
##
## Accuracy : 1
## 95% CI : (0.9575, 1)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 2.397e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8824
## Detection Rate : 0.8824
## Detection Prevalence : 0.8824
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
Model Fitting RPART
modelRpart = rpart(dm~.,data=traindataset)
rpart.plot(modelRpart)
Model Prediction RPART
pred = predict(modelRpart,Validdataset)
print(head(pred))
## no yes
## 3 1 0
## 4 1 0
## 6 1 0
## 7 1 0
## 8 1 0
## 10 1 0
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm.no predictdm.yes
## 3 660 1 0
## 4 330 1 0
## 6 90 1 0
## 7 390 1 0
## 8 420 1 0
## 10 15 1 0