Data Exploration
# Load the diabetes data; the first line of the CSV holds the column names.
dataset <- read.csv(file = "diab.csv", header = TRUE)
# print(dataset)  # full dump left disabled; the preview below is used instead
print(head(dataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 1 82 Buckingham 46 female medium 1 0 3 203 56 3.6
## 2 97 Buckingham 29 female large 0 0 2 165 24 6.9
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 1 4.31 62 121 118 59 152.383 92.52482 29 38 720 no
## 2 4.44 64 218 112 68 152.383 92.52482 46 48 360 no
## 3 4.64 61 256 190 92 185.000 92.00000 49 57 180 no
## 4 4.63 67 119 110 50 152.383 92.52482 33 38 480 no
## 5 7.72 68 183 138 80 152.383 92.52482 44 41 300 yes
## 6 4.81 71 190 132 86 152.383 92.52482 36 42 195 no
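# Building contingency tables
# NOTE: the calls below reference columns (quit, promotion_last_5years,
# department, salary) that are not in the column listing printed above,
# so they are left disabled.
# table(dataset$quit,dataset$promotion_last_5years)
# table(dataset$quit,dataset$department)
# table(dataset$quit,dataset$salary)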
Data reshuffling
# Shuffle the rows: draw one uniform random key per row (seeded, so the
# permutation is reproducible) and reorder the data frame by those keys.
set.seed(1)
index <- runif(n = nrow(dataset))
dataset <- dataset[order(index), ]
print(head(dataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 116 80 Buckingham 63 male medium 2 0 3 194 34 5.7
## 27 87 Buckingham 37 male large 1 1 2 232 30 7.7
## 47 193 Buckingham 54 female medium 1 0 1 148 14 10.6
## 281 98 Louisa 78 female large 2 0 1 224 44 5.1
## 133 101 Louisa 44 female small 0 1 1 168 59 2.8
## 228 82 Buckingham 51 female small 0 0 1 222 87 2.6
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 116 4.61 73 175 131 88 152.383 92.52482 34 39 30 no
## 27 5.10 68 252 140 95 152.383 92.52482 43 47 420 no
## 47 6.14 67 165 140 65 152.383 92.52482 42 42 150 no
## 281 5.05 63 160 150 81 152.383 92.52482 36 45 300 no
## 133 5.09 64 160 130 88 152.383 92.52482 40 43 60 no
## 228 4.64 66 110 150 110 150.000 90.00000 28 37 270 no
Data visualization
Here we are analyzing the Diabetes dataset.
# Read diab.csv again from disk. This assignment replaces whatever `dataset`
# held before, so a row order produced by an earlier shuffle is not carried
# forward from this point on.
dataset = read.csv("diab.csv",header = TRUE)
# Preview the first rows of the freshly loaded data.
print(head(dataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 1 82 Buckingham 46 female medium 1 0 3 203 56 3.6
## 2 97 Buckingham 29 female large 0 0 2 165 24 6.9
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 1 4.31 62 121 118 59 152.383 92.52482 29 38 720 no
## 2 4.44 64 218 112 68 152.383 92.52482 46 48 360 no
## 3 4.64 61 256 190 92 185.000 92.00000 49 57 180 no
## 4 4.63 67 119 110 50 152.383 92.52482 33 38 480 no
## 5 7.72 68 183 138 80 152.383 92.52482 44 41 300 yes
## 6 4.81 71 190 132 86 152.383 92.52482 36 42 195 no
# Association plots of gender, location and frame against the dm label.
assocplot(
  table(dataset[["gender"]], dataset[["dm"]]),
  col = c("green", "red"),
  xlab = "gender",
  ylab = "Diabetic"
)
assocplot(
  table(dataset[["location"]], dataset[["dm"]]),
  col = c("green", "red"),
  xlab = "location",
  ylab = "Diabetic"
)
assocplot(
  table(dataset[["frame"]], dataset[["dm"]]),
  col = c("green", "red"),
  xlab = "frame",
  ylab = "Diabetic"
)
# Box plots of every numeric measurement, split by the dm label. Each plot is
# built with ggplot2 and then handed to ggplotly() for display. The calls stay
# at top level (no loop) so that every plot is printed.
g <- ggplot(data = dataset, mapping = aes(x = dm, y = stab.glu)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = age)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = chol)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = hdl)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = ratio)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = glyhb)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = height)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = weight)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = bp.1s)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = bp.1d)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = bp.2s)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = bp.2d)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = waist)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = hip)) +
  geom_boxplot()
ggplotly(p = g)
g <- ggplot(data = dataset, mapping = aes(x = dm, y = time.ppn)) +
  geom_boxplot()
ggplotly(p = g)
Data Preparation
# Store the qualitative columns (dm, location, gender, frame) as factors.
dataset[c("dm", "location", "gender", "frame")] <- lapply(
  dataset[c("dm", "location", "gender", "frame")],
  as.factor
)

# Split the rows into a training set (about 70%, chosen at random by row name
# with a fixed seed) and a validation set holding every remaining row.
set.seed(1)
trainrows <- sample(row.names(dataset), size = nrow(dataset) * 0.7)
Validrows <- setdiff(row.names(dataset), trainrows)
traindataset <- dataset[trainrows, ]
Validdataset <- dataset[Validrows, ]
print(head(traindataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 324 92 Buckingham 63 male small 1 0 2 180 34 5.3
## 167 121 Buckingham 67 male large 2 1 2 254 39 6.5
## 129 115 Buckingham 71 female large 0 0 2 228 61 3.7
## 299 74 Louisa 43 female medium 1 0 1 243 42 5.8
## 270 90 Buckingham 38 female medium 2 0 2 206 38 5.4
## 187 90 Louisa 34 male medium 0 0 2 174 36 4.8
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 324 3.59 69 169 145 72 142.000 70.00000 35 39 30 no
## 167 9.25 68 167 161 118 151.000 111.00000 36 39 60 yes
## 129 6.39 63 244 170 92 152.383 92.52482 48 51 660 no
## 299 3.85 64 239 128 90 138.000 90.00000 48 53 330 no
## 270 4.07 69 167 138 90 152.383 92.52482 36 47 90 no
## 187 5.35 71 210 142 92 148.000 98.00000 37 43 90 no
# Preview the first six rows of the validation set.
print(head(Validdataset, n = 6L))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## 7 92 Buckingham 30 male medium 2 1 2 195 41 4.8
## 8 75 Buckingham 37 male medium 0 0 2 227 44 5.2
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn
## 3 4.64 61 256 190.0000 92.00000 185.000 92.00000 49 57 180
## 4 4.63 67 119 110.0000 50.00000 152.383 92.52482 33 38 480
## 5 7.72 68 183 138.0000 80.00000 152.383 92.52482 44 41 300
## 6 4.81 71 190 132.0000 86.00000 152.383 92.52482 36 42 195
## 7 4.84 69 191 161.0000 112.00000 161.000 112.00000 46 49 720
## 8 3.94 59 170 136.9045 83.32161 152.383 92.52482 34 39 1020
## dm
## 3 no
## 4 no
## 5 yes
## 6 no
## 7 no
## 8 no
Model Fitting C50
# Fit a C5.0 decision tree: columns 1 to 21 of the training set are passed as
# predictors and column 22 as the class label.
# NOTE(review): the head() preview of traindataset lists dm as the 22nd
# column; the selection is positional, so confirm the column order if the CSV
# layout ever changes.
# NOTE(review): the recorded summary shows a single split on glyhb with zero
# training errors; check whether glyhb should be allowed as a predictor of dm.
modelC5 = C5.0(x=traindataset[,1:21],y=traindataset[,22])
# Print the fitted tree together with its evaluation on the training data.
print(summary(modelC5))
##
## Call:
## C5.0.default(x = traindataset[, 1:21], y = traindataset[, 22])
##
##
## C5.0 [Release 2.07 GPL Edition] Thu Sep 03 15:22:30 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 282 cases (22 attributes) from undefined.data
##
## Decision tree:
##
## glyhb <= 6.97: no (242)
## glyhb > 6.97: yes (40)
##
##
## Evaluation on training data (282 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 242 (a): class no
## 40 (b): class yes
##
##
## Attribute usage:
##
## 100.00% glyhb
##
##
## Time: 0.0 secs
# Draw the fitted C5.0 tree.
plot(x = modelC5)
Model Prediction C50
# Predict a class label for every validation row with the C5.0 tree.
pred <- predict(object = modelC5, newdata = Validdataset)
print(head(pred))
## [1] no no yes no no no
## Levels: no yes
# Keep the predictions next to the observed labels for evaluation.
Validdataset$predictDm <- pred
print(head(Validdataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## 7 92 Buckingham 30 male medium 2 1 2 195 41 4.8
## 8 75 Buckingham 37 male medium 0 0 2 227 44 5.2
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn
## 3 4.64 61 256 190.0000 92.00000 185.000 92.00000 49 57 180
## 4 4.63 67 119 110.0000 50.00000 152.383 92.52482 33 38 480
## 5 7.72 68 183 138.0000 80.00000 152.383 92.52482 44 41 300
## 6 4.81 71 190 132.0000 86.00000 152.383 92.52482 36 42 195
## 7 4.84 69 191 161.0000 112.00000 161.000 112.00000 46 49 720
## 8 3.94 59 170 136.9045 83.32161 152.383 92.52482 34 39 1020
## dm predictDm
## 3 no no
## 4 no no
## 5 yes yes
## 6 no no
## 7 no no
## 8 no no
Model Evaluation C50
# Compare the C5.0 predictions with the observed labels. "yes" (diabetic) is
# named as the positive class explicitly; left unset, confusionMatrix() takes
# the first factor level ("no"), so sensitivity, specificity and prevalence
# would describe the non-diabetic class. Output rendered before this change
# was computed with "no" as the positive class.
confusionMatrix(
  data = Validdataset$predictDm,
  reference = Validdataset$dm,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 101 0
## yes 0 20
##
## Accuracy : 1
## 95% CI : (0.97, 1)
## No Information Rate : 0.8347
## P-Value [Acc > NIR] : 3.205e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8347
## Detection Rate : 0.8347
## Detection Prevalence : 0.8347
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
Model Fitting RPART
# Fit an rpart classification tree of dm on all other training columns,
# then draw it.
modelRpart <- rpart(formula = dm ~ ., data = traindataset)
rpart.plot(x = modelRpart)
Model Prediction RPART
# Score the validation rows with the rpart tree. The result is stored as a
# data frame with one column per class ("no", "yes").
pred <- data.frame(predict(object = modelRpart, newdata = Validdataset))
print(head(pred))
## no yes
## 3 1 0
## 4 1 0
## 5 0 1
## 6 1 0
## 7 1 0
## 8 1 0
# Show the "yes" column for every validation row.
print(pred[["yes"]])
## [1] 0 0 1 0 0 0 0 0 0 0 1 0 0 0 0 0 0 1 0 0 1 1 0 0 0 1 0 0 0 0 0 0 1 0 0 0 0
## [38] 0 0 0 0 1 0 1 0 0 0 1 0 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 1 0 0 0 1 0
## [75] 0 0 0 0 0 1 0 0 0 0 0 0 0 0 0 0 1 0 1 0 0 0 0 1 0 0 1 0 0 0 0 0 0 0 0 1 1
## [112] 0 0 0 0 0 0 0 0 0 0
# Turn the "yes" probability into a class label. A 0.5 threshold is used
# instead of testing for exact equality with 1: an exact floating-point match
# only holds when a leaf is perfectly pure, so any leaf with a probability
# such as 0.95 would otherwise be labelled "no".
# The labels are stored as a factor with both levels fixed, so the level set
# stays the same even if one class is never predicted.
predictionNew <- factor(
  ifelse(pred$yes > 0.5, "yes", "no"),
  levels = c("no", "yes")
)
Validdataset$predictDm <- predictionNew
print(head(Validdataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## 7 92 Buckingham 30 male medium 2 1 2 195 41 4.8
## 8 75 Buckingham 37 male medium 0 0 2 227 44 5.2
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn
## 3 4.64 61 256 190.0000 92.00000 185.000 92.00000 49 57 180
## 4 4.63 67 119 110.0000 50.00000 152.383 92.52482 33 38 480
## 5 7.72 68 183 138.0000 80.00000 152.383 92.52482 44 41 300
## 6 4.81 71 190 132.0000 86.00000 152.383 92.52482 36 42 195
## 7 4.84 69 191 161.0000 112.00000 161.000 112.00000 46 49 720
## 8 3.94 59 170 136.9045 83.32161 152.383 92.52482 34 39 1020
## dm predictDm
## 3 no no
## 4 no no
## 5 yes yes
## 6 no no
## 7 no no
## 8 no no
Model Evaluation RPART
# Make sure both label columns are factors sharing the same level set; taking
# the levels from the observed labels keeps a class that was never predicted
# from being dropped.
Validdataset$dm <- as.factor(Validdataset$dm)
Validdataset$predictDm <- factor(
  Validdataset$predictDm,
  levels = levels(Validdataset$dm)
)
# Evaluate the rpart predictions with "yes" (diabetic) as the positive class.
# Left unset, confusionMatrix() takes the first level ("no"), so sensitivity,
# specificity and prevalence would describe the non-diabetic class. Output
# rendered before this change was computed with "no" as the positive class.
confusionMatrix(
  data = Validdataset$predictDm,
  reference = Validdataset$dm,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 101 0
## yes 0 20
##
## Accuracy : 1
## 95% CI : (0.97, 1)
## No Information Rate : 0.8347
## P-Value [Acc > NIR] : 3.205e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8347
## Detection Rate : 0.8347
## Detection Prevalence : 0.8347
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
Model Fitting CTREE
# Fit a conditional inference tree of dm on all other training columns,
# then draw it.
modelCTREE <- ctree(formula = dm ~ ., data = traindataset)
plot(x = modelCTREE)
Model Prediction CTREE
# Predict a class label for every validation row with the ctree model.
pred <- predict(modelCTREE, newdata = Validdataset)
print(head(pred))
## [1] no no yes no no no
## Levels: no yes
# Overwrite the prediction column with the ctree labels.
Validdataset$predictDm <- pred
print(head(Validdataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## 7 92 Buckingham 30 male medium 2 1 2 195 41 4.8
## 8 75 Buckingham 37 male medium 0 0 2 227 44 5.2
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn
## 3 4.64 61 256 190.0000 92.00000 185.000 92.00000 49 57 180
## 4 4.63 67 119 110.0000 50.00000 152.383 92.52482 33 38 480
## 5 7.72 68 183 138.0000 80.00000 152.383 92.52482 44 41 300
## 6 4.81 71 190 132.0000 86.00000 152.383 92.52482 36 42 195
## 7 4.84 69 191 161.0000 112.00000 161.000 112.00000 46 49 720
## 8 3.94 59 170 136.9045 83.32161 152.383 92.52482 34 39 1020
## dm predictDm
## 3 no no
## 4 no no
## 5 yes yes
## 6 no no
## 7 no no
## 8 no no
Model Evaluation CTREE
# Compare the ctree predictions with the observed labels. "yes" (diabetic) is
# named as the positive class explicitly; left unset, confusionMatrix() takes
# the first factor level ("no"), so sensitivity, specificity and prevalence
# would describe the non-diabetic class. Output rendered before this change
# was computed with "no" as the positive class.
confusionMatrix(
  data = Validdataset$predictDm,
  reference = Validdataset$dm,
  positive = "yes"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 101 0
## yes 0 20
##
## Accuracy : 1
## 95% CI : (0.97, 1)
## No Information Rate : 0.8347
## P-Value [Acc > NIR] : 3.205e-10
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8347
## Detection Rate : 0.8347
## Detection Prevalence : 0.8347
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
Data Preparation for Binary Logistic Regression
# Build indicator (dummy) columns for the dependent variable: with the
# intercept removed, model.matrix() yields one 0/1 column per level of dm
# (dmno and dmyes).
df <- data.frame(model.matrix(~ 0 + dm, data = dataset))
print(head(df, n = 25))
## dmno dmyes
## 1 1 0
## 2 1 0
## 3 1 0
## 4 1 0
## 5 0 1
## 6 1 0
## 7 1 0
## 8 1 0
## 9 1 0
## 10 1 0
## 11 1 0
## 12 1 0
## 13 1 0
## 14 1 0
## 15 1 0
## 16 1 0
## 17 1 0
## 18 1 0
## 19 1 0
## 20 0 1
## 21 1 0
## 22 1 0
## 23 0 1
## 24 1 0
## 25 1 0
# Replace the yes/no label with the 0/1 "yes" indicator so that dm can be
# used as a numeric binary response.
dataset$dm <- df[["dmyes"]]
print(head(dataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 1 82 Buckingham 46 female medium 1 0 3 203 56 3.6
## 2 97 Buckingham 29 female large 0 0 2 165 24 6.9
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 1 4.31 62 121 118 59 152.383 92.52482 29 38 720 0
## 2 4.44 64 218 112 68 152.383 92.52482 46 48 360 0
## 3 4.64 61 256 190 92 185.000 92.00000 49 57 180 0
## 4 4.63 67 119 110 50 152.383 92.52482 33 38 480 0
## 5 7.72 68 183 138 80 152.383 92.52482 44 41 300 1
## 6 4.81 71 190 132 86 152.383 92.52482 36 42 195 0
# Split the rows for the logistic regression: about 60% of the row names are
# drawn at random (fixed seed) for training, and the rest form the validation
# set.
set.seed(1)
trainrows <- sample(row.names(dataset), size = nrow(dataset) * 0.6)
validrows <- setdiff(row.names(dataset), trainrows)
trainingdataset <- dataset[trainrows, ]
validationdataset <- dataset[validrows, ]
print("Training Dataset")
## [1] "Training Dataset"
print(head(trainingdataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 324 92 Buckingham 63 male small 1 0 2 180 34 5.3
## 167 121 Buckingham 67 male large 2 1 2 254 39 6.5
## 129 115 Buckingham 71 female large 0 0 2 228 61 3.7
## 299 74 Louisa 43 female medium 1 0 1 243 42 5.8
## 270 90 Buckingham 38 female medium 2 0 2 206 38 5.4
## 187 90 Louisa 34 male medium 0 0 2 174 36 4.8
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 324 3.59 69 169 145 72 142.000 70.00000 35 39 30 0
## 167 9.25 68 167 161 118 151.000 111.00000 36 39 60 1
## 129 6.39 63 244 170 92 152.383 92.52482 48 51 660 0
## 299 3.85 64 239 128 90 138.000 90.00000 48 53 330 0
## 270 4.07 69 167 138 90 152.383 92.52482 36 47 90 0
## 187 5.35 71 210 142 92 148.000 98.00000 37 43 90 0
# Report the number of training rows, then preview the validation set.
print(dim(trainingdataset)[1L])
## [1] 241
print("Validation Dataset")
## [1] "Validation Dataset"
print(head(validationdataset, n = 6L))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 2 97 Buckingham 29 female large 0 0 2 165 24 6.9
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## 7 92 Buckingham 30 male medium 2 1 2 195 41 4.8
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 2 4.44 64 218 112 68 152.383 92.52482 46 48 360 0
## 3 4.64 61 256 190 92 185.000 92.00000 49 57 180 0
## 4 4.63 67 119 110 50 152.383 92.52482 33 38 480 0
## 5 7.72 68 183 138 80 152.383 92.52482 44 41 300 1
## 6 4.81 71 190 132 86 152.383 92.52482 36 42 195 0
## 7 4.84 69 191 161 112 161.000 112.00000 46 49 720 0
# Report the number of validation rows.
print(dim(validationdataset)[1L])
## [1] 162
Model Fitting: Binary Logistic Regression
# Fit a binomial GLM (logistic regression) of dm on every other column of the
# training set.
# NOTE(review): the recorded warnings below (no convergence, fitted
# probabilities of 0 or 1) suggest the classes may be perfectly separated by
# the predictors; confirm before interpreting the coefficients.
model <- glm(formula = dm ~ ., family = "binomial", data = trainingdataset)
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(summary(model))
##
## Call:
## glm(formula = dm ~ ., family = "binomial", data = trainingdataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -3.206e-05 -2.100e-08 -2.100e-08 -2.100e-08 2.641e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.709e+02 1.103e+06 -0.001 1.000
## stab.glu -1.796e-02 4.938e+02 0.000 1.000
## locationLouisa 1.806e+01 4.946e+04 0.000 1.000
## age -5.161e-02 1.450e+03 0.000 1.000
## gendermale -2.292e+01 7.663e+04 0.000 1.000
## framemedium -1.681e+01 5.359e+04 0.000 1.000
## framesmall -1.983e+01 1.559e+05 0.000 1.000
## insurance 4.457e+00 2.867e+04 0.000 1.000
## fh -2.528e+01 6.673e+04 0.000 1.000
## smoking 5.421e+00 4.605e+04 0.000 1.000
## chol -9.104e-02 1.058e+03 0.000 1.000
## hdl 5.315e-01 3.611e+03 0.000 1.000
## ratio 8.373e+00 2.918e+04 0.000 1.000
## glyhb 2.917e+01 1.765e+04 0.002 0.999
## height 2.464e+00 1.400e+04 0.000 1.000
## weight -1.294e-01 2.539e+03 0.000 1.000
## bp.1s -1.994e-01 2.549e+03 0.000 1.000
## bp.1d -2.186e-01 3.162e+03 0.000 1.000
## bp.2s 3.349e-01 2.584e+03 0.000 1.000
## bp.2d 1.363e+00 3.175e+03 0.000 1.000
## waist -7.988e-01 9.234e+03 0.000 1.000
## hip 1.815e+00 1.422e+04 0.000 1.000
## time.ppn 6.914e-04 8.483e+01 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.9428e+02 on 234 degrees of freedom
## Residual deviance: 1.0123e-08 on 212 degrees of freedom
## (6 observations deleted due to missingness)
## AIC: 46
##
## Number of Fisher Scoring iterations: 25
# Plot the 0/1 outcome against location and against gender, each with a
# binomial GLM smoother layered over the points.
# NOTE(review): both x variables are categorical; confirm the smoother
# draws anything meaningful for a discrete x.
plot4 <- ggplot(data = trainingdataset, mapping = aes(x = location, y = dm)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  )
ggplotly(p = plot4)
## `geom_smooth()` using formula 'y ~ x'
plot5 <- ggplot(data = trainingdataset, mapping = aes(x = gender, y = dm)) +
  geom_point() +
  geom_smooth(
    method = "glm",
    method.args = list(family = "binomial"),
    se = FALSE
  )
ggplotly(p = plot5)
## `geom_smooth()` using formula 'y ~ x'
Model Prediction: Apply the model to the validation dataset
# Predicted probability of dm = 1 for each validation row.
probprediction <- predict(
  object = model,
  newdata = validationdataset,
  type = "response"
)
print(head(probprediction))
## 2 3 4 5 6 7
## 2.220446e-16 2.220446e-16 2.220446e-16 9.996811e-01 2.220446e-16 2.220446e-16
# Classify as 1 when the probability exceeds the cut-off, otherwise 0.
cut_off <- 0.5
prediction <- ifelse(probprediction > cut_off, yes = 1, no = 0)
validationdataset$predicteddm <- prediction
print(head(validationdataset))
## stab.glu location age gender frame insurance fh smoking chol hdl ratio
## 2 97 Buckingham 29 female large 0 0 2 165 24 6.9
## 3 92 Buckingham 58 female large 2 0 2 228 37 6.2
## 4 93 Buckingham 67 male large 1 0 3 78 12 6.5
## 5 90 Buckingham 64 male medium 0 0 3 249 28 8.9
## 6 94 Buckingham 34 male large 1 0 1 248 69 3.6
## 7 92 Buckingham 30 male medium 2 1 2 195 41 4.8
## glyhb height weight bp.1s bp.1d bp.2s bp.2d waist hip time.ppn dm
## 2 4.44 64 218 112 68 152.383 92.52482 46 48 360 0
## 3 4.64 61 256 190 92 185.000 92.00000 49 57 180 0
## 4 4.63 67 119 110 50 152.383 92.52482 33 38 480 0
## 5 7.72 68 183 138 80 152.383 92.52482 44 41 300 1
## 6 4.81 71 190 132 86 152.383 92.52482 36 42 195 0
## 7 4.84 69 191 161 112 161.000 112.00000 46 49 720 0
## predicteddm
## 2 0
## 3 0
## 4 0
## 5 1
## 6 0
## 7 0
Model Evaluation: Binary Logistic Regression
# Evaluate the logistic-regression predictions with 1 (diabetic) as the
# positive class. Both vectors are converted with the level set fixed to 0/1
# rather than derived from the data, so the two factors always share the same
# levels even when one class is absent from the predictions or from the
# observed labels.
confusionMatrix(
  data = factor(validationdataset$predicteddm, levels = c(0, 1)),
  reference = factor(validationdataset$dm, levels = c(0, 1)),
  positive = "1"
)
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 132 2
## 1 0 22
##
## Accuracy : 0.9872
## 95% CI : (0.9545, 0.9984)
## No Information Rate : 0.8462
## P-Value [Acc > NIR] : 2.063e-09
##
## Kappa : 0.949
##
## Mcnemar's Test P-Value : 0.4795
##
## Sensitivity : 0.9167
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9851
## Prevalence : 0.1538
## Detection Rate : 0.1410
## Detection Prevalence : 0.1410
## Balanced Accuracy : 0.9583
##
## 'Positive' Class : 1
##