Framework followed: Data Exploration, Data Preparation, Data Visualization, Model Fitting, Applying the Model to Get Predictions, Model Evaluation, and Deployment
Data: Diabetes dataset from the Coursera BLR course
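The chunks below use functions from several add-on packages (ggplot2/plotly for the interactive plots, e1071 for Naive Bayes, caret for the confusion matrices, C50, party and rpart/rpart.plot for the trees, MASS for lda, and klaR for partimat). They are assumed to be installed and loaded already; a minimal setup sketch (ctree may alternatively come from partykit):
library(ggplot2)     # ggplot
library(plotly)      # ggplotly
library(e1071)       # naiveBayes
library(caret)       # confusionMatrix
library(C50)         # C5.0
library(party)       # ctree
library(rpart)       # rpart
library(rpart.plot)  # rpart.plot
library(MASS)        # lda
library(klaR)        # partimat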
dataset=read.csv("diabetes_imputed.csv",header=TRUE)
print(head(dataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 1 20782 no Buckingham male small 1 0 2 324 180 92 34
## 2 15252 yes Buckingham male large 2 1 2 167 254 121 39
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 5 17841 no Buckingham female medium 2 0 2 270 206 90 38
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 1 5.3 3.59 63 69 169 145 72 142.000 70.00000 35 39
## 2 6.5 9.25 67 68 167 161 118 151.000 111.00000 36 39
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 5 5.4 4.07 38 69 167 138 90 152.383 92.52482 36 47
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## time.ppn
## 1 30
## 2 60
## 3 660
## 4 330
## 5 90
## 6 90
# Variable dictionary (Name : Meaning)
# id        : Subject ID
# dm        : Diabetes diagnosis (yes/no), the outcome variable
# chol      : Total Cholesterol
# stab.glu  : Stabilized Glucose
# hdl       : High Density Lipoprotein
# ratio     : Cholesterol/HDL Ratio
# glyhb     : Glycosylated Hemoglobin
# location  : Buckingham or Louisa
# age, gender, height, weight, frame, waist, hip : self-explanatory
# insurance : 0 = none, 1 = government, 2 = private
# fh        : Family history (1 = yes, 0 = no)
# bp.1s     : First Systolic Blood Pressure
# bp.1d     : First Diastolic Blood Pressure
# bp.2s     : Second Systolic Blood Pressure
# bp.2d     : Second Diastolic Blood Pressure
# time.ppn  : Postprandial Time when Labs were Drawn
Data Preparation
# Missing values were imputed (using Rattle) before the data was exported to diabetes_imputed.csv
str(dataset) # inspect the structure and data type of each column
## 'data.frame': 282 obs. of 24 variables:
## $ id : int 20782 15252 10016 20325 17841 15520 20350 4758 20267 40773 ...
## $ dm : chr "no" "yes" "no" "no" ...
## $ location : chr "Buckingham" "Buckingham" "Buckingham" "Louisa" ...
## $ gender : chr "male" "male" "female" "female" ...
## $ frame : chr "small" "large" "large" "medium" ...
## $ insurance: int 1 2 0 1 2 0 2 1 2 2 ...
## $ fh : int 0 1 0 0 0 0 0 0 0 1 ...
## $ smoking : int 2 2 2 1 2 2 2 1 2 1 ...
## $ X : int 324 167 129 299 270 187 307 85 277 362 ...
## $ chol : num 180 254 228 243 206 174 164 185 191 212 ...
## $ stab.glu : int 92 121 115 74 90 90 91 84 74 88 ...
## $ hdl : num 34 39 61 42 38 36 67 52 33 36 ...
## $ ratio : num 5.3 6.5 3.7 5.8 5.4 ...
## $ glyhb : num 3.59 9.25 6.39 3.85 4.07 ...
## $ age : int 63 67 71 43 38 34 20 53 40 37 ...
## $ height : num 69 68 63 64 69 71 70 61 72 64 ...
## $ weight : num 169 167 244 239 167 210 141 145 270 160 ...
## $ bp.1s : num 145 161 170 128 138 142 122 147 136 124 ...
## $ bp.1d : num 72 118 92 90 90 92 86 72 70 82 ...
## $ bp.2s : num 142 151 152 138 152 ...
## $ bp.2d : num 70 111 92.5 90 92.5 ...
## $ waist : num 35 36 48 48 36 37 32 37 45 37 ...
## $ hip : num 39 39 51 53 47 43 39 40 49 45 ...
## $ time.ppn : num 30 60 660 330 90 90 390 420 150 15 ...
# Convert the qualitative variables to the factor data type
dataset$location = as.factor(dataset$location)
dataset$gender = as.factor(dataset$gender)
dataset$frame = as.factor(dataset$frame)
dataset$dm = as.factor(dataset$dm)
dataset$insurance = as.factor(dataset$insurance)
dataset$smoking = as.factor(dataset$smoking)
dataset$fh = as.factor(dataset$fh)
# dataset = data.frame(lapply(dataset, as.factor)) would convert every column at once, but the data also
# contains continuous variables, so it is better to convert the qualitative columns individually (see the sketch below)
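If a one-liner is preferred, the conversion can be restricted to just the qualitative columns; a small sketch equivalent to the individual conversions above:
cat_cols = c("dm", "location", "gender", "frame", "insurance", "smoking", "fh")
dataset[cat_cols] = lapply(dataset[cat_cols], as.factor)  # convert only these columns to factors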
#Grouping age would be helpful in visualizing later
age=dataset$age
age_grouped = ifelse(age < 45, "under 45",
              ifelse(age >= 45 & age < 65, "45 - 64",
              ifelse(age >= 65 & age < 75, "65 - 74",
              ifelse(age >= 75, "75 or over", NA))))
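An equivalent way to build the same four groups is cut(), which can be less error-prone than nested ifelse calls; a sketch:
age_grouped_cut = cut(age,
                      breaks = c(-Inf, 45, 65, 75, Inf),
                      labels = c("under 45", "45 - 64", "65 - 74", "75 or over"),
                      right = FALSE)  # right = FALSE gives intervals of the form [lo, hi)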
# As done in the course, we could also calculate BMI as a derived feature (see the sketch below)
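A minimal sketch of that calculation, assuming weight is recorded in pounds and height in inches (as the values above suggest); bmi is a new column introduced here for illustration:
dataset$bmi = 703 * dataset$weight / dataset$height^2  # BMI = 703 * lb / in^2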
# Divide dataset into training and validation dataset
set.seed(1)
trainrows = sample(row.names(dataset), nrow(dataset)*0.7)
traindataset = dataset[trainrows,]
Validrows = setdiff(row.names(dataset), trainrows)
Validdataset = dataset[Validrows,]
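A quick sanity check that is not part of the original analysis but easy to add here: compare the prevalence of dm in the two splits to confirm they are roughly similar.
prop.table(table(traindataset$dm))  # proportion of no/yes in the training split
prop.table(table(Validdataset$dm))  # proportion of no/yes in the validation split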
print(head(traindataset))
## id dm location gender frame insurance fh smoking X chol stab.glu
## 167 12005 no Louisa female small 0 1 1 133 168 101
## 129 4500 no Buckingham female medium 1 0 2 77 242 108
## 270 2791 yes Buckingham female medium 0 0 2 68 213 203
## 187 40251 no Louisa male medium 2 0 2 351 150 80
## 85 21320 no Louisa male medium 2 0 1 336 216 155
## 277 4760 yes Louisa female large 1 0 2 87 218 182
## hdl ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 167 59 2.8 5.09 44 64.0000 160 130 88 152.383 92.52482 40 43
## 129 53 4.6 5.47 46 62.0000 183 130 86 152.383 92.52482 37 45
## 270 75 2.8 11.41 71 63.0000 165 150 80 145.000 80.00000 34 42
## 187 38 3.9 3.97 35 73.0000 179 138 92 135.000 88.00000 32 37
## 85 30 7.2 5.91 38 68.0000 145 110 60 152.383 92.52482 34 37
## 277 54 4.0 10.55 51 66.0201 215 139 69 152.383 92.52482 42 53
## time.ppn
## 167 60
## 129 180
## 270 960
## 187 450
## 85 20
## 277 720
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn
## 3 660
## 4 330
## 6 90
## 7 390
## 8 420
## 10 15
Data Exploration and Visualization
#Data visualization
assocplot(table(dataset$dm,age_grouped),xlab = "prone to diabetes", ylab = "age group", col = c("green","red"))
boxplot(age ~ dm,data=dataset)
g= ggplot(dataset, aes(x= dm, y= age)) + geom_boxplot()
ggplotly(g)
assocplot(table(dataset$dm,dataset$gender),xlab = "prone to diabetes", ylab = "gender", col = c("green","red"))
assocplot(table(dataset$dm,dataset$location),xlab = "prone to diabetes", ylab = "location", col = c("green","red"))
assocplot(table(dataset$dm,dataset$insurance),xlab = "prone to diabetes", ylab = "insurance(0=none,1=govt,2=private)", col = c("green","red"))
assocplot(table(dataset$dm,dataset$frame),xlab = "prone to diabetes", ylab = "frame", col = c("green","red"))
assocplot(table(dataset$dm,dataset$smoking),xlab = "prone to diabetes", ylab = "smoking", col = c("green","red"))
assocplot(table(dataset$dm,dataset$fh),xlab = "prone to diabetes", ylab = "family history (0=no, 1=yes)", col = c("green","red"))
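To back up the association plots with numbers, the same cross-tabulations can be expressed as row proportions; for example, for the age groups (a sketch, not part of the original output):
prop.table(table(age_grouped, dataset$dm), margin = 1)  # share of diabetics within each age group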
g= ggplot(dataset, aes(x= dm, y= chol)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= stab.glu)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= hdl)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= ratio)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= glyhb)) + geom_boxplot()
ggplotly(g)
g= ggplot(dataset, aes(x= dm, y= time.ppn)) + geom_boxplot()
ggplotly(g)
Model Fitting
1. Logistic Regression
m = glm(dm~age+gender+chol+hdl+bp.1s+bp.1d+weight+height+location+frame+insurance+smoking,data=traindataset,family = "binomial")
summary(m)
##
## Call:
## glm(formula = dm ~ age + gender + chol + hdl + bp.1s + bp.1d +
## weight + height + location + frame + insurance + smoking,
## family = "binomial", data = traindataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -1.5325 -0.5426 -0.3289 -0.1708 2.8638
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -13.649385 6.426155 -2.124 0.03367 *
## age 0.063835 0.021085 3.028 0.00247 **
## gendermale -0.944296 0.727032 -1.299 0.19400
## chol 0.006568 0.005479 1.199 0.23059
## hdl -0.011127 0.014402 -0.773 0.43976
## bp.1s 0.014793 0.013692 1.080 0.27994
## bp.1d 0.006475 0.023291 0.278 0.78101
## weight 0.003373 0.007754 0.435 0.66351
## height 0.091367 0.092115 0.992 0.32125
## locationLouisa 0.216891 0.476877 0.455 0.64924
## framemedium -0.638790 0.573199 -1.114 0.26509
## framesmall -0.544748 0.817559 -0.666 0.50521
## insurance1 -0.799112 0.587902 -1.359 0.17406
## insurance2 -0.661964 0.561805 -1.178 0.23869
## smoking2 -0.355577 0.552521 -0.644 0.51986
## smoking3 -0.895669 0.830791 -1.078 0.28099
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 168.10 on 196 degrees of freedom
## Residual deviance: 130.61 on 181 degrees of freedom
## AIC: 162.61
##
## Number of Fisher Scoring iterations: 6
confint(m)
## Waiting for profiling to be done...
## 2.5 % 97.5 %
## (Intercept) -27.087976132 -1.77075876
## age 0.024099054 0.10743089
## gendermale -2.437681236 0.43180036
## chol -0.004308266 0.01740155
## hdl -0.040196116 0.01687274
## bp.1s -0.012052360 0.04217983
## bp.1d -0.040011049 0.05215005
## weight -0.012300799 0.01851679
## height -0.081149397 0.28088390
## locationLouisa -0.709258326 1.17513108
## framemedium -1.771159664 0.49301716
## framesmall -2.197564279 1.04015724
## insurance1 -1.991812022 0.34000281
## insurance2 -1.790693544 0.43277951
## smoking2 -1.435334073 0.75466712
## smoking3 -2.636100985 0.66970170
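The coefficients are on the log-odds scale, so it can help to exponentiate them together with their profile-likelihood limits and read them as odds ratios; a small sketch, not part of the original output:
exp(cbind(OR = coef(m), confint(m)))  # odds ratios with 95% confidence intervals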
# Full model with all predictors included. Note that glyhb largely determines the diabetes diagnosis
# (the C5.0 tree later splits only on glyhb), so this model separates the classes almost perfectly,
# which is likely what triggers the convergence warnings below.
model = glm(dm~., data=traindataset,family = "binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(model)
##
## Call: glm(formula = dm ~ ., family = "binomial", data = traindataset)
##
## Coefficients:
## (Intercept) id locationLouisa gendermale framemedium
## -2.397e+02 -2.361e-04 -6.058e+00 -9.029e+00 -6.722e+00
## framesmall insurance1 insurance2 fh1 smoking2
## -4.828e+00 -1.154e+01 -8.058e+00 -1.129e+01 8.381e+00
## smoking3 X chol stab.glu hdl
## -1.635e+01 1.029e-01 5.252e-02 1.003e-01 -1.942e-01
## ratio glyhb age height weight
## 2.670e-01 2.183e+01 1.096e-01 1.164e+00 -6.525e-03
## bp.1s bp.1d bp.2s bp.2d waist
## -2.145e-01 7.178e-01 3.928e-01 -8.166e-01 -7.662e-01
## hip time.ppn
## 1.483e-01 -1.089e-02
##
## Degrees of Freedom: 196 Total (i.e. Null); 170 Residual
## Null Deviance: 168.1
## Residual Deviance: 5.968e-09 AIC: 54
print(summary(model))
##
## Call:
## glm(formula = dm ~ ., family = "binomial", data = traindataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.437e-05 -2.110e-08 -2.110e-08 -2.110e-08 2.537e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.397e+02 1.380e+06 0.000 1
## id -2.361e-04 1.020e+01 0.000 1
## locationLouisa -6.058e+00 5.807e+04 0.000 1
## gendermale -9.029e+00 1.389e+05 0.000 1
## framemedium -6.722e+00 8.774e+04 0.000 1
## framesmall -4.828e+00 2.011e+05 0.000 1
## insurance1 -1.154e+01 9.386e+04 0.000 1
## insurance2 -8.058e+00 1.423e+05 0.000 1
## fh1 -1.129e+01 1.928e+05 0.000 1
## smoking2 8.381e+00 1.285e+05 0.000 1
## smoking3 -1.635e+01 1.622e+05 0.000 1
## X 1.029e-01 1.139e+03 0.000 1
## chol 5.252e-02 1.148e+03 0.000 1
## stab.glu 1.003e-01 1.278e+03 0.000 1
## hdl -1.942e-01 5.710e+03 0.000 1
## ratio 2.670e-01 5.063e+04 0.000 1
## glyhb 2.183e+01 4.167e+04 0.001 1
## age 1.096e-01 3.418e+03 0.000 1
## height 1.164e+00 1.327e+04 0.000 1
## weight -6.525e-03 2.232e+03 0.000 1
## bp.1s -2.145e-01 3.923e+03 0.000 1
## bp.1d 7.178e-01 4.824e+03 0.000 1
## bp.2s 3.928e-01 3.310e+03 0.000 1
## bp.2d -8.166e-01 5.528e+03 0.000 1
## waist -7.662e-01 1.325e+04 0.000 1
## hip 1.483e-01 2.392e+04 0.000 1
## time.ppn -1.089e-02 1.284e+02 0.000 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.6810e+02 on 196 degrees of freedom
## Residual deviance: 5.9683e-09 on 170 degrees of freedom
## AIC: 54
##
## Number of Fisher Scoring iterations: 25
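The framework also calls for applying the model and evaluating it; a hedged sketch of how the first logistic model m could be assessed on the validation set, using an illustrative 0.5 probability cutoff (probs and predlog are names introduced here):
probs = predict(m, Validdataset, type = "response")        # predicted probability of dm = yes
predlog = factor(ifelse(probs > 0.5, "yes", "no"),
                 levels = levels(Validdataset$dm))          # arbitrary 0.5 cutoff for illustration
confusionMatrix(predlog, Validdataset$dm)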
2. Naive Bayes
model = naiveBayes(dm~., data=traindataset)
print(model)
##
## Naive Bayes Classifier for Discrete Predictors
##
## Call:
## naiveBayes.default(x = X, y = Y, laplace = laplace)
##
## A-priori probabilities:
## Y
## no yes
## 0.8477157 0.1522843
##
## Conditional probabilities:
## id
## Y [,1] [,2]
## no 16383.43 12635.41
## yes 18290.10 14191.18
##
## location
## Y Buckingham Louisa
## no 0.4550898 0.5449102
## yes 0.4666667 0.5333333
##
## gender
## Y female male
## no 0.5748503 0.4251497
## yes 0.6666667 0.3333333
##
## frame
## Y large medium small
## no 0.1796407 0.5628743 0.2574850
## yes 0.3666667 0.4666667 0.1666667
##
## insurance
## Y 0 1 2
## no 0.2514970 0.4131737 0.3353293
## yes 0.3666667 0.2666667 0.3666667
##
## fh
## Y 0 1
## no 0.8562874 0.1437126
## yes 0.7000000 0.3000000
##
## smoking
## Y 1 2 3
## no 0.2634731 0.5928144 0.1437126
## yes 0.3000000 0.5666667 0.1333333
##
## X
## Y [,1] [,2]
## no 203.006 121.5149
## yes 213.400 125.6455
##
## chol
## Y [,1] [,2]
## no 202.8434 40.81699
## yes 226.5667 48.65337
##
## stab.glu
## Y [,1] [,2]
## no 91.73653 31.90407
## yes 195.76667 81.98389
##
## hdl
## Y [,1] [,2]
## no 49.46973 16.22697
## yes 50.16667 18.89916
##
## ratio
## Y [,1] [,2]
## no 4.425279 1.412590
## yes 4.990000 1.773823
##
## glyhb
## Y [,1] [,2]
## no 4.759749 0.725817
## yes 10.442333 2.542690
##
## age
## Y [,1] [,2]
## no 43.92216 14.61432
## yes 58.46667 13.79088
##
## height
## Y [,1] [,2]
## no 66.01820 4.107696
## yes 66.00067 3.904023
##
## weight
## Y [,1] [,2]
## no 176.5449 40.12987
## yes 183.0197 33.24374
##
## bp.1s
## Y [,1] [,2]
## no 134.7648 21.66039
## yes 150.8667 21.37955
##
## bp.1d
## Y [,1] [,2]
## no 82.93392 13.14919
## yes 85.83333 13.93849
##
## bp.2s
## Y [,1] [,2]
## no 151.5862 15.23349
## yes 155.1915 13.46084
##
## bp.2d
## Y [,1] [,2]
## no 92.24419 6.785390
## yes 92.16241 7.349268
##
## waist
## Y [,1] [,2]
## no 37.2988 5.249728
## yes 39.5000 5.787918
##
## hip
## Y [,1] [,2]
## no 42.81461 5.551111
## yes 43.90000 5.610028
##
## time.ppn
## Y [,1] [,2]
## no 307.3129 298.0683
## yes 388.0000 358.0951
print(summary(model))
## Length Class Mode
## apriori 2 table numeric
## tables 23 -none- list
## levels 2 -none- character
## isnumeric 23 -none- logical
## call 4 -none- call
Model Prediction Naive Bayes
pred = predict(model,Validdataset)
print(head(pred))
## [1] yes no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm
## 3 660 yes
## 4 330 no
## 6 90 no
## 7 390 no
## 8 420 no
## 10 15 no
Model Evaluation Naive Bayes
confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 72 0
## yes 3 10
##
## Accuracy : 0.9647
## 95% CI : (0.9003, 0.9927)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 0.007428
##
## Kappa : 0.8496
##
## Mcnemar's Test P-Value : 0.248213
##
## Sensitivity : 0.9600
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.7692
## Prevalence : 0.8824
## Detection Rate : 0.8471
## Detection Prevalence : 0.8471
## Balanced Accuracy : 0.9800
##
## 'Positive' Class : no
##
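Note that confusionMatrix takes the first factor level, no, as the positive class by default, which is why sensitivity above refers to the non-diabetic class. To report sensitivity and specificity for the diabetic class instead, the positive argument can be set explicitly:
confusionMatrix(Validdataset$predictdm, Validdataset$dm, positive = "yes")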
Model Deployment
id=20782
location='Buckingham'
gender='male'
frame='small'
insurance='1'
fh='0'
smoking='2'
X=324
chol=180
stab.glu=92
hdl=34
ratio=5.30
glyhb=3.58
age=63
height=69
weight=169
bp.1s=145
bp.1d=72
bp.2s=142
bp.2d=70
waist=35
hip=39
time.ppn=30
newdataset=data.frame(id,location, gender, frame, insurance, fh, smoking, X, chol, stab.glu, hdl, ratio, glyhb, age, height, weight, bp.1s, bp.1d, bp.2s, bp.2d, waist, hip, time.ppn)
predict(model,newdataset)
## [1] no
## Levels: no yes
predictionnew=predict(model,newdataset)
print(predictionnew)
## [1] no
## Levels: no yes
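For deployment it may be more useful to return class probabilities rather than a hard label; predict.naiveBayes supports this via type = "raw" (a sketch):
predict(model, newdataset, type = "raw")  # posterior probabilities for no / yes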
3. Model Fitting C50
modelC5 = C5.0(dm~.,data=traindataset)
print(summary(modelC5))
##
## Call:
## C5.0.formula(formula = dm ~ ., data = traindataset)
##
##
## C5.0 [Release 2.07 GPL Edition] Fri Sep 04 14:18:58 2020
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 197 cases (24 attributes) from undefined.data
##
## Decision tree:
##
## glyhb <= 6.97: no (167)
## glyhb > 6.97: yes (30)
##
##
## Evaluation on training data (197 cases):
##
## Decision Tree
## ----------------
## Size Errors
##
## 2 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 167 (a): class no
## 30 (b): class yes
##
##
## Attribute usage:
##
## 100.00% glyhb
##
##
## Time: 0.0 secs
plot(modelC5)
Model Prediction C50
pred = predict(modelC5,Validdataset)
print(head(pred))
## [1] no no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm
## 3 660 no
## 4 330 no
## 6 90 no
## 7 390 no
## 8 420 no
## 10 15 no
Model Evaluation C50
confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 75 0
## yes 0 10
##
## Accuracy : 1
## 95% CI : (0.9575, 1)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 2.397e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8824
## Detection Rate : 0.8824
## Detection Prevalence : 0.8824
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
4. Model Fitting CTREE
modelCTREE = ctree(dm~.,data=traindataset)
plot(modelCTREE)
Model Prediction CTREE
pred = predict(modelCTREE,Validdataset)
print(head(pred))
## [1] no no no no no no
## Levels: no yes
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm
## 3 660 no
## 4 330 no
## 6 90 no
## 7 390 no
## 8 420 no
## 10 15 no
Model Evaluation CTREE
confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 75 0
## yes 0 10
##
## Accuracy : 1
## 95% CI : (0.9575, 1)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 2.397e-05
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.8824
## Detection Rate : 0.8824
## Detection Prevalence : 0.8824
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : no
##
5. Model Fitting RPART
modelRpart = rpart(dm~.,data=traindataset)
rpart.plot(modelRpart)
Model Prediction RPART
pred = predict(modelRpart,Validdataset)
print(head(pred))
## no yes
## 3 1 0
## 4 1 0
## 6 1 0
## 7 1 0
## 8 1 0
## 10 1 0
Validdataset$predictdm = pred
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm.no predictdm.yes
## 3 660 1 0
## 4 330 1 0
## 6 90 1 0
## 7 390 1 0
## 8 420 1 0
## 10 15 1 0
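predict.rpart returns class probabilities by default for a classification tree, which is why predictdm was stored as the two probability columns above. To evaluate this model like the others, class labels can be requested instead; a sketch along those lines (not part of the original output):
predclass = predict(modelRpart, Validdataset, type = "class")  # hard class labels instead of probabilities
confusionMatrix(predclass, Validdataset$dm)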
6. Model Fitting Discriminant Analysis
modeldisc = lda(dm~.,data=traindataset)
print(modeldisc)
## Call:
## lda(dm ~ ., data = traindataset)
##
## Prior probabilities of groups:
## no yes
## 0.8477157 0.1522843
##
## Group means:
## id locationLouisa gendermale framemedium framesmall insurance1
## no 16383.43 0.5449102 0.4251497 0.5628743 0.2574850 0.4131737
## yes 18290.10 0.5333333 0.3333333 0.4666667 0.1666667 0.2666667
## insurance2 fh1 smoking2 smoking3 X chol stab.glu
## no 0.3353293 0.1437126 0.5928144 0.1437126 203.006 202.8434 91.73653
## yes 0.3666667 0.3000000 0.5666667 0.1333333 213.400 226.5667 195.76667
## hdl ratio glyhb age height weight bp.1s bp.1d
## no 49.46973 4.425279 4.759749 43.92216 66.01820 176.5449 134.7648 82.93392
## yes 50.16667 4.990000 10.442333 58.46667 66.00067 183.0197 150.8667 85.83333
## bp.2s bp.2d waist hip time.ppn
## no 151.5862 92.24419 37.2988 42.81461 307.3129
## yes 155.1915 92.16241 39.5000 43.90000 388.0000
##
## Coefficients of linear discriminants:
## LD1
## id -3.106325e-05
## locationLouisa -4.576866e-02
## gendermale -4.472582e-01
## framemedium -3.195063e-01
## framesmall -1.809960e-01
## insurance1 1.907870e-01
## insurance2 1.716758e-01
## fh1 1.048499e-01
## smoking2 4.823668e-02
## smoking3 -5.752476e-02
## X 3.847322e-03
## chol -7.553135e-03
## stab.glu 3.682277e-03
## hdl 2.088906e-02
## ratio 2.081914e-01
## glyhb 8.017335e-01
## age 3.317806e-03
## height 5.300882e-03
## weight -2.044322e-03
## bp.1s 9.111621e-03
## bp.1d 2.816918e-03
## bp.2s -4.819227e-03
## bp.2d 8.394225e-03
## waist 3.914013e-02
## hip -2.736845e-02
## time.ppn 3.869986e-04
partimat(dm~age+gender+chol+hdl,data=traindataset)
partimat(dm~bp.1s+bp.1d+weight+height,data=traindataset)
partimat(dm~insurance+smoking,data=traindataset)
partimat(dm~location+frame,data=traindataset)
Model Prediction Discriminant Analysis
pred = predict(modeldisc,Validdataset)
Validdataset$predictdm = pred$class
print(head(Validdataset))
## id dm location gender frame insurance fh smoking X chol stab.glu hdl
## 3 10016 no Buckingham female large 0 0 2 129 228 115 61
## 4 20325 no Louisa female medium 1 0 1 299 243 74 42
## 6 15520 no Louisa male medium 0 0 2 187 174 90 36
## 7 20350 no Louisa female small 2 0 2 307 164 91 67
## 8 4758 no Louisa female medium 1 0 1 85 185 84 52
## 10 40773 no Louisa female small 2 1 1 362 212 88 36
## ratio glyhb age height weight bp.1s bp.1d bp.2s bp.2d waist hip
## 3 3.7 6.39 71 63 244 170 92 152.383 92.52482 48 51
## 4 5.8 3.85 43 64 239 128 90 138.000 90.00000 48 53
## 6 4.8 5.35 34 71 210 142 92 148.000 98.00000 37 43
## 7 2.4 3.97 20 70 141 122 86 152.383 92.52482 32 39
## 8 3.6 5.28 53 61 145 147 72 152.383 92.52482 37 40
## 10 5.9 5.22 37 64 160 124 82 152.383 92.52482 37 45
## time.ppn predictdm
## 3 660 no
## 4 330 no
## 6 90 no
## 7 390 no
## 8 420 no
## 10 15 no
Model Evaluation Discriminant Analysis
confusionMatrix(Validdataset$predictdm, Validdataset$dm)
## Confusion Matrix and Statistics
##
## Reference
## Prediction no yes
## no 75 1
## yes 0 9
##
## Accuracy : 0.9882
## 95% CI : (0.9362, 0.9997)
## No Information Rate : 0.8824
## P-Value [Acc > NIR] : 0.0002956
##
## Kappa : 0.9408
##
## Mcnemar's Test P-Value : 1.0000000
##
## Sensitivity : 1.0000
## Specificity : 0.9000
## Pos Pred Value : 0.9868
## Neg Pred Value : 1.0000
## Prevalence : 0.8824
## Detection Rate : 0.8824
## Detection Prevalence : 0.8941
## Balanced Accuracy : 0.9500
##
## 'Positive' Class : no
##