Data Preparation
Diabetes Dataset
dataset=read.csv("Diabetes_cleaned.csv",header=TRUE)
print(head(dataset))
## X id stabglu location age gender insurance fh smoking chol hdl
## 1 49 2750 85 Buckingham 40 female Private Yes Never smoked 169 51
## 2 321 20773 71 Louisa 45 male Govt. No Never smoked 203 78
## 3 153 13500 112 Louisa 82 male Private No Never smoked 255 34
## 4 74 3751 84 Buckingham 40 female Govt. No Never smoked 180 69
## 5 228 16001 82 Buckingham 51 female None No Current smoker 222 87
## 6 146 12766 97 Buckingham 52 male Private No Never smoked 171 69
## ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1 3.3 6.14 65 180 106 82 40 44 no
## 2 2.6 2.85 66 115 135 88 30 34 no
## 3 7.5 5.60 66 163 179 89 37 43 no
## 4 2.6 5.20 68 264 142 98 43 54 no
## 5 2.6 4.64 66 110 150 110 28 37 no
## 6 2.5 4.04 71 159 125 72 33 39 no
Converting categorical variables to factors
dataset$location=as.factor(dataset$location)
dataset$Diabetic=as.factor(dataset$Diabetic)
dataset$gender=as.factor(dataset$gender)
Data Exploration & Visualization
Plotting dependent variable against quantitative independent variables
Cholesterol
plot1=ggplot(data=dataset,aes(x=Diabetic,y=chol))+geom_boxplot(fill="red")
ggplotly(plot1)
High Density Lipoprotein
plot2=ggplot(data=dataset,aes(x=Diabetic,y=hdl))+geom_boxplot(fill="blue")
ggplotly(plot2)
Cholesterol/hdl ratio
plot3=ggplot(data=dataset,aes(x=Diabetic,y=ratio))+geom_boxplot(fill="green")
ggplotly(plot3)
Glycosolated Haemoglobin
plot4=ggplot(data=dataset,aes(x=Diabetic,y=glyhb))+geom_boxplot(fill="orange")
ggplotly(plot4)
Stabilized Glucose
plot11=ggplot(data=dataset,aes(x=Diabetic,y=stabglu))+geom_boxplot(fill="orange")
ggplotly(plot11)
Height
plot5=ggplot(data=dataset,aes(x=Diabetic,y=height))+geom_boxplot(fill="purple")
ggplotly(plot5)
Weight
plot6=ggplot(data=dataset,aes(x=Diabetic,y=weight))+geom_boxplot(fill="yellow")
ggplotly(plot6)
Blood Pressure (Systolic)
plot7=ggplot(data=dataset,aes(x=Diabetic,y=bp.1s))+geom_boxplot(fill="maroon")
ggplotly(plot7)
Blood Pressure (Dystolic)
plot8=ggplot(data=dataset,aes(x=Diabetic,y=bp.1d))+geom_boxplot(fill="maroon")
ggplotly(plot8)
Hips
plot9=ggplot(data=dataset,aes(x=Diabetic,y=hips))+geom_boxplot(fill="red")
ggplotly(plot9)
Age
plot10=ggplot(data=dataset,aes(x=Diabetic,y=age))+geom_boxplot(fill="red")
ggplotly(plot10)
Plotting dependent variable against qualitative independent variables
Gender
assocplot(table(dataset$gender,dataset$Diabetic), xlab="gender",ylab="Diabetic",col=c("green","red"))
Location
assocplot(table(dataset$location,dataset$Diabetic), xlab="location",ylab="Diabetic",col=c("green","red"))
Family History
assocplot(table(dataset$fh,dataset$Diabetic), xlab="fh",ylab="Diabetic",col=c("green","red"))
Insurance
assocplot(table(dataset$insurance,dataset$Diabetic), xlab="insurance",ylab="Diabetic",col=c("green","red"))
Smoking
assocplot(table(dataset$smoking,dataset$Diabetic), xlab="smoking",ylab="Diabetic",col=c("green","red"))
Dividing Data into Training and Validation dataset
trainrows=sample(row.names(dataset),nrow(dataset)*0.7)
traindataset=dataset[trainrows,]
validrows=setdiff(row.names(dataset),trainrows)
validdataset=dataset[validrows,]
print(head(traindataset))
## X id stabglu location age gender insurance fh smoking chol
## 177 396 41500 85 Louisa 37 male None No Never smoked 179
## 52 186 15519 106 Louisa 65 female None No Current smoker 219
## 85 208 15792 81 Buckingham 64 female Govt. No Ex-smoker 202
## 231 253 17794 92 Buckingham 27 female None Yes Current smoker 241
## 155 31 1301 101 Buckingham 42 female Govt. No Never smoked 177
## 139 185 15518 79 Louisa 75 male None No Current smoker 205
## hdl ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 177 50 3.6 4.99 66 136 190 94 33 39 no
## 52 50 4.4 4.56 63 233 140 90 40 53 no
## 85 55 3.7 5.50 62 167 190 118 44 47 no
## 231 40 6.0 5.04 63 179 120 75 40 42 no
## 155 36 4.9 5.11 65 174 146 94 37 40 no
## 139 32 6.4 4.21 69 204 136 90 44 42 no
print(head(validdataset))
## X id stabglu location age gender insurance fh smoking chol
## 1 49 2750 85 Buckingham 40 female Private Yes Never smoked 169
## 3 153 13500 112 Louisa 82 male Private No Never smoked 255
## 4 74 3751 84 Buckingham 40 female Govt. No Never smoked 180
## 11 24 1256 92 Buckingham 66 female Govt. No Current smoker 281
## 16 110 4808 80 Buckingham 68 male Private No Ex-smoker 218
## 17 20 1250 206 Buckingham 62 female None No Never smoked 196
## hdl ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1 51 3.3 6.140000 65 180 106 82 40 44 no
## 3 34 7.5 5.600000 66 163 179 89 37 43 no
## 4 69 2.6 5.200000 68 264 142 98 43 54 no
## 11 41 6.9 5.560000 62 185 158 88 48 44 no
## 16 71 3.1 5.589769 70 170 130 73 37 42 no
## 17 41 4.8 11.240000 65 196 178 90 46 51 yes
Creating Dummy Variable
df=data.frame(model.matrix(~0+Diabetic,data=traindataset))
traindataset$Diabetic=df$Diabeticyes
Data Modelling
Fitting Binary Logistic Regression model into the training dataset
model=glm(Diabetic~.,data=traindataset,family="binomial")
## Warning: glm.fit: algorithm did not converge
## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred
print(summary(model))
##
## Call:
## glm(formula = Diabetic ~ ., family = "binomial", data = traindataset)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -2.742e-05 -2.110e-08 -2.110e-08 -2.110e-08 2.754e-05
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.381e+02 9.780e+05 0.000 1.000
## X 1.360e-01 6.950e+02 0.000 1.000
## id -1.838e-03 6.784e+00 0.000 1.000
## stabglu 8.969e-02 7.954e+02 0.000 1.000
## locationLouisa 1.344e+01 1.077e+05 0.000 1.000
## age 6.004e-01 3.547e+03 0.000 1.000
## gendermale -1.230e+01 7.023e+04 0.000 1.000
## insuranceNone 1.336e+01 7.770e+04 0.000 1.000
## insurancePrivate 8.532e+00 5.480e+04 0.000 1.000
## fhYes -4.890e+00 5.142e+04 0.000 1.000
## smokingEx-smoker 1.101e+01 1.473e+05 0.000 1.000
## smokingNever smoked -2.398e+00 8.346e+04 0.000 1.000
## chol -8.394e-02 1.376e+03 0.000 1.000
## hdl 2.498e-01 7.927e+03 0.000 1.000
## ratio 8.776e-01 5.472e+04 0.000 1.000
## glyhb 2.917e+01 3.506e+04 0.001 0.999
## height -2.009e-01 1.235e+04 0.000 1.000
## weight 1.629e-01 2.563e+03 0.000 1.000
## bp.1s -4.617e-02 2.287e+03 0.000 1.000
## bp.1d -8.492e-02 2.814e+03 0.000 1.000
## waist -3.317e-01 1.235e+04 0.000 1.000
## hips -1.199e-01 1.316e+04 0.000 1.000
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 1.8123e+02 on 196 degrees of freedom
## Residual deviance: 9.2645e-09 on 175 degrees of freedom
## AIC: 44
##
## Number of Fisher Scoring iterations: 25
Applying the BLR model to validation dataset to get the predictions
pred=predict(model,validdataset)
print(head(pred))
## 1 3 4 11 16 17
## -37.20954 -38.51148 -55.97305 -52.67719 -27.41995 136.47226
validdataset$DiabetesPrediction=pred
print(head(validdataset))
## X id stabglu location age gender insurance fh smoking chol
## 1 49 2750 85 Buckingham 40 female Private Yes Never smoked 169
## 3 153 13500 112 Louisa 82 male Private No Never smoked 255
## 4 74 3751 84 Buckingham 40 female Govt. No Never smoked 180
## 11 24 1256 92 Buckingham 66 female Govt. No Current smoker 281
## 16 110 4808 80 Buckingham 68 male Private No Ex-smoker 218
## 17 20 1250 206 Buckingham 62 female None No Never smoked 196
## hdl ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1 51 3.3 6.140000 65 180 106 82 40 44 no
## 3 34 7.5 5.600000 66 163 179 89 37 43 no
## 4 69 2.6 5.200000 68 264 142 98 43 54 no
## 11 41 6.9 5.560000 62 185 158 88 48 44 no
## 16 71 3.1 5.589769 70 170 130 73 37 42 no
## 17 41 4.8 11.240000 65 196 178 90 46 51 yes
## DiabetesPrediction
## 1 -37.20954
## 3 -38.51148
## 4 -55.97305
## 11 -52.67719
## 16 -27.41995
## 17 136.47226
Fitting RPART model
modelRpart=rpart(Diabetic~.,data=traindataset)
rpart.plot(modelRpart)
Applying the RPART model to validation dataset to get the predictions
pred=predict(modelRpart,validdataset)
print(head(pred))
## 1 3 4 11 16 17
## 0 0 0 0 0 1
validdataset$DiabetesPrediction = pred
print(head(validdataset))
## X id stabglu location age gender insurance fh smoking chol
## 1 49 2750 85 Buckingham 40 female Private Yes Never smoked 169
## 3 153 13500 112 Louisa 82 male Private No Never smoked 255
## 4 74 3751 84 Buckingham 40 female Govt. No Never smoked 180
## 11 24 1256 92 Buckingham 66 female Govt. No Current smoker 281
## 16 110 4808 80 Buckingham 68 male Private No Ex-smoker 218
## 17 20 1250 206 Buckingham 62 female None No Never smoked 196
## hdl ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1 51 3.3 6.140000 65 180 106 82 40 44 no
## 3 34 7.5 5.600000 66 163 179 89 37 43 no
## 4 69 2.6 5.200000 68 264 142 98 43 54 no
## 11 41 6.9 5.560000 62 185 158 88 48 44 no
## 16 71 3.1 5.589769 70 170 130 73 37 42 no
## 17 41 4.8 11.240000 65 196 178 90 46 51 yes
## DiabetesPrediction
## 1 0
## 3 0
## 4 0
## 11 0
## 16 0
## 17 1