Predicting Diabetes via Binary Logistic Regression and RPART models

Data Preparation

Diabetes Dataset

dataset=read.csv("Diabetes_cleaned.csv",header=TRUE)
print(head(dataset))

##     X    id stabglu   location age gender insurance  fh        smoking chol hdl
## 1  49  2750      85 Buckingham  40 female   Private Yes   Never smoked  169  51
## 2 321 20773      71     Louisa  45   male     Govt.  No   Never smoked  203  78
## 3 153 13500     112     Louisa  82   male   Private  No   Never smoked  255  34
## 4  74  3751      84 Buckingham  40 female     Govt.  No   Never smoked  180  69
## 5 228 16001      82 Buckingham  51 female      None  No Current smoker  222  87
## 6 146 12766      97 Buckingham  52   male   Private  No   Never smoked  171  69
##   ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1   3.3  6.14     65    180   106    82    40   44       no
## 2   2.6  2.85     66    115   135    88    30   34       no
## 3   7.5  5.60     66    163   179    89    37   43       no
## 4   2.6  5.20     68    264   142    98    43   54       no
## 5   2.6  4.64     66    110   150   110    28   37       no
## 6   2.5  4.04     71    159   125    72    33   39       no

Converting categorical variables to factors

dataset$location=as.factor(dataset$location)
dataset$Diabetic=as.factor(dataset$Diabetic)
dataset$gender=as.factor(dataset$gender)

Data Exploration & Visualization

Plotting dependent variable against quantitative independent variables

Cholesterol

plot1=ggplot(data=dataset,aes(x=Diabetic,y=chol))+geom_boxplot(fill="red")
ggplotly(plot1)

High Density Lipoprotein

plot2=ggplot(data=dataset,aes(x=Diabetic,y=hdl))+geom_boxplot(fill="blue")
ggplotly(plot2)

Cholesterol/hdl ratio

plot3=ggplot(data=dataset,aes(x=Diabetic,y=ratio))+geom_boxplot(fill="green")
ggplotly(plot3)

Glycosolated Haemoglobin

plot4=ggplot(data=dataset,aes(x=Diabetic,y=glyhb))+geom_boxplot(fill="orange")
ggplotly(plot4)

Stabilized Glucose

plot11=ggplot(data=dataset,aes(x=Diabetic,y=stabglu))+geom_boxplot(fill="orange")
ggplotly(plot11)

Height

plot5=ggplot(data=dataset,aes(x=Diabetic,y=height))+geom_boxplot(fill="purple")
ggplotly(plot5)

Weight

plot6=ggplot(data=dataset,aes(x=Diabetic,y=weight))+geom_boxplot(fill="yellow")
ggplotly(plot6)

Blood Pressure (Systolic)

plot7=ggplot(data=dataset,aes(x=Diabetic,y=bp.1s))+geom_boxplot(fill="maroon")
ggplotly(plot7)

Blood Pressure (Dystolic)

plot8=ggplot(data=dataset,aes(x=Diabetic,y=bp.1d))+geom_boxplot(fill="maroon")
ggplotly(plot8)

Hips

plot9=ggplot(data=dataset,aes(x=Diabetic,y=hips))+geom_boxplot(fill="red")
ggplotly(plot9)

Age

plot10=ggplot(data=dataset,aes(x=Diabetic,y=age))+geom_boxplot(fill="red")
ggplotly(plot10)

Plotting dependent variable against qualitative independent variables

Gender

assocplot(table(dataset$gender,dataset$Diabetic), xlab="gender",ylab="Diabetic",col=c("green","red"))

Location

assocplot(table(dataset$location,dataset$Diabetic), xlab="location",ylab="Diabetic",col=c("green","red"))

Family History

assocplot(table(dataset$fh,dataset$Diabetic), xlab="fh",ylab="Diabetic",col=c("green","red"))

Insurance

assocplot(table(dataset$insurance,dataset$Diabetic), xlab="insurance",ylab="Diabetic",col=c("green","red"))

Smoking

assocplot(table(dataset$smoking,dataset$Diabetic), xlab="smoking",ylab="Diabetic",col=c("green","red"))

Dividing Data into Training and Validation dataset

trainrows=sample(row.names(dataset),nrow(dataset)*0.7)
traindataset=dataset[trainrows,]
validrows=setdiff(row.names(dataset),trainrows)
validdataset=dataset[validrows,]
print(head(traindataset))

##       X    id stabglu   location age gender insurance  fh        smoking chol
## 177 396 41500      85     Louisa  37   male      None  No   Never smoked  179
## 52  186 15519     106     Louisa  65 female      None  No Current smoker  219
## 85  208 15792      81 Buckingham  64 female     Govt.  No      Ex-smoker  202
## 231 253 17794      92 Buckingham  27 female      None Yes Current smoker  241
## 155  31  1301     101 Buckingham  42 female     Govt.  No   Never smoked  177
## 139 185 15518      79     Louisa  75   male      None  No Current smoker  205
##     hdl ratio glyhb height weight bp.1s bp.1d waist hips Diabetic
## 177  50   3.6  4.99     66    136   190    94    33   39       no
## 52   50   4.4  4.56     63    233   140    90    40   53       no
## 85   55   3.7  5.50     62    167   190   118    44   47       no
## 231  40   6.0  5.04     63    179   120    75    40   42       no
## 155  36   4.9  5.11     65    174   146    94    37   40       no
## 139  32   6.4  4.21     69    204   136    90    44   42       no

print(head(validdataset))

##      X    id stabglu   location age gender insurance  fh        smoking chol
## 1   49  2750      85 Buckingham  40 female   Private Yes   Never smoked  169
## 3  153 13500     112     Louisa  82   male   Private  No   Never smoked  255
## 4   74  3751      84 Buckingham  40 female     Govt.  No   Never smoked  180
## 11  24  1256      92 Buckingham  66 female     Govt.  No Current smoker  281
## 16 110  4808      80 Buckingham  68   male   Private  No      Ex-smoker  218
## 17  20  1250     206 Buckingham  62 female      None  No   Never smoked  196
##    hdl ratio     glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1   51   3.3  6.140000     65    180   106    82    40   44       no
## 3   34   7.5  5.600000     66    163   179    89    37   43       no
## 4   69   2.6  5.200000     68    264   142    98    43   54       no
## 11  41   6.9  5.560000     62    185   158    88    48   44       no
## 16  71   3.1  5.589769     70    170   130    73    37   42       no
## 17  41   4.8 11.240000     65    196   178    90    46   51      yes

Creating Dummy Variable

df=data.frame(model.matrix(~0+Diabetic,data=traindataset))
traindataset$Diabetic=df$Diabeticyes

Data Modelling

Fitting Binary Logistic Regression model into the training dataset

model=glm(Diabetic~.,data=traindataset,family="binomial")

## Warning: glm.fit: algorithm did not converge

## Warning: glm.fit: fitted probabilities numerically 0 or 1 occurred

print(summary(model))

## 
## Call:
## glm(formula = Diabetic ~ ., family = "binomial", data = traindataset)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -2.742e-05  -2.110e-08  -2.110e-08  -2.110e-08   2.754e-05  
## 
## Coefficients:
##                       Estimate Std. Error z value Pr(>|z|)
## (Intercept)         -2.381e+02  9.780e+05   0.000    1.000
## X                    1.360e-01  6.950e+02   0.000    1.000
## id                  -1.838e-03  6.784e+00   0.000    1.000
## stabglu              8.969e-02  7.954e+02   0.000    1.000
## locationLouisa       1.344e+01  1.077e+05   0.000    1.000
## age                  6.004e-01  3.547e+03   0.000    1.000
## gendermale          -1.230e+01  7.023e+04   0.000    1.000
## insuranceNone        1.336e+01  7.770e+04   0.000    1.000
## insurancePrivate     8.532e+00  5.480e+04   0.000    1.000
## fhYes               -4.890e+00  5.142e+04   0.000    1.000
## smokingEx-smoker     1.101e+01  1.473e+05   0.000    1.000
## smokingNever smoked -2.398e+00  8.346e+04   0.000    1.000
## chol                -8.394e-02  1.376e+03   0.000    1.000
## hdl                  2.498e-01  7.927e+03   0.000    1.000
## ratio                8.776e-01  5.472e+04   0.000    1.000
## glyhb                2.917e+01  3.506e+04   0.001    0.999
## height              -2.009e-01  1.235e+04   0.000    1.000
## weight               1.629e-01  2.563e+03   0.000    1.000
## bp.1s               -4.617e-02  2.287e+03   0.000    1.000
## bp.1d               -8.492e-02  2.814e+03   0.000    1.000
## waist               -3.317e-01  1.235e+04   0.000    1.000
## hips                -1.199e-01  1.316e+04   0.000    1.000
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 1.8123e+02  on 196  degrees of freedom
## Residual deviance: 9.2645e-09  on 175  degrees of freedom
## AIC: 44
## 
## Number of Fisher Scoring iterations: 25

Applying the BLR model to validation dataset to get the predictions

pred=predict(model,validdataset)
print(head(pred))

##         1         3         4        11        16        17 
## -37.20954 -38.51148 -55.97305 -52.67719 -27.41995 136.47226

validdataset$DiabetesPrediction=pred
print(head(validdataset))

##      X    id stabglu   location age gender insurance  fh        smoking chol
## 1   49  2750      85 Buckingham  40 female   Private Yes   Never smoked  169
## 3  153 13500     112     Louisa  82   male   Private  No   Never smoked  255
## 4   74  3751      84 Buckingham  40 female     Govt.  No   Never smoked  180
## 11  24  1256      92 Buckingham  66 female     Govt.  No Current smoker  281
## 16 110  4808      80 Buckingham  68   male   Private  No      Ex-smoker  218
## 17  20  1250     206 Buckingham  62 female      None  No   Never smoked  196
##    hdl ratio     glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1   51   3.3  6.140000     65    180   106    82    40   44       no
## 3   34   7.5  5.600000     66    163   179    89    37   43       no
## 4   69   2.6  5.200000     68    264   142    98    43   54       no
## 11  41   6.9  5.560000     62    185   158    88    48   44       no
## 16  71   3.1  5.589769     70    170   130    73    37   42       no
## 17  41   4.8 11.240000     65    196   178    90    46   51      yes
##    DiabetesPrediction
## 1           -37.20954
## 3           -38.51148
## 4           -55.97305
## 11          -52.67719
## 16          -27.41995
## 17          136.47226

Fitting RPART model

modelRpart=rpart(Diabetic~.,data=traindataset)
rpart.plot(modelRpart)

Applying the RPART model to validation dataset to get the predictions

pred=predict(modelRpart,validdataset)
print(head(pred))

##  1  3  4 11 16 17 
##  0  0  0  0  0  1

validdataset$DiabetesPrediction = pred
print(head(validdataset))

##      X    id stabglu   location age gender insurance  fh        smoking chol
## 1   49  2750      85 Buckingham  40 female   Private Yes   Never smoked  169
## 3  153 13500     112     Louisa  82   male   Private  No   Never smoked  255
## 4   74  3751      84 Buckingham  40 female     Govt.  No   Never smoked  180
## 11  24  1256      92 Buckingham  66 female     Govt.  No Current smoker  281
## 16 110  4808      80 Buckingham  68   male   Private  No      Ex-smoker  218
## 17  20  1250     206 Buckingham  62 female      None  No   Never smoked  196
##    hdl ratio     glyhb height weight bp.1s bp.1d waist hips Diabetic
## 1   51   3.3  6.140000     65    180   106    82    40   44       no
## 3   34   7.5  5.600000     66    163   179    89    37   43       no
## 4   69   2.6  5.200000     68    264   142    98    43   54       no
## 11  41   6.9  5.560000     62    185   158    88    48   44       no
## 16  71   3.1  5.589769     70    170   130    73    37   42       no
## 17  41   4.8 11.240000     65    196   178    90    46   51      yes
##    DiabetesPrediction
## 1                   0
## 3                   0
## 4                   0
## 11                  0
## 16                  0
## 17                  1

Predicting Diabetes via Binary Logistic Regression and RPART models

Akshay Bhabhra

04/09/2020