Load data

data <- read.csv(file = "~/Dropbox/DKE/Machine Learning/Master Assignment/Chronic_Kidney_Disease/chronic_kidney_disease_full.csv",
                 stringsAsFactors = TRUE)  # R >= 4.0 no longer converts strings to factors by default

Data preprocessing

Change some variables from numeric to categorical (al and su are ordinal 0-5 scores, not true quantities)

require(Hmisc)  # for describe()

data$al <- as.factor(data$al)
data$su <- as.factor(data$su)
describe(data)
## data 
## 
##  25  Variables      400  Observations
## ---------------------------------------------------------------------------
## age 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      391        9       76    0.999    51.48    19.19     19.0     28.0 
##      .25      .50      .75      .90      .95 
##     42.0     55.0     64.5     71.0     74.5 
## 
## lowest :  2  3  4  5  6, highest: 80 81 82 83 90 
## ---------------------------------------------------------------------------
## bp 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      388       12       10     0.94    76.47    14.17       60       60 
##      .25      .50      .75      .90      .95 
##       70       80       80       90      100 
## 
## lowest :  50  60  70  80  90, highest: 100 110 120 140 180 
##                                                                       
## Value         50    60    70    80    90   100   110   120   140   180
## Frequency      5    71   112   116    53    25     3     1     1     1
## Proportion 0.013 0.183 0.289 0.299 0.137 0.064 0.008 0.003 0.003 0.003
## ---------------------------------------------------------------------------
## sg 
##        n  missing distinct     Info     Mean      Gmd 
##      353       47        5    0.938    1.017 0.006385 
## 
## lowest : 1.005 1.010 1.015 1.020 1.025
## highest: 1.005 1.010 1.015 1.020 1.025 
## 
## 1.005 (7, 0.020), 1.01 (84, 0.238), 1.015 (75, 0.212), 1.02 (106, 0.300),
## 1.025 (81, 0.229)
## ---------------------------------------------------------------------------
## al 
##        n  missing distinct 
##      354       46        6 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5 
## 
## 0 (199, 0.562), 1 (44, 0.124), 2 (43, 0.121), 3 (43, 0.121), 4 (24,
## 0.068), 5 (1, 0.003)
## ---------------------------------------------------------------------------
## su 
##        n  missing distinct 
##      351       49        6 
## 
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5 
## 
## 0 (290, 0.826), 1 (13, 0.037), 2 (18, 0.051), 3 (14, 0.040), 4 (13,
## 0.037), 5 (3, 0.009)
## ---------------------------------------------------------------------------
## rbc 
##        n  missing distinct 
##      248      152        2 
## 
## abnormal (47, 0.19), normal (201, 0.81)
## ---------------------------------------------------------------------------
## pc 
##        n  missing distinct 
##      335       65        2 
## 
## abnormal (76, 0.227), normal (259, 0.773)
## ---------------------------------------------------------------------------
## pcc 
##        n  missing distinct 
##      396        4        2 
## 
## notpresent (354, 0.894), present (42, 0.106)
## ---------------------------------------------------------------------------
## ba 
##        n  missing distinct 
##      396        4        2 
## 
## notpresent (374, 0.944), present (22, 0.056)
## ---------------------------------------------------------------------------
## bgr 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      356       44      146        1      148    76.04    78.75    86.50 
##      .25      .50      .75      .90      .95 
##    99.00   121.00   163.00   254.00   307.25 
## 
## lowest :  22  70  74  75  76, highest: 424 425 447 463 490 
## ---------------------------------------------------------------------------
## bu 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      381       19      118        1    57.43    46.22       17       19 
##      .25      .50      .75      .90      .95 
##       27       42       66      118      162 
## 
## lowest :   1.5  10.0  15.0  16.0  17.0
## highest: 235.0 241.0 309.0 322.0 391.0 
## ---------------------------------------------------------------------------
## sc 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      383       17       84    0.998    3.072    3.561     0.50     0.60 
##      .25      .50      .75      .90      .95 
##     0.90     1.30     2.80     6.78    11.89 
## 
## lowest :  0.4  0.5  0.6  0.7  0.8, highest: 18.1 24.0 32.0 48.1 76.0 
## ---------------------------------------------------------------------------
## sod 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      313       87       34    0.995    137.5    8.365      125      131 
##      .25      .50      .75      .90      .95 
##      135      138      142      146      150 
## 
## lowest :   4.5 104.0 111.0 113.0 114.0
## highest: 145.0 146.0 147.0 150.0 163.0 
## ---------------------------------------------------------------------------
## pot 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      312       88       40    0.997    4.627    1.322      3.4      3.5 
##      .25      .50      .75      .90      .95 
##      3.8      4.4      4.9      5.2      5.7 
## 
## lowest :  2.5  2.7  2.8  2.9  3.0, highest:  6.5  6.6  7.6 39.0 47.0 
## ---------------------------------------------------------------------------
## hemo 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      348       52      115        1    12.53    3.323     7.90     8.60 
##      .25      .50      .75      .90      .95 
##    10.30    12.65    15.00    16.20    16.90 
## 
## lowest :  3.1  4.8  5.5  5.6  5.8, highest: 17.4 17.5 17.6 17.7 17.8 
## ---------------------------------------------------------------------------
## pcv 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      329       71       42    0.998    38.88    10.21     24.0     27.0 
##      .25      .50      .75      .90      .95 
##     32.0     40.0     45.0     50.2     52.0 
## 
## lowest :  9 14 15 16 17, highest: 50 51 52 53 54 
## ---------------------------------------------------------------------------
## wbcc 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      294      106       89        1     8406     3075     4500     5300 
##      .25      .50      .75      .90      .95 
##     6500     8000     9800    11370    12940 
## 
## lowest :  2200  2600  3800  4100  4200
## highest: 16700 18900 19100 21600 26400 
## ---------------------------------------------------------------------------
## rbcc 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      269      131       45    0.999    4.707    1.165     2.94     3.38 
##      .25      .50      .75      .90      .95 
##     3.90     4.80     5.40     6.10     6.30 
## 
## lowest : 2.1 2.3 2.4 2.5 2.6, highest: 6.2 6.3 6.4 6.5 8.0 
## ---------------------------------------------------------------------------
## htn 
##        n  missing distinct 
##      398        2        2 
## 
## no (251, 0.631), yes (147, 0.369)
## ---------------------------------------------------------------------------
## dm 
##        n  missing distinct 
##      398        2        2 
## 
## no (261, 0.656), yes (137, 0.344)
## ---------------------------------------------------------------------------
## cad 
##        n  missing distinct 
##      398        2        2 
## 
## no (364, 0.915), yes (34, 0.085)
## ---------------------------------------------------------------------------
## appet 
##        n  missing distinct 
##      399        1        2 
## 
## good (317, 0.794), poor (82, 0.206)
## ---------------------------------------------------------------------------
## pe 
##        n  missing distinct 
##      399        1        2 
## 
## no (323, 0.81), yes (76, 0.19)
## ---------------------------------------------------------------------------
## ane 
##        n  missing distinct 
##      399        1        2 
## 
## no (339, 0.85), yes (60, 0.15)
## ---------------------------------------------------------------------------
## class 
##        n  missing distinct 
##      400        0        2 
## 
## ckd (250, 0.625), notckd (150, 0.375)
## ---------------------------------------------------------------------------
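
Note the substantial missingness in several variables (e.g. rbcc with 131 and rbc with 152 missing values out of 400 observations); this matters later, since complete-case model fitting discards many rows.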

Exploratory Data Analysis

The following code is adapted from: http://rpubs.com/imtiazbdsrpubs/234741

require(ggplot2)    # plotting
require(gridExtra)  # grid.arrange()
require(moments)    # skewness()

# Plots a histogram (bar chart of counts) for column i of data_in
plotHist <- function(data_in, i) {
  data <- data.frame(x = data_in[[i]])
  p <- ggplot(data = data, aes(x = factor(x))) + stat_count() + xlab(colnames(data_in)[i]) + theme_light() + 
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return(p)
}

# Applies fun to each column index in ii and arranges the plots in a grid
doPlots <- function(data_in, fun, ii, ncol = 3) {
  pp <- list()
  for (i in ii) {
    p <- fun(data_in = data_in, i = i)
    pp <- c(pp, list(p))
  }
  do.call("grid.arrange", c(pp, ncol = ncol))
}

# Plots a density plot for numeric column i, annotated with its skewness
plotDen <- function(data_in, i) {
  data <- data.frame(x = data_in[[i]])
  p <- ggplot(data = data) + geom_line(aes(x = x), stat = 'density', size = 1, alpha = 1.0) +
    xlab(paste0(colnames(data_in)[i], '\n', 'Skewness: ', round(skewness(data_in[[i]], na.rm = TRUE), 2))) + theme_light()
  return(p)
}

Filter the categorical variables

# sapply(data, class)

categorical <- sapply(data, is.factor)

data_categorical <- data[categorical]

Barplots of categorical data
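
The barplots are produced with the plotHist helper defined above; a call along these lines reproduces them (the four-column layout is an assumption):

doPlots(data_categorical, fun = plotHist, ii = 1:ncol(data_categorical), ncol = 4)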

Density plots of the numerical variables

doPlots(data[!categorical], fun = plotDen, ii = 1:12, ncol = 3)

# base-R density plot plus skewness for each of the 12 numeric variables
for (i in names(data[!categorical])) {
  plot(density(data[complete.cases(data[, i]), i]), main = i)
  print('Skewness:')
  print(skewness(data[complete.cases(data[, i]), i]))
}

## [1] "Skewness:"
## [1] -0.6656931

## [1] "Skewness:"
## [1] 1.599216

## [1] "Skewness:"
## [1] -0.1717101

## [1] "Skewness:"
## [1] 2.002291

## [1] "Skewness:"
## [1] 2.623992

## [1] "Skewness:"
## [1] 7.480095

## [1] "Skewness:"
## [1] -6.962994

## [1] "Skewness:"
## [1] 11.52719

## [1] "Skewness:"
## [1] -0.3336486

## [1] "Skewness:"
## [1] -0.4316988

## [1] "Skewness:"
## [1] 1.613304

## [1] "Skewness:"
## [1] -0.1823055

Density plots split on pcc

l <- list()
for (i in names(data[!categorical])) {
  col <- data.frame(x = data[[i]])
  p <- ggplot(cbind(col, pcc = data$pcc), aes(x, colour = pcc, fill = pcc)) + 
    geom_density(alpha = 0.5) + 
    xlab(i)
  l <- c(l, list(p))
}
do.call('grid.arrange', c(l, ncol = 3))

Correlation Matrix

require(corrplot)
corrplot(cor(data[!categorical], use = 'pairwise'),
         method = 'number', type = 'upper')
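
To read the strongest relationships off the matrix programmatically, a small sketch like the following lists all variable pairs above a cutoff (the 0.7 threshold is an arbitrary choice):

# list variable pairs with |r| above a threshold
cm <- cor(data[!categorical], use = 'pairwise')
high <- which(abs(cm) > 0.7 & upper.tri(cm), arr.ind = TRUE)
data.frame(var1 = rownames(cm)[high[, 1]],
           var2 = colnames(cm)[high[, 2]],
           r = round(cm[high], 2))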

Building Models

Split the data set into train, validation, and test sets

require(caTools)

# convert labels to 0s and 1s (keep the original factor for later)
classes <- data$class
data$class <- ifelse(data$class == 'ckd', 1, 0)

# split data set into train and test set
# (no seed is set before sample.split, so the split is not reproducible across runs)
s <- sample.split(data$class, SplitRatio = 0.6)

train <- subset(data, s == TRUE)
test <- subset(data, s == FALSE)

# split test set into test and validation set
t <- sample.split(test$class, SplitRatio = 0.5)
val <- subset(test, t == TRUE)
test <- subset(test, t == FALSE)

# check relative frequency of labels
table(train$class)
## 
##   0   1 
##  90 150
prop.table(table(train$class))
## 
##     0     1 
## 0.375 0.625
prop.table(table(test$class))
## 
##     0     1 
## 0.375 0.625
prop.table(table(val$class))
## 
##     0     1 
## 0.375 0.625

The relative class frequencies are unchanged after splitting into train, validation, and test sets.

Function for calculating accuracy from a confusion matrix

# sums the diagonal cells of a 2x2 confusion matrix
# (positions 1 and 4 in column-major order) and divides by the total
accuracy <- function(table) {
  sum(table[1], table[4]) / sum(table)
}
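
A quick sanity check on a toy confusion matrix (hypothetical labels, purely illustrative):

m <- table(actual = c(0, 0, 1, 1), predicted = c(0, 1, 1, 1))
accuracy(m)  # (1 + 2) / 4 = 0.75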

Fit Logistic Regression

According to the National Kidney Foundation, the glomerular filtration rate (GFR) is the best estimate of kidney function. See: https://www.kidney.org/kidneydisease/aboutckd

For more info: https://medlineplus.gov/ency/article/007305.htm
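
The dataset does not contain GFR directly, but serum creatinine (sc) and age enter standard eGFR formulas, which motivates sc as a predictor. As a rough illustration only: the 4-variable MDRD estimate is 175 * sc^-1.154 * age^-0.203, multiplied by 0.742 for females and 1.212 for African-American patients. Since this dataset records neither sex nor ethnicity, those factors are omitted in the sketch below, and the numbers should not be read clinically.

# hypothetical eGFR sketch (MDRD; sex/ethnicity factors omitted, not a clinical tool)
egfr <- 175 * data$sc^(-1.154) * data$age^(-0.203)
summary(egfr)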

fit <- glm(class ~ hemo + sc + pot, train, family = "binomial")
summary(fit)
## 
## Call:
## glm(formula = class ~ hemo + sc + pot, family = "binomial", data = train)
## 
## Deviance Residuals: 
##        Min          1Q      Median          3Q         Max  
## -4.676e-04  -2.000e-08   2.000e-08   2.000e-08   5.908e-04  
## 
## Coefficients:
##              Estimate Std. Error z value Pr(>|z|)
## (Intercept)   1138.75  128562.24   0.009    0.993
## hemo          -129.75   12016.91  -0.011    0.991
## sc             252.77   17267.38   0.015    0.988
## pot             65.22    6720.96   0.010    0.992
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 2.2304e+02  on 160  degrees of freedom
## Residual deviance: 7.2487e-07  on 157  degrees of freedom
##   (79 observations deleted due to missingness)
## AIC: 8
## 
## Number of Fisher Scoring iterations: 25
The near-zero residual deviance and the enormous standard errors above are symptoms of (quasi-)complete separation: these three predictors separate the two classes almost perfectly on the training data, so the individual coefficients and p-values should not be interpreted.

pred <- predict(fit, newdata = val, type = 'response')

Calculate the variance inflation factor (VIF, https://en.wikipedia.org/wiki/Variance_inflation_factor) to check for multicollinearity among the predictors.

require(car)
car::vif(fit)
##     hemo       sc      pot 
## 5.577985 1.194184 5.658540
The VIFs near 5.6 for hemo and pot suggest these two predictors carry overlapping information (values above 5 are often taken as a warning sign, above 10 as serious), though with a separated fit these diagnostics are themselves shaky.

labels <- ifelse(pred > 0.5, 1, 0)

# some predictions are NA because of missing predictor values:
# find the most frequent predicted label and impute the NAs with it
sort(table(labels), decreasing = TRUE)[1:3]
## labels
##    0    1 <NA> 
##   31   29   NA
labels[is.na(labels)] <- 0
table(labels)
## labels
##  0  1 
## 51 29
# accuracy on validation set
accuracy(prop.table(table(val$class, labels)))
## [1] 0.7375
# accuracy on test set (NA predictions are again imputed with 0)
pred_test <- predict(fit, newdata = test, type = 'response')
labels_test <- ifelse(pred_test < .5 | is.na(pred_test), 0, 1)
labels_test
##   1   7  10  15  17  19  28  32  35  47  50  51  66  68  74  75  88  90 
##   0   1   1   1   1   1   1   1   0   1   0   1   0   0   1   1   0   0 
##  92 104 105 107 109 111 114 122 131 150 160 165 168 169 170 176 178 183 
##   0   0   0   1   1   1   0   0   1   0   1   0   0   0   0   0   0   1 
## 185 192 197 199 226 228 229 232 235 237 238 241 245 247 258 260 263 265 
##   1   1   1   1   1   0   0   1   0   1   0   0   1   1   0   0   0   0 
## 267 272 273 284 290 302 306 307 308 312 314 324 325 332 336 343 349 355 
##   1   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0   0 
## 360 368 372 380 385 394 398 399 
##   0   0   0   0   0   0   0   0
table(labels_test)
## labels_test
##  0  1 
## 53 27
x <- prop.table(table(test$class, labels_test))
x
##    labels_test
##          0      1
##   0 0.3625 0.0125
##   1 0.3000 0.3250
accuracy(x)
## [1] 0.6875
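
Most of the test error sits in one cell: CKD cases (class 1) predicted as 0 (30% of the test set), largely the observations whose predictions were NA and were imputed with 0. A three-predictor logistic model copes poorly with the heavy missingness in these labs.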

Cross Validation

require(caret)

seed <- 666
# restore the original factor labels for caret
data$class <- classes
train_set <- createDataPartition(classes, p = 0.7, list = FALSE)

# 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)

data_train <- data[train_set, ]
data_test <- data[-train_set, ]

C5.0

set.seed(seed)

c50Grid <- expand.grid(.trials = c(1:9, (1:10)*10),
                       .model = c("tree", "rules"),
                       .winnow = c(TRUE, FALSE))

c5_fit <- train(as.factor(class) ~ ., data = data_train,
                method = "C5.0",
                na.action = na.pass,
                tuneGrid = c50Grid,
                trControl = ctrl,
                metric = "Accuracy",
                importance = TRUE)

c5_fit
## C5.0 
## 
## 109 samples
##  24 predictor
##   2 classes: 'ckd', 'notckd' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 252, 251, 252, 252, 253, 253, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE     1     0.9857143  0.9702748
##   rules  FALSE     2     0.9892857  0.9778835
##   rules  FALSE     3     0.9858374  0.9704285
##   rules  FALSE     4     0.9858374  0.9704285
##   rules  FALSE     5     0.9858374  0.9704285
##   rules  FALSE     6     0.9892857  0.9778835
##   rules  FALSE     7     0.9892857  0.9778835
##   rules  FALSE     8     0.9892857  0.9778835
##   rules  FALSE     9     0.9892857  0.9778835
##   rules  FALSE    10     0.9892857  0.9778835
##   rules  FALSE    20     0.9892857  0.9778835
##   rules  FALSE    30     0.9892857  0.9778835
##   rules  FALSE    40     0.9892857  0.9778835
##   rules  FALSE    50     0.9892857  0.9778835
##   rules  FALSE    60     0.9892857  0.9778835
##   rules  FALSE    70     0.9892857  0.9778835
##   rules  FALSE    80     0.9892857  0.9778835
##   rules  FALSE    90     0.9892857  0.9778835
##   rules  FALSE   100     0.9892857  0.9778835
##   rules   TRUE     1     0.9532978  0.9016014
##   rules   TRUE     2     0.9570015  0.9042668
##   rules   TRUE     3     0.9640212  0.9228085
##   rules   TRUE     4     0.9677249  0.9293872
##   rules   TRUE     5     0.9677249  0.9316732
##   rules   TRUE     6     0.9642766  0.9228569
##   rules   TRUE     7     0.9640212  0.9222105
##   rules   TRUE     8     0.9642766  0.9228569
##   rules   TRUE     9     0.9712963  0.9392819
##   rules   TRUE    10     0.9642766  0.9228569
##   rules   TRUE    20     0.9712963  0.9392819
##   rules   TRUE    30     0.9712963  0.9392819
##   rules   TRUE    40     0.9748677  0.9473789
##   rules   TRUE    50     0.9748677  0.9473789
##   rules   TRUE    60     0.9748677  0.9473789
##   rules   TRUE    70     0.9712963  0.9392819
##   rules   TRUE    80     0.9748677  0.9473789
##   rules   TRUE    90     0.9678480  0.9318269
##   rules   TRUE   100     0.9714194  0.9399239
##   tree   FALSE     1     0.9855820  0.9693061
##   tree   FALSE     2     0.9929803  0.9851953
##   tree   FALSE     3     0.9928571  0.9850229
##   tree   FALSE     4     0.9928571  0.9850229
##   tree   FALSE     5     0.9891534  0.9772419
##   tree   FALSE     6     0.9928571  0.9850229
##   tree   FALSE     7     0.9857052  0.9700459
##   tree   FALSE     8     0.9928571  0.9850229
##   tree   FALSE     9     0.9928571  0.9850229
##   tree   FALSE    10     0.9928571  0.9850229
##   tree   FALSE    20     0.9928571  0.9850229
##   tree   FALSE    30     0.9928571  0.9850229
##   tree   FALSE    40     0.9928571  0.9850229
##   tree   FALSE    50     0.9928571  0.9850229
##   tree   FALSE    60     0.9928571  0.9850229
##   tree   FALSE    70     0.9928571  0.9850229
##   tree   FALSE    80     0.9928571  0.9850229
##   tree   FALSE    90     0.9928571  0.9850229
##   tree   FALSE   100     0.9928571  0.9850229
##   tree    TRUE     1     0.9604406  0.9170149
##   tree    TRUE     2     0.9570015  0.9080037
##   tree    TRUE     3     0.9640212  0.9240534
##   tree    TRUE     4     0.9604497  0.9151998
##   tree    TRUE     5     0.9604497  0.9164413
##   tree    TRUE     6     0.9714286  0.9394542
##   tree    TRUE     7     0.9641534  0.9240645
##   tree    TRUE     8     0.9677249  0.9316732
##   tree    TRUE     9     0.9641534  0.9240645
##   tree    TRUE    10     0.9714286  0.9394542
##   tree    TRUE    20     0.9715517  0.9396079
##   tree    TRUE    30     0.9715517  0.9396079
##   tree    TRUE    40     0.9715517  0.9396079
##   tree    TRUE    50     0.9715517  0.9396079
##   tree    TRUE    60     0.9715517  0.9396079
##   tree    TRUE    70     0.9678480  0.9314998
##   tree    TRUE    80     0.9715517  0.9396079
##   tree    TRUE    90     0.9715517  0.9396079
##   tree    TRUE   100     0.9715517  0.9396079
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were trials = 2, model = tree
##  and winnow = FALSE.
plot(c5_fit)

predictors(c5_fit)
## [1] "hemo" "sc"   "sg"

Accuracy on test set

accuracy(table(data_test$class,
               predict(c5_fit, newdata = data_test, na.action = na.pass)))
## [1] 0.9916667
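
Raw accuracy hides sensitivity and specificity, which matter for a medical screening task. caret's confusionMatrix reports both from the same predictions; a minimal sketch (output not shown):

cm_test <- confusionMatrix(
  data = predict(c5_fit, newdata = data_test, na.action = na.pass),
  reference = as.factor(data_test$class)
)
cm_test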