# Packages used throughout: Hmisc for describe(), ggplot2/gridExtra for the
# plotting helpers, moments for skewness(), caret for the model training below
library(Hmisc)
library(ggplot2)
library(gridExtra)
library(moments)
library(caret)
data <- read.csv(file = "~/Dropbox/DKE/Machine Learning/Master Assignment/Chronic_Kidney_Disease/chronic_kidney_disease_full.csv")
Convert some variables from numeric to categorical (factors):
data$al <- as.factor(data$al)
data$su <- as.factor(data$su)
describe(data)
## data
##
## 25 Variables 400 Observations
## ---------------------------------------------------------------------------
## age
## n missing distinct Info Mean Gmd .05 .10
## 391 9 76 0.999 51.48 19.19 19.0 28.0
## .25 .50 .75 .90 .95
## 42.0 55.0 64.5 71.0 74.5
##
## lowest : 2 3 4 5 6, highest: 80 81 82 83 90
## ---------------------------------------------------------------------------
## bp
## n missing distinct Info Mean Gmd .05 .10
## 388 12 10 0.94 76.47 14.17 60 60
## .25 .50 .75 .90 .95
## 70 80 80 90 100
##
## lowest : 50 60 70 80 90, highest: 100 110 120 140 180
##
## Value 50 60 70 80 90 100 110 120 140 180
## Frequency 5 71 112 116 53 25 3 1 1 1
## Proportion 0.013 0.183 0.289 0.299 0.137 0.064 0.008 0.003 0.003 0.003
## ---------------------------------------------------------------------------
## sg
## n missing distinct Info Mean Gmd
## 353 47 5 0.938 1.017 0.006385
##
## lowest : 1.005 1.010 1.015 1.020 1.025
## highest: 1.005 1.010 1.015 1.020 1.025
##
## 1.005 (7, 0.020), 1.01 (84, 0.238), 1.015 (75, 0.212), 1.02 (106, 0.300),
## 1.025 (81, 0.229)
## ---------------------------------------------------------------------------
## al
## n missing distinct
## 354 46 6
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## 0 (199, 0.562), 1 (44, 0.124), 2 (43, 0.121), 3 (43, 0.121), 4 (24,
## 0.068), 5 (1, 0.003)
## ---------------------------------------------------------------------------
## su
## n missing distinct
## 351 49 6
##
## lowest : 0 1 2 3 4, highest: 1 2 3 4 5
##
## 0 (290, 0.826), 1 (13, 0.037), 2 (18, 0.051), 3 (14, 0.040), 4 (13,
## 0.037), 5 (3, 0.009)
## ---------------------------------------------------------------------------
## rbc
## n missing distinct
## 248 152 2
##
## abnormal (47, 0.19), normal (201, 0.81)
## ---------------------------------------------------------------------------
## pc
## n missing distinct
## 335 65 2
##
## abnormal (76, 0.227), normal (259, 0.773)
## ---------------------------------------------------------------------------
## pcc
## n missing distinct
## 396 4 2
##
## notpresent (354, 0.894), present (42, 0.106)
## ---------------------------------------------------------------------------
## ba
## n missing distinct
## 396 4 2
##
## notpresent (374, 0.944), present (22, 0.056)
## ---------------------------------------------------------------------------
## bgr
## n missing distinct Info Mean Gmd .05 .10
## 356 44 146 1 148 76.04 78.75 86.50
## .25 .50 .75 .90 .95
## 99.00 121.00 163.00 254.00 307.25
##
## lowest : 22 70 74 75 76, highest: 424 425 447 463 490
## ---------------------------------------------------------------------------
## bu
## n missing distinct Info Mean Gmd .05 .10
## 381 19 118 1 57.43 46.22 17 19
## .25 .50 .75 .90 .95
## 27 42 66 118 162
##
## lowest : 1.5 10.0 15.0 16.0 17.0
## highest: 235.0 241.0 309.0 322.0 391.0
## ---------------------------------------------------------------------------
## sc
## n missing distinct Info Mean Gmd .05 .10
## 383 17 84 0.998 3.072 3.561 0.50 0.60
## .25 .50 .75 .90 .95
## 0.90 1.30 2.80 6.78 11.89
##
## lowest : 0.4 0.5 0.6 0.7 0.8, highest: 18.1 24.0 32.0 48.1 76.0
## ---------------------------------------------------------------------------
## sod
## n missing distinct Info Mean Gmd .05 .10
## 313 87 34 0.995 137.5 8.365 125 131
## .25 .50 .75 .90 .95
## 135 138 142 146 150
##
## lowest : 4.5 104.0 111.0 113.0 114.0
## highest: 145.0 146.0 147.0 150.0 163.0
## ---------------------------------------------------------------------------
## pot
## n missing distinct Info Mean Gmd .05 .10
## 312 88 40 0.997 4.627 1.322 3.4 3.5
## .25 .50 .75 .90 .95
## 3.8 4.4 4.9 5.2 5.7
##
## lowest : 2.5 2.7 2.8 2.9 3.0, highest: 6.5 6.6 7.6 39.0 47.0
## ---------------------------------------------------------------------------
## hemo
## n missing distinct Info Mean Gmd .05 .10
## 348 52 115 1 12.53 3.323 7.90 8.60
## .25 .50 .75 .90 .95
## 10.30 12.65 15.00 16.20 16.90
##
## lowest : 3.1 4.8 5.5 5.6 5.8, highest: 17.4 17.5 17.6 17.7 17.8
## ---------------------------------------------------------------------------
## pcv
## n missing distinct Info Mean Gmd .05 .10
## 329 71 42 0.998 38.88 10.21 24.0 27.0
## .25 .50 .75 .90 .95
## 32.0 40.0 45.0 50.2 52.0
##
## lowest : 9 14 15 16 17, highest: 50 51 52 53 54
## ---------------------------------------------------------------------------
## wbcc
## n missing distinct Info Mean Gmd .05 .10
## 294 106 89 1 8406 3075 4500 5300
## .25 .50 .75 .90 .95
## 6500 8000 9800 11370 12940
##
## lowest : 2200 2600 3800 4100 4200
## highest: 16700 18900 19100 21600 26400
## ---------------------------------------------------------------------------
## rbcc
## n missing distinct Info Mean Gmd .05 .10
## 269 131 45 0.999 4.707 1.165 2.94 3.38
## .25 .50 .75 .90 .95
## 3.90 4.80 5.40 6.10 6.30
##
## lowest : 2.1 2.3 2.4 2.5 2.6, highest: 6.2 6.3 6.4 6.5 8.0
## ---------------------------------------------------------------------------
## htn
## n missing distinct
## 398 2 2
##
## no (251, 0.631), yes (147, 0.369)
## ---------------------------------------------------------------------------
## dm
## n missing distinct
## 398 2 2
##
## no (261, 0.656), yes (137, 0.344)
## ---------------------------------------------------------------------------
## cad
## n missing distinct
## 398 2 2
##
## no (364, 0.915), yes (34, 0.085)
## ---------------------------------------------------------------------------
## appet
## n missing distinct
## 399 1 2
##
## good (317, 0.794), poor (82, 0.206)
## ---------------------------------------------------------------------------
## pe
## n missing distinct
## 399 1 2
##
## no (323, 0.81), yes (76, 0.19)
## ---------------------------------------------------------------------------
## ane
## n missing distinct
## 399 1 2
##
## no (339, 0.85), yes (60, 0.15)
## ---------------------------------------------------------------------------
## class
## n missing distinct
## 400 0 2
##
## ckd (250, 0.625), notckd (150, 0.375)
## ---------------------------------------------------------------------------
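The summary above shows substantial missingness in several columns (rbc, rbcc, wbcc, sod, and pot in particular). As a quick convenience check, not part of the original analysis, the per-column NA counts can be tabulated directly:
sort(colSums(is.na(data)), decreasing = TRUE)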
The following plotting helpers are adapted from: http://rpubs.com/imtiazbdsrpubs/234741
# Plot a bar chart of value counts for column i of data_in
plotHist <- function(data_in, i) {
  data <- data.frame(x = data_in[[i]])
  p <- ggplot(data = data, aes(x = factor(x))) +
    stat_count() +
    xlab(colnames(data_in)[i]) +
    theme_light() +
    theme(axis.text.x = element_text(angle = 90, hjust = 1))
  return(p)
}
# Build one plot per column index in ii and arrange them in a grid
doPlots <- function(data_in, fun, ii, ncol = 3) {
  pp <- list()
  for (i in ii) {
    p <- fun(data_in = data_in, i = i)
    pp <- c(pp, list(p))
  }
  do.call("grid.arrange", c(pp, ncol = ncol))
}
# Plot a density curve for numeric column i, annotated with its skewness
plotDen <- function(data_in, i) {
  data <- data.frame(x = data_in[[i]])
  p <- ggplot(data = data) +
    geom_line(aes(x = x), stat = 'density', size = 1, alpha = 1.0) +
    xlab(paste0(colnames(data_in)[i], '\n', 'Skewness: ',
                round(skewness(data_in[[i]], na.rm = TRUE), 2))) +
    theme_light()
  return(p)
}
Filter out the categorical variables:
# sapply(data, class)
categorical <- sapply(data, is.factor)
data_categorical <- data[categorical]
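plotHist is defined above but never called below; as an illustrative sketch, it could chart the first few categorical columns (the indices 1:6 are an arbitrary choice):
doPlots(data_categorical, fun = plotHist, ii = 1:6, ncol = 3)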
doPlots(data[!categorical], fun = plotDen, ii = 1:12, ncol = 3)
# Base-R density plot plus printed skewness for each numeric column
for (i in names(data[!categorical])) {
  plot(density(data[complete.cases(data[, i]), i]), main = i)
  print('Skewness:')
  print(skewness(data[complete.cases(data[, i]), i]))
}
## [1] "Skewness:"
## [1] -0.6656931
## [1] "Skewness:"
## [1] 1.599216
## [1] "Skewness:"
## [1] -0.1717101
## [1] "Skewness:"
## [1] 2.002291
## [1] "Skewness:"
## [1] 2.623992
## [1] "Skewness:"
## [1] 7.480095
## [1] "Skewness:"
## [1] -6.962994
## [1] "Skewness:"
## [1] 11.52719
## [1] "Skewness:"
## [1] -0.3336486
## [1] "Skewness:"
## [1] -0.4316988
## [1] "Skewness:"
## [1] 1.613304
## [1] "Skewness:"
## [1] -0.1823055
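The same numbers with variable names attached, as a one-line sketch using the same skewness function:
sapply(data[!categorical], function(v) skewness(v, na.rm = TRUE))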
Density plots of each numeric variable, grouped by pcc (pus cell clumps):
l <- list()
for (i in names(data[!categorical])) {
  col <- data.frame(x = data[[i]])
  p <- ggplot(cbind(col, pcc = data$pcc), aes(x, colour = pcc, fill = pcc)) +
    geom_density(alpha = 0.5) +
    xlab(i)
  l <- c(l, list(p))
}
do.call('grid.arrange', c(l, ncol = 3))
require(corrplot)
corrplot(cor(data[names(data[!categorical])], use = 'pairwise'),
         method = 'number', type = 'upper')
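To read the strongest pairwise correlations out of the matrix rather than off the plot, a sketch like the following works; the 0.7 cutoff is an arbitrary assumption:
cm <- cor(data[names(data[!categorical])], use = 'pairwise')
cm[lower.tri(cm, diag = TRUE)] <- NA          # keep each pair once
high <- which(abs(cm) > 0.7, arr.ind = TRUE)  # cutoff is an assumption
data.frame(var1 = rownames(cm)[high[, 1]],
           var2 = colnames(cm)[high[, 2]],
           r = cm[high])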
require(caTools)
# convert labels to 0s and 1s
classes <- data$class
data$class <- ifelse(data$class == 'ckd', 1, 0)
# split data set into train (60%) and test (40%), stratified on class
s <- sample.split(data$class, SplitRatio = 0.6)
train <- subset(data, s == TRUE)
test <- subset(data, s == FALSE)
# split test set in half into test and validation sets
t <- sample.split(test$class, SplitRatio = 0.5)
val <- subset(test, t == TRUE)
test <- subset(test, t == FALSE)
# check relative frequency of labels
table(train$class)
##
## 0 1
## 90 150
prop.table(table(train$class))
##
## 0 1
## 0.375 0.625
prop.table(table(test$class))
##
## 0 1
## 0.375 0.625
prop.table(table(val$class))
##
## 0 1
## 0.375 0.625
The relative class frequencies are unchanged after splitting into train, validation, and test sets, as sample.split stratifies on the label.
# Accuracy from a 2x2 confusion table: [1] and [4] are the diagonal entries
# (true negatives and true positives)
accuracy <- function(table) {
  sum(table[1], table[4]) / sum(table)
}
According to the National Kidney Foundation, the glomerular filtration rate (GFR) is the best estimate of kidney function; see https://www.kidney.org/kidneydisease/aboutckd and, for more detail, https://medlineplus.gov/ency/article/007305.htm. GFR is not recorded in this dataset, but it is conventionally estimated from serum creatinine and age, which motivates including sc among the predictors below.
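As an illustration of that link, a rough MDRD-style eGFR can be derived from sc and age; the dataset lacks the sex and ethnicity fields the full MDRD equation uses, so those correction factors are omitted here and the numbers are indicative only:
# MDRD Study equation minus the sex/ethnicity factors (illustrative only)
egfr <- 175 * data$sc^(-1.154) * data$age^(-0.203)
summary(egfr)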
fit <- glm(class ~ hemo + sc + pot, train, family = "binomial")
summary(fit)
##
## Call:
## glm(formula = class ~ hemo + sc + pot, family = "binomial", data = train)
##
## Deviance Residuals:
## Min 1Q Median 3Q Max
## -4.676e-04 -2.000e-08 2.000e-08 2.000e-08 5.908e-04
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) 1138.75 128562.24 0.009 0.993
## hemo -129.75 12016.91 -0.011 0.991
## sc 252.77 17267.38 0.015 0.988
## pot 65.22 6720.96 0.010 0.992
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 2.2304e+02 on 160 degrees of freedom
## Residual deviance: 7.2487e-07 on 157 degrees of freedom
## (79 observations deleted due to missingness)
## AIC: 8
##
## Number of Fisher Scoring iterations: 25
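The near-zero residual deviance, huge standard errors, and 25 Fisher scoring iterations are classic symptoms of complete separation: hemo, sc, and pot split the training classes perfectly, so the coefficient estimates are unstable even if the predictions remain usable. A quick check of how the fitted probabilities collapse to 0 and 1:
table(round(fitted(fit), 3))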
pred <- predict(fit, newdata = val, type = 'response')
require(car)
car::vif(fit)
## hemo sc pot
## 5.577985 1.194184 5.658540
labels <- ifelse(pred > 0.5, 1, 0)
# There are still NA predictions (rows with missing predictor values), so
# determine the most frequent predicted label and assign it to all NAs
sort(table(labels), decreasing = TRUE)[1:3]
## labels
## 0 1 <NA>
## 31 29 NA
labels[is.na(labels)] <- 0
table(labels)
## labels
## 0 1
## 51 29
# accuracy on validation set
accuracy(prop.table(table(val$class, labels)))
## [1] 0.7375
# accuracy on test set
pred_test <- predict(fit, newdata = test, type = 'response')
labels_test <- ifelse(pred_test < .5 | is.na(pred_test), 0, 1)
labels_test
## 1 7 10 15 17 19 28 32 35 47 50 51 66 68 74 75 88 90
## 0 1 1 1 1 1 1 1 0 1 0 1 0 0 1 1 0 0
## 92 104 105 107 109 111 114 122 131 150 160 165 168 169 170 176 178 183
## 0 0 0 1 1 1 0 0 1 0 1 0 0 0 0 0 0 1
## 185 192 197 199 226 228 229 232 235 237 238 241 245 247 258 260 263 265
## 1 1 1 1 1 0 0 1 0 1 0 0 1 1 0 0 0 0
## 267 272 273 284 290 302 306 307 308 312 314 324 325 332 336 343 349 355
## 1 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0 0
## 360 368 372 380 385 394 398 399
## 0 0 0 0 0 0 0 0
table(labels_test)
## labels_test
## 0 1
## 53 27
x <- prop.table(table(test$class, labels_test))
x
## labels_test
## 0 1
## 0 0.3625 0.0125
## 1 0.3000 0.3250
accuracy(x)
## [1] 0.6875
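Beyond raw accuracy, sensitivity and specificity can be read off the same proportion table (rows are the actual class, columns the predicted label):
sens <- x[2, 2] / sum(x[2, ])  # TP / (TP + FN)
spec <- x[1, 1] / sum(x[1, ])  # TN / (TN + FP)
c(sensitivity = sens, specificity = spec)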
seed <- 666
# restore the original 'ckd'/'notckd' factor labels
data$class <- classes
train_set <- createDataPartition(classes, p = 0.7, list = FALSE)
# 10-fold cross-validation
ctrl <- trainControl(method = "cv", number = 10)
data_train <- data[train_set, ]
data_test <- data[-train_set, ]
set.seed(seed)
# Tune over 1-100 boosting trials, tree vs. rule models, winnowing on/off
c50Grid <- expand.grid(.trials = c(1:9, (1:10) * 10),
                       .model = c("tree", "rules"),
                       .winnow = c(TRUE, FALSE))
c5_fit <- train(as.factor(class) ~ ., data = data_train,
                method = "C5.0",
                na.action = na.pass,
                tuneGrid = c50Grid,
                trControl = ctrl,
                metric = "Accuracy",
                importance = TRUE)
c5_fit
## C5.0
##
## 109 samples
## 24 predictor
## 2 classes: 'ckd', 'notckd'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 252, 251, 252, 252, 253, 253, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.9857143 0.9702748
## rules FALSE 2 0.9892857 0.9778835
## rules FALSE 3 0.9858374 0.9704285
## rules FALSE 4 0.9858374 0.9704285
## rules FALSE 5 0.9858374 0.9704285
## rules FALSE 6 0.9892857 0.9778835
## rules FALSE 7 0.9892857 0.9778835
## rules FALSE 8 0.9892857 0.9778835
## rules FALSE 9 0.9892857 0.9778835
## rules FALSE 10 0.9892857 0.9778835
## rules FALSE 20 0.9892857 0.9778835
## rules FALSE 30 0.9892857 0.9778835
## rules FALSE 40 0.9892857 0.9778835
## rules FALSE 50 0.9892857 0.9778835
## rules FALSE 60 0.9892857 0.9778835
## rules FALSE 70 0.9892857 0.9778835
## rules FALSE 80 0.9892857 0.9778835
## rules FALSE 90 0.9892857 0.9778835
## rules FALSE 100 0.9892857 0.9778835
## rules TRUE 1 0.9532978 0.9016014
## rules TRUE 2 0.9570015 0.9042668
## rules TRUE 3 0.9640212 0.9228085
## rules TRUE 4 0.9677249 0.9293872
## rules TRUE 5 0.9677249 0.9316732
## rules TRUE 6 0.9642766 0.9228569
## rules TRUE 7 0.9640212 0.9222105
## rules TRUE 8 0.9642766 0.9228569
## rules TRUE 9 0.9712963 0.9392819
## rules TRUE 10 0.9642766 0.9228569
## rules TRUE 20 0.9712963 0.9392819
## rules TRUE 30 0.9712963 0.9392819
## rules TRUE 40 0.9748677 0.9473789
## rules TRUE 50 0.9748677 0.9473789
## rules TRUE 60 0.9748677 0.9473789
## rules TRUE 70 0.9712963 0.9392819
## rules TRUE 80 0.9748677 0.9473789
## rules TRUE 90 0.9678480 0.9318269
## rules TRUE 100 0.9714194 0.9399239
## tree FALSE 1 0.9855820 0.9693061
## tree FALSE 2 0.9929803 0.9851953
## tree FALSE 3 0.9928571 0.9850229
## tree FALSE 4 0.9928571 0.9850229
## tree FALSE 5 0.9891534 0.9772419
## tree FALSE 6 0.9928571 0.9850229
## tree FALSE 7 0.9857052 0.9700459
## tree FALSE 8 0.9928571 0.9850229
## tree FALSE 9 0.9928571 0.9850229
## tree FALSE 10 0.9928571 0.9850229
## tree FALSE 20 0.9928571 0.9850229
## tree FALSE 30 0.9928571 0.9850229
## tree FALSE 40 0.9928571 0.9850229
## tree FALSE 50 0.9928571 0.9850229
## tree FALSE 60 0.9928571 0.9850229
## tree FALSE 70 0.9928571 0.9850229
## tree FALSE 80 0.9928571 0.9850229
## tree FALSE 90 0.9928571 0.9850229
## tree FALSE 100 0.9928571 0.9850229
## tree TRUE 1 0.9604406 0.9170149
## tree TRUE 2 0.9570015 0.9080037
## tree TRUE 3 0.9640212 0.9240534
## tree TRUE 4 0.9604497 0.9151998
## tree TRUE 5 0.9604497 0.9164413
## tree TRUE 6 0.9714286 0.9394542
## tree TRUE 7 0.9641534 0.9240645
## tree TRUE 8 0.9677249 0.9316732
## tree TRUE 9 0.9641534 0.9240645
## tree TRUE 10 0.9714286 0.9394542
## tree TRUE 20 0.9715517 0.9396079
## tree TRUE 30 0.9715517 0.9396079
## tree TRUE 40 0.9715517 0.9396079
## tree TRUE 50 0.9715517 0.9396079
## tree TRUE 60 0.9715517 0.9396079
## tree TRUE 70 0.9678480 0.9314998
## tree TRUE 80 0.9715517 0.9396079
## tree TRUE 90 0.9715517 0.9396079
## tree TRUE 100 0.9715517 0.9396079
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 2, model = tree
## and winnow = FALSE.
plot(c5_fit)
predictors(c5_fit)
## [1] "hemo" "sc" "sg"
accuracy(table(data_test$class,
               predict(c5_fit, newdata = data_test, na.action = na.pass)))
## [1] 0.9916667
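For a fuller evaluation than accuracy alone, caret's confusionMatrix() reports sensitivity, specificity, and kappa in one call; a closing sketch:
preds <- predict(c5_fit, newdata = data_test, na.action = na.pass)
confusionMatrix(preds, as.factor(data_test$class), positive = 'ckd')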