data <- read.csv("~/Dropbox/DKE/Machine Learning/Master Assignment/Chronic_Kidney_Disease/Imputed_DAta.csv")
require(caret)
require(glmnet)
require(ROCR)
seed <- 666
set.seed(seed)  # fix the RNG so the train/test partition is reproducible
classes <- data[, "class"]
train_set <- createDataPartition(classes, p = 0.7, list = FALSE)
data_train <- data[train_set, ]
data_test <- data[-train_set, ]
ctrl <- trainControl(method = "cv", number = 10)
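As an aside: if one preferred to tune on ROC rather than Accuracy, the control object can be asked to keep class probabilities. A minimal sketch, not used in the runs below:

# Alternative control object (sketch; not used below): tune on ROC.
# classProbs = TRUE keeps the predicted probabilities per resample, and
# twoClassSummary computes ROC, sensitivity and specificity from them.
ctrl_roc <- trainControl(method = "cv", number = 10,
                         classProbs = TRUE,
                         summaryFunction = twoClassSummary)
# then pass trControl = ctrl_roc and metric = "ROC" to train()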
Naive Bayes
nb_fit <- train(class ~ ., data = data_train,
                method = "nb",
                trControl = ctrl,
                metric = "Accuracy",
                importance = TRUE)
nb_fit
## Naive Bayes
##
## 280 samples
## 24 predictor
## 2 classes: 'ckd', 'notckd'
##
## No pre-processing
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 252, 253, 251, 253, 252, 253, ...
## Resampling results across tuning parameters:
##
## usekernel Accuracy Kappa
## FALSE NaN NaN
## TRUE 0.9893997 0.9780727
##
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
## parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE
## and adjust = 1.
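The C5.0 section below checks accuracy on the held-out test set; the equivalent check for Naive Bayes would be a one-liner with caret's confusionMatrix. A sketch, output omitted:

# Held-out evaluation for the Naive Bayes fit (sketch; output omitted)
confusionMatrix(predict(nb_fit, newdata = data_test), data_test$class)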
C5.0
c50Grid <- expand.grid(.trials = c(1:9, (1:10) * 10),
                       .model = c("tree", "rules"),
                       .winnow = c(TRUE, FALSE))
c5_fit <- train(class ~ ., data = data_train,
                method = "C5.0",
                tuneGrid = c50Grid,
                trControl = ctrl,
                metric = "Accuracy",
                preProcess = c("center", "scale"),
                importance = TRUE)
c5_fit
## C5.0
##
## 280 samples
## 24 predictor
## 2 classes: 'ckd', 'notckd'
##
## Pre-processing: centered (24), scaled (24)
## Resampling: Cross-Validated (10 fold)
## Summary of sample sizes: 252, 251, 251, 253, 252, 252, ...
## Resampling results across tuning parameters:
##
## model winnow trials Accuracy Kappa
## rules FALSE 1 0.9678480 0.9312090
## rules FALSE 2 0.9714194 0.9385774
## rules FALSE 3 0.9821429 0.9624409
## rules FALSE 4 0.9857143 0.9700496
## rules FALSE 5 0.9822660 0.9628536
## rules FALSE 6 0.9857143 0.9700496
## rules FALSE 7 0.9857143 0.9700496
## rules FALSE 8 0.9857143 0.9700496
## rules FALSE 9 0.9857143 0.9700496
## rules FALSE 10 0.9857143 0.9700496
## rules FALSE 20 0.9857143 0.9700496
## rules FALSE 30 0.9857143 0.9700496
## rules FALSE 40 0.9857143 0.9700496
## rules FALSE 50 0.9857143 0.9700496
## rules FALSE 60 0.9857143 0.9700496
## rules FALSE 70 0.9857143 0.9700496
## rules FALSE 80 0.9857143 0.9700496
## rules FALSE 90 0.9857143 0.9700496
## rules FALSE 100 0.9857143 0.9700496
## rules TRUE 1 0.9646552 0.9257833
## rules TRUE 2 0.9716749 0.9403439
## rules TRUE 3 0.9751140 0.9481184
## rules TRUE 4 0.9681034 0.9323039
## rules TRUE 5 0.9610837 0.9167090
## rules TRUE 6 0.9646552 0.9256316
## rules TRUE 7 0.9575123 0.9093525
## rules TRUE 8 0.9536855 0.8997877
## rules TRUE 9 0.9716749 0.9404298
## rules TRUE 10 0.9608283 0.9159567
## rules TRUE 20 0.9609606 0.9167002
## rules TRUE 30 0.9716749 0.9403439
## rules TRUE 40 0.9716749 0.9403439
## rules TRUE 50 0.9716749 0.9403439
## rules TRUE 60 0.9716749 0.9403439
## rules TRUE 70 0.9716749 0.9403439
## rules TRUE 80 0.9716749 0.9403439
## rules TRUE 90 0.9716749 0.9403439
## rules TRUE 100 0.9716749 0.9403439
## tree FALSE 1 0.9642766 0.9236080
## tree FALSE 2 0.9821429 0.9618586
## tree FALSE 3 0.9821429 0.9624409
## tree FALSE 4 0.9892857 0.9774142
## tree FALSE 5 0.9857143 0.9700458
## tree FALSE 6 0.9857143 0.9700458
## tree FALSE 7 0.9822660 0.9628497
## tree FALSE 8 0.9857143 0.9700458
## tree FALSE 9 0.9857143 0.9700458
## tree FALSE 10 0.9857143 0.9700458
## tree FALSE 20 0.9857143 0.9700458
## tree FALSE 30 0.9857143 0.9700458
## tree FALSE 40 0.9857143 0.9700458
## tree FALSE 50 0.9857143 0.9700458
## tree FALSE 60 0.9857143 0.9700458
## tree FALSE 70 0.9857143 0.9700458
## tree FALSE 80 0.9857143 0.9700458
## tree FALSE 90 0.9857143 0.9700458
## tree FALSE 100 0.9857143 0.9700458
## tree TRUE 1 0.9646552 0.9257833
## tree TRUE 2 0.9716749 0.9403439
## tree TRUE 3 0.9751140 0.9481184
## tree TRUE 4 0.9752463 0.9482907
## tree TRUE 5 0.9610837 0.9167090
## tree TRUE 6 0.9681034 0.9330733
## tree TRUE 7 0.9609606 0.9167943
## tree TRUE 8 0.9681034 0.9330733
## tree TRUE 9 0.9752463 0.9482907
## tree TRUE 10 0.9716749 0.9409262
## tree TRUE 20 0.9681034 0.9329793
## tree TRUE 30 0.9716749 0.9403439
## tree TRUE 40 0.9681034 0.9329793
## tree TRUE 50 0.9681034 0.9329793
## tree TRUE 60 0.9681034 0.9329793
## tree TRUE 70 0.9681034 0.9329793
## tree TRUE 80 0.9681034 0.9329793
## tree TRUE 90 0.9681034 0.9329793
## tree TRUE 100 0.9681034 0.9329793
##
## Accuracy was used to select the optimal model using the largest value.
## The final values used for the model were trials = 4, model = tree
## and winnow = FALSE.
plot(c5_fit)

predictors(c5_fit)
## [1] "hemo" "sc" "sg" "dmyes" "al" "pcv" "peyes"
summary(c5_fit$finalModel)
##
## Call:
## C5.0.default(x = structure(c(-0.141874822894331,
## "fuzzyThreshold", "sample", "earlyStopping", "label",
## "seed")), importance = TRUE)
##
##
## C5.0 [Release 2.07 GPL Edition] Sun Jan 22 20:24:19 2017
## -------------------------------
##
## Class specified by attribute `outcome'
##
## Read 280 cases (25 attributes) from undefined.data
##
## ----- Trial 0: -----
##
## Decision tree:
##
## hemo <= 0.1577877: ckd (151)
## hemo > 0.1577877:
## :...sg <= -0.4097673: ckd (18)
## sg > -0.4097673:
## :...al > -0.8079858: ckd (2)
## al <= -0.8079858:
## :...pcv > 0.1992479: notckd (100/1)
## pcv <= 0.1992479:
## :...sc <= -0.2980658: notckd (7/1)
## sc > -0.2980658: ckd (2)
##
## ----- Trial 1: -----
##
## Decision tree:
##
## sc > -0.3140188: ckd (139.2)
## sc <= -0.3140188:
## :...dmyes > -0.6982936: ckd (43.8)
## dmyes <= -0.6982936:
## :...hemo <= 0.08559725: ckd (13.5)
## hemo > 0.08559725: notckd (83.4/4.5)
##
## ----- Trial 2: -----
##
## Decision tree:
##
## sg <= -0.4097673: ckd (133.3)
## sg > -0.4097673:
## :...peyes > -0.5157663: ckd (45)
## peyes <= -0.5157663:
## :...dmyes > -0.6982936: ckd (33.6)
## dmyes <= -0.6982936:
## :...hemo <= 0.08559725: ckd (7.4)
## hemo > 0.08559725: notckd (60.7/1.1)
##
## ----- Trial 3: -----
##
## Decision tree:
##
## sg > 0.4660712: notckd (26.8/5.5)
## sg <= 0.4660712:
## :...sc > -0.3140188: ckd (143.9)
## sc <= -0.3140188:
## :...hemo <= 1.168454: ckd (101.6/15.8)
## hemo > 1.168454: notckd (7.7)
##
##
## Evaluation on training data (280 cases):
##
## Trial Decision Tree
## ----- ----------------
## Size Errors
##
## 0 6 2( 0.7%)
## 1 4 6( 2.1%)
## 2 5 2( 0.7%)
## 3 4 50(17.9%)
## boost 0( 0.0%) <<
##
##
## (a) (b) <-classified as
## ---- ----
## 175 (a): class ckd
## 105 (b): class notckd
##
##
## Attribute usage:
##
## 100.00% sg
## 100.00% sc
## 100.00% hemo
## 57.86% dmyes
## 52.14% peyes
## 39.64% al
## 38.93% pcv
##
##
## Time: 0.0 secs
Prediction accuracy on test set
prop.table(table(data_test$class, predict(c5_fit, newdata=data_test)))
##
## ckd notckd
## ckd 0.616666667 0.008333333
## notckd 0.000000000 0.375000000
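prop.table hides the raw counts; confusionMatrix reports counts plus accuracy, sensitivity and specificity in one call. A sketch, output omitted:

# Full held-out summary for the C5.0 model (sketch; output omitted)
confusionMatrix(predict(c5_fit, newdata = data_test), data_test$class)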
glmnet_grid <- expand.grid(alpha = c(0, .1, .2, .4, .6, .8, 1),
                           lambda = seq(.01, .7, length = 50))
glmnet_ctrl <- trainControl(method = "cv", number = 10)
glmnet_fit <- train(class ~ ., data = data_train,
                    method = "glmnet",
                    preProcess = c("center", "scale"),
                    tuneGrid = glmnet_grid,
                    trControl = glmnet_ctrl)
# glmnet_fit
trellis.par.set(caretTheme())
plot(glmnet_fit, scales = list(x = list(log = 2)))
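The tuning plot is easier to read next to the winning parameter pair, which caret stores on the fit object. A quick check, output omitted:

# Cross-validation's chosen (alpha, lambda) pair
glmnet_fit$bestTune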

Lasso Regression, explicitly
# Scale the numeric columns of a data frame, leaving factor columns untouched
scale_df <- function(df) {
  numeric_cols <- sapply(df, is.numeric)
  cbind(scale(df[, numeric_cols]), df[, !numeric_cols])
}
scaled_data_train <- scale_df(data_train)
scaled_data_test <- scale_df(data_test)
One-hot encoding of the categorical variables
dummies_train <- model.matrix(class ~ ., data = scaled_data_train)[, -1]
dummies_test <- model.matrix(class ~ ., data = scaled_data_test)[, -1]
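As an aside, caret's dummyVars builds the same encoding; fullRank = TRUE mirrors model.matrix's drop-first-level behaviour. A sketch, not used below:

# Equivalent one-hot encoding via caret (sketch; not used below)
dv <- dummyVars(class ~ ., data = scaled_data_train, fullRank = TRUE)
dummies_train_alt <- predict(dv, newdata = scaled_data_train)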
Lasso Regression
glmmod <- glmnet(dummies_train, y = scaled_data_train$class, alpha = 1, family = "binomial")
plot(glmmod, xvar = "lambda")

# Cross-validate lambda. Note: cv.glmnet is run here with the default
# Gaussian family on the numerically coded class, not the binomial family
# used for the fits above.
cv.glmmod <- cv.glmnet(dummies_train,
                       y = as.numeric(scaled_data_train$class),
                       alpha = 1)
best_lambda <- cv.glmmod$lambda.min
plot(cv.glmmod)

best_lambda
## [1] 0.006693215
best <- glmnet(dummies_train, y = scaled_data_train$class,
               alpha = 1, lambda = best_lambda, family = "binomial")
coef(best)
## 25 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -2.46178531
## age 0.05575661
## bp -0.22020149
## sg 2.36504841
## bgr -0.26480060
## bu .
## sc .
## sod .
## pot .
## hemo 1.92252154
## pcv 0.86636638
## wbcc .
## rbcc 0.10583180
## al -0.73040931
## su .
## rbcnormal 0.35028323
## pcnormal .
## pccpresent .
## bapresent .
## htnyes -0.93995007
## dmyes -2.38953134
## cadyes .
## appetpoor -1.11524092
## peyes -2.96092954
## aneyes .
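The dots above are coefficients the lasso shrank to exactly zero. A small hypothetical helper makes the surviving predictors explicit (sketch; output omitted):

# Hypothetical helper: names of predictors with nonzero lasso coefficients
nonzero_coefs <- function(fit) {
  cf <- as.matrix(coef(fit))
  rownames(cf)[cf[, 1] != 0]
}
nonzero_coefs(best)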
ROC Curve
# Predicted probabilities for the test set at the CV-chosen lambda.
# (For a plain glmnet fit, s must be numeric; s = "lambda.min" only works
# on cv.glmnet objects.)
pred <- predict(best, newx = dummies_test, s = best_lambda, type = "response")
p <- ifelse(pred > .5, 1, 0)
# Confusion matrix
table(data_test$class, p)
## p
## 0 1
## ckd 74 1
## notckd 0 45
# As proportions
prop.table(table(data_test$class, p))
## p
## 0 1
## ckd 0.616666667 0.008333333
## notckd 0.000000000 0.375000000
# Build the ROC curve from the predicted probabilities, not the thresholded
# 0/1 labels, so the curve traces out all operating points
auroc <- prediction(as.vector(pred), data_test$class)
perform <- performance(auroc, "tpr", "fpr")
plot(perform, main = "ROC curve")
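ROCR will also report the area under this curve. A sketch; value omitted:

# AUC of the lasso model on the test set (value omitted)
performance(auroc, "auc")@y.values[[1]]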

Ridge Regression
glmmod <- glmnet(dummies_train, y = scaled_data_train$class, alpha = 0, family = "binomial")
plot(glmmod, xvar = "lambda")

# Cross-validate lambda (again with the default Gaussian family; see above)
cv.glmmod <- cv.glmnet(dummies_train,
                       y = as.numeric(scaled_data_train$class),
                       alpha = 0)
best_lambda <- cv.glmmod$lambda.min
plot(cv.glmmod)

best_lambda
## [1] 0.07695571
best <- glmnet(dummies_train, y = scaled_data_train$class,
               alpha = 0, lambda = best_lambda, family = "binomial")
coef(best)
## 25 x 1 sparse Matrix of class "dgCMatrix"
## s0
## (Intercept) -1.714437317
## age 0.138960765
## bp -0.148854571
## sg 0.657323342
## bgr -0.223004938
## bu -0.090894403
## sc -0.118180409
## sod 0.158821090
## pot 0.003293743
## hemo 0.545664058
## pcv 0.425251991
## wbcc -0.150781520
## rbcc 0.244404879
## al -0.420434073
## su -0.228121032
## rbcnormal 0.763023744
## pcnormal 0.311582626
## pccpresent -0.334198790
## bapresent -0.407339849
## htnyes -0.687686070
## dmyes -0.634485092
## cadyes -0.241727348
## appetpoor -0.698032225
## peyes -0.698109376
## aneyes -0.228965102
ROC Curve
# Predicted probabilities for the test set at the CV-chosen lambda
pred <- predict(best, newx = dummies_test, s = best_lambda, type = "response")
p <- ifelse(pred > .5, 1, 0)
# Confusion matrix
table(data_test$class, p)
## p
## 0 1
## ckd 72 3
## notckd 0 45
# As proportions
prop.table(table(data_test$class, p))
## p
## 0 1
## ckd 0.600 0.025
## notckd 0.000 0.375
# ROC curve from the predicted probabilities, as above
auroc <- prediction(as.vector(pred), data_test$class)
perform <- performance(auroc, "tpr", "fpr")
plot(perform, main = "ROC curve")
