data <- read.csv("~/Dropbox/DKE/Machine Learning/Master Assignment/Chronic_Kidney_Disease/Imputed_DAta.csv")
require(caret)
require(glmnet)
require(ROCR)
seed <- 666
set.seed(seed)  # without this, createDataPartition is not reproducible
classes <- data[, "class"]
train_set <- createDataPartition(classes, p = 0.7, list = FALSE)

data_train <- data[train_set, ]
data_test <- data[-train_set, ]
ctrl <- trainControl(method = "cv", number = 10)

Naive Bayes

nb_fit <- train(class ~ ., data = data_train,
                method = "nb",
                trControl = ctrl,
                metric = "Accuracy",
                importance = TRUE)
nb_fit
## Naive Bayes 
## 
## 280 samples
##  24 predictor
##   2 classes: 'ckd', 'notckd' 
## 
## No pre-processing
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 252, 253, 251, 253, 252, 253, ... 
## Resampling results across tuning parameters:
## 
##   usekernel  Accuracy   Kappa    
##   FALSE            NaN        NaN
##    TRUE      0.9893997  0.9780727
## 
## Tuning parameter 'fL' was held constant at a value of 0
## Tuning
##  parameter 'adjust' was held constant at a value of 1
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were fL = 0, usekernel = TRUE
##  and adjust = 1.
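
The C5.0 model is scored on the held-out split further down; the Naive Bayes fit can be checked the same way. A minimal sketch, reusing the `data_test` split from above (`nb_pred` is an illustrative name):

# Held-out confusion proportions for the Naive Bayes fit
nb_pred <- predict(nb_fit, newdata = data_test)
prop.table(table(data_test$class, nb_pred))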

C5.0

c50Grid <- expand.grid(.trials = c(1:9, (1:10)*10),
                       .model = c("tree", "rules"),
                       .winnow = c(TRUE, FALSE))

c5_fit <- train(class ~ ., data = data_train,
                method = "C5.0",
                tuneGrid = c50Grid,
                trControl = ctrl,
                metric = "Accuracy",
                preProcess = c("center", "scale"),
                importance = TRUE)
c5_fit
## C5.0 
## 
## 280 samples
##  24 predictor
##   2 classes: 'ckd', 'notckd' 
## 
## Pre-processing: centered (24), scaled (24) 
## Resampling: Cross-Validated (10 fold) 
## Summary of sample sizes: 252, 251, 251, 253, 252, 252, ... 
## Resampling results across tuning parameters:
## 
##   model  winnow  trials  Accuracy   Kappa    
##   rules  FALSE     1     0.9678480  0.9312090
##   rules  FALSE     2     0.9714194  0.9385774
##   rules  FALSE     3     0.9821429  0.9624409
##   rules  FALSE     4     0.9857143  0.9700496
##   rules  FALSE     5     0.9822660  0.9628536
##   rules  FALSE     6     0.9857143  0.9700496
##   rules  FALSE     7     0.9857143  0.9700496
##   rules  FALSE     8     0.9857143  0.9700496
##   rules  FALSE     9     0.9857143  0.9700496
##   rules  FALSE    10     0.9857143  0.9700496
##   rules  FALSE    20     0.9857143  0.9700496
##   rules  FALSE    30     0.9857143  0.9700496
##   rules  FALSE    40     0.9857143  0.9700496
##   rules  FALSE    50     0.9857143  0.9700496
##   rules  FALSE    60     0.9857143  0.9700496
##   rules  FALSE    70     0.9857143  0.9700496
##   rules  FALSE    80     0.9857143  0.9700496
##   rules  FALSE    90     0.9857143  0.9700496
##   rules  FALSE   100     0.9857143  0.9700496
##   rules   TRUE     1     0.9646552  0.9257833
##   rules   TRUE     2     0.9716749  0.9403439
##   rules   TRUE     3     0.9751140  0.9481184
##   rules   TRUE     4     0.9681034  0.9323039
##   rules   TRUE     5     0.9610837  0.9167090
##   rules   TRUE     6     0.9646552  0.9256316
##   rules   TRUE     7     0.9575123  0.9093525
##   rules   TRUE     8     0.9536855  0.8997877
##   rules   TRUE     9     0.9716749  0.9404298
##   rules   TRUE    10     0.9608283  0.9159567
##   rules   TRUE    20     0.9609606  0.9167002
##   rules   TRUE    30     0.9716749  0.9403439
##   rules   TRUE    40     0.9716749  0.9403439
##   rules   TRUE    50     0.9716749  0.9403439
##   rules   TRUE    60     0.9716749  0.9403439
##   rules   TRUE    70     0.9716749  0.9403439
##   rules   TRUE    80     0.9716749  0.9403439
##   rules   TRUE    90     0.9716749  0.9403439
##   rules   TRUE   100     0.9716749  0.9403439
##   tree   FALSE     1     0.9642766  0.9236080
##   tree   FALSE     2     0.9821429  0.9618586
##   tree   FALSE     3     0.9821429  0.9624409
##   tree   FALSE     4     0.9892857  0.9774142
##   tree   FALSE     5     0.9857143  0.9700458
##   tree   FALSE     6     0.9857143  0.9700458
##   tree   FALSE     7     0.9822660  0.9628497
##   tree   FALSE     8     0.9857143  0.9700458
##   tree   FALSE     9     0.9857143  0.9700458
##   tree   FALSE    10     0.9857143  0.9700458
##   tree   FALSE    20     0.9857143  0.9700458
##   tree   FALSE    30     0.9857143  0.9700458
##   tree   FALSE    40     0.9857143  0.9700458
##   tree   FALSE    50     0.9857143  0.9700458
##   tree   FALSE    60     0.9857143  0.9700458
##   tree   FALSE    70     0.9857143  0.9700458
##   tree   FALSE    80     0.9857143  0.9700458
##   tree   FALSE    90     0.9857143  0.9700458
##   tree   FALSE   100     0.9857143  0.9700458
##   tree    TRUE     1     0.9646552  0.9257833
##   tree    TRUE     2     0.9716749  0.9403439
##   tree    TRUE     3     0.9751140  0.9481184
##   tree    TRUE     4     0.9752463  0.9482907
##   tree    TRUE     5     0.9610837  0.9167090
##   tree    TRUE     6     0.9681034  0.9330733
##   tree    TRUE     7     0.9609606  0.9167943
##   tree    TRUE     8     0.9681034  0.9330733
##   tree    TRUE     9     0.9752463  0.9482907
##   tree    TRUE    10     0.9716749  0.9409262
##   tree    TRUE    20     0.9681034  0.9329793
##   tree    TRUE    30     0.9716749  0.9403439
##   tree    TRUE    40     0.9681034  0.9329793
##   tree    TRUE    50     0.9681034  0.9329793
##   tree    TRUE    60     0.9681034  0.9329793
##   tree    TRUE    70     0.9681034  0.9329793
##   tree    TRUE    80     0.9681034  0.9329793
##   tree    TRUE    90     0.9681034  0.9329793
##   tree    TRUE   100     0.9681034  0.9329793
## 
## Accuracy was used to select the optimal model using  the largest value.
## The final values used for the model were trials = 4, model = tree
##  and winnow = FALSE.
plot(c5_fit)

predictors(c5_fit)
## [1] "hemo"  "sc"    "sg"    "dmyes" "al"    "pcv"   "peyes"
summary(c5_fit$finalModel)
## 
## Call:
## C5.0.default(x = structure(c(-0.141874822894331,
##  "fuzzyThreshold", "sample", "earlyStopping", "label",
##  "seed")), importance = TRUE)
## 
## 
## C5.0 [Release 2.07 GPL Edition]      Sun Jan 22 20:24:19 2017
## -------------------------------
## 
## Class specified by attribute `outcome'
## 
## Read 280 cases (25 attributes) from undefined.data
## 
## -----  Trial 0:  -----
## 
## Decision tree:
## 
## hemo <= 0.1577877: ckd (151)
## hemo > 0.1577877:
## :...sg <= -0.4097673: ckd (18)
##     sg > -0.4097673:
##     :...al > -0.8079858: ckd (2)
##         al <= -0.8079858:
##         :...pcv > 0.1992479: notckd (100/1)
##             pcv <= 0.1992479:
##             :...sc <= -0.2980658: notckd (7/1)
##                 sc > -0.2980658: ckd (2)
## 
## -----  Trial 1:  -----
## 
## Decision tree:
## 
## sc > -0.3140188: ckd (139.2)
## sc <= -0.3140188:
## :...dmyes > -0.6982936: ckd (43.8)
##     dmyes <= -0.6982936:
##     :...hemo <= 0.08559725: ckd (13.5)
##         hemo > 0.08559725: notckd (83.4/4.5)
## 
## -----  Trial 2:  -----
## 
## Decision tree:
## 
## sg <= -0.4097673: ckd (133.3)
## sg > -0.4097673:
## :...peyes > -0.5157663: ckd (45)
##     peyes <= -0.5157663:
##     :...dmyes > -0.6982936: ckd (33.6)
##         dmyes <= -0.6982936:
##         :...hemo <= 0.08559725: ckd (7.4)
##             hemo > 0.08559725: notckd (60.7/1.1)
## 
## -----  Trial 3:  -----
## 
## Decision tree:
## 
## sg > 0.4660712: notckd (26.8/5.5)
## sg <= 0.4660712:
## :...sc > -0.3140188: ckd (143.9)
##     sc <= -0.3140188:
##     :...hemo <= 1.168454: ckd (101.6/15.8)
##         hemo > 1.168454: notckd (7.7)
## 
## 
## Evaluation on training data (280 cases):
## 
## Trial        Decision Tree   
## -----      ----------------  
##    Size      Errors  
## 
##    0      6    2( 0.7%)
##    1      4    6( 2.1%)
##    2      5    2( 0.7%)
##    3      4   50(17.9%)
## boost              0( 0.0%)   <<
## 
## 
##     (a)   (b)    <-classified as
##    ----  ----
##     175          (a): class ckd
##           105    (b): class notckd
## 
## 
##  Attribute usage:
## 
##  100.00% sg
##  100.00% sc
##  100.00% hemo
##   57.86% dmyes
##   52.14% peyes
##   39.64% al
##   38.93% pcv
## 
## 
## Time: 0.0 secs
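
Both `train` calls pass `importance = TRUE`, so caret can also rank the predictors rather than just list them. A quick sketch using caret's `varImp`:

# Per-variable importance for the final boosted model
varImp(c5_fit)
plot(varImp(c5_fit), top = 10)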

Prediction accuracy on test set

prop.table(table(data_test$class, predict(c5_fit, newdata=data_test)))
##         
##                  ckd      notckd
##   ckd    0.616666667 0.008333333
##   notckd 0.000000000 0.375000000
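
Beyond raw proportions, caret's `confusionMatrix` reports sensitivity, specificity, and kappa in a single call; a minimal sketch on the same split:

# Full confusion-matrix summary for the C5.0 test-set predictions
confusionMatrix(predict(c5_fit, newdata = data_test), data_test$class)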

Elastic net via caret

glmnet_grid <- expand.grid(alpha = c(0, .1, .2, .4, .6, .8, 1),
                           lambda = seq(.01, .7, length = 50))
glmnet_ctrl <- trainControl(method = "cv", number = 10)
glmnet_fit <- train(class ~ ., data = data_train,
                    method = "glmnet",
                    preProcess = c("center", "scale"),
                    tuneGrid = glmnet_grid,
                    trControl = glmnet_ctrl)
# glmnet_fit  # printing the full 350-row tuning grid is lengthy; see the plot below
trellis.par.set(caretTheme())
plot(glmnet_fit, scales = list(x = list(log = 2)))
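
The winning (alpha, lambda) pair is stored on the `train` object, and the fit can be scored on the held-out split like the other models; a short sketch:

# Selected elastic-net hyperparameters
glmnet_fit$bestTune
# Held-out confusion proportions
prop.table(table(data_test$class, predict(glmnet_fit, newdata = data_test)))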

Lasso Regression, explicitly

# Scale the numeric columns of a data frame; non-numeric columns pass through
scale_df <- function(df){
  numeric_cols <- sapply(df, is.numeric)
  cbind(scale(df[, numeric_cols]), df[, !numeric_cols])
}

scaled_data_train <- scale_df(data_train)
scaled_data_test <- scale_df(data_test)
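
Note that `scale_df` standardizes each split with its own means and standard deviations. A leakage-free alternative, sketched below with illustrative names (`pp`, `train_pp`, `test_pp`), derives the parameters from the training split only via caret's `preProcess`:

# Fit centering/scaling on the training split, then apply it to both splits
pp <- preProcess(data_train, method = c("center", "scale"))
train_pp <- predict(pp, data_train)
test_pp <- predict(pp, data_test)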

One-hot encoding of categorical variables

# model.matrix expands the factors to dummy columns; drop the intercept column
dummies_train <- model.matrix(class ~ ., data = scaled_data_train)[, -1]
dummies_test <- model.matrix(class ~ ., data = scaled_data_test)[, -1]

Lasso Regression

glmmod <- glmnet(dummies_train, y = scaled_data_train$class,
                 alpha = 1, family = "binomial")
plot(glmmod, xvar = "lambda")

# Cross-validate lambda; the outcome is binary, so use the binomial family here too
cv.glmmod <- cv.glmnet(dummies_train,
                       y = scaled_data_train$class,
                       alpha = 1, family = "binomial")
best_lambda <- cv.glmmod$lambda.min

plot(cv.glmmod)

best_lambda
## [1] 0.006693215
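`lambda.min` minimizes the cross-validated error; a common, more conservative alternative is the one-standard-error rule:

# Largest lambda within one standard error of the minimum CV error
cv.glmmod$lambda.1se

The fit below sticks with lambda.min.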
best <- glmnet(dummies_train, y=scaled_data_train$class,
               alpha=1, lambda = best_lambda, family = "binomial")
coef(best)
## 25 x 1 sparse Matrix of class "dgCMatrix"
##                      s0
## (Intercept) -2.46178531
## age          0.05575661
## bp          -0.22020149
## sg           2.36504841
## bgr         -0.26480060
## bu           .         
## sc           .         
## sod          .         
## pot          .         
## hemo         1.92252154
## pcv          0.86636638
## wbcc         .         
## rbcc         0.10583180
## al          -0.73040931
## su           .         
## rbcnormal    0.35028323
## pcnormal     .         
## pccpresent   .         
## bapresent    .         
## htnyes      -0.93995007
## dmyes       -2.38953134
## cadyes       .         
## appetpoor   -1.11524092
## peyes       -2.96092954
## aneyes       .
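
For reading the signs: with `family = "binomial"`, glmnet models the log-odds of the second factor level, so negative coefficients push predictions toward the first level. A quick check of the level order:

# The second level is the class whose log-odds the coefficients describe
levels(scaled_data_train$class)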

ROC Curve

# Predict class probabilities for the test set; `best` was fit at the single
# value best_lambda, so no s = "lambda.min" argument is needed (that only
# applies to cv.glmnet objects)
pred <- predict(best, newx = dummies_test, type = "response")

p <- ifelse(pred > .5, 1, 0)
# Confusion matrix (counts)
table(data_test$class, p)
##         p
##           0  1
##   ckd    74  1
##   notckd  0 45
# The same table as proportions
prop.table(table(data_test$class, p))
##         p
##                    0           1
##   ckd    0.616666667 0.008333333
##   notckd 0.000000000 0.375000000
# Build the ROC curve from the continuous scores, not the thresholded labels
auroc <- prediction(as.numeric(pred), data_test$class)
perform <- performance(auroc, "tpr", "fpr")
plot(perform, main = 'ROC curve')
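
The scalar AUC can be read off the same ROCR objects with a one-liner:

# Area under the ROC curve
performance(auroc, "auc")@y.values[[1]]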

Ridge Regression

glmmod <- glmnet(dummies_train, y = scaled_data_train$class,
                 alpha = 0, family = "binomial")
plot(glmmod, xvar = "lambda")

# Cross-validate lambda, again with the binomial family
cv.glmmod <- cv.glmnet(dummies_train,
                       y = scaled_data_train$class,
                       alpha = 0, family = "binomial")
best_lambda <- cv.glmmod$lambda.min

plot(cv.glmmod)

best_lambda
## [1] 0.07695571
best <- glmnet(dummies_train, y=scaled_data_train$class,
               alpha=0, lambda = best_lambda, family = "binomial")
coef(best)
## 25 x 1 sparse Matrix of class "dgCMatrix"
##                       s0
## (Intercept) -1.714437317
## age          0.138960765
## bp          -0.148854571
## sg           0.657323342
## bgr         -0.223004938
## bu          -0.090894403
## sc          -0.118180409
## sod          0.158821090
## pot          0.003293743
## hemo         0.545664058
## pcv          0.425251991
## wbcc        -0.150781520
## rbcc         0.244404879
## al          -0.420434073
## su          -0.228121032
## rbcnormal    0.763023744
## pcnormal     0.311582626
## pccpresent  -0.334198790
## bapresent   -0.407339849
## htnyes      -0.687686070
## dmyes       -0.634485092
## cadyes      -0.241727348
## appetpoor   -0.698032225
## peyes       -0.698109376
## aneyes      -0.228965102

ROC Curve

# Predict class probabilities for the test set, as in the lasso section
pred <- predict(best, newx = dummies_test, type = "response")

p <- ifelse(pred > .5, 1, 0)
# Confusion matrix (counts)
table(data_test$class, p)
##         p
##           0  1
##   ckd    72  3
##   notckd  0 45
# The same table as proportions
prop.table(table(data_test$class, p))
##         p
##              0     1
##   ckd    0.600 0.025
##   notckd 0.000 0.375
# As before, feed ROCR the continuous scores rather than the 0/1 labels
auroc <- prediction(as.numeric(pred), data_test$class)
perform <- performance(auroc, "tpr", "fpr")
plot(perform, main = 'ROC curve')