Chronic Kidney Disease Analysis, analysis for personal experiment

1. EDA

Library upload

library(tidyverse)
library(caret)
library(ggplot2)
library(dplyr)
library(RWeka)
library(ROCR)
library(rpart)
library(rpart.plot)
library(rattle)

Upload data

CKD <- read.arff("D:/FREE-INDO/Chronic_Kidney_Disease/chronic_kidney_disease.arff") # ubah dg direktori anda
head(CKD,3)
##   age bp    sg al su    rbc     pc        pcc         ba bgr bu  sc sod pot
## 1  48 80 1.020  1  0   <NA> normal notpresent notpresent 121 36 1.2  NA  NA
## 2   7 50 1.020  4  0   <NA> normal notpresent notpresent  NA 18 0.8  NA  NA
## 3  62 80 1.010  2  3 normal normal notpresent notpresent 423 53 1.8  NA  NA
##   hemo pcv wbcc rbcc htn  dm cad appet pe ane class
## 1 15.4  44 7800  5.2 yes yes  no  good no  no   ckd
## 2 11.3  38 6000   NA  no  no  no  good no  no   ckd
## 3  9.6  31 7500   NA  no yes  no  poor no yes   ckd
dim(CKD)
## [1] 400  25
summary(CKD)
##       age              bp             sg         al         su     
##  Min.   : 2.00   Min.   : 50.00   1.005:  7   0   :199   0   :290  
##  1st Qu.:42.00   1st Qu.: 70.00   1.010: 84   1   : 44   1   : 13  
##  Median :55.00   Median : 80.00   1.015: 75   2   : 43   2   : 18  
##  Mean   :51.48   Mean   : 76.47   1.020:106   3   : 43   3   : 14  
##  3rd Qu.:64.50   3rd Qu.: 80.00   1.025: 81   4   : 24   4   : 13  
##  Max.   :90.00   Max.   :180.00   NA's : 47   5   :  1   5   :  3  
##  NA's   :9       NA's   :12                   NA's: 46   NA's: 49  
##        rbc             pc              pcc               ba           bgr     
##  normal  :201   normal  :259   present   : 42   present   : 22   Min.   : 22  
##  abnormal: 47   abnormal: 76   notpresent:354   notpresent:374   1st Qu.: 99  
##  NA's    :152   NA's    : 65   NA's      :  4   NA's      :  4   Median :121  
##                                                                  Mean   :148  
##                                                                  3rd Qu.:163  
##                                                                  Max.   :490  
##                                                                  NA's   :44   
##        bu               sc              sod             pot        
##  Min.   :  1.50   Min.   : 0.400   Min.   :  4.5   Min.   : 2.500  
##  1st Qu.: 27.00   1st Qu.: 0.900   1st Qu.:135.0   1st Qu.: 3.800  
##  Median : 42.00   Median : 1.300   Median :138.0   Median : 4.400  
##  Mean   : 57.43   Mean   : 3.072   Mean   :137.5   Mean   : 4.627  
##  3rd Qu.: 66.00   3rd Qu.: 2.800   3rd Qu.:142.0   3rd Qu.: 4.900  
##  Max.   :391.00   Max.   :76.000   Max.   :163.0   Max.   :47.000  
##  NA's   :19       NA's   :17       NA's   :87      NA's   :88      
##       hemo            pcv             wbcc            rbcc         htn     
##  Min.   : 3.10   Min.   : 9.00   Min.   : 2200   Min.   :2.100   yes :147  
##  1st Qu.:10.30   1st Qu.:32.00   1st Qu.: 6500   1st Qu.:3.900   no  :251  
##  Median :12.65   Median :40.00   Median : 8000   Median :4.800   NA's:  2  
##  Mean   :12.53   Mean   :38.88   Mean   : 8406   Mean   :4.707             
##  3rd Qu.:15.00   3rd Qu.:45.00   3rd Qu.: 9800   3rd Qu.:5.400             
##  Max.   :17.80   Max.   :54.00   Max.   :26400   Max.   :8.000             
##  NA's   :52      NA's   :71      NA's   :106     NA's   :131               
##     dm        cad       appet        pe        ane         class    
##  yes :137   yes : 34   good:317   yes : 76   yes : 60   ckd   :250  
##  no  :261   no  :364   poor: 82   no  :323   no  :339   notckd:150  
##  NA's:  2   NA's:  2   NA's:  1   NA's:  1   NA's:  1               
##                                                                     
##                                                                     
##                                                                     
## 

Check missing data(% missing data)

apply(CKD,2,function(i) {(sum(is.na(i))/nrow(CKD))*100})
##   age    bp    sg    al    su   rbc    pc   pcc    ba   bgr    bu    sc   sod 
##  2.25  3.00 11.75 11.50 12.25 38.00 16.25  1.00  1.00 11.00  4.75  4.25 21.75 
##   pot  hemo   pcv  wbcc  rbcc   htn    dm   cad appet    pe   ane class 
## 22.00 13.00 17.75 26.50 32.75  0.50  0.50  0.50  0.25  0.25  0.25  0.00

Sample no missing data

sum(complete.cases(CKD))
## [1] 158

Complete missing data, remove columns data >= 25% of missing data

sum(complete.cases(CKD[,-c(6,17,18)]))
## [1] 209

Remove missing data in observation row

CKD2 <- CKD[complete.cases(CKD),]
dim(CKD2)
## [1] 158  25
apply(CKD2[,c(3:9,19:25)],2,table)
## $sg
## 
## 1.005 1.010 1.015 1.020 1.025 
##     3    23    10    61    61 
## 
## $al
## 
##   0   1   2   3   4 
## 116   3   9  15  15 
## 
## $su
## 
##   0   1   2   3   4   5 
## 140   6   6   3   2   1 
## 
## $rbc
## 
## abnormal   normal 
##       18      140 
## 
## $pc
## 
## abnormal   normal 
##       29      129 
## 
## $pcc
## 
## notpresent    present 
##        144         14 
## 
## $ba
## 
## notpresent    present 
##        146         12 
## 
## $htn
## 
##  no yes 
## 124  34 
## 
## $dm
## 
##  no yes 
## 130  28 
## 
## $cad
## 
##  no yes 
## 147  11 
## 
## $appet
## 
## good poor 
##  139   19 
## 
## $pe
## 
##  no yes 
## 138  20 
## 
## $ane
## 
##  no yes 
## 142  16 
## 
## $class
## 
##    ckd notckd 
##     43    115

2. Create dummy for categorical data

creat_dummy_var_data <- function(dataset){
  dummy_variables <- dummyVars(~., data=dataset, fullRank=T)
  dummy_var_data <- data.frame( predict(dummy_variables, newdata=dataset) )
  return(dummy_var_data)
}

2.1 Split data to train data and tes data

create_training_test <- function(features_dataset,outcome_data,training_test_ratio){
  training_index <- createDataPartition(outcome_data,p=training_test_ratio,list=F)
  
  training_set <- droplevels(features_dataset[training_index,])
  test_set <- droplevels(features_dataset[-training_index,])
  
  outcome_training_set <- factor(outcome_data[training_index])
  outcome_test_set <- factor(outcome_data[-training_index])
  
  return(list(training_features=training_set, test_features=test_set, training_outcome=outcome_training_set, test_outcome=outcome_test_set))
}
# Funsi Pra proses data

remove_nonvaring_collinear_features <- function(training_data,test_data,corr_theshold=0.75){
  
  near_zero_covariates <- colnames(training_data)[nearZeroVar(training_data)]
  
  if(length(near_zero_covariates)>0)
  {
    # cari variable yang corelasinya kecil atau mendekati nol
    nzc_indices_training <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data))})
    training_data_nzc <- training_data[,-nzc_indices_training]
    
    nzc_indices_test <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data))})  
    test_data_nzc <- test_data[,-nzc_indices_test]
  } else {
    training_data_nzc <- training_data
    test_data_nzc <- test_data
  }

  # Tampilkan semua korelasi 
  feature_correlation <- cor(training_data_nzc)
  # Menghapus variable autocorelation
  high_correlation <- findCorrelation(feature_correlation,corr_theshold,verbose=F,names=T)

  if(length(high_correlation)>0)
  {
    correlated_indices_training <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data_nzc))} )
    final_training_data <- training_data_nzc[,-correlated_indices_training]
    
    correlated_indices_test <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data_nzc))} )
    final_test_data <- test_data_nzc[,-correlated_indices_test]
  }else{
    final_training_data <- training_data_nzc
    final_test_data <- test_data_nzc
  }
  
  return(list(processed_training_set=final_training_data, processed_test_set=final_test_data))
}

2.2 Dummy variable execution

Konversi variable categori menjadi variabel dummy, kecuali variable outcome

ID_outcome<- grep("class",colnames(CKD2))
Dummy_variables <- creat_dummy_var_data(CKD2[,-ID_outcome])

2.3 Data split

set.seed(135)
split_data <- create_training_test(Dummy_variables,CKD2$class,0.6)
lapply(split_data,head)
## $training_features
##    age bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5 su.1
## 4   48 70        0        0        0        0    0    0    0    1    0    0
## 15  68 80        1        0        0        0    0    0    1    0    0    0
## 21  61 80        0        1        0        0    0    1    0    0    0    0
## 23  48 80        0        0        0        1    0    0    0    1    0    0
## 28  69 70        1        0        0        0    0    0    1    0    0    0
## 49  73 70        0        0        0        0    0    0    0    0    0    0
##    su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pcc.notpresent ba.notpresent
## 4     0    0    0    0            0           1              0             1
## 15    1    0    0    0            0           1              0             0
## 21    0    0    0    0            1           1              1             1
## 23    0    0    0    0            0           1              1             1
## 28    0    0    1    0            0           1              1             1
## 49    0    0    0    0            0           0              1             1
##    bgr  bu  sc sod pot hemo pcv  wbcc rbcc htn.no dm.no cad.no appet.poor pe.no
## 4  117  56 3.8 111 2.5 11.2  32  6700  3.9      0     1      1          1     0
## 15 157  90 4.1 130 6.4  5.6  16 11000  2.6      0     0      0          1     0
## 21 173 148 3.9 135 5.2  7.7  24  9200  3.2      0     0      0          1     0
## 23  95 163 7.7 136 3.8  9.8  32  6900  3.4      0     1      1          0     1
## 28 264  87 2.7 130 4.0 12.5  37  9600  4.1      0     0      0          0     0
## 49  70  32 0.9 125 4.0 10.0  29 18900  3.5      0     0      1          0     0
##    ane.no
## 4       0
## 15      1
## 21      0
## 23      0
## 28      1
## 49      1
## 
## $test_features
##    age  bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5 su.1
## 10  53  90        0        0        1        0    0    1    0    0    0    0
## 12  63  70        1        0        0        0    0    0    1    0    0    0
## 75  56  90        0        1        0        0    0    1    0    0    0    0
## 85  59  70        1        0        0        0    0    0    1    0    0    0
## 91  63 100        1        0        0        0    0    1    0    0    0    0
## 93  71  70        1        0        0        0    0    0    1    0    0    0
##    su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pcc.notpresent ba.notpresent
## 10    0    0    0    0            1           1              0             1
## 12    0    0    0    0            1           1              0             1
## 75    0    0    0    0            1           1              1             1
## 85    0    0    0    0            0           1              1             1
## 91    1    0    0    0            0           0              1             0
## 93    0    0    0    0            0           1              0             0
##    bgr  bu   sc sod pot hemo pcv  wbcc rbcc htn.no dm.no cad.no appet.poor
## 10  70 107  7.2 114 3.7  9.5  29 12100  3.7      0     0      1          1
## 12 380  60  2.7 131 4.2 10.8  32  4500  3.8      0     0      1          1
## 75 129 107  6.7 131 4.8  9.1  29  6400  3.4      0     1      1          0
## 85  76 186 15.0 135 7.6  7.1  22  3800  2.1      0     1      1          1
## 91 280  35  3.2 143 3.5 13.0  40  9800  4.2      0     1      0          0
## 93 219  82  3.6 133 4.4 10.4  33  5600  3.6      0     0      0          0
##    pe.no ane.no
## 10     1      0
## 12     0      1
## 75     1      1
## 85     0      0
## 91     1      1
## 93     1      1
## 
## $training_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
## 
## $test_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
lapply(split_data[1:2],dim)
## $training_features
## [1] 95 35
## 
## $test_features
## [1] 63 35
processed_data <- remove_nonvaring_collinear_features(split_data$training_features,split_data$test_features,0.60)#0.75

final_training_set <- processed_data$processed_training_set
final_test_set <- processed_data$processed_test_set

training_output <- split_data$training_outcome
test_output <- split_data$test_outcome

2.3 Chart PCA

2.4 Detail variable vs outcome

Detail masing masing variable thd outcome/variable respons

3. Model

3.1 Validation 5 times repetition

Tcontrol <- trainControl(method="cv", number=5, savePredictions = T,classProbs =  TRUE)

3.2 SVM-linier

Simetrical Uncertainty(SU) model linier
Tcontrol <- trainControl(method="cv", number=5, savePredictions = T,classProbs =  TRUE)

set.seed(101)
Model_lm <- train(y=training_output, 
                      x=final_training_set, 
                      trControl=Tcontrol, 
                      method = "svmLinear",
                      preProcess = c("center", "scale","pca"))

3.3 SVM rbf kernel

Use multivariate data/radial kernel
Tcontrol <- trainControl(method="cv", number=5, savePredictions = T,classProbs =  TRUE)

set.seed(202)
Model_rf <- train(y=training_output, 
        x=final_training_set, 
        trControl=Tcontrol, 
        method = "svmRadial", 
        tuneLength=10, 
        preProcess = c("center", "scale","pca"))

3.4 Accuracy and Prediction

3.4.1 Accuracy and Prediction SVM-linier
Pred_Model_lm <- predict(Model_lm, newdata=final_test_set)
confusionMatrix(data=Pred_Model_lm, reference=test_output)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ckd notckd
##     ckd     16      0
##     notckd   1     46
##                                           
##                Accuracy : 0.9841          
##                  95% CI : (0.9147, 0.9996)
##     No Information Rate : 0.7302          
##     P-Value [Acc > NIR] : 6.034e-08       
##                                           
##                   Kappa : 0.959           
##                                           
##  Mcnemar's Test P-Value : 1               
##                                           
##             Sensitivity : 0.9412          
##             Specificity : 1.0000          
##          Pos Pred Value : 1.0000          
##          Neg Pred Value : 0.9787          
##              Prevalence : 0.2698          
##          Detection Rate : 0.2540          
##    Detection Prevalence : 0.2540          
##       Balanced Accuracy : 0.9706          
##                                           
##        'Positive' Class : ckd             
## 
3.4.2 Accuracy and Prediction SVM rbf kernel
Pred_Model_rf  <- predict(Model_rf, newdata=final_test_set)
confusionMatrix(data=Pred_Model_rf, reference=test_output)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ckd notckd
##     ckd     17      0
##     notckd   0     46
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.7302     
##     P-Value [Acc > NIR] : 2.485e-09  
##                                      
##                   Kappa : 1          
##                                      
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.2698     
##          Detection Rate : 0.2698     
##    Detection Prevalence : 0.2698     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : ckd        
## 

4. Important variable rank

Mengambarkan rangking variable important