Chronic Kidney Disease Analysis, analysis for personal experiment
library(tidyverse)
library(caret)
library(ggplot2)
library(dplyr)
library(RWeka)
library(ROCR)
library(rpart)
library(rpart.plot)
library(rattle)
CKD <- read.arff("D:/FREE-INDO/Chronic_Kidney_Disease/chronic_kidney_disease.arff") # ubah dg direktori anda
head(CKD,3)
## age bp sg al su rbc pc pcc ba bgr bu sc sod pot
## 1 48 80 1.020 1 0 <NA> normal notpresent notpresent 121 36 1.2 NA NA
## 2 7 50 1.020 4 0 <NA> normal notpresent notpresent NA 18 0.8 NA NA
## 3 62 80 1.010 2 3 normal normal notpresent notpresent 423 53 1.8 NA NA
## hemo pcv wbcc rbcc htn dm cad appet pe ane class
## 1 15.4 44 7800 5.2 yes yes no good no no ckd
## 2 11.3 38 6000 NA no no no good no no ckd
## 3 9.6 31 7500 NA no yes no poor no yes ckd
dim(CKD)
## [1] 400 25
summary(CKD)
## age bp sg al su
## Min. : 2.00 Min. : 50.00 1.005: 7 0 :199 0 :290
## 1st Qu.:42.00 1st Qu.: 70.00 1.010: 84 1 : 44 1 : 13
## Median :55.00 Median : 80.00 1.015: 75 2 : 43 2 : 18
## Mean :51.48 Mean : 76.47 1.020:106 3 : 43 3 : 14
## 3rd Qu.:64.50 3rd Qu.: 80.00 1.025: 81 4 : 24 4 : 13
## Max. :90.00 Max. :180.00 NA's : 47 5 : 1 5 : 3
## NA's :9 NA's :12 NA's: 46 NA's: 49
## rbc pc pcc ba bgr
## normal :201 normal :259 present : 42 present : 22 Min. : 22
## abnormal: 47 abnormal: 76 notpresent:354 notpresent:374 1st Qu.: 99
## NA's :152 NA's : 65 NA's : 4 NA's : 4 Median :121
## Mean :148
## 3rd Qu.:163
## Max. :490
## NA's :44
## bu sc sod pot
## Min. : 1.50 Min. : 0.400 Min. : 4.5 Min. : 2.500
## 1st Qu.: 27.00 1st Qu.: 0.900 1st Qu.:135.0 1st Qu.: 3.800
## Median : 42.00 Median : 1.300 Median :138.0 Median : 4.400
## Mean : 57.43 Mean : 3.072 Mean :137.5 Mean : 4.627
## 3rd Qu.: 66.00 3rd Qu.: 2.800 3rd Qu.:142.0 3rd Qu.: 4.900
## Max. :391.00 Max. :76.000 Max. :163.0 Max. :47.000
## NA's :19 NA's :17 NA's :87 NA's :88
## hemo pcv wbcc rbcc htn
## Min. : 3.10 Min. : 9.00 Min. : 2200 Min. :2.100 yes :147
## 1st Qu.:10.30 1st Qu.:32.00 1st Qu.: 6500 1st Qu.:3.900 no :251
## Median :12.65 Median :40.00 Median : 8000 Median :4.800 NA's: 2
## Mean :12.53 Mean :38.88 Mean : 8406 Mean :4.707
## 3rd Qu.:15.00 3rd Qu.:45.00 3rd Qu.: 9800 3rd Qu.:5.400
## Max. :17.80 Max. :54.00 Max. :26400 Max. :8.000
## NA's :52 NA's :71 NA's :106 NA's :131
## dm cad appet pe ane class
## yes :137 yes : 34 good:317 yes : 76 yes : 60 ckd :250
## no :261 no :364 poor: 82 no :323 no :339 notckd:150
## NA's: 2 NA's: 2 NA's: 1 NA's: 1 NA's: 1
##
##
##
##
apply(CKD,2,function(i) {(sum(is.na(i))/nrow(CKD))*100})
## age bp sg al su rbc pc pcc ba bgr bu sc sod
## 2.25 3.00 11.75 11.50 12.25 38.00 16.25 1.00 1.00 11.00 4.75 4.25 21.75
## pot hemo pcv wbcc rbcc htn dm cad appet pe ane class
## 22.00 13.00 17.75 26.50 32.75 0.50 0.50 0.50 0.25 0.25 0.25 0.00
sum(complete.cases(CKD))
## [1] 158
sum(complete.cases(CKD[,-c(6,17,18)]))
## [1] 209
CKD2 <- CKD[complete.cases(CKD),]
dim(CKD2)
## [1] 158 25
apply(CKD2[,c(3:9,19:25)],2,table)
## $sg
##
## 1.005 1.010 1.015 1.020 1.025
## 3 23 10 61 61
##
## $al
##
## 0 1 2 3 4
## 116 3 9 15 15
##
## $su
##
## 0 1 2 3 4 5
## 140 6 6 3 2 1
##
## $rbc
##
## abnormal normal
## 18 140
##
## $pc
##
## abnormal normal
## 29 129
##
## $pcc
##
## notpresent present
## 144 14
##
## $ba
##
## notpresent present
## 146 12
##
## $htn
##
## no yes
## 124 34
##
## $dm
##
## no yes
## 130 28
##
## $cad
##
## no yes
## 147 11
##
## $appet
##
## good poor
## 139 19
##
## $pe
##
## no yes
## 138 20
##
## $ane
##
## no yes
## 142 16
##
## $class
##
## ckd notckd
## 43 115
creat_dummy_var_data <- function(dataset){
dummy_variables <- dummyVars(~., data=dataset, fullRank=T)
dummy_var_data <- data.frame( predict(dummy_variables, newdata=dataset) )
return(dummy_var_data)
}
create_training_test <- function(features_dataset,outcome_data,training_test_ratio){
training_index <- createDataPartition(outcome_data,p=training_test_ratio,list=F)
training_set <- droplevels(features_dataset[training_index,])
test_set <- droplevels(features_dataset[-training_index,])
outcome_training_set <- factor(outcome_data[training_index])
outcome_test_set <- factor(outcome_data[-training_index])
return(list(training_features=training_set, test_features=test_set, training_outcome=outcome_training_set, test_outcome=outcome_test_set))
}
# Funsi Pra proses data
remove_nonvaring_collinear_features <- function(training_data,test_data,corr_theshold=0.75){
near_zero_covariates <- colnames(training_data)[nearZeroVar(training_data)]
if(length(near_zero_covariates)>0)
{
# cari variable yang corelasinya kecil atau mendekati nol
nzc_indices_training <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data))})
training_data_nzc <- training_data[,-nzc_indices_training]
nzc_indices_test <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data))})
test_data_nzc <- test_data[,-nzc_indices_test]
} else {
training_data_nzc <- training_data
test_data_nzc <- test_data
}
# Tampilkan semua korelasi
feature_correlation <- cor(training_data_nzc)
# Menghapus variable autocorelation
high_correlation <- findCorrelation(feature_correlation,corr_theshold,verbose=F,names=T)
if(length(high_correlation)>0)
{
correlated_indices_training <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data_nzc))} )
final_training_data <- training_data_nzc[,-correlated_indices_training]
correlated_indices_test <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data_nzc))} )
final_test_data <- test_data_nzc[,-correlated_indices_test]
}else{
final_training_data <- training_data_nzc
final_test_data <- test_data_nzc
}
return(list(processed_training_set=final_training_data, processed_test_set=final_test_data))
}
Konversi variable categori menjadi variabel dummy, kecuali variable outcome
ID_outcome<- grep("class",colnames(CKD2))
Dummy_variables <- creat_dummy_var_data(CKD2[,-ID_outcome])
set.seed(135)
split_data <- create_training_test(Dummy_variables,CKD2$class,0.6)
lapply(split_data,head)
## $training_features
## age bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5 su.1
## 4 48 70 0 0 0 0 0 0 0 1 0 0
## 15 68 80 1 0 0 0 0 0 1 0 0 0
## 21 61 80 0 1 0 0 0 1 0 0 0 0
## 23 48 80 0 0 0 1 0 0 0 1 0 0
## 28 69 70 1 0 0 0 0 0 1 0 0 0
## 49 73 70 0 0 0 0 0 0 0 0 0 0
## su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pcc.notpresent ba.notpresent
## 4 0 0 0 0 0 1 0 1
## 15 1 0 0 0 0 1 0 0
## 21 0 0 0 0 1 1 1 1
## 23 0 0 0 0 0 1 1 1
## 28 0 0 1 0 0 1 1 1
## 49 0 0 0 0 0 0 1 1
## bgr bu sc sod pot hemo pcv wbcc rbcc htn.no dm.no cad.no appet.poor pe.no
## 4 117 56 3.8 111 2.5 11.2 32 6700 3.9 0 1 1 1 0
## 15 157 90 4.1 130 6.4 5.6 16 11000 2.6 0 0 0 1 0
## 21 173 148 3.9 135 5.2 7.7 24 9200 3.2 0 0 0 1 0
## 23 95 163 7.7 136 3.8 9.8 32 6900 3.4 0 1 1 0 1
## 28 264 87 2.7 130 4.0 12.5 37 9600 4.1 0 0 0 0 0
## 49 70 32 0.9 125 4.0 10.0 29 18900 3.5 0 0 1 0 0
## ane.no
## 4 0
## 15 1
## 21 0
## 23 0
## 28 1
## 49 1
##
## $test_features
## age bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5 su.1
## 10 53 90 0 0 1 0 0 1 0 0 0 0
## 12 63 70 1 0 0 0 0 0 1 0 0 0
## 75 56 90 0 1 0 0 0 1 0 0 0 0
## 85 59 70 1 0 0 0 0 0 1 0 0 0
## 91 63 100 1 0 0 0 0 1 0 0 0 0
## 93 71 70 1 0 0 0 0 0 1 0 0 0
## su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pcc.notpresent ba.notpresent
## 10 0 0 0 0 1 1 0 1
## 12 0 0 0 0 1 1 0 1
## 75 0 0 0 0 1 1 1 1
## 85 0 0 0 0 0 1 1 1
## 91 1 0 0 0 0 0 1 0
## 93 0 0 0 0 0 1 0 0
## bgr bu sc sod pot hemo pcv wbcc rbcc htn.no dm.no cad.no appet.poor
## 10 70 107 7.2 114 3.7 9.5 29 12100 3.7 0 0 1 1
## 12 380 60 2.7 131 4.2 10.8 32 4500 3.8 0 0 1 1
## 75 129 107 6.7 131 4.8 9.1 29 6400 3.4 0 1 1 0
## 85 76 186 15.0 135 7.6 7.1 22 3800 2.1 0 1 1 1
## 91 280 35 3.2 143 3.5 13.0 40 9800 4.2 0 1 0 0
## 93 219 82 3.6 133 4.4 10.4 33 5600 3.6 0 0 0 0
## pe.no ane.no
## 10 1 0
## 12 0 1
## 75 1 1
## 85 0 0
## 91 1 1
## 93 1 1
##
## $training_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
##
## $test_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
lapply(split_data[1:2],dim)
## $training_features
## [1] 95 35
##
## $test_features
## [1] 63 35
processed_data <- remove_nonvaring_collinear_features(split_data$training_features,split_data$test_features,0.60)#0.75
final_training_set <- processed_data$processed_training_set
final_test_set <- processed_data$processed_test_set
training_output <- split_data$training_outcome
test_output <- split_data$test_outcome
Detail masing masing variable thd outcome/variable respons
Tcontrol <- trainControl(method="cv", number=5, savePredictions = T,classProbs = TRUE)
Tcontrol <- trainControl(method="cv", number=5, savePredictions = T,classProbs = TRUE)
set.seed(101)
Model_lm <- train(y=training_output,
x=final_training_set,
trControl=Tcontrol,
method = "svmLinear",
preProcess = c("center", "scale","pca"))
Tcontrol <- trainControl(method="cv", number=5, savePredictions = T,classProbs = TRUE)
set.seed(202)
Model_rf <- train(y=training_output,
x=final_training_set,
trControl=Tcontrol,
method = "svmRadial",
tuneLength=10,
preProcess = c("center", "scale","pca"))
Pred_Model_lm <- predict(Model_lm, newdata=final_test_set)
confusionMatrix(data=Pred_Model_lm, reference=test_output)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ckd notckd
## ckd 16 0
## notckd 1 46
##
## Accuracy : 0.9841
## 95% CI : (0.9147, 0.9996)
## No Information Rate : 0.7302
## P-Value [Acc > NIR] : 6.034e-08
##
## Kappa : 0.959
##
## Mcnemar's Test P-Value : 1
##
## Sensitivity : 0.9412
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 0.9787
## Prevalence : 0.2698
## Detection Rate : 0.2540
## Detection Prevalence : 0.2540
## Balanced Accuracy : 0.9706
##
## 'Positive' Class : ckd
##
Pred_Model_rf <- predict(Model_rf, newdata=final_test_set)
confusionMatrix(data=Pred_Model_rf, reference=test_output)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ckd notckd
## ckd 17 0
## notckd 0 46
##
## Accuracy : 1
## 95% CI : (0.9431, 1)
## No Information Rate : 0.7302
## P-Value [Acc > NIR] : 2.485e-09
##
## Kappa : 1
##
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.2698
## Detection Rate : 0.2698
## Detection Prevalence : 0.2698
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : ckd
##
Mengambarkan rangking variable important