Machine learning classification example: several machine learning algorithms are used to classify patients as having Chronic Kidney Disease or not, based on 24 features. Dataset: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease
Load Required Packages
suppressWarnings (library(RWeka))
suppressWarnings (library(caret))
suppressWarnings (library(ROCR))
# libraries for partition trees
suppressWarnings (library(rpart))
suppressWarnings (library(rpart.plot))
suppressWarnings (library(rattle))
Data Exploration
# load data
# Directory that holds the ARFF file; the trailing "/" on the last component
# lets the file name be appended to it directly.
file_location <- file.path(
  "/Users", "shruti", "Dropbox", "SHRUTIM", "Rscripts",
  "MachineLearning", "KidneyDiseasePrediction/"
)
Chronic_Kidney_Disease <- read.arff(
  paste0(file_location, "chronic_kidney_disease.arff")
)
# data munging
dim(Chronic_Kidney_Disease)
## [1] 400 25
Chronic_Kidney_Disease[1:2,]
## age bp sg al su rbc pc pcc ba bgr bu sc sod pot
## 1 48 80 1.020 1 0 <NA> normal notpresent notpresent 121 36 1.2 NA NA
## 2 7 50 1.020 4 0 <NA> normal notpresent notpresent NA 18 0.8 NA NA
## hemo pcv wbcc rbcc htn dm cad appet pe ane class
## 1 15.4 44 7800 5.2 yes yes no good no no ckd
## 2 11.3 38 6000 NA no no no good no no ckd
summary(Chronic_Kidney_Disease)
## age bp sg al su
## Min. : 2.00 Min. : 50.00 1.005: 7 0 :199 0 :290
## 1st Qu.:42.00 1st Qu.: 70.00 1.010: 84 1 : 44 1 : 13
## Median :55.00 Median : 80.00 1.015: 75 2 : 43 2 : 18
## Mean :51.48 Mean : 76.47 1.020:106 3 : 43 3 : 14
## 3rd Qu.:64.50 3rd Qu.: 80.00 1.025: 81 4 : 24 4 : 13
## Max. :90.00 Max. :180.00 NA's : 47 5 : 1 5 : 3
## NA's :9 NA's :12 NA's: 46 NA's: 49
## rbc pc pcc ba
## normal :201 normal :259 present : 42 present : 22
## abnormal: 47 abnormal: 76 notpresent:354 notpresent:374
## NA's :152 NA's : 65 NA's : 4 NA's : 4
##
##
##
##
## bgr bu sc sod
## Min. : 22 Min. : 1.50 Min. : 0.400 Min. : 4.5
## 1st Qu.: 99 1st Qu.: 27.00 1st Qu.: 0.900 1st Qu.:135.0
## Median :121 Median : 42.00 Median : 1.300 Median :138.0
## Mean :148 Mean : 57.43 Mean : 3.072 Mean :137.5
## 3rd Qu.:163 3rd Qu.: 66.00 3rd Qu.: 2.800 3rd Qu.:142.0
## Max. :490 Max. :391.00 Max. :76.000 Max. :163.0
## NA's :44 NA's :19 NA's :17 NA's :87
## pot hemo pcv wbcc
## Min. : 2.500 Min. : 3.10 Min. : 9.00 Min. : 2200
## 1st Qu.: 3.800 1st Qu.:10.30 1st Qu.:32.00 1st Qu.: 6500
## Median : 4.400 Median :12.65 Median :40.00 Median : 8000
## Mean : 4.627 Mean :12.53 Mean :38.88 Mean : 8406
## 3rd Qu.: 4.900 3rd Qu.:15.00 3rd Qu.:45.00 3rd Qu.: 9800
## Max. :47.000 Max. :17.80 Max. :54.00 Max. :26400
## NA's :88 NA's :52 NA's :71 NA's :106
## rbcc htn dm cad appet pe
## Min. :2.100 yes :147 yes :137 yes : 34 good:317 yes : 76
## 1st Qu.:3.900 no :251 no :261 no :364 poor: 82 no :323
## Median :4.800 NA's: 2 NA's: 2 NA's: 2 NA's: 1 NA's: 1
## Mean :4.707
## 3rd Qu.:5.400
## Max. :8.000
## NA's :131
## ane class
## yes : 60 ckd :250
## no :339 notckd:150
## NA's: 1
##
##
##
##
# % data missing in each column
# Walk the columns directly (a data frame is a list of columns) so nothing is
# coerced to a matrix; the result is one named percentage per column.
vapply(
  Chronic_Kidney_Disease,
  function(column) (sum(is.na(column)) / nrow(Chronic_Kidney_Disease)) * 100,
  numeric(1)
)
## age bp sg al su rbc pc pcc ba bgr bu sc
## 2.25 3.00 11.75 11.50 12.25 38.00 16.25 1.00 1.00 11.00 4.75 4.25
## sod pot hemo pcv wbcc rbcc htn dm cad appet pe ane
## 21.75 22.00 13.00 17.75 26.50 32.75 0.50 0.50 0.50 0.25 0.25 0.25
## class
## 0.00
# samples with no missing data
sum(complete.cases(Chronic_Kidney_Disease))
## [1] 158
# samples with no missing data after removing columns which have more than 25% of data missing
sum(complete.cases(Chronic_Kidney_Disease[,-c(6,17,18)]))
## [1] 209
### remove rows with any missing value
Chronic_Kidney_Disease2 <- Chronic_Kidney_Disease[complete.cases(Chronic_Kidney_Disease),]
dim(Chronic_Kidney_Disease2)
## [1] 158 25
# Level counts for the categorical columns (3:9 and 19:25) among complete cases.
# NOTE(review): apply() coerces the data frame to a character matrix first, so
# table() sees strings rather than factors -- levels with no remaining rows are
# omitted and categories are listed in sorted order, not factor-level order.
apply(Chronic_Kidney_Disease2[,c(3:9,19:25)],2,table)
## $sg
##
## 1.005 1.010 1.015 1.020 1.025
## 3 23 10 61 61
##
## $al
##
## 0 1 2 3 4
## 116 3 9 15 15
##
## $su
##
## 0 1 2 3 4 5
## 140 6 6 3 2 1
##
## $rbc
##
## abnormal normal
## 18 140
##
## $pc
##
## abnormal normal
## 29 129
##
## $pcc
##
## notpresent present
## 144 14
##
## $ba
##
## notpresent present
## 146 12
##
## $htn
##
## no yes
## 124 34
##
## $dm
##
## no yes
## 130 28
##
## $cad
##
## no yes
## 147 11
##
## $appet
##
## good poor
## 139 19
##
## $pe
##
## no yes
## 138 20
##
## $ane
##
## no yes
## 142 16
##
## $class
##
## ckd notckd
## 43 115
Function to convert factor variables into dummy variables
# Expand the factor columns of `dataset` into 0/1 indicator (dummy) columns.
#
# Args:
#   dataset: data frame of predictors (numeric and/or factor columns).
# Returns:
#   A data frame built from caret's dummy-variable encoding of `dataset`.
#   `fullRank = TRUE` requests the full-rank parameterisation, i.e. one level
#   per factor is left out so the indicators are not linearly dependent.
creat_dummy_var_data <- function(dataset) {
  # Spell out TRUE: `T` is an ordinary variable and can be reassigned.
  dummy_variables <- dummyVars(~ ., data = dataset, fullRank = TRUE)
  data.frame(predict(dummy_variables, newdata = dataset))
}
Function to Split data into Training and Test
# Split features and outcome into training and test partitions.
#
# Args:
#   features_dataset: data frame of predictors, one row per sample.
#   outcome_data: outcome vector aligned row-for-row with `features_dataset`;
#     it is handed to caret::createDataPartition() to choose the training rows.
#   training_test_ratio: fraction of samples assigned to the training set.
# Returns:
#   A list with `training_features`, `test_features`, `training_outcome`
#   and `test_outcome`.
create_training_test <- function(features_dataset, outcome_data, training_test_ratio) {
  training_index <- as.vector(
    createDataPartition(outcome_data, p = training_test_ratio, list = FALSE)
  )
  # drop = FALSE keeps a one-column feature set as a data frame; without it the
  # subset collapses to a bare vector, for which droplevels() has no method.
  training_set <- droplevels(features_dataset[training_index, , drop = FALSE])
  test_set <- droplevels(features_dataset[-training_index, , drop = FALSE])
  # Re-factor the outcomes so levels absent from a partition are dropped.
  outcome_training_set <- factor(outcome_data[training_index])
  outcome_test_set <- factor(outcome_data[-training_index])
  list(
    training_features = training_set,
    test_features = test_set,
    training_outcome = outcome_training_set,
    test_outcome = outcome_test_set
  )
}
Function for Data Pre-processing
# Drop uninformative and redundant predictors, deciding on the training set only.
#
# Two filters are applied in sequence, and every column removed from the
# training data is removed from the test data as well:
#   1. columns flagged by caret::nearZeroVar() (little or no variability);
#   2. columns flagged by caret::findCorrelation() at cutoff `corr_theshold`.
#
# Args:
#   training_data: data frame of numeric predictors (cor() is applied to it).
#   test_data: data frame of the same predictors for the test samples.
#   corr_theshold: pairwise-correlation cutoff passed to findCorrelation();
#     defaults to 0.75.
# Returns:
#   list(processed_training_set = ..., processed_test_set = ...).
remove_nonvaring_collinear_features <- function(training_data, test_data, corr_theshold = 0.75) {
  # Columns are dropped by exact name. A "^name$" regex would treat the dots in
  # dummy-variable names as wildcards, and drop = FALSE keeps the result a
  # data frame even when only one column survives.
  drop_columns <- function(data, unwanted) {
    data[, !(colnames(data) %in% unwanted), drop = FALSE]
  }

  # 1. Near-zero-variance predictors.
  near_zero_covariates <- colnames(training_data)[nearZeroVar(training_data)]
  training_data_nzc <- drop_columns(training_data, near_zero_covariates)
  test_data_nzc <- drop_columns(test_data, near_zero_covariates)

  # 2. Highly correlated predictors; a pairwise filter needs at least two
  #    columns, so it is skipped otherwise.
  high_correlation <- character(0)
  if (ncol(training_data_nzc) > 1) {
    feature_correlation <- cor(training_data_nzc)
    # names = TRUE makes findCorrelation() return column names, not indices.
    high_correlation <- findCorrelation(
      feature_correlation, corr_theshold, verbose = FALSE, names = TRUE
    )
  }
  final_training_data <- drop_columns(training_data_nzc, high_correlation)
  final_test_data <- drop_columns(test_data_nzc, high_correlation)

  list(
    processed_training_set = final_training_data,
    processed_test_set = final_test_data
  )
}
Execution
#sapply(Chronic_Kidney_Disease2[1,],class)
# Many of the features are categorical, so convert them into dummy variables
# (every column except the outcome).
# The outcome is located by exact name: a pattern match on "class" would also
# select any other column whose name merely contains that string.
outcome_column_id <- which(colnames(Chronic_Kidney_Disease2) == "class")
# A negative index of length zero would select no columns at all, so fail early.
stopifnot(length(outcome_column_id) == 1)
dataset_dummy_variables <- creat_dummy_var_data(Chronic_Kidney_Disease2[, -outcome_column_id])
# split data
# IMPORTANT NOTE: the class of column used for creating DataPartion is very important. Same variable can give different training_index depending on whether it is numeric or factor.
set.seed(123)
split_data <- create_training_test(dataset_dummy_variables,Chronic_Kidney_Disease2$class,0.6)
lapply(split_data,head)
## $training_features
## age bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5
## 10 53 90 0 0 1 0 0 1 0 0 0
## 12 63 70 1 0 0 0 0 0 1 0 0
## 28 69 70 1 0 0 0 0 0 1 0 0
## 59 73 80 0 0 1 0 0 1 0 0 0
## 85 59 70 1 0 0 0 0 0 1 0 0
## 91 63 100 1 0 0 0 0 1 0 0 0
## su.1 su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pc.cnotpresent
## 10 0 0 0 0 0 1 1 0
## 12 0 0 0 0 0 1 1 0
## 28 0 0 0 1 0 0 1 1
## 59 0 0 0 0 0 1 1 1
## 85 0 0 0 0 0 0 1 1
## 91 0 1 0 0 0 0 0 1
## ba.notpresent bgr bu sc sod pot hemo pc.v wbcc rbc.c htn.no dm.no
## 10 1 70 107 7.2 114 3.7 9.5 29 12100 3.7 0 0
## 12 1 380 60 2.7 131 4.2 10.8 32 4500 3.8 0 0
## 28 1 264 87 2.7 130 4.0 12.5 37 9600 4.1 0 0
## 59 1 253 142 4.6 138 5.8 10.5 33 7200 4.3 0 0
## 85 1 76 186 15.0 135 7.6 7.1 22 3800 2.1 0 1
## 91 0 280 35 3.2 143 3.5 13.0 40 9800 4.2 0 1
## cad.no appet.poor pe.no ane.no
## 10 1 1 1 0
## 12 1 1 0 1
## 28 0 0 0 1
## 59 0 0 1 1
## 85 1 1 0 0
## 91 0 0 1 1
##
## $test_features
## age bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5
## 4 48 70 0 0 0 0 0 0 0 1 0
## 15 68 80 1 0 0 0 0 0 1 0 0
## 21 61 80 0 1 0 0 0 1 0 0 0
## 23 48 80 0 0 0 1 0 0 0 1 0
## 49 73 70 0 0 0 0 0 0 0 0 0
## 72 46 60 1 0 0 0 1 0 0 0 0
## su.1 su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pc.cnotpresent
## 4 0 0 0 0 0 0 1 0
## 15 0 1 0 0 0 0 1 0
## 21 0 0 0 0 0 1 1 1
## 23 0 0 0 0 0 0 1 1
## 49 0 0 0 0 0 0 0 1
## 72 0 0 0 0 0 0 0 1
## ba.notpresent bgr bu sc sod pot hemo pc.v wbcc rbc.c htn.no dm.no
## 4 1 117 56 3.8 111 2.5 11.2 32 6700 3.9 0 1
## 15 0 157 90 4.1 130 6.4 5.6 16 11000 2.6 0 0
## 21 1 173 148 3.9 135 5.2 7.7 24 9200 3.2 0 0
## 23 1 95 163 7.7 136 3.8 9.8 32 6900 3.4 0 1
## 49 1 70 32 0.9 125 4.0 10.0 29 18900 3.5 0 0
## 72 1 163 92 3.3 141 4.0 9.8 28 14600 3.2 0 0
## cad.no appet.poor pe.no ane.no
## 4 1 1 0 0
## 15 0 1 0 1
## 21 0 1 0 0
## 23 1 0 1 0
## 49 1 0 0 1
## 72 1 0 1 1
##
## $training_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
##
## $test_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
lapply(split_data[1:2],dim)
## $training_features
## [1] 95 35
##
## $test_features
## [1] 63 35
# data preprocessing
processed_data <- remove_nonvaring_collinear_features(split_data$training_features,split_data$test_features,0.75)
#lapply(processed_data,dim)
final_training_set <- processed_data$processed_training_set
final_test_set <- processed_data$processed_test_set
#dim(final_training_set); dim(final_test_set)
training_output <- split_data$training_outcome
test_output <- split_data$test_outcome
#length(training_output); length(test_output)
Exploratory Graphs
# PCA on the processed training features, centred and scaled.
# `scale.` is the argument's full name in prcomp(); `scale = ` only reaches it
# through partial argument matching.
pc <- prcomp(final_training_set, center = TRUE, scale. = TRUE)
# Line plot of the variance captured by each principal component.
plot(pc, type = "l", lab = c(10, 10, 12))
#pc$rotation[order(-abs(pc$rotation[,"PC1"])),]
# One plot per feature, split by outcome class, laid out on a 2 x 2 grid.
# par() returns the previous settings, so both xpd and mfrow can be restored.
old_par <- par(xpd = TRUE, mfrow = c(2, 2))
for (i in seq_len(ncol(final_training_set))) {
  plot(
    training_output, final_training_set[, i],
    main = colnames(final_training_set)[i], col = 2:3
  )
}
par(old_par)
Model Building
# k-fold cross validation
train_control <- trainControl(method="cv", number=5, savePredictions = T,classProbs = TRUE)
# svm - linear
set.seed(1)
svm_lm_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "svmLinear",preProcess = c("center", "scale","pca"))
## Loading required package: kernlab
## Warning: package 'kernlab' was built under R version 3.2.4
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "kfunction"; definition not
## updated
##
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
##
## alpha
# svm - rbf kernel
# tuneLength = 5 asks caret to evaluate five candidate tuning-parameter
# settings during cross-validation (the argument name must be spelled exactly,
# otherwise train() does not treat it as the tuning request).
set.seed(1)
svm_rbf_model <- train(
  y = training_output, x = final_training_set, trControl = train_control,
  method = "svmRadial", tuneLength = 5,
  preProcess = c("center", "scale", "pca")
)
# Ridge Regression creates a linear regression model that is penalized with the L2-norm which is the sum of the squared coefficients.
set.seed(1)
ridge_regression_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=0,lambda=0.001),preProcess = c("center", "scale","pca"))
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 3.2.4
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.2.4
## Loading required package: foreach
## Loaded glmnet 2.0-5
# LASSO (Least Absolute Shrinkage and Selection Operator) creates a regression model that is penalized with the L1-norm which is the sum of the absolute coefficients.
set.seed(1)
lasso_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=1,lambda=0.001),preProcess = c("center", "scale","pca"))
# Elastic Net creates a regression model that is penalized with both the L1-norm and L2-norm.
set.seed(1)
elastic_net_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=0.5,lambda=0.001),preProcess = c("center", "scale","pca"))
# classification Trees
set.seed(1)
rpart_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "rpart",preProcess = c("center", "scale","pca"))
#plot(rpart_model)
# random forest
set.seed(1)
rf_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "rf",prox=T,preProcess = c("center", "scale"))
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
##
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
##
## margin
plot(rf_model)
# boosting with trees
set.seed(1)
gbm_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "gbm", verbose=F,preProcess = c("center", "scale","pca"))
## Loading required package: gbm
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
#plot(gbm_model)
# linear discriminant analysis
set.seed(1)
lda_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "lda",preProcess = c("center", "scale","pca"))
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.2.2
# collect resamples
training_models <- list(SVM_LM=svm_lm_model,SVM_RBF=svm_rbf_model,RPART=rpart_model,GBM=gbm_model,RF=rf_model,LDA=lda_model,RIDGE=ridge_regression_model,LASSO=lasso_model,ELASTIC=elastic_net_model)
train_results <- resamples(training_models)
# summarize the distributions
summary(train_results)
##
## Call:
## summary.resamples(object = train_results)
##
## Models: SVM_LM, SVM_RBF, RPART, GBM, RF, LDA, RIDGE, LASSO, ELASTIC
## Number of resamples: 5
##
## Accuracy
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_LM 0.9474 1.0000 1.0000 0.9895 1 1 0
## SVM_RBF 1.0000 1.0000 1.0000 1.0000 1 1 0
## RPART 0.9474 1.0000 1.0000 0.9895 1 1 0
## GBM 0.9474 1.0000 1.0000 0.9895 1 1 0
## RF 0.9474 1.0000 1.0000 0.9895 1 1 0
## LDA 0.9444 0.9474 0.9474 0.9678 1 1 0
## RIDGE 0.8947 0.9444 0.9474 0.9573 1 1 0
## LASSO 0.9444 0.9474 1.0000 0.9784 1 1 0
## ELASTIC 0.9474 0.9474 1.0000 0.9789 1 1 0
##
## Kappa
## Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
## SVM_LM 0.8550 1.0000 1.000 0.9710 1 1 0
## SVM_RBF 1.0000 1.0000 1.000 1.0000 1 1 0
## RPART 0.8550 1.0000 1.000 0.9710 1 1 0
## GBM 0.8550 1.0000 1.000 0.9710 1 1 0
## RF 0.8550 1.0000 1.000 0.9710 1 1 0
## LDA 0.8525 0.8550 0.855 0.9125 1 1 0
## RIDGE 0.6885 0.8525 0.855 0.8792 1 1 0
## LASSO 0.8525 0.8550 1.000 0.9415 1 1 0
## ELASTIC 0.8550 0.8550 1.000 0.9420 1 1 0
# boxplots of results
bwplot(train_results)
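# The resampling results above suggest that the SVM with RBF kernel performs best on the training data (mean accuracy 1.0), with linear SVM, rpart, GBM and random forest tied just behind it (mean accuracy 0.9895).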
EVALUATE MODEL ACCURACY ON TEST SET
# Ideally, you select the model that performs best on the training data and evaluate only that model on the test set. All models are evaluated here purely for illustration.
test_pred_svm_lm <- predict(svm_lm_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_svm_lm, reference=test_output)
test_pred_svm_rbf <- predict(svm_rbf_model, newdata=final_test_set)
confusionMatrix(data=test_pred_svm_rbf, reference=test_output)
## Confusion Matrix and Statistics
##
## Reference
## Prediction ckd notckd
## ckd 17 0
## notckd 0 46
##
## Accuracy : 1
## 95% CI : (0.9431, 1)
## No Information Rate : 0.7302
## P-Value [Acc > NIR] : 2.485e-09
##
## Kappa : 1
## Mcnemar's Test P-Value : NA
##
## Sensitivity : 1.0000
## Specificity : 1.0000
## Pos Pred Value : 1.0000
## Neg Pred Value : 1.0000
## Prevalence : 0.2698
## Detection Rate : 0.2698
## Detection Prevalence : 0.2698
## Balanced Accuracy : 1.0000
##
## 'Positive' Class : ckd
##
test_pred_rpart <- predict(rpart_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_rpart, test_output)
test_pred_gbm <- predict(gbm_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_gbm, test_output)
test_pred_rf <- predict(rf_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_rf, test_output)
test_pred_lda <- predict(lda_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_lda, test_output)
test_pred_ridge <- predict(ridge_regression_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_ridge, test_output)
test_pred_lasso <- predict(lasso_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_lasso, test_output)
test_pred_elastic_net <- predict(elastic_net_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_elastic_net, test_output)
# Balanced accuracy of a fitted model on held-out data.
#
# Args:
#   trained_model: fitted model object accepted by predict().
#   test_features: predictors to score; defaults to the global `final_test_set`.
#   test_outcomes: reference labels; defaults to the global `test_output`.
# Returns:
#   The "Balanced Accuracy" entry of the confusion matrix's `byClass` statistics.
balanced_accuracy <- function(trained_model, test_features=final_test_set, test_outcomes=test_output){
  predicted_labels <- predict(trained_model, test_features)
  confusionMatrix(data = predicted_labels, test_outcomes)$byClass[["Balanced Accuracy"]]
}
lapply(training_models, balanced_accuracy)
## $SVM_LM
## [1] 0.9411765
##
## $SVM_RBF
## [1] 1
##
## $RPART
## [1] 1
##
## $GBM
## [1] 1
##
## $RF
## [1] 1
##
## $LDA
## [1] 0.8823529
##
## $RIDGE
## [1] 0.8823529
##
## $LASSO
## [1] 0.8823529
##
## $ELASTIC
## [1] 0.8823529
ROC CURVES
# Draw a ROC curve (true-positive rate against false-positive rate) with ROCR.
#
# Args:
#   test_predictions: predictions for the test samples; coerced with as.numeric().
#   colour: line colour passed to plot().
#   test_labels: true labels; defaults to the global `test_output`.
# NOTE(review): when the inputs are factors of predicted classes, as.numeric()
# yields the integer level codes, so the curve is built from hard labels rather
# than class probabilities -- confirm this is intended.
roc_curve <- function(test_predictions,colour=1,test_labels=test_output){
  rocr_input <- prediction(as.numeric(test_predictions), as.numeric(test_labels))
  tpr_vs_fpr <- performance(rocr_input, measure = "tpr", x.measure = "fpr")
  plot(tpr_vs_fpr, col = colour)
}
# Overlay the test-set ROC curves of five models on one set of axes.
# The list names double as legend labels; the GBM entry is labelled
# "gradient boosting".
roc_predictions <- list(
  "svm radial kernel" = test_pred_svm_rbf,
  "lda" = test_pred_lda,
  "random forest" = test_pred_rf,
  "elastic_net" = test_pred_elastic_net,
  "gradient boosting" = test_pred_gbm
)
for (model_id in seq_along(roc_predictions)) {
  # par(new = TRUE) makes the next plot draw over the existing one instead of
  # starting a fresh page; it is not needed before the first curve.
  if (model_id > 1) {
    par(new = TRUE)
  }
  roc_curve(roc_predictions[[model_id]], colour = model_id)
}
par(new = FALSE)
legend(
  "bottomright", names(roc_predictions),
  col = seq_along(roc_predictions), cex = 0.8, lty = 1
)
title(main = "ROC curves for test data")
Feature Importance
# Variable-importance plot for the RBF-kernel SVM model.
plot(varImp(svm_rbf_model))
# Row positions of each outcome class within the training set.
ckd_training_set <- which(training_output=="ckd")
nonckd_training_set <- which(training_output=="notckd")
# Hand-picked feature names -- presumably the top entries of the importance
# plot above; verify against the plot.
imp_features_svm <- (c("rbc.c","bu","sod","bgr","age"))
# Two-sample t-test per feature, comparing the ckd and notckd training samples.
# sapply() simplifies the test results into a matrix with one column per feature.
sapply(imp_features_svm,function(i) {
t.test(final_training_set[ckd_training_set,i],final_training_set[nonckd_training_set,i])
})
## rbc.c
## statistic -6.330299
## parameter 30.15484
## p.value 5.434398e-07
## conf.int Numeric,2
## estimate Numeric,2
## null.value 0
## alternative "two.sided"
## method "Welch Two Sample t-test"
## data.name "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
## bu
## statistic 5.7228
## parameter 25.54879
## p.value 5.378718e-06
## conf.int Numeric,2
## estimate Numeric,2
## null.value 0
## alternative "two.sided"
## method "Welch Two Sample t-test"
## data.name "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
## sod
## statistic -6.31163
## parameter 33.19514
## p.value 3.770082e-07
## conf.int Numeric,2
## estimate Numeric,2
## null.value 0
## alternative "two.sided"
## method "Welch Two Sample t-test"
## data.name "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
## bgr
## statistic 5.864056
## parameter 25.61372
## p.value 3.704955e-06
## conf.int Numeric,2
## estimate Numeric,2
## null.value 0
## alternative "two.sided"
## method "Welch Two Sample t-test"
## data.name "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
## age
## statistic 5.000511
## parameter 85.06871
## p.value 3.032596e-06
## conf.int Numeric,2
## estimate Numeric,2
## null.value 0
## alternative "two.sided"
## method "Welch Two Sample t-test"
## data.name "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"