A machine learning classification example: several algorithms are used to classify patients as having Chronic Kidney Disease (CKD) or not, based on 24 clinical features. Dataset: https://archive.ics.uci.edu/ml/datasets/Chronic_Kidney_Disease

Load Required Packages

suppressWarnings(library(RWeka))       # read.arff, to load the ARFF data file
suppressWarnings(library(caret))       # data splitting, preprocessing, model training
suppressWarnings(library(ROCR))        # ROC curves
# libraries for partition trees
suppressWarnings(library(rpart))
suppressWarnings(library(rpart.plot))
suppressWarnings(library(rattle))

Data Exploration

# load data
file_location <- file.path("/Users","shruti","Dropbox","SHRUTIM","Rscripts","MachineLearning","KidneyDiseasePrediction")
Chronic_Kidney_Disease <- read.arff(file.path(file_location,"chronic_kidney_disease.arff"))

# data munging
dim(Chronic_Kidney_Disease)
## [1] 400  25
Chronic_Kidney_Disease[1:2,]
##   age bp    sg al su  rbc     pc        pcc         ba bgr bu  sc sod pot
## 1  48 80 1.020  1  0 <NA> normal notpresent notpresent 121 36 1.2  NA  NA
## 2   7 50 1.020  4  0 <NA> normal notpresent notpresent  NA 18 0.8  NA  NA
##   hemo pcv wbcc rbcc htn  dm cad appet pe ane class
## 1 15.4  44 7800  5.2 yes yes  no  good no  no   ckd
## 2 11.3  38 6000   NA  no  no  no  good no  no   ckd
summary(Chronic_Kidney_Disease)
##       age              bp             sg         al         su     
##  Min.   : 2.00   Min.   : 50.00   1.005:  7   0   :199   0   :290  
##  1st Qu.:42.00   1st Qu.: 70.00   1.010: 84   1   : 44   1   : 13  
##  Median :55.00   Median : 80.00   1.015: 75   2   : 43   2   : 18  
##  Mean   :51.48   Mean   : 76.47   1.020:106   3   : 43   3   : 14  
##  3rd Qu.:64.50   3rd Qu.: 80.00   1.025: 81   4   : 24   4   : 13  
##  Max.   :90.00   Max.   :180.00   NA's : 47   5   :  1   5   :  3  
##  NA's   :9       NA's   :12                   NA's: 46   NA's: 49  
##        rbc             pc              pcc               ba     
##  normal  :201   normal  :259   present   : 42   present   : 22  
##  abnormal: 47   abnormal: 76   notpresent:354   notpresent:374  
##  NA's    :152   NA's    : 65   NA's      :  4   NA's      :  4  
##                                                                 
##                                                                 
##                                                                 
##                                                                 
##       bgr            bu               sc              sod       
##  Min.   : 22   Min.   :  1.50   Min.   : 0.400   Min.   :  4.5  
##  1st Qu.: 99   1st Qu.: 27.00   1st Qu.: 0.900   1st Qu.:135.0  
##  Median :121   Median : 42.00   Median : 1.300   Median :138.0  
##  Mean   :148   Mean   : 57.43   Mean   : 3.072   Mean   :137.5  
##  3rd Qu.:163   3rd Qu.: 66.00   3rd Qu.: 2.800   3rd Qu.:142.0  
##  Max.   :490   Max.   :391.00   Max.   :76.000   Max.   :163.0  
##  NA's   :44    NA's   :19       NA's   :17       NA's   :87     
##       pot              hemo            pcv             wbcc      
##  Min.   : 2.500   Min.   : 3.10   Min.   : 9.00   Min.   : 2200  
##  1st Qu.: 3.800   1st Qu.:10.30   1st Qu.:32.00   1st Qu.: 6500  
##  Median : 4.400   Median :12.65   Median :40.00   Median : 8000  
##  Mean   : 4.627   Mean   :12.53   Mean   :38.88   Mean   : 8406  
##  3rd Qu.: 4.900   3rd Qu.:15.00   3rd Qu.:45.00   3rd Qu.: 9800  
##  Max.   :47.000   Max.   :17.80   Max.   :54.00   Max.   :26400  
##  NA's   :88       NA's   :52      NA's   :71      NA's   :106    
##       rbcc         htn         dm        cad       appet        pe     
##  Min.   :2.100   yes :147   yes :137   yes : 34   good:317   yes : 76  
##  1st Qu.:3.900   no  :251   no  :261   no  :364   poor: 82   no  :323  
##  Median :4.800   NA's:  2   NA's:  2   NA's:  2   NA's:  1   NA's:  1  
##  Mean   :4.707                                                         
##  3rd Qu.:5.400                                                         
##  Max.   :8.000                                                         
##  NA's   :131                                                           
##    ane         class    
##  yes : 60   ckd   :250  
##  no  :339   notckd:150  
##  NA's:  1               
##                         
##                         
##                         
## 
# % data missing in each column
apply(Chronic_Kidney_Disease,2,function(i) {(sum(is.na(i))/nrow(Chronic_Kidney_Disease))*100})
##   age    bp    sg    al    su   rbc    pc   pcc    ba   bgr    bu    sc 
##  2.25  3.00 11.75 11.50 12.25 38.00 16.25  1.00  1.00 11.00  4.75  4.25 
##   sod   pot  hemo   pcv  wbcc  rbcc   htn    dm   cad appet    pe   ane 
## 21.75 22.00 13.00 17.75 26.50 32.75  0.50  0.50  0.50  0.25  0.25  0.25 
## class 
##  0.00
# samples with no missing data
sum(complete.cases(Chronic_Kidney_Disease))
## [1] 158
# samples with no missing data after removing columns which have more than 25% of data missing
sum(complete.cases(Chronic_Kidney_Disease[,-c(6,17,18)]))
## [1] 209
# remove rows with any missing value
Chronic_Kidney_Disease2 <- Chronic_Kidney_Disease[complete.cases(Chronic_Kidney_Disease),]
dim(Chronic_Kidney_Disease2)
## [1] 158  25
apply(Chronic_Kidney_Disease2[,c(3:9,19:25)],2,table)
## $sg
## 
## 1.005 1.010 1.015 1.020 1.025 
##     3    23    10    61    61 
## 
## $al
## 
##   0   1   2   3   4 
## 116   3   9  15  15 
## 
## $su
## 
##   0   1   2   3   4   5 
## 140   6   6   3   2   1 
## 
## $rbc
## 
## abnormal   normal 
##       18      140 
## 
## $pc
## 
## abnormal   normal 
##       29      129 
## 
## $pcc
## 
## notpresent    present 
##        144         14 
## 
## $ba
## 
## notpresent    present 
##        146         12 
## 
## $htn
## 
##  no yes 
## 124  34 
## 
## $dm
## 
##  no yes 
## 130  28 
## 
## $cad
## 
##  no yes 
## 147  11 
## 
## $appet
## 
## good poor 
##  139   19 
## 
## $pe
## 
##  no yes 
## 138  20 
## 
## $ane
## 
##  no yes 
## 142  16 
## 
## $class
## 
##    ckd notckd 
##     43    115

Function to Convert Factor Variables into Dummy Variables

# function to create dataset with dummy variables
# function to create a dataset with dummy variables
create_dummy_var_data <- function(dataset){
  # fullRank=T drops the reference level of each factor (k levels -> k-1 indicator columns)
  dummy_variables <- dummyVars(~., data=dataset, fullRank=T)
  dummy_var_data <- data.frame( predict(dummy_variables, newdata=dataset) )
  return(dummy_var_data)
}
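
A quick toy illustration (hypothetical data, not from the CKD set) of what this function produces: with fullRank=T, a 3-level factor becomes 2 indicator columns, dropping the reference level.

# hedged toy example (hypothetical data); "blue" is the reference level, so no color.blue column
toy <- data.frame(color=factor(c("red","green","blue")), x=1:3)
create_dummy_var_data(toy)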

Function to Split Data into Training and Test Sets

create_training_test <- function(features_dataset,outcome_data,training_test_ratio){
  training_index <- createDataPartition(outcome_data,p=training_test_ratio,list=F)
  
  training_set <- droplevels(features_dataset[training_index,])
  test_set <- droplevels(features_dataset[-training_index,])
  
  outcome_training_set <- factor(outcome_data[training_index])
  outcome_test_set <- factor(outcome_data[-training_index])
  
  return(list(training_features=training_set, test_features=test_set, training_outcome=outcome_training_set, test_outcome=outcome_test_set))
}

Function for Data Pre-processing

remove_nonvarying_collinear_features <- function(training_data,test_data,corr_threshold=0.75){
  # remove zero covariates (features with NO variability)
  #nearZeroVar(training_data,saveMetrics = T)
  near_zero_covariates <- colnames(training_data)[nearZeroVar(training_data)]
  
  if(length(near_zero_covariates)>0)
  {
    # find column indices of the near_zero_covariates
    nzc_indices_training <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data))})
    training_data_nzc <- training_data[,-nzc_indices_training]
    
    nzc_indices_test <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data))})  
    test_data_nzc <- test_data[,-nzc_indices_test]
  } else {
    training_data_nzc <- training_data
    test_data_nzc <- test_data
  }

  # CORRELATED features
  feature_correlation <- cor(training_data_nzc)
  # findCorrelation searches the correlation matrix and returns the columns (names, since names=T) to remove to reduce pair-wise correlations
  high_correlation <- findCorrelation(feature_correlation,corr_threshold,verbose=F,names=T)

  if(length(high_correlation)>0)
  {
    correlated_indices_training <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data_nzc))} )
    final_training_data <- training_data_nzc[,-correlated_indices_training]
    
    correlated_indices_test <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data_nzc))} )
    final_test_data <- test_data_nzc[,-correlated_indices_test]
  }else{
    final_training_data <- training_data_nzc
    final_test_data <- test_data_nzc
  }
  
  return(list(processed_training_set=final_training_data, processed_test_set=final_test_data))
}
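
A small illustration, on made-up data, of the two filters the function applies:

# hedged toy example (hypothetical data) of the two filters used above
set.seed(1)
toy <- data.frame(constant=rep(1,20), a=rnorm(20))
toy$b <- toy$a + rnorm(20, sd=0.01)   # nearly collinear with 'a'
colnames(toy)[nearZeroVar(toy)]                                   # "constant"
findCorrelation(cor(toy[,c("a","b")]), cutoff=0.75, names=TRUE)   # one of "a"/"b"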

Execution

#sapply(Chronic_Kidney_Disease2[1,],class)
# since many of the features are categorical, convert them into dummy variables, except for the outcome
outcome_column_id <- grep("class",colnames(Chronic_Kidney_Disease2))
dataset_dummy_variables <- create_dummy_var_data(Chronic_Kidney_Disease2[,-outcome_column_id])

# split data
# IMPORTANT NOTE: the class of the column passed to createDataPartition matters: the same variable can give a different training_index depending on whether it is numeric or a factor (see the sketch at the end of this section).
set.seed(123)
split_data <- create_training_test(dataset_dummy_variables,Chronic_Kidney_Disease2$class,0.6)
lapply(split_data,head)
## $training_features
##    age  bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5
## 10  53  90        0        0        1        0    0    1    0    0    0
## 12  63  70        1        0        0        0    0    0    1    0    0
## 28  69  70        1        0        0        0    0    0    1    0    0
## 59  73  80        0        0        1        0    0    1    0    0    0
## 85  59  70        1        0        0        0    0    0    1    0    0
## 91  63 100        1        0        0        0    0    1    0    0    0
##    su.1 su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pc.cnotpresent
## 10    0    0    0    0    0            1           1              0
## 12    0    0    0    0    0            1           1              0
## 28    0    0    0    1    0            0           1              1
## 59    0    0    0    0    0            1           1              1
## 85    0    0    0    0    0            0           1              1
## 91    0    1    0    0    0            0           0              1
##    ba.notpresent bgr  bu   sc sod pot hemo pc.v  wbcc rbc.c htn.no dm.no
## 10             1  70 107  7.2 114 3.7  9.5   29 12100   3.7      0     0
## 12             1 380  60  2.7 131 4.2 10.8   32  4500   3.8      0     0
## 28             1 264  87  2.7 130 4.0 12.5   37  9600   4.1      0     0
## 59             1 253 142  4.6 138 5.8 10.5   33  7200   4.3      0     0
## 85             1  76 186 15.0 135 7.6  7.1   22  3800   2.1      0     1
## 91             0 280  35  3.2 143 3.5 13.0   40  9800   4.2      0     1
##    cad.no appet.poor pe.no ane.no
## 10      1          1     1      0
## 12      1          1     0      1
## 28      0          0     0      1
## 59      0          0     1      1
## 85      1          1     0      0
## 91      0          0     1      1
## 
## $test_features
##    age bp sg.1.010 sg.1.015 sg.1.020 sg.1.025 al.1 al.2 al.3 al.4 al.5
## 4   48 70        0        0        0        0    0    0    0    1    0
## 15  68 80        1        0        0        0    0    0    1    0    0
## 21  61 80        0        1        0        0    0    1    0    0    0
## 23  48 80        0        0        0        1    0    0    0    1    0
## 49  73 70        0        0        0        0    0    0    0    0    0
## 72  46 60        1        0        0        0    1    0    0    0    0
##    su.1 su.2 su.3 su.4 su.5 rbc.abnormal pc.abnormal pc.cnotpresent
## 4     0    0    0    0    0            0           1              0
## 15    0    1    0    0    0            0           1              0
## 21    0    0    0    0    0            1           1              1
## 23    0    0    0    0    0            0           1              1
## 49    0    0    0    0    0            0           0              1
## 72    0    0    0    0    0            0           0              1
##    ba.notpresent bgr  bu  sc sod pot hemo pc.v  wbcc rbc.c htn.no dm.no
## 4              1 117  56 3.8 111 2.5 11.2   32  6700   3.9      0     1
## 15             0 157  90 4.1 130 6.4  5.6   16 11000   2.6      0     0
## 21             1 173 148 3.9 135 5.2  7.7   24  9200   3.2      0     0
## 23             1  95 163 7.7 136 3.8  9.8   32  6900   3.4      0     1
## 49             1  70  32 0.9 125 4.0 10.0   29 18900   3.5      0     0
## 72             1 163  92 3.3 141 4.0  9.8   28 14600   3.2      0     0
##    cad.no appet.poor pe.no ane.no
## 4       1          1     0      0
## 15      0          1     0      1
## 21      0          1     0      0
## 23      1          0     1      0
## 49      1          0     0      1
## 72      1          0     1      1
## 
## $training_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
## 
## $test_outcome
## [1] ckd ckd ckd ckd ckd ckd
## Levels: ckd notckd
lapply(split_data[1:2],dim)
## $training_features
## [1] 95 35
## 
## $test_features
## [1] 63 35
# data preprocessing
processed_data <- remove_nonvarying_collinear_features(split_data$training_features,split_data$test_features,0.75)
#lapply(processed_data,dim)

final_training_set <- processed_data$processed_training_set
final_test_set <- processed_data$processed_test_set
#dim(final_training_set); dim(final_test_set)

training_output <- split_data$training_outcome
test_output <- split_data$test_outcome
#length(training_output); length(test_output)
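
A hedged sketch (made-up labels, not the real data) of the IMPORTANT NOTE above: createDataPartition stratifies on a factor's levels, but bins a numeric vector into quantile groups first, so the same underlying variable can yield different partitions.

# hedged sketch with hypothetical labels
y_factor <- factor(rep(c("ckd","notckd"), times=c(10,6)))
set.seed(123); idx_factor  <- createDataPartition(y_factor, p=0.6, list=FALSE)
set.seed(123); idx_numeric <- createDataPartition(as.numeric(y_factor), p=0.6, list=FALSE)
identical(idx_factor, idx_numeric)   # not guaranteed to be TRUE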

Exploratory Graphs

# PCA: scree plot of the variance explained by each principal component
pc <- prcomp(final_training_set,center=T,scale.=T)
plot(pc,type="l",lab=c(10,10,12))

#pc$rotation[order(-abs(pc$rotation[,"PC1"])),]
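
# A hedged one-liner to quantify what the scree plot shows: cumulative
# proportion of variance explained by the principal components.
cumsum(pc$sdev^2) / sum(pc$sdev^2)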

# boxplots of each feature by outcome class
par(xpd=TRUE)
par(mfrow=c(2,2))
for(i in seq_along(colnames(final_training_set)))
{
  plot(training_output,final_training_set[,i],main=colnames(final_training_set)[i],col=2:3)
}

par(mfrow=c(1,1))

Model Building

# 5-fold cross-validation; save fold predictions and class probabilities
train_control <- trainControl(method="cv", number=5, savePredictions=T, classProbs=TRUE)

# svm - linear
set.seed(1)
svm_lm_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "svmLinear",preProcess = c("center", "scale","pca"))
## Loading required package: kernlab
## Warning: package 'kernlab' was built under R version 3.2.4
## Warning in .recacheSubclasses(def@className, def, doSubclasses, env):
## undefined subclass "externalRefMethod" of class "kfunction"; definition not
## updated
## 
## Attaching package: 'kernlab'
## The following object is masked from 'package:ggplot2':
## 
##     alpha
# svm - rbf kernel
set.seed(1)
svm_rbf_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "svmRadial", tuneLength=5, preProcess = c("center", "scale","pca"))

# Ridge regression fits a linear model penalized with the L2-norm (the sum of the squared coefficients).
set.seed(1)
ridge_regression_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=0,lambda=0.001),preProcess = c("center", "scale","pca"))
## Loading required package: glmnet
## Warning: package 'glmnet' was built under R version 3.2.4
## Loading required package: Matrix
## Warning: package 'Matrix' was built under R version 3.2.4
## Loading required package: foreach
## Loaded glmnet 2.0-5
# LASSO (Least Absolute Shrinkage and Selection Operator) fits a model penalized with the L1-norm (the sum of the absolute coefficients).
set.seed(1)
lasso_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=1,lambda=0.001),preProcess = c("center", "scale","pca"))

# Elastic net fits a model penalized with a mix of the L1- and L2-norms (here alpha=0.5, an equal mix).
set.seed(1)
elastic_net_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=0.5,lambda=0.001),preProcess = c("center", "scale","pca"))
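
# Hedged sketch: the three glmnet fits above differ only in alpha (0 = ridge, 1 = lasso).
# Count coefficients shrunk exactly to zero by each penalty; note these coefficients
# are on the preprocessed (centered/scaled/PCA) inputs, not the raw features.
sapply(list(ridge=ridge_regression_model, lasso=lasso_model, elastic=elastic_net_model),
       function(m) sum(as.matrix(coef(m$finalModel, s=0.001)) == 0))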

# classification trees (CART, via rpart)
set.seed(1)
rpart_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "rpart",preProcess = c("center", "scale","pca"))
#plot(rpart_model)

# random forest
set.seed(1)
rf_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "rf",prox=T,preProcess = c("center", "scale"))
## Loading required package: randomForest
## randomForest 4.6-12
## Type rfNews() to see new features/changes/bug fixes.
## 
## Attaching package: 'randomForest'
## The following object is masked from 'package:ggplot2':
## 
##     margin
plot(rf_model)
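
# Hedged sketch: inspect the mtry value selected by cross-validation (shown in the plot above)
rf_model$bestTune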

# boosting with trees (gradient boosting)
set.seed(1)
gbm_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "gbm", verbose=F,preProcess = c("center", "scale","pca"))
## Loading required package: gbm
## Loading required package: survival
## 
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
## 
##     cluster
## Loading required package: splines
## Loading required package: parallel
## Loaded gbm 2.1.1
## Loading required package: plyr
#plot(gbm_model)

# linear discriminant analysis
set.seed(1)
lda_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "lda",preProcess = c("center", "scale","pca"))
## Loading required package: MASS
## Warning: package 'MASS' was built under R version 3.2.2
# collect resamples
training_models <- list(SVM_LM=svm_lm_model,SVM_RBF=svm_rbf_model,RPART=rpart_model,GBM=gbm_model,RF=rf_model,LDA=lda_model,RIDGE=ridge_regression_model,LASSO=lasso_model,ELASTIC=elastic_net_model)
train_results <- resamples(training_models)
# summarize the distributions
summary(train_results)
## 
## Call:
## summary.resamples(object = train_results)
## 
## Models: SVM_LM, SVM_RBF, RPART, GBM, RF, LDA, RIDGE, LASSO, ELASTIC 
## Number of resamples: 5 
## 
## Accuracy 
##           Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## SVM_LM  0.9474  1.0000 1.0000 0.9895       1    1    0
## SVM_RBF 1.0000  1.0000 1.0000 1.0000       1    1    0
## RPART   0.9474  1.0000 1.0000 0.9895       1    1    0
## GBM     0.9474  1.0000 1.0000 0.9895       1    1    0
## RF      0.9474  1.0000 1.0000 0.9895       1    1    0
## LDA     0.9444  0.9474 0.9474 0.9678       1    1    0
## RIDGE   0.8947  0.9444 0.9474 0.9573       1    1    0
## LASSO   0.9444  0.9474 1.0000 0.9784       1    1    0
## ELASTIC 0.9474  0.9474 1.0000 0.9789       1    1    0
## 
## Kappa 
##           Min. 1st Qu. Median   Mean 3rd Qu. Max. NA's
## SVM_LM  0.8550  1.0000  1.000 0.9710       1    1    0
## SVM_RBF 1.0000  1.0000  1.000 1.0000       1    1    0
## RPART   0.8550  1.0000  1.000 0.9710       1    1    0
## GBM     0.8550  1.0000  1.000 0.9710       1    1    0
## RF      0.8550  1.0000  1.000 0.9710       1    1    0
## LDA     0.8525  0.8550  0.855 0.9125       1    1    0
## RIDGE   0.6885  0.8525  0.855 0.8792       1    1    0
## LASSO   0.8525  0.8550  1.000 0.9415       1    1    0
## ELASTIC 0.8550  0.8550  1.000 0.9420       1    1    0
# boxplots of results
bwplot(train_results)
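
# Hedged sketch: pairwise model comparisons via t-tests on the resampled metrics
summary(diff(train_results))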

# The above results suggest that the SVM with RBF kernel performs best on the training data, with the linear SVM, rpart, GBM, and random forest close behind.

Evaluate Model Accuracy on the Test Set

# Ideally, you would select the single model that performs best on the training data and evaluate only it on the test set; here all models are evaluated just for illustration.
test_pred_svm_lm <- predict(svm_lm_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_svm_lm, reference=test_output)

test_pred_svm_rbf <- predict(svm_rbf_model, newdata=final_test_set)
confusionMatrix(data=test_pred_svm_rbf, reference=test_output)
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction ckd notckd
##     ckd     17      0
##     notckd   0     46
##                                      
##                Accuracy : 1          
##                  95% CI : (0.9431, 1)
##     No Information Rate : 0.7302     
##     P-Value [Acc > NIR] : 2.485e-09  
##                                      
##                   Kappa : 1          
##  Mcnemar's Test P-Value : NA         
##                                      
##             Sensitivity : 1.0000     
##             Specificity : 1.0000     
##          Pos Pred Value : 1.0000     
##          Neg Pred Value : 1.0000     
##              Prevalence : 0.2698     
##          Detection Rate : 0.2698     
##    Detection Prevalence : 0.2698     
##       Balanced Accuracy : 1.0000     
##                                      
##        'Positive' Class : ckd        
## 
test_pred_rpart <- predict(rpart_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_rpart, test_output)

test_pred_gbm <- predict(gbm_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_gbm, test_output)

test_pred_rf <- predict(rf_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_rf, test_output)

test_pred_lda <- predict(lda_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_lda, test_output)

test_pred_ridge <- predict(ridge_regression_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_ridge, test_output)

test_pred_lasso <- predict(lasso_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_lasso, test_output)

test_pred_elastic_net <- predict(elastic_net_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_elastic_net, test_output)

balanced_accuracy <- function(trained_model, test_features=final_test_set, test_outcomes=test_output){
  test_model <- predict(trained_model,test_features)
  test_score <- confusionMatrix(data=test_model, reference=test_outcomes)
  # balanced accuracy = (sensitivity + specificity) / 2
  return(test_score$byClass[["Balanced Accuracy"]])
}

lapply(training_models, balanced_accuracy)
## $SVM_LM
## [1] 0.9411765
## 
## $SVM_RBF
## [1] 1
## 
## $RPART
## [1] 1
## 
## $GBM
## [1] 1
## 
## $RF
## [1] 1
## 
## $LDA
## [1] 0.8823529
## 
## $RIDGE
## [1] 0.8823529
## 
## $LASSO
## [1] 0.8823529
## 
## $ELASTIC
## [1] 0.8823529

ROC Curves

roc_curve <- function(test_predictions,colour=1,test_labels=test_output){
  # note: hard class labels give a single operating point rather than a full curve
  pred <- prediction(as.numeric(test_predictions), as.numeric(test_labels))
  perf <- performance(pred, measure = "tpr", x.measure = "fpr")
  plot(perf,col=colour)
}

roc_curve(test_pred_svm_rbf,colour=1)
par(new = TRUE)
roc_curve(test_pred_lda,colour=2)
par(new = TRUE)
roc_curve(test_pred_rf,colour=3)
par(new = TRUE)
roc_curve(test_pred_elastic_net,colour=4)
par(new = TRUE)
roc_curve(test_pred_gbm,colour=5)
par(new = FALSE)
legend("bottomright",c("svm radial kernel", "lda","random forest","elastic_net","graded boosting"), col = c(1:5),cex=0.8,lty=1)
title(main="ROC curves for test data")
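
The curves above are built from hard class labels, so each one is effectively a single operating point. Because classProbs=TRUE was set in trainControl, class probabilities are available; a hedged sketch of a probability-based ROC curve for the RBF SVM:

# hedged sketch: ROC from predicted class probabilities rather than hard labels
prob_svm_rbf <- predict(svm_rbf_model, newdata=final_test_set, type="prob")
pred_obj <- prediction(prob_svm_rbf[,"ckd"], test_output=="ckd")
perf_obj <- performance(pred_obj, measure="tpr", x.measure="fpr")
plot(perf_obj, main="Probability-based ROC: SVM (RBF kernel)")
performance(pred_obj, measure="auc")@y.values[[1]]   # AUC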

Feature Importance

plot(varImp(svm_rbf_model))
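
# Hedged sketch: the ranked scores behind the plot; for svmRadial, caret falls
# back to a model-free, ROC-based variable importance.
head(varImp(svm_rbf_model)$importance)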

ckd_training_set <- which(training_output=="ckd")
nonckd_training_set <- which(training_output=="notckd")

# top features from the importance plot; compare their group means with Welch t-tests
imp_features_svm <- c("rbc.c","bu","sod","bgr","age")
sapply(imp_features_svm,function(i) {
  t.test(final_training_set[ckd_training_set,i],final_training_set[nonckd_training_set,i])
})
##             rbc.c                                                                                   
## statistic   -6.330299                                                                               
## parameter   30.15484                                                                                
## p.value     5.434398e-07                                                                            
## conf.int    Numeric,2                                                                               
## estimate    Numeric,2                                                                               
## null.value  0                                                                                       
## alternative "two.sided"                                                                             
## method      "Welch Two Sample t-test"                                                               
## data.name   "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
##             bu                                                                                      
## statistic   5.7228                                                                                  
## parameter   25.54879                                                                                
## p.value     5.378718e-06                                                                            
## conf.int    Numeric,2                                                                               
## estimate    Numeric,2                                                                               
## null.value  0                                                                                       
## alternative "two.sided"                                                                             
## method      "Welch Two Sample t-test"                                                               
## data.name   "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
##             sod                                                                                     
## statistic   -6.31163                                                                                
## parameter   33.19514                                                                                
## p.value     3.770082e-07                                                                            
## conf.int    Numeric,2                                                                               
## estimate    Numeric,2                                                                               
## null.value  0                                                                                       
## alternative "two.sided"                                                                             
## method      "Welch Two Sample t-test"                                                               
## data.name   "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
##             bgr                                                                                     
## statistic   5.864056                                                                                
## parameter   25.61372                                                                                
## p.value     3.704955e-06                                                                            
## conf.int    Numeric,2                                                                               
## estimate    Numeric,2                                                                               
## null.value  0                                                                                       
## alternative "two.sided"                                                                             
## method      "Welch Two Sample t-test"                                                               
## data.name   "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
##             age                                                                                     
## statistic   5.000511                                                                                
## parameter   85.06871                                                                                
## p.value     3.032596e-06                                                                            
## conf.int    Numeric,2                                                                               
## estimate    Numeric,2                                                                               
## null.value  0                                                                                       
## alternative "two.sided"                                                                             
## method      "Welch Two Sample t-test"                                                               
## data.name   "final_training_set[ckd_training_set, i] and final_training_set[nonckd_training_set, i]"
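
The full htest objects above are verbose; a hedged sketch to pull out just the Welch t-test p-values:

# extract only the p-values for a compact summary
sapply(imp_features_svm, function(i) {
  t.test(final_training_set[ckd_training_set,i], final_training_set[nonckd_training_set,i])$p.value
})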