install.packages("RWeka")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
also installing the dependency ‘RWekajars’
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/RWekajars_3.9.3-2.zip'
Content type 'application/zip' length 10032721 bytes (9.6 MB)
downloaded 9.6 MB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/RWeka_0.4-43.zip'
Content type 'application/zip' length 627382 bytes (612 KB)
downloaded 612 KB
package ‘RWekajars’ successfully unpacked and MD5 sums checked
package ‘RWeka’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
install.packages("caret")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/caret_6.0-86.zip'
Content type 'application/zip' length 6253920 bytes (6.0 MB)
downloaded 6.0 MB
package ‘caret’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
install.packages("ROCR")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
also installing the dependencies ‘caTools’, ‘gplots’
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/caTools_1.18.0.zip'
Content type 'application/zip' length 317376 bytes (309 KB)
downloaded 309 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/gplots_3.1.0.zip'
Content type 'application/zip' length 602599 bytes (588 KB)
downloaded 588 KB
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/ROCR_1.0-11.zip'
Content type 'application/zip' length 457691 bytes (446 KB)
downloaded 446 KB
package ‘caTools’ successfully unpacked and MD5 sums checked
package ‘gplots’ successfully unpacked and MD5 sums checked
package ‘ROCR’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
install.packages("rpart")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/rpart_4.1-15.zip'
Content type 'application/zip' length 765821 bytes (747 KB)
downloaded 747 KB
package ‘rpart’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
install.packages("rpart.plot")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/rpart.plot_3.0.9.zip'
Content type 'application/zip' length 1034159 bytes (1009 KB)
downloaded 1009 KB
package ‘rpart.plot’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
install.packages("rattle")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/rattle_5.4.0.zip'
Content type 'application/zip' length 5464555 bytes (5.2 MB)
downloaded 5.2 MB
package ‘rattle’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
install.packages("RGtk2")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/RGtk2_2.20.36.zip'
Content type 'application/zip' length 16529627 bytes (15.8 MB)
downloaded 15.8 MB
package ‘RGtk2’ successfully unpacked and MD5 sums checked
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
suppressWarnings (library(RWeka))
suppressWarnings (library(caret))
suppressWarnings (library(ROCR))
# libraries for partition trees
suppressWarnings (library(rpart))
suppressWarnings (library(rpart.plot))
suppressWarnings (library(rattle))
Chronic_Kidney_Disease
Chronic_Kidney_Disease[1:2,]
summary(Chronic_Kidney_Disease)
id age bp sg
Min. : 0.00 Min. : 2.00 Min. : 50.00 Min. :1.005
1st Qu.: 99.75 1st Qu.:42.00 1st Qu.: 70.00 1st Qu.:1.010
Median :199.50 Median :55.00 Median : 80.00 Median :1.020
Mean :199.50 Mean :51.48 Mean : 76.47 Mean :1.017
3rd Qu.:299.25 3rd Qu.:64.50 3rd Qu.: 80.00 3rd Qu.:1.020
Max. :399.00 Max. :90.00 Max. :180.00 Max. :1.025
NA's :9 NA's :12 NA's :47
al su rbc pc
Min. :0.000 Min. :0.0000 Length:400 Length:400
1st Qu.:0.000 1st Qu.:0.0000 Class :character Class :character
Median :0.000 Median :0.0000 Mode :character Mode :character
Mean :1.017 Mean :0.4501
3rd Qu.:2.000 3rd Qu.:0.0000
Max. :5.000 Max. :5.0000
NA's :46 NA's :49
pcc ba bgr bu
Length:400 Length:400 Min. : 22 Min. : 1.50
Class :character Class :character 1st Qu.: 99 1st Qu.: 27.00
Mode :character Mode :character Median :121 Median : 42.00
Mean :148 Mean : 57.43
3rd Qu.:163 3rd Qu.: 66.00
Max. :490 Max. :391.00
NA's :44 NA's :19
sc sod pot hemo
Min. : 0.400 Min. : 4.5 Min. : 2.500 Min. : 3.10
1st Qu.: 0.900 1st Qu.:135.0 1st Qu.: 3.800 1st Qu.:10.30
Median : 1.300 Median :138.0 Median : 4.400 Median :12.65
Mean : 3.072 Mean :137.5 Mean : 4.627 Mean :12.53
3rd Qu.: 2.800 3rd Qu.:142.0 3rd Qu.: 4.900 3rd Qu.:15.00
Max. :76.000 Max. :163.0 Max. :47.000 Max. :17.80
NA's :17 NA's :87 NA's :88 NA's :52
pcv wc rc
Length:400 Length:400 Length:400
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
htn dm cad
Length:400 Length:400 Length:400
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
appet pe ane
Length:400 Length:400 Length:400
Class :character Class :character Class :character
Mode :character Mode :character Mode :character
classification
Length:400
Class :character
Mode :character
# percentage of data missing in each column
apply(Chronic_Kidney_Disease,2,function(i) {(sum(is.na(i))/nrow(Chronic_Kidney_Disease))*100})
id age bp sg al
0.00 2.25 3.00 11.75 11.50
su rbc pc pcc ba
12.25 38.00 16.25 1.00 1.00
bgr bu sc sod pot
11.00 4.75 4.25 21.75 22.00
hemo pcv wc rc htn
13.00 17.50 26.25 32.50 0.50
dm cad appet pe ane
0.50 0.50 0.25 0.25 0.25
classification
0.00
# samples with no missing data
sum(complete.cases(Chronic_Kidney_Disease))
[1] 158
# samples with no missing data after removing columns which have more than 25% of data missing
sum(complete.cases(Chronic_Kidney_Disease[,-c(6,17,18)]))
[1] 159
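# Hedged sketch: the column indices above are hard-coded; they could also be derived
# programmatically from the missing-value percentages computed earlier (note that a
# strict ">25%" rule selects rbc, wc and rc here, which differs from c(6,17,18)).
missing_pct <- apply(Chronic_Kidney_Disease, 2,
                     function(i) sum(is.na(i)) / nrow(Chronic_Kidney_Disease) * 100)
high_missing_cols <- which(missing_pct > 25)
sum(complete.cases(Chronic_Kidney_Disease[, -high_missing_cols]))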
### remove rows with any missing value
Chronic_Kidney_Disease2 <- Chronic_Kidney_Disease[complete.cases(Chronic_Kidney_Disease),]
dim(Chronic_Kidney_Disease2)
[1] 158 26
apply(Chronic_Kidney_Disease2[,c(3:9,19:25)],2,table)
$bp
50 60 70 80 90 100 110
1 40 37 63 9 7 1
$sg
1.005 1.010 1.015 1.020 1.025
3 23 10 61 61
$al
0 1 2 3 4
116 3 9 15 15
$su
0 1 2 3 4 5
140 6 6 3 2 1
$rbc
abnormal normal
18 140
$pc
abnormal normal
29 129
$pcc
notpresent present
144 14
$rc
2.1 2.5 2.6 2.7 2.9 3.0 3.1 3.2 3.3 3.4 3.5 3.6 3.7 3.8 3.9 4 4.1 4.2 4.3
2 1 1 1 2 1 1 3 1 5 2 1 3 2 5 1 3 1 3
4.5 4.6 4.7 4.8 4.9 5.0 5.1 5.2 5.3 5.4 5.5 5.6 5.7 5.8 5.9 6.0 6.1 6.2 6.3
12 3 7 9 9 7 4 8 7 5 6 6 4 4 5 3 3 5 3
6.4 6.5 8.0
5 3 1
$htn
no yes
124 34
$dm
no yes
130 28
$cad
no yes
147 11
$appet
good poor
139 19
$pe
no yes
138 20
$ane
no yes
142 16
# Function to convert factor variables into dummy variables
create_dummy_var_data <- function(dataset){
  dummy_variables <- dummyVars(~., data=dataset, fullRank=T)
  dummy_var_data <- data.frame( predict(dummy_variables, newdata=dataset) )
  return(dummy_var_data)
}
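# A small hypothetical example (toy data, not part of the dataset) of what the helper does:
# fullRank = TRUE drops one reference level per factor to avoid perfect collinearity.
toy <- data.frame(htn = factor(c("yes", "no", "yes")), age = c(50, 60, 70))
create_dummy_var_data(toy)  # htn becomes a single 0/1 indicator column; age passes through unchanged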
# Function to split data into training and test sets
create_training_test <- function(features_dataset,outcome_data,training_test_ratio){
  training_index <- createDataPartition(outcome_data,p=training_test_ratio,list=F)
  training_set <- droplevels(features_dataset[training_index,])
  test_set <- droplevels(features_dataset[-training_index,])
  outcome_training_set <- factor(outcome_data[training_index])
  outcome_test_set <- factor(outcome_data[-training_index])
  return(list(training_features=training_set, test_features=test_set, training_outcome=outcome_training_set, test_outcome=outcome_test_set))
}
remove_nonvarying_collinear_features <- function(training_data,test_data,corr_threshold=0.75){
  # remove near-zero-variance covariates (features with little or no variability)
  #nearZeroVar(training_data,saveMetrics = T)
  near_zero_covariates <- colnames(training_data)[nearZeroVar(training_data)]
  if(length(near_zero_covariates)>0)
  {
    # find column indices of the near_zero_covariates
    nzc_indices_training <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data))})
    training_data_nzc <- training_data[,-nzc_indices_training]
    nzc_indices_test <- sapply(near_zero_covariates,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data))})
    test_data_nzc <- test_data[,-nzc_indices_test]
  } else {
    training_data_nzc <- training_data
    test_data_nzc <- test_data
  }
  # CORRELATED features
  feature_correlation <- cor(training_data_nzc)
  # findCorrelation searches the correlation matrix and returns the columns to remove to reduce pair-wise correlations
  high_correlation <- findCorrelation(feature_correlation,corr_threshold,verbose=F,names=T)
  if(length(high_correlation)>0)
  {
    correlated_indices_training <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(training_data_nzc))} )
    final_training_data <- training_data_nzc[,-correlated_indices_training]
    correlated_indices_test <- sapply( high_correlation,function(i) {grep( paste("^",i,"$",sep=""),colnames(test_data_nzc))} )
    final_test_data <- test_data_nzc[,-correlated_indices_test]
  }else{
    final_training_data <- training_data_nzc
    final_test_data <- test_data_nzc
  }
  return(list(processed_training_set=final_training_data, processed_test_set=final_test_data))
}
#Execution
#sapply(Chronic_Kidney_Disease2[1,],class)
# since many of the features are categorical, convert them into dummy variables (the outcome is excluded).
outcome_column_id <- grep("class",colnames(Chronic_Kidney_Disease2))
dataset_dummy_variables <- create_dummy_var_data(Chronic_Kidney_Disease2[,-outcome_column_id])
# split data
# IMPORTANT NOTE: the class of the outcome passed to createDataPartition matters: the same variable can give a different training_index depending on whether it is numeric or a factor.
set.seed(123)
split_data <- create_training_test(dataset_dummy_variables,Chronic_Kidney_Disease2$class,0.6)
lapply(split_data,head)
$training_features
$test_features
$training_outcome
[1] ckd ckd ckd ckd ckd ckd
Levels: ckd notckd
$test_outcome
[1] ckd ckd ckd ckd ckd ckd
Levels: ckd notckd
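# Hedged sketch illustrating the note above: createDataPartition stratifies on a factor
# outcome but bins a numeric outcome into quantile groups, so the same variable and seed
# can yield a different training_index depending on its class (object names here are illustrative).
set.seed(123)
idx_factor <- createDataPartition(factor(Chronic_Kidney_Disease2$class), p = 0.6, list = FALSE)
set.seed(123)
idx_numeric <- createDataPartition(as.numeric(factor(Chronic_Kidney_Disease2$class)), p = 0.6, list = FALSE)
identical(idx_factor, idx_numeric)  # may well be FALSE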
# data preprocessing
processed_data <- remove_nonvarying_collinear_features(split_data$training_features,split_data$test_features,0.75)
#lapply(processed_data,dim)
final_training_set <- processed_data$processed_training_set
final_test_set <- processed_data$processed_test_set
#dim(final_training_set); dim(final_test_set)
training_output <- split_data$training_outcome
test_output <- split_data$test_outcome
#length(training_output); length(test_output)
#Exploratory Graphs
# PCA
pc <- prcomp(final_training_set,center=T,scale=T)
plot(pc,type="l",lab=c(10,10,12))
#pc$rotation[order(-abs(pc$rotation[,"PC1"])),]
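# Hedged sketch: cumulative proportion of variance explained by the principal components,
# to complement the scree plot above.
pc_var <- pc$sdev^2 / sum(pc$sdev^2)
round(cumsum(pc_var), 3)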
par(xpd=TRUE)
par(mfrow=c(2,2))
for(i in seq_along(colnames(final_training_set)))
{
plot(training_output,final_training_set[,i],main=colnames(final_training_set)[i],col=2:3)
}
par(mfrow=c(1,1))
#Model Building
# k-fold cross validation
train_control <- trainControl(method="cv", number=5, savePredictions = T,classProbs = TRUE)
# svm - linear
set.seed(1)
svm_lm_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "svmLinear",preProcess = c("center", "scale","pca"))
install.packages("kernlab")
Error in install.packages : Updating loaded packages
# svm - rbf kernel
set.seed(1)
svm_rbf_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "svmRadial", tuneLength=5, preProcess = c("center", "scale","pca"))
# Ridge Regression creates a linear regression model that is penalized with the L2-norm which is the sum of the squared coefficients.
set.seed(1)
ridge_regression_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=0,lambda=0.001),preProcess = c("center", "scale","pca"))
install.packages("kernlab")
Installing package into ‘C:/Users/compu lab/R/win-library/4.0’
(as ‘lib’ is unspecified)
trying URL 'https://cran.rstudio.com/bin/windows/contrib/4.0/kernlab_0.9-29.zip'
Content type 'application/zip' length 2844794 bytes (2.7 MB)
downloaded 2.7 MB
package ‘kernlab’ successfully unpacked and MD5 sums checked
Warning in install.packages :
cannot remove prior installation of package ‘kernlab’
Warning in install.packages :
problem copying C:\Users\compu lab\R\win-library\4.0\00LOCK\kernlab\libs\x64\kernlab.dll to C:\Users\compu lab\R\win-library\4.0\kernlab\libs\x64\kernlab.dll: Permission denied
Warning in install.packages :
restored ‘kernlab’
The downloaded binary packages are in
C:\Users\compu lab\AppData\Local\Temp\RtmpGwhocc\downloaded_packages
# LASSO (Least Absolute Shrinkage and Selection Operator) creates a regression model that is penalized with the L1-norm which is the sum of the absolute coefficients.
set.seed(1)
lasso_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=1,lambda=0.001),preProcess = c("center", "scale","pca"))
# Elastic Net creates a regression model that is penalized with both the L1-norm and L2-norm.
set.seed(1)
elastic_net_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet",family = "binomial",tuneGrid=expand.grid(alpha=0.5,lambda=0.001),preProcess = c("center", "scale","pca"))
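# Hedged alternative sketch: instead of fixing lambda at 0.001, alpha and lambda could be
# tuned jointly over a grid with the same cross-validation setup (object names illustrative).
set.seed(1)
glmnet_grid <- expand.grid(alpha = c(0, 0.5, 1), lambda = 10^seq(-4, 0, length.out = 5))
glmnet_tuned <- train(y=training_output, x=final_training_set, trControl=train_control, method = "glmnet", family = "binomial", tuneGrid=glmnet_grid, preProcess = c("center", "scale","pca"))
# glmnet_tuned$bestTune would then report the selected (alpha, lambda) pair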
# classification Trees
set.seed(1)
rpart_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "rpart",preProcess = c("center", "scale","pca"))
#plot(rpart_model)
# random forest
set.seed(1)
rf_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "rf",prox=T,preProcess = c("center", "scale"))
plot(rf_model)
# boosting with trees (gradient boosting machine)
set.seed(1)
gbm_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "gbm", verbose=F,preProcess = c("center", "scale","pca"))
#plot(gbm_model)
# linear discriminant analysis
set.seed(1)
lda_model <- train(y=training_output, x=final_training_set, trControl=train_control, method = "lda",preProcess = c("center", "scale","pca"))
# collect resamples
training_models <- list(SVM_LM=svm_lm_model,SVM_RBF=svm_rbf_model,RPART=rpart_model,GBM=gbm_model,RF=rf_model,LDA=lda_model,RIDGE=ridge_regression_model,LASSO=lasso_model,ELASTIC=elastic_net_model)
train_results <- resamples(training_models)
# summarize the distributions
summary(train_results)
Call:
summary.resamples(object = train_results)
Models: SVM_LM, SVM_RBF, RPART, GBM, RF, LDA, RIDGE, LASSO, ELASTIC
Number of resamples: 5
Accuracy
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
SVM_LM 0.9473684 0.9500000 1 0.9794737 1 1 0
SVM_RBF 0.8000000 1.0000000 1 0.9600000 1 1 0
RPART 1.0000000 1.0000000 1 1.0000000 1 1 0
GBM 1.0000000 1.0000000 1 1.0000000 1 1 0
RF 1.0000000 1.0000000 1 1.0000000 1 1 0
LDA 0.9500000 1.0000000 1 0.9900000 1 1 0
RIDGE 0.9000000 0.9473684 1 0.9694737 1 1 0
LASSO 1.0000000 1.0000000 1 1.0000000 1 1 0
ELASTIC 0.9473684 1.0000000 1 0.9894737 1 1 0
Kappa
Min. 1st Qu. Median Mean 3rd Qu. Max. NA's
SVM_LM 0.8549618 0.8750000 1 0.9459924 1 1 0
SVM_RBF 0.5652174 1.0000000 1 0.9130435 1 1 0
RPART 1.0000000 1.0000000 1 1.0000000 1 1 0
GBM 1.0000000 1.0000000 1 1.0000000 1 1 0
RF 1.0000000 1.0000000 1 1.0000000 1 1 0
LDA 0.8750000 1.0000000 1 0.9750000 1 1 0
RIDGE 0.7368421 0.8549618 1 0.9183608 1 1 0
LASSO 1.0000000 1.0000000 1 1.0000000 1 1 0
ELASTIC 0.8549618 1.0000000 1 0.9709924 1 1 0
# boxplots of results
bwplot(train_results)
# the above results suggest that the tree-based models (rpart, GBM, random forest) and LASSO perform best on the training data, with perfect cross-validation accuracy.
#EVALUATE MODEL ACCURACY ON TEST SET
# Ideally, you would select only the model that performs best on the training data and evaluate it on the test set; here all models are evaluated just for illustration.
test_pred_svm_lm <- predict(svm_lm_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_svm_lm, reference=test_output)
test_pred_svm_rbf <- predict(svm_rbf_model, newdata=final_test_set)
confusionMatrix(data=test_pred_svm_rbf, reference=test_output)
Confusion Matrix and Statistics
Reference
Prediction ckd notckd
ckd 17 1
notckd 0 45
Accuracy : 0.9841
95% CI : (0.9147, 0.9996)
No Information Rate : 0.7302
P-Value [Acc > NIR] : 6.034e-08
Kappa : 0.9605
Mcnemar's Test P-Value : 1
Sensitivity : 1.0000
Specificity : 0.9783
Pos Pred Value : 0.9444
Neg Pred Value : 1.0000
Prevalence : 0.2698
Detection Rate : 0.2698
Detection Prevalence : 0.2857
Balanced Accuracy : 0.9891
'Positive' Class : ckd
test_pred_rpart <- predict(rpart_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_rpart, test_output)
test_pred_gbm <- predict(gbm_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_gbm, test_output)
test_pred_rf <- predict(rf_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_rf, test_output)
test_pred_lda <- predict(lda_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_lda, test_output)
test_pred_ridge <- predict(ridge_regression_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_ridge, test_output)
test_pred_lasso <- predict(lasso_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_lasso, test_output)
test_pred_elastic_net <- predict(elastic_net_model, newdata=final_test_set)
#confusionMatrix(data=test_pred_elastic_net, test_output)
balanced_accuracy <- function(trained_model, test_features=final_test_set, test_outcomes=test_output){
  test_model <- predict(trained_model,test_features)
  test_score <- confusionMatrix(data=test_model, test_outcomes)
  return(test_score$byClass[["Balanced Accuracy"]])
}
lapply(training_models, balanced_accuracy)
$SVM_LM
[1] 0.9411765
$SVM_RBF
[1] 0.9891304
$RPART
[1] 1
$GBM
[1] 1
$RF
[1] 1
$LDA
[1] 0.9411765
$RIDGE
[1] 0.8823529
$LASSO
[1] 0.9411765
$ELASTIC
[1] 0.9411765
#ROC CURVES
roc_curve <- function(test_predictions,colour=1,test_labels=test_output){
  pred <- prediction(as.numeric(test_predictions), as.numeric(test_labels))
  perf <- performance(pred, measure = "tpr", x.measure = "fpr")
  plot(perf,col=colour)
}
roc_curve(test_pred_svm_rbf,colour=1)
par(new = TRUE)
roc_curve(test_pred_lda,colour=2)
par(new = TRUE)
roc_curve(test_pred_rf,colour=3)
par(new = TRUE)
roc_curve(test_pred_elastic_net,colour=4)
par(new = TRUE)
roc_curve(test_pred_gbm,colour=5)
par(new = FALSE)
legend("bottomright",c("svm radial kernel", "lda","random forest","elastic_net","graded boosting"), col = c(1:5),cex=0.8,lty=1)
title(main="ROC curves for test data")
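# Hedged sketch: the curves above use hard class labels, so each is effectively a single-threshold
# ROC. Since classProbs = TRUE was set in trainControl, class probabilities can be used instead
# for a threshold-varying curve (shown here for the random forest model; object names illustrative).
prob_pred_rf <- predict(rf_model, newdata=final_test_set, type="prob")
pred_obj <- prediction(prob_pred_rf[,"ckd"], as.numeric(test_output=="ckd"))
perf_obj <- performance(pred_obj, measure = "tpr", x.measure = "fpr")
plot(perf_obj, col=6, main="ROC from class probabilities (random forest)")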
plot(varImp(svm_rbf_model))
ckd_training_set <- which(training_output=="ckd")
nonckd_training_set <- which(training_output=="notckd")
imp_features_svm <- c("rbc.c","bu","sod","bgr","age")
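# Two-sample t-tests comparing each of these features between the ckd and notckd training samples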
sapply(imp_features_svm,function(i) {
t.test(final_training_set[ckd_training_set,i],final_training_set[nonckd_training_set,i])
})