pacman::p_load(tidyverse, caret, corrplot, e1071, interplot, caTools, car, ROCR, IRdisplay, xlsx, ggmap, ggpubr, broom, relaimpo, MASS, MLmetrics, pROC)
setwd("C:/Users/SK/Desktop/SK/NUS EBA/Semester 4/Advance Analytic Project/Diabetic Retinopathy")
retino <- read.csv("C:/Users/SK/Desktop/SK/NUS EBA/Semester 4/Advance Analytic Project/Diabetic Retinopathy/train.csv")
str(retino)
## 'data.frame': 920 obs. of 20 variables:
## $ q : int 1 1 1 1 1 1 1 1 1 1 ...
## $ ps : int 1 1 1 1 1 1 1 1 1 0 ...
## $ nma.a : int 75 79 41 17 63 86 28 57 3 27 ...
## $ nma.b : int 63 76 41 16 63 85 27 57 3 27 ...
## $ nma.c : int 60 74 40 16 63 84 26 56 3 27 ...
## $ nma.d : int 55 72 40 14 59 83 26 55 3 25 ...
## $ nma.e : int 48 69 38 12 57 81 25 55 3 23 ...
## $ nma.f : int 35 50 35 9 48 64 15 37 1 19 ...
## $ nex.a : num 13.2 61.56 6.09 75.44 13.56 ...
## $ nex.b : num 4.397 28.959 0.834 20.352 5.366 ...
## $ nex.c : num 0.1041 12.7781 0.0275 5.2374 0.6041 ...
## $ nex.d : num 0 2.0453 0 0.2068 0.0515 ...
## $ nex.e : num 0 0.03802 0 0.00388 0 ...
## $ nma.f.1: num 0 0 0 0.000971 0 ...
## $ nma.g : num 0 0 0 0.000971 0 ...
## $ nma.h : num 0 0 0 0.000971 0 ...
## $ dd : num 0.513 0.528 0.507 0.545 0.553 ...
## $ dm : num 0.124 0.1019 0.0915 0.0893 0.1124 ...
## $ amfm : int 0 0 1 1 0 0 0 0 1 1 ...
## $ Class : int 1 1 0 1 1 0 1 1 0 0 ...
summary(retino)
## q ps nma.a nma.b
## Min. :0.0000 Min. :0.0000 Min. : 1.00 Min. : 1.00
## 1st Qu.:1.0000 1st Qu.:1.0000 1st Qu.: 16.00 1st Qu.: 16.00
## Median :1.0000 Median :1.0000 Median : 36.00 Median : 35.00
## Mean :0.9957 Mean :0.9141 Mean : 38.32 Mean : 36.79
## 3rd Qu.:1.0000 3rd Qu.:1.0000 3rd Qu.: 55.00 3rd Qu.: 53.00
## Max. :1.0000 Max. :1.0000 Max. :147.00 Max. :132.00
## nma.c nma.d nma.e nma.f
## Min. : 1.00 Min. : 1.00 Min. : 1.00 Min. : 1.00
## 1st Qu.: 15.00 1st Qu.:13.75 1st Qu.:11.00 1st Qu.: 8.00
## Median : 32.00 Median :30.00 Median :25.50 Median :18.00
## Mean : 35.01 Mean :32.18 Mean :28.67 Mean :21.06
## 3rd Qu.: 51.00 3rd Qu.:47.25 3rd Qu.:43.00 3rd Qu.:32.00
## Max. :113.00 Max. :94.00 Max. :86.00 Max. :77.00
## nex.a nex.b nex.c nex.d
## Min. : 0.3493 Min. : 0.000 Min. : 0.000 Min. : 0.00000
## 1st Qu.: 22.3678 1st Qu.: 7.931 1st Qu.: 1.235 1st Qu.: 0.07922
## Median : 44.2333 Median : 17.042 Median : 4.434 Median : 0.48734
## Mean : 63.6982 Mean : 22.905 Mean : 8.520 Mean : 1.79810
## 3rd Qu.: 86.7991 3rd Qu.: 30.451 3rd Qu.: 11.425 3rd Qu.: 1.86316
## Max. :403.9391 Max. :167.131 Max. :106.070 Max. :59.76612
## nex.e nma.f.1 nma.g nma.h
## Min. : 0.00000 Min. : 0.00000 Min. :0.00000 Min. :0.000000
## 1st Qu.: 0.00000 1st Qu.: 0.00000 1st Qu.:0.00000 1st Qu.:0.000000
## Median : 0.02329 Median : 0.00128 Median :0.00000 Median :0.000000
## Mean : 0.55191 Mean : 0.19475 Mean :0.07539 Mean :0.031751
## 3rd Qu.: 0.19314 3rd Qu.: 0.03762 3rd Qu.:0.00483 3rd Qu.:0.003825
## Max. :51.42321 Max. :20.09860 Max. :5.93780 Max. :2.171842
## dd dm amfm Class
## Min. :0.3678 Min. :0.05791 Min. :0.0000 Min. :0.0000
## 1st Qu.:0.5025 1st Qu.:0.09609 1st Qu.:0.0000 1st Qu.:0.0000
## Median :0.5236 Median :0.10728 Median :0.0000 Median :1.0000
## Mean :0.5230 Mean :0.10882 Mean :0.3315 Mean :0.5239
## 3rd Qu.:0.5440 3rd Qu.:0.12044 3rd Qu.:1.0000 3rd Qu.:1.0000
## Max. :0.5922 Max. :0.21920 Max. :1.0000 Max. :1.0000
dim(retino)
## [1] 920 20
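# Optional sanity check (a sketch, not part of the original run): confirm there are
# no missing values before converting the categorical columns to factors.
colSums(is.na(retino))
mean(complete.cases(retino))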
retino$Class <- factor(retino$Class, levels = c(0, 1),
                       labels = c("No", "Yes"))
retino$q <- as.factor(retino$q)
retino$ps <- as.factor(retino$ps)
retino$amfm <- as.factor(retino$amfm)
str(retino)
## 'data.frame': 920 obs. of 20 variables:
## $ q : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
## $ ps : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
## $ nma.a : int 75 79 41 17 63 86 28 57 3 27 ...
## $ nma.b : int 63 76 41 16 63 85 27 57 3 27 ...
## $ nma.c : int 60 74 40 16 63 84 26 56 3 27 ...
## $ nma.d : int 55 72 40 14 59 83 26 55 3 25 ...
## $ nma.e : int 48 69 38 12 57 81 25 55 3 23 ...
## $ nma.f : int 35 50 35 9 48 64 15 37 1 19 ...
## $ nex.a : num 13.2 61.56 6.09 75.44 13.56 ...
## $ nex.b : num 4.397 28.959 0.834 20.352 5.366 ...
## $ nex.c : num 0.1041 12.7781 0.0275 5.2374 0.6041 ...
## $ nex.d : num 0 2.0453 0 0.2068 0.0515 ...
## $ nex.e : num 0 0.03802 0 0.00388 0 ...
## $ nma.f.1: num 0 0 0 0.000971 0 ...
## $ nma.g : num 0 0 0 0.000971 0 ...
## $ nma.h : num 0 0 0 0.000971 0 ...
## $ dd : num 0.513 0.528 0.507 0.545 0.553 ...
## $ dm : num 0.124 0.1019 0.0915 0.0893 0.1124 ...
## $ amfm : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 2 2 ...
## $ Class : Factor w/ 2 levels "No","Yes": 2 2 1 2 2 1 2 2 1 1 ...
table(retino$Class)
##
## No Yes
## 438 482
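# A quick check (not shown in the original output): the classes are roughly balanced,
# and the larger proportion matches the "No Information Rate" reported by
# confusionMatrix() later on (about 0.52).
prop.table(table(retino$Class))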
set.seed(123)
split <- createDataPartition(y = retino$Class, p = 0.7, list = FALSE)
trainset <- retino[split,]
testset <- retino[-split,]
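# Because createDataPartition() samples within each class, the split should preserve
# the original class ratio; a quick verification (not in the original run):
prop.table(table(trainset$Class))
prop.table(table(testset$Class))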
retino_cv <- trainControl(method = "repeatedcv", number = 10, repeats = 3)
# note: svmLinear's default grid fixes the cost at C = 1, so tuneLength has no effect here
svm_retino <- train(Class ~ ., data = trainset, method = "svmLinear", trControl = retino_cv,
                    preProcess = c("center", "scale"), tuneLength = 10)
svm_retino
## Support Vector Machines with Linear Kernel
##
## 645 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (19), scaled (19)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 580, 580, 582, 580, 580, 582, ...
## Resampling results:
##
## Accuracy Kappa
## 0.7288398 0.4624023
##
## Tuning parameter 'C' was held constant at a value of 1
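# The underlying kernlab fit is stored in the train object; printing it shows the
# number of support vectors (a quick inspection, not part of the original output):
svm_retino$finalModel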
y_train <- predict(svm_retino, newdata = trainset)
# confusionMatrix() treats the first factor level ("No") as the positive class, so the
# sensitivity/specificity below refer to detecting the "No" class
confusionMatrix(table(y_train, trainset$Class))
## Confusion Matrix and Statistics
##
##
## y_train No Yes
## No 262 116
## Yes 45 222
##
## Accuracy : 0.7504
## 95% CI : (0.7151, 0.7834)
## No Information Rate : 0.524
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5049
##
## Mcnemar's Test P-Value : 3.453e-08
##
## Sensitivity : 0.8534
## Specificity : 0.6568
## Pos Pred Value : 0.6931
## Neg Pred Value : 0.8315
## Prevalence : 0.4760
## Detection Rate : 0.4062
## Detection Prevalence : 0.5860
## Balanced Accuracy : 0.7551
##
## 'Positive' Class : No
##
y <- predict(svm_retino, newdata = testset)
confusionMatrix(table(y, testset$Class))
## Confusion Matrix and Statistics
##
##
## y No Yes
## No 114 61
## Yes 17 83
##
## Accuracy : 0.7164
## 95% CI : (0.6591, 0.7689)
## No Information Rate : 0.5236
## P-Value [Acc > NIR] : 5.252e-11
##
## Kappa : 0.4399
##
## Mcnemar's Test P-Value : 1.123e-06
##
## Sensitivity : 0.8702
## Specificity : 0.5764
## Pos Pred Value : 0.6514
## Neg Pred Value : 0.8300
## Prevalence : 0.4764
## Detection Rate : 0.4145
## Detection Prevalence : 0.6364
## Balanced Accuracy : 0.7233
##
## 'Positive' Class : No
##
# svmLinear has no built-in importance measure, so varImp() falls back to a
# model-free filter based on per-predictor ROC curves
varImp(svm_retino)
## ROC curve variable importance
##
## Importance
## nma.a 100.000
## nma.b 89.118
## nma.c 76.200
## nma.d 60.205
## nma.f.1 53.397
## nma.g 52.131
## nma.h 48.244
## nma.e 46.790
## nex.e 39.516
## nma.f 32.718
## nex.b 23.310
## dm 22.313
## ps 19.001
## dd 14.766
## amfm 11.502
## nex.c 8.469
## nex.a 7.284
## nex.d 1.566
## q 0.000
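# A quick way to visualize the same ranking (a sketch, not part of the original output):
plot(varImp(svm_retino), top = 10)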
# ROC built from hard class predictions rather than probabilities: the curve has a
# single operating point, so the AUC equals the balanced accuracy reported above
svmROC_train <- roc(response = trainset$Class, predictor = factor(y_train, ordered = TRUE), plot = TRUE)
plot(svmROC_train, col = "red", lwd = 3, main = "ROC curve SVM_train")
auc(svmROC_train)
## Area under the curve: 0.7551
svmROC <- roc(response = testset$Class, predictor = factor(y, ordered = TRUE), plot = TRUE)
plot(svmROC, col = "red", lwd = 3, main = "ROC curve SVM_test")
auc(svmROC)
## Area under the curve: 0.7233
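# A probability-based ROC would need class probabilities from the model; a minimal
# sketch, assuming the model is retrained with classProbs enabled (the object names
# retino_cv_prob, svm_retino_prob and prob_test are illustrative, not from the
# original analysis):
retino_cv_prob <- trainControl(method = "repeatedcv", number = 10, repeats = 3,
                               classProbs = TRUE, summaryFunction = twoClassSummary)
svm_retino_prob <- train(Class ~ ., data = trainset, method = "svmLinear",
                         trControl = retino_cv_prob, metric = "ROC",
                         preProcess = c("center", "scale"))
prob_test <- predict(svm_retino_prob, newdata = testset, type = "prob")
svmROC_prob <- roc(response = testset$Class, predictor = prob_test[, "Yes"])
auc(svmROC_prob)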
# C = 0 is not a valid cost for a linear SVM, which is why that row of the tuning results comes back as NaN
grid <- expand.grid(C = c(0, 0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2, 5))
# tuneGrid takes precedence over tuneLength, so the grid above is what gets evaluated
svm_retino_tune <- train(Class ~ ., data = trainset, method = "svmLinear",
                         trControl = retino_cv,
                         preProcess = c("center", "scale"),
                         tuneGrid = grid,
                         tuneLength = 10)
svm_retino_tune
## Support Vector Machines with Linear Kernel
##
## 645 samples
## 19 predictor
## 2 classes: 'No', 'Yes'
##
## Pre-processing: centered (19), scaled (19)
## Resampling: Cross-Validated (10 fold, repeated 3 times)
## Summary of sample sizes: 580, 580, 580, 581, 581, 581, ...
## Resampling results across tuning parameters:
##
## C Accuracy Kappa
## 0.00 NaN NaN
## 0.01 0.6283932 0.2604783
## 0.05 0.6779127 0.3617916
## 0.10 0.6970073 0.4003680
## 0.25 0.7145799 0.4353897
## 0.50 0.7280259 0.4617944
## 0.75 0.7352935 0.4758235
## 1.00 0.7363192 0.4775174
## 1.25 0.7363272 0.4773797
## 1.50 0.7358307 0.4764285
## 1.75 0.7357823 0.4762034
## 2.00 0.7414717 0.4874701
## 5.00 0.7424650 0.4891268
##
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 5.
plot(svm_retino_tune)
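# The tuning object also stores the chosen cost and the full resampling table
# (a quick inspection, not in the original output):
svm_retino_tune$bestTune
head(svm_retino_tune$results)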
train_pred_grid <- predict(svm_retino_tune, newdata = trainset)
confusionMatrix(table(train_pred_grid, trainset$Class))
## Confusion Matrix and Statistics
##
##
## train_pred_grid No Yes
## No 265 107
## Yes 42 231
##
## Accuracy : 0.769
## 95% CI : (0.7345, 0.801)
## No Information Rate : 0.524
## P-Value [Acc > NIR] : < 2.2e-16
##
## Kappa : 0.5414
##
## Mcnemar's Test P-Value : 1.579e-07
##
## Sensitivity : 0.8632
## Specificity : 0.6834
## Pos Pred Value : 0.7124
## Neg Pred Value : 0.8462
## Prevalence : 0.4760
## Detection Rate : 0.4109
## Detection Prevalence : 0.5767
## Balanced Accuracy : 0.7733
##
## 'Positive' Class : No
##
svmROC_tune_train <- roc(response = trainset$Class, predictor = factor(train_pred_grid, ordered = TRUE), plot = TRUE)
plot(svmROC_tune_train, col = "blue", lwd = 3, main = "ROC curve SVM_tune_train")
auc(svmROC_tune_train)
## Area under the curve: 0.7733
test_pred_grid <- predict(svm_retino_tune, newdata = testset)
confusionMatrix(table(test_pred_grid, testset$Class))
## Confusion Matrix and Statistics
##
##
## test_pred_grid No Yes
## No 114 64
## Yes 17 80
##
## Accuracy : 0.7055
## 95% CI : (0.6477, 0.7587)
## No Information Rate : 0.5236
## P-Value [Acc > NIR] : 6.057e-10
##
## Kappa : 0.419
##
## Mcnemar's Test P-Value : 3.203e-07
##
## Sensitivity : 0.8702
## Specificity : 0.5556
## Pos Pred Value : 0.6404
## Neg Pred Value : 0.8247
## Prevalence : 0.4764
## Detection Rate : 0.4145
## Detection Prevalence : 0.6473
## Balanced Accuracy : 0.7129
##
## 'Positive' Class : No
##
# varImp() again uses the per-predictor ROC filter (it does not depend on the fitted
# SVM), so the ranking is identical to the one for the untuned model
varImp(svm_retino_tune)
## ROC curve variable importance
##
## Importance
## nma.a 100.000
## nma.b 89.118
## nma.c 76.200
## nma.d 60.205
## nma.f.1 53.397
## nma.g 52.131
## nma.h 48.244
## nma.e 46.790
## nex.e 39.516
## nma.f 32.718
## nex.b 23.310
## dm 22.313
## ps 19.001
## dd 14.766
## amfm 11.502
## nex.c 8.469
## nex.a 7.284
## nex.d 1.566
## q 0.000
svmROC_tune <- roc(response = testset$Class, predictor = factor(test_pred_grid, ordered = TRUE), plot = TRUE)
plot(svmROC_tune, col = "blue", lwd = 3, main = "ROC curve SVM_tune_test")
auc(svmROC_tune)
## Area under the curve: 0.7129
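# To compare the default-C and tuned models on their cross-validation folds, caret's
# resamples() can be used; a sketch, noting that a strictly paired comparison would
# require fitting both models with identical resampling indices (e.g. via the index
# argument of trainControl):
resamps <- resamples(list(C_default = svm_retino, C_tuned = svm_retino_tune))
summary(resamps)
bwplot(resamps)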