Load the package

pacman::p_load(tidyverse, caret, corrplot, e1071, interplot, caTools, car, ROCR, IRdisplay, xlsx, ggmap, ggpubr, broom, relaimpo, ggpubr, MASS, MLmetrics,pROC)

Read the data file

setwd("C:/Users/SK/Desktop/SK/NUS EBA/Semester 4/Advance Analytic Project/Diabetic Retinopathy")
retino <- read.csv("C:/Users/SK/Desktop/SK/NUS EBA/Semester 4/Advance Analytic Project/Diabetic Retinopathy/train.csv")

Explore the data file

str(retino)
## 'data.frame':    920 obs. of  20 variables:
##  $ q      : int  1 1 1 1 1 1 1 1 1 1 ...
##  $ ps     : int  1 1 1 1 1 1 1 1 1 0 ...
##  $ nma.a  : int  75 79 41 17 63 86 28 57 3 27 ...
##  $ nma.b  : int  63 76 41 16 63 85 27 57 3 27 ...
##  $ nma.c  : int  60 74 40 16 63 84 26 56 3 27 ...
##  $ nma.d  : int  55 72 40 14 59 83 26 55 3 25 ...
##  $ nma.e  : int  48 69 38 12 57 81 25 55 3 23 ...
##  $ nma.f  : int  35 50 35 9 48 64 15 37 1 19 ...
##  $ nex.a  : num  13.2 61.56 6.09 75.44 13.56 ...
##  $ nex.b  : num  4.397 28.959 0.834 20.352 5.366 ...
##  $ nex.c  : num  0.1041 12.7781 0.0275 5.2374 0.6041 ...
##  $ nex.d  : num  0 2.0453 0 0.2068 0.0515 ...
##  $ nex.e  : num  0 0.03802 0 0.00388 0 ...
##  $ nma.f.1: num  0 0 0 0.000971 0 ...
##  $ nma.g  : num  0 0 0 0.000971 0 ...
##  $ nma.h  : num  0 0 0 0.000971 0 ...
##  $ dd     : num  0.513 0.528 0.507 0.545 0.553 ...
##  $ dm     : num  0.124 0.1019 0.0915 0.0893 0.1124 ...
##  $ amfm   : int  0 0 1 1 0 0 0 0 1 1 ...
##  $ Class  : int  1 1 0 1 1 0 1 1 0 0 ...
summary(retino)
##        q                ps             nma.a            nma.b       
##  Min.   :0.0000   Min.   :0.0000   Min.   :  1.00   Min.   :  1.00  
##  1st Qu.:1.0000   1st Qu.:1.0000   1st Qu.: 16.00   1st Qu.: 16.00  
##  Median :1.0000   Median :1.0000   Median : 36.00   Median : 35.00  
##  Mean   :0.9957   Mean   :0.9141   Mean   : 38.32   Mean   : 36.79  
##  3rd Qu.:1.0000   3rd Qu.:1.0000   3rd Qu.: 55.00   3rd Qu.: 53.00  
##  Max.   :1.0000   Max.   :1.0000   Max.   :147.00   Max.   :132.00  
##      nma.c            nma.d           nma.e           nma.f      
##  Min.   :  1.00   Min.   : 1.00   Min.   : 1.00   Min.   : 1.00  
##  1st Qu.: 15.00   1st Qu.:13.75   1st Qu.:11.00   1st Qu.: 8.00  
##  Median : 32.00   Median :30.00   Median :25.50   Median :18.00  
##  Mean   : 35.01   Mean   :32.18   Mean   :28.67   Mean   :21.06  
##  3rd Qu.: 51.00   3rd Qu.:47.25   3rd Qu.:43.00   3rd Qu.:32.00  
##  Max.   :113.00   Max.   :94.00   Max.   :86.00   Max.   :77.00  
##      nex.a              nex.b             nex.c             nex.d         
##  Min.   :  0.3493   Min.   :  0.000   Min.   :  0.000   Min.   : 0.00000  
##  1st Qu.: 22.3678   1st Qu.:  7.931   1st Qu.:  1.235   1st Qu.: 0.07922  
##  Median : 44.2333   Median : 17.042   Median :  4.434   Median : 0.48734  
##  Mean   : 63.6982   Mean   : 22.905   Mean   :  8.520   Mean   : 1.79810  
##  3rd Qu.: 86.7991   3rd Qu.: 30.451   3rd Qu.: 11.425   3rd Qu.: 1.86316  
##  Max.   :403.9391   Max.   :167.131   Max.   :106.070   Max.   :59.76612  
##      nex.e             nma.f.1             nma.g             nma.h         
##  Min.   : 0.00000   Min.   : 0.00000   Min.   :0.00000   Min.   :0.000000  
##  1st Qu.: 0.00000   1st Qu.: 0.00000   1st Qu.:0.00000   1st Qu.:0.000000  
##  Median : 0.02329   Median : 0.00128   Median :0.00000   Median :0.000000  
##  Mean   : 0.55191   Mean   : 0.19475   Mean   :0.07539   Mean   :0.031751  
##  3rd Qu.: 0.19314   3rd Qu.: 0.03762   3rd Qu.:0.00483   3rd Qu.:0.003825  
##  Max.   :51.42321   Max.   :20.09860   Max.   :5.93780   Max.   :2.171842  
##        dd               dm               amfm            Class       
##  Min.   :0.3678   Min.   :0.05791   Min.   :0.0000   Min.   :0.0000  
##  1st Qu.:0.5025   1st Qu.:0.09609   1st Qu.:0.0000   1st Qu.:0.0000  
##  Median :0.5236   Median :0.10728   Median :0.0000   Median :1.0000  
##  Mean   :0.5230   Mean   :0.10882   Mean   :0.3315   Mean   :0.5239  
##  3rd Qu.:0.5440   3rd Qu.:0.12044   3rd Qu.:1.0000   3rd Qu.:1.0000  
##  Max.   :0.5922   Max.   :0.21920   Max.   :1.0000   Max.   :1.0000
dim(retino)
## [1] 920  20

Convert Class variable to factor

retino$Class <- factor(retino$Class, levels = c(0,1),
                       labels = c("No", "Yes"))

retino$q <- as.factor(retino$q)
retino$ps <- as.factor(retino$ps)
retino$amfm <- as.factor(retino$amfm)

str(retino)
## 'data.frame':    920 obs. of  20 variables:
##  $ q      : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 2 ...
##  $ ps     : Factor w/ 2 levels "0","1": 2 2 2 2 2 2 2 2 2 1 ...
##  $ nma.a  : int  75 79 41 17 63 86 28 57 3 27 ...
##  $ nma.b  : int  63 76 41 16 63 85 27 57 3 27 ...
##  $ nma.c  : int  60 74 40 16 63 84 26 56 3 27 ...
##  $ nma.d  : int  55 72 40 14 59 83 26 55 3 25 ...
##  $ nma.e  : int  48 69 38 12 57 81 25 55 3 23 ...
##  $ nma.f  : int  35 50 35 9 48 64 15 37 1 19 ...
##  $ nex.a  : num  13.2 61.56 6.09 75.44 13.56 ...
##  $ nex.b  : num  4.397 28.959 0.834 20.352 5.366 ...
##  $ nex.c  : num  0.1041 12.7781 0.0275 5.2374 0.6041 ...
##  $ nex.d  : num  0 2.0453 0 0.2068 0.0515 ...
##  $ nex.e  : num  0 0.03802 0 0.00388 0 ...
##  $ nma.f.1: num  0 0 0 0.000971 0 ...
##  $ nma.g  : num  0 0 0 0.000971 0 ...
##  $ nma.h  : num  0 0 0 0.000971 0 ...
##  $ dd     : num  0.513 0.528 0.507 0.545 0.553 ...
##  $ dm     : num  0.124 0.1019 0.0915 0.0893 0.1124 ...
##  $ amfm   : Factor w/ 2 levels "0","1": 1 1 2 2 1 1 1 1 2 2 ...
##  $ Class  : Factor w/ 2 levels "No","Yes": 2 2 1 2 2 1 2 2 1 1 ...

Check the data imbalance

table(retino$Class)
## 
##  No Yes 
## 438 482

Split the data to 70% trainset and 30% testset

set.seed(123)
split <- createDataPartition(y = retino$Class, p= 0.7, list = FALSE)
trainset <- retino[split,]
testset <- retino[-split,]

Use repeat cross validation

retino_cv <- trainControl(method="repeatedcv", number= 10, repeats = 3)

Create SVM model

preProcess = centering and scaling the data

svm_retino <- train(Class~., data = trainset, method = "svmLinear", trControl=retino_cv,
                    preProcess= c("center","scale"), tuneLength = 10)
svm_retino
## Support Vector Machines with Linear Kernel 
## 
## 645 samples
##  19 predictor
##   2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (19), scaled (19) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 580, 580, 582, 580, 580, 582, ... 
## Resampling results:
## 
##   Accuracy   Kappa    
##   0.7288398  0.4624023
## 
## Tuning parameter 'C' was held constant at a value of 1
y_train <- predict(svm_retino, newdata = trainset)
confusionMatrix(table(y_train, trainset$Class))
## Confusion Matrix and Statistics
## 
##        
## y_train  No Yes
##     No  262 116
##     Yes  45 222
##                                           
##                Accuracy : 0.7504          
##                  95% CI : (0.7151, 0.7834)
##     No Information Rate : 0.524           
##     P-Value [Acc > NIR] : < 2.2e-16       
##                                           
##                   Kappa : 0.5049          
##                                           
##  Mcnemar's Test P-Value : 3.453e-08       
##                                           
##             Sensitivity : 0.8534          
##             Specificity : 0.6568          
##          Pos Pred Value : 0.6931          
##          Neg Pred Value : 0.8315          
##              Prevalence : 0.4760          
##          Detection Rate : 0.4062          
##    Detection Prevalence : 0.5860          
##       Balanced Accuracy : 0.7551          
##                                           
##        'Positive' Class : No              
## 
y <- predict(svm_retino, newdata = testset)
confusionMatrix(table(y, testset$Class))
## Confusion Matrix and Statistics
## 
##      
## y      No Yes
##   No  114  61
##   Yes  17  83
##                                           
##                Accuracy : 0.7164          
##                  95% CI : (0.6591, 0.7689)
##     No Information Rate : 0.5236          
##     P-Value [Acc > NIR] : 5.252e-11       
##                                           
##                   Kappa : 0.4399          
##                                           
##  Mcnemar's Test P-Value : 1.123e-06       
##                                           
##             Sensitivity : 0.8702          
##             Specificity : 0.5764          
##          Pos Pred Value : 0.6514          
##          Neg Pred Value : 0.8300          
##              Prevalence : 0.4764          
##          Detection Rate : 0.4145          
##    Detection Prevalence : 0.6364          
##       Balanced Accuracy : 0.7233          
##                                           
##        'Positive' Class : No              
## 
varImp(svm_retino)
## ROC curve variable importance
## 
##         Importance
## nma.a      100.000
## nma.b       89.118
## nma.c       76.200
## nma.d       60.205
## nma.f.1     53.397
## nma.g       52.131
## nma.h       48.244
## nma.e       46.790
## nex.e       39.516
## nma.f       32.718
## nex.b       23.310
## dm          22.313
## ps          19.001
## dd          14.766
## amfm        11.502
## nex.c        8.469
## nex.a        7.284
## nex.d        1.566
## q            0.000

plot the ROC curve and calculate the AUC

svmROC_train <- roc(response=trainset$Class, predictor=factor(y_train, ordered = TRUE), plot=TRUE)

plot(svmROC_train, col="red", lwd=3, main="ROC curve SVM_train")

auc(svmROC_train)
## Area under the curve: 0.7551
svmROC <- roc(response=testset$Class, predictor=factor(y, ordered = TRUE), plot=TRUE)

plot(svmROC, col="red", lwd=3, main="ROC curve SVM_test")

auc(svmROC)
## Area under the curve: 0.7233

Tuning the model (Improve the accuracy but sensitivity drop)

grid <- expand.grid(C = c(0,0.01, 0.05, 0.1, 0.25, 0.5, 0.75, 1, 1.25, 1.5, 1.75, 2,5))
svm_retino_tune <- train(Class ~., data = trainset, method = "svmLinear",
                         trControl=retino_cv,
                         preProcess = c("center", "scale"),
                         tuneGrid = grid,
                         tuneLength = 10)
svm_retino_tune
## Support Vector Machines with Linear Kernel 
## 
## 645 samples
##  19 predictor
##   2 classes: 'No', 'Yes' 
## 
## Pre-processing: centered (19), scaled (19) 
## Resampling: Cross-Validated (10 fold, repeated 3 times) 
## Summary of sample sizes: 580, 580, 580, 581, 581, 581, ... 
## Resampling results across tuning parameters:
## 
##   C     Accuracy   Kappa    
##   0.00        NaN        NaN
##   0.01  0.6283932  0.2604783
##   0.05  0.6779127  0.3617916
##   0.10  0.6970073  0.4003680
##   0.25  0.7145799  0.4353897
##   0.50  0.7280259  0.4617944
##   0.75  0.7352935  0.4758235
##   1.00  0.7363192  0.4775174
##   1.25  0.7363272  0.4773797
##   1.50  0.7358307  0.4764285
##   1.75  0.7357823  0.4762034
##   2.00  0.7414717  0.4874701
##   5.00  0.7424650  0.4891268
## 
## Accuracy was used to select the optimal model using the largest value.
## The final value used for the model was C = 5.
plot(svm_retino_tune)

train_pred_grid <- predict(svm_retino_tune, newdata = trainset)
confusionMatrix(table(train_pred_grid, trainset$Class))
## Confusion Matrix and Statistics
## 
##                
## train_pred_grid  No Yes
##             No  265 107
##             Yes  42 231
##                                          
##                Accuracy : 0.769          
##                  95% CI : (0.7345, 0.801)
##     No Information Rate : 0.524          
##     P-Value [Acc > NIR] : < 2.2e-16      
##                                          
##                   Kappa : 0.5414         
##                                          
##  Mcnemar's Test P-Value : 1.579e-07      
##                                          
##             Sensitivity : 0.8632         
##             Specificity : 0.6834         
##          Pos Pred Value : 0.7124         
##          Neg Pred Value : 0.8462         
##              Prevalence : 0.4760         
##          Detection Rate : 0.4109         
##    Detection Prevalence : 0.5767         
##       Balanced Accuracy : 0.7733         
##                                          
##        'Positive' Class : No             
## 
svmROC_tune_train <- roc(response=trainset$Class, predictor=factor(train_pred_grid, ordered = TRUE), plot=TRUE)

plot(svmROC_tune_train, col="blue", lwd=3, main="ROC curve SVM")

auc(svmROC_tune_train)
## Area under the curve: 0.7733
test_pred_grid <- predict(svm_retino_tune, newdata = testset)
confusionMatrix(table(test_pred_grid, testset$Class))
## Confusion Matrix and Statistics
## 
##               
## test_pred_grid  No Yes
##            No  114  64
##            Yes  17  80
##                                           
##                Accuracy : 0.7055          
##                  95% CI : (0.6477, 0.7587)
##     No Information Rate : 0.5236          
##     P-Value [Acc > NIR] : 6.057e-10       
##                                           
##                   Kappa : 0.419           
##                                           
##  Mcnemar's Test P-Value : 3.203e-07       
##                                           
##             Sensitivity : 0.8702          
##             Specificity : 0.5556          
##          Pos Pred Value : 0.6404          
##          Neg Pred Value : 0.8247          
##              Prevalence : 0.4764          
##          Detection Rate : 0.4145          
##    Detection Prevalence : 0.6473          
##       Balanced Accuracy : 0.7129          
##                                           
##        'Positive' Class : No              
## 
varImp(svm_retino_tune)
## ROC curve variable importance
## 
##         Importance
## nma.a      100.000
## nma.b       89.118
## nma.c       76.200
## nma.d       60.205
## nma.f.1     53.397
## nma.g       52.131
## nma.h       48.244
## nma.e       46.790
## nex.e       39.516
## nma.f       32.718
## nex.b       23.310
## dm          22.313
## ps          19.001
## dd          14.766
## amfm        11.502
## nex.c        8.469
## nex.a        7.284
## nex.d        1.566
## q            0.000
svmROC_tune <- roc(response=testset$Class, predictor=factor(test_pred_grid, ordered = TRUE), plot=TRUE)

plot(svmROC_tune, col="blue", lwd=3, main="ROC curve SVM")

auc(svmROC_tune)
## Area under the curve: 0.7129