Pengaplikasian Metode Machine Learning SVM pada pengklasifikasian Sinta Score ITS

2022-10-14

Library

Beberapa Library yang digunakan pada praktikum kali ini :

library(caret)
library(tidyverse)
library(knitr)
library(ggplot2)
library(tidyr)
library(e1071)
library(ROCR)
library(rpart)
library(UBL)
library(googlesheets4)

Dataset

# Read the public sheet without authentication and keep it as a base
# data.frame.
gs4_deauth()
data_sinta_SVM <- as.data.frame(
  read_sheet("https://docs.google.com/spreadsheets/d/1DW_JRJop1LPqmxNue7tRawadTgf914KFFe4eXansaW4/edit?usp=sharing")
)
## ✔ Reading from "Data Sinta ITS".
## ✔ Range 'data_sinta_417'.
# The categorical predictors and the binary response `y` are stored as factors.
data_sinta_SVM[c("Rumpun_Ilmu", "Jenjang", "Akreditasi", "y")] <- lapply(
  data_sinta_SVM[c("Rumpun_Ilmu", "Jenjang", "Akreditasi", "y")],
  as.factor
)
str(data_sinta_SVM)
## 'data.frame':    981 obs. of  7 variables:
##  $ Rumpun_Ilmu              : Factor w/ 7 levels "Ekonomi","Kesehatan",..: 1 1 1 1 1 1 1 1 1 1 ...
##  $ Jenjang                  : Factor w/ 4 levels "D4","S1","S2",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Akreditasi               : Factor w/ 3 levels "Baik","Baik Sekali",..: 2 2 2 2 2 2 2 2 2 2 ...
##  $ Jumlah_Dosen_Total       : num  14 14 14 14 14 14 14 14 14 14 ...
##  $ Jumlah_Mahasiswa         : num  488 488 488 488 488 488 488 488 488 488 ...
##  $ Rasio_Dosen_per_Mahasiswa: num  0.104 0.104 0.104 0.104 0.104 ...
##  $ y                        : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
# 75/25 split on the response `y`; the seed fixes the random partition.
set.seed(414)
in.train <- createDataPartition(data_sinta_SVM$y, p = 0.75, list = FALSE)
data_sinta_SVM_train <- data_sinta_SVM[in.train, ]  # rows used for modelling
data_sinta_SVM_test <- data_sinta_SVM[-in.train, ]  # rows held out for evaluation
# Class counts in the training and testing partitions.
cat("Frekuensi Data Training/Testing")
## Frekuensi Data Training/Testing
round(table(data_sinta_SVM_train$y), digits = 4)
## 
##   0   1 
## 560 177
round(table(data_sinta_SVM_test$y), digits = 4)
## 
##   0   1 
## 186  58
# Class proportions in the training and testing partitions.
cat("\nProporsi Data Training/Testing")
## 
## Proporsi Data Training/Testing
round(prop.table(table(data_sinta_SVM_train$y)), digits = 4)
## 
##      0      1 
## 0.7598 0.2402
round(prop.table(table(data_sinta_SVM_test$y)), digits = 4)
## 
##      0      1 
## 0.7623 0.2377

SMOTE

Pada kasus ini akan digunakan SMOTE (Synthetic Minority Over-sampling Technique) sebagai perlakuan penanganan ketidakseimbangan kelas pada data training.

set.seed(414)
# Oversample class "1" until it matches class "0"; class "0" is left as is.
# The ratio is computed from the response column only. Comparing the whole
# data frame to 0/1 would also count numeric predictors that happen to equal
# 0 or 1 and distort the oversampling percentage.
data_sinta_SVM_train_smote <- SmoteClassif(
  form = y ~ .,
  dat = data_sinta_SVM_train,
  C.perc = list(
    "0" = 1,
    "1" = sum(data_sinta_SVM_train$y == "0") / sum(data_sinta_SVM_train$y == "1")
  ),
  dist = "HVDM"
)
round(table(data_sinta_SVM_train_smote$y), digits = 4)
## 
##   0   1 
## 560 560
round(prop.table(table(data_sinta_SVM_train_smote$y)), digits = 4)
## 
##   0   1 
## 0.5 0.5

Barplot sebelum vs Sesudah Smote

Sebelum

# Class frequencies of the training data before SMOTE, drawn as a labelled
# bar chart.
df1 <- as.data.frame(table(data_sinta_SVM_train$y))
my_bar1 <- barplot(
  height = df1$Freq,
  names.arg = df1$Var1,
  border = FALSE,
  col = c("coral", "cadetblue1"),
  las = 2,
  ylim = c(0, 600),
  main = "Perbandingan Frekuensi Klasifikasi Sinta Score"
)
# Print each count just above its bar.
text(x = my_bar1, y = df1$Freq + 25, labels = df1$Freq)

Sesudah

# Class frequencies of the training data after SMOTE, drawn as a labelled
# bar chart.
df2 <- as.data.frame(table(data_sinta_SVM_train_smote$y))
my_bar2 <- barplot(
  height = df2$Freq,
  names.arg = df2$Var1,
  border = FALSE,
  col = c("coral", "cadetblue1"),
  las = 2,
  ylim = c(0, 600),
  main = "Perbandingan Frekuensi Klasifikasi Sinta Score"
)
# Print each count just above its bar.
text(x = my_bar2, y = df2$Freq + 25, labels = df2$Freq)

Function Performa Model

#' Summarise classification performance with class "1" as the positive class.
#'
#' @param pred Factor of predicted classes.
#' @param data Data frame whose column `y` holds the reference classes.
#' @return Named numeric vector with Accuracy, Sensitivity, Specificity and
#'   Balanced Accuracy taken from `caret::confusionMatrix()`.
perform <- function(pred, data) {
  cm <- caret::confusionMatrix(pred, data$y, positive = "1")
  c(
    cm$overall["Accuracy"],
    cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]
  )
}

Model SVM Kernel Linear

# Linear-kernel SVM fitted on the SMOTE training set and scored on the test
# partition.
model.svm1 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  scale = TRUE
)
# model.svm1
pred.svm1 <- predict(model.svm1, data_sinta_SVM_test)
tabel1 <- caret::confusionMatrix(
  pred.svm1,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel1

Model SVM Kernel Sigmoid

# Sigmoid-kernel SVM fitted on the SMOTE training set and scored on the test
# partition.
model.svm2 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "sigmoid",
  scale = TRUE
)
# model.svm2
pred.svm2 <- predict(model.svm2, data_sinta_SVM_test)
tabel2 <- caret::confusionMatrix(
  pred.svm2,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel2

Model SVM Kernel Radial

# Radial-kernel SVM fitted on the SMOTE training set and scored on the test
# partition.
model.svm3 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "radial",
  scale = TRUE
)
# model.svm3
pred.svm3 <- predict(model.svm3, data_sinta_SVM_test)
tabel3 <- caret::confusionMatrix(
  pred.svm3,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel3

Model SVM Kernel Polynomial

# Polynomial-kernel SVM fitted on the SMOTE training set and scored on the
# test partition.
model.svm4 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "polynomial",
  scale = TRUE
)
# model.svm4
pred.svm4 <- predict(model.svm4, data_sinta_SVM_test)
tabel4 <- caret::confusionMatrix(
  pred.svm4,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel4

Perbandingan Model

# One row per kernel with test-set Accuracy, Sensitivity and Specificity,
# shown from the most to the least accurate model.
hasil_eval <- do.call(
  rbind,
  lapply(
    list(tabel1, tabel2, tabel3, tabel4),
    function(cm) {
      c(cm$overall["Accuracy"], cm$byClass[c("Sensitivity", "Specificity")])
    }
  )
)
rownames(hasil_eval) <- c(
  "SVM Kernel Linear",
  "SVM Kernel Sigmoid",
  "SVM Kernel Radial",
  "SVM Kernel Polynomial"
)
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))

Tuning Hyperparameter SVM

# Compare the four kernels with tune() and show the model refitted with the
# best-performing kernel.
# NOTE(review): no seed is set before this call, so the cross-validation folds
# (and possibly the selected kernel) can differ between runs.
tuningsvm <- tune(
  METHOD = svm,
  train.x = y ~ .,
  data = data_sinta_SVM_train_smote,
  ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid"))
)
tuningsvm$best.model
## 
## Call:
## best.tune(METHOD = svm, train.x = y ~ ., data = data_sinta_SVM_train_smote, 
##     ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid")))
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  738
# Grid-search the cost of the linear kernel over 0.01, 0.02, ..., 0.10.
tune_svm <- tune(
  METHOD = svm,
  train.x = y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  ranges = list(cost = seq(from = 0.01, to = 0.1, by = 0.01))
)
print(tune_svm)
## 
## Parameter tuning of 'svm':
## 
## - sampling method: 10-fold cross validation 
## 
## - best parameters:
##  cost
##  0.04
## 
## - best performance: 0.3116071
# Final model: linear kernel with cost 0.04, the value reported as best by the
# cost grid search above.
model.svm5 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  cost = 0.04
)
model.svm5
## 
## Call:
## svm(formula = y ~ ., data = data_sinta_SVM_train_smote, kernel = "linear", 
##     cost = 0.04)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  0.04 
## 
## Number of Support Vectors:  837
# Predictions of the final model on the test rows and on the original
# (pre-SMOTE) training rows.
pred.svm5 <- predict(model.svm5, data_sinta_SVM_test)
pred.svm6 <- predict(model.svm5, data_sinta_SVM_train)

# Test-set confusion matrix, with "1" as the positive class.
tabel5 <- caret::confusionMatrix(
  pred.svm5,
  data_sinta_SVM_test$y,
  positive = "1"
)
tabel5
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 160  28
##          1  26  30
##                                           
##                Accuracy : 0.7787          
##                  95% CI : (0.7213, 0.8292)
##     No Information Rate : 0.7623          
##     P-Value [Acc > NIR] : 0.3027          
##                                           
##                   Kappa : 0.382           
##                                           
##  Mcnemar's Test P-Value : 0.8918          
##                                           
##             Sensitivity : 0.5172          
##             Specificity : 0.8602          
##          Pos Pred Value : 0.5357          
##          Neg Pred Value : 0.8511          
##              Prevalence : 0.2377          
##          Detection Rate : 0.1230          
##    Detection Prevalence : 0.2295          
##       Balanced Accuracy : 0.6887          
##                                           
##        'Positive' Class : 1               
## 
# Confusion matrix of the final model on the original (pre-SMOTE) training
# rows, with "1" as the positive class.
tabel6 <- caret::confusionMatrix(
  pred.svm6,
  data_sinta_SVM_train$y,
  positive = "1"
)
tabel6
## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 469  82
##          1  91  95
##                                          
##                Accuracy : 0.7653         
##                  95% CI : (0.733, 0.7954)
##     No Information Rate : 0.7598         
##     P-Value [Acc > NIR] : 0.384          
##                                          
##                   Kappa : 0.3678         
##                                          
##  Mcnemar's Test P-Value : 0.543          
##                                          
##             Sensitivity : 0.5367         
##             Specificity : 0.8375         
##          Pos Pred Value : 0.5108         
##          Neg Pred Value : 0.8512         
##              Prevalence : 0.2402         
##          Detection Rate : 0.1289         
##    Detection Prevalence : 0.2524         
##       Balanced Accuracy : 0.6871         
##                                          
##        'Positive' Class : 1              
## 
# Training vs. testing performance of the tuned linear SVM (model.svm5).
# Metrics are looked up by name, and both row labels now say "Linear": the
# second row was previously labelled "SVM Kernel Testing" although it is the
# same linear model evaluated on the test partition.
hasil_eval <- rbind(
  c(tabel6$overall["Accuracy"], tabel6$byClass["Balanced Accuracy"]),
  c(tabel5$overall["Accuracy"], tabel5$byClass["Balanced Accuracy"])
)
rownames(hasil_eval) <- c("SVM Linear Training", "SVM Linear Testing")
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))

Evaluasi Model dengan Pengulangan

# Repeat the split -> SMOTE -> linear SVM pipeline `perulangan` times and look
# at the spread of the test accuracy.
perulangan <- 100
# Allocated by the original script; nothing below fills it.
SVM_list <- vector(mode = "list", length = perulangan)

# One accuracy per repetition, preallocated instead of growing a data frame
# with rbind() (the old accumulator also declared unused nb/pc columns).
akurasi_ulangan <- numeric(perulangan)

# Seed the whole experiment so the boxplot is reproducible.
set.seed(414)

# NOTE(review): every iteration overwrites the split, the SMOTE set,
# model.svm4, pred.svm4 and tabel4 created by earlier chunks, so code that
# runs after this loop sees the objects of the last repetition.
for (i in seq_len(perulangan)) {
  in.train <- createDataPartition(data_sinta_SVM$y, p = 0.75, list = FALSE)
  data_sinta_SVM_train <- data_sinta_SVM[in.train, ]
  data_sinta_SVM_test <- data_sinta_SVM[-in.train, ]
  # Oversampling ratio computed from the response column only, not from every
  # cell of the data frame.
  data_sinta_SVM_train_smote <- SmoteClassif(
    form = y ~ .,
    dat = data_sinta_SVM_train,
    C.perc = list(
      "0" = 1,
      "1" = sum(data_sinta_SVM_train$y == "0") / sum(data_sinta_SVM_train$y == "1")
    ),
    dist = "HVDM"
  )

  # Support vector machine with the tuned settings
  model.svm4 <- svm(
    y ~ .,
    data = data_sinta_SVM_train_smote,
    kernel = "linear",
    cost = 0.04
  )
  pred.svm4 <- predict(model.svm4, data_sinta_SVM_test)
  tabel4 <- caret::confusionMatrix(
    pred.svm4,
    data_sinta_SVM_test$y,
    positive = "1"
  )
  akurasi_svm <- tabel4$overall[["Accuracy"]]

  # Store the accuracy of this repetition.
  akurasi_ulangan[i] <- akurasi_svm
}

df_akurasi <- data.frame(SVM = akurasi_ulangan)
df_akurasi %>%
  as_tibble() %>%
  mutate(ulangan = seq_len(perulangan)) %>%
  pivot_longer(-ulangan) %>%
  ggplot(aes(name, value)) +
  geom_boxplot() +
  xlab("Metode") +
  ylab("Akurasi")

library("e1071")
library("DALEX")
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
## 
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
## 
##     explain
library("ggplot2")
# Refit the tuned linear SVM on a logical response (y == "1") with probability
# estimates enabled, for use by the DALEX explainer.
# NOTE(review): this overwrites the `model.svm1` fitted in the linear-kernel
# section.
set.seed(41)
model.svm1 <- svm(
  y == "1" ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  cost = 0.04,
  type = "C-classification",
  probability = TRUE
)
set.seed(41)
# The response is dropped from the explainer data by name instead of by the
# hard-coded position 7, so a change in column order cannot leak `y` into the
# predictors.
explainer_svm <- DALEX::explain(
  model = model.svm1,
  data = data_sinta_SVM_train_smote[
    , names(data_sinta_SVM_train_smote) != "y", drop = FALSE
  ],
  y = data_sinta_SVM_train_smote$y == "1",
  label = "Support Vector Machine"
)
## Preparation of a new explainer is initiated
##   -> model label       :  Support Vector Machine 
##   -> data              :  1120  rows  6  cols 
##   -> target variable   :  1120  values 
##   -> predict function  :  yhat.svm  will be used (  default  )
##   -> predicted values  :  No value for predict function target column. (  default  )
##   -> model_info        :  package e1071 , ver. 1.7.13 , task classification (  default  ) 
##   -> model_info        :  Model info detected classification task but 'y' is a logical . Converted to numeric.  (  NOTE  )
##   -> predicted values  :  numerical, min =  0.2509177 , mean =  0.4986054 , max =  0.8645471  
##   -> residual function :  difference between y and yhat (  default  )
##   -> residuals         :  numerical, min =  -0.8428362 , mean =  0.00139461 , max =  0.7490823  
##   A new explainer has been created!
# Permutation-based variable importance with B = 1000 permutations.
set.seed(41)
vip_svm <- model_parts(explainer = explainer_svm, B = 1000)
vip_svm
plot(vip_svm) +
  ggtitle("Mean variable-importance over 1000 permutations", "")

---
title: "Pengaplikasian Metode Machine Learning SVM pada pengklasifikasian Sinta Score ITS"
date: "2022-10-14"
output:
  rmdformats::downcute:
    downcute_theme: "chaos"
    self_contained: true
    code_download: true
    toc_float: true
    toc_depth: 3
    df_print: paged
    code_folding: show
    theme: cerulean
    highlight: "kate"
---

![](Paralel 1_4_TugasUTS.jpg)

# Library

Beberapa Library yang digunakan pada praktikum kali ini :

```{r, warning=FALSE, message=FALSE, collapse=TRUE}
library(caret)
library(tidyverse)
library(knitr)
library(ggplot2)
library(tidyr)
library(e1071)
library(ROCR)
library(rpart)
library(UBL)
library(googlesheets4)
```

## Dataset

```{r}
# Read the public sheet without authentication and keep it as a base
# data.frame.
gs4_deauth()
data_sinta_SVM <- as.data.frame(
  read_sheet("https://docs.google.com/spreadsheets/d/1DW_JRJop1LPqmxNue7tRawadTgf914KFFe4eXansaW4/edit?usp=sharing")
)
# The categorical predictors and the binary response `y` are stored as factors.
data_sinta_SVM[c("Rumpun_Ilmu", "Jenjang", "Akreditasi", "y")] <- lapply(
  data_sinta_SVM[c("Rumpun_Ilmu", "Jenjang", "Akreditasi", "y")],
  as.factor
)
str(data_sinta_SVM)
```

```{r}
# 75/25 split on the response `y`; the seed fixes the random partition.
set.seed(414)
in.train <- createDataPartition(data_sinta_SVM$y, p = 0.75, list = FALSE)
data_sinta_SVM_train <- data_sinta_SVM[in.train, ]  # rows used for modelling
data_sinta_SVM_test <- data_sinta_SVM[-in.train, ]  # rows held out for evaluation

# Class counts in the training and testing partitions.
cat("Frekuensi Data Training/Testing")
round(table(data_sinta_SVM_train$y), digits = 4)
round(table(data_sinta_SVM_test$y), digits = 4)

# Class proportions in the training and testing partitions.
cat("\nProporsi Data Training/Testing")
round(prop.table(table(data_sinta_SVM_train$y)), digits = 4)
round(prop.table(table(data_sinta_SVM_test$y)), digits = 4)
```

## SMOTE

Pada kasus ini akan digunakan SMOTE (Synthetic Minority Over-sampling Technique) sebagai perlakuan penanganan ketidakseimbangan kelas pada data training.

```{r, warning=FALSE, message=FALSE}
set.seed(414)
# Oversample class "1" until it matches class "0"; class "0" is left as is.
# The ratio is computed from the response column only. Comparing the whole
# data frame to 0/1 would also count numeric predictors that happen to equal
# 0 or 1 and distort the oversampling percentage.
data_sinta_SVM_train_smote <- SmoteClassif(
  form = y ~ .,
  dat = data_sinta_SVM_train,
  C.perc = list(
    "0" = 1,
    "1" = sum(data_sinta_SVM_train$y == "0") / sum(data_sinta_SVM_train$y == "1")
  ),
  dist = "HVDM"
)
round(table(data_sinta_SVM_train_smote$y), digits = 4)
round(prop.table(table(data_sinta_SVM_train_smote$y)), digits = 4)
```

# Barplot sebelum vs Sesudah `Smote`

## Sebelum

```{r}
# Class frequencies of the training data before SMOTE, drawn as a labelled
# bar chart.
df1 <- as.data.frame(table(data_sinta_SVM_train$y))
my_bar1 <- barplot(
  height = df1$Freq,
  names.arg = df1$Var1,
  border = FALSE,
  col = c("coral", "cadetblue1"),
  las = 2,
  ylim = c(0, 600),
  main = "Perbandingan Frekuensi Klasifikasi Sinta Score"
)
# Print each count just above its bar.
text(x = my_bar1, y = df1$Freq + 25, labels = df1$Freq)
```

## Sesudah

```{r}
# Class frequencies of the training data after SMOTE, drawn as a labelled
# bar chart.
df2 <- as.data.frame(table(data_sinta_SVM_train_smote$y))
my_bar2 <- barplot(
  height = df2$Freq,
  names.arg = df2$Var1,
  border = FALSE,
  col = c("coral", "cadetblue1"),
  las = 2,
  ylim = c(0, 600),
  main = "Perbandingan Frekuensi Klasifikasi Sinta Score"
)
# Print each count just above its bar.
text(x = my_bar2, y = df2$Freq + 25, labels = df2$Freq)
```

# Function Performa Model

```{r}
#' Summarise classification performance with class "1" as the positive class.
#'
#' @param pred Factor of predicted classes.
#' @param data Data frame whose column `y` holds the reference classes.
#' @return Named numeric vector with Accuracy, Sensitivity, Specificity and
#'   Balanced Accuracy taken from `caret::confusionMatrix()`.
perform <- function(pred, data) {
  cm <- caret::confusionMatrix(pred, data$y, positive = "1")
  c(
    cm$overall["Accuracy"],
    cm$byClass[c("Sensitivity", "Specificity", "Balanced Accuracy")]
  )
}
```

## Model SVM Kernel `Linear`

```{r}
# Linear-kernel SVM fitted on the SMOTE training set and scored on the test
# partition.
model.svm1 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  scale = TRUE
)
# model.svm1
pred.svm1 <- predict(model.svm1, data_sinta_SVM_test)
tabel1 <- caret::confusionMatrix(
  pred.svm1,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel1
```

## Model SVM Kernel `Sigmoid`

```{r}
# Sigmoid-kernel SVM fitted on the SMOTE training set and scored on the test
# partition.
model.svm2 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "sigmoid",
  scale = TRUE
)
# model.svm2
pred.svm2 <- predict(model.svm2, data_sinta_SVM_test)
tabel2 <- caret::confusionMatrix(
  pred.svm2,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel2
```

## Model SVM Kernel `Radial`

```{r}
# Radial-kernel SVM fitted on the SMOTE training set and scored on the test
# partition.
model.svm3 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "radial",
  scale = TRUE
)
# model.svm3
pred.svm3 <- predict(model.svm3, data_sinta_SVM_test)
tabel3 <- caret::confusionMatrix(
  pred.svm3,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel3
```

## Model SVM Kernel `Polynomial`

```{r}
# Polynomial-kernel SVM fitted on the SMOTE training set and scored on the
# test partition.
model.svm4 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "polynomial",
  scale = TRUE
)
# model.svm4
pred.svm4 <- predict(model.svm4, data_sinta_SVM_test)
tabel4 <- caret::confusionMatrix(
  pred.svm4,
  data_sinta_SVM_test$y,
  positive = "1"
)
# tabel4
```

## Perbandingan Model

```{r}
# One row per kernel with test-set Accuracy, Sensitivity and Specificity,
# shown from the most to the least accurate model.
hasil_eval <- do.call(
  rbind,
  lapply(
    list(tabel1, tabel2, tabel3, tabel4),
    function(cm) {
      c(cm$overall["Accuracy"], cm$byClass[c("Sensitivity", "Specificity")])
    }
  )
)
rownames(hasil_eval) <- c(
  "SVM Kernel Linear",
  "SVM Kernel Sigmoid",
  "SVM Kernel Radial",
  "SVM Kernel Polynomial"
)
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))

```

# Tuning Hyperparameter SVM

```{r}
# Compare the four kernels with tune() and show the model refitted with the
# best-performing kernel.
# NOTE(review): no seed is set before this call, so the cross-validation folds
# (and possibly the selected kernel) can differ between runs.
tuningsvm <- tune(
  METHOD = svm,
  train.x = y ~ .,
  data = data_sinta_SVM_train_smote,
  ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid"))
)
tuningsvm$best.model
```

```{r}
# Grid-search the cost of the linear kernel over 0.01, 0.02, ..., 0.10.
tune_svm <- tune(
  METHOD = svm,
  train.x = y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  ranges = list(cost = seq(from = 0.01, to = 0.1, by = 0.01))
)
print(tune_svm)
```

```{r}
# Final model: linear kernel with cost 0.04, the value the cost grid search
# reported as best when this report was rendered.
model.svm5 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  cost = 0.04
)
model.svm5
```

```{r}
# Final model scored on the test partition, with "1" as the positive class.
pred.svm5 <- predict(model.svm5, data_sinta_SVM_test)
tabel5 <- caret::confusionMatrix(
  pred.svm5,
  data_sinta_SVM_test$y,
  positive = "1"
)
tabel5

# Same model scored on the original (pre-SMOTE) training rows, so training and
# testing metrics can be compared.
pred.svm6 <- predict(model.svm5, data_sinta_SVM_train)
tabel6 <- caret::confusionMatrix(
  pred.svm6,
  data_sinta_SVM_train$y,
  positive = "1"
)
tabel6
```

```{r}
# Training vs. testing performance of the tuned linear SVM (model.svm5).
# Metrics are looked up by name, and both row labels now say "Linear": the
# second row was previously labelled "SVM Kernel Testing" although it is the
# same linear model evaluated on the test partition.
hasil_eval <- rbind(
  c(tabel6$overall["Accuracy"], tabel6$byClass["Balanced Accuracy"]),
  c(tabel5$overall["Accuracy"], tabel5$byClass["Balanced Accuracy"])
)
rownames(hasil_eval) <- c("SVM Linear Training", "SVM Linear Testing")
hasil_eval <- as.data.frame(hasil_eval)
dplyr::arrange(.data = hasil_eval, desc(Accuracy))

```

# Evaluasi Model dengan Pengulangan

```{r}
# Repeat the split -> SMOTE -> linear SVM pipeline `perulangan` times and look
# at the spread of the test accuracy.
perulangan <- 100
# Allocated by the original script; nothing below fills it.
SVM_list <- vector(mode = "list", length = perulangan)

# One accuracy per repetition, preallocated instead of growing a data frame
# with rbind() (the old accumulator also declared unused nb/pc columns).
akurasi_ulangan <- numeric(perulangan)

# Seed the whole experiment so the boxplot is reproducible.
set.seed(414)

# NOTE(review): every iteration overwrites the split, the SMOTE set,
# model.svm4, pred.svm4 and tabel4 created by earlier chunks, so code that
# runs after this loop sees the objects of the last repetition.
for (i in seq_len(perulangan)) {
  in.train <- createDataPartition(data_sinta_SVM$y, p = 0.75, list = FALSE)
  data_sinta_SVM_train <- data_sinta_SVM[in.train, ]
  data_sinta_SVM_test <- data_sinta_SVM[-in.train, ]
  # Oversampling ratio computed from the response column only, not from every
  # cell of the data frame.
  data_sinta_SVM_train_smote <- SmoteClassif(
    form = y ~ .,
    dat = data_sinta_SVM_train,
    C.perc = list(
      "0" = 1,
      "1" = sum(data_sinta_SVM_train$y == "0") / sum(data_sinta_SVM_train$y == "1")
    ),
    dist = "HVDM"
  )

  # Support vector machine with the tuned settings
  model.svm4 <- svm(
    y ~ .,
    data = data_sinta_SVM_train_smote,
    kernel = "linear",
    cost = 0.04
  )
  pred.svm4 <- predict(model.svm4, data_sinta_SVM_test)
  tabel4 <- caret::confusionMatrix(
    pred.svm4,
    data_sinta_SVM_test$y,
    positive = "1"
  )
  akurasi_svm <- tabel4$overall[["Accuracy"]]

  # Store the accuracy of this repetition.
  akurasi_ulangan[i] <- akurasi_svm
}

df_akurasi <- data.frame(SVM = akurasi_ulangan)
df_akurasi %>%
  as_tibble() %>%
  mutate(ulangan = seq_len(perulangan)) %>%
  pivot_longer(-ulangan) %>%
  ggplot(aes(name, value)) +
  geom_boxplot() +
  xlab("Metode") +
  ylab("Akurasi")
  
```

```{r warning=FALSE}
library("e1071")
library("DALEX")
library("ggplot2")

# Refit the tuned linear SVM on a logical response (y == "1") with probability
# estimates enabled, for use by the DALEX explainer.
# NOTE(review): this overwrites the `model.svm1` fitted in the linear-kernel
# section.
set.seed(41)
model.svm1 <- svm(
  y == "1" ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  cost = 0.04,
  type = "C-classification",
  probability = TRUE
)

# The response is dropped from the explainer data by name instead of by the
# hard-coded position 7, so a change in column order cannot leak `y` into the
# predictors.
set.seed(41)
explainer_svm <- DALEX::explain(
  model = model.svm1,
  data = data_sinta_SVM_train_smote[
    , names(data_sinta_SVM_train_smote) != "y", drop = FALSE
  ],
  y = data_sinta_SVM_train_smote$y == "1",
  label = "Support Vector Machine"
)

# Permutation-based variable importance with B = 1000 permutations.
set.seed(41)
vip_svm <- model_parts(explainer = explainer_svm, B = 1000)
vip_svm
plot(vip_svm) +
  ggtitle("Mean variable-importance over 1000 permutations", "")
```
