Library
Beberapa library yang digunakan pada praktikum kali ini:
library(caret)
library(tidyverse)
library(knitr)
library(ggplot2)
library(tidyr)
library(e1071)
library(ROCR)
library(rpart)
library(UBL)
library(googlesheets4)
Dataset
# Work in de-authorized mode: the sheet is read through its public share link,
# so no Google account token is requested.
gs4_deauth()

# Download the sheet into a tibble.
data_sinta_SVM <- read_sheet(
  ss = "https://docs.google.com/spreadsheets/d/1DW_JRJop1LPqmxNue7tRawadTgf914KFFe4eXansaW4/edit?usp=sharing"
)
## ✔ Reading from "Data Sinta ITS".
## ✔ Range 'data_sinta_417'.
# Drop the tibble class first, then turn the three categorical predictors and
# the target y into factors in a single assignment.
data_sinta_SVM <- as.data.frame(data_sinta_SVM)
data_sinta_SVM[c("Rumpun_Ilmu", "Jenjang", "Akreditasi", "y")] <- lapply(
  data_sinta_SVM[c("Rumpun_Ilmu", "Jenjang", "Akreditasi", "y")],
  as.factor
)

# Show the resulting column types.
str(data_sinta_SVM)
## 'data.frame': 981 obs. of 7 variables:
## $ Rumpun_Ilmu : Factor w/ 7 levels "Ekonomi","Kesehatan",..: 1 1 1 1 1 1 1 1 1 1 ...
## $ Jenjang : Factor w/ 4 levels "D4","S1","S2",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Akreditasi : Factor w/ 3 levels "Baik","Baik Sekali",..: 2 2 2 2 2 2 2 2 2 2 ...
## $ Jumlah_Dosen_Total : num 14 14 14 14 14 14 14 14 14 14 ...
## $ Jumlah_Mahasiswa : num 488 488 488 488 488 488 488 488 488 488 ...
## $ Rasio_Dosen_per_Mahasiswa: num 0.104 0.104 0.104 0.104 0.104 ...
## $ y : Factor w/ 2 levels "0","1": 2 2 1 1 1 1 1 1 1 1 ...
# Seeded 75/25 hold-out split, partitioned on the target y.
set.seed(414)
in.train <- createDataPartition(
  y = as.factor(data_sinta_SVM$y),
  p = 0.75,
  list = FALSE
)

# Rows selected by the partition are used for modelling ...
data_sinta_SVM_train <- data_sinta_SVM[in.train, ]
# ... and the remaining rows are held out for model evaluation.
data_sinta_SVM_test <- data_sinta_SVM[-in.train, ]
# Class counts of y in the training and testing partitions.
cat("Frekuensi Data Training/Testing")
## Frekuensi Data Training/Testing
round(table(data_sinta_SVM_train[["y"]]), digits = 4)
##
## 0 1
## 560 177
round(table(data_sinta_SVM_test[["y"]]), digits = 4)
##
## 0 1
## 186 58
# Class proportions of y in the same two partitions.
cat("\nProporsi Data Training/Testing")
##
## Proporsi Data Training/Testing
round(prop.table(table(data_sinta_SVM_train[["y"]])), digits = 4)
##
## 0 1
## 0.7598 0.2402
round(prop.table(table(data_sinta_SVM_test[["y"]])), digits = 4)
##
## 0 1
## 0.7623 0.2377
SMOTE
Pada kasus ini digunakan SMOTE sebagai perlakuan untuk menangani ketidakseimbangan kelas pada data training.
# Oversample the minority class "1" of the training partition with SMOTE so
# that it reaches the size of the majority class "0".
#
# The oversampling ratio is computed from the target column only. Comparing
# the whole data frame against 0/1 would also count cells of the numeric
# predictors that happen to equal 0 or 1 and distort the ratio.
set.seed(414)
data_sinta_SVM_train_smote <- SmoteClassif(
  form = y ~ .,
  dat = data_sinta_SVM_train,
  C.perc = list(
    "0" = 1,  # keep the majority class as it is
    "1" = sum(data_sinta_SVM_train$y == "0") / sum(data_sinta_SVM_train$y == "1")
  ),
  dist = "HVDM"
)
# Class counts after SMOTE.
round(table(data_sinta_SVM_train_smote$y), digits = 4)
##
## 0 1
## 560 560
# Class proportions after SMOTE.
round(prop.table(table(data_sinta_SVM_train_smote$y)), digits = 4)
##
## 0 1
## 0.5 0.5
Barplot Sebelum vs Sesudah SMOTE
Sebelum
# Class frequencies of the original (pre-SMOTE) training partition.
df1 <- as.data.frame(table(data_sinta_SVM_train$y))

# One bar per class; barplot() returns the bar midpoints used to place labels.
my_bar1 <- barplot(
  height = df1$Freq,
  names.arg = df1$Var1,
  border = FALSE,
  col = c("coral", "cadetblue1"),
  las = 2,
  ylim = c(0, 600),
  main = "Perbandingan Frekuensi Klasifikasi Sinta Score"
)

# Write each count 25 units above the top of its bar.
text(x = my_bar1, y = df1$Freq + 25, labels = df1$Freq)
Sesudah
# Class frequencies of the SMOTE-balanced training set.
df2 <- as.data.frame(table(data_sinta_SVM_train_smote$y))

# One bar per class; barplot() returns the bar midpoints used to place labels.
my_bar2 <- barplot(
  height = df2$Freq,
  names.arg = df2$Var1,
  border = FALSE,
  col = c("coral", "cadetblue1"),
  las = 2,
  ylim = c(0, 600),
  main = "Perbandingan Frekuensi Klasifikasi Sinta Score"
)

# Write each count 25 units above the top of its bar.
text(x = my_bar2, y = df2$Freq + 25, labels = df2$Freq)
Function Performa Model
# Summarise classifier performance against the `y` column of `data`.
#
# pred: predicted class labels, compared with data$y.
# data: data frame holding the reference labels in column `y`.
#
# Returns a named vector: the first element of the confusion matrix's
# `overall` statistics followed by elements 1, 2 and 11 of `byClass`,
# with class "1" treated as the positive class.
perform <- function(pred, data) {
  cm <- caret::confusionMatrix(data = pred, reference = data$y, positive = "1")
  c(cm$overall[1], cm$byClass[c(1, 2, 11)])
}
Model SVM Kernel Linear
# Linear-kernel SVM fitted on the SMOTE-balanced training set.
model.svm1 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  scale = TRUE
)
# model.svm1

# Predict the held-out test rows and tabulate them against the true labels.
pred.svm1 <- predict(model.svm1, newdata = data_sinta_SVM_test)
tabel1 <- caret::confusionMatrix(
  data = pred.svm1,
  reference = as.factor(data_sinta_SVM_test$y),
  positive = "1"
)
# tabel1
Model SVM Kernel Sigmoid
# Sigmoid-kernel SVM fitted on the SMOTE-balanced training set.
model.svm2 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "sigmoid",
  scale = TRUE
)
# model.svm2

# Predict the held-out test rows and tabulate them against the true labels.
pred.svm2 <- predict(model.svm2, newdata = data_sinta_SVM_test)
tabel2 <- caret::confusionMatrix(
  data = pred.svm2,
  reference = as.factor(data_sinta_SVM_test$y),
  positive = "1"
)
# tabel2
Model SVM Kernel Radial
# Radial-kernel SVM fitted on the SMOTE-balanced training set.
model.svm3 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "radial",
  scale = TRUE
)
# model.svm3

# Predict the held-out test rows and tabulate them against the true labels.
pred.svm3 <- predict(model.svm3, newdata = data_sinta_SVM_test)
tabel3 <- caret::confusionMatrix(
  data = pred.svm3,
  reference = as.factor(data_sinta_SVM_test$y),
  positive = "1"
)
# tabel3
Model SVM Kernel Polynomial
# Polynomial-kernel SVM fitted on the SMOTE-balanced training set.
model.svm4 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "polynomial",
  scale = TRUE
)
# model.svm4

# Predict the held-out test rows and tabulate them against the true labels.
pred.svm4 <- predict(model.svm4, newdata = data_sinta_SVM_test)
tabel4 <- caret::confusionMatrix(
  data = pred.svm4,
  reference = as.factor(data_sinta_SVM_test$y),
  positive = "1"
)
# tabel4
Perbandingan Model
# Stack the first `overall` statistic and the first two `byClass` statistics
# of each kernel's confusion matrix into one row per model.
hasil_eval <- do.call(
  rbind,
  lapply(
    list(tabel1, tabel2, tabel3, tabel4),
    function(cm) c(cm$overall[1], cm$byClass[1], cm$byClass[2])
  )
)
rownames(hasil_eval) <- c(
  "SVM Kernel Linear",
  "SVM Kernel Sigmoid",
  "SVM Kernel Radial",
  "SVM Kernel Polynomial"
)
hasil_eval <- as.data.frame(hasil_eval)

# Show the models ordered from highest to lowest accuracy.
dplyr::arrange(.data = hasil_eval, desc(Accuracy))
Tuning Hyperparameter SVM
# Grid search over the four kernel types on the SMOTE-balanced training set,
# then print the model refitted with the best-scoring kernel.
tuningsvm <- tune(
  METHOD = svm,
  train.x = y ~ .,
  data = data_sinta_SVM_train_smote,
  ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid"))
)
tuningsvm$best.model
##
## Call:
## best.tune(METHOD = svm, train.x = y ~ ., data = data_sinta_SVM_train_smote,
## ranges = list(kernel = c("radial", "linear", "polynomial", "sigmoid")))
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 738
# Grid search for the linear kernel's cost over 0.01, 0.02, ..., 0.10,
# then print the tuning summary (best cost and its error estimate).
tune_svm <- tune(
  METHOD = svm,
  train.x = y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  ranges = list(cost = seq(from = 0.01, to = 0.1, by = 0.01))
)
print(tune_svm)
##
## Parameter tuning of 'svm':
##
## - sampling method: 10-fold cross validation
##
## - best parameters:
## cost
## 0.04
##
## - best performance: 0.3116071
# Refit the linear SVM with cost fixed at 0.04, the value reported as best by
# the cost grid search, and print the fitted model.
model.svm5 <- svm(
  y ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  cost = 0.04
)
model.svm5
##
## Call:
## svm(formula = y ~ ., data = data_sinta_SVM_train_smote, kernel = "linear",
## cost = 0.04)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 0.04
##
## Number of Support Vectors: 837
# Predictions of the tuned model on the test partition (pred.svm5) and on the
# original, non-SMOTE training partition (pred.svm6).
pred.svm5 <- predict(model.svm5, newdata = data_sinta_SVM_test)
pred.svm6 <- predict(model.svm5, newdata = data_sinta_SVM_train)

# Confusion matrix on the test partition, with class "1" as positive.
tabel5 <- caret::confusionMatrix(
  data = pred.svm5,
  reference = as.factor(data_sinta_SVM_test$y),
  positive = "1"
)
tabel5
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 160 28
## 1 26 30
##
## Accuracy : 0.7787
## 95% CI : (0.7213, 0.8292)
## No Information Rate : 0.7623
## P-Value [Acc > NIR] : 0.3027
##
## Kappa : 0.382
##
## Mcnemar's Test P-Value : 0.8918
##
## Sensitivity : 0.5172
## Specificity : 0.8602
## Pos Pred Value : 0.5357
## Neg Pred Value : 0.8511
## Prevalence : 0.2377
## Detection Rate : 0.1230
## Detection Prevalence : 0.2295
## Balanced Accuracy : 0.6887
##
## 'Positive' Class : 1
##
# Confusion matrix on the original (non-SMOTE) training partition, with class
# "1" as positive; shown next to the test result to compare the two fits.
tabel6 <- caret::confusionMatrix(
  data = pred.svm6,
  reference = as.factor(data_sinta_SVM_train$y),
  positive = "1"
)
tabel6
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 469 82
## 1 91 95
##
## Accuracy : 0.7653
## 95% CI : (0.733, 0.7954)
## No Information Rate : 0.7598
## P-Value [Acc > NIR] : 0.384
##
## Kappa : 0.3678
##
## Mcnemar's Test P-Value : 0.543
##
## Sensitivity : 0.5367
## Specificity : 0.8375
## Pos Pred Value : 0.5108
## Neg Pred Value : 0.8512
## Prevalence : 0.2402
## Detection Rate : 0.1289
## Detection Prevalence : 0.2524
## Balanced Accuracy : 0.6871
##
## 'Positive' Class : 1
##
# Put accuracy and balanced accuracy of the tuned model on the training
# partition (tabel6) and on the test partition (tabel5) side by side.
# This replaces the earlier kernel-comparison table held in hasil_eval.
hasil_eval <- do.call(
  rbind,
  lapply(
    list(tabel6, tabel5),
    function(cm) c(cm$overall[1], cm$byClass["Balanced Accuracy"])
  )
)
rownames(hasil_eval) <- c("SVM Linear Training", "SVM Kernel Testing")
hasil_eval <- as.data.frame(hasil_eval)

# Show the rows ordered from highest to lowest accuracy.
dplyr::arrange(.data = hasil_eval, desc(Accuracy))
Evaluasi Model dengan Pengulangan
# Repeated hold-out evaluation: draw `perulangan` random 75/25 splits, balance
# each training partition with SMOTE, fit the tuned linear SVM (cost = 0.04)
# and record its accuracy on the corresponding test partition.
#
# Result: `df_akurasi`, a data frame with one row per repetition and a single
# column `SVM` holding the test accuracy.
#
# NOTE(review): the loop reuses the global names in.train,
# data_sinta_SVM_train, data_sinta_SVM_test, data_sinta_SVM_train_smote,
# model.svm4, pred.svm4 and tabel4. After it finishes they hold the objects of
# the LAST repetition, not those of the seeded split made earlier in the file.
perulangan <- 100
SVM_list <- vector(mode = "list", length = perulangan)

# Preallocate one accuracy slot per repetition instead of growing a data frame
# with rbind() on every pass.
akurasi_ulangan <- numeric(perulangan)

for (i in seq_len(perulangan)) {
  in.train <- createDataPartition(data_sinta_SVM$y, p = 0.75, list = FALSE)
  data_sinta_SVM_train <- data_sinta_SVM[in.train, ]
  data_sinta_SVM_test <- data_sinta_SVM[-in.train, ]

  # The oversampling ratio is taken from the target column only; comparing the
  # whole data frame with 0/1 would also count matching predictor cells.
  data_sinta_SVM_train_smote <- SmoteClassif(
    form = y ~ .,
    dat = data_sinta_SVM_train,
    C.perc = list(
      "0" = 1,
      "1" = sum(data_sinta_SVM_train$y == "0") / sum(data_sinta_SVM_train$y == "1")
    ),
    dist = "HVDM"
  )

  # Support Vector Machine model
  model.svm4 <- svm(
    y ~ .,
    data = data_sinta_SVM_train_smote,
    kernel = "linear",
    cost = 0.04
  )
  pred.svm4 <- predict(model.svm4, newdata = data_sinta_SVM_test)
  tabel4 <- caret::confusionMatrix(
    data = pred.svm4,
    reference = as.factor(data_sinta_SVM_test$y),
    positive = "1"
  )

  # Store this repetition's test accuracy.
  akurasi_svm <- tabel4$overall[["Accuracy"]]
  akurasi_ulangan[i] <- akurasi_svm

  # A bare paste0() inside a for loop is evaluated and discarded; cat() is
  # needed for the progress line to actually be printed.
  cat(paste0("ulangan ", i, " selesai.\n"))
}

df_akurasi <- data.frame(SVM = akurasi_ulangan)
# Label the accuracy column with the method name; it becomes the x-axis
# category after reshaping.
colnames(df_akurasi) <- "SVM"

# Reshape to long format (one row per repetition and method) and draw the
# distribution of accuracies as a boxplot.
df_akurasi %>%
  as_tibble() %>%
  mutate(ulangan = seq_len(perulangan)) %>%
  pivot_longer(cols = -ulangan) %>%
  ggplot(mapping = aes(x = name, y = value)) +
  geom_boxplot() +
  xlab("Metode") +
  ylab("Akurasi")
library("e1071")
library("DALEX")
## Welcome to DALEX (version: 2.4.3).
## Find examples and detailed introduction at: http://ema.drwhy.ai/
##
## Attaching package: 'DALEX'
## The following object is masked from 'package:dplyr':
##
## explain
library("ggplot2")
# Refit the tuned linear SVM (cost = 0.04) with probability estimates enabled,
# using the logical response y == "1". This replaces the model.svm1 object
# fitted earlier in the file.
set.seed(41)
model.svm1 <- svm(
  y == "1" ~ .,
  data = data_sinta_SVM_train_smote,
  kernel = "linear",
  cost = 0.04,
  type = "C-classification",
  probability = TRUE
)

# Wrap the model in a DALEX explainer. The predictors are selected by dropping
# the response column by NAME rather than by position, so a change in column
# order cannot silently remove a predictor or leave y in the data.
set.seed(41)
explainer_svm <- DALEX::explain(
  model = model.svm1,
  data = data_sinta_SVM_train_smote[
    , setdiff(names(data_sinta_SVM_train_smote), "y"), drop = FALSE
  ],
  y = data_sinta_SVM_train_smote$y == "1",
  label = "Support Vector Machine"
)
## Preparation of a new explainer is initiated
## -> model label : Support Vector Machine
## -> data : 1120 rows 6 cols
## -> target variable : 1120 values
## -> predict function : yhat.svm will be used ( default )
## -> predicted values : No value for predict function target column. ( default )
## -> model_info : package e1071 , ver. 1.7.13 , task classification ( default )
## -> model_info : Model info detected classification task but 'y' is a logical . Converted to numeric. ( NOTE )
## -> predicted values : numerical, min = 0.2509177 , mean = 0.4986054 , max = 0.8645471
## -> residual function : difference between y and yhat ( default )
## -> residuals : numerical, min = -0.8428362 , mean = 0.00139461 , max = 0.7490823
## A new explainer has been created!
# Permutation-based variable importance from the explainer, using B = 1000
# permutations; the seed fixes the random permutations.
set.seed(41)
vip_svm <- DALEX::model_parts(explainer = explainer_svm, B = 1000)
vip_svm

# Plot the importances with a custom title and an empty subtitle.
plot(vip_svm) +
  ggtitle(
    label = "Mean variable-importance over 1000 permutations",
    subtitle = ""
  )