library(readxl)
## Warning: package 'readxl' was built under R version 4.1.2
library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.1.2
## Warning: package 'tibble' was built under R version 4.1.2
## Warning: package 'tidyr' was built under R version 4.1.2
## Warning: package 'readr' was built under R version 4.1.2
## Warning: package 'purrr' was built under R version 4.1.2
## Warning: package 'dplyr' was built under R version 4.1.2
## Warning: package 'forcats' was built under R version 4.1.2
## Warning: package 'lubridate' was built under R version 4.1.2
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.4.4 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(ggplot2)
library(e1071)
## Warning: package 'e1071' was built under R version 4.1.2
library(dplyr)
library(skimr)
## Warning: package 'skimr' was built under R version 4.1.2
library(DataExplorer)
library(rpart.plot)
## Warning: package 'rpart.plot' was built under R version 4.1.2
## Loading required package: rpart
## Warning: package 'rpart' was built under R version 4.1.2
library(gridExtra)
##
## Attaching package: 'gridExtra'
##
## The following object is masked from 'package:dplyr':
##
## combine
library(ROSE)
## Loaded ROSE 0.0-4
library(caTools)
library(recipes)
##
## Attaching package: 'recipes'
##
## The following object is masked from 'package:stringr':
##
## fixed
##
## The following object is masked from 'package:stats':
##
## step
library(caret)
## Warning: package 'caret' was built under R version 4.1.2
## Loading required package: lattice
## Warning: package 'lattice' was built under R version 4.1.2
##
## Attaching package: 'caret'
##
## The following object is masked from 'package:purrr':
##
## lift
setwd("~/Desktop")
Data_Lama <- read.csv("tugas-sainsdata-23.csv", header = TRUE)
Data<- Data_Lama[,-1]
head(Data)
## cabang jenis.kelamin usia pendidikan frekuensi.fashion nilai.fashion
## 1 11 2 38 2 5 1.1588
## 2 2 2 33 2 4 0.4964
## 3 7 1 41 3 5 0.6008
## 4 2 1 43 3 6 0.3612
## 5 3 2 37 4 2 0.6572
## 6 4 2 39 3 3 0.3894
## frekuensi.footwear nilai.footwear frekuensi.lainnya nilai.lainnya
## 1 1 0.4560 4 0.9990
## 2 1 0.9162 4 0.2822
## 3 1 0.0384 4 0.6252
## 4 3 0.5626 6 0.4118
## 5 3 0.1014 1 0.6478
## 6 2 0.2674 3 0.5866
## total.nilai.tunai lama.member promo
## 1 0.00 18 0
## 2 0.59 35 1
## 3 3.05 39 0
## 4 0.00 9 1
## 5 1.06 51 0
## 6 0.26 19 0
summary(Data)
## cabang jenis.kelamin usia pendidikan
## Min. : 1.000 Min. :1.000 Min. :26.00 Min. :1.000
## 1st Qu.: 4.000 1st Qu.:1.000 1st Qu.:37.00 1st Qu.:3.000
## Median : 8.000 Median :2.000 Median :40.00 Median :3.000
## Mean : 7.609 Mean :1.575 Mean :40.13 Mean :3.067
## 3rd Qu.:11.000 3rd Qu.:2.000 3rd Qu.:44.00 3rd Qu.:4.000
## Max. :14.000 Max. :2.000 Max. :56.00 Max. :4.000
## frekuensi.fashion nilai.fashion frekuensi.footwear nilai.footwear
## Min. :0.000 Min. :0.0296 Min. :0.000 Min. :0.0250
## 1st Qu.:2.000 1st Qu.:0.3952 1st Qu.:2.000 1st Qu.:0.4107
## Median :3.000 Median :0.6768 Median :3.000 Median :0.6904
## Mean :3.061 Mean :0.7852 Mean :3.073 Mean :0.8451
## 3rd Qu.:4.000 3rd Qu.:1.0470 3rd Qu.:4.000 3rd Qu.:1.1055
## Max. :8.000 Max. :2.6276 Max. :8.000 Max. :3.5494
## frekuensi.lainnya nilai.lainnya total.nilai.tunai lama.member
## Min. :0.000 Min. :0.0214 Min. : 0.000 Min. : 1.00
## 1st Qu.:2.000 1st Qu.:0.4323 1st Qu.: 0.000 1st Qu.:14.00
## Median :3.000 Median :0.7311 Median : 0.675 Median :25.00
## Mean :2.782 Mean :0.8506 Mean : 2.240 Mean :25.85
## 3rd Qu.:4.000 3rd Qu.:1.1118 3rd Qu.: 3.045 3rd Qu.:38.00
## Max. :7.000 Max. :3.0334 Max. :23.020 Max. :51.00
## promo
## Min. :0.0000
## 1st Qu.:0.0000
## Median :0.0000
## Mean :0.3324
## 3rd Qu.:1.0000
## Max. :1.0000
str(Data)
## 'data.frame': 358 obs. of 13 variables:
## $ cabang : int 11 2 7 2 3 4 11 9 12 13 ...
## $ jenis.kelamin : int 2 2 1 1 2 2 2 1 1 1 ...
## $ usia : int 38 33 41 43 37 39 41 42 45 52 ...
## $ pendidikan : int 2 2 3 3 4 3 3 3 3 3 ...
## $ frekuensi.fashion : int 5 4 5 6 2 3 4 5 4 2 ...
## $ nilai.fashion : num 1.159 0.496 0.601 0.361 0.657 ...
## $ frekuensi.footwear: int 1 1 1 3 3 2 6 3 3 1 ...
## $ nilai.footwear : num 0.456 0.9162 0.0384 0.5626 0.1014 ...
## $ frekuensi.lainnya : int 4 4 4 6 1 3 4 4 2 2 ...
## $ nilai.lainnya : num 0.999 0.282 0.625 0.412 0.648 ...
## $ total.nilai.tunai : num 0 0.59 3.05 0 1.06 0.26 2.85 0 0.3 6.31 ...
## $ lama.member : int 18 35 39 9 51 19 21 27 5 37 ...
## $ promo : int 0 1 0 1 0 0 1 0 0 0 ...
plot_intro(Data,ggtheme = theme_classic())
Data %>%
count(promo) %>%
mutate(percent = n * 100 / sum(n), label = str_c(round(percent, 2), "%")) %>%
ggplot(aes(x = "", y = n, fill = as.factor(promo))) +
geom_col() +
geom_text(aes(label = label), position = position_stack(vjust = 0.5)) +
coord_polar(theta = "y") +
theme_void()
skim_without_charts(Data)
| Name | Data |
| Number of rows | 358 |
| Number of columns | 13 |
| _______________________ | |
| Column type frequency: | |
| numeric | 13 |
| ________________________ | |
| Group variables | None |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
|---|---|---|---|---|---|---|---|---|---|
| cabang | 0 | 1 | 7.61 | 3.96 | 1.00 | 4.00 | 8.00 | 11.00 | 14.00 |
| jenis.kelamin | 0 | 1 | 1.58 | 0.49 | 1.00 | 1.00 | 2.00 | 2.00 | 2.00 |
| usia | 0 | 1 | 40.13 | 5.07 | 26.00 | 37.00 | 40.00 | 44.00 | 56.00 |
| pendidikan | 0 | 1 | 3.07 | 0.81 | 1.00 | 3.00 | 3.00 | 4.00 | 4.00 |
| frekuensi.fashion | 0 | 1 | 3.06 | 1.52 | 0.00 | 2.00 | 3.00 | 4.00 | 8.00 |
| nilai.fashion | 0 | 1 | 0.79 | 0.52 | 0.03 | 0.40 | 0.68 | 1.05 | 2.63 |
| frekuensi.footwear | 0 | 1 | 3.07 | 1.63 | 0.00 | 2.00 | 3.00 | 4.00 | 8.00 |
| nilai.footwear | 0 | 1 | 0.85 | 0.61 | 0.03 | 0.41 | 0.69 | 1.11 | 3.55 |
| frekuensi.lainnya | 0 | 1 | 2.78 | 1.49 | 0.00 | 2.00 | 3.00 | 4.00 | 7.00 |
| nilai.lainnya | 0 | 1 | 0.85 | 0.58 | 0.02 | 0.43 | 0.73 | 1.11 | 3.03 |
| total.nilai.tunai | 0 | 1 | 2.24 | 3.43 | 0.00 | 0.00 | 0.68 | 3.04 | 23.02 |
| lama.member | 0 | 1 | 25.85 | 14.41 | 1.00 | 14.00 | 25.00 | 38.00 | 51.00 |
| promo | 0 | 1 | 0.33 | 0.47 | 0.00 | 0.00 | 0.00 | 1.00 | 1.00 |
# Plot hubungan antara Jenis Kelamin dan Promo Response
plot1 <- ggplot(Data, aes(x = factor(jenis.kelamin), fill = factor(promo))) +
geom_bar(position = "dodge", stat = "count") +
labs(x = "Jenis Kelamin", fill = "Promo Response") +
theme_minimal() +
theme(legend.position="bottom")
# Plot hubungan antara Pendidikan dan Promo Response
plot2 <- ggplot(Data, aes(x = factor(pendidikan), fill = factor(promo))) +
geom_bar(position = "dodge", stat = "count") +
labs(x = "Pendidikan", fill = "Promo Response") +
theme_minimal() +
theme(legend.position="bottom")
# Plot hubungan antara Cabang dan Promo Response
plot3 <- ggplot(Data, aes(x = factor(frekuensi.fashion), fill = factor(promo))) +
geom_bar(position = "dodge", stat = "count") +
labs(x = "Cabang", fill = "Promo Response") +
theme_minimal() +
theme(legend.position="bottom")
grid.arrange(plot1, plot2, plot3, ncol = 2)
plot_boxplot(data = Data, by = "promo",
ggtheme = theme_classic(),
geom_boxplot_args = list(fill = "Blue"))
Tahap praproses data terdiri dari:
Data cleaning (pembersihan data): adalah proses identifikasi,
koreksi, dan penghapusan masalah atau ketidaksempurnaan dalam data agar
dapat diolah dan dianalisis dengan akurat dengan menangani Missing
Value, outlier, duplikasi data dan kesalahan input data.
Feature Selection (Pemilihan Fitur): adalah proses pemilihan
sebagian fitur yang paling relevan atau penting untuk digunakan dalam
membangun model.
Feature Engineering (Penciptaan Fitur): adalah proses
transformasi data mentah menjadi suatu fitur yang lebih baik dalam
merepresentasikan pola yang terkandung di dalam data, sehingga dapat
meningkatkan performa model.
# Mendeteksi outlier untuk setiap variabel numerik
# Menghitung IQR untuk setiap variabel numerik
iqr_values <- apply(Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")], 2, IQR)
# Menghitung batas atas dan batas bawah untuk mendeteksi outlier
lower_bound <- apply(Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")], 2, function(x) quantile(x, 0.25) - 1.5 * IQR(x))
upper_bound <- apply(Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")], 2, function(x) quantile(x, 0.75) + 1.5 * IQR(x))
# Mendeteksi outlier
outliers <- sapply(1:ncol(Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")]),
function(i) Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")][which(Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")][, i] < lower_bound[i] | Data[, c("usia", "frekuensi.fashion", "nilai.fashion",
"frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya",
"nilai.lainnya", "total.nilai.tunai", "lama.member")][, i] > upper_bound[i]), i])
# Menampilkan outlier
print(outliers)
## [[1]]
## [1] 56 26
##
## [[2]]
## [1] 8
##
## [[3]]
## [1] 2.2226 2.6276 2.0776 2.2716 2.2006 2.3006 2.1268 2.4374 2.3964 2.5078
## [11] 2.1186 2.3752 2.2716
##
## [[4]]
## [1] 8 8
##
## [[5]]
## [1] 2.4224 2.8932 3.5494 2.4298 2.2678 2.2986 2.5292 2.2276 2.4346 3.0260
## [11] 2.7520 2.1770 2.5416 2.6252 2.7402 2.8930 2.1974
##
## [[6]]
## integer(0)
##
## [[7]]
## [1] 2.8132 2.2904 2.5626 2.4766 2.5246 2.5776 3.0334 2.2050 2.5248 2.6500
## [11] 2.7596 2.1968 2.2588 2.9106 3.0118 2.5678
##
## [[8]]
## [1] 9.88 9.48 23.02 8.80 8.44 8.80 13.32 8.48 9.73 8.43 21.26 8.32
## [13] 15.53 22.95 9.92 9.80 9.42 9.53 8.69 10.32 9.78 8.92 11.40 8.61
## [25] 12.26 8.12 7.87
##
## [[9]]
## integer(0)
# Menggantikan outlier dengan nilai batas atas atau batas bawah
# Gantilah dengan batas atas atau batas bawah sesuai kebutuhan
replace_outliers <- function(x, lower_bound, upper_bound) {
x[x < lower_bound] <- lower_bound
x[x > upper_bound] <- upper_bound
return(x)
}
# Gantilah outlier untuk setiap variabel numerik
Data_clean <- Data
Data_clean$usia <- replace_outliers(Data_clean$usia, lower_bound[1], upper_bound[1])
Data_clean$frekuensi.fashion <- replace_outliers(Data_clean$frekuensi.fashion, lower_bound[2], upper_bound[2])
Data_clean$nilai.fashion <- replace_outliers(Data_clean$nilai.fashion, lower_bound[3], upper_bound[3])
Data_clean$frekuensi.footwear <- replace_outliers(Data_clean$frekuensi.footwear, lower_bound[4], upper_bound[4])
Data_clean$nilai.footwear <- replace_outliers(Data_clean$nilai.footwear, lower_bound[5], upper_bound[5])
Data_clean$frekuensi.lainnya <- replace_outliers(Data_clean$frekuensi.lainnya, lower_bound[6], upper_bound[6])
Data_clean$nilai.lainnya <- replace_outliers(Data_clean$nilai.lainnya, lower_bound[7], upper_bound[7])
Data_clean$total.nilai.tunai <- replace_outliers(Data_clean$total.nilai.tunai, lower_bound[8], upper_bound[8])
Data_clean$lama.member <- replace_outliers(Data_clean$lama.member, lower_bound[9], upper_bound[9])
# Tampilkan struktur data setelah mengatasi outlier
str(Data_clean)
## 'data.frame': 358 obs. of 13 variables:
## $ cabang : int 11 2 7 2 3 4 11 9 12 13 ...
## $ jenis.kelamin : int 2 2 1 1 2 2 2 1 1 1 ...
## $ usia : num 38 33 41 43 37 39 41 42 45 52 ...
## $ pendidikan : int 2 2 3 3 4 3 3 3 3 3 ...
## $ frekuensi.fashion : num 5 4 5 6 2 3 4 5 4 2 ...
## $ nilai.fashion : num 1.159 0.496 0.601 0.361 0.657 ...
## $ frekuensi.footwear: num 1 1 1 3 3 2 6 3 3 1 ...
## $ nilai.footwear : num 0.456 0.9162 0.0384 0.5626 0.1014 ...
## $ frekuensi.lainnya : num 4 4 4 6 1 3 4 4 2 2 ...
## $ nilai.lainnya : num 0.999 0.282 0.625 0.412 0.648 ...
## $ total.nilai.tunai : num 0 0.59 3.05 0 1.06 0.26 2.85 0 0.3 6.31 ...
## $ lama.member : num 18 35 39 9 51 19 21 27 5 37 ...
## $ promo : int 0 1 0 1 0 0 1 0 0 0 ...
# Pilih kolom-kolom numerik yang akan distandarisasi
numeric_columns <- sapply(Data_clean, is.numeric)
selected_data <- Data_clean[, numeric_columns]
# Standarisasi data
scaled_data <- scale(selected_data)
# Gantikan kolom numerik pada data awal dengan data yang sudah distandarisasi
Data_clean[, numeric_columns] <- scaled_data
# Tampilkan beberapa baris data setelah standarisasi
head(Data_clean)
## cabang jenis.kelamin usia pendidikan frekuensi.fashion
## 1 0.8556472 0.8577896 -0.4209199 -1.31129051 1.28268728
## 2 -1.4152715 0.8577896 -1.4109944 -1.31129051 0.62196491
## 3 -0.1536500 -1.1625306 0.1731248 -0.08238474 1.28268728
## 4 -1.4152715 -1.1625306 0.5691545 -0.08238474 1.94340965
## 5 -1.1629472 0.8577896 -0.6189348 1.14652102 -0.69947983
## 6 -0.9106229 0.8577896 -0.2229050 -0.08238474 -0.03875746
## nilai.fashion frekuensi.footwear nilai.footwear frekuensi.lainnya
## 1 0.7810533 -1.28179132 -0.6638244 0.8150925
## 2 -0.5673239 -1.28179132 0.1658464 0.8150925
## 3 -0.3548080 -1.28179132 -1.4166939 0.8150925
## 4 -0.8425362 -0.04157161 -0.4716408 2.1536389
## 5 -0.2400005 -0.04157161 -1.3031145 -1.1927271
## 6 -0.7851325 -0.66168146 -1.0038416 0.1458193
## nilai.lainnya total.nilai.tunai lama.member promo
## 1 0.3206062 -0.7744219 -0.5445594 -0.7046397
## 2 -1.0417954 -0.5428881 0.6352869 1.4152008
## 3 -0.3898650 0.4224899 0.9128978 -0.7046397
## 4 -0.7954683 -0.7744219 -1.1691839 1.4152008
## 5 -0.3469098 -0.3584460 1.7457305 -0.7046397
## 6 -0.4632309 -0.6723901 -0.4751566 -0.7046397
# Memeriksa distribusi kelas
table(Data_clean$promo)
##
## -0.704639727422298 1.41520079709184
## 239 119
# Memeriksa persentase kelas
prop.table(table(Data_clean$promo)) * 100
##
## -0.704639727422298 1.41520079709184
## 66.75978 33.24022
# Visualisasi distribusi kelas
barplot(table(Data_clean$promo), col = c("blue", "red"), main = "Distribusi Kelas")
# Menangani Imbalanced Data dengan Oversampling
oversampled_data <- ROSE(promo ~ ., data = Data_clean, seed = 123, N = nrow(Data) / 2, p = 0.5)$data
# Setelah oversampling, memeriksa kembali distribusi kelas
table(oversampled_data$promo)
##
## -0.704639727422298 1.41520079709184
## 94 85
# Visualisasi distribusi kelas setelah penangan
barplot(table(oversampled_data$promo), col = c("blue", "red"), main = "Distribusi Kelas")
set.seed(123)
Split_tp <- sample.split(Data, SplitRatio = 0.7)
Train_tp <- subset(Data, Split_tp==TRUE)
Test_tp <- subset(Data, Split_tp==FALSE)
Train_tp$promo <- as.factor(Train_tp$promo)
Test_tp$promo <- as.factor(Test_tp$promo)
svm_model_tp <- svm(promo ~ ., data = Train_tp)
summary(svm_model_tp)
##
## Call:
## svm(formula = promo ~ ., data = Train_tp)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 209
##
## ( 121 88 )
##
##
## Number of Classes: 2
##
## Levels:
## 0 1
p_train_tp <- predict(svm_model_tp)
p_train_tp <-confusionMatrix(p_train_tp, Train_tp$promo)
p_train_tp
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 157 44
## 1 3 44
##
## Accuracy : 0.8105
## 95% CI : (0.7561, 0.8573)
## No Information Rate : 0.6452
## P-Value [Acc > NIR] : 8.525e-09
##
## Kappa : 0.5376
##
## Mcnemar's Test P-Value : 5.392e-09
##
## Sensitivity : 0.9812
## Specificity : 0.5000
## Pos Pred Value : 0.7811
## Neg Pred Value : 0.9362
## Prevalence : 0.6452
## Detection Rate : 0.6331
## Detection Prevalence : 0.8105
## Balanced Accuracy : 0.7406
##
## 'Positive' Class : 0
##
set.seed(123)
Split <- sample.split(oversampled_data, SplitRatio = 0.7)
Train <- subset(oversampled_data, Split==TRUE)
Test <- subset(oversampled_data, Split==FALSE)
Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model <- svm(promo ~ ., data = Train)
summary(svm_model)
##
## Call:
## svm(formula = promo ~ ., data = Train)
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: radial
## cost: 1
##
## Number of Support Vectors: 115
##
## ( 59 56 )
##
##
## Number of Classes: 2
##
## Levels:
## -0.704639727422298 1.41520079709184
p_train <- predict(svm_model)
p_train_cm<-confusionMatrix(p_train, Train$promo)
p_train_cm
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.704639727422298 1.41520079709184
## -0.704639727422298 64 11
## 1.41520079709184 2 47
##
## Accuracy : 0.8952
## 95% CI : (0.8274, 0.943)
## No Information Rate : 0.5323
## P-Value [Acc > NIR] : <2e-16
##
## Kappa : 0.7874
##
## Mcnemar's Test P-Value : 0.0265
##
## Sensitivity : 0.9697
## Specificity : 0.8103
## Pos Pred Value : 0.8533
## Neg Pred Value : 0.9592
## Prevalence : 0.5323
## Detection Rate : 0.5161
## Detection Prevalence : 0.6048
## Balanced Accuracy : 0.8900
##
## 'Positive' Class : -0.704639727422298
##
Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model_kl <- svm(promo ~ ., data = Train, kernel = "linear")
summary(svm_model_kl)
##
## Call:
## svm(formula = promo ~ ., data = Train, kernel = "linear")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: linear
## cost: 1
##
## Number of Support Vectors: 88
##
## ( 44 44 )
##
##
## Number of Classes: 2
##
## Levels:
## -0.704639727422298 1.41520079709184
p_train_kl <- predict(svm_model_kl)
p_train_kl <-confusionMatrix(p_train_kl, Train$promo)
p_train_kl
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.704639727422298 1.41520079709184
## -0.704639727422298 55 21
## 1.41520079709184 11 37
##
## Accuracy : 0.7419
## 95% CI : (0.6557, 0.8163)
## No Information Rate : 0.5323
## P-Value [Acc > NIR] : 1.304e-06
##
## Kappa : 0.4762
##
## Mcnemar's Test P-Value : 0.1116
##
## Sensitivity : 0.8333
## Specificity : 0.6379
## Pos Pred Value : 0.7237
## Neg Pred Value : 0.7708
## Prevalence : 0.5323
## Detection Rate : 0.4435
## Detection Prevalence : 0.6129
## Balanced Accuracy : 0.7356
##
## 'Positive' Class : -0.704639727422298
##
Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model_p <- svm(promo ~ ., data = Train, kernel = "polynomial")
summary(svm_model_p)
##
## Call:
## svm(formula = promo ~ ., data = Train, kernel = "polynomial")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: polynomial
## cost: 1
## degree: 3
## coef.0: 0
##
## Number of Support Vectors: 109
##
## ( 57 52 )
##
##
## Number of Classes: 2
##
## Levels:
## -0.704639727422298 1.41520079709184
p_train_p <- predict(svm_model_p)
p_train_p <-confusionMatrix(p_train_p, Train$promo)
p_train_p
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.704639727422298 1.41520079709184
## -0.704639727422298 66 20
## 1.41520079709184 0 38
##
## Accuracy : 0.8387
## 95% CI : (0.7619, 0.8986)
## No Information Rate : 0.5323
## P-Value [Acc > NIR] : 6.310e-13
##
## Kappa : 0.6692
##
## Mcnemar's Test P-Value : 2.152e-05
##
## Sensitivity : 1.0000
## Specificity : 0.6552
## Pos Pred Value : 0.7674
## Neg Pred Value : 1.0000
## Prevalence : 0.5323
## Detection Rate : 0.5323
## Detection Prevalence : 0.6935
## Balanced Accuracy : 0.8276
##
## 'Positive' Class : -0.704639727422298
##
Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model_s <- svm(promo ~ ., data = Train, kernel = "sigmoid")
summary(svm_model_s)
##
## Call:
## svm(formula = promo ~ ., data = Train, kernel = "sigmoid")
##
##
## Parameters:
## SVM-Type: C-classification
## SVM-Kernel: sigmoid
## cost: 1
## coef.0: 0
##
## Number of Support Vectors: 96
##
## ( 48 48 )
##
##
## Number of Classes: 2
##
## Levels:
## -0.704639727422298 1.41520079709184
p_train_s <- predict(svm_model_s)
p_train_s <-confusionMatrix(p_train_s, Train$promo)
p_train_s
## Confusion Matrix and Statistics
##
## Reference
## Prediction -0.704639727422298 1.41520079709184
## -0.704639727422298 56 28
## 1.41520079709184 10 30
##
## Accuracy : 0.6935
## 95% CI : (0.6044, 0.7732)
## No Information Rate : 0.5323
## P-Value [Acc > NIR] : 0.0001848
##
## Kappa : 0.3727
##
## Mcnemar's Test P-Value : 0.0058198
##
## Sensitivity : 0.8485
## Specificity : 0.5172
## Pos Pred Value : 0.6667
## Neg Pred Value : 0.7500
## Prevalence : 0.5323
## Detection Rate : 0.4516
## Detection Prevalence : 0.6774
## Balanced Accuracy : 0.6829
##
## 'Positive' Class : -0.704639727422298
##
# Extracting metrics from the confusion matrices
metrics <- data.frame(
Model = c("SVM Tanpa Preprocessing", "SVM Kernel Radial", "SVM Kernel Linear", "SVM Kernel Polynomial", "SVM Kernel Sigmoid"),
Accuracy = c(
p_train_tp$overall["Accuracy"],
p_train_cm$overall["Accuracy"],
p_train_kl$overall["Accuracy"],
p_train_p$overall["Accuracy"],
p_train_s$overall["Accuracy"]
),
Sensitivity = c(
p_train_tp$byClass["Sensitivity"],
p_train_cm$byClass["Sensitivity"],
p_train_kl$byClass["Sensitivity"],
p_train_p$byClass["Sensitivity"],
p_train_s$byClass["Sensitivity"]
),
Specificity = c(
p_train_tp$byClass["Specificity"],
p_train_cm$byClass["Specificity"],
p_train_kl$byClass["Specificity"],
p_train_p$byClass["Specificity"],
p_train_s$byClass["Specificity"]
),
Kappa = c(
p_train_tp$overall["Kappa"],
p_train_cm$overall["Kappa"],
p_train_kl$overall["Kappa"],
p_train_p$overall["Kappa"],
p_train_s$overall["Kappa"]
)
)
# Display the metrics data frame
print(metrics)
## Model Accuracy Sensitivity Specificity Kappa
## 1 SVM Tanpa Preprocessing 0.8104839 0.9812500 0.5000000 0.5376071
## 2 SVM Kernel Radial 0.8951613 0.9696970 0.8103448 0.7874473
## 3 SVM Kernel Linear 0.7419355 0.8333333 0.6379310 0.4762408
## 4 SVM Kernel Polynomial 0.8387097 1.0000000 0.6551724 0.6691569
## 5 SVM Kernel Sigmoid 0.6935484 0.8484848 0.5172414 0.3727370
# Fit the SVM model using caret's train function
Model <- train(promo ~ ., data = Train, method = "svmRadial", trControl = trainControl(method = "cv"))
# Extract variable importance
svm.imp <- varImp(Model)
# Create a variable importance plot
plot(svm.imp)