Tugas Sains Data

Packages

library(readxl)

## Warning: package 'readxl' was built under R version 4.1.2

library(tidyverse)

## Warning: package 'tidyverse' was built under R version 4.1.2

## Warning: package 'tibble' was built under R version 4.1.2

## Warning: package 'tidyr' was built under R version 4.1.2

## Warning: package 'readr' was built under R version 4.1.2

## Warning: package 'purrr' was built under R version 4.1.2

## Warning: package 'dplyr' was built under R version 4.1.2

## Warning: package 'forcats' was built under R version 4.1.2

## Warning: package 'lubridate' was built under R version 4.1.2

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.4.4     ✔ tibble    3.2.1
## ✔ lubridate 1.9.2     ✔ tidyr     1.3.0
## ✔ purrr     1.0.1     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(ggplot2)
library(e1071)

## Warning: package 'e1071' was built under R version 4.1.2

library(dplyr) 
library(skimr)

## Warning: package 'skimr' was built under R version 4.1.2

library(DataExplorer)
library(rpart.plot)

## Warning: package 'rpart.plot' was built under R version 4.1.2

## Loading required package: rpart

## Warning: package 'rpart' was built under R version 4.1.2

library(gridExtra)

## 
## Attaching package: 'gridExtra'
## 
## The following object is masked from 'package:dplyr':
## 
##     combine

library(ROSE)

## Loaded ROSE 0.0-4

library(caTools)
library(recipes)

## 
## Attaching package: 'recipes'
## 
## The following object is masked from 'package:stringr':
## 
##     fixed
## 
## The following object is masked from 'package:stats':
## 
##     step

library(caret)

## Warning: package 'caret' was built under R version 4.1.2

## Loading required package: lattice

## Warning: package 'lattice' was built under R version 4.1.2

## 
## Attaching package: 'caret'
## 
## The following object is masked from 'package:purrr':
## 
##     lift

Memanggil Data

setwd("~/Desktop")
Data_Lama <- read.csv("tugas-sainsdata-23.csv", header = TRUE)
Data<- Data_Lama[,-1]
head(Data)

##   cabang jenis.kelamin usia pendidikan frekuensi.fashion nilai.fashion
## 1     11             2   38          2                 5        1.1588
## 2      2             2   33          2                 4        0.4964
## 3      7             1   41          3                 5        0.6008
## 4      2             1   43          3                 6        0.3612
## 5      3             2   37          4                 2        0.6572
## 6      4             2   39          3                 3        0.3894
##   frekuensi.footwear nilai.footwear frekuensi.lainnya nilai.lainnya
## 1                  1         0.4560                 4        0.9990
## 2                  1         0.9162                 4        0.2822
## 3                  1         0.0384                 4        0.6252
## 4                  3         0.5626                 6        0.4118
## 5                  3         0.1014                 1        0.6478
## 6                  2         0.2674                 3        0.5866
##   total.nilai.tunai lama.member promo
## 1              0.00          18     0
## 2              0.59          35     1
## 3              3.05          39     0
## 4              0.00           9     1
## 5              1.06          51     0
## 6              0.26          19     0

Data Eksploration

Ringkasan Statistik Deskriptif

summary(Data)

##      cabang       jenis.kelamin        usia         pendidikan   
##  Min.   : 1.000   Min.   :1.000   Min.   :26.00   Min.   :1.000  
##  1st Qu.: 4.000   1st Qu.:1.000   1st Qu.:37.00   1st Qu.:3.000  
##  Median : 8.000   Median :2.000   Median :40.00   Median :3.000  
##  Mean   : 7.609   Mean   :1.575   Mean   :40.13   Mean   :3.067  
##  3rd Qu.:11.000   3rd Qu.:2.000   3rd Qu.:44.00   3rd Qu.:4.000  
##  Max.   :14.000   Max.   :2.000   Max.   :56.00   Max.   :4.000  
##  frekuensi.fashion nilai.fashion    frekuensi.footwear nilai.footwear  
##  Min.   :0.000     Min.   :0.0296   Min.   :0.000      Min.   :0.0250  
##  1st Qu.:2.000     1st Qu.:0.3952   1st Qu.:2.000      1st Qu.:0.4107  
##  Median :3.000     Median :0.6768   Median :3.000      Median :0.6904  
##  Mean   :3.061     Mean   :0.7852   Mean   :3.073      Mean   :0.8451  
##  3rd Qu.:4.000     3rd Qu.:1.0470   3rd Qu.:4.000      3rd Qu.:1.1055  
##  Max.   :8.000     Max.   :2.6276   Max.   :8.000      Max.   :3.5494  
##  frekuensi.lainnya nilai.lainnya    total.nilai.tunai  lama.member   
##  Min.   :0.000     Min.   :0.0214   Min.   : 0.000    Min.   : 1.00  
##  1st Qu.:2.000     1st Qu.:0.4323   1st Qu.: 0.000    1st Qu.:14.00  
##  Median :3.000     Median :0.7311   Median : 0.675    Median :25.00  
##  Mean   :2.782     Mean   :0.8506   Mean   : 2.240    Mean   :25.85  
##  3rd Qu.:4.000     3rd Qu.:1.1118   3rd Qu.: 3.045    3rd Qu.:38.00  
##  Max.   :7.000     Max.   :3.0334   Max.   :23.020    Max.   :51.00  
##      promo       
##  Min.   :0.0000  
##  1st Qu.:0.0000  
##  Median :0.0000  
##  Mean   :0.3324  
##  3rd Qu.:1.0000  
##  Max.   :1.0000

Struktur Data

str(Data)

## 'data.frame':    358 obs. of  13 variables:
##  $ cabang            : int  11 2 7 2 3 4 11 9 12 13 ...
##  $ jenis.kelamin     : int  2 2 1 1 2 2 2 1 1 1 ...
##  $ usia              : int  38 33 41 43 37 39 41 42 45 52 ...
##  $ pendidikan        : int  2 2 3 3 4 3 3 3 3 3 ...
##  $ frekuensi.fashion : int  5 4 5 6 2 3 4 5 4 2 ...
##  $ nilai.fashion     : num  1.159 0.496 0.601 0.361 0.657 ...
##  $ frekuensi.footwear: int  1 1 1 3 3 2 6 3 3 1 ...
##  $ nilai.footwear    : num  0.456 0.9162 0.0384 0.5626 0.1014 ...
##  $ frekuensi.lainnya : int  4 4 4 6 1 3 4 4 2 2 ...
##  $ nilai.lainnya     : num  0.999 0.282 0.625 0.412 0.648 ...
##  $ total.nilai.tunai : num  0 0.59 3.05 0 1.06 0.26 2.85 0 0.3 6.31 ...
##  $ lama.member       : int  18 35 39 9 51 19 21 27 5 37 ...
##  $ promo             : int  0 1 0 1 0 0 1 0 0 0 ...

Eksploarasi Data

plot_intro(Data,ggtheme = theme_classic())

Eksploarasi Variabel Respon

Data %>%
  count(promo) %>%
  mutate(percent = n * 100 / sum(n), label = str_c(round(percent, 2), "%")) %>%
  ggplot(aes(x = "", y = n, fill = as.factor(promo))) +
  geom_col() +
  geom_text(aes(label = label), position = position_stack(vjust = 0.5)) +
  coord_polar(theta = "y") +
  theme_void()

Eksplorasi Secara Numerik

skim_without_charts(Data)

Data summary
Name	Data
Number of rows	358
Number of columns	13
_______________________
Column type frequency:
numeric	13
________________________
Group variables	None

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
cabang	1	7.61	3.96	1.00	4.00	8.00	11.00	14.00
jenis.kelamin	1	1.58	0.49	1.00	1.00	2.00	2.00	2.00
usia	1	40.13	5.07	26.00	37.00	40.00	44.00	56.00
pendidikan	1	3.07	0.81	1.00	3.00	3.00	4.00	4.00
frekuensi.fashion	1	3.06	1.52	0.00	2.00	3.00	4.00	8.00
nilai.fashion	1	0.79	0.52	0.03	0.40	0.68	1.05	2.63
frekuensi.footwear	1	3.07	1.63	0.00	2.00	3.00	4.00	8.00
nilai.footwear	1	0.85	0.61	0.03	0.41	0.69	1.11	3.55
frekuensi.lainnya	1	2.78	1.49	0.00	2.00	3.00	4.00	7.00
nilai.lainnya	1	0.85	0.58	0.02	0.43	0.73	1.11	3.03
total.nilai.tunai	1	2.24	3.43	0.00	0.00	0.68	3.04	23.02
lama.member	1	25.85	14.41	1.00	14.00	25.00	38.00	51.00
promo	1	0.33	0.47	0.00	0.00	0.00	1.00	1.00

Eksplorasi Hubungan Prediktor Kategorik dengan Respon

# Plot hubungan antara Jenis Kelamin dan Promo Response
plot1 <- ggplot(Data, aes(x = factor(jenis.kelamin), fill = factor(promo))) +
  geom_bar(position = "dodge", stat = "count") +
  labs(x = "Jenis Kelamin", fill = "Promo Response") +
  theme_minimal() +
  theme(legend.position="bottom")

# Plot hubungan antara Pendidikan dan Promo Response
plot2 <- ggplot(Data, aes(x = factor(pendidikan), fill = factor(promo))) +
  geom_bar(position = "dodge", stat = "count") +
  labs(x = "Pendidikan", fill = "Promo Response") +
  theme_minimal() +
  theme(legend.position="bottom")

# Plot hubungan antara Cabang dan Promo Response
plot3 <- ggplot(Data, aes(x = factor(frekuensi.fashion), fill = factor(promo))) +
  geom_bar(position = "dodge", stat = "count") +
  labs(x = "Cabang", fill = "Promo Response") +
  theme_minimal() +
  theme(legend.position="bottom")

grid.arrange(plot1, plot2, plot3, ncol = 2)

Eksplorasi Hubungan Prediktor Kontinu dengan Respon

plot_boxplot(data = Data, by = "promo",
             ggtheme = theme_classic(),
             geom_boxplot_args = list(fill = "Blue"))

Data Pre-processing

Tahap praproses data terdiri dari:

Data cleaning (pembersihan data): adalah proses identifikasi, koreksi, dan penghapusan masalah atau ketidaksempurnaan dalam data agar dapat diolah dan dianalisis dengan akurat dengan menangani Missing Value, outlier, duplikasi data dan kesalahan input data.

Feature Selection (Pemilihan Fitur): adalah proses pemilihan sebagian fitur yang paling relevan atau penting untuk digunakan dalam membangun model.
Feature Engineering (Penciptaan Fitur): adalah proses transformasi data mentah menjadi suatu fitur yang lebih baik dalam merepresentasikan pola yang terkandung di dalam data, sehingga dapat meningkatkan performa model.

Data Cleaning

Handling Outliers

# Mendeteksi outlier untuk setiap variabel numerik

# Menghitung IQR untuk setiap variabel numerik
iqr_values <- apply(Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                             "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                             "nilai.lainnya", "total.nilai.tunai", "lama.member")], 2, IQR)

# Menghitung batas atas dan batas bawah untuk mendeteksi outlier
lower_bound <- apply(Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                              "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                              "nilai.lainnya", "total.nilai.tunai", "lama.member")], 2, function(x) quantile(x, 0.25) - 1.5 * IQR(x))
upper_bound <- apply(Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                              "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                              "nilai.lainnya", "total.nilai.tunai", "lama.member")], 2, function(x) quantile(x, 0.75) + 1.5 * IQR(x))

# Mendeteksi outlier
outliers <- sapply(1:ncol(Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                                    "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                                    "nilai.lainnya", "total.nilai.tunai", "lama.member")]),
                   function(i) Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                                         "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                                         "nilai.lainnya", "total.nilai.tunai", "lama.member")][which(Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                                                                                                                  "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                                                                                                                  "nilai.lainnya", "total.nilai.tunai", "lama.member")][, i] < lower_bound[i] | Data[, c("usia", "frekuensi.fashion", "nilai.fashion", 
                                                                                                                                                                              "frekuensi.footwear", "nilai.footwear", "frekuensi.lainnya", 
                                                                                                                                                                              "nilai.lainnya", "total.nilai.tunai", "lama.member")][, i] > upper_bound[i]), i])

# Menampilkan outlier
print(outliers)

## [[1]]
## [1] 56 26
## 
## [[2]]
## [1] 8
## 
## [[3]]
##  [1] 2.2226 2.6276 2.0776 2.2716 2.2006 2.3006 2.1268 2.4374 2.3964 2.5078
## [11] 2.1186 2.3752 2.2716
## 
## [[4]]
## [1] 8 8
## 
## [[5]]
##  [1] 2.4224 2.8932 3.5494 2.4298 2.2678 2.2986 2.5292 2.2276 2.4346 3.0260
## [11] 2.7520 2.1770 2.5416 2.6252 2.7402 2.8930 2.1974
## 
## [[6]]
## integer(0)
## 
## [[7]]
##  [1] 2.8132 2.2904 2.5626 2.4766 2.5246 2.5776 3.0334 2.2050 2.5248 2.6500
## [11] 2.7596 2.1968 2.2588 2.9106 3.0118 2.5678
## 
## [[8]]
##  [1]  9.88  9.48 23.02  8.80  8.44  8.80 13.32  8.48  9.73  8.43 21.26  8.32
## [13] 15.53 22.95  9.92  9.80  9.42  9.53  8.69 10.32  9.78  8.92 11.40  8.61
## [25] 12.26  8.12  7.87
## 
## [[9]]
## integer(0)

# Menggantikan outlier dengan nilai batas atas atau batas bawah

# Gantilah dengan batas atas atau batas bawah sesuai kebutuhan
replace_outliers <- function(x, lower_bound, upper_bound) {
  x[x < lower_bound] <- lower_bound
  x[x > upper_bound] <- upper_bound
  return(x)
}

# Gantilah outlier untuk setiap variabel numerik
Data_clean <- Data

Data_clean$usia <- replace_outliers(Data_clean$usia, lower_bound[1], upper_bound[1])
Data_clean$frekuensi.fashion <- replace_outliers(Data_clean$frekuensi.fashion, lower_bound[2], upper_bound[2])
Data_clean$nilai.fashion <- replace_outliers(Data_clean$nilai.fashion, lower_bound[3], upper_bound[3])
Data_clean$frekuensi.footwear <- replace_outliers(Data_clean$frekuensi.footwear, lower_bound[4], upper_bound[4])
Data_clean$nilai.footwear <- replace_outliers(Data_clean$nilai.footwear, lower_bound[5], upper_bound[5])
Data_clean$frekuensi.lainnya <- replace_outliers(Data_clean$frekuensi.lainnya, lower_bound[6], upper_bound[6])
Data_clean$nilai.lainnya <- replace_outliers(Data_clean$nilai.lainnya, lower_bound[7], upper_bound[7])
Data_clean$total.nilai.tunai <- replace_outliers(Data_clean$total.nilai.tunai, lower_bound[8], upper_bound[8])
Data_clean$lama.member <- replace_outliers(Data_clean$lama.member, lower_bound[9], upper_bound[9])

# Tampilkan struktur data setelah mengatasi outlier
str(Data_clean)

## 'data.frame':    358 obs. of  13 variables:
##  $ cabang            : int  11 2 7 2 3 4 11 9 12 13 ...
##  $ jenis.kelamin     : int  2 2 1 1 2 2 2 1 1 1 ...
##  $ usia              : num  38 33 41 43 37 39 41 42 45 52 ...
##  $ pendidikan        : int  2 2 3 3 4 3 3 3 3 3 ...
##  $ frekuensi.fashion : num  5 4 5 6 2 3 4 5 4 2 ...
##  $ nilai.fashion     : num  1.159 0.496 0.601 0.361 0.657 ...
##  $ frekuensi.footwear: num  1 1 1 3 3 2 6 3 3 1 ...
##  $ nilai.footwear    : num  0.456 0.9162 0.0384 0.5626 0.1014 ...
##  $ frekuensi.lainnya : num  4 4 4 6 1 3 4 4 2 2 ...
##  $ nilai.lainnya     : num  0.999 0.282 0.625 0.412 0.648 ...
##  $ total.nilai.tunai : num  0 0.59 3.05 0 1.06 0.26 2.85 0 0.3 6.31 ...
##  $ lama.member       : num  18 35 39 9 51 19 21 27 5 37 ...
##  $ promo             : int  0 1 0 1 0 0 1 0 0 0 ...

Feature Scaling (Standarisasi)

# Pilih kolom-kolom numerik yang akan distandarisasi
numeric_columns <- sapply(Data_clean, is.numeric)
selected_data <- Data_clean[, numeric_columns]

# Standarisasi data
scaled_data <- scale(selected_data)

# Gantikan kolom numerik pada data awal dengan data yang sudah distandarisasi
Data_clean[, numeric_columns] <- scaled_data

# Tampilkan beberapa baris data setelah standarisasi
head(Data_clean)

##       cabang jenis.kelamin       usia  pendidikan frekuensi.fashion
## 1  0.8556472     0.8577896 -0.4209199 -1.31129051        1.28268728
## 2 -1.4152715     0.8577896 -1.4109944 -1.31129051        0.62196491
## 3 -0.1536500    -1.1625306  0.1731248 -0.08238474        1.28268728
## 4 -1.4152715    -1.1625306  0.5691545 -0.08238474        1.94340965
## 5 -1.1629472     0.8577896 -0.6189348  1.14652102       -0.69947983
## 6 -0.9106229     0.8577896 -0.2229050 -0.08238474       -0.03875746
##   nilai.fashion frekuensi.footwear nilai.footwear frekuensi.lainnya
## 1     0.7810533        -1.28179132     -0.6638244         0.8150925
## 2    -0.5673239        -1.28179132      0.1658464         0.8150925
## 3    -0.3548080        -1.28179132     -1.4166939         0.8150925
## 4    -0.8425362        -0.04157161     -0.4716408         2.1536389
## 5    -0.2400005        -0.04157161     -1.3031145        -1.1927271
## 6    -0.7851325        -0.66168146     -1.0038416         0.1458193
##   nilai.lainnya total.nilai.tunai lama.member      promo
## 1     0.3206062        -0.7744219  -0.5445594 -0.7046397
## 2    -1.0417954        -0.5428881   0.6352869  1.4152008
## 3    -0.3898650         0.4224899   0.9128978 -0.7046397
## 4    -0.7954683        -0.7744219  -1.1691839  1.4152008
## 5    -0.3469098        -0.3584460   1.7457305 -0.7046397
## 6    -0.4632309        -0.6723901  -0.4751566 -0.7046397

Handling Class Imbalance

# Memeriksa distribusi kelas
table(Data_clean$promo)

## 
## -0.704639727422298   1.41520079709184 
##                239                119

# Memeriksa persentase kelas
prop.table(table(Data_clean$promo)) * 100

## 
## -0.704639727422298   1.41520079709184 
##           66.75978           33.24022

# Visualisasi distribusi kelas
barplot(table(Data_clean$promo), col = c("blue", "red"), main = "Distribusi Kelas")

# Menangani Imbalanced Data dengan Oversampling
oversampled_data <- ROSE(promo ~ ., data = Data_clean, seed = 123, N = nrow(Data) / 2, p = 0.5)$data

# Setelah oversampling, memeriksa kembali distribusi kelas
table(oversampled_data$promo)

## 
## -0.704639727422298   1.41520079709184 
##                 94                 85

# Visualisasi distribusi kelas setelah penangan
barplot(table(oversampled_data$promo), col = c("blue", "red"), main = "Distribusi Kelas")

Model Training and Evaluation Menggunakan Support Vector Machine (SVM)

Tanpa Pre-processing Data

Memisahkan Data untuk Training dan Testing

set.seed(123)
Split_tp <- sample.split(Data, SplitRatio = 0.7)
Train_tp <- subset(Data, Split_tp==TRUE)
Test_tp <- subset(Data, Split_tp==FALSE)

Modeling Train Data (Default Parameter)

Train_tp$promo <- as.factor(Train_tp$promo)
Test_tp$promo <- as.factor(Test_tp$promo)
svm_model_tp <- svm(promo ~ ., data = Train_tp)
summary(svm_model_tp)

## 
## Call:
## svm(formula = promo ~ ., data = Train_tp)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  209
## 
##  ( 121 88 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  0 1

Predict (Default Parameter)

p_train_tp <- predict(svm_model_tp)
p_train_tp <-confusionMatrix(p_train_tp, Train_tp$promo)
p_train_tp

## Confusion Matrix and Statistics
## 
##           Reference
## Prediction   0   1
##          0 157  44
##          1   3  44
##                                           
##                Accuracy : 0.8105          
##                  95% CI : (0.7561, 0.8573)
##     No Information Rate : 0.6452          
##     P-Value [Acc > NIR] : 8.525e-09       
##                                           
##                   Kappa : 0.5376          
##                                           
##  Mcnemar's Test P-Value : 5.392e-09       
##                                           
##             Sensitivity : 0.9812          
##             Specificity : 0.5000          
##          Pos Pred Value : 0.7811          
##          Neg Pred Value : 0.9362          
##              Prevalence : 0.6452          
##          Detection Rate : 0.6331          
##    Detection Prevalence : 0.8105          
##       Balanced Accuracy : 0.7406          
##                                           
##        'Positive' Class : 0               
##

Dengan Pre-processing Data

Memisahkan Data untuk Training dan Testing (Split Data)

set.seed(123)
Split <- sample.split(oversampled_data, SplitRatio = 0.7)
Train <- subset(oversampled_data, Split==TRUE)
Test <- subset(oversampled_data, Split==FALSE)

Modeling Train Data (Default Parameter)

Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model <- svm(promo ~ ., data = Train)
summary(svm_model)

## 
## Call:
## svm(formula = promo ~ ., data = Train)
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  radial 
##        cost:  1 
## 
## Number of Support Vectors:  115
## 
##  ( 59 56 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -0.704639727422298 1.41520079709184

Predict (Default Parameter)

p_train <- predict(svm_model)
p_train_cm<-confusionMatrix(p_train, Train$promo)
p_train_cm

## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.704639727422298 1.41520079709184
##   -0.704639727422298                 64               11
##   1.41520079709184                    2               47
##                                             
##                Accuracy : 0.8952            
##                  95% CI : (0.8274, 0.943)   
##     No Information Rate : 0.5323            
##     P-Value [Acc > NIR] : <2e-16            
##                                             
##                   Kappa : 0.7874            
##                                             
##  Mcnemar's Test P-Value : 0.0265            
##                                             
##             Sensitivity : 0.9697            
##             Specificity : 0.8103            
##          Pos Pred Value : 0.8533            
##          Neg Pred Value : 0.9592            
##              Prevalence : 0.5323            
##          Detection Rate : 0.5161            
##    Detection Prevalence : 0.6048            
##       Balanced Accuracy : 0.8900            
##                                             
##        'Positive' Class : -0.704639727422298
##

Kernel Linear

Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model_kl <- svm(promo ~ ., data = Train, kernel = "linear")
summary(svm_model_kl)

## 
## Call:
## svm(formula = promo ~ ., data = Train, kernel = "linear")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  linear 
##        cost:  1 
## 
## Number of Support Vectors:  88
## 
##  ( 44 44 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -0.704639727422298 1.41520079709184

p_train_kl <- predict(svm_model_kl)
p_train_kl <-confusionMatrix(p_train_kl, Train$promo)
p_train_kl

## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.704639727422298 1.41520079709184
##   -0.704639727422298                 55               21
##   1.41520079709184                   11               37
##                                             
##                Accuracy : 0.7419            
##                  95% CI : (0.6557, 0.8163)  
##     No Information Rate : 0.5323            
##     P-Value [Acc > NIR] : 1.304e-06         
##                                             
##                   Kappa : 0.4762            
##                                             
##  Mcnemar's Test P-Value : 0.1116            
##                                             
##             Sensitivity : 0.8333            
##             Specificity : 0.6379            
##          Pos Pred Value : 0.7237            
##          Neg Pred Value : 0.7708            
##              Prevalence : 0.5323            
##          Detection Rate : 0.4435            
##    Detection Prevalence : 0.6129            
##       Balanced Accuracy : 0.7356            
##                                             
##        'Positive' Class : -0.704639727422298
##

Kernel Polynomial

Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model_p <- svm(promo ~ ., data = Train, kernel = "polynomial")
summary(svm_model_p)

## 
## Call:
## svm(formula = promo ~ ., data = Train, kernel = "polynomial")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  polynomial 
##        cost:  1 
##      degree:  3 
##      coef.0:  0 
## 
## Number of Support Vectors:  109
## 
##  ( 57 52 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -0.704639727422298 1.41520079709184

p_train_p <- predict(svm_model_p)
p_train_p <-confusionMatrix(p_train_p, Train$promo)
p_train_p

## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.704639727422298 1.41520079709184
##   -0.704639727422298                 66               20
##   1.41520079709184                    0               38
##                                             
##                Accuracy : 0.8387            
##                  95% CI : (0.7619, 0.8986)  
##     No Information Rate : 0.5323            
##     P-Value [Acc > NIR] : 6.310e-13         
##                                             
##                   Kappa : 0.6692            
##                                             
##  Mcnemar's Test P-Value : 2.152e-05         
##                                             
##             Sensitivity : 1.0000            
##             Specificity : 0.6552            
##          Pos Pred Value : 0.7674            
##          Neg Pred Value : 1.0000            
##              Prevalence : 0.5323            
##          Detection Rate : 0.5323            
##    Detection Prevalence : 0.6935            
##       Balanced Accuracy : 0.8276            
##                                             
##        'Positive' Class : -0.704639727422298
##

Kernel Sigmoid

Train$promo <- as.factor(Train$promo)
Test$promo <- as.factor(Test$promo)
svm_model_s <- svm(promo ~ ., data = Train, kernel = "sigmoid")
summary(svm_model_s)

## 
## Call:
## svm(formula = promo ~ ., data = Train, kernel = "sigmoid")
## 
## 
## Parameters:
##    SVM-Type:  C-classification 
##  SVM-Kernel:  sigmoid 
##        cost:  1 
##      coef.0:  0 
## 
## Number of Support Vectors:  96
## 
##  ( 48 48 )
## 
## 
## Number of Classes:  2 
## 
## Levels: 
##  -0.704639727422298 1.41520079709184

p_train_s <- predict(svm_model_s)
p_train_s <-confusionMatrix(p_train_s, Train$promo)
p_train_s

## Confusion Matrix and Statistics
## 
##                     Reference
## Prediction           -0.704639727422298 1.41520079709184
##   -0.704639727422298                 56               28
##   1.41520079709184                   10               30
##                                             
##                Accuracy : 0.6935            
##                  95% CI : (0.6044, 0.7732)  
##     No Information Rate : 0.5323            
##     P-Value [Acc > NIR] : 0.0001848         
##                                             
##                   Kappa : 0.3727            
##                                             
##  Mcnemar's Test P-Value : 0.0058198         
##                                             
##             Sensitivity : 0.8485            
##             Specificity : 0.5172            
##          Pos Pred Value : 0.6667            
##          Neg Pred Value : 0.7500            
##              Prevalence : 0.5323            
##          Detection Rate : 0.4516            
##    Detection Prevalence : 0.6774            
##       Balanced Accuracy : 0.6829            
##                                             
##        'Positive' Class : -0.704639727422298
##

Tabel Evaulasi Model

# Extracting metrics from the confusion matrices
metrics <- data.frame(
  Model = c("SVM Tanpa Preprocessing", "SVM Kernel Radial", "SVM Kernel Linear", "SVM Kernel Polynomial", "SVM Kernel Sigmoid"),
  Accuracy = c(
    p_train_tp$overall["Accuracy"],
    p_train_cm$overall["Accuracy"],
    p_train_kl$overall["Accuracy"],
    p_train_p$overall["Accuracy"],
    p_train_s$overall["Accuracy"]
  ),
  Sensitivity = c(
    p_train_tp$byClass["Sensitivity"],
    p_train_cm$byClass["Sensitivity"],
    p_train_kl$byClass["Sensitivity"],
    p_train_p$byClass["Sensitivity"],
    p_train_s$byClass["Sensitivity"]
  ),
  Specificity = c(
    p_train_tp$byClass["Specificity"],
    p_train_cm$byClass["Specificity"],
    p_train_kl$byClass["Specificity"],
    p_train_p$byClass["Specificity"],
    p_train_s$byClass["Specificity"]
  ),
  Kappa = c(
    p_train_tp$overall["Kappa"],
    p_train_cm$overall["Kappa"],
    p_train_kl$overall["Kappa"],
    p_train_p$overall["Kappa"],
    p_train_s$overall["Kappa"]
  )
)

# Display the metrics data frame
print(metrics)

##                     Model  Accuracy Sensitivity Specificity     Kappa
## 1 SVM Tanpa Preprocessing 0.8104839   0.9812500   0.5000000 0.5376071
## 2       SVM Kernel Radial 0.8951613   0.9696970   0.8103448 0.7874473
## 3       SVM Kernel Linear 0.7419355   0.8333333   0.6379310 0.4762408
## 4   SVM Kernel Polynomial 0.8387097   1.0000000   0.6551724 0.6691569
## 5      SVM Kernel Sigmoid 0.6935484   0.8484848   0.5172414 0.3727370

Importance Variable

# Fit the SVM model using caret's train function
Model <- train(promo ~ ., data = Train, method = "svmRadial", trControl = trainControl(method = "cv"))

# Extract variable importance
svm.imp <- varImp(Model)

# Create a variable importance plot
plot(svm.imp)

Tugas Sains Data

Yunia Hasnataeni

2023-12-02

Packages

Memanggil Data

Data Eksploration

Ringkasan Statistik Deskriptif

Struktur Data

Eksploarasi Data

Eksploarasi Variabel Respon

Eksplorasi Secara Numerik

Eksplorasi Hubungan Prediktor Kategorik dengan Respon

Eksplorasi Hubungan Prediktor Kontinu dengan Respon

Data Pre-processing

Data Cleaning

Handling Outliers

Feature Scaling (Standarisasi)

Handling Class Imbalance

Model Training and Evaluation Menggunakan Support Vector Machine (SVM)

Tanpa Pre-processing Data

Memisahkan Data untuk Training dan Testing

Modeling Train Data (Default Parameter)

Predict (Default Parameter)

Dengan Pre-processing Data

Memisahkan Data untuk Training dan Testing (Split Data)

Modeling Train Data (Default Parameter)

Predict (Default Parameter)

Kernel Linear

Kernel Polynomial

Kernel Sigmoid

Tabel Evaulasi Model

Importance Variable