#Praktikum 3. Regresi Logistik
#Regresi Logistik Multinomial dan Ordinal dengan tidymodels
#===Package
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.9 ✔ rsample 1.3.1
## ✔ dials 1.4.2 ✔ tailor 0.1.0
## ✔ infer 1.0.9 ✔ tune 2.0.0
## ✔ modeldata 1.5.1 ✔ workflows 1.3.0
## ✔ parsnip 1.3.3 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(DataExplorer)
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
##
## Attaching package: 'VGAM'
##
## The following object is masked from 'package:workflows':
##
## update_formula
library(broom.helpers)
#===Import Data
cust <- read_csv("D:/Kuliah/IPB 2025 Semester 3/Pemodelan Klasifikasi/Praktikum/Praktikum 3 dan 4/train.csv")
## Rows: 8068 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, Ever_Married, Graduated, Profession, Spending_Score, Var_1,...
## dbl (4): ID, Age, Work_Experience, Family_Size
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(cust)
## Rows: 8,068
## Columns: 11
## $ ID <dbl> 462809, 462643, 466315, 461735, 462669, 461319, 460156…
## $ Gender <chr> "Male", "Female", "Female", "Male", "Female", "Male", …
## $ Ever_Married <chr> "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No", "…
## $ Age <dbl> 22, 38, 67, 67, 40, 56, 32, 33, 61, 55, 26, 19, 19, 70…
## $ Graduated <chr> "No", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Yes", …
## $ Profession <chr> "Healthcare", "Engineer", "Engineer", "Lawyer", "Enter…
## $ Work_Experience <dbl> 1, NA, 1, 0, NA, 0, 1, 1, 0, 1, 1, 4, 0, NA, 0, 1, 9, …
## $ Spending_Score <chr> "Low", "Average", "Low", "High", "High", "Average", "L…
## $ Family_Size <dbl> 4, 3, 1, 2, 6, 2, 3, 3, 3, 4, 3, 4, NA, 1, 1, 2, 5, 6,…
## $ Var_1 <chr> "Cat_4", "Cat_4", "Cat_6", "Cat_6", "Cat_6", "Cat_6", …
## $ Segmentation <chr> "D", "A", "B", "B", "A", "C", "C", "D", "D", "C", "A",…
#Diilustrasi kali ini kita hanya akan menggunakan 400 amatan saja dengan 100 setiap kelas pada variabel respon Segmentation
set.seed(2023)
cust <- cust %>% slice_sample(n = 100,by = Segmentation)
glimpse(cust)
## Rows: 400
## Columns: 11
## $ ID <dbl> 460101, 465365, 461953, 467127, 464946, 462091, 459331…
## $ Gender <chr> "Female", "Male", "Male", "Male", "Male", "Male", "Fem…
## $ Ever_Married <chr> "No", "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "N…
## $ Age <dbl> 31, 31, 33, 70, 53, 18, 70, 33, 22, 79, 25, 19, 39, 28…
## $ Graduated <chr> "Yes", "No", "No", "No", "Yes", "No", "Yes", "Yes", "N…
## $ Profession <chr> "Healthcare", "Entertainment", "Healthcare", "Lawyer",…
## $ Work_Experience <dbl> 9, 1, 11, NA, 9, 0, 0, 8, NA, 1, 12, 1, 9, 7, NA, 4, N…
## $ Spending_Score <chr> "Low", "Low", "High", "Low", "Low", "Low", "Low", "Hig…
## $ Family_Size <dbl> 3, 4, 2, 1, 1, 5, 1, 2, 4, 1, 1, 4, 6, 4, 1, 2, 4, 3, …
## $ Var_1 <chr> "Cat_3", "Cat_2", "Cat_6", "Cat_6", "Cat_4", "Cat_4", …
## $ Segmentation <chr> "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",…
#Kemudian kita akan menghapus kolom ID
cust <- cust %>%
select(-ID)
glimpse(cust)
## Rows: 400
## Columns: 10
## $ Gender <chr> "Female", "Male", "Male", "Male", "Male", "Male", "Fem…
## $ Ever_Married <chr> "No", "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "N…
## $ Age <dbl> 31, 31, 33, 70, 53, 18, 70, 33, 22, 79, 25, 19, 39, 28…
## $ Graduated <chr> "Yes", "No", "No", "No", "Yes", "No", "Yes", "Yes", "N…
## $ Profession <chr> "Healthcare", "Entertainment", "Healthcare", "Lawyer",…
## $ Work_Experience <dbl> 9, 1, 11, NA, 9, 0, 0, 8, NA, 1, 12, 1, 9, 7, NA, 4, N…
## $ Spending_Score <chr> "Low", "Low", "High", "Low", "Low", "Low", "Low", "Hig…
## $ Family_Size <dbl> 3, 4, 2, 1, 1, 5, 1, 2, 4, 1, 1, 4, 6, 4, 1, 2, 4, 3, …
## $ Var_1 <chr> "Cat_3", "Cat_2", "Cat_6", "Cat_6", "Cat_4", "Cat_4", …
## $ Segmentation <chr> "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",…
#mengubah ke factor
cust <- cust %>%
mutate(across(where(is.character),as.factor))
glimpse(cust)
## Rows: 400
## Columns: 10
## $ Gender <fct> Female, Male, Male, Male, Male, Male, Female, Female, …
## $ Ever_Married <fct> No, No, Yes, Yes, Yes, No, No, Yes, No, Yes, No, No, Y…
## $ Age <dbl> 31, 31, 33, 70, 53, 18, 70, 33, 22, 79, 25, 19, 39, 28…
## $ Graduated <fct> Yes, No, No, No, Yes, No, Yes, Yes, No, NA, Yes, No, N…
## $ Profession <fct> Healthcare, Entertainment, Healthcare, Lawyer, Executi…
## $ Work_Experience <dbl> 9, 1, 11, NA, 9, 0, 0, 8, NA, 1, 12, 1, 9, 7, NA, 4, N…
## $ Spending_Score <fct> Low, Low, High, Low, Low, Low, Low, High, Low, Low, Lo…
## $ Family_Size <dbl> 3, 4, 2, 1, 1, 5, 1, 2, 4, 1, 1, 4, 6, 4, 1, 2, 4, 3, …
## $ Var_1 <fct> Cat_3, Cat_2, Cat_6, Cat_6, Cat_4, Cat_4, Cat_6, Cat_6…
## $ Segmentation <fct> D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, …
#===Ekplorasi Data
#1. PLot na
plot_intro(cust,theme_config = theme_classic())

#menghapus missing value
cust <- cust %>%
na.omit
#Boxplot variabel kontinue dengan target
plot_boxplot(data = cust,
by = "Segmentation",
geom_boxplot_args = list(fill="#03A9F4"),
ggtheme = theme_classic())

#Boxplot variabel diskret dengan target
plot_bar(data = cust,
by = "Segmentation",
ggtheme = theme_classic())
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
## Please report the issue at
## <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===Pembagian Data
# K-fold Cross Validation
set.seed(2045)
folds <- vfold_cv(cust, v = 10,strata = "Segmentation" )
folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [307/35]> Fold01
## 2 <split [307/35]> Fold02
## 3 <split [307/35]> Fold03
## 4 <split [307/35]> Fold04
## 5 <split [307/35]> Fold05
## 6 <split [308/34]> Fold06
## 7 <split [308/34]> Fold07
## 8 <split [308/34]> Fold08
## 9 <split [309/33]> Fold09
## 10 <split [310/32]> Fold10
cust %>%
mutate(Row=seq(nrow(cust))) %>%
select(Segmentation,Row) %>%
left_join(tidy(folds),by="Row")%>%
count(Segmentation,Fold,Data) %>%
group_by(Segmentation,Fold) %>%
mutate(percent=n*100/sum(n)) %>%
arrange(Fold,Data)
## # A tibble: 80 × 5
## # Groups: Segmentation, Fold [40]
## Segmentation Fold Data n percent
## <fct> <chr> <chr> <int> <dbl>
## 1 A Fold01 Analysis 76 89.4
## 2 B Fold01 Analysis 79 89.8
## 3 C Fold01 Analysis 81 90
## 4 D Fold01 Analysis 71 89.9
## 5 A Fold01 Assessment 9 10.6
## 6 B Fold01 Assessment 9 10.2
## 7 C Fold01 Assessment 9 10
## 8 D Fold01 Assessment 8 10.1
## 9 A Fold02 Analysis 76 89.4
## 10 B Fold02 Analysis 79 89.8
## # ℹ 70 more rows
#===Pemodelan Regresi Logistik
#A. Regresi Logistik Multinomial (target tidak berurutan)
#A.1 Set-up Model
regresi_mult <- multinom_reg()%>%
set_engine("nnet") %>%
set_mode("classification")
regresi_mult
## Multinomial Regression Model Specification (classification)
##
## Computational engine: nnet
#A.2 Intepretasi Model
fit_regresi_mult <- regresi_mult %>%
fit(Segmentation~.,data=cust)
#A.3 Mencari Nilai Odds Ratio
fit_regresi_mult %>%
repair_call(data = cust) %>%
# exponentiate = TRUE koefisien dalam bentuk odds-ratio
tidy(conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE) %>%
mutate(across(where(is.numeric),~round(.x,4)))
## # A tibble: 69 × 8
## y.level term estimate std.error statistic p.value conf.low conf.high
## <chr> <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 B (Intercept) 1.89 1.85 0.343 0.732 0.0499 71.5
## 2 B GenderMale 0.672 0.370 -1.07 0.282 0.325 1.39
## 3 B Ever_Married… 1.78 0.499 1.16 0.246 0.671 4.74
## 4 B Age 0.999 0.0158 -0.0443 0.965 0.969 1.03
## 5 B GraduatedYes 1.59 0.405 1.15 0.251 0.720 3.52
## 6 B ProfessionDo… 0.522 0.703 -0.924 0.356 0.132 2.07
## 7 B ProfessionEn… 0.224 0.674 -2.22 0.0264 0.0597 0.839
## 8 B ProfessionEn… 0.337 0.518 -2.10 0.0359 0.122 0.931
## 9 B ProfessionEx… 0.245 0.944 -1.49 0.137 0.0385 1.56
## 10 B ProfessionHe… 0.970 0.803 -0.0374 0.970 0.201 4.68
## # ℹ 59 more rows
levels(cust$Segmentation)
## [1] "A" "B" "C" "D"
#A.4 Evaluasi Model
##K-fold Cross Validation
regresi_mult_cv <-
workflow() %>%
add_formula(Segmentation~.) %>%
add_model(regresi_mult) %>%
fit_resamples(folds,
metrics=metric_set(accuracy),
control=control_resamples(save_pred = TRUE))
cm_cv_regresi_mult <-
conf_mat_resampled(x = regresi_mult_cv,tidy = FALSE)
autoplot(cm_cv_regresi_mult,type = "heatmap")+
scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

collect_metrics(regresi_mult_cv,summarize = FALSE)
## # A tibble: 10 × 5
## id .metric .estimator .estimate .config
## <chr> <chr> <chr> <dbl> <chr>
## 1 Fold01 accuracy multiclass 0.4 pre0_mod0_post0
## 2 Fold02 accuracy multiclass 0.543 pre0_mod0_post0
## 3 Fold03 accuracy multiclass 0.486 pre0_mod0_post0
## 4 Fold04 accuracy multiclass 0.486 pre0_mod0_post0
## 5 Fold05 accuracy multiclass 0.4 pre0_mod0_post0
## 6 Fold06 accuracy multiclass 0.412 pre0_mod0_post0
## 7 Fold07 accuracy multiclass 0.382 pre0_mod0_post0
## 8 Fold08 accuracy multiclass 0.441 pre0_mod0_post0
## 9 Fold09 accuracy multiclass 0.455 pre0_mod0_post0
## 10 Fold10 accuracy multiclass 0.531 pre0_mod0_post0
collect_metrics(regresi_mult_cv,summarize = TRUE)
## # A tibble: 1 × 6
## .metric .estimator mean n std_err .config
## <chr> <chr> <dbl> <int> <dbl> <chr>
## 1 accuracy multiclass 0.454 10 0.0179 pre0_mod0_post0
#A.5 Prediksi Data Baru
set.seed(1234)
data_baru <- cust %>%
slice_sample(n = 2,by = Segmentation) %>%
select(-Segmentation)
data_baru
## # A tibble: 8 × 9
## Gender Ever_Married Age Graduated Profession Work_Experience Spending_Score
## <fct> <fct> <dbl> <fct> <fct> <dbl> <fct>
## 1 Male No 43 Yes Healthcare 1 Low
## 2 Male No 19 No Healthcare 3 Low
## 3 Male No 30 Yes Artist 9 Low
## 4 Male No 41 Yes Artist 7 Low
## 5 Female No 38 Yes Doctor 9 Low
## 6 Female Yes 47 Yes Artist 6 Low
## 7 Female Yes 60 Yes Artist 8 High
## 8 Male No 42 Yes Doctor 8 Low
## # ℹ 2 more variables: Family_Size <dbl>, Var_1 <fct>
regresi_mult_opt <- regresi_mult %>%
fit(Segmentation~.,data=cust)
pred_data_baru <- regresi_mult_opt %>%
predict(new_data = data_baru)
pred_data_baru
## # A tibble: 8 × 1
## .pred_class
## <fct>
## 1 C
## 2 D
## 3 A
## 4 A
## 5 D
## 6 C
## 7 C
## 8 D