#Praktikum 3. Regresi Logistik
#Regresi Logistik Multinomial dan Ordinal dengan tidymodels
#===Package
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.9     ✔ rsample      1.3.1
## ✔ dials        1.4.2     ✔ tailor       0.1.0
## ✔ infer        1.0.9     ✔ tune         2.0.0
## ✔ modeldata    1.5.1     ✔ workflows    1.3.0
## ✔ parsnip      1.3.3     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter()   masks stats::filter()
## ✖ recipes::fixed()  masks stringr::fixed()
## ✖ dplyr::lag()      masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step()   masks stats::step()
library(DataExplorer)
library(VGAM)
## Loading required package: stats4
## Loading required package: splines
## 
## Attaching package: 'VGAM'
## 
## The following object is masked from 'package:workflows':
## 
##     update_formula
library(broom.helpers)
#===Import Data
cust <- read_csv("D:/Kuliah/IPB 2025 Semester 3/Pemodelan Klasifikasi/Praktikum/Praktikum 3 dan 4/train.csv")
## Rows: 8068 Columns: 11
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): Gender, Ever_Married, Graduated, Profession, Spending_Score, Var_1,...
## dbl (4): ID, Age, Work_Experience, Family_Size
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(cust)
## Rows: 8,068
## Columns: 11
## $ ID              <dbl> 462809, 462643, 466315, 461735, 462669, 461319, 460156…
## $ Gender          <chr> "Male", "Female", "Female", "Male", "Female", "Male", …
## $ Ever_Married    <chr> "No", "Yes", "Yes", "Yes", "Yes", "Yes", "No", "No", "…
## $ Age             <dbl> 22, 38, 67, 67, 40, 56, 32, 33, 61, 55, 26, 19, 19, 70…
## $ Graduated       <chr> "No", "Yes", "Yes", "Yes", "Yes", "No", "Yes", "Yes", …
## $ Profession      <chr> "Healthcare", "Engineer", "Engineer", "Lawyer", "Enter…
## $ Work_Experience <dbl> 1, NA, 1, 0, NA, 0, 1, 1, 0, 1, 1, 4, 0, NA, 0, 1, 9, …
## $ Spending_Score  <chr> "Low", "Average", "Low", "High", "High", "Average", "L…
## $ Family_Size     <dbl> 4, 3, 1, 2, 6, 2, 3, 3, 3, 4, 3, 4, NA, 1, 1, 2, 5, 6,…
## $ Var_1           <chr> "Cat_4", "Cat_4", "Cat_6", "Cat_6", "Cat_6", "Cat_6", …
## $ Segmentation    <chr> "D", "A", "B", "B", "A", "C", "C", "D", "D", "C", "A",…
#Diilustrasi kali ini kita hanya akan menggunakan 400 amatan saja dengan 100 setiap kelas pada variabel respon Segmentation
set.seed(2023)
cust <- cust %>% slice_sample(n = 100,by = Segmentation)
glimpse(cust)
## Rows: 400
## Columns: 11
## $ ID              <dbl> 460101, 465365, 461953, 467127, 464946, 462091, 459331…
## $ Gender          <chr> "Female", "Male", "Male", "Male", "Male", "Male", "Fem…
## $ Ever_Married    <chr> "No", "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "N…
## $ Age             <dbl> 31, 31, 33, 70, 53, 18, 70, 33, 22, 79, 25, 19, 39, 28…
## $ Graduated       <chr> "Yes", "No", "No", "No", "Yes", "No", "Yes", "Yes", "N…
## $ Profession      <chr> "Healthcare", "Entertainment", "Healthcare", "Lawyer",…
## $ Work_Experience <dbl> 9, 1, 11, NA, 9, 0, 0, 8, NA, 1, 12, 1, 9, 7, NA, 4, N…
## $ Spending_Score  <chr> "Low", "Low", "High", "Low", "Low", "Low", "Low", "Hig…
## $ Family_Size     <dbl> 3, 4, 2, 1, 1, 5, 1, 2, 4, 1, 1, 4, 6, 4, 1, 2, 4, 3, …
## $ Var_1           <chr> "Cat_3", "Cat_2", "Cat_6", "Cat_6", "Cat_4", "Cat_4", …
## $ Segmentation    <chr> "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",…
#Kemudian kita akan menghapus kolom ID
cust <- cust %>% 
  select(-ID)
glimpse(cust)
## Rows: 400
## Columns: 10
## $ Gender          <chr> "Female", "Male", "Male", "Male", "Male", "Male", "Fem…
## $ Ever_Married    <chr> "No", "No", "Yes", "Yes", "Yes", "No", "No", "Yes", "N…
## $ Age             <dbl> 31, 31, 33, 70, 53, 18, 70, 33, 22, 79, 25, 19, 39, 28…
## $ Graduated       <chr> "Yes", "No", "No", "No", "Yes", "No", "Yes", "Yes", "N…
## $ Profession      <chr> "Healthcare", "Entertainment", "Healthcare", "Lawyer",…
## $ Work_Experience <dbl> 9, 1, 11, NA, 9, 0, 0, 8, NA, 1, 12, 1, 9, 7, NA, 4, N…
## $ Spending_Score  <chr> "Low", "Low", "High", "Low", "Low", "Low", "Low", "Hig…
## $ Family_Size     <dbl> 3, 4, 2, 1, 1, 5, 1, 2, 4, 1, 1, 4, 6, 4, 1, 2, 4, 3, …
## $ Var_1           <chr> "Cat_3", "Cat_2", "Cat_6", "Cat_6", "Cat_4", "Cat_4", …
## $ Segmentation    <chr> "D", "D", "D", "D", "D", "D", "D", "D", "D", "D", "D",…
#mengubah ke factor
cust <- cust %>% 
  mutate(across(where(is.character),as.factor))
glimpse(cust)
## Rows: 400
## Columns: 10
## $ Gender          <fct> Female, Male, Male, Male, Male, Male, Female, Female, …
## $ Ever_Married    <fct> No, No, Yes, Yes, Yes, No, No, Yes, No, Yes, No, No, Y…
## $ Age             <dbl> 31, 31, 33, 70, 53, 18, 70, 33, 22, 79, 25, 19, 39, 28…
## $ Graduated       <fct> Yes, No, No, No, Yes, No, Yes, Yes, No, NA, Yes, No, N…
## $ Profession      <fct> Healthcare, Entertainment, Healthcare, Lawyer, Executi…
## $ Work_Experience <dbl> 9, 1, 11, NA, 9, 0, 0, 8, NA, 1, 12, 1, 9, 7, NA, 4, N…
## $ Spending_Score  <fct> Low, Low, High, Low, Low, Low, Low, High, Low, Low, Lo…
## $ Family_Size     <dbl> 3, 4, 2, 1, 1, 5, 1, 2, 4, 1, 1, 4, 6, 4, 1, 2, 4, 3, …
## $ Var_1           <fct> Cat_3, Cat_2, Cat_6, Cat_6, Cat_4, Cat_4, Cat_6, Cat_6…
## $ Segmentation    <fct> D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, D, …
#===Ekplorasi Data
#1. PLot na
plot_intro(cust,theme_config = theme_classic())

#menghapus missing value
cust <- cust %>% 
  na.omit
#Boxplot variabel kontinue dengan target
plot_boxplot(data = cust,
             by = "Segmentation",
             geom_boxplot_args = list(fill="#03A9F4"),
             ggtheme = theme_classic())

#Boxplot variabel diskret dengan target
plot_bar(data = cust,
         by = "Segmentation",
         ggtheme = theme_classic())
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
##   Please report the issue at
##   <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===Pembagian Data
# K-fold Cross Validation
set.seed(2045)
folds <- vfold_cv(cust, v = 10,strata = "Segmentation" )
folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [307/35]> Fold01
##  2 <split [307/35]> Fold02
##  3 <split [307/35]> Fold03
##  4 <split [307/35]> Fold04
##  5 <split [307/35]> Fold05
##  6 <split [308/34]> Fold06
##  7 <split [308/34]> Fold07
##  8 <split [308/34]> Fold08
##  9 <split [309/33]> Fold09
## 10 <split [310/32]> Fold10
cust %>% 
  mutate(Row=seq(nrow(cust))) %>% 
  select(Segmentation,Row) %>% 
  left_join(tidy(folds),by="Row")%>% 
  count(Segmentation,Fold,Data) %>% 
  group_by(Segmentation,Fold) %>% 
  mutate(percent=n*100/sum(n)) %>% 
  arrange(Fold,Data)
## # A tibble: 80 × 5
## # Groups:   Segmentation, Fold [40]
##    Segmentation Fold   Data           n percent
##    <fct>        <chr>  <chr>      <int>   <dbl>
##  1 A            Fold01 Analysis      76    89.4
##  2 B            Fold01 Analysis      79    89.8
##  3 C            Fold01 Analysis      81    90  
##  4 D            Fold01 Analysis      71    89.9
##  5 A            Fold01 Assessment     9    10.6
##  6 B            Fold01 Assessment     9    10.2
##  7 C            Fold01 Assessment     9    10  
##  8 D            Fold01 Assessment     8    10.1
##  9 A            Fold02 Analysis      76    89.4
## 10 B            Fold02 Analysis      79    89.8
## # ℹ 70 more rows
#===Pemodelan Regresi Logistik
#A. Regresi Logistik Multinomial (target tidak berurutan)
#A.1 Set-up Model 
regresi_mult <-  multinom_reg()%>% 
  set_engine("nnet") %>%
  set_mode("classification")
regresi_mult
## Multinomial Regression Model Specification (classification)
## 
## Computational engine: nnet
#A.2 Intepretasi Model
fit_regresi_mult <- regresi_mult %>% 
  fit(Segmentation~.,data=cust)
#A.3 Mencari Nilai Odds Ratio
fit_regresi_mult %>% 
  repair_call(data = cust) %>% 
  # exponentiate = TRUE koefisien dalam bentuk odds-ratio
  tidy(conf.int = TRUE, conf.level = 0.95, exponentiate = TRUE) %>% 
  mutate(across(where(is.numeric),~round(.x,4)))
## # A tibble: 69 × 8
##    y.level term          estimate std.error statistic p.value conf.low conf.high
##    <chr>   <chr>            <dbl>     <dbl>     <dbl>   <dbl>    <dbl>     <dbl>
##  1 B       (Intercept)      1.89     1.85      0.343   0.732    0.0499    71.5  
##  2 B       GenderMale       0.672    0.370    -1.07    0.282    0.325      1.39 
##  3 B       Ever_Married…    1.78     0.499     1.16    0.246    0.671      4.74 
##  4 B       Age              0.999    0.0158   -0.0443  0.965    0.969      1.03 
##  5 B       GraduatedYes     1.59     0.405     1.15    0.251    0.720      3.52 
##  6 B       ProfessionDo…    0.522    0.703    -0.924   0.356    0.132      2.07 
##  7 B       ProfessionEn…    0.224    0.674    -2.22    0.0264   0.0597     0.839
##  8 B       ProfessionEn…    0.337    0.518    -2.10    0.0359   0.122      0.931
##  9 B       ProfessionEx…    0.245    0.944    -1.49    0.137    0.0385     1.56 
## 10 B       ProfessionHe…    0.970    0.803    -0.0374  0.970    0.201      4.68 
## # ℹ 59 more rows
levels(cust$Segmentation)
## [1] "A" "B" "C" "D"
#A.4 Evaluasi Model
##K-fold Cross Validation
regresi_mult_cv <- 
  workflow() %>%
  add_formula(Segmentation~.) %>% 
  add_model(regresi_mult) %>%
  fit_resamples(folds,
                metrics=metric_set(accuracy),
                control=control_resamples(save_pred = TRUE))
cm_cv_regresi_mult <- 
  conf_mat_resampled(x = regresi_mult_cv,tidy = FALSE)
autoplot(cm_cv_regresi_mult,type = "heatmap")+
  scale_fill_viridis_c(direction = -1,option = "inferno",alpha = 0.6)
## Scale for fill is already present.
## Adding another scale for fill, which will replace the existing scale.

collect_metrics(regresi_mult_cv,summarize = FALSE)
## # A tibble: 10 × 5
##    id     .metric  .estimator .estimate .config        
##    <chr>  <chr>    <chr>          <dbl> <chr>          
##  1 Fold01 accuracy multiclass     0.4   pre0_mod0_post0
##  2 Fold02 accuracy multiclass     0.543 pre0_mod0_post0
##  3 Fold03 accuracy multiclass     0.486 pre0_mod0_post0
##  4 Fold04 accuracy multiclass     0.486 pre0_mod0_post0
##  5 Fold05 accuracy multiclass     0.4   pre0_mod0_post0
##  6 Fold06 accuracy multiclass     0.412 pre0_mod0_post0
##  7 Fold07 accuracy multiclass     0.382 pre0_mod0_post0
##  8 Fold08 accuracy multiclass     0.441 pre0_mod0_post0
##  9 Fold09 accuracy multiclass     0.455 pre0_mod0_post0
## 10 Fold10 accuracy multiclass     0.531 pre0_mod0_post0
collect_metrics(regresi_mult_cv,summarize = TRUE)
## # A tibble: 1 × 6
##   .metric  .estimator  mean     n std_err .config        
##   <chr>    <chr>      <dbl> <int>   <dbl> <chr>          
## 1 accuracy multiclass 0.454    10  0.0179 pre0_mod0_post0
#A.5 Prediksi Data Baru
set.seed(1234)
data_baru <- cust %>% 
  slice_sample(n = 2,by = Segmentation) %>% 
  select(-Segmentation)
data_baru
## # A tibble: 8 × 9
##   Gender Ever_Married   Age Graduated Profession Work_Experience Spending_Score
##   <fct>  <fct>        <dbl> <fct>     <fct>                <dbl> <fct>         
## 1 Male   No              43 Yes       Healthcare               1 Low           
## 2 Male   No              19 No        Healthcare               3 Low           
## 3 Male   No              30 Yes       Artist                   9 Low           
## 4 Male   No              41 Yes       Artist                   7 Low           
## 5 Female No              38 Yes       Doctor                   9 Low           
## 6 Female Yes             47 Yes       Artist                   6 Low           
## 7 Female Yes             60 Yes       Artist                   8 High          
## 8 Male   No              42 Yes       Doctor                   8 Low           
## # ℹ 2 more variables: Family_Size <dbl>, Var_1 <fct>
regresi_mult_opt <- regresi_mult %>% 
  fit(Segmentation~.,data=cust)
pred_data_baru <- regresi_mult_opt %>% 
  predict(new_data = data_baru)
pred_data_baru
## # A tibble: 8 × 1
##   .pred_class
##   <fct>      
## 1 C          
## 2 D          
## 3 A          
## 4 A          
## 5 D          
## 6 C          
## 7 C          
## 8 D