Analisis Diskriminan

library(lares)
library(discrim)

## Loading required package: parsnip

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(tidymodels)

## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.9     ✔ tailor       0.1.0
## ✔ dials        1.4.2     ✔ tune         2.0.0
## ✔ infer        1.0.9     ✔ workflows    1.3.0
## ✔ modeldata    1.5.1     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2
## ✔ rsample      1.3.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ yardstick::conf_mat() masks lares::conf_mat()
## ✖ scales::discard()     masks purrr::discard()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ recipes::fixed()      masks stringr::fixed()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ yardstick::mae()      masks lares::mae()
## ✖ yardstick::mape()     masks lares::mape()
## ✖ yardstick::rmse()     masks lares::rmse()
## ✖ yardstick::rsq()      masks lares::rsq()
## ✖ dials::smoothness()   masks discrim::smoothness()
## ✖ yardstick::spec()     masks readr::spec()
## ✖ recipes::step()       masks stats::step()

library(themis)
library(tidyposterior)
library(SmartEDA)
library(DataExplorer)
library(skimr)
library(ggpubr)
library(workflowsets)

#===Import Data
data(wine, package='rattle')
df <- wine
rm(wine)
glimpse(df)

## Rows: 178
## Columns: 14
## $ Type            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Alcohol         <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.39, 14.06…
## $ Malic           <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15, 1.64, …
## $ Ash             <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61, 2.17, …
## $ Alcalinity      <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6, 14.0, …
## $ Magnesium       <int> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98, 105, 95…
## $ Phenols         <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60, 2.80, …
## $ Flavanoids      <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51, 2.98, …
## $ Nonflavanoids   <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31, 0.29, …
## $ Proanthocyanins <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25, 1.98, …
## $ Color           <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05, 5.20, …
## $ Hue             <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06, 1.08, …
## $ Dilution        <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58, 2.85, …
## $ Proline         <int> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295, 1045, 1…

#===Eksplorasi Data
df %>% 
  skim_without_charts()

Data summary
Name	Piped data
Number of rows	178
Number of columns	14
_______________________
Column type frequency:
factor	1
numeric	13
________________________
Group variables	None

Variable type: factor

skim_variable	n_missing	complete_rate	ordered	n_unique	top_counts
Type	0	1	FALSE	3	2: 71, 1: 59, 3: 48

Variable type: numeric

skim_variable	complete_rate	mean	sd	p0	p25	p50	p75	p100
Alcohol	1	13.00	0.81	11.03	12.36	13.05	13.68	14.83
Malic	1	2.34	1.12	0.74	1.60	1.87	3.08	5.80
Ash	1	2.37	0.27	1.36	2.21	2.36	2.56	3.23
Alcalinity	1	19.49	3.34	10.60	17.20	19.50	21.50	30.00
Magnesium	1	99.74	14.28	70.00	88.00	98.00	107.00	162.00
Phenols	1	2.30	0.63	0.98	1.74	2.36	2.80	3.88
Flavanoids	1	2.03	1.00	0.34	1.20	2.13	2.88	5.08
Nonflavanoids	1	0.36	0.12	0.13	0.27	0.34	0.44	0.66
Proanthocyanins	1	1.59	0.57	0.41	1.25	1.56	1.95	3.58
Color	1	5.06	2.32	1.28	3.22	4.69	6.20	13.00
Hue	1	0.96	0.23	0.48	0.78	0.96	1.12	1.71
Dilution	1	2.61	0.71	1.27	1.94	2.78	3.17	4.00
Proline	1	746.89	314.91	278.00	500.50	673.50	985.00	1680.00

#Plot distribusi variabel kategorik
df %>% 
  ExpCatViz(clim = 10,
            col = "#107896",
            Page = NULL,
            Flip = TRUE)

## [[1]]

#Distribusi Variabel Numerik
df %>% 
  plot_histogram(
    ggtheme = theme_lares(),
    geom_histogram_args = list(bins=30,
                               col="black",
                               fill="#107896"),
    ncol = 1,nrow = 1)

#Boxplot Variabel Kategorik vs Numerik
df %>% 
  ExpNumViz(target = "Type",
            type = 2)

## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

#Deteksi missing value
df %>% 
  plot_missing(missing_only = FALSE,
               ggtheme = theme_lares())

## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
##   Please report the issue at
##   <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===Uji Asumsi Analisis Diskriminan
#1. Asumis Uji Normal Ganda setiap kelas variabel respon
uji_normalGanda <- MVN::mvn(data = df,subset="Type",mvn_test = "hz")
uji_normalGanda$multivariate_normality

##   Group          Test Statistic p.value          MVN
## 1     1 Henze-Zirkler     0.993   0.199     ✓ Normal
## 2     2 Henze-Zirkler     1.015  <0.001 ✗ Not normal
## 3     3 Henze-Zirkler     0.989   0.372     ✓ Normal

#2. Uji Asumsi Kesamaan Ragam setiap kelas variabel respon
uji_ragam <- heplots::boxM(Y=df %>% dplyr::select(-Type),
                           group=df$Type)
tidy(uji_ragam)

## # A tibble: 1 × 4
##   statistic  p.value parameter method                                           
##       <dbl>    <dbl>     <dbl> <chr>                                            
## 1      684. 2.89e-59       182 Box's M-test for Homogeneity of Covariance Matri…

#===Pembagian Data
#K-fold Cross Validation
set.seed(345)
folds <- vfold_cv(df, v = 10,strata = "Type" )
folds

## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [159/19]> Fold01
##  2 <split [160/18]> Fold02
##  3 <split [160/18]> Fold03
##  4 <split [160/18]> Fold04
##  5 <split [160/18]> Fold05
##  6 <split [160/18]> Fold06
##  7 <split [160/18]> Fold07
##  8 <split [160/18]> Fold08
##  9 <split [161/17]> Fold09
## 10 <split [162/16]> Fold10

#===Mendefinisikan Model
#1. Linear Discriminant
disc_linear <-  discrim_linear()%>% 
set_engine(engine = "MASS") %>% 
  set_mode("classification")
disc_linear

## Linear Discriminant Model Specification (classification)
## 
## Computational engine: MASS

#2. Quadratic Discriminant
disc_quad <- discrim_quad() %>% 
  set_engine(engine = "MASS") %>% 
  set_mode("classification")
disc_quad

## Quadratic Discriminant Model Specification (classification)
## 
## Computational engine: MASS

#===Praprosesing Data
#1. Mendefinisikan praproses data
basic_recipe <- recipe(Type~.,data =df)
basic_recipe

##

## ── Recipe ──────────────────────────────────────────────────────────────────────

##

## ── Inputs

## Number of variables by role

## outcome:    1
## predictor: 13

#2. Menggabungkan tahap praproses data dan model
models <- list(disc_linear=disc_linear,
               disc_quad =disc_quad)
preproc <- list(basic_recipe=basic_recipe)
wf_set <- workflow_set(preproc = preproc,models = models,
                       cross = TRUE)
#3. Training Models
train_models <- workflow_map(wf_set,
                             fn="fit_resamples",
                             resamples = folds,
                             control=control_resamples(save_workflow = TRUE),
                             metrics=metric_set(sensitivity,
                                                specificity,
                                                bal_accuracy,
                                                f_meas),
                             seed = 2123)

#===Evaluasi Model
autoplot(train_models,type="wflow_id",
         rank_metric = "bal_accuracy")+
  scale_y_continuous(limits = c(0,1)) +
  theme_lares() +
  # memindahkan legend ke bawah
  theme(legend.position = "bottom")+
  # legend menjadi 2 baris
  guides(color = guide_legend(nrow = 2))

rank_results(train_models,rank_metric = "bal_accuracy") %>% 
  filter(.metric=="bal_accuracy") %>% 
  dplyr::select(wflow_id,.metric,mean,std_err,rank)

## # A tibble: 2 × 5
##   wflow_id                 .metric       mean std_err  rank
##   <chr>                    <chr>        <dbl>   <dbl> <int>
## 1 basic_recipe_disc_linear bal_accuracy 0.996 0.00366     1
## 2 basic_recipe_disc_quad   bal_accuracy 0.991 0.00862     2

rank_results(train_models,rank_metric = "f_meas") %>% 
  filter(.metric=="f_meas") %>% 
  dplyr::select(wflow_id,.metric,mean,std_err,rank)

## # A tibble: 2 × 5
##   wflow_id                 .metric  mean std_err  rank
##   <chr>                    <chr>   <dbl>   <dbl> <int>
## 1 basic_recipe_disc_linear f_meas  0.994 0.00559     1
## 2 basic_recipe_disc_quad   f_meas  0.989 0.0110      2

#===Menentukan Model terbaik dari Training
mod_best1 <- fit_best(x = train_models,metric = "bal_accuracy")
mod_best1

## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: discrim_linear()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##         1         2         3 
## 0.3314607 0.3988764 0.2696629 
## 
## Group means:
##    Alcohol    Malic      Ash Alcalinity Magnesium  Phenols Flavanoids
## 1 13.74475 2.010678 2.455593   17.03729  106.3390 2.840169  2.9823729
## 2 12.27873 1.932676 2.244789   20.23803   94.5493 2.258873  2.0808451
## 3 13.15375 3.333750 2.437083   21.41667   99.3125 1.678750  0.7814583
##   Nonflavanoids Proanthocyanins    Color       Hue Dilution   Proline
## 1      0.290000        1.899322 5.528305 1.0620339 3.157797 1115.7119
## 2      0.363662        1.630282 3.086620 1.0562817 2.785352  519.5070
## 3      0.447500        1.153542 7.396250 0.6827083 1.683542  629.8958
## 
## Coefficients of linear discriminants:
##                          LD1           LD2
## Alcohol         -0.403399781  0.8717930699
## Malic            0.165254596  0.3053797325
## Ash             -0.369075256  2.3458497486
## Alcalinity       0.154797889 -0.1463807654
## Magnesium       -0.002163496 -0.0004627565
## Phenols          0.618052068 -0.0322128171
## Flavanoids      -1.661191235 -0.4919980543
## Nonflavanoids   -1.495818440 -1.6309537953
## Proanthocyanins  0.134092628 -0.3070875776
## Color            0.355055710  0.2532306865
## Hue             -0.818036073 -1.5156344987
## Dilution        -1.157559376  0.0511839665
## Proline         -0.002691206  0.0028529846
## 
## Proportion of trace:
##    LD1    LD2 
## 0.6875 0.3125

#===Interpretasi Model Terbaik
coefficient <- mod_best1 %>% 
  extract_fit_engine() %>% 
  coef() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "Predictors") 
coefficient

##         Predictors          LD1           LD2
## 1          Alcohol -0.403399781  0.8717930699
## 2            Malic  0.165254596  0.3053797325
## 3              Ash -0.369075256  2.3458497486
## 4       Alcalinity  0.154797889 -0.1463807654
## 5        Magnesium -0.002163496 -0.0004627565
## 6          Phenols  0.618052068 -0.0322128171
## 7       Flavanoids -1.661191235 -0.4919980543
## 8    Nonflavanoids -1.495818440 -1.6309537953
## 9  Proanthocyanins  0.134092628 -0.3070875776
## 10           Color  0.355055710  0.2532306865
## 11             Hue -0.818036073 -1.5156344987
## 12        Dilution -1.157559376  0.0511839665
## 13         Proline -0.002691206  0.0028529846

## custom function
lda_equation <-function(x,names){
  str_c(str_c("(",x,")",
              names,collapse = "+"
  )
  )
}

coefficient %>% 
  mutate(across(where(is.numeric),function(x) round(x,digits = 2))) %>%
  summarise(D1=lda_equation(LD1,Predictors),
            D2=lda_equation(LD2,Predictors)) %>% 
  pivot_longer(c(D1,D2),names_to = "discriminant",
               values_to="equation"
  )

## # A tibble: 2 × 2
##   discriminant equation                                                         
##   <chr>        <chr>                                                            
## 1 D1           (-0.4)Alcohol+(0.17)Malic+(-0.37)Ash+(0.15)Alcalinity+(0)Magnesi…
## 2 D2           (0.87)Alcohol+(0.31)Malic+(2.35)Ash+(-0.15)Alcalinity+(0)Magnesi…

## custom function
autoplot.lda<-function(object,response,data){
  object %>% 
    predict(newdata = data) %>% 
    magrittr::extract2("x") %>% 
    as_tibble() %>% 
    bind_cols(data %>% dplyr::select(all_of(response))) %>% 
    ggscatter(x="LD1",y="LD2",color = response)
}

mod_best1 %>% 
  extract_fit_engine() %>% 
  autoplot(response="Type",data=df)

#===Prediksi Data baru
set.seed(1234)
data_baru <- df %>% 
  slice_sample(n = 2,by = Type) %>% 
  dplyr::select(-Type)
data_baru

##   Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1   13.30  1.72 2.14       17.0        94    2.40       2.19          0.27
## 2   13.63  1.81 2.70       17.2       112    2.85       2.91          0.30
## 3   12.00  0.92 2.00       19.0        86    2.42       2.26          0.30
## 4   12.37  1.17 1.92       19.6        78    2.11       2.00          0.27
## 5   12.51  1.24 2.25       17.5        85    2.00       0.58          0.60
## 6   12.82  3.37 2.30       19.5        88    1.48       0.66          0.40
##   Proanthocyanins Color  Hue Dilution Proline
## 1            1.35  3.95 1.02     2.77    1285
## 2            1.46  7.30 1.28     2.88    1310
## 3            1.43  2.50 1.38     3.12     278
## 4            1.04  4.68 1.12     3.48     510
## 5            1.25  5.45 0.75     1.51     650
## 6            0.97 10.26 0.72     1.75     685

pred_data_baru1 <- mod_best1%>% 
  predict(new_data = data_baru)
pred_data_baru1

## # A tibble: 6 × 1
##   .pred_class
##   <fct>      
## 1 1          
## 2 1          
## 3 2          
## 4 2          
## 5 3          
## 6 3

Analisis Diskriminan

Felasofa Rahmatanti - M0501241033

2025-10-09