library(lares)
library(discrim)
## Loading required package: parsnip
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom        1.0.9     ✔ tailor       0.1.0
## ✔ dials        1.4.2     ✔ tune         2.0.0
## ✔ infer        1.0.9     ✔ workflows    1.3.0
## ✔ modeldata    1.5.1     ✔ workflowsets 1.1.1
## ✔ recipes      1.3.1     ✔ yardstick    1.3.2
## ✔ rsample      1.3.1     
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ yardstick::conf_mat() masks lares::conf_mat()
## ✖ scales::discard()     masks purrr::discard()
## ✖ dplyr::filter()       masks stats::filter()
## ✖ recipes::fixed()      masks stringr::fixed()
## ✖ dplyr::lag()          masks stats::lag()
## ✖ yardstick::mae()      masks lares::mae()
## ✖ yardstick::mape()     masks lares::mape()
## ✖ yardstick::rmse()     masks lares::rmse()
## ✖ yardstick::rsq()      masks lares::rsq()
## ✖ dials::smoothness()   masks discrim::smoothness()
## ✖ yardstick::spec()     masks readr::spec()
## ✖ recipes::step()       masks stats::step()
library(themis)
library(tidyposterior)
library(SmartEDA)
library(DataExplorer)
library(skimr)
library(ggpubr)
library(workflowsets)
#===Import Data
data(wine, package='rattle')
df <- wine
rm(wine)
glimpse(df)
## Rows: 178
## Columns: 14
## $ Type            <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Alcohol         <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.39, 14.06…
## $ Malic           <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15, 1.64, …
## $ Ash             <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61, 2.17, …
## $ Alcalinity      <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6, 14.0, …
## $ Magnesium       <int> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98, 105, 95…
## $ Phenols         <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60, 2.80, …
## $ Flavanoids      <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51, 2.98, …
## $ Nonflavanoids   <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31, 0.29, …
## $ Proanthocyanins <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25, 1.98, …
## $ Color           <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05, 5.20, …
## $ Hue             <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06, 1.08, …
## $ Dilution        <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58, 2.85, …
## $ Proline         <int> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295, 1045, 1…
#===Eksplorasi Data
df %>% 
  skim_without_charts()
Data summary
Name Piped data
Number of rows 178
Number of columns 14
_______________________
Column type frequency:
factor 1
numeric 13
________________________
Group variables None

Variable type: factor

skim_variable n_missing complete_rate ordered n_unique top_counts
Type 0 1 FALSE 3 2: 71, 1: 59, 3: 48

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
Alcohol 0 1 13.00 0.81 11.03 12.36 13.05 13.68 14.83
Malic 0 1 2.34 1.12 0.74 1.60 1.87 3.08 5.80
Ash 0 1 2.37 0.27 1.36 2.21 2.36 2.56 3.23
Alcalinity 0 1 19.49 3.34 10.60 17.20 19.50 21.50 30.00
Magnesium 0 1 99.74 14.28 70.00 88.00 98.00 107.00 162.00
Phenols 0 1 2.30 0.63 0.98 1.74 2.36 2.80 3.88
Flavanoids 0 1 2.03 1.00 0.34 1.20 2.13 2.88 5.08
Nonflavanoids 0 1 0.36 0.12 0.13 0.27 0.34 0.44 0.66
Proanthocyanins 0 1 1.59 0.57 0.41 1.25 1.56 1.95 3.58
Color 0 1 5.06 2.32 1.28 3.22 4.69 6.20 13.00
Hue 0 1 0.96 0.23 0.48 0.78 0.96 1.12 1.71
Dilution 0 1 2.61 0.71 1.27 1.94 2.78 3.17 4.00
Proline 0 1 746.89 314.91 278.00 500.50 673.50 985.00 1680.00
#Plot distribusi variabel kategorik
df %>% 
  ExpCatViz(clim = 10,
            col = "#107896",
            Page = NULL,
            Flip = TRUE)
## [[1]]

#Distribusi Variabel Numerik
df %>% 
  plot_histogram(
    ggtheme = theme_lares(),
    geom_histogram_args = list(bins=30,
                               col="black",
                               fill="#107896"),
    ncol = 1,nrow = 1)

#Boxplot Variabel Kategorik vs Numerik
df %>% 
  ExpNumViz(target = "Type",
            type = 2)
## [[1]]

## 
## [[2]]

## 
## [[3]]

## 
## [[4]]

## 
## [[5]]

## 
## [[6]]

## 
## [[7]]

## 
## [[8]]

## 
## [[9]]

## 
## [[10]]

## 
## [[11]]

## 
## [[12]]

## 
## [[13]]

#Deteksi missing value
df %>% 
  plot_missing(missing_only = FALSE,
               ggtheme = theme_lares())
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
##   Please report the issue at
##   <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===Uji Asumsi Analisis Diskriminan
#1. Asumis Uji Normal Ganda setiap kelas variabel respon
uji_normalGanda <- MVN::mvn(data = df,subset="Type",mvn_test = "hz")
uji_normalGanda$multivariate_normality
##   Group          Test Statistic p.value          MVN
## 1     1 Henze-Zirkler     0.993   0.199     ✓ Normal
## 2     2 Henze-Zirkler     1.015  <0.001 ✗ Not normal
## 3     3 Henze-Zirkler     0.989   0.372     ✓ Normal
#2. Uji Asumsi Kesamaan Ragam setiap kelas variabel respon
uji_ragam <- heplots::boxM(Y=df %>% dplyr::select(-Type),
                           group=df$Type)
tidy(uji_ragam)
## # A tibble: 1 × 4
##   statistic  p.value parameter method                                           
##       <dbl>    <dbl>     <dbl> <chr>                                            
## 1      684. 2.89e-59       182 Box's M-test for Homogeneity of Covariance Matri…
#===Pembagian Data
#K-fold Cross Validation
set.seed(345)
folds <- vfold_cv(df, v = 10,strata = "Type" )
folds
## #  10-fold cross-validation using stratification 
## # A tibble: 10 × 2
##    splits           id    
##    <list>           <chr> 
##  1 <split [159/19]> Fold01
##  2 <split [160/18]> Fold02
##  3 <split [160/18]> Fold03
##  4 <split [160/18]> Fold04
##  5 <split [160/18]> Fold05
##  6 <split [160/18]> Fold06
##  7 <split [160/18]> Fold07
##  8 <split [160/18]> Fold08
##  9 <split [161/17]> Fold09
## 10 <split [162/16]> Fold10
#===Mendefinisikan Model
#1. Linear Discriminant
disc_linear <-  discrim_linear()%>% 
set_engine(engine = "MASS") %>% 
  set_mode("classification")
disc_linear
## Linear Discriminant Model Specification (classification)
## 
## Computational engine: MASS
#2. Quadratic Discriminant
disc_quad <- discrim_quad() %>% 
  set_engine(engine = "MASS") %>% 
  set_mode("classification")
disc_quad
## Quadratic Discriminant Model Specification (classification)
## 
## Computational engine: MASS
#===Praprosesing Data
#1. Mendefinisikan praproses data
basic_recipe <- recipe(Type~.,data =df)
basic_recipe 
## 
## ── Recipe ──────────────────────────────────────────────────────────────────────
## 
## ── Inputs
## Number of variables by role
## outcome:    1
## predictor: 13
#2. Menggabungkan tahap praproses data dan model
models <- list(disc_linear=disc_linear,
               disc_quad =disc_quad)
preproc <- list(basic_recipe=basic_recipe)
wf_set <- workflow_set(preproc = preproc,models = models,
                       cross = TRUE)
#3. Training Models
train_models <- workflow_map(wf_set,
                             fn="fit_resamples",
                             resamples = folds,
                             control=control_resamples(save_workflow = TRUE),
                             metrics=metric_set(sensitivity,
                                                specificity,
                                                bal_accuracy,
                                                f_meas),
                             seed = 2123)
#===Evaluasi Model
autoplot(train_models,type="wflow_id",
         rank_metric = "bal_accuracy")+
  scale_y_continuous(limits = c(0,1)) +
  theme_lares() +
  # memindahkan legend ke bawah
  theme(legend.position = "bottom")+
  # legend menjadi 2 baris
  guides(color = guide_legend(nrow = 2))

rank_results(train_models,rank_metric = "bal_accuracy") %>% 
  filter(.metric=="bal_accuracy") %>% 
  dplyr::select(wflow_id,.metric,mean,std_err,rank)
## # A tibble: 2 × 5
##   wflow_id                 .metric       mean std_err  rank
##   <chr>                    <chr>        <dbl>   <dbl> <int>
## 1 basic_recipe_disc_linear bal_accuracy 0.996 0.00366     1
## 2 basic_recipe_disc_quad   bal_accuracy 0.991 0.00862     2
rank_results(train_models,rank_metric = "f_meas") %>% 
  filter(.metric=="f_meas") %>% 
  dplyr::select(wflow_id,.metric,mean,std_err,rank)
## # A tibble: 2 × 5
##   wflow_id                 .metric  mean std_err  rank
##   <chr>                    <chr>   <dbl>   <dbl> <int>
## 1 basic_recipe_disc_linear f_meas  0.994 0.00559     1
## 2 basic_recipe_disc_quad   f_meas  0.989 0.0110      2
#===Menentukan Model terbaik dari Training
mod_best1 <- fit_best(x = train_models,metric = "bal_accuracy")
mod_best1
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: discrim_linear()
## 
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
## 
## ── Model ───────────────────────────────────────────────────────────────────────
## Call:
## lda(..y ~ ., data = data)
## 
## Prior probabilities of groups:
##         1         2         3 
## 0.3314607 0.3988764 0.2696629 
## 
## Group means:
##    Alcohol    Malic      Ash Alcalinity Magnesium  Phenols Flavanoids
## 1 13.74475 2.010678 2.455593   17.03729  106.3390 2.840169  2.9823729
## 2 12.27873 1.932676 2.244789   20.23803   94.5493 2.258873  2.0808451
## 3 13.15375 3.333750 2.437083   21.41667   99.3125 1.678750  0.7814583
##   Nonflavanoids Proanthocyanins    Color       Hue Dilution   Proline
## 1      0.290000        1.899322 5.528305 1.0620339 3.157797 1115.7119
## 2      0.363662        1.630282 3.086620 1.0562817 2.785352  519.5070
## 3      0.447500        1.153542 7.396250 0.6827083 1.683542  629.8958
## 
## Coefficients of linear discriminants:
##                          LD1           LD2
## Alcohol         -0.403399781  0.8717930699
## Malic            0.165254596  0.3053797325
## Ash             -0.369075256  2.3458497486
## Alcalinity       0.154797889 -0.1463807654
## Magnesium       -0.002163496 -0.0004627565
## Phenols          0.618052068 -0.0322128171
## Flavanoids      -1.661191235 -0.4919980543
## Nonflavanoids   -1.495818440 -1.6309537953
## Proanthocyanins  0.134092628 -0.3070875776
## Color            0.355055710  0.2532306865
## Hue             -0.818036073 -1.5156344987
## Dilution        -1.157559376  0.0511839665
## Proline         -0.002691206  0.0028529846
## 
## Proportion of trace:
##    LD1    LD2 
## 0.6875 0.3125
#===Interpretasi Model Terbaik
coefficient <- mod_best1 %>% 
  extract_fit_engine() %>% 
  coef() %>% 
  as.data.frame() %>% 
  rownames_to_column(var = "Predictors") 
coefficient
##         Predictors          LD1           LD2
## 1          Alcohol -0.403399781  0.8717930699
## 2            Malic  0.165254596  0.3053797325
## 3              Ash -0.369075256  2.3458497486
## 4       Alcalinity  0.154797889 -0.1463807654
## 5        Magnesium -0.002163496 -0.0004627565
## 6          Phenols  0.618052068 -0.0322128171
## 7       Flavanoids -1.661191235 -0.4919980543
## 8    Nonflavanoids -1.495818440 -1.6309537953
## 9  Proanthocyanins  0.134092628 -0.3070875776
## 10           Color  0.355055710  0.2532306865
## 11             Hue -0.818036073 -1.5156344987
## 12        Dilution -1.157559376  0.0511839665
## 13         Proline -0.002691206  0.0028529846
## custom function
lda_equation <-function(x,names){
  str_c(str_c("(",x,")",
              names,collapse = "+"
  )
  )
}

coefficient %>% 
  mutate(across(where(is.numeric),function(x) round(x,digits = 2))) %>%
  summarise(D1=lda_equation(LD1,Predictors),
            D2=lda_equation(LD2,Predictors)) %>% 
  pivot_longer(c(D1,D2),names_to = "discriminant",
               values_to="equation"
  )
## # A tibble: 2 × 2
##   discriminant equation                                                         
##   <chr>        <chr>                                                            
## 1 D1           (-0.4)Alcohol+(0.17)Malic+(-0.37)Ash+(0.15)Alcalinity+(0)Magnesi…
## 2 D2           (0.87)Alcohol+(0.31)Malic+(2.35)Ash+(-0.15)Alcalinity+(0)Magnesi…
## custom function
autoplot.lda<-function(object,response,data){
  object %>% 
    predict(newdata = data) %>% 
    magrittr::extract2("x") %>% 
    as_tibble() %>% 
    bind_cols(data %>% dplyr::select(all_of(response))) %>% 
    ggscatter(x="LD1",y="LD2",color = response)
}

mod_best1 %>% 
  extract_fit_engine() %>% 
  autoplot(response="Type",data=df)

#===Prediksi Data baru
set.seed(1234)
data_baru <- df %>% 
  slice_sample(n = 2,by = Type) %>% 
  dplyr::select(-Type)
data_baru
##   Alcohol Malic  Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1   13.30  1.72 2.14       17.0        94    2.40       2.19          0.27
## 2   13.63  1.81 2.70       17.2       112    2.85       2.91          0.30
## 3   12.00  0.92 2.00       19.0        86    2.42       2.26          0.30
## 4   12.37  1.17 1.92       19.6        78    2.11       2.00          0.27
## 5   12.51  1.24 2.25       17.5        85    2.00       0.58          0.60
## 6   12.82  3.37 2.30       19.5        88    1.48       0.66          0.40
##   Proanthocyanins Color  Hue Dilution Proline
## 1            1.35  3.95 1.02     2.77    1285
## 2            1.46  7.30 1.28     2.88    1310
## 3            1.43  2.50 1.38     3.12     278
## 4            1.04  4.68 1.12     3.48     510
## 5            1.25  5.45 0.75     1.51     650
## 6            0.97 10.26 0.72     1.75     685
pred_data_baru1 <- mod_best1%>% 
  predict(new_data = data_baru)
pred_data_baru1
## # A tibble: 6 × 1
##   .pred_class
##   <fct>      
## 1 1          
## 2 1          
## 3 2          
## 4 2          
## 5 3          
## 6 3