library(lares)
library(discrim)
## Loading required package: parsnip
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 4.0.0 ✔ tibble 3.3.0
## ✔ lubridate 1.9.4 ✔ tidyr 1.3.1
## ✔ purrr 1.1.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## ── Attaching packages ────────────────────────────────────── tidymodels 1.4.1 ──
## ✔ broom 1.0.9 ✔ tailor 0.1.0
## ✔ dials 1.4.2 ✔ tune 2.0.0
## ✔ infer 1.0.9 ✔ workflows 1.3.0
## ✔ modeldata 1.5.1 ✔ workflowsets 1.1.1
## ✔ recipes 1.3.1 ✔ yardstick 1.3.2
## ✔ rsample 1.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ yardstick::conf_mat() masks lares::conf_mat()
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::mae() masks lares::mae()
## ✖ yardstick::mape() masks lares::mape()
## ✖ yardstick::rmse() masks lares::rmse()
## ✖ yardstick::rsq() masks lares::rsq()
## ✖ dials::smoothness() masks discrim::smoothness()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
library(themis)
library(tidyposterior)
library(SmartEDA)
library(DataExplorer)
library(skimr)
library(ggpubr)
library(workflowsets)
#===Import Data
data(wine, package='rattle')
df <- wine
rm(wine)
glimpse(df)
## Rows: 178
## Columns: 14
## $ Type <fct> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, …
## $ Alcohol <dbl> 14.23, 13.20, 13.16, 14.37, 13.24, 14.20, 14.39, 14.06…
## $ Malic <dbl> 1.71, 1.78, 2.36, 1.95, 2.59, 1.76, 1.87, 2.15, 1.64, …
## $ Ash <dbl> 2.43, 2.14, 2.67, 2.50, 2.87, 2.45, 2.45, 2.61, 2.17, …
## $ Alcalinity <dbl> 15.6, 11.2, 18.6, 16.8, 21.0, 15.2, 14.6, 17.6, 14.0, …
## $ Magnesium <int> 127, 100, 101, 113, 118, 112, 96, 121, 97, 98, 105, 95…
## $ Phenols <dbl> 2.80, 2.65, 2.80, 3.85, 2.80, 3.27, 2.50, 2.60, 2.80, …
## $ Flavanoids <dbl> 3.06, 2.76, 3.24, 3.49, 2.69, 3.39, 2.52, 2.51, 2.98, …
## $ Nonflavanoids <dbl> 0.28, 0.26, 0.30, 0.24, 0.39, 0.34, 0.30, 0.31, 0.29, …
## $ Proanthocyanins <dbl> 2.29, 1.28, 2.81, 2.18, 1.82, 1.97, 1.98, 1.25, 1.98, …
## $ Color <dbl> 5.64, 4.38, 5.68, 7.80, 4.32, 6.75, 5.25, 5.05, 5.20, …
## $ Hue <dbl> 1.04, 1.05, 1.03, 0.86, 1.04, 1.05, 1.02, 1.06, 1.08, …
## $ Dilution <dbl> 3.92, 3.40, 3.17, 3.45, 2.93, 2.85, 3.58, 3.58, 2.85, …
## $ Proline <int> 1065, 1050, 1185, 1480, 735, 1450, 1290, 1295, 1045, 1…
#===Eksplorasi Data
df %>%
skim_without_charts()
Data summary
Name |
Piped data |
Number of rows |
178 |
Number of columns |
14 |
_______________________ |
|
Column type frequency: |
|
factor |
1 |
numeric |
13 |
________________________ |
|
Group variables |
None |
Variable type: factor
Type |
0 |
1 |
FALSE |
3 |
2: 71, 1: 59, 3: 48 |
Variable type: numeric
Alcohol |
0 |
1 |
13.00 |
0.81 |
11.03 |
12.36 |
13.05 |
13.68 |
14.83 |
Malic |
0 |
1 |
2.34 |
1.12 |
0.74 |
1.60 |
1.87 |
3.08 |
5.80 |
Ash |
0 |
1 |
2.37 |
0.27 |
1.36 |
2.21 |
2.36 |
2.56 |
3.23 |
Alcalinity |
0 |
1 |
19.49 |
3.34 |
10.60 |
17.20 |
19.50 |
21.50 |
30.00 |
Magnesium |
0 |
1 |
99.74 |
14.28 |
70.00 |
88.00 |
98.00 |
107.00 |
162.00 |
Phenols |
0 |
1 |
2.30 |
0.63 |
0.98 |
1.74 |
2.36 |
2.80 |
3.88 |
Flavanoids |
0 |
1 |
2.03 |
1.00 |
0.34 |
1.20 |
2.13 |
2.88 |
5.08 |
Nonflavanoids |
0 |
1 |
0.36 |
0.12 |
0.13 |
0.27 |
0.34 |
0.44 |
0.66 |
Proanthocyanins |
0 |
1 |
1.59 |
0.57 |
0.41 |
1.25 |
1.56 |
1.95 |
3.58 |
Color |
0 |
1 |
5.06 |
2.32 |
1.28 |
3.22 |
4.69 |
6.20 |
13.00 |
Hue |
0 |
1 |
0.96 |
0.23 |
0.48 |
0.78 |
0.96 |
1.12 |
1.71 |
Dilution |
0 |
1 |
2.61 |
0.71 |
1.27 |
1.94 |
2.78 |
3.17 |
4.00 |
Proline |
0 |
1 |
746.89 |
314.91 |
278.00 |
500.50 |
673.50 |
985.00 |
1680.00 |
#Plot distribusi variabel kategorik
df %>%
ExpCatViz(clim = 10,
col = "#107896",
Page = NULL,
Flip = TRUE)
## [[1]]

#Distribusi Variabel Numerik
df %>%
plot_histogram(
ggtheme = theme_lares(),
geom_histogram_args = list(bins=30,
col="black",
fill="#107896"),
ncol = 1,nrow = 1)













#Boxplot Variabel Kategorik vs Numerik
df %>%
ExpNumViz(target = "Type",
type = 2)
## [[1]]

##
## [[2]]

##
## [[3]]

##
## [[4]]

##
## [[5]]

##
## [[6]]

##
## [[7]]

##
## [[8]]

##
## [[9]]

##
## [[10]]

##
## [[11]]

##
## [[12]]

##
## [[13]]

#Deteksi missing value
df %>%
plot_missing(missing_only = FALSE,
ggtheme = theme_lares())
## Warning: `aes_string()` was deprecated in ggplot2 3.0.0.
## ℹ Please use tidy evaluation idioms with `aes()`.
## ℹ See also `vignette("ggplot2-in-packages")` for more information.
## ℹ The deprecated feature was likely used in the DataExplorer package.
## Please report the issue at
## <https://github.com/boxuancui/DataExplorer/issues>.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

#===Uji Asumsi Analisis Diskriminan
#1. Asumis Uji Normal Ganda setiap kelas variabel respon
uji_normalGanda <- MVN::mvn(data = df,subset="Type",mvn_test = "hz")
uji_normalGanda$multivariate_normality
## Group Test Statistic p.value MVN
## 1 1 Henze-Zirkler 0.993 0.199 ✓ Normal
## 2 2 Henze-Zirkler 1.015 <0.001 ✗ Not normal
## 3 3 Henze-Zirkler 0.989 0.372 ✓ Normal
#2. Uji Asumsi Kesamaan Ragam setiap kelas variabel respon
uji_ragam <- heplots::boxM(Y=df %>% dplyr::select(-Type),
group=df$Type)
tidy(uji_ragam)
## # A tibble: 1 × 4
## statistic p.value parameter method
## <dbl> <dbl> <dbl> <chr>
## 1 684. 2.89e-59 182 Box's M-test for Homogeneity of Covariance Matri…
#===Pembagian Data
#K-fold Cross Validation
set.seed(345)
folds <- vfold_cv(df, v = 10,strata = "Type" )
folds
## # 10-fold cross-validation using stratification
## # A tibble: 10 × 2
## splits id
## <list> <chr>
## 1 <split [159/19]> Fold01
## 2 <split [160/18]> Fold02
## 3 <split [160/18]> Fold03
## 4 <split [160/18]> Fold04
## 5 <split [160/18]> Fold05
## 6 <split [160/18]> Fold06
## 7 <split [160/18]> Fold07
## 8 <split [160/18]> Fold08
## 9 <split [161/17]> Fold09
## 10 <split [162/16]> Fold10
#===Mendefinisikan Model
#1. Linear Discriminant
disc_linear <- discrim_linear()%>%
set_engine(engine = "MASS") %>%
set_mode("classification")
disc_linear
## Linear Discriminant Model Specification (classification)
##
## Computational engine: MASS
#2. Quadratic Discriminant
disc_quad <- discrim_quad() %>%
set_engine(engine = "MASS") %>%
set_mode("classification")
disc_quad
## Quadratic Discriminant Model Specification (classification)
##
## Computational engine: MASS
#===Praprosesing Data
#1. Mendefinisikan praproses data
basic_recipe <- recipe(Type~.,data =df)
basic_recipe
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 13
#2. Menggabungkan tahap praproses data dan model
models <- list(disc_linear=disc_linear,
disc_quad =disc_quad)
preproc <- list(basic_recipe=basic_recipe)
wf_set <- workflow_set(preproc = preproc,models = models,
cross = TRUE)
#3. Training Models
train_models <- workflow_map(wf_set,
fn="fit_resamples",
resamples = folds,
control=control_resamples(save_workflow = TRUE),
metrics=metric_set(sensitivity,
specificity,
bal_accuracy,
f_meas),
seed = 2123)
#===Evaluasi Model
autoplot(train_models,type="wflow_id",
rank_metric = "bal_accuracy")+
scale_y_continuous(limits = c(0,1)) +
theme_lares() +
# memindahkan legend ke bawah
theme(legend.position = "bottom")+
# legend menjadi 2 baris
guides(color = guide_legend(nrow = 2))

rank_results(train_models,rank_metric = "bal_accuracy") %>%
filter(.metric=="bal_accuracy") %>%
dplyr::select(wflow_id,.metric,mean,std_err,rank)
## # A tibble: 2 × 5
## wflow_id .metric mean std_err rank
## <chr> <chr> <dbl> <dbl> <int>
## 1 basic_recipe_disc_linear bal_accuracy 0.996 0.00366 1
## 2 basic_recipe_disc_quad bal_accuracy 0.991 0.00862 2
rank_results(train_models,rank_metric = "f_meas") %>%
filter(.metric=="f_meas") %>%
dplyr::select(wflow_id,.metric,mean,std_err,rank)
## # A tibble: 2 × 5
## wflow_id .metric mean std_err rank
## <chr> <chr> <dbl> <dbl> <int>
## 1 basic_recipe_disc_linear f_meas 0.994 0.00559 1
## 2 basic_recipe_disc_quad f_meas 0.989 0.0110 2
#===Menentukan Model terbaik dari Training
mod_best1 <- fit_best(x = train_models,metric = "bal_accuracy")
mod_best1
## ══ Workflow [trained] ══════════════════════════════════════════════════════════
## Preprocessor: Recipe
## Model: discrim_linear()
##
## ── Preprocessor ────────────────────────────────────────────────────────────────
## 0 Recipe Steps
##
## ── Model ───────────────────────────────────────────────────────────────────────
## Call:
## lda(..y ~ ., data = data)
##
## Prior probabilities of groups:
## 1 2 3
## 0.3314607 0.3988764 0.2696629
##
## Group means:
## Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids
## 1 13.74475 2.010678 2.455593 17.03729 106.3390 2.840169 2.9823729
## 2 12.27873 1.932676 2.244789 20.23803 94.5493 2.258873 2.0808451
## 3 13.15375 3.333750 2.437083 21.41667 99.3125 1.678750 0.7814583
## Nonflavanoids Proanthocyanins Color Hue Dilution Proline
## 1 0.290000 1.899322 5.528305 1.0620339 3.157797 1115.7119
## 2 0.363662 1.630282 3.086620 1.0562817 2.785352 519.5070
## 3 0.447500 1.153542 7.396250 0.6827083 1.683542 629.8958
##
## Coefficients of linear discriminants:
## LD1 LD2
## Alcohol -0.403399781 0.8717930699
## Malic 0.165254596 0.3053797325
## Ash -0.369075256 2.3458497486
## Alcalinity 0.154797889 -0.1463807654
## Magnesium -0.002163496 -0.0004627565
## Phenols 0.618052068 -0.0322128171
## Flavanoids -1.661191235 -0.4919980543
## Nonflavanoids -1.495818440 -1.6309537953
## Proanthocyanins 0.134092628 -0.3070875776
## Color 0.355055710 0.2532306865
## Hue -0.818036073 -1.5156344987
## Dilution -1.157559376 0.0511839665
## Proline -0.002691206 0.0028529846
##
## Proportion of trace:
## LD1 LD2
## 0.6875 0.3125
#===Interpretasi Model Terbaik
coefficient <- mod_best1 %>%
extract_fit_engine() %>%
coef() %>%
as.data.frame() %>%
rownames_to_column(var = "Predictors")
coefficient
## Predictors LD1 LD2
## 1 Alcohol -0.403399781 0.8717930699
## 2 Malic 0.165254596 0.3053797325
## 3 Ash -0.369075256 2.3458497486
## 4 Alcalinity 0.154797889 -0.1463807654
## 5 Magnesium -0.002163496 -0.0004627565
## 6 Phenols 0.618052068 -0.0322128171
## 7 Flavanoids -1.661191235 -0.4919980543
## 8 Nonflavanoids -1.495818440 -1.6309537953
## 9 Proanthocyanins 0.134092628 -0.3070875776
## 10 Color 0.355055710 0.2532306865
## 11 Hue -0.818036073 -1.5156344987
## 12 Dilution -1.157559376 0.0511839665
## 13 Proline -0.002691206 0.0028529846
## custom function
lda_equation <-function(x,names){
str_c(str_c("(",x,")",
names,collapse = "+"
)
)
}
coefficient %>%
mutate(across(where(is.numeric),function(x) round(x,digits = 2))) %>%
summarise(D1=lda_equation(LD1,Predictors),
D2=lda_equation(LD2,Predictors)) %>%
pivot_longer(c(D1,D2),names_to = "discriminant",
values_to="equation"
)
## # A tibble: 2 × 2
## discriminant equation
## <chr> <chr>
## 1 D1 (-0.4)Alcohol+(0.17)Malic+(-0.37)Ash+(0.15)Alcalinity+(0)Magnesi…
## 2 D2 (0.87)Alcohol+(0.31)Malic+(2.35)Ash+(-0.15)Alcalinity+(0)Magnesi…
## custom function
autoplot.lda<-function(object,response,data){
object %>%
predict(newdata = data) %>%
magrittr::extract2("x") %>%
as_tibble() %>%
bind_cols(data %>% dplyr::select(all_of(response))) %>%
ggscatter(x="LD1",y="LD2",color = response)
}
mod_best1 %>%
extract_fit_engine() %>%
autoplot(response="Type",data=df)

#===Prediksi Data baru
set.seed(1234)
data_baru <- df %>%
slice_sample(n = 2,by = Type) %>%
dplyr::select(-Type)
data_baru
## Alcohol Malic Ash Alcalinity Magnesium Phenols Flavanoids Nonflavanoids
## 1 13.30 1.72 2.14 17.0 94 2.40 2.19 0.27
## 2 13.63 1.81 2.70 17.2 112 2.85 2.91 0.30
## 3 12.00 0.92 2.00 19.0 86 2.42 2.26 0.30
## 4 12.37 1.17 1.92 19.6 78 2.11 2.00 0.27
## 5 12.51 1.24 2.25 17.5 85 2.00 0.58 0.60
## 6 12.82 3.37 2.30 19.5 88 1.48 0.66 0.40
## Proanthocyanins Color Hue Dilution Proline
## 1 1.35 3.95 1.02 2.77 1285
## 2 1.46 7.30 1.28 2.88 1310
## 3 1.43 2.50 1.38 3.12 278
## 4 1.04 4.68 1.12 3.48 510
## 5 1.25 5.45 0.75 1.51 650
## 6 0.97 10.26 0.72 1.75 685
pred_data_baru1 <- mod_best1%>%
predict(new_data = data_baru)
pred_data_baru1
## # A tibble: 6 × 1
## .pred_class
## <fct>
## 1 1
## 2 1
## 3 2
## 4 2
## 5 3
## 6 3