# Package Management
# Packages this report depends on; any that are missing are installed from
# CRAN.
packages <- c("tidyverse","MVN","biotools","car","heplots",
              "corrplot","psych","ggplot2","ggcorrplot",
              "knitr","kableExtra","moments","GGally")

# Probe each required package with system.file() rather than scanning the
# whole library with installed.packages(): the latter reads several files per
# installed package and its help page advises against using it for presence
# checks. system.file(package = p) returns "" when p is not installed.
is_installed <- vapply(
  packages,
  function(pkg) nzchar(system.file(package = pkg)),
  logical(1)
)

# `installed` now holds only the required packages that are present (it used
# to hold every installed package); `packages %in% installed` is unchanged.
installed  <- packages[is_installed]
to_install <- packages[!packages %in% installed]
if (length(to_install) > 0) {
  install.packages(to_install, repos = "https://cran.r-project.org")
}

suppressPackageStartupMessages({
  library(tidyverse)
  library(MVN)
  library(biotools)
  library(car)
  library(psych)
  library(ggplot2)
  library(ggcorrplot)
  library(knitr)
  library(kableExtra)
  library(moments)
  library(GGally)
})

# Default knitr options applied to every chunk: echo the code, suppress
# messages and warnings, and centre figures drawn with fig.width = 8,
# fig.height = 5 at dpi = 150.
knitr::opts_chunk$set(
  echo    = TRUE,
  message = FALSE,
  warning = FALSE,
  fig.align = "center",
  fig.width = 8,
  fig.height = 5,
  dpi = 150
)

# Global ggplot theme: theme_minimal() at base size 13 with bold, coloured
# plot titles, coloured subtitles and axis titles, and bold facet strip text.
# theme_set() makes it the default for every plot drawn afterwards.
theme_set(
  theme_minimal(base_size = 13) +
    theme(
      plot.title    = element_text(face = "bold", color = "#1a5276"),
      plot.subtitle = element_text(color = "#566573"),
      axis.title    = element_text(color = "#2c3e50"),
      strip.text    = element_text(face = "bold")
    )
)

1 Pendahuluan

1.1 Latar Belakang

Analisis MANCOVA (Multivariate Analysis of Covariance) merupakan perluasan dari MANOVA yang menyertakan satu atau lebih covariate (kovariat) untuk mengontrol variansi yang tidak relevan. Sebelum menjalankan MANCOVA, terdapat 5 asumsi yang harus dipenuhi agar hasil analisis valid dan dapat dipercaya.

Dataset yang digunakan adalah Wisconsin Breast Cancer Dataset dari UCI Machine Learning Repository. Dataset ini berisi fitur-fitur yang diekstrak dari gambar sel tumor payudara dan umum digunakan dalam penelitian biomedis.

1.2 Desain Penelitian

Komponen Keterangan
Independent Variable (IV) diagnosis — Malignant (M) / Benign (B)
Dependent Variables (DVs) texture_mean, smoothness_mean, symmetry_mean
Covariate (COV) concavity_mean
Metode Analisis MANCOVA
Jumlah Observasi 50 (25 Malignant, 25 Benign)

1.3 Alur Analisis

Berikut alur pengujian asumsi secara berurutan:

  1. Dependensi antar DV → Bartlett’s Test of Sphericity
  2. Homogenitas Matriks Kovarians → Box’s M Test
  3. Normalitas Multivariat → Mardia’s Test
  4. Linearitas Covariate–DV → Pearson Correlation
  5. Independensi Observasi → Evaluasi desain (naratif)

2 Load & Persiapan Data

2.1 Membaca Data

# Read the raw CSV export from the working directory.
df_raw <- read.csv("Breast_Cancer_Data.csv")

# Drop the `id` and `Unnamed..32` columns when they exist, trim whitespace
# from the diagnosis codes and expand them into readable labels. Codes other
# than "M" / "B" pass through unchanged; rows with a missing diagnosis are
# removed.
df_raw <- df_raw %>%
  dplyr::select(-any_of(c("id", "Unnamed..32"))) %>%
  mutate(
    diagnosis = trimws(diagnosis),
    diagnosis = case_when(
      diagnosis == "M" ~ "Malignant",
      diagnosis == "B" ~ "Benign",
      TRUE             ~ diagnosis
    )
  ) %>%
  drop_na(diagnosis)

# Report the dimensions of the cleaned data (rows, then columns).
raw_dim <- dim(df_raw)
cat(sprintf("Jumlah baris : %d\nJumlah kolom : %d\n",
            raw_dim[1], raw_dim[2]))
## Jumlah baris : 569
## Jumlah kolom : 32

Distribusi kelas pada data mentah:

# Count the observations per diagnosis label and render the counts as a
# styled table. Naming the table dimension and the response column yields the
# `Diagnosis` / `Frekuensi` headers directly.
diagnosis_freq <- as.data.frame(
  table(Diagnosis = df_raw$diagnosis),
  responseName = "Frekuensi"
)

kable_styling(
  kable(diagnosis_freq, caption = "Distribusi Diagnosis (Data Mentah)"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Distribusi Diagnosis (Data Mentah)
Diagnosis Frekuensi
Benign 357
Malignant 212

2.2 Sampling Terstruktur

# Variable roles used throughout the analysis: dependent variables,
# covariate and independent (grouping) variable.
DVS <- c("texture_mean", "smoothness_mean", "symmetry_mean")
COV <- "concavity_mean"
IV  <- "diagnosis"

# Fixed row identifiers for the two sampled groups. They are matched against
# `row_id` below, which counts rows from 0.
IDX_MAL <- c(2,5,13,24,77,135,172,197,223,252,
             256,277,280,297,328,337,369,393,441,444,
             451,460,479,512,536)

IDX_BEN <- c(76,84,92,106,113,151,266,303,313,314,
             345,357,367,378,413,438,466,481,483,500,
             510,513,541,547,553)

IDX <- c(IDX_MAL, IDX_BEN)

# Attach a 0-based row identifier to the cleaned data.
df_raw$row_id <- seq_len(nrow(df_raw)) - 1

# row_id is the row position minus one, so selecting positions IDX + 1
# returns the requested rows in the order listed in IDX. Only the analysis
# columns are kept.
df <- df_raw %>%
  dplyr::slice(IDX + 1) %>%
  dplyr::select(all_of(c(IV, DVS, COV)))

sample_size <- nrow(df)
cat(sprintf("Total sampel : %d observasi\n", sample_size))
## Total sampel : 50 observasi
table(df$diagnosis)
## 
##    Benign Malignant 
##        25        25

Metode Sampling: Fixed-index sampling berdasarkan indeks baris (berbasis 0, yaitu nomor baris dikurangi 1) yang telah ditentukan sebelumnya. Sampling menghasilkan 50 observasi seimbang — 25 Malignant dan 25 Benign — untuk meminimalkan bias dan meningkatkan stabilitas analisis.

2.3 Statistik Deskriptif

# Summary functions applied to every DV and to the covariate. The list names
# become the suffix of each generated column (e.g. texture_mean_Mean).
stat_funs <- list(
  Mean = function(x) round(mean(x), 5),
  SD   = function(x) round(sd(x), 5),
  Min  = function(x) round(min(x), 5),
  Max  = function(x) round(max(x), 5)
)

# One row per diagnosis group, one column per variable/statistic pair.
desc_wide <- df %>%
  group_by(diagnosis) %>%
  summarise(across(all_of(c(DVS, COV)), stat_funs))

# Split each column name at its last underscore: the part before it is the
# variable name, the part after it becomes a statistic column.
desc_stats <- desc_wide %>%
  pivot_longer(
    cols          = -diagnosis,
    names_to      = c("Variabel", ".value"),
    names_pattern = "^(.*)_([^_]+)$"
  ) %>%
  arrange(Variabel, diagnosis)

# Render the table with the first column in bold.
desc_table <- kable(desc_stats,
                    caption = "Statistik Deskriptif per Variabel dan Grup")
desc_table <- kable_styling(
  desc_table,
  bootstrap_options = c("striped","hover","condensed"),
  full_width = TRUE
)
column_spec(desc_table, 1, bold = TRUE)
Statistik Deskriptif per Variabel dan Grup
diagnosis Variabel Mean SD Min Max
Benign concavity_mean 0.05301 0.03194 0.00000 0.1321
Malignant concavity_mean 0.15039 0.06166 0.02685 0.2810
Benign smoothness_mean 0.09458 0.01240 0.07355 0.1291
Malignant smoothness_mean 0.10181 0.01308 0.07371 0.1278
Benign symmetry_mean 0.17707 0.02542 0.13860 0.2403
Malignant symmetry_mean 0.18730 0.01989 0.14670 0.2162
Benign texture_mean 17.57880 3.39616 10.72000 24.9900
Malignant texture_mean 21.26040 3.79553 11.89000 28.7700

2.4 Visualisasi Distribusi

# Long format: one row per observation and variable, so each variable can be
# drawn in its own facet.
df_long <- pivot_longer(
  df,
  cols      = all_of(c(DVS, COV)),
  names_to  = "Variabel",
  values_to = "Nilai"
)

# Colours used for the two diagnosis groups in both plots.
diagnosis_colors <- c("Malignant" = "#c0392b", "Benign" = "#2e86c1")

# Density curves per group with rug marks, one facet per variable.
ggplot(df_long, aes(x = Nilai, fill = diagnosis, color = diagnosis)) +
  geom_density(alpha = 0.35, linewidth = 0.9) +
  geom_rug(alpha = 0.5, linewidth = 0.5) +
  facet_wrap(~Variabel, scales = "free", ncol = 2) +
  scale_fill_manual(values = diagnosis_colors) +
  scale_color_manual(values = diagnosis_colors) +
  labs(
    title    = "Distribusi Variabel per Grup Diagnosis",
    subtitle = "Density plot dengan rug marks",
    x        = "Nilai",
    y        = "Densitas",
    fill     = "Diagnosis",
    color    = "Diagnosis"
  ) +
  theme(legend.position = "bottom")

# Boxplots per group with jittered points showing the individual
# observations, one facet per variable.
ggplot(df_long, aes(x = diagnosis, y = Nilai, fill = diagnosis)) +
  geom_boxplot(alpha = 0.7, outlier.shape = 21, outlier.size = 2) +
  geom_jitter(aes(color = diagnosis), width = 0.12, alpha = 0.5, size = 1.8) +
  facet_wrap(~Variabel, scales = "free_y", ncol = 2) +
  scale_fill_manual(values = diagnosis_colors) +
  scale_color_manual(values = diagnosis_colors) +
  labs(
    title    = "Boxplot Variabel per Grup Diagnosis",
    subtitle = "Dengan jitter untuk menampilkan distribusi individual",
    x        = NULL,
    y        = "Nilai"
  ) +
  theme(legend.position = "none")


3 Asumsi 1: Dependensi Antar DV

3.1 Penjelasan

Tujuan: Memverifikasi bahwa variabel-variabel dependen (DV) tidak independen satu sama lain — artinya mereka saling berkorelasi. Ini merupakan syarat mendasar MANCOVA: jika DV tidak berkorelasi sama sekali, analisis multivariat tidak memberikan keuntungan dibanding analisis univariat terpisah.

Metode: Bartlett’s Test of Sphericity menguji H₀: matriks korelasi = matriks identitas (antar-DV tidak berkorelasi). Kita ingin menolak H₀ untuk membuktikan adanya korelasi.

Keputusan: Jika p-value < 0.05, maka H₀ ditolak → DV-DV saling berkorelasi → asumsi terpenuhi.

3.2 Uji & Hasil

# Correlation matrix of the DVs over all sampled observations, then
# Bartlett's test of sphericity on that matrix.
R_matrix   <- cor(df[, DVS])
sphericity <- cortest.bartlett(R_matrix, n = nrow(df))

# The value column mixes numbers with a formatted p-value, so every entry is
# stored as text.
bartlett_tbl <- tibble(
  Statistik = c("Chi-square", "Degrees of Freedom", "p-value"),
  Nilai     = c(
    as.character(round(sphericity$chisq, 4)),
    as.character(sphericity$df),
    format(sphericity$p.value, scientific = TRUE, digits = 4)
  )
)

kable_styling(
  kable(bartlett_tbl, caption = "Hasil Bartlett's Test of Sphericity"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Hasil Bartlett’s Test of Sphericity
Statistik Nilai
Chi-square 24.0657
Degrees of Freedom 3
p-value 2.42e-05

Interpretasi: p-value = 2.42e-05 < 0.05.
H₀ ditolak → DV-DV saling berkorelasi secara signifikan.
✔ ASUMSI 1 TERPENUHI

3.3 Matriks Korelasi

# Correlation matrix rounded to four decimals for display.
R_round <- round(R_matrix, 4)

kable_styling(
  kable(R_round, caption = "Matriks Korelasi antar DV"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Matriks Korelasi antar DV
texture_mean smoothness_mean symmetry_mean
texture_mean 1.0000 -0.1386 -0.0124
smoothness_mean -0.1386 1.0000 0.6184
symmetry_mean -0.0124 0.6184 1.0000
# Plot the unrounded DV correlation matrix: lower triangle only, drawn as
# circles with the coefficients printed as labels, on a red-white-blue
# colour scale.
ggcorrplot(R_matrix,
           method    = "circle",
           type      = "lower",
           lab       = TRUE,
           lab_size  = 4,
           colors    = c("#c0392b","white","#2e86c1"),
           title     = "Matriks Korelasi Antar DV",
           ggtheme   = theme_minimal())


4 Asumsi 2: Homogenitas Matriks Kovarians

4.1 Penjelasan

Tujuan: Memastikan bahwa matriks kovarians antar kelompok IV (Malignant vs Benign) adalah sama (homogen). Pelanggaran asumsi ini mempengaruhi ketepatan uji F pada MANCOVA.

Metode: Box’s M Test. H₀: semua grup memiliki matriks kovarians yang sama.

Keputusan: Jika p-value ≥ 0.05, maka H₀ gagal ditolak → matriks kovarians homogen → asumsi terpenuhi.

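Catatan: Box’s M sangat sensitif terhadap pelanggaran normalitas, sehingga dalam praktik sering digunakan taraf signifikansi yang lebih ketat (α = 0.001); p-value > 0.001 umumnya sudah dianggap aman. Pada laporan ini tetap digunakan batas 0.05 yang lebih konservatif.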

4.2 Uji & Hasil

# Box's M test: DVs as the data, diagnosis as the grouping. The call is
# namespace-qualified because `heplots` (in the package list, not attached)
# also provides a function named boxM.
bm <- biotools::boxM(df[, DVS], df[[IV]])

# Pull the reported quantities out of the test object as plain numbers.
bm_chisq <- round(as.numeric(bm$statistic), 4)
bm_df    <- as.numeric(bm$parameter)
bm_p     <- round(as.numeric(bm$p.value), 4)

bm_tbl <- tibble(
  Statistik = c("Box's M (Chi-Sq approx.)", "df", "p-value"),
  Nilai     = c(bm_chisq, bm_df, bm_p)
)

kable_styling(
  kable(bm_tbl, caption = "Hasil Box's M Test"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Hasil Box’s M Test
Statistik Nilai
Box’s M (Chi-Sq approx.) 3.5164
df 6.0000
p-value 0.7418

Interpretasi: p-value = 0.7418 ≥ 0.05.
H₀ gagal ditolak → Matriks kovarians antar grup homogen.
✔ ASUMSI 2 TERPENUHI

4.3 Matriks Kovarians per Grup

# Covariance matrix of the DVs within each diagnosis group.
cov_mal <- df %>%
  filter(diagnosis == "Malignant") %>%
  dplyr::select(all_of(DVS)) %>%
  cov()
cov_ben <- df %>%
  filter(diagnosis == "Benign") %>%
  dplyr::select(all_of(DVS)) %>%
  cov()

# Malignant group, rounded to six decimals for display.
kable_styling(
  kable(round(cov_mal, 6), caption = "Matriks Kovarians — Malignant"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Matriks Kovarians — Malignant
texture_mean smoothness_mean symmetry_mean
texture_mean 14.406037 -0.023032 -0.020564
smoothness_mean -0.023032 0.000171 0.000160
symmetry_mean -0.020564 0.000160 0.000395
# Benign group covariance matrix, rounded to six decimals for display.
kable_styling(
  kable(round(cov_ben, 6), caption = "Matriks Kovarians — Benign"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Matriks Kovarians — Benign
texture_mean smoothness_mean symmetry_mean
texture_mean 11.533911 -0.005765 -0.001420
smoothness_mean -0.005765 0.000154 0.000185
symmetry_mean -0.001420 0.000185 0.000646
# Visualise how the per-DV variances differ between the two groups.
# The variances are the diagonals of the group covariance matrices, so they
# are read with diag() instead of a rowwise get() lookup by column name.
# Rows are ordered Malignant first, then Benign, each in matrix row order.
var_compare <- tibble(
  Var      = c(rownames(cov_mal), rownames(cov_ben)),
  Grup     = rep(c("Malignant", "Benign"),
                 times = c(nrow(cov_mal), nrow(cov_ben))),
  Variansi = unname(c(diag(cov_mal), diag(cov_ben)))
)

# The DVs live on very different scales (texture_mean variance is in the
# tens, the others around 1e-4), so each DV gets its own facet with a free
# y-axis; on a shared axis the small variances would be invisible.
ggplot(var_compare, aes(x = Var, y = Variansi, fill = Grup)) +
  geom_bar(stat = "identity", position = "dodge", width = 0.6, alpha = 0.85) +
  facet_wrap(~Var, scales = "free", nrow = 1) +
  scale_fill_manual(values = c("Malignant" = "#c0392b", "Benign" = "#2e86c1")) +
  labs(title    = "Perbandingan Variansi per DV dan Grup",
       subtitle = "Diagonal matriks kovarians (variansi masing-masing DV)",
       x = "Variabel Dependen", y = "Variansi", fill = "Diagnosis") +
  theme(legend.position = "bottom")


5 Asumsi 3: Normalitas Multivariat

5.1 Penjelasan

Tujuan: Memastikan bahwa gabungan DV mengikuti distribusi normal multivariat. MANCOVA mengasumsikan residual berdistribusi normal secara multivariat.

Metode: Mardia’s Test — menguji skewness (kemencengan) dan kurtosis (keruncingan) multivariat secara terpisah.

Keputusan: Kedua p-value (skewness dan kurtosis) harus ≥ 0.05 untuk menyatakan normalitas multivariat terpenuhi.

5.2 Uji Mardia

# Mardia's multivariate skewness and kurtosis on the pooled DVs.
mardia_result <- psych::mardia(df[, DVS], plot = FALSE)

# Skewness p-value as reported by psych; the kurtosis p-value is the
# two-sided normal tail probability of the kurtosis z statistic.
p_skew <- mardia_result$p.skew
z_kurt <- mardia_result$kurtosis
p_kurt <- 2 * (1 - pnorm(abs(z_kurt)))

# Both components are judged against the same 0.05 threshold.
mardia_p  <- c(p_skew, p_kurt)
mardia_ok <- mardia_p >= 0.05

mardia_tbl <- tibble(
  Komponen   = c("Mardia Skewness", "Mardia Kurtosis"),
  Statistik  = round(c(mardia_result$b1p, mardia_result$b2p), 4),
  `Z / Chi²` = c("—", as.character(round(z_kurt, 4))),
  `p-value`  = round(mardia_p, 4),
  Status     = ifelse(mardia_ok, "Normal ✔", "Tidak Normal ✘")
)

# Status column in bold: green when the component passes, red otherwise.
mardia_tbl %>%
  kable(caption = "Hasil Mardia's Test — Normalitas Multivariat") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  column_spec(5, bold = TRUE,
              color = ifelse(mardia_ok, "#1e8449", "#c0392b"))
Hasil Mardia’s Test — Normalitas Multivariat
Komponen Statistik Z / Chi² p-value Status
Mardia Skewness 0.9458 — 0.6404 Normal ✔
Mardia Kurtosis 13.3407 -1.0711 0.2841 Normal ✔

Interpretasi: p-skewness = 0.6404 ≥ 0.05 dan p-kurtosis = 0.2841 ≥ 0.05.
Keduanya tidak signifikan → Data memenuhi normalitas multivariat.
✔ ASUMSI 3 TERPENUHI

5.3 Uji Shapiro-Wilk (Univariat)

Sebagai verifikasi tambahan, uji normalitas univariat dilakukan per variabel:

# Shapiro-Wilk test for one variable of `df`, returned as a one-row table
# with the W statistic, the p-value (both rounded to five decimals) and a
# pass/fail label at the 0.05 level.
shapiro_row <- function(v) {
  sw <- shapiro.test(df[[v]])
  tibble(
    Variabel  = v,
    W         = round(sw$statistic, 5),
    `p-value` = round(sw$p.value, 5),
    Status    = ifelse(sw$p.value >= 0.05, "Normal ✔", "Tidak Normal ✘")
  )
}

# One row per dependent variable, stacked in DVS order.
sw_results <- bind_rows(lapply(DVS, shapiro_row))

kable_styling(
  kable(sw_results, caption = "Shapiro-Wilk Test per Variabel Dependen"),
  bootstrap_options = c("striped","hover"),
  full_width = FALSE
)
Shapiro-Wilk Test per Variabel Dependen
Variabel W p-value Status
texture_mean 0.98849 0.90475 Normal ✔
smoothness_mean 0.97732 0.44547 Normal ✔
symmetry_mean 0.98450 0.75012 Normal ✔

5.4 Q-Q Plot

# Normal Q-Q plot for each dependent variable, drawn side by side.
# par() returns the previous values of the settings it changes; keeping them
# lets both the layout (mfrow) and the margins (mar) be restored afterwards.
# Previously only mfrow was reset, leaving the modified margins in effect.
old_par <- par(mfrow = c(1, 3), mar = c(4, 4, 3, 1))
for (v in DVS) {
  qqnorm(df[[v]],
         main  = paste("Q-Q Plot:", v),
         col   = "#2e86c1", pch = 19, cex = 0.8)
  qqline(df[[v]], col = "#c0392b", lwd = 2)
}

par(old_par)

6 Asumsi 4: Linearitas Covariate–DV

6.1 Penjelasan

Tujuan: Memastikan bahwa covariate (concavity_mean) memiliki hubungan linear dengan setiap DV. Jika hubungannya tidak linear, memasukkan covariate sebagai kontrol linear tidak akan efektif.

Metode: Uji korelasi Pearson antara covariate dan masing-masing DV.

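Keputusan: Jika p-value < 0.05, maka ada hubungan linear yang signifikan → asumsi terpenuhi. Karena korelasi Pearson hanya mengukur kekuatan hubungan linear, kemungkinan adanya pola non-linear tetap diperiksa secara visual melalui scatter plot (bagian 6.3).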

6.2 Uji Korelasi Pearson

# Status labels for the linearity check; shared by the table rows and by the
# colouring below so the two can never disagree.
lin_label_ok  <- "Linear Signifikan ✔"
lin_label_bad <- "Tidak Linear ✘"

# Pearson correlation test between the covariate and each DV, one row per
# DV: r, t statistic, degrees of freedom, p-value and a status label at the
# 0.05 level.
lin_results <- map_df(DVS, function(v) {
  ct <- cor.test(df[[COV]], df[[v]])
  tibble(
    DV        = v,
    r         = round(ct$estimate, 4),
    `t-stat`  = round(ct$statistic, 4),
    df        = ct$parameter,
    `p-value` = round(ct$p.value, 6),
    Status    = ifelse(ct$p.value < 0.05, lin_label_ok, lin_label_bad)
  )
})

# Colour the Status column per row (green = significant, red = not), like the
# Mardia table; it was previously green even for a failing row.
# %in% is used so that a missing status is shown in red rather than as NA.
lin_status_color <- ifelse(lin_results$Status %in% lin_label_ok,
                           "#1e8449", "#c0392b")

kable(lin_results, caption = "Hasil Uji Linearitas: Covariate vs DV") %>%
  kable_styling(bootstrap_options = c("striped","hover"), full_width = FALSE) %>%
  column_spec(6, bold = TRUE, color = lin_status_color)
Hasil Uji Linearitas: Covariate vs DV
DV r t-stat df p-value Status
texture_mean 0.3750 2.8024 48 0.007294 Linear Signifikan ✔
smoothness_mean 0.4534 3.5247 48 0.000943 Linear Signifikan ✔
symmetry_mean 0.4436 3.4293 48 0.001252 Linear Signifikan ✔

Interpretasi: Semua p-value < 0.05 → Ketiga DV memiliki hubungan linear signifikan dengan covariate concavity_mean.
✔ ASUMSI 4 TERPENUHI

6.3 Scatter Plot

# Long format over the DVs only; the covariate stays as its own column so it
# can serve as the shared x-axis.
df_scatter <- pivot_longer(
  df,
  cols      = all_of(DVS),
  names_to  = "DV",
  values_to = "Nilai_DV"
)

scatter_colors <- c("Malignant" = "#c0392b", "Benign" = "#2e86c1")

# Points are coloured by diagnosis; group = 1 makes geom_smooth fit a single
# regression line across both groups in each facet.
ggplot(df_scatter, aes(x = concavity_mean, y = Nilai_DV, color = diagnosis)) +
  geom_point(alpha = 0.7, size = 2.2) +
  geom_smooth(
    mapping   = aes(group = 1),
    method    = "lm",
    se        = TRUE,
    color     = "#2c3e50",
    linewidth = 1,
    linetype  = "dashed"
  ) +
  facet_wrap(~DV, scales = "free_y", ncol = 3) +
  scale_color_manual(values = scatter_colors) +
  labs(
    title    = "Linearitas: concavity_mean vs Setiap DV",
    subtitle = "Garis regresi linear keseluruhan ditampilkan",
    x        = "concavity_mean (Covariate)",
    y        = "Nilai DV",
    color    = "Diagnosis"
  ) +
  theme(legend.position = "bottom")


7 Asumsi 5: Independensi Observasi

7.1 Penjelasan & Evaluasi

Tujuan: Memastikan bahwa setiap observasi bersifat independen — nilai satu observasi tidak mempengaruhi atau dipengaruhi oleh observasi lainnya.

Catatan: Asumsi ini tidak diuji secara statistik, melainkan dievaluasi berdasarkan desain pengumpulan data.

Evaluasi independensi observasi berdasarkan struktur dataset:

Kriteria Evaluasi Status
Unit observasi berbeda Setiap baris mewakili pasien yang berbeda (bukan repeated measures)
Tidak ada tumpang tindih antar grup Setiap pasien hanya termasuk satu grup (Malignant ATAU Benign)
Proses sampling tidak sistematis Fixed-index sampling tanpa dependensi antar baris
Sumber data independen Data Wisconsin BC dari UCI: setiap observasi dikumpulkan secara independen

Kesimpulan: Berdasarkan desain studi dan sumber data, tidak ada indikasi dependensi antar observasi. Setiap entri data mewakili satu pasien unik, tanpa pengukuran berulang maupun pengelompokan bersarang.
✔ ASUMSI 5 TERPENUHI


8 Ringkasan Akhir

# Final summary of the five assumption checks.
# Decisions and statuses are derived from the test results computed earlier
# (sphericity, bm, p_skew, p_kurt, lin_results) instead of being hard-coded,
# so the table cannot report "TERPENUHI" for a test that actually failed.
sig_level <- 0.05

# Whether H0 was rejected for each test. NA marks the design-based
# independence assumption, which has no test.
#   1. Bartlett   : rejected when p < sig_level
#   2. Box's M    : rejected when p < sig_level
#   3. Mardia     : counted as rejected when either component is significant
#   4. Pearson    : counted as rejected only when every DV is significant
h0_rejected <- c(
  sphericity$p.value < sig_level,
  as.numeric(bm$p.value) < sig_level,
  any(c(p_skew, p_kurt) < sig_level),
  max(lin_results[["p-value"]]) < sig_level,
  NA
)

# Assumptions 1 and 4 need H0 rejected, 2 and 3 need it retained, and 5 is
# accepted from the study design.
assumption_met <- c(
  h0_rejected[1],
  !h0_rejected[2],
  !h0_rejected[3],
  h0_rejected[4],
  TRUE
)

summary_df <- tibble(
  No      = 1:5,
  Asumsi  = c("Dependensi antar DV",
               "Homogenitas Matriks Kovarians",
               "Normalitas Multivariat",
               "Linearitas Covariate–DV",
               "Independensi Observasi"),
  Metode  = c("Bartlett's Test of Sphericity",
               "Box's M Test",
               "Mardia's Test",
               "Pearson Correlation",
               "Evaluasi Desain"),
  `p-value` = c(
    format(sphericity$p.value, scientific = TRUE, digits = 3),
    as.character(round(bm$p.value, 4)),
    paste0("skew=", round(p_skew,4), " / kurt=", round(p_kurt,4)),
    paste0("maks=", round(max(lin_results[["p-value"]]), 4)),
    "N/A"
  ),
  Keputusan = ifelse(is.na(h0_rejected), "—",
                     ifelse(h0_rejected, "H₀ Ditolak", "H₀ Gagal Ditolak")),
  Status    = ifelse(assumption_met, "✔ TERPENUHI", "✘ TIDAK TERPENUHI")
)

# Status column in bold: green when the assumption is met, red otherwise.
# %in% keeps an undetermined (NA) status red instead of passing NA on.
status_color <- ifelse(assumption_met %in% TRUE, "#1e8449", "#c0392b")

kable(summary_df,
      caption = "Ringkasan Uji Asumsi MANCOVA — Breast Cancer Dataset") %>%
  kable_styling(bootstrap_options = c("striped","hover","bordered"),
                full_width = TRUE) %>%
  column_spec(6, bold = TRUE, color = status_color) %>%
  row_spec(0, bold = TRUE, background = "#1a5276", color = "white")
Ringkasan Uji Asumsi MANCOVA — Breast Cancer Dataset
No Asumsi Metode p-value Keputusan Status
1 Dependensi antar DV Bartlett’s Test of Sphericity 2.42e-05 H₀ Ditolak ✔ TERPENUHI
2 Homogenitas Matriks Kovarians Box’s M Test 0.7418 H₀ Gagal Ditolak ✔ TERPENUHI
3 Normalitas Multivariat Mardia’s Test skew=0.6404 / kurt=0.2841 H₀ Gagal Ditolak ✔ TERPENUHI
4 Linearitas Covariate–DV Pearson Correlation maks=0.0073 H₀ Ditolak ✔ TERPENUHI
5 Independensi Observasi Evaluasi Desain N/A — ✔ TERPENUHI

8.1 Kesimpulan

Seluruh 5 asumsi MANCOVA telah terpenuhi berdasarkan pengujian yang dilakukan:

  1. DV-DV saling berkorelasi signifikan (Bartlett’s Sphericity, χ² = 24.0657, df = 3, p = 2.42e-05 < 0.05)
  2. Matriks kovarians antar grup homogen (Box’s M, p = 0.7418)
  3. Data berdistribusi normal multivariat (Mardia skewness p = 0.6404, kurtosis p = 0.2841)
  4. Covariate concavity_mean memiliki hubungan linear dengan semua DV (semua p < 0.05)
  5. Observasi bersifat independen berdasarkan desain studi

Dataset dan variabel yang dipilih layak untuk dianalisis menggunakan MANCOVA.