library(MASS)
data(crabs)
str(crabs)
## 'data.frame': 200 obs. of 8 variables:
## $ sp : Factor w/ 2 levels "B","O": 1 1 1 1 1 1 1 1 1 1 ...
## $ sex : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
## $ index: int 1 2 3 4 5 6 7 8 9 10 ...
## $ FL : num 8.1 8.8 9.2 9.6 9.8 10.8 11.1 11.6 11.8 11.8 ...
## $ RW : num 6.7 7.7 7.8 7.9 8 9 9.9 9.1 9.6 10.5 ...
## $ CL : num 16.1 18.1 19 20.1 20.3 23 23.8 24.5 24.2 25.2 ...
## $ CW : num 19 20.8 22.4 23.1 23 26.5 27.1 28.4 27.8 29.3 ...
## $ BD : num 7 7.4 7.7 8.2 8.2 9.8 9.8 10.4 9.7 10.3 ...
Data set yang dipakai adalah dataset bawaan Rstudio yang menyediakan data kelompok spesies kepiting B dan O. Diberikan beberapa variabel mengenai fisik masing-masing kepiting yang dapat menjadi determinasi jenis spesies. Akan dilakukan analisis diskriminan untuk mengetahui apakah pengelompokan sudah benar atau tidak.
summary(crabs)
## sp sex index FL RW CL
## B:100 F:100 Min. : 1.0 Min. : 7.20 Min. : 6.50 Min. :14.70
## O:100 M:100 1st Qu.:13.0 1st Qu.:12.90 1st Qu.:11.00 1st Qu.:27.27
## Median :25.5 Median :15.55 Median :12.80 Median :32.10
## Mean :25.5 Mean :15.58 Mean :12.74 Mean :32.11
## 3rd Qu.:38.0 3rd Qu.:18.05 3rd Qu.:14.30 3rd Qu.:37.23
## Max. :50.0 Max. :23.10 Max. :20.20 Max. :47.60
## CW BD
## Min. :17.10 Min. : 6.10
## 1st Qu.:31.50 1st Qu.:11.40
## Median :36.80 Median :13.90
## Mean :36.41 Mean :14.03
## 3rd Qu.:42.00 3rd Qu.:16.60
## Max. :54.60 Max. :21.60
table(crabs$sp)
##
## B O
## 100 100
table(crabs$sex)
##
## F M
## 100 100
table(crabs$sp, crabs$sex)
##
## F M
## B 50 50
## O 50 50
pairs(crabs[,4:8], col = crabs$sp)
pairs(crabs[,4:8], col = crabs$sex)
prediktor <- crabs[, c("FL", "RW", "CL", "CW", "BD")]
target <- crabs$sp
library(MASS)
data(crabs)
# pilih prediktor numerik
X <- crabs[, c("FL", "RW", "CL", "CW", "BD")]
# target klasifikasi
y <- crabs$sp
# pisahkan data per kelas
X_B <- X[y == "B", ]
X_O <- X[y == "O", ]
mean_B <- colMeans(X_B)
mean_O <- colMeans(X_O)
mean_B
## FL RW CL CW BD
## 14.056 11.928 30.058 34.717 12.583
mean_O
## FL RW CL CW BD
## 17.110 13.549 34.153 38.112 15.478
S_B <- cov(X_B)
S_O <- cov(X_O)
S_B
## FL RW CL CW BD
## FL 9.118044 6.179527 20.74157 23.64237 9.158436
## RW 6.179527 5.195168 14.10301 16.21093 6.322804
## CL 20.741568 14.103006 47.64731 54.22052 21.004935
## CW 23.642372 16.210933 54.22052 61.87456 23.952615
## BD 9.158436 6.322804 21.00494 23.95262 9.411930
S_O
## FL RW CL CW BD
## FL 10.729394 7.718697 21.90159 24.49089 10.140828
## RW 7.718697 6.788787 15.41899 17.67314 7.059574
## CL 21.901586 15.418993 45.75524 50.84400 21.192592
## CW 24.490889 17.673143 50.84400 56.86551 23.530772
## BD 10.140828 7.059574 21.19259 23.53077 9.931834
n1 <- nrow(X_B)
n2 <- nrow(X_O)
S_pooled <- ((n1-1)*S_B + (n2-1)*S_O) / (n1 + n2 - 2)
S_pooled
## FL RW CL CW BD
## FL 9.923719 6.949112 21.32158 24.06663 9.649632
## RW 6.949112 5.991977 14.76100 16.94204 6.691189
## CL 21.321577 14.760999 46.70128 52.53226 21.098764
## CW 24.066630 16.942038 52.53226 59.37003 23.741693
## BD 9.649632 6.691189 21.09876 23.74169 9.671882
S_inv <- solve(S_pooled)
S_inv
## FL RW CL CW BD
## FL 6.6496836 -0.78570935 -1.377102 -1.00586881 -0.61761508
## RW -0.7857094 1.18515072 1.346667 -1.20284545 -0.02105825
## CL -1.3771017 1.34666658 7.375313 -5.16749631 -2.96190082
## CW -1.0058688 -1.20284545 -5.167496 5.34570320 -0.01380633
## BD -0.6176151 -0.02105825 -2.961901 -0.01380633 7.22929537
a <- S_inv %*% (mean_B - mean_O)
a
## [,1]
## FL -8.1923472
## RW -0.8915487
## CL -2.0608334
## CW 8.0739401
## BD -6.8326219
h <- 0.5 * t(a) %*% (mean_B + mean_O)
h
## [,1]
## [1,] -7.039036
score_manual <- as.matrix(X) %*% a
head(score_manual)
## [,1]
## 1 0.06570248
## 2 1.11788738
## 3 6.76556117
## 4 3.36799779
## 5 0.42081278
## 6 3.09926207
h <- as.numeric(0.5 * t(a) %*% (mean_B + mean_O))
pred_manual <- ifelse(score_manual >= h, "B", "O")
pred_manual <- factor(pred_manual, levels = c("B","O"))
table(Actual = y, Pred = pred_manual)
## Pred
## Actual B O
## B 100 0
## O 0 100
lda_model <- lda(sp ~ FL + RW + CL + CW + BD, data = crabs)
lda_model
## Call:
## lda(sp ~ FL + RW + CL + CW + BD, data = crabs)
##
## Prior probabilities of groups:
## B O
## 0.5 0.5
##
## Group means:
## FL RW CL CW BD
## B 14.056 11.928 30.058 34.717 12.583
## O 17.110 13.549 34.153 38.112 15.478
##
## Coefficients of linear discriminants:
## LD1
## FL 1.5687028
## RW 0.1707172
## CL 0.3946165
## CW -1.5460297
## BD 1.3083372
pred_mass <- predict(lda_model)$class
table(Actual = y, Pred = pred_mass)
## Pred
## Actual B O
## B 100 0
## O 0 100
Dari kedua metode, didapatkan analisis diskriminan yang sesuai dengan data yang ada. Tidak ada pengamatan yang statusnya berbeda setelah melalui analisis diskriminan. Hal ini disebabkan oleh kondisi fisik spesies B dan O yang sangat berbeda di setiap variabel. Sehingga semua data dikelompokkan dengan benar.
library(MASS)
# Model LDA crabs
model <- lda(sp ~ FL + RW + CL + CW + BD, data = crabs)
# =============================
# 1. Mean masing-masing kelas
# =============================
mu_B <- as.numeric(model$means["B", ])
mu_O <- as.numeric(model$means["O", ])
mu_B
## [1] 14.056 11.928 30.058 34.717 12.583
mu_O
## [1] 17.110 13.549 34.153 38.112 15.478
# =================================================
# 2. Koefisien per kelas: b_k = S_inv %*% mu_k
# =================================================
b_B <- S_inv %*% mu_B
b_O <- S_inv %*% mu_O
b_B
## [,1]
## FL 0.01089126
## RW 1.54648977
## CL 1.72410001
## CW 1.60241658
## BD -7.47428589
b_O
## [,1]
## FL 8.203238
## RW 2.438038
## CL 3.784933
## CW -6.471524
## BD -0.641664
# =====================================================
# 3. Konstanta kelas: c_k = -1/2 * mu' S_inv mu + log(prior)
# =====================================================
prior <- model$prior # prior otomatis dari MASS
c_B <- -0.5 * t(mu_B) %*% S_inv %*% mu_B + log(prior["B"])
c_O <- -0.5 * t(mu_O) %*% S_inv %*% mu_O + log(prior["O"])
c_B
## [,1]
## [1,] -16.69553
c_O
## [,1]
## [1,] -23.73457
# ============================
# 4. Fungsi Diskriminan B
# ============================
a_B <- c(c_B, b_B)
a_B
## [1] -16.69553350 0.01089126 1.54648977 1.72410001 1.60241658
## [6] -7.47428589
# ============================
# 5. Fungsi Diskriminan O
# ============================
a_O <- c(c_O, b_O)
a_O
## [1] -23.734569 8.203238 2.438038 3.784933 -6.471524 -0.641664