library(MASS)
data(crabs)
str(crabs)
## 'data.frame':    200 obs. of  8 variables:
##  $ sp   : Factor w/ 2 levels "B","O": 1 1 1 1 1 1 1 1 1 1 ...
##  $ sex  : Factor w/ 2 levels "F","M": 2 2 2 2 2 2 2 2 2 2 ...
##  $ index: int  1 2 3 4 5 6 7 8 9 10 ...
##  $ FL   : num  8.1 8.8 9.2 9.6 9.8 10.8 11.1 11.6 11.8 11.8 ...
##  $ RW   : num  6.7 7.7 7.8 7.9 8 9 9.9 9.1 9.6 10.5 ...
##  $ CL   : num  16.1 18.1 19 20.1 20.3 23 23.8 24.5 24.2 25.2 ...
##  $ CW   : num  19 20.8 22.4 23.1 23 26.5 27.1 28.4 27.8 29.3 ...
##  $ BD   : num  7 7.4 7.7 8.2 8.2 9.8 9.8 10.4 9.7 10.3 ...

Data set yang dipakai adalah dataset bawaan Rstudio yang menyediakan data kelompok spesies kepiting B dan O. Diberikan beberapa variabel mengenai fisik masing-masing kepiting yang dapat menjadi determinasi jenis spesies. Akan dilakukan analisis diskriminan untuk mengetahui apakah pengelompokan sudah benar atau tidak.

Eksplorasi

summary(crabs)
##  sp      sex         index            FL              RW              CL       
##  B:100   F:100   Min.   : 1.0   Min.   : 7.20   Min.   : 6.50   Min.   :14.70  
##  O:100   M:100   1st Qu.:13.0   1st Qu.:12.90   1st Qu.:11.00   1st Qu.:27.27  
##                  Median :25.5   Median :15.55   Median :12.80   Median :32.10  
##                  Mean   :25.5   Mean   :15.58   Mean   :12.74   Mean   :32.11  
##                  3rd Qu.:38.0   3rd Qu.:18.05   3rd Qu.:14.30   3rd Qu.:37.23  
##                  Max.   :50.0   Max.   :23.10   Max.   :20.20   Max.   :47.60  
##        CW              BD       
##  Min.   :17.10   Min.   : 6.10  
##  1st Qu.:31.50   1st Qu.:11.40  
##  Median :36.80   Median :13.90  
##  Mean   :36.41   Mean   :14.03  
##  3rd Qu.:42.00   3rd Qu.:16.60  
##  Max.   :54.60   Max.   :21.60
table(crabs$sp)
## 
##   B   O 
## 100 100
table(crabs$sex)
## 
##   F   M 
## 100 100
table(crabs$sp, crabs$sex)
##    
##      F  M
##   B 50 50
##   O 50 50
pairs(crabs[,4:8], col = crabs$sp)

pairs(crabs[,4:8], col = crabs$sex)

prediktor <- crabs[, c("FL", "RW", "CL", "CW", "BD")]
target <- crabs$sp

MANUAL

library(MASS)
data(crabs)

# pilih prediktor numerik
X <- crabs[, c("FL", "RW", "CL", "CW", "BD")]

# target klasifikasi
y <- crabs$sp

# pisahkan data per kelas
X_B <- X[y == "B", ]
X_O <- X[y == "O", ]
mean_B <- colMeans(X_B)
mean_O <- colMeans(X_O)

mean_B
##     FL     RW     CL     CW     BD 
## 14.056 11.928 30.058 34.717 12.583
mean_O
##     FL     RW     CL     CW     BD 
## 17.110 13.549 34.153 38.112 15.478
S_B <- cov(X_B)
S_O <- cov(X_O)

S_B
##           FL        RW       CL       CW        BD
## FL  9.118044  6.179527 20.74157 23.64237  9.158436
## RW  6.179527  5.195168 14.10301 16.21093  6.322804
## CL 20.741568 14.103006 47.64731 54.22052 21.004935
## CW 23.642372 16.210933 54.22052 61.87456 23.952615
## BD  9.158436  6.322804 21.00494 23.95262  9.411930
S_O
##           FL        RW       CL       CW        BD
## FL 10.729394  7.718697 21.90159 24.49089 10.140828
## RW  7.718697  6.788787 15.41899 17.67314  7.059574
## CL 21.901586 15.418993 45.75524 50.84400 21.192592
## CW 24.490889 17.673143 50.84400 56.86551 23.530772
## BD 10.140828  7.059574 21.19259 23.53077  9.931834
n1 <- nrow(X_B)
n2 <- nrow(X_O)

S_pooled <- ((n1-1)*S_B + (n2-1)*S_O) / (n1 + n2 - 2)
S_pooled
##           FL        RW       CL       CW        BD
## FL  9.923719  6.949112 21.32158 24.06663  9.649632
## RW  6.949112  5.991977 14.76100 16.94204  6.691189
## CL 21.321577 14.760999 46.70128 52.53226 21.098764
## CW 24.066630 16.942038 52.53226 59.37003 23.741693
## BD  9.649632  6.691189 21.09876 23.74169  9.671882
S_inv <- solve(S_pooled)
S_inv
##            FL          RW        CL          CW          BD
## FL  6.6496836 -0.78570935 -1.377102 -1.00586881 -0.61761508
## RW -0.7857094  1.18515072  1.346667 -1.20284545 -0.02105825
## CL -1.3771017  1.34666658  7.375313 -5.16749631 -2.96190082
## CW -1.0058688 -1.20284545 -5.167496  5.34570320 -0.01380633
## BD -0.6176151 -0.02105825 -2.961901 -0.01380633  7.22929537
a <- S_inv %*% (mean_B - mean_O)
a
##          [,1]
## FL -8.1923472
## RW -0.8915487
## CL -2.0608334
## CW  8.0739401
## BD -6.8326219
h <- 0.5 * t(a) %*% (mean_B + mean_O)
h
##           [,1]
## [1,] -7.039036
score_manual <- as.matrix(X) %*% a
head(score_manual)
##         [,1]
## 1 0.06570248
## 2 1.11788738
## 3 6.76556117
## 4 3.36799779
## 5 0.42081278
## 6 3.09926207
h <- as.numeric(0.5 * t(a) %*% (mean_B + mean_O))

pred_manual <- ifelse(score_manual >= h, "B", "O")
pred_manual <- factor(pred_manual, levels = c("B","O"))
table(Actual = y, Pred = pred_manual)
##       Pred
## Actual   B   O
##      B 100   0
##      O   0 100

Dengan Fungsi

lda_model <- lda(sp ~ FL + RW + CL + CW + BD, data = crabs)
lda_model
## Call:
## lda(sp ~ FL + RW + CL + CW + BD, data = crabs)
## 
## Prior probabilities of groups:
##   B   O 
## 0.5 0.5 
## 
## Group means:
##       FL     RW     CL     CW     BD
## B 14.056 11.928 30.058 34.717 12.583
## O 17.110 13.549 34.153 38.112 15.478
## 
## Coefficients of linear discriminants:
##           LD1
## FL  1.5687028
## RW  0.1707172
## CL  0.3946165
## CW -1.5460297
## BD  1.3083372
pred_mass <- predict(lda_model)$class
table(Actual = y, Pred = pred_mass)
##       Pred
## Actual   B   O
##      B 100   0
##      O   0 100

Dari kedua metode, didapatkan analisis diskriminan yang sesuai dengan data yang ada. Tidak ada pengamatan yang statusnya berbeda setelah melalui analisis diskriminan. Hal ini disebabkan oleh kondisi fisik spesies B dan O yang sangat berbeda di setiap variabel. Sehingga semua data dikelompokkan dengan benar.

Model Masing Masing Species

library(MASS)

# Model LDA crabs
model <- lda(sp ~ FL + RW + CL + CW + BD, data = crabs)

# =============================
#   1. Mean masing-masing kelas
# =============================

mu_B <- as.numeric(model$means["B", ])
mu_O <- as.numeric(model$means["O", ])

mu_B
## [1] 14.056 11.928 30.058 34.717 12.583
mu_O
## [1] 17.110 13.549 34.153 38.112 15.478
# =================================================
#   2. Koefisien per kelas: b_k = S_inv %*% mu_k
# =================================================

b_B <- S_inv %*% mu_B
b_O <- S_inv %*% mu_O

b_B
##           [,1]
## FL  0.01089126
## RW  1.54648977
## CL  1.72410001
## CW  1.60241658
## BD -7.47428589
b_O
##         [,1]
## FL  8.203238
## RW  2.438038
## CL  3.784933
## CW -6.471524
## BD -0.641664
# =====================================================
#   3. Konstanta kelas: c_k = -1/2 * mu' S_inv mu + log(prior)
# =====================================================

prior <- model$prior    # prior otomatis dari MASS

c_B <- -0.5 * t(mu_B) %*% S_inv %*% mu_B + log(prior["B"])
c_O <- -0.5 * t(mu_O) %*% S_inv %*% mu_O + log(prior["O"])

c_B
##           [,1]
## [1,] -16.69553
c_O
##           [,1]
## [1,] -23.73457
# ============================
#   4. Fungsi Diskriminan B
# ============================

a_B <- c(c_B, b_B)
a_B
## [1] -16.69553350   0.01089126   1.54648977   1.72410001   1.60241658
## [6]  -7.47428589
# ============================
#   5. Fungsi Diskriminan O
# ============================

a_O <- c(c_O, b_O)
a_O
## [1] -23.734569   8.203238   2.438038   3.784933  -6.471524  -0.641664