# ============================================================
# Algoritma Expectation-Maximization (EM) - Masalah 2 Koin
# Parameter: theta_A = 0.7, theta_B = 0.4
# Dataset: 5 set pelemparan koin 
# ============================================================

# ----------------------------------------------------------
# DATA OBSERVASI
# Setiap baris: c(jumlah_H, jumlah_T)
# ----------------------------------------------------------
data_observasi <- list(
  set1 = c(H = 5, T = 5),
  set2 = c(H = 9, T = 1),
  set3 = c(H = 8, T = 2),
  set4 = c(H = 4, T = 6),
  set5 = c(H = 7, T = 3)
)

# ----------------------------------------------------------
# PARAMETER AWAL
# Slide menggunakan: theta_A = 0.6, theta_B = 0.5
# Tes ini gunakan: theta_A = 0.7, theta_B = 0.4
# ----------------------------------------------------------
theta_A_init <- 0.7
theta_B_init <- 0.4

# ----------------------------------------------------------
# FUNGSI E-STEP
# Hitung probabilitas setiap set berasal dari koin A atau B
# ----------------------------------------------------------
e_step <- function(data, theta_A, theta_B) {
  hasil <- lapply(data, function(set) {
    H <- set["H"]
    T <- set["T"]
    n <- H + T
    
    # Likelihood set ini dari koin A dan koin B
    L_A <- theta_A^H * (1 - theta_A)^T
    L_B <- theta_B^H * (1 - theta_B)^T
    
    # Probabilitas posterior (asumsikan prior seimbang P(A)=P(B)=0.5)
    denom <- L_A + L_B
    p_A <- L_A / denom
    p_B <- L_B / denom
    
    # Estimasi kontribusi H dan T untuk masing-masing koin
    H_A_est <- p_A * H
    T_A_est <- p_A * T
    H_B_est <- p_B * H
    T_B_est <- p_B * T
    
    list(p_A = p_A, p_B = p_B,
         H_A = H_A_est, T_A = T_A_est,
         H_B = H_B_est, T_B = T_B_est)
  })
  return(hasil)
}

# ----------------------------------------------------------
# FUNGSI M-STEP
# Update parameter theta_A dan theta_B
# ----------------------------------------------------------
m_step <- function(e_hasil) {
  total_H_A <- sum(sapply(e_hasil, function(x) x$H_A))
  total_T_A <- sum(sapply(e_hasil, function(x) x$T_A))
  total_H_B <- sum(sapply(e_hasil, function(x) x$H_B))
  total_T_B <- sum(sapply(e_hasil, function(x) x$T_B))
  
  theta_A_baru <- total_H_A / (total_H_A + total_T_A)
  theta_B_baru <- total_H_B / (total_H_B + total_T_B)
  
  list(theta_A = theta_A_baru, theta_B = theta_B_baru,
       total_H_A = total_H_A, total_T_A = total_T_A,
       total_H_B = total_H_B, total_T_B = total_T_B)
}

# ----------------------------------------------------------
# FUNGSI EUCLIDEAN DISTANCE (cek konvergensi)
# ----------------------------------------------------------
euclidean_dist <- function(t_A_lama, t_B_lama, t_A_baru, t_B_baru) {
  sqrt((t_A_baru - t_A_lama)^2 + (t_B_baru - t_B_lama)^2)
}

# ----------------------------------------------------------
# JALANKAN ALGORITMA EM
# ----------------------------------------------------------
cat("==========================================================\n")
## ==========================================================
cat("  ALGORITMA EM - MASALAH 2 KOIN\n")
##   ALGORITMA EM - MASALAH 2 KOIN
cat("==========================================================\n")
## ==========================================================
cat(sprintf("  Parameter Awal : theta_A = %.1f | theta_B = %.1f\n",
            theta_A_init, theta_B_init))
##   Parameter Awal : theta_A = 0.7 | theta_B = 0.4
cat("  (Slide contoh  : theta_A = 0.6 | theta_B = 0.5)\n")
##   (Slide contoh  : theta_A = 0.6 | theta_B = 0.5)
cat("\n")
# Tampilkan data observasi
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
cat("  DATA OBSERVASI\n")
##   DATA OBSERVASI
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
for (i in seq_along(data_observasi)) {
  s <- data_observasi[[i]]
  cat(sprintf("  Set %d : %dH %dT  (total %d lemparan)\n",
              i, s["H"], s["T"], s["H"] + s["T"]))
}
##   Set 1 : 5H 5T  (total 10 lemparan)
##   Set 2 : 9H 1T  (total 10 lemparan)
##   Set 3 : 8H 2T  (total 10 lemparan)
##   Set 4 : 4H 6T  (total 10 lemparan)
##   Set 5 : 7H 3T  (total 10 lemparan)
cat("\n")
# Inisialisasi
theta_A <- theta_A_init
theta_B <- theta_B_init
max_iter <- 100
tol <- 1e-6
riwayat <- data.frame(iter = integer(), theta_A = numeric(),
                       theta_B = numeric(), delta = numeric())

for (iter in 1:max_iter) {
  
  # ---- E-STEP ----
  e_hasil <- e_step(data_observasi, theta_A, theta_B)
  
  # ---- M-STEP ----
  m_hasil <- m_step(e_hasil)
  theta_A_baru <- m_hasil$theta_A
  theta_B_baru <- m_hasil$theta_B
  
  # Hitung perubahan (Euclidean Distance)
  delta <- euclidean_dist(theta_A, theta_B, theta_A_baru, theta_B_baru)
  
  # Simpan riwayat
  riwayat <- rbind(riwayat, data.frame(
    iter = iter,
    theta_A = round(theta_A_baru, 6),
    theta_B = round(theta_B_baru, 6),
    delta = round(delta, 8)
  ))
  
  # Cetak detail iterasi (5 iterasi pertama + saat konvergen)
  if (iter <= 5 || delta < tol) {
    cat(sprintf("  === ITERASI %d ===\n", iter))
    cat("  [E-Step] Probabilitas & Estimasi per Set:\n")
    cat(sprintf("  %-6s %-8s %-8s %-8s %-8s %-8s %-8s\n",
                "Set", "P(A)", "P(B)", "H_A", "T_A", "H_B", "T_B"))
    for (i in seq_along(e_hasil)) {
      e <- e_hasil[[i]]
      cat(sprintf("  %-6d %-8.4f %-8.4f %-8.4f %-8.4f %-8.4f %-8.4f\n",
                  i, e$p_A, e$p_B, e$H_A, e$T_A, e$H_B, e$T_B))
    }
    cat(sprintf("\n  [M-Step] Total H_A=%.4f T_A=%.4f | H_B=%.4f T_B=%.4f\n",
                m_hasil$total_H_A, m_hasil$total_T_A,
                m_hasil$total_H_B, m_hasil$total_T_B))
    cat(sprintf("  [Update] theta_A: %.4f -> %.6f | theta_B: %.4f -> %.6f\n",
                theta_A, theta_A_baru, theta_B, theta_B_baru))
    cat(sprintf("  [Delta]  Euclidean Distance = %.8f\n\n", delta))
  }
  
  # Update
  theta_A <- theta_A_baru
  theta_B <- theta_B_baru
  
  # Cek konvergensi
  if (delta < tol) {
    cat(sprintf("  >> KONVERGEN pada iterasi ke-%d (delta < %.0e)\n\n", iter, tol))
    break
  }
}
##   === ITERASI 1 ===
##   [E-Step] Probabilitas & Estimasi per Set:
##   Set    P(A)     P(B)     H_A      T_A      H_B      T_B     
##   1      0.3390   0.6610   1.6951   1.6951   3.3049   3.3049  
##   2      0.9872   0.0128   8.8846   0.9872   0.1154   0.0128  
##   3      0.9565   0.0435   7.6520   1.9130   0.3480   0.0870  
##   4      0.1278   0.8722   0.5113   0.7669   3.4887   5.2331  
##   5      0.8627   0.1373   6.0389   2.5881   0.9611   0.4119  
## 
##   [M-Step] Total H_A=24.7818 T_A=7.9503 | H_B=8.2182 T_B=9.0497
##   [Update] theta_A: 0.7000 -> 0.757111 | theta_B: 0.4000 -> 0.475921
##   [Delta]  Euclidean Distance = 0.09500371
## 
##   === ITERASI 2 ===
##   [E-Step] Probabilitas & Estimasi per Set:
##   Set    P(A)     P(B)     H_A      T_A      H_B      T_B     
##   1      0.1789   0.8211   0.8944   0.8944   4.1056   4.1056  
##   2      0.9680   0.0320   8.7119   0.9680   0.2881   0.0320  
##   3      0.8981   0.1019   7.1846   1.7961   0.8154   0.2039  
##   4      0.0597   0.9403   0.2387   0.3581   3.7613   5.6419  
##   5      0.7196   0.2804   5.0375   2.1589   1.9625   0.8411  
## 
##   [M-Step] Total H_A=22.0672 T_A=6.1756 | H_B=10.9328 T_B=10.8244
##   [Update] theta_A: 0.7571 -> 0.781339 | theta_B: 0.4759 -> 0.502491
##   [Delta]  Euclidean Distance = 0.03595785
## 
##   === ITERASI 3 ===
##   [E-Step] Probabilitas & Estimasi per Set:
##   Set    P(A)     P(B)     H_A      T_A      H_B      T_B     
##   1      0.1297   0.8703   0.6487   0.6487   4.3513   4.3513  
##   2      0.9589   0.0411   8.6305   0.9589   0.3695   0.0411  
##   3      0.8684   0.1316   6.9475   1.7369   1.0525   0.2631  
##   4      0.0404   0.9596   0.1617   0.2426   3.8383   5.7574  
##   5      0.6511   0.3489   4.5575   1.9532   2.4425   1.0468  
## 
##   [M-Step] Total H_A=20.9459 T_A=5.5403 | H_B=12.0541 T_B=11.4597
##   [Update] theta_A: 0.7813 -> 0.790823 | theta_B: 0.5025 -> 0.512640
##   [Delta]  Euclidean Distance = 0.01388987
## 
##   === ITERASI 4 ===
##   [E-Step] Probabilitas & Estimasi per Set:
##   Set    P(A)     P(B)     H_A      T_A      H_B      T_B     
##   1      0.1129   0.8871   0.5644   0.5644   4.4356   4.4356  
##   2      0.9550   0.0450   8.5952   0.9550   0.4048   0.0450  
##   3      0.8552   0.1448   6.8420   1.7105   1.1580   0.2895  
##   4      0.0342   0.9658   0.1368   0.2052   3.8632   5.7948  
##   5      0.6218   0.3782   4.3523   1.8653   2.6477   1.1347  
## 
##   [M-Step] Total H_A=20.4908 T_A=5.3004 | H_B=12.5092 T_B=11.6996
##   [Update] theta_A: 0.7908 -> 0.794488 | theta_B: 0.5126 -> 0.516722
##   [Delta]  Euclidean Distance = 0.00548610
## 
##   === ITERASI 5 ===
##   [E-Step] Probabilitas & Estimasi per Set:
##   Set    P(A)     P(B)     H_A      T_A      H_B      T_B     
##   1      0.1067   0.8933   0.5337   0.5337   4.4663   4.4663  
##   2      0.9533   0.0467   8.5799   0.9533   0.4201   0.0467  
##   3      0.8496   0.1504   6.7967   1.6992   1.2033   0.3008  
##   4      0.0320   0.9680   0.1280   0.1920   3.8720   5.8080  
##   5      0.6097   0.3903   4.2680   1.8291   2.7320   1.1709  
## 
##   [M-Step] Total H_A=20.3062 T_A=5.2073 | H_B=12.6938 T_B=11.7927
##   [Update] theta_A: 0.7945 -> 0.795901 | theta_B: 0.5167 -> 0.518399
##   [Delta]  Euclidean Distance = 0.00219320
## 
##   === ITERASI 14 ===
##   [E-Step] Probabilitas & Estimasi per Set:
##   Set    P(A)     P(B)     H_A      T_A      H_B      T_B     
##   1      0.1030   0.8970   0.5150   0.5150   4.4850   4.4850  
##   2      0.9520   0.0480   8.5681   0.9520   0.4319   0.0480  
##   3      0.8455   0.1545   6.7640   1.6910   1.2360   0.3090  
##   4      0.0307   0.9693   0.1228   0.1842   3.8772   5.8158  
##   5      0.6015   0.3985   4.2105   1.8045   2.7895   1.1955  
## 
##   [M-Step] Total H_A=20.1805 T_A=5.1468 | H_B=12.8195 T_B=11.8532
##   [Update] theta_A: 0.7968 -> 0.796789 | theta_B: 0.5196 -> 0.519583
##   [Delta]  Euclidean Distance = 0.00000065
## 
##   >> KONVERGEN pada iterasi ke-14 (delta < 1e-06)
# ----------------------------------------------------------
# HASIL AKHIR
# ----------------------------------------------------------
cat("==========================================================\n")
## ==========================================================
cat("  HASIL AKHIR ALGORITMA EM\n")
##   HASIL AKHIR ALGORITMA EM
cat("==========================================================\n")
## ==========================================================
cat(sprintf("  theta_A (prob H koin A) = %.6f\n", theta_A))
##   theta_A (prob H koin A) = 0.796789
cat(sprintf("  theta_B (prob H koin B) = %.6f\n", theta_B))
##   theta_B (prob H koin B) = 0.519583
cat("\n")
# Perbandingan dengan contoh slide
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
cat("  PERBANDINGAN HASIL\n")
##   PERBANDINGAN HASIL
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
cat(sprintf("  %-20s %-12s %-12s\n", "Parameter", "theta_A", "theta_B"))
##   Parameter            theta_A      theta_B
cat(sprintf("  %-20s %-12.4f %-12.4f\n", "Slide (0.6 / 0.5)", 0.7965, 0.5173))
##   Slide (0.6 / 0.5)    0.7965       0.5173
cat(sprintf("  %-20s %-12.6f %-12.6f\n", "Tes ini (0.7 / 0.4)", theta_A, theta_B))
##   Tes ini (0.7 / 0.4)  0.796789     0.519583
cat("\n")
# ----------------------------------------------------------
# KESIMPULAN IDENTIFIKASI KOIN PER SET
# ----------------------------------------------------------
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
cat("  KESIMPULAN IDENTIFIKASI KOIN\n")
##   KESIMPULAN IDENTIFIKASI KOIN
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
e_final <- e_step(data_observasi, theta_A, theta_B)
cat(sprintf("  %-6s %-12s %-8s %-8s %-20s\n",
            "Set", "Hasil", "P(Koin A)", "P(Koin B)", "Kesimpulan"))
##   Set    Hasil        P(Koin A) P(Koin B) Kesimpulan
for (i in seq_along(e_final)) {
  s <- data_observasi[[i]]
  e <- e_final[[i]]
  label <- ifelse(e$p_A > e$p_B, "Koin A", "Koin B")
  cat(sprintf("  %-6d %dH %dT      %-8.4f %-8.4f %-20s\n",
              i, s["H"], s["T"], e$p_A, e$p_B, label))
}
##   1      5H 5T      0.1030   0.8970   Koin B              
##   2      9H 1T      0.9520   0.0480   Koin A              
##   3      8H 2T      0.8455   0.1545   Koin A              
##   4      4H 6T      0.0307   0.9693   Koin B              
##   5      7H 3T      0.6015   0.3985   Koin A
cat("\n")
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
cat("  INSIGHT\n")
##   INSIGHT
cat("----------------------------------------------------------\n")
## ----------------------------------------------------------
cat("  1. Dengan theta_A=0.7 (bias ke H) dan theta_B=0.4 (bias ke T),\n")
##   1. Dengan theta_A=0.7 (bias ke H) dan theta_B=0.4 (bias ke T),
cat("     pemisahan antara koin A dan B lebih tegas dibanding contoh.\n")
##      pemisahan antara koin A dan B lebih tegas dibanding contoh.
cat("  2. Set dengan banyak H diatribusikan kuat ke Koin A.\n")
##   2. Set dengan banyak H diatribusikan kuat ke Koin A.
cat("  3. Set dengan banyak T diatribusikan kuat ke Koin B.\n")
##   3. Set dengan banyak T diatribusikan kuat ke Koin B.
cat("  4. Nilai akhir theta konvergen mendekati nilai 'benar' dari data.\n")
##   4. Nilai akhir theta konvergen mendekati nilai 'benar' dari data.
cat("  5. Jumlah iterasi hingga konvergen:\n")
##   5. Jumlah iterasi hingga konvergen:
cat(sprintf("     - Slide (0.6/0.5) : ~10 iterasi\n"))
##      - Slide (0.6/0.5) : ~10 iterasi
cat(sprintf("     - Tes ini  (0.7/0.4) : %d iterasi\n", nrow(riwayat)))
##      - Tes ini  (0.7/0.4) : 14 iterasi
cat("==========================================================\n")
## ==========================================================
# ----------------------------------------------------------
# TAMPILKAN RIWAYAT ITERASI RINGKAS
# ----------------------------------------------------------
cat("\n  RIWAYAT KONVERGENSI THETA:\n")
## 
##   RIWAYAT KONVERGENSI THETA:
cat(sprintf("  %-8s %-12s %-12s %-15s\n", "Iterasi", "theta_A", "theta_B", "Delta"))
##   Iterasi  theta_A      theta_B      Delta
print_rows <- unique(c(1:5, nrow(riwayat)))
for (i in print_rows) {
  r <- riwayat[i, ]
  cat(sprintf("  %-8d %-12.6f %-12.6f %-15.8f\n",
              r$iter, r$theta_A, r$theta_B, r$delta))
}
##   1        0.757111     0.475921     0.09500371     
##   2        0.781339     0.502491     0.03595785     
##   3        0.790823     0.512640     0.01388987     
##   4        0.794488     0.516722     0.00548610     
##   5        0.795901     0.518399     0.00219320     
##   14       0.796789     0.519583     0.00000065
cat("==========================================================\n")
## ==========================================================