# Bersihkan environment
rm(list = ls())

# Panggil package
library(dplyr)
## Warning: package 'dplyr' was built under R version 4.5.2
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.5.2
# ================================
# 1. IMPORT DATA
# ================================
df <- read.csv("C:/Pengantar Proses Stokastik/data_kedatangan_pelanggan_per_jam.csv", sep=";")

# Cek struktur
cat("=== STRUKTUR DATA ===\n")
## === STRUKTUR DATA ===
head(df)
##      tanggal jam_mulai jam_selesai jumlah_pelanggan
## 1 2026-01-06     10:00       11:00                9
## 2 2026-01-06     11:00       12:00               33
## 3 2026-01-06     12:00       13:00               26
## 4 2026-01-06     13:00       14:00               17
## 5 2026-01-07     10:00       11:00               10
## 6 2026-01-07     11:00       12:00               19
str(df)
## 'data.frame':    20 obs. of  4 variables:
##  $ tanggal         : chr  "2026-01-06" "2026-01-06" "2026-01-06" "2026-01-06" ...
##  $ jam_mulai       : chr  "10:00" "11:00" "12:00" "13:00" ...
##  $ jam_selesai     : chr  "11:00" "12:00" "13:00" "14:00" ...
##  $ jumlah_pelanggan: int  9 33 26 17 10 19 21 14 12 24 ...
# ================================
# 2. DESKRIPSI DATA
# ================================
n_data <- nrow(df)
min_x <- min(df$jumlah_pelanggan)
max_x <- max(df$jumlah_pelanggan)
mean_x <- mean(df$jumlah_pelanggan)
median_x <- median(df$jumlah_pelanggan)
var_x <- var(df$jumlah_pelanggan)
sd_x <- sd(df$jumlah_pelanggan)

cat("\n=== DESKRIPSI DATA ===\n")
## 
## === DESKRIPSI DATA ===
cat("Jumlah data =", n_data, "\n")
## Jumlah data = 20
cat("Min =", min_x, "\n")
## Min = 9
cat("Max =", max_x, "\n")
## Max = 35
cat("Mean =", mean_x, "\n")
## Mean = 19.3
cat("Median =", median_x, "\n")
## Median = 18
cat("Varians =", var_x, "\n")
## Varians = 50.95789
cat("SD =", sd_x, "\n")
## SD = 7.13848
summary(df$jumlah_pelanggan)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    9.00   14.00   18.00   19.30   23.25   35.00
# ================================
# 3. VISUALISASI
# ================================
hist(df$jumlah_pelanggan,
     main="Histogram Jumlah Pelanggan",
     xlab="Jumlah pelanggan")

boxplot(df$jumlah_pelanggan,
        main="Boxplot Jumlah Pelanggan",
        ylab="Jumlah pelanggan")

# ================================
# 4. PMF EMPIRIS
# ================================
pmf_emp <- df %>%
  count(jumlah_pelanggan) %>%
  mutate(p_emp = n / sum(n)) %>%
  arrange(jumlah_pelanggan)

cat("\n=== PMF EMPIRIS ===\n")
## 
## === PMF EMPIRIS ===
print(pmf_emp)
##    jumlah_pelanggan n p_emp
## 1                 9 1  0.05
## 2                10 1  0.05
## 3                12 1  0.05
## 4                13 1  0.05
## 5                14 3  0.15
## 6                16 2  0.10
## 7                17 1  0.05
## 8                19 1  0.05
## 9                21 1  0.05
## 10               22 1  0.05
## 11               23 2  0.10
## 12               24 1  0.05
## 13               25 1  0.05
## 14               26 1  0.05
## 15               33 1  0.05
## 16               35 1  0.05
# ================================
# 5. ESTIMASI PARAMETER POISSON
# ================================
lambda_hat <- mean_x

cat("\n=== ESTIMASI PARAMETER ===\n")
## 
## === ESTIMASI PARAMETER ===
cat("Lambda (λ^) =", lambda_hat, "\n")
## Lambda (λ^) = 19.3
# ================================
# 6. UJI DIAGNOSTIK POISSON
# ================================
cat("\n=== CEK POISSON ===\n")
## 
## === CEK POISSON ===
cat("Mean =", mean_x, "\n")
## Mean = 19.3
cat("Varians =", var_x, "\n")
## Varians = 50.95789
dispersion_index <- var_x / mean_x
cat("Dispersion Index =", dispersion_index, "\n")
## Dispersion Index = 2.640305
if (dispersion_index > 1.2) {
  cat("Indikasi: OVERDISPERSION\n")
} else if (dispersion_index < 0.8) {
  cat("Indikasi: UNDERDISPERSION\n")
} else {
  cat("Indikasi: Mendekati Poisson\n")
}
## Indikasi: OVERDISPERSION
# ================================
# 7. PERBANDINGAN DENGAN POISSON
# ================================
df_pois <- data.frame(
  k = 0:max(df$jumlah_pelanggan),
  prob = dpois(0:max(df$jumlah_pelanggan), lambda = lambda_hat)
)

ggplot(df, aes(x = jumlah_pelanggan)) +
  geom_histogram(aes(y = after_stat(density)),
                 binwidth = 1, boundary = -0.5) +
  geom_point(data = df_pois, aes(x = k, y = prob), color="red") +
  geom_line(data = df_pois, aes(x = k, y = prob), color="red") +
  labs(title = "Perbandingan Data dengan Distribusi Poisson",
       x = "Jumlah pelanggan",
       y = "Probabilitas")

# ================================
# 8. SIMULASI INTERARRIVAL (TEORI)
# ================================
cat("\n=== SIMULASI INTERARRIVAL ===\n")
## 
## === SIMULASI INTERARRIVAL ===
set.seed(123)
m <- 2000
interarrival_sim <- rexp(m, rate = lambda_hat)

cat("Mean teoritis =", 1/lambda_hat, "\n")
## Mean teoritis = 0.05181347
cat("Mean simulasi =", mean(interarrival_sim), "\n")
## Mean simulasi = 0.05242371
ggplot(data.frame(t = interarrival_sim), aes(x = t)) +
  geom_histogram(aes(y = after_stat(density)), bins = 30) +
  labs(title = "Simulasi Waktu Antar Kedatangan (Eksponensial)",
       x = "Waktu antar kedatangan",
       y = "Kepadatan")

# ================================
# 9. CEK HETEROGENITAS (PER JAM)
# ================================
df_hour <- df %>%
  group_by(jam_mulai) %>%
  summarise(
    mean_per_jam = mean(jumlah_pelanggan),
    var_per_jam = var(jumlah_pelanggan),
    n = n(),
    .groups = "drop"
  )

cat("\n=== RINGKASAN PER JAM ===\n")
## 
## === RINGKASAN PER JAM ===
print(df_hour)
## # A tibble: 4 × 4
##   jam_mulai mean_per_jam var_per_jam     n
##   <chr>            <dbl>       <dbl> <int>
## 1 10:00             11.6         4.3     5
## 2 11:00             22.6        49.3     5
## 3 12:00             26          29       5
## 4 13:00             17           9       5
ggplot(df_hour, aes(x = jam_mulai, y = mean_per_jam)) +
  geom_col() +
  labs(title = "Rata-rata Pelanggan per Jam",
       x = "Jam mulai",
       y = "Mean pelanggan")