Analisis outlier dengan R

IQR

# Buat data sampel
set.seed(123)
data <- data.frame(
  ID = 1:50,
  Value = c(rnorm(45, mean = 50, sd = 10), rnorm(5, mean = 100, sd = 5)) # 5 outlier
)

boxplot(data$Value)

library(ggplot2)
# Hitung IQR
Q1 <- quantile(data$Value, 0.25)
Q3 <- quantile(data$Value, 0.75)
IQR_value <- Q3 - Q1

# Tentukan batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value

# Deteksi outlier
data$Outlier <- ifelse(data$Value < lower_bound | data$Value > upper_bound, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  geom_hline(yintercept = lower_bound, linetype = "dashed", color = "red") +
  geom_hline(yintercept = upper_bound, linetype = "dashed", color = "red") +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

ID 46-50 terdeteksi sebagai outlier.

Z-Score

# Hitung z-score
z_scores <- scale(data$Value)

# Tentukan batas z-score
threshold <- 3

# Deteksi outlier
data$Outlier <- ifelse(abs(z_scores) > threshold, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  geom_hline(yintercept = mean(data$Value) + threshold * sd(data$Value), linetype = "dashed", color = "red") +
  geom_hline(yintercept = mean(data$Value) - threshold * sd(data$Value), linetype = "dashed", color = "red") +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

Dengan menggunakan z-score, ID 46-50 tidak terdeteksi sebagai outlier.

Modified Z-Score

# Hitung Modified Z-Score
median_value <- median(data$Value)
mad_value <- mad(data$Value)
mod_z_scores <- 0.6745 * (data$Value - median_value) / mad_value

# Tentukan batas Modified Z-Score
threshold <- 3

# Deteksi outlier
data$Outlier <- ifelse(abs(mod_z_scores) > threshold, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  geom_hline(yintercept = median_value + threshold * mad_value / 0.6745, linetype = "dashed", color = "red") +
  geom_hline(yintercept = median_value - threshold * mad_value / 0.6745, linetype = "dashed", color = "red") +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

Dengan menggunakan Modified Z-Score, hanya ID 49-50 yang terdeteksi sebagai outlier.

Cluster Based

# Clustering untuk deteksi outlier
library(cluster)
kmeans_result <- kmeans(data$Value, centers = 2) # Dua klaster: normal dan outlier

data$Cluster <- as.factor(kmeans_result$cluster)

# Tentukan outlier sebagai anggota cluster terkecil
cluster_sizes <- table(kmeans_result$cluster)
outlier_cluster <- as.numeric(names(cluster_sizes)[which.min(cluster_sizes)])
data$Outlier <- ifelse(data$Cluster == outlier_cluster, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

Metode berbasis klaster seperti k-means tidak menetapkan batas ambang (threshold) eksplisit seperti metode IQR atau Z-Score.

K-Nearest Neighbors

# Deteksi outlier dengan k-NN
library(FNN)

# Tentukan jumlah tetangga terdekat
k <- 5

# Hitung jarak rata-rata ke k tetangga terdekat
nn_distances <- knn.dist(data$Value, k = k)
data$Mean_Distance <- rowMeans(nn_distances)

# Tetapkan ambang batas sebagai persentil tertentu
threshold <- quantile(data$Mean_Distance, 0.95)
data$Outlier <- ifelse(data$Mean_Distance > threshold, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

Local Outlier Factor (LOF)

# remotes::install_github("cran/DMwR")
library(DMwR)
Loading required package: lattice
Loading required package: grid
Registered S3 method overwritten by 'quantmod':
  method            from
  as.zoo.data.frame zoo 
k = 5
data$LOF_Score <- lofactor(data$Value, k = k)

# Tetapkan outlier berdasarkan threshold
threshold <- quantile(data$LOF_Score, 0.95)
data$Outlier <- ifelse(data$LOF_Score > threshold, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

Pemilihan nilai k yang tepat sangat penting dalam metode LOF atau pun proximity-based lainnya. Selain itu, data tidak terlalu cocok untuk metode ini karena hanya ada satu variabel dan tidak ada pola jarak yang jelas.

Isolation Forest

library(isotree)
iso_forest <- isolation.forest(as.matrix(data$Value))
data$Isolation_Score <- predict(iso_forest, as.matrix(data$Value), type = "score")

# Tetapkan outlier berdasarkan threshold
data$Outlier <- ifelse(data$Isolation_Score > 0.5, "Yes", "No")

# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
  geom_point(size = 3) +
  labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
  theme_minimal()

Mahalanobis distance to find outliers

Mahalanobis Distance (\(D^2\)) digunakan untuk mendeteksi outlier dalam data multivariat. Formula dasarnya adalah:

\[ D^2 = (x - \mu)^T S^{-1} (x - \mu) \]

di mana:

  • \(x\) adalah vektor pengamatan.
  • \(\mu\) adalah vektor rata-rata dari data.
  • \(S\) adalah matriks kovarians dari data.
  • \(S^{-1}\) adalah invers dari matriks kovarians.

Dalam implementasi R, Mahalanobis Distance dapat dihitung menggunakan fungsi mahalanobis():

Contoh:

Manual

hw <- data.frame(Height.cm=c(164, 167, 168, 169, 169, 
                             170, 170, 170, 171, 172, 172, 173, 173, 175, 176, 178),
                 Weight.kg=c( 54,  57,  58,  60,  61,  
                              60,  61,  62,  62,  64,  62,  62,  64,  56,  66,  70)) 

is.height.outlier <- abs(scale(hw$Height.cm)) > 2
is.weight.outlier <- abs(scale(hw$Weight.kg)) > 2
pch               <- (is.height.outlier | is.weight.outlier) * 16

plot(hw, pch=pch) 

Dengan fungsi mahalanobis()

n.outliers   <- 2 # Mark as outliers the 2 most extreme points
m.dist.order <- order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing=TRUE)
is.outlier   <- rep(FALSE, nrow(hw))
is.outlier[m.dist.order[1:n.outliers]] <- TRUE
pch <- is.outlier * 16
plot(hw, pch=pch) 

n.outliers   <- 3 # Mark as outliers the 2 most extreme points
m.dist.order <- order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing=TRUE)
is.outlier   <- rep(FALSE, nrow(hw))
is.outlier[m.dist.order[1:n.outliers]] <- TRUE
pch <- is.outlier * 16
plot(hw, pch=pch)

3D Data

library(rgl)
hwa <- data.frame(Height.cm=c(164, 167, 168, 168, 169, 
                              169, 169, 170, 172, 173, 175, 176, 178),
                  Weight.kg=c( 55,  57,  58,  56,  57,  
                               61,  61,  61,  64,  62,  56,  66,  70),
                  Age      =c( 13,  12,  14,  17,  15,  14,  
                               16,  16,  13,  15,  16,  14,  16))

m.dist.order <- order(mahalanobis(hwa, colMeans(hwa), cov(hwa)), decreasing=TRUE)

is.outlier <- rep(FALSE, nrow(hwa))

is.outlier[m.dist.order[1:2]] <- TRUE # Mark as outliers the 2 most extreme points

col <- is.outlier + 1

library(scatterplot3d)
scatterplot3d(hwa$Height.cm, hwa$Weight.kg, hwa$Age, pch=16, color=col, angle=30)

Nonlinear relationships

Kurang bekerja dengan baik pada data non-linear.

my.dataframe <- data.frame(x=c(4,  8, 10, 16, 17, 22, 27, 33, 38, 40, 47, 48, 
                               53, 55, 63, 71, 76, 85, 85, 92, 96),
                           y=c(6, 22, 32, 34, 42, 51, 59, 63, 64, 69, 70, 20, 
                               70, 63, 63, 55, 46, 41, 33, 19,  6))

percentage.of.outliers        <- 10 # Mark 10% of points as outliers

number.of.outliers            <- trunc(nrow(my.dataframe) * percentage.of.outliers / 100)

m.dist                        <- mahalanobis(my.dataframe, colMeans(my.dataframe),
                                             cov(my.dataframe))

m.dist.order                  <- order(m.dist, decreasing=TRUE)

rows.not.outliers             <- m.dist.order[(number.of.outliers+1):nrow(my.dataframe)]

is.outlier                    <- rep(TRUE, nrow(my.dataframe))

is.outlier[rows.not.outliers] <- FALSE

pch                           <- is.outlier * 16
plot(x=my.dataframe$x, y=my.dataframe$y, pch=pch)

Prediction-based outlier detection in streaming time series

library(TSA)

Attaching package: 'TSA'
The following objects are masked from 'package:stats':

    acf, arima
The following object is masked from 'package:utils':

    tar
data(rivers)
plot(rivers,ylab='miles',xlab='Bulan', type = "l")

# Load library yang dibutuhkan
library(ggplot2)
library(forecast)
Registered S3 methods overwritten by 'forecast':
  method       from
  fitted.Arima TSA 
  plot.Arima   TSA 
# Gunakan dataset rivers sebagai time series
rivers_ts <- ts(rivers, frequency = 1)

# Panjang jendela tetap
window_size <- 22

# Simpan hasil prediksi dan flag outlier
predictions <- numeric()
outlier_flags <- character()
time_values <- numeric()
actual_values <- numeric()

# Rolling window untuk memproses seluruh data
i <- window_size
while (i < length(rivers_ts)) {
  # Data latih dengan jendela tetap 22
  train_data <- rivers_ts[(i - window_size + 1):i]
  
  # Bangun model ARIMA dan prediksi 1 langkah ke depan
  model <- auto.arima(train_data)
  forecast_result <- forecast(model, h = 1, level = 95)  # level 95%
  predicted_value <- forecast_result$mean[1]
  
  # Hitung standard error secara manual menggunakan interval atas
  se <- (forecast_result$upper[1, 1] - predicted_value) / 1.96
  
  # Tentukan batas berdasarkan ±2 standar error dari nilai prediksi
  lower_bound <- predicted_value - 3 * se
  upper_bound <- predicted_value + 3 * se
  
  # Data aktual berikutnya
  actual_value <- rivers_ts[i + 1]
  
  # Deteksi outlier: jika nilai aktual di luar interval [lower_bound, upper_bound]
  outlier_flag <- ifelse(actual_value < lower_bound | actual_value > upper_bound, "Yes", "No")
  
  # Simpan hasil
  predictions <- c(predictions, predicted_value)
  actual_values <- c(actual_values, actual_value)
  time_values <- c(time_values, i + 1)
  outlier_flags <- c(outlier_flags, outlier_flag)
  
  # Geser jendela satu langkah ke depan
  i <- i + 1
}

# Buat data frame hasil
result_df <- data.frame(
  Time = time_values,
  Actual = actual_values,
  Predicted = predictions,
  Outlier = outlier_flags
)

# Visualisasi hasil
p1 <- ggplot(result_df, aes(x = Time)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_line(aes(y = Predicted, color = "Predicted")) +
  geom_point(aes(y = Actual, color = Outlier), size = 2) +
  labs(title = "Deteksi Outlier dengan ARIMA",
       x = "Time", y = "Value") +
  scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red", "Yes" = "black", "No" = "green")) +
  theme_minimal()

# Tampilkan plot
p1

# Dengan IQR
# Load library yang dibutuhkan
library(ggplot2)

# Gunakan dataset rivers sebagai time series
rivers_ts <- ts(rivers, frequency = 1)

# Dengan IQR
# Tentukan panjang jendela dan frekuensi update
window_size <- 22

# Simpan hasil outlier
outlier_flags <- character()
time_values <- numeric()
actual_values <- numeric()

# Rolling window untuk memproses seluruh data
i <- window_size
while (i <= length(rivers_ts) - 1) {
  train_data <- rivers_ts[(i - window_size + 1):i]  # Gunakan jendela tetap
  
  # Hitung batas IQR dari train_data
  Q1 <- quantile(train_data, 0.25)
  Q3 <- quantile(train_data, 0.75)
  IQR_value <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR_value
  upper_bound <- Q3 + 1.5 * IQR_value
  
  actual_subset <- rivers_ts[(i + 1):(i + 1)]
  
  # Bandingkan data baru dengan threshold IQR
  outlier_flags <- c(outlier_flags, ifelse(actual_subset < lower_bound | actual_subset > upper_bound, "Yes", "No"))
  
  actual_values <- c(actual_values, actual_subset)
  time_values <- c(time_values, (i + 1):(i + 1))
  
  i <- i + 1
}

# Buat data frame hasil
result_df <- data.frame(
  Time = time_values,
  Actual = actual_values,
  Outlier = outlier_flags
)

# Visualisasi hasil
p1 <- ggplot(result_df, aes(x = Time)) +
  geom_line(aes(y = Actual, color = "Actual")) +
  geom_point(aes(y = Actual, color = Outlier), size = 2) +
  labs(title = "IQR-based Outlier Detection in Time Series",
       x = "Time", y = "Value") +
  scale_color_manual(values = c("Actual" = "blue", "Yes" = "black", "No" = "green")) +
  theme_minimal()

p1

Ensamble outlier analysis

Cocok untuk data offline (historis) dan berdimensi tinggi.

Bagaimana Subspace Histogram Ensembles (SHE) bekerja? 1. Pemilihan Subruang Acak
- Data dibagi ke dalam beberapa subruang (subspace sampling). - Setiap subruang hanya mempertimbangkan sebagian fitur dari data.

  1. Pembuatan Histogram di Setiap Subruang
    • Histogram dibuat untuk mendistribusikan data dalam setiap subruang.
    • Setiap titik data memiliki representasi dalam beberapa histogram.
  2. Penentuan Skor Outlier
    • Titik data dengan probabilitas rendah di histogram dianggap sebagai outlier.
    • Skor outlier dihitung berdasarkan seberapa jarang titik tersebut muncul di berbagai subruang.
  3. Ensemble Voting atau Aggregation
    • Skor dari berbagai histogram dikombinasikan untuk mendapatkan final outlier score.
    • Teknik agregasi seperti majority voting atau weighted averaging digunakan.
# Load library yang dibutuhkan
library(ggplot2)
library(dplyr)

Attaching package: 'dplyr'
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(caret)
library(cluster)

# Gunakan dataset rivers
set.seed(123)
data <- data.frame(Value = rivers)

# Fungsi untuk membentuk histogram dalam subruang
create_subspace_histogram <- function(data, num_bins = 10, num_subspaces = 5) {
  subspace_scores <- matrix(0, nrow = nrow(data), ncol = num_subspaces)
  for (i in 1:num_subspaces) {
    sampled_features <- sample(1:ncol(data), 1)  # Pilih fitur secara acak
    hist_data <- hist(data[[sampled_features]], breaks = num_bins, plot = FALSE)
    bin_assignments <- cut(data[[sampled_features]], breaks = hist_data$breaks, labels = FALSE)
    bin_density <- table(bin_assignments) / length(bin_assignments)
    
    for (j in 1:nrow(data)) {
      if (!is.na(bin_assignments[j])) {
        subspace_scores[j, i] <- bin_density[as.character(bin_assignments[j])]
      }
    }
  }
  rowMeans(subspace_scores, na.rm = TRUE)  # Ambil rata-rata dari semua subruang
}

# Hitung skor outlier menggunakan Subspace Histogram Ensembles (SHE)
data$Outlier_Score <- create_subspace_histogram(data)

# Menentukan threshold outlier berdasarkan persentil rendah
threshold <- quantile(data$Outlier_Score, probs = 0.05)
data$Outlier <- ifelse(data$Outlier_Score < threshold, "Yes", "No")

# Visualisasi hasil
data$Time <- 1:nrow(data)
p1 <- ggplot(data, aes(x = Time)) +
  geom_line(aes(y = Value)) +
  geom_point(aes(y = Value, color = Outlier), size = 2) +
  labs(title = "Subspace Histogram Ensembles (SHE)",
       x = "Time", y = "Value") +
  scale_color_manual(values = c("Yes" = "black", "No" = "green")) +
  theme_minimal()
p1

Di bawah ini adalah contoh metode ensemble outlier detection untuk data stream yang bekerja secara online (rolling window) dan dapat digunakan seperti ARIMA. Dalam contoh ini, kita menggabungkan tiga pendekatan:

  1. ARIMA Forecast Residual Method:
    Menggunakan model ARIMA untuk memprediksi nilai berikutnya dan memeriksa apakah nilai aktual berada di luar interval prediksi (misalnya, ±3 standard error).

  2. IQR-based Method:
    Menghitung kuartil dan IQR dari data training (window) dan menentukan apakah nilai aktual berada di luar rentang ([Q1 - 1.5 IQR,, Q3 + 1.5 IQR]).

  3. Z-score-based Method:
    Menghitung z-score dari nilai aktual berdasarkan mean dan standar deviasi data training. Jika (|z|) melebihi threshold (misalnya, 3), maka dianggap outlier.

Pendekatan ensemble ini menggabungkan voting dari ketiga metode tersebut: jika minimal dua dari tiga metode mendeteksi outlier, maka observasi baru dianggap sebagai outlier.

# Load library yang dibutuhkan
library(ggplot2)
library(forecast)

# Gunakan dataset rivers sebagai data stream
rivers_ts <- ts(rivers, frequency = 1)

# Panjang jendela rolling (data training)
window_size <- 22

# Data frame untuk menyimpan hasil
results <- data.frame(Time = numeric(),
                      Value = numeric(),
                      ARIMA_Outlier = character(),
                      IQR_Outlier = character(),
                      ZScore_Outlier = character(),
                      Ensemble_Outlier = character(),
                      stringsAsFactors = FALSE)

# Proses rolling window untuk setiap observasi i+1
for(i in window_size:(length(rivers_ts)-1)) {
  # Data training: jendela tetap dari (i - window_size + 1) hingga i
  train_data <- rivers_ts[(i - window_size + 1):i]
  new_value <- rivers_ts[i + 1]
  
  ## 1. ARIMA Forecast Residual Method
  model <- auto.arima(train_data)
  forecast_result <- forecast(model, h = 1, level = 95)  # interval 95%
  predicted_value <- forecast_result$mean[1]
  # Hitung standard error dari forecast menggunakan upper bound
  se <- (forecast_result$upper[1, 1] - predicted_value) / 1.96
  # Jika nilai aktual di luar [predicted_value ± 2*se], tandai sebagai outlier
  arima_outlier <- ifelse(new_value < predicted_value - 3 * se | new_value > predicted_value + 3 * se, "Yes", "No")
  
  ## 2. IQR-based Method
  Q1 <- quantile(train_data, 0.25)
  Q3 <- quantile(train_data, 0.75)
  IQR_value <- Q3 - Q1
  lower_bound <- Q1 - 1.5 * IQR_value
  upper_bound <- Q3 + 1.5 * IQR_value
  iqr_outlier <- ifelse(new_value < lower_bound | new_value > upper_bound, "Yes", "No")
  
  ## 3. Z-score-based Method
  mu <- mean(train_data)
  sigma <- sd(train_data)
  z <- abs(new_value - mu) / sigma
  # Misal, threshold z > 3 dianggap outlier
  zscore_outlier <- ifelse(z > 3, "Yes", "No")
  
  ## Ensemble: majority vote (jika minimal 2 metode mendeteksi outlier)
  votes <- sum(c(arima_outlier, iqr_outlier, zscore_outlier) == "Yes")
  ensemble_outlier <- ifelse(votes >= 2, "Yes", "No")
  
  # Simpan hasil untuk observasi i+1
  results <- rbind(results, data.frame(Time = i + 1,
                                       Value = new_value,
                                       ARIMA_Outlier = arima_outlier,
                                       IQR_Outlier = iqr_outlier,
                                       ZScore_Outlier = zscore_outlier,
                                       Ensemble_Outlier = ensemble_outlier,
                                       stringsAsFactors = FALSE))
}

# Visualisasi hasil ensemble outlier detection
ggplot(results, aes(x = Time, y = Value)) +
  geom_line(color = "blue") +
  geom_point(aes(color = Ensemble_Outlier), size = 2) +
  scale_color_manual(values = c("Yes" = "red", "No" = "green")) +
  labs(title = "Ensemble Outlier Detection pada Data Stream",
       x = "Time", y = "Value") +
  theme_minimal()