# Buat data sampel
set.seed(123)
data <- data.frame(
ID = 1:50,
Value = c(rnorm(45, mean = 50, sd = 10), rnorm(5, mean = 100, sd = 5)) # 5 outlier
)
boxplot(data$Value)Analisis outlier dengan R
IQR
library(ggplot2)
# Hitung IQR
Q1 <- quantile(data$Value, 0.25)
Q3 <- quantile(data$Value, 0.75)
IQR_value <- Q3 - Q1
# Tentukan batas bawah dan atas
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
# Deteksi outlier
data$Outlier <- ifelse(data$Value < lower_bound | data$Value > upper_bound, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
geom_hline(yintercept = lower_bound, linetype = "dashed", color = "red") +
geom_hline(yintercept = upper_bound, linetype = "dashed", color = "red") +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()ID 46-50 terdeteksi sebagai outlier.
Z-Score
# Hitung z-score
z_scores <- scale(data$Value)
# Tentukan batas z-score
threshold <- 3
# Deteksi outlier
data$Outlier <- ifelse(abs(z_scores) > threshold, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
geom_hline(yintercept = mean(data$Value) + threshold * sd(data$Value), linetype = "dashed", color = "red") +
geom_hline(yintercept = mean(data$Value) - threshold * sd(data$Value), linetype = "dashed", color = "red") +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()Dengan menggunakan z-score, ID 46-50 tidak terdeteksi sebagai outlier.
Modified Z-Score
# Hitung Modified Z-Score
median_value <- median(data$Value)
mad_value <- mad(data$Value)
mod_z_scores <- 0.6745 * (data$Value - median_value) / mad_value
# Tentukan batas Modified Z-Score
threshold <- 3
# Deteksi outlier
data$Outlier <- ifelse(abs(mod_z_scores) > threshold, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
geom_hline(yintercept = median_value + threshold * mad_value / 0.6745, linetype = "dashed", color = "red") +
geom_hline(yintercept = median_value - threshold * mad_value / 0.6745, linetype = "dashed", color = "red") +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()Dengan menggunakan Modified Z-Score, hanya ID 49-50 yang terdeteksi sebagai outlier.
Cluster Based
# Clustering untuk deteksi outlier
library(cluster)
kmeans_result <- kmeans(data$Value, centers = 2) # Dua klaster: normal dan outlier
data$Cluster <- as.factor(kmeans_result$cluster)
# Tentukan outlier sebagai anggota cluster terkecil
cluster_sizes <- table(kmeans_result$cluster)
outlier_cluster <- as.numeric(names(cluster_sizes)[which.min(cluster_sizes)])
data$Outlier <- ifelse(data$Cluster == outlier_cluster, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()Metode berbasis klaster seperti k-means tidak menetapkan batas ambang (threshold) eksplisit seperti metode IQR atau Z-Score.
K-Nearest Neighbors
# Deteksi outlier dengan k-NN
library(FNN)
# Tentukan jumlah tetangga terdekat
k <- 5
# Hitung jarak rata-rata ke k tetangga terdekat
nn_distances <- knn.dist(data$Value, k = k)
data$Mean_Distance <- rowMeans(nn_distances)
# Tetapkan ambang batas sebagai persentil tertentu
threshold <- quantile(data$Mean_Distance, 0.95)
data$Outlier <- ifelse(data$Mean_Distance > threshold, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()Local Outlier Factor (LOF)
# remotes::install_github("cran/DMwR")
library(DMwR)Loading required package: lattice
Loading required package: grid
Registered S3 method overwritten by 'quantmod':
method from
as.zoo.data.frame zoo
k = 5
data$LOF_Score <- lofactor(data$Value, k = k)
# Tetapkan outlier berdasarkan threshold
threshold <- quantile(data$LOF_Score, 0.95)
data$Outlier <- ifelse(data$LOF_Score > threshold, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()Pemilihan nilai k yang tepat sangat penting dalam metode LOF atau pun proximity-based lainnya. Selain itu, data tidak terlalu cocok untuk metode ini karena hanya ada satu variabel dan tidak ada pola jarak yang jelas.
Isolation Forest
library(isotree)
iso_forest <- isolation.forest(as.matrix(data$Value))
data$Isolation_Score <- predict(iso_forest, as.matrix(data$Value), type = "score")
# Tetapkan outlier berdasarkan threshold
data$Outlier <- ifelse(data$Isolation_Score > 0.5, "Yes", "No")
# Visualisasi dengan scatter plot
ggplot(data, aes(x = ID, y = Value, color = Outlier)) +
geom_point(size = 3) +
labs(title = "Scatterplot dengan Outlier", x = "ID", y = "Value") +
theme_minimal()Mahalanobis distance to find outliers
Mahalanobis Distance (\(D^2\)) digunakan untuk mendeteksi outlier dalam data multivariat. Formula dasarnya adalah:
\[ D^2 = (x - \mu)^T S^{-1} (x - \mu) \]
di mana:
- \(x\) adalah vektor pengamatan.
- \(\mu\) adalah vektor rata-rata dari data.
- \(S\) adalah matriks kovarians dari data.
- \(S^{-1}\) adalah invers dari matriks kovarians.
Dalam implementasi R, Mahalanobis Distance dapat dihitung menggunakan fungsi mahalanobis():
Contoh:
Manual
hw <- data.frame(Height.cm=c(164, 167, 168, 169, 169,
170, 170, 170, 171, 172, 172, 173, 173, 175, 176, 178),
Weight.kg=c( 54, 57, 58, 60, 61,
60, 61, 62, 62, 64, 62, 62, 64, 56, 66, 70))
is.height.outlier <- abs(scale(hw$Height.cm)) > 2
is.weight.outlier <- abs(scale(hw$Weight.kg)) > 2
pch <- (is.height.outlier | is.weight.outlier) * 16
plot(hw, pch=pch) Dengan fungsi mahalanobis()
n.outliers <- 2 # Mark as outliers the 2 most extreme points
m.dist.order <- order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing=TRUE)
is.outlier <- rep(FALSE, nrow(hw))
is.outlier[m.dist.order[1:n.outliers]] <- TRUE
pch <- is.outlier * 16
plot(hw, pch=pch) n.outliers <- 3 # Mark as outliers the 2 most extreme points
m.dist.order <- order(mahalanobis(hw, colMeans(hw), cov(hw)), decreasing=TRUE)
is.outlier <- rep(FALSE, nrow(hw))
is.outlier[m.dist.order[1:n.outliers]] <- TRUE
pch <- is.outlier * 16
plot(hw, pch=pch)3D Data
library(rgl)
hwa <- data.frame(Height.cm=c(164, 167, 168, 168, 169,
169, 169, 170, 172, 173, 175, 176, 178),
Weight.kg=c( 55, 57, 58, 56, 57,
61, 61, 61, 64, 62, 56, 66, 70),
Age =c( 13, 12, 14, 17, 15, 14,
16, 16, 13, 15, 16, 14, 16))
m.dist.order <- order(mahalanobis(hwa, colMeans(hwa), cov(hwa)), decreasing=TRUE)
is.outlier <- rep(FALSE, nrow(hwa))
is.outlier[m.dist.order[1:2]] <- TRUE # Mark as outliers the 2 most extreme points
col <- is.outlier + 1
library(scatterplot3d)
scatterplot3d(hwa$Height.cm, hwa$Weight.kg, hwa$Age, pch=16, color=col, angle=30)Nonlinear relationships
Kurang bekerja dengan baik pada data non-linear.
my.dataframe <- data.frame(x=c(4, 8, 10, 16, 17, 22, 27, 33, 38, 40, 47, 48,
53, 55, 63, 71, 76, 85, 85, 92, 96),
y=c(6, 22, 32, 34, 42, 51, 59, 63, 64, 69, 70, 20,
70, 63, 63, 55, 46, 41, 33, 19, 6))
percentage.of.outliers <- 10 # Mark 10% of points as outliers
number.of.outliers <- trunc(nrow(my.dataframe) * percentage.of.outliers / 100)
m.dist <- mahalanobis(my.dataframe, colMeans(my.dataframe),
cov(my.dataframe))
m.dist.order <- order(m.dist, decreasing=TRUE)
rows.not.outliers <- m.dist.order[(number.of.outliers+1):nrow(my.dataframe)]
is.outlier <- rep(TRUE, nrow(my.dataframe))
is.outlier[rows.not.outliers] <- FALSE
pch <- is.outlier * 16plot(x=my.dataframe$x, y=my.dataframe$y, pch=pch)Prediction-based outlier detection in streaming time series
library(TSA)
Attaching package: 'TSA'
The following objects are masked from 'package:stats':
acf, arima
The following object is masked from 'package:utils':
tar
data(rivers)plot(rivers,ylab='miles',xlab='Bulan', type = "l")# Load library yang dibutuhkan
library(ggplot2)
library(forecast)Registered S3 methods overwritten by 'forecast':
method from
fitted.Arima TSA
plot.Arima TSA
# Gunakan dataset rivers sebagai time series
rivers_ts <- ts(rivers, frequency = 1)
# Panjang jendela tetap
window_size <- 22
# Simpan hasil prediksi dan flag outlier
predictions <- numeric()
outlier_flags <- character()
time_values <- numeric()
actual_values <- numeric()
# Rolling window untuk memproses seluruh data
i <- window_size
while (i < length(rivers_ts)) {
# Data latih dengan jendela tetap 22
train_data <- rivers_ts[(i - window_size + 1):i]
# Bangun model ARIMA dan prediksi 1 langkah ke depan
model <- auto.arima(train_data)
forecast_result <- forecast(model, h = 1, level = 95) # level 95%
predicted_value <- forecast_result$mean[1]
# Hitung standard error secara manual menggunakan interval atas
se <- (forecast_result$upper[1, 1] - predicted_value) / 1.96
# Tentukan batas berdasarkan ±2 standar error dari nilai prediksi
lower_bound <- predicted_value - 3 * se
upper_bound <- predicted_value + 3 * se
# Data aktual berikutnya
actual_value <- rivers_ts[i + 1]
# Deteksi outlier: jika nilai aktual di luar interval [lower_bound, upper_bound]
outlier_flag <- ifelse(actual_value < lower_bound | actual_value > upper_bound, "Yes", "No")
# Simpan hasil
predictions <- c(predictions, predicted_value)
actual_values <- c(actual_values, actual_value)
time_values <- c(time_values, i + 1)
outlier_flags <- c(outlier_flags, outlier_flag)
# Geser jendela satu langkah ke depan
i <- i + 1
}
# Buat data frame hasil
result_df <- data.frame(
Time = time_values,
Actual = actual_values,
Predicted = predictions,
Outlier = outlier_flags
)
# Visualisasi hasil
p1 <- ggplot(result_df, aes(x = Time)) +
geom_line(aes(y = Actual, color = "Actual")) +
geom_line(aes(y = Predicted, color = "Predicted")) +
geom_point(aes(y = Actual, color = Outlier), size = 2) +
labs(title = "Deteksi Outlier dengan ARIMA",
x = "Time", y = "Value") +
scale_color_manual(values = c("Actual" = "blue", "Predicted" = "red", "Yes" = "black", "No" = "green")) +
theme_minimal()
# Tampilkan plot
p1# Dengan IQR
# Load library yang dibutuhkan
library(ggplot2)
# Gunakan dataset rivers sebagai time series
rivers_ts <- ts(rivers, frequency = 1)
# Dengan IQR
# Tentukan panjang jendela dan frekuensi update
window_size <- 22
# Simpan hasil outlier
outlier_flags <- character()
time_values <- numeric()
actual_values <- numeric()
# Rolling window untuk memproses seluruh data
i <- window_size
while (i <= length(rivers_ts) - 1) {
train_data <- rivers_ts[(i - window_size + 1):i] # Gunakan jendela tetap
# Hitung batas IQR dari train_data
Q1 <- quantile(train_data, 0.25)
Q3 <- quantile(train_data, 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
actual_subset <- rivers_ts[(i + 1):(i + 1)]
# Bandingkan data baru dengan threshold IQR
outlier_flags <- c(outlier_flags, ifelse(actual_subset < lower_bound | actual_subset > upper_bound, "Yes", "No"))
actual_values <- c(actual_values, actual_subset)
time_values <- c(time_values, (i + 1):(i + 1))
i <- i + 1
}
# Buat data frame hasil
result_df <- data.frame(
Time = time_values,
Actual = actual_values,
Outlier = outlier_flags
)
# Visualisasi hasil
p1 <- ggplot(result_df, aes(x = Time)) +
geom_line(aes(y = Actual, color = "Actual")) +
geom_point(aes(y = Actual, color = Outlier), size = 2) +
labs(title = "IQR-based Outlier Detection in Time Series",
x = "Time", y = "Value") +
scale_color_manual(values = c("Actual" = "blue", "Yes" = "black", "No" = "green")) +
theme_minimal()
p1Ensamble outlier analysis
Cocok untuk data offline (historis) dan berdimensi tinggi.
Bagaimana Subspace Histogram Ensembles (SHE) bekerja? 1. Pemilihan Subruang Acak
- Data dibagi ke dalam beberapa subruang (subspace sampling). - Setiap subruang hanya mempertimbangkan sebagian fitur dari data.
- Pembuatan Histogram di Setiap Subruang
- Histogram dibuat untuk mendistribusikan data dalam setiap subruang.
- Setiap titik data memiliki representasi dalam beberapa histogram.
- Penentuan Skor Outlier
- Titik data dengan probabilitas rendah di histogram dianggap sebagai outlier.
- Skor outlier dihitung berdasarkan seberapa jarang titik tersebut muncul di berbagai subruang.
- Ensemble Voting atau Aggregation
- Skor dari berbagai histogram dikombinasikan untuk mendapatkan final outlier score.
- Teknik agregasi seperti majority voting atau weighted averaging digunakan.
# Load library yang dibutuhkan
library(ggplot2)
library(dplyr)
Attaching package: 'dplyr'
The following objects are masked from 'package:stats':
filter, lag
The following objects are masked from 'package:base':
intersect, setdiff, setequal, union
library(caret)
library(cluster)
# Gunakan dataset rivers
set.seed(123)
data <- data.frame(Value = rivers)
# Fungsi untuk membentuk histogram dalam subruang
create_subspace_histogram <- function(data, num_bins = 10, num_subspaces = 5) {
subspace_scores <- matrix(0, nrow = nrow(data), ncol = num_subspaces)
for (i in 1:num_subspaces) {
sampled_features <- sample(1:ncol(data), 1) # Pilih fitur secara acak
hist_data <- hist(data[[sampled_features]], breaks = num_bins, plot = FALSE)
bin_assignments <- cut(data[[sampled_features]], breaks = hist_data$breaks, labels = FALSE)
bin_density <- table(bin_assignments) / length(bin_assignments)
for (j in 1:nrow(data)) {
if (!is.na(bin_assignments[j])) {
subspace_scores[j, i] <- bin_density[as.character(bin_assignments[j])]
}
}
}
rowMeans(subspace_scores, na.rm = TRUE) # Ambil rata-rata dari semua subruang
}
# Hitung skor outlier menggunakan Subspace Histogram Ensembles (SHE)
data$Outlier_Score <- create_subspace_histogram(data)
# Menentukan threshold outlier berdasarkan persentil rendah
threshold <- quantile(data$Outlier_Score, probs = 0.05)
data$Outlier <- ifelse(data$Outlier_Score < threshold, "Yes", "No")
# Visualisasi hasil
data$Time <- 1:nrow(data)
p1 <- ggplot(data, aes(x = Time)) +
geom_line(aes(y = Value)) +
geom_point(aes(y = Value, color = Outlier), size = 2) +
labs(title = "Subspace Histogram Ensembles (SHE)",
x = "Time", y = "Value") +
scale_color_manual(values = c("Yes" = "black", "No" = "green")) +
theme_minimal()
p1Di bawah ini adalah contoh metode ensemble outlier detection untuk data stream yang bekerja secara online (rolling window) dan dapat digunakan seperti ARIMA. Dalam contoh ini, kita menggabungkan tiga pendekatan:
ARIMA Forecast Residual Method:
Menggunakan model ARIMA untuk memprediksi nilai berikutnya dan memeriksa apakah nilai aktual berada di luar interval prediksi (misalnya, ±3 standard error).IQR-based Method:
Menghitung kuartil dan IQR dari data training (window) dan menentukan apakah nilai aktual berada di luar rentang ([Q1 - 1.5 IQR,, Q3 + 1.5 IQR]).Z-score-based Method:
Menghitung z-score dari nilai aktual berdasarkan mean dan standar deviasi data training. Jika (|z|) melebihi threshold (misalnya, 3), maka dianggap outlier.
Pendekatan ensemble ini menggabungkan voting dari ketiga metode tersebut: jika minimal dua dari tiga metode mendeteksi outlier, maka observasi baru dianggap sebagai outlier.
# Load library yang dibutuhkan
library(ggplot2)
library(forecast)
# Gunakan dataset rivers sebagai data stream
rivers_ts <- ts(rivers, frequency = 1)
# Panjang jendela rolling (data training)
window_size <- 22
# Data frame untuk menyimpan hasil
results <- data.frame(Time = numeric(),
Value = numeric(),
ARIMA_Outlier = character(),
IQR_Outlier = character(),
ZScore_Outlier = character(),
Ensemble_Outlier = character(),
stringsAsFactors = FALSE)
# Proses rolling window untuk setiap observasi i+1
for(i in window_size:(length(rivers_ts)-1)) {
# Data training: jendela tetap dari (i - window_size + 1) hingga i
train_data <- rivers_ts[(i - window_size + 1):i]
new_value <- rivers_ts[i + 1]
## 1. ARIMA Forecast Residual Method
model <- auto.arima(train_data)
forecast_result <- forecast(model, h = 1, level = 95) # interval 95%
predicted_value <- forecast_result$mean[1]
# Hitung standard error dari forecast menggunakan upper bound
se <- (forecast_result$upper[1, 1] - predicted_value) / 1.96
# Jika nilai aktual di luar [predicted_value ± 2*se], tandai sebagai outlier
arima_outlier <- ifelse(new_value < predicted_value - 3 * se | new_value > predicted_value + 3 * se, "Yes", "No")
## 2. IQR-based Method
Q1 <- quantile(train_data, 0.25)
Q3 <- quantile(train_data, 0.75)
IQR_value <- Q3 - Q1
lower_bound <- Q1 - 1.5 * IQR_value
upper_bound <- Q3 + 1.5 * IQR_value
iqr_outlier <- ifelse(new_value < lower_bound | new_value > upper_bound, "Yes", "No")
## 3. Z-score-based Method
mu <- mean(train_data)
sigma <- sd(train_data)
z <- abs(new_value - mu) / sigma
# Misal, threshold z > 3 dianggap outlier
zscore_outlier <- ifelse(z > 3, "Yes", "No")
## Ensemble: majority vote (jika minimal 2 metode mendeteksi outlier)
votes <- sum(c(arima_outlier, iqr_outlier, zscore_outlier) == "Yes")
ensemble_outlier <- ifelse(votes >= 2, "Yes", "No")
# Simpan hasil untuk observasi i+1
results <- rbind(results, data.frame(Time = i + 1,
Value = new_value,
ARIMA_Outlier = arima_outlier,
IQR_Outlier = iqr_outlier,
ZScore_Outlier = zscore_outlier,
Ensemble_Outlier = ensemble_outlier,
stringsAsFactors = FALSE))
}
# Visualisasi hasil ensemble outlier detection
ggplot(results, aes(x = Time, y = Value)) +
geom_line(color = "blue") +
geom_point(aes(color = Ensemble_Outlier), size = 2) +
scale_color_manual(values = c("Yes" = "red", "No" = "green")) +
labs(title = "Ensemble Outlier Detection pada Data Stream",
x = "Time", y = "Value") +
theme_minimal()