Data
data <- data.frame(
A = c(1,2,3,4,5),
B = c(2,4,6,8,10),
C = c(1,4,9,16,25),
D = c(5,4,3,2,1)
)
mean_vec <- colMeans(data)
mean_vec
## A B C D
## 3 6 11 3
X_minus_mu <- sweep(data, 2, mean_vec, "-")
X_minus_mu
## A B C D
## 1 -2 -4 -10 2
## 2 -1 -2 -7 1
## 3 0 0 -2 0
## 4 1 2 5 -1
## 5 2 4 14 -2
cov_matrix <- cov(data)
cov_matrix
## A B C D
## A 2.5 5 15.0 -2.5
## B 5.0 10 30.0 -5.0
## C 15.0 30 93.5 -15.0
## D -2.5 -5 -15.0 2.5
det(cov_matrix)
## [1] 0
data_reduced <- as.matrix(data[, c("A", "C")])
data_reduced
## A C
## [1,] 1 1
## [2,] 2 4
## [3,] 3 9
## [4,] 4 16
## [5,] 5 25
mean_red <- colMeans(data_reduced)
cov_red <- cov(data_reduced)
mean_red
## A C
## 3 11
cov_red
## A C
## A 2.5 15.0
## C 15.0 93.5
det_S <- det(cov_red)
det_S
## [1] 8.75
a <- cov_red[1,1]
b <- cov_red[1,2]
c <- cov_red[2,1]
d <- cov_red[2,2]
adj <- matrix(c(
d, -b,
-c, a
), nrow = 2, byrow = TRUE)
adj
## [,1] [,2]
## [1,] 93.5 -15.0
## [2,] -15.0 2.5
cat("Inverse Matrix = (1 /", det_S, ") *\n")
## Inverse Matrix = (1 / 8.75 ) *
print(adj)
## [,1] [,2]
## [1,] 93.5 -15.0
## [2,] -15.0 2.5
diff <- sweep(data_reduced, 2, mean_red, "-")
diff
## A C
## [1,] -2 -10
## [2,] -1 -7
## [3,] 0 -2
## [4,] 1 5
## [5,] 2 14
Sigma_inv <- solve(cov_red)
Sigma_inv
## A C
## A 10.685714 -1.7142857
## C -1.714286 0.2857143
manual_md <- diag(diff %*% Sigma_inv %*% t(diff))
manual_md
## [1] 2.7428571 0.6857143 1.1428571 0.6857143 2.7428571
md <- mahalanobis(data_reduced, center = mean_red, cov = cov_red)
md
## [1] 2.7428571 0.6857143 1.1428571 0.6857143 2.7428571
data$Mahalanobis <- md
data
## A B C D Mahalanobis
## 1 1 2 1 5 2.7428571
## 2 2 4 4 4 0.6857143
## 3 3 6 9 3 1.1428571
## 4 4 8 16 2 0.6857143
## 5 5 10 25 1 2.7428571
threshold <- qchisq(0.975, df = 2)
threshold
## [1] 7.377759
data$Outlier <- md > threshold
data
## A B C D Mahalanobis Outlier
## 1 1 2 1 5 2.7428571 FALSE
## 2 2 4 4 4 0.6857143 FALSE
## 3 3 6 9 3 1.1428571 FALSE
## 4 4 8 16 2 0.6857143 FALSE
## 5 5 10 25 1 2.7428571 FALSE