R Lab 2 (I): 2023-04-17

Calculating distances of multivariate data

Consider two points P and Q with 5 coordinates.

# P: first point, stored as a numeric vector with 5 coordinates
P <- c(9, 2, 6, 5, 8)
P
[1] 9 2 6 5 8
length(P) # number of coordinates in P
[1] 5
# Q: second point, also with 5 coordinates
Q <- c(12, 8, 6, 4, 10)
O <- rep(0, 5) # Origin with 5 coordinates

The ? operator in R opens the documentation page of an R function.

?dist
starting httpd help server ... done

Straight-line/Euclidean distance from P to the origin

# Stack O and P as the two rows of a matrix; dist() then returns the
# Euclidean distance between those rows
d_OP <- dist(
  rbind(O, P),
  method = "euclidean"
)
print(d_OP)
         O
P 14.49138

Euclidean distance from P to Q

# Stack P and Q as the two rows of a matrix; dist() then returns the
# Euclidean distance between those rows
d_PQ <- dist(
  rbind(P, Q),
  method = "euclidean"
)
print(d_PQ)
         P
Q 7.071068

Randomly generate 5 data points which are normally distributed

# Fix the RNG seed so the simulated sample is reproducible
set.seed(1)

# x1: 5 normal draws with mean 0 and sd 10
x1 <- rnorm(n = 5, mean = 0, sd = 10)
# mean(x1)
# sd(x1)

# x2: 5 normal draws with mean 0 and sd 1
x2 <- rnorm(n = 5, mean = 0, sd = 1)

# Scatterplot of x1 and x2, with red reference lines through the origin
plot(x1, x2, pch = 19)
abline(h = 0, v = 0, col = "red")
plot of chunk unnamed-chunk-5

Euclidean distance between each pair of data points

# Pairwise Euclidean distances between the rows of cbind(x1, x2),
# i.e. between every pair of the simulated data points.
# NOTE(review): this overwrites the P-to-Q distance stored in d_PQ earlier;
# a separate name would be clearer if nothing else relies on d_PQ.
d_PQ <- dist(
  cbind(x1, x2),
  method = "euclidean"
)
print(d_PQ)
          1         2         3         4
2  8.205872                              
3  2.608687 10.195807                    
4 22.261177 14.116651 24.309638          
5  9.573482  1.660182 11.698018 12.688365
# Euclidean distance from 1 to 2 = 8.205872
# Euclidean distance from 1 to 3 = 2.608687
# Euclidean distance from 2 to 3 = 10.195807, etc.

Mahalanobis distance between each data point and the mean vector (0, 0)

# Closed form for a diagonal covariance matrix, evaluated element-wise for
# every data point with the mean taken as (0, 0):
# sqrt(x1^2 / var(x1) + x2^2 / var(x2))
maha1 <- sqrt(x1^2 / var(x1) + x2^2 / var(x2))
print(maha1)
[1] 1.3891495 0.7534099 1.4052152 1.8699059 0.5709981
# Calculate the var-cov matrix of two variables (independent case):
# the sample variances go on the diagonal and the covariance is set to 0.
# diag() already returns the 2 x 2 matrix, so no matrix() wrapper is needed.
S <- diag(c(var(x1), var(x2)))
S
         [,1]      [,2]
[1,] 92.35968 0.0000000
[2,]  0.00000 0.4473392
# Inverse of the matrix S (S was built with diag(), so it is diagonal and
# its inverse holds the reciprocals of the two variances)
solve(S)
           [,1]    [,2]
[1,] 0.01082724 0.00000
[2,] 0.00000000 2.23544
# Data matrix: one row per data point, with columns x1 and x2
dmat <- cbind(x1 = x1, x2 = x2)
print(dmat)
            x1         x2
[1,] -6.264538 -0.8204684
[2,]  1.836433  0.4874291
[3,] -8.356286  0.7383247
[4,] 15.952808  0.5757814
[5,]  3.295078 -0.3053884
dim(dmat) # number of rows and columns of the data matrix
[1] 5 2
# Mean matrix: a 5 x 2 matrix of zeros, i.e. the mean vector (0, 0)
# repeated once per row of the data matrix
mu <- matrix(0, nrow = 5, ncol = 2)
print(mu)
     [,1] [,2]
[1,]    0    0
[2,]    0    0
[3,]    0    0
[4,]    0    0
[5,]    0    0
# Mahalanobis distance of every row x of dmat from its row of mu, in matrix
# form: sqrt((x - mu)' S^{-1} (x - mu)).
# Only the per-row quadratic forms are needed, so they are computed directly
# with rowSums() instead of building the full n x n product and keeping
# just its diagonal.
maha2 <- sqrt(rowSums(((dmat - mu) %*% solve(S)) * (dmat - mu)))
maha2
[1] 1.3891495 0.7534099 1.4052152 1.8699059 0.5709981
?mahalanobis

# Calculate Mahalanobis distance with the built-in helper.
# mahalanobis() returns squared distances, hence the square root.
maha3 <- sqrt(mahalanobis(x = dmat, center = c(0, 0), cov = S))
print(maha3)
[1] 1.3891495 0.7534099 1.4052152 1.8699059 0.5709981

Mahalanobis distance between two different points

# Re-use the names P and Q for the first two rows of the data matrix
P <- dmat[1L, ]
Q <- dmat[2L, ]

# Diagonal-covariance closed form applied to the difference between P and Q.
# The result keeps the element name "x1" inherited from P[1].
maha1 <- sqrt((P[1] - Q[1])^2 / var(x1) + (P[2] - Q[2])^2 / var(x2))
print(maha1)
      x1 
2.129432 
# Matrix form: sqrt((P - Q)' S^{-1} (P - Q)); the result is a 1 x 1 matrix.
# t() turns the difference vector into a row, and %*% treats the trailing
# vector as a column.
maha2 <- sqrt(t(P - Q) %*% solve(S) %*% (P - Q))
print(maha2)
         [,1]
[1,] 2.129432
# Built-in helper with Q as the center; mahalanobis() returns the squared
# distance, hence the square root
maha3 <- sqrt(mahalanobis(x = P, center = Q, cov = S))
print(maha3)
[1] 2.129432