Calculating distances of multivariate data

Let’s consider two points P and Q with 5 coordinates.

P <- c(9, 2, 6, 5, 8)
P
[1] 9 2 6 5 8
length(P)
[1] 5
Q <- c(12, 8, 6, 4, 10)
O <- rep(0, 5) # Origin with 5 coordinates

# ? operator in R provides access to the documentation page of a R function.
?dist
starting httpd help server ... done

Straight-line or Euclidean distance from P to the origin

d_OP <- dist(rbind(O, P), method="euclidean")
d_OP
         O
P 14.49138

Euclidean distance from P to Q

d_PQ <- dist(rbind(P, Q), method="euclidean")
d_PQ
         P
Q 7.071068

Let’s randomly generate 5 data points which are normally distributed.

set.seed(1)
x1 <- rnorm(5, mean=0, sd=10)
x1
[1] -6.264538  1.836433 -8.356286 15.952808  3.295078
#mean(x1)
#sd(x1)

x2 <- rnorm(5, mean=0, sd=1)
x2
[1] -0.8204684  0.4874291  0.7383247  0.5757814 -0.3053884
plot(x1, x2, pch=19) # Scatterplot of x1 and x2
abline(h=0, v=0, col="red")

Euclidean distance between each pair of data points

d_PQ <- dist(cbind(x1, x2), method="euclidean")
d_PQ
          1         2         3         4
2  8.205872                              
3  2.608687 10.195807                    
4 22.261177 14.116651 24.309638          
5  9.573482  1.660182 11.698018 12.688365

Euclidean distance from 1 to 2 = 8.205872.

Euclidean distance from 1 to 3 = 2.608687.

Euclidean distance from 2 to 3 = 10.195807, etc.

Mahalanobis distance between a point and the mean vector (0, 0)

maha1 <- sqrt((x1^2/var(x1))+(x2^2/var(x2))) # Method 1
maha1
[1] 1.3891495 0.7534099 1.4052152 1.8699059 0.5709981
# Calculate the var-cov matrix of two variables (independent case)
S <- matrix(diag(c(var(x1), var(x2))), nrow=2, ncol=2)
S
         [,1]      [,2]
[1,] 92.35968 0.0000000
[2,]  0.00000 0.4473392
# Inverse of the matrix S
solve(S)
           [,1]    [,2]
[1,] 0.01082724 0.00000
[2,] 0.00000000 2.23544
# Data matrix
dmat <- cbind(x1, x2)
dmat
            x1         x2
[1,] -6.264538 -0.8204684
[2,]  1.836433  0.4874291
[3,] -8.356286  0.7383247
[4,] 15.952808  0.5757814
[5,]  3.295078 -0.3053884
dim(dmat)
[1] 5 2
# Mean matrix
mu <- matrix(rep(0, 10), ncol=2, nrow=5)
mu
     [,1] [,2]
[1,]    0    0
[2,]    0    0
[3,]    0    0
[4,]    0    0
[5,]    0    0
maha2 <- sqrt(diag((dmat-mu)%*%solve(S)%*%t(dmat-mu))) # Method 2
maha2
[1] 1.3891495 0.7534099 1.4052152 1.8699059 0.5709981
?mahalanobis

# Calculate Mahalanobis distance
maha3 <- sqrt(mahalanobis(dmat, center=c(0, 0), cov=S)) # Method 3
maha3
[1] 1.3891495 0.7534099 1.4052152 1.8699059 0.5709981

Mahalanobis distance between two different points

P <- dmat[1, ]
Q <- dmat[2, ]

maha1 <- sqrt((((P[1]-Q[1])^2)/var(x1)) + (((P[2]-Q[2])^2)/var(x2))) # Method 1
maha1
      x1 
2.129432 
maha2 <- sqrt(t(as.matrix(P-Q))%*%solve(S)%*%as.matrix(P-Q)) # Method 2
maha2
         [,1]
[1,] 2.129432
maha3 <- sqrt(mahalanobis(P, center=Q, cov=S)) # Method 3
maha3
[1] 2.129432

Review of matrix algebra

Vectors

# Define vector x
x <- c(3, 4, 10)
x
[1]  3  4 10
# Size of vector x
length(x)
[1] 3
# Convert a vector into a matrix object
xmat <- as.matrix(x)
xmat
     [,1]
[1,]    3
[2,]    4
[3,]   10
dim(xmat)
[1] 3 1
# Transpose of a vector
t(x)
     [,1] [,2] [,3]
[1,]    3    4   10
length(t(x))
[1] 3
# Transpose of a matrix
t(xmat)
     [,1] [,2] [,3]
[1,]    3    4   10
dim(t(xmat))
[1] 1 3
# Multiplication by a scalar
3 * x
[1]  9 12 30
# Vector addition and and subtraction
x <- c(4, 1, 3, 2)
y <- c(1, -1, 3, 0)

x + y
[1] 5 0 6 2
x - y
[1] 3 2 0 2
# Inner product of x and y
t(x) %*% y
     [,1]
[1,]   12
crossprod(x, y)
     [,1]
[1,]   12
# Component-wise product of vectors
x * y
[1]  4 -1  9  0
# Normalized vector: a vector of unit length
z <- x / sqrt(c(t(x)%*%x)) # x / sqrt(c(crossprod(x)))
z
[1] 0.7302967 0.1825742 0.5477226 0.3651484
# Check the length of the normalized vector
crossprod(z)
     [,1]
[1,]    1

Matrices

data <- c(1:6)
data
[1] 1 2 3 4 5 6
# Define matrix A
A <- matrix(data, nrow=3, ncol=2)
A
     [,1] [,2]
[1,]    1    4
[2,]    2    5
[3,]    3    6
# Define matrix B
B <- matrix(data, nrow = 3, ncol=2, byrow=TRUE)
B
     [,1] [,2]
[1,]    1    2
[2,]    3    4
[3,]    5    6
# Multiplication by a scalar
3 * A
     [,1] [,2]
[1,]    3   12
[2,]    6   15
[3,]    9   18
# Matrix addition and subtraction
A <- matrix(c(1:6), 2, 3)
A
     [,1] [,2] [,3]
[1,]    1    3    5
[2,]    2    4    6
B <- matrix(c(20:25), 2, 3)
B
     [,1] [,2] [,3]
[1,]   20   22   24
[2,]   21   23   25
A + B
     [,1] [,2] [,3]
[1,]   21   25   29
[2,]   23   27   31
A - B
     [,1] [,2] [,3]
[1,]  -19  -19  -19
[2,]  -19  -19  -19
# Matrix multiplication
A %*% t(B)
     [,1] [,2]
[1,]  206  215
[2,]  272  284
t(B)%*%A
     [,1] [,2] [,3]
[1,]   62  144  226
[2,]   68  158  248
[3,]   74  172  270
# Multiply element-by-element
A * B
     [,1] [,2] [,3]
[1,]   20   66  120
[2,]   42   92  150
# Special matrices

# Diagonal matrix
ddata <- diag(data)
ddata
     [,1] [,2] [,3] [,4] [,5] [,6]
[1,]    1    0    0    0    0    0
[2,]    0    2    0    0    0    0
[3,]    0    0    3    0    0    0
[4,]    0    0    0    4    0    0
[5,]    0    0    0    0    5    0
[6,]    0    0    0    0    0    6
diag(ddata)
[1] 1 2 3 4 5 6
# Identity matrix
diag(rep(1,4))
     [,1] [,2] [,3] [,4]
[1,]    1    0    0    0
[2,]    0    1    0    0
[3,]    0    0    1    0
[4,]    0    0    0    1
# Trace of a matrix: sum of its diagonal elements
A <- matrix(rnorm(25), 5, 5)
A
           [,1]        [,2]        [,3]        [,4]        [,5]
[1,]  1.5117812 -0.04493361  0.91897737 -0.05612874  1.35867955
[2,]  0.3898432 -0.01619026  0.78213630 -0.15579551 -0.10278773
[3,] -0.6212406  0.94383621  0.07456498 -1.47075238  0.38767161
[4,] -2.2146999  0.82122120 -1.98935170 -0.47815006 -0.05380504
[5,]  1.1249309  0.59390132  0.61982575  0.41794156 -1.37705956
sum(diag(A))
[1] -0.2850537
# Determinant of a matrix
det(A)
[1] 1.332143
# Inverse of a matrix
solve(A)
           [,1]      [,2]       [,3]      [,4]       [,5]
[1,] -0.2455356 -3.328855  0.8689000 -1.295708  0.3014574
[2,]  1.0825373  2.687555 -0.7137777  1.716740  0.5994606
[3,]  0.4521566  3.897085 -0.8458634  1.166644 -0.1284804
[4,]  1.0273684  3.673334 -1.6763979  1.915998  0.1926631
[5,]  0.7816277  1.308704 -0.4875499  0.788552 -0.2207416
# Check A.inv(A) = I
A %*% solve(A)
              [,1]          [,2]          [,3]          [,4]          [,5]
[1,]  1.000000e+00 -2.220446e-16  1.110223e-16  4.440892e-16  0.000000e+00
[2,]  4.163336e-17  1.000000e+00  1.387779e-17 -1.387779e-17  3.816392e-17
[3,] -1.110223e-16  5.551115e-16  1.000000e+00 -1.110223e-16  6.938894e-17
[4,] -1.387779e-16  3.885781e-16 -1.457168e-16  1.000000e+00 -1.734723e-18
[5,] -2.220446e-16 -1.332268e-15  3.330669e-16  0.000000e+00  1.000000e+00
# Eigenvalues and eigenvectors
evs <- eigen(A)
evs$values
[1]  2.5070958+0.0000000i -2.4759155+0.0000000i -1.1194433+0.0000000i
[4]  0.4016046+0.1744205i  0.4016046-0.1744205i
evs$vectors
              [,1]           [,2]          [,3]                    [,4]
[1,]  0.6215789+0i  0.06419107+0i -0.3781959+0i  0.54812264+0.00344903i
[2,]  0.2339175+0i -0.15866809+0i -0.1366923+0i -0.40145783+0.09570145i
[3,]  0.3458510+0i  0.52712185+0i  0.5549777+0i -0.59398737+0.00000000i
[4,] -0.6309227+0i  0.64718932+0i  0.6222747+0i -0.37695071+0.15068046i
[5,]  0.2030907+0i -0.52344291+0i  0.3782289+0i -0.07540428+0.07693685i
                        [,5]
[1,]  0.54812264-0.00344903i
[2,] -0.40145783-0.09570145i
[3,] -0.59398737+0.00000000i
[4,] -0.37695071-0.15068046i
[5,] -0.07540428-0.07693685i