# Document-wide chunk default: echo the source code of each chunk in the output.
knitr::opts_chunk$set(echo = TRUE)

Definition of a distance

Exercise 1

  • Prove that the three axioms A1-A3 imply the non-negativity condition: \[d(\mathbf{x},\mathbf{y})\geq 0.\]

Euclidean distance

\[d(\mathbf{x},\mathbf{y})=\sqrt{\sum_{i=1}^n (x_i-y_i)^2}.\]

  • A1 and A2 are obvious.
  • The proof of A3 is provided below.

Manhattan distance

\[d(\mathbf{x},\mathbf{y}) =\sum_{i=1}^n |x_i-y_i|. \]

Manhattan distance vs Euclidean distance Graph

# Two points in the plane: the origin and (6, 6).
x <- c(0, 0)
y <- c(6, 6)
# Euclidean (straight-line) distance between the rows x and y.
# "euclidean" is the method name documented for dist().
dist(rbind(x, y), method = "euclidean")
         x
y 8.485281
# Closed-form check: sqrt(6^2 + 6^2) = 6 * sqrt(2).
6 * sqrt(2)
[1] 8.485281
# Manhattan distance between the rows x and y: sum of absolute coordinate differences.
dist(rbind(x, y), method = "manhattan")
   x
y 12

Canberra distance

\[d(\mathbf{x},\mathbf{y}) =\sum_{i=1}^n \frac{|x_i-y_i|}{|x_i|+|y_i|}.\]

# Two points in the plane: the origin and (6, 6).
x <- c(0, 0)
y <- c(6, 6)
# Canberra distance between the rows x and y.
dist(rbind(x, y), method = "canberra")
  x
y 2
# Closed-form check: each coordinate contributes |0 - 6| / (|0| + |6|) = 6 / 6.
6 / 6 + 6 / 6
[1] 2

Exercise 2

  • Prove that the Canberra distance is a true distance.

Minkowski distance

library("ggplot2")
# Two points in the plane: the origin and (6, 6).
x <- c(0, 0)
y <- c(6, 6)
# Grid of Minkowski exponents p, built once and shared by the computation
# and the plot.
p_grid <- seq(1, 30, .01)
# One distance per exponent. vapply() fills a numeric vector of known length
# instead of growing it with c() on every iteration.
MinkowDist <- vapply(
  p_grid,
  function(p) as.numeric(dist(rbind(x, y), method = "minkowski", p = p)),
  numeric(1)
)
# Distance as a function of p; xlim() restricts the displayed range to [1, 11].
ggplot(
  data = data.frame(x = p_grid, y = MinkowDist),
  mapping = aes(x = x, y = y)
) +
  geom_point(size = .1, color = "red") +
  xlim(1, 11) +
  xlab("p") +
  ylab("Minkowski Distance") +
  ggtitle("Minkowski distance wrt p")

Chebyshev distance

Minkowski inequality

Hölder inequality

\[ \left|\sum_{i=1}^n x_iy_i\right|\leq\sqrt{\sum_{i=1}^n x_i^2}\sqrt{\sum_{i=1}^n y_i^2}. \]

  • Using the dot product (also called scalar product) notation \(\mathbf{x}\cdot\mathbf{y}=\sum_{i=1}^n x_iy_i\) and the norm notation \(\|\cdot\|_2\), the Cauchy–Schwarz inequality reads: \[ |\mathbf{x}\cdot\mathbf{y}| \leq \|\mathbf{x}\|_2 \|\mathbf{y}\|_2. \]

Pearson correlation distance

Cosine correlation distance

Spearman correlation distance

# Ranks of the values of x (rank 1 is the smallest value).
x <- c(3, 1, 4, 15, 92)
rank(x)
[1] 2 1 3 4 5
# Ranks of the values of x (rank 1 is the smallest value).
x <- c(3, 1, 4, 15, 92)
rank(x)
[1] 2 1 3 4 5
# Ranks of the values of y (rank 1 is the smallest value).
y <- c(30, 2, 9, 20, 48)
rank(y)
[1] 4 1 2 3 5
# Element-wise difference between the ranks of x and the ranks of y.
d <- rank(x) - rank(y)
d
[1] -2  0  1  1  0
# Spearman correlation obtained as the correlation of the two rank vectors.
cor(rank(x), rank(y))
[1] 0.7
# Same value from the formula 1 - 6 * sum(d^2) / (n * (n^2 - 1)), here with n = 5.
1 - 6 * sum(d^2) / (5 * (5^2 - 1))
[1] 0.7

Kendall tau distance

# Kendall's tau computed directly from pairwise sign agreements.
x <- c(3, 1, 4, 15, 92)
y <- c(30, 2, 9, 20, 48)
stopifnot(length(x) == length(y))
# Number of observations; replaces the hard-coded 5 so the computation
# follows the data.
n <- length(x)
tau <- 0
for (i in seq_len(n)) {
  # Inner product of the two sign vectors: every j ordered the same way as i
  # in both x and y adds +1, every j ordered oppositely adds -1, and the
  # j == i term adds 0.
  tau <- tau + sign(x - x[i]) %*% sign(y - y[i])
}
# Normalize by the n * (n - 1) ordered pairs (i, j) with i != j.
tau <- tau / (n * (n - 1))
# `%*%` yields a 1 x 1 matrix, so tau prints as a matrix.
tau
     [,1]
[1,]  0.6
# Built-in Kendall correlation of x and y, for comparison with the manual value.
cor(x, y, method = "kendall")
[1] 0.6

Variables standardization

x <- c(3, 1, 4, 15, 92)
y <- c(30, 2, 9, 20, 48)
# Standardize x by hand: subtract the mean, then divide by the standard deviation.
(x - mean(x)) / sd(x)
[1] -0.5134116 -0.5647527 -0.4877410 -0.2053646  1.7712699
# Same standardization with scale(); the printed result is a one-column matrix
# with the center and scale values attached as attributes.
scale(x)
           [,1]
[1,] -0.5134116
[2,] -0.5647527
[3,] -0.4877410
[4,] -0.2053646
[5,]  1.7712699
attr(,"scaled:center")
[1] 23
attr(,"scaled:scale")
[1] 38.9551
# Standardize y by hand: subtract the mean, then divide by the standard deviation.
(y - mean(y)) / sd(y)
[1]  0.45263128 -1.09293895 -0.70654639 -0.09935809
[5]  1.44621214
# Same standardization with scale(); the printed result is a one-column matrix
# with the center and scale values attached as attributes.
scale(y)
            [,1]
[1,]  0.45263128
[2,] -1.09293895
[3,] -0.70654639
[4,] -0.09935809
[5,]  1.44621214
attr(,"scaled:center")
[1] 21.8
attr(,"scaled:scale")
[1] 18.11629

Distance matrix computation

# Install FactoMineR only when it is not already available. An unconditional
# install.packages() call runs on every execution and, as the recorded output
# shows, errors with "Updating loaded packages" when the package is loaded.
if (!requireNamespace("FactoMineR", quietly = TRUE)) {
  install.packages("FactoMineR")
}
Error in install.packages : Updating loaded packages
library("FactoMineR")
data("USArrests") # Load the USArrests data set
head(USArrests, 3) # Print the first 3 rows
set.seed(123) # Fix the random seed so the same rows are drawn on every run
# Draw 15 row indices without replacement. The index range follows the number
# of rows in the data instead of a hard-coded 50.
ss <- sample(seq_len(nrow(USArrests)), 15)
df <- USArrests[ss, ] # Keep only the 15 sampled rows
df.scaled <- scale(df) # Standardize the variables
