Data Visualization Homework 5

Antanas Kaminskas

2022-04-20

Task goal: calculate distances/similarities of multidimensional objects using the following techniques:

  1. Manhattan
  2. Euclidean
  3. Cosine
  4. SMC
  5. Jaccard
  1. Meaningfully apply appropriate distance/similarity techniques to get the most similar subgroup of objects.
  2. Comment on the results. Are the subgroups of similar objects obtained using different techniques composed of the same objects?
  3. For distance/similarity techniques that cannot be applied to your data, select a test dataset.

Complex number distance calculating theory

The modulus of the complex number \(a + bi\) is \(|a + bi| = \sqrt{a^2 + b^2}\). This is the distance between the origin \((0,0)\) and the point \((a, b)\) in the complex plane. For two points in the complex plane, the distance between them is the modulus of the difference of the two complex numbers.

Let \((a, b)\) and \((s,t)\) be points in the complex plane. The difference of the complex numbers is \((s + ti) - (a + bi) = (s - a) + (t - b)i\). The modulus of the difference is \[ |(s-a) + (t - b)i| = \sqrt{(s - a)^2 + (t - b)^2}. \] So \[ d = \sqrt{(s - a)^2 + (t - b)^2} \] is the distance between the two points in the complex plane.

Methodology for calculated distances

  1. Euclidean distance is calculated by formula \[ d(x, y) = \sqrt{\sum\limits_{i = 1}^{n} (x_i - y_i)^2} \]

  2. Manhattan distance is calculated by formula \[ d(x, y) = \sum\limits_{i = 1}^{n} |x_i - y_i| \]

  3. Chebyshev distance is calculated by formula \[ d(x, y) = \max\limits_{i = 1, 2, \dots, n} |x_i - y_i| \]

  4. Canberra distance is calculated by formula \[ d(x, y) = \sum\limits_{i = 1}^{n} \frac{|x_i - y_i|}{|x_i + y_i|} \]

library(pracma)
## Warning: paketas 'pracma' buvo sukurtas pagal R versiją 4.1.3
library(ggplot2)
library(plot3D)
library(rgl)
library(viscomplexr)
library(philentropy)
## Warning: paketas 'philentropy' buvo sukurtas pagal R versiją 4.1.3

Data

# Number of sample points (objects) in the data set.
ilgis <- 20
# Evenly spaced grid on [-13, 49.8]. `length.out` is spelled out in full
# instead of relying on partial argument matching (`len`, `leng`).
x <- seq(-13, 49.8, length.out = ilgis)
y <- seq(-13, 49.8, length.out = ilgis)
# zeta() evaluated at 0.5 + x*i; the result is a complex vector.
# NOTE(review): zeta() is presumably pracma::zeta -- confirm which attached
# package provides it.
j <- zeta(0.5 + x * 1i)

Euclidean distance

# Feature table: one row per object with columns x, y, the zeta value j and
# its modulus abs(j).
# NOTE(review): j is complex -- confirm how dist() treats a complex column
# (the imaginary part may be discarded during coercion).
df <- data.frame(x, y, j, abs(j))
data <- as.matrix(df)

# Full symmetric matrix of pairwise Euclidean distances between rows of df.
dist_euc <- as.matrix(dist(df, method = "euclidean"))
dist_eu <- as.data.frame(dist_euc)

# Long format for the heatmap, built in one vectorised step instead of
# rbind()-ing inside a loop. as.vector() unrolls the matrix column by column,
# so V1 is the row index (varies fastest) and V2 is the column index.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)

# Heatmap of the distance matrix; both axes enumerate the objects.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "blue", high = "green")

# Range and distribution summary of all pairwise distances (diagonal included).
min(temp$Distance)
## [1] 0
max(temp$Distance)
## [1] 88.817
summary(temp$Distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   14.03   28.05   31.15   46.76   88.82

Minimal value

# Positions of the global minimum: the zero distances of each object to itself
# (the diagonal of the distance matrix).
which(temp$Distance == min(temp$Distance))
##  [1]   1  22  43  64  85 106 127 148 169 190 211 232 253 274 295 316 337 358 379
## [20] 400
# Index of the smallest strictly positive distance: zeros are masked as NA so
# that which.min() skips them. temp[i1, c("V1", "V2")] identifies the pair.
i1 <- which.min(replace(temp$Distance, temp$Distance <= 0, NA))
i1
## [1] 275
temp$Distance[i1]
## [1] 4.676506

Maximum value

# Positions of the largest distance. temp$Distance is a plain vector, so
# `arr.ind` has no effect and is omitted; the two hits are the symmetric
# entries (i, j) and (j, i) of the same pair.
which(temp$Distance == max(temp$Distance))
## [1]  20 381

Manhattan distance

# Feature table: one row per object with columns x, y, the zeta value j and
# its modulus abs(j).
# NOTE(review): j is complex -- confirm how dist() treats a complex column
# (the imaginary part may be discarded during coercion).
df <- data.frame(x, y, j, abs(j))
data <- as.matrix(df)

# Full symmetric matrix of pairwise Manhattan distances between rows of df.
dist_man <- as.matrix(dist(df, method = "manhattan"))
dist_ma <- as.data.frame(dist_man)

# Long format for the heatmap, built in one vectorised step instead of
# rbind()-ing inside a loop. as.vector() unrolls the matrix column by column,
# so V1 is the row index (varies fastest) and V2 is the column index.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_man)
)

# Heatmap of the distance matrix; both axes enumerate the objects.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "green", high = "red")

# Range and distribution summary of all pairwise distances (diagonal included).
min(temp$Distance)
## [1] 0
max(temp$Distance)
## [1] 126.813
summary(temp$Distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##    0.00   20.37   40.50   45.61   67.94  126.81

Minimal value

# Positions of the global minimum: the zero distances of each object to itself
# (the diagonal of the distance matrix). `arr.ind` is omitted because it has
# no effect on a plain vector.
which(temp$Distance == min(temp$Distance))
##  [1]   1  22  43  64  85 106 127 148 169 190 211 232 253 274 295 316 337 358 379
## [20] 400
# Index of the smallest strictly positive distance: zeros are masked as NA so
# that which.min() skips them. temp[i1, c("V1", "V2")] identifies the pair.
i1 <- which.min(replace(temp$Distance, temp$Distance <= 0, NA))
i1
## [1] 275
temp$Distance[i1]
## [1] 6.780091

Maximum value

# Positions of the largest distance. temp$Distance is a plain vector, so
# `arr.ind` has no effect and is omitted; the two hits are the symmetric
# entries (i, j) and (j, i) of the same pair.
which(temp$Distance == max(temp$Distance))
## [1]  20 381

Chebyshev distance

# Feature table: one row per object with columns x, y, the zeta value j and
# its modulus abs(j).
# NOTE(review): j is complex -- confirm how dist() treats a complex column
# (the imaginary part may be discarded during coercion).
df <- data.frame(x, y, j, abs(j))
data <- as.matrix(df)

# Full symmetric matrix of pairwise Chebyshev distances between rows of df
# (dist() calls this metric "maximum").
dist_che <- as.matrix(dist(df, method = "maximum"))
dist_ch <- as.data.frame(dist_che)

# Long format for the heatmap, built in one vectorised step instead of
# rbind()-ing inside a loop. as.vector() unrolls the matrix column by column,
# so V1 is the row index (varies fastest) and V2 is the column index.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_che)
)

# Heatmap of the distance matrix; both axes enumerate the objects.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "green", high = "gold")

# Range and distribution summary of all pairwise distances (diagonal included).
min(temp$Distance)
## [1] 0
max(temp$Distance)
## [1] 62.8
summary(temp$Distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   9.916  19.832  21.980  33.053  62.800

Minimal value

# Positions of the global minimum: the zero distances of each object to itself
# (the diagonal of the distance matrix). `arr.ind` is omitted because it has
# no effect on a plain vector.
which(temp$Distance == min(temp$Distance))
##  [1]   1  22  43  64  85 106 127 148 169 190 211 232 253 274 295 316 337 358 379
## [20] 400
# Index of the smallest strictly positive distance: zeros are masked as NA so
# that which.min() skips them. temp[i1, c("V1", "V2")] identifies the pair.
i1 <- which.min(replace(temp$Distance, temp$Distance <= 0, NA))
i1
## [1] 107
temp$Distance[i1]
## [1] 3.305263

Maximum value

# Positions of the largest distance. temp$Distance is a plain vector, so
# `arr.ind` has no effect and is omitted; the two hits are the symmetric
# entries (i, j) and (j, i) of the same pair.
which(temp$Distance == max(temp$Distance))
## [1]  20 381

Canberra distance

# Feature table: one row per object with columns x, y, the zeta value j and
# its modulus abs(j).
# NOTE(review): j is complex -- confirm how dist() treats a complex column
# (the imaginary part may be discarded during coercion).
df <- data.frame(x, y, j, abs(j))
data <- as.matrix(df)

# Full symmetric matrix of pairwise Canberra distances between rows of df.
dist_can <- as.matrix(dist(df, method = "canberra"))
dist_ca <- as.data.frame(dist_can)

# Long format for the heatmap, built in one vectorised step instead of
# rbind()-ing inside a loop. as.vector() unrolls the matrix column by column,
# so V1 is the row index (varies fastest) and V2 is the column index.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_can)
)

# Heatmap of the distance matrix; both axes enumerate the objects.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "blue", high = "yellow")

# Range and distribution summary of all pairwise distances (diagonal included).
min(temp$Distance)
## [1] 0
max(temp$Distance)
## [1] 3.952353
summary(temp$Distance)
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   1.537   2.236   2.129   2.908   3.952

Minimal value

# Positions of the global minimum: the zero distances of each object to itself
# (the diagonal of the distance matrix). `arr.ind` is omitted because it has
# no effect on a plain vector.
which(temp$Distance == min(temp$Distance))
##  [1]   1  22  43  64  85 106 127 148 169 190 211 232 253 274 295 316 337 358 379
## [20] 400
# Index of the smallest strictly positive distance: zeros are masked as NA so
# that which.min() skips them. temp[i1, c("V1", "V2")] identifies the pair.
i1 <- which.min(replace(temp$Distance, temp$Distance <= 0, NA))
i1
## [1] 317
temp$Distance[i1]
## [1] 0.2727582

Maximum value

# Positions of the largest distance. temp$Distance is a plain vector, so
# `arr.ind` has no effect and is omitted; the two hits are the symmetric
# entries (i, j) and (j, i) of the same pair.
which(temp$Distance == max(temp$Distance))
## [1]  40 382

Results

The most similar element in different distances

The smallest distance

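Euclidean: the smallest non-zero distance is between objects 14 and 15 and equals 4.676506 (index 275 in the long table).

Chebyshev: the smallest non-zero distance is between objects 6 and 7 and equals 3.305263 (index 107 in the long table).

Manhattan: the smallest non-zero distance is between objects 14 and 15 and equals 6.780091 (index 275 in the long table).

Canberra: the smallest non-zero distance is between objects 16 and 17 and equals 0.2727582 (index 317 in the long table).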

The biggest distance

Euclidean: the largest distance is between objects 1 and 20 and equals 88.817.

Chebyshev: the largest distance is between objects 1 and 20 and equals 62.800.

Manhattan: the largest distance is between objects 1 and 20 and equals 126.813 (indices 20 and 381 in the long table).

Canberra: the largest distance is between objects 2 and 20 and equals 3.952353 (indices 40 and 382 in the long table).