Goal: calculate distances/similarities of multidimensional objects using the following techniques:
Manhattan
Euclidean
Cosine
SMC
Jaccard
I used four quantitative variables (Value, LDM, Volume and Weight) to calculate the distances between objects. Let \(p\) and \(q\) be the vectors of attribute values of two objects, each with \(n\) attributes. Then the distance metrics used for this homework (Euclidean, Chebyshev and Canberra, in that order) are defined as:
\[ d_{\text{Euclidean}}(p,q) = \sqrt{\sum_{i=1}^n (q_i - p_i)^2}; \]
\[ d_{\text{Chebyshev}}(p,q) = \max_i |p_i - q_i|; \]
\[ d_{\text{Canberra}}(p,q) = \sum_{i=1}^n \frac{|p_i - q_i|}{|p_i| + |q_i|}. \]
The data were scaled, but not centered, prior to calculating the distance matrices: each variable was divided by its root mean square.
# Scale every variable without centering. With `center = FALSE`, scale()
# divides each column by its root mean square.
sdata6 <- scale(data6, center = FALSE)

# Pairwise distance matrices between the scaled objects (rows of sdata6).
dist_euc <- as.matrix(dist(sdata6, method = "euclidean"))
dist_can <- as.matrix(dist(sdata6, method = "canberra"))
dist_che <- as.matrix(dist(sdata6, method = "maximum"))

# Row labels for the objects. Fall back to sequential IDs when the scaled
# matrix carries no row names, so the labelling below cannot fail.
object_ids <- rownames(sdata6)
if (is.null(object_ids)) {
  object_ids <- as.character(seq_len(nrow(sdata6)))
}

# Data for the radar charts: row 1 ("A") holds the per-variable maximum and
# row 2 ("B") the per-variable minimum, followed by the scaled objects, so
# object i sits in row i + 2. The extremes are computed over every column
# instead of assuming exactly four variables.
plot_data <- rbind(
  apply(sdata6, 2, max),
  apply(sdata6, 2, min),
  sdata6
)
rownames(plot_data) <- c(LETTERS[1:2], object_ids)
I chose to plot heatmaps to get an overview of the distance matrices. It can be noticed that the Euclidean distance produced results similar to those of the Chebyshev distance, but on a different scale, whereas the Canberra distance produced a distance matrix that does not even closely resemble the other two.
# Draw one distance matrix as a white-to-red heatmap.
#
# distances: data frame with columns V1 and V2 (mapped to the axes) and
#            Distance (mapped to the fill colour).
# heading:   plot title.
# Returns the ggplot object; it is printed when the call is made at top level.
distance_heatmap <- function(distances, heading) {
  ggplot(distances, aes(V1, V2, fill = Distance)) +
    geom_tile() +
    scale_fill_gradient(low = "white", high = "red") +
    labs(title = heading, x = "Object ID", y = "Object ID")
}

# temp, temp2 and temp3 are prepared outside this section; going by the
# titles they presumably hold the Euclidean, Chebyshev and Canberra
# distances in long format (TODO confirm against the code that builds them).
distance_heatmap(temp, "Euclidean distance matrix")
distance_heatmap(temp2, "Chebyshev distance matrix")
distance_heatmap(temp3, "Canberra distance matrix")
Next I want to show three different comparisons between an object \(x\) and the most similar object to it according to different distance measures.
# Position of the object closest to object `i` in a square distance matrix.
#
# The self-distance is masked with Inf so that an exact duplicate of object
# `i` (distance 0) cannot make the object its own nearest neighbour, and
# which.min() returns exactly one index (the first one) when several objects
# are equally close.
nearest_object <- function(dist_matrix, i) {
  stopifnot(nrow(dist_matrix) >= 2L)
  distances <- dist_matrix[i, ]
  distances[i] <- Inf
  unname(which.min(distances))
}

# Radar chart of object `i` together with its nearest neighbour under the
# given distance matrix.
#
# dist_matrix: square matrix of pairwise distances between the objects.
# i:           position of the reference object.
# metric_name: name of the distance measure, used in the chart title.
# chart_data:  radar-chart data whose first two rows are the axis maximum
#              and minimum, so object k is stored in row k + 2.
plot_nearest_radar <- function(dist_matrix, i, metric_name,
                               chart_data = plot_data) {
  nearest <- nearest_object(dist_matrix, i)
  radarchart(
    as.data.frame(chart_data[c(1:2, i + 2, nearest + 2), ]),
    title = paste0(
      metric_name, " distance = ", round(dist_matrix[i, nearest], 2),
      "\n The most similar object - ", nearest
    )
  )
}

# The three radar charts (Euclidean, Chebyshev, Canberra) for object `i`.
plot_comparison <- function(i) {
  plot_nearest_radar(dist_euc, i, "Euclidean")
  plot_nearest_radar(dist_che, i, "Chebyshev")
  plot_nearest_radar(dist_can, i, "Canberra")
}

# First comparison: object 7, three charts side by side.
x_1 <- 7
par(mfrow = c(1, 3))
plot_comparison(x_1)

# Second comparison: object 4.
x_1 <- 4
par(mfrow = c(1, 3))
plot_comparison(x_1)

# Third comparison: object 2.
x_1 <- 2
par(mfrow = c(1, 3))
plot_comparison(x_1)
In the first two comparisons all three distance measures pointed to the same ‘most similar object’ for the object \(x\), whereas in the third comparison the Chebyshev distance suggested a different ‘most similar object’ than the other two distance measures.