Homework No.5

Goal: calculate distances/similarities of multidimensional objects using the following techniques: a. Manhattan
b. Euclidean
c. Cosine
d. SMC
e. Jaccard

  1. Meaningfully apply appropriate distance/similarity techniques to get the most similar subgroup of objects.
  2. Comment on the results. Are the subgroups of similar objects obtained with different techniques composed of the same objects?
  3. For distance/similarity techniques that cannot be applied to your data, select a test dataset.
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1        0  60        0  11  67349        0       3       1
## 2        0  43        0  11  91449   257252       0       0
## 3        0  48        0  11  52881        0       2       0
## 4        0  35        1  10  16039   124191       3       1
## 7        0  34        1  12 125301        0       0       0
## 9        1  40        1  11  50815        0       2       1

Non-normalized data

# Number of observations used throughout the analysis.
ilgis <- 20

# Pull the first `ilgis` values of each attribute into a short-named vector.
x <- insurance_data$KIDSDRIV[seq_len(ilgis)]
y <- insurance_data$AGE[seq_len(ilgis)]
j <- insurance_data$HOMEKIDS[seq_len(ilgis)]
k <- insurance_data$YOJ[seq_len(ilgis)]
l <- insurance_data$INCOME[seq_len(ilgis)]
m <- insurance_data$HOME_VAL[seq_len(ilgis)]
r <- insurance_data$TRAVTIME[seq_len(ilgis)]

# One row per object, one column per attribute; `data` is the matrix form.
df <- data.frame(x = x, y = y, j = j, k = k, l = l, m = m, r = r)
data <- as.matrix(df)
df
##    x  y j  k      l      m  r
## 1  0 60 0 11  67349      0 14
## 2  0 43 0 11  91449 257252 22
## 3  0 48 0 11  52881      0 26
## 4  0 35 1 10  16039 124191  5
## 5  0 34 1 12 125301      0 46
## 6  1 40 1 11  50815      0 21
## 7  0 44 2 12  43486      0 30
## 8  0 34 0 10  62978      0 34
## 9  0 50 0  7 106952      0 48
## 10 0 53 0 14  77100      0 15
## 11 0 43 0  5  52642 209970 36
## 12 0 55 0 11  59162 180232 25
## 13 0 53 0 11 130795      0 64
## 14 0 45 0  0      0 106859 48
## 15 0 39 3 12  51884 180951 43
## 16 0 59 0 12  87460      0 45
## 17 0 42 0 11  59945 192207 42
## 18 0 34 3 13  34019  91875 27
## 19 0 31 2 12  18903  93797 48
## 20 0 28 1 13  44077 170598 29

Euclidean distance

# Euclidean distance

# Pairwise distances between the rows of df, as a full square matrix and as a
# data frame with one column per object.
dist_euc <- as.matrix(dist(df, method = "euclidean"))
dist_eu <- as.data.frame(dist_euc)

# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in one vectorised step instead of rbind()-ing inside a loop; the
# columns of dist_eu are stacked in order, so V1 cycles fastest.
obj_ids <- seq_len(ilgis)
temp <- data.frame(
  V1 = factor(rep(obj_ids, times = ilgis), levels = obj_ids),
  V2 = factor(rep(obj_ids, each = ilgis), levels = obj_ids),
  Distance = unlist(dist_eu[obj_ids], use.names = FALSE)
)

# Heatmap of the distance matrix; tile colour encodes the distance.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "pink", high = "purple")

# Smallest distance
D <- dist_euc
D <- (D + t(D)) / 2 # make sure the matrix is exactly symmetric

# Mask the diagonal and the upper triangle with Inf so that every pair is
# considered once and the zero self-distances cannot win the minimum.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)

# Least distance between elements:
ats
##   row col
## 6   6   3
# Show the first minimal pair. Indexing with [1, 1] / [1, 2] stays correct when
# several pairs tie and `ats` has more than one row.
# NOTE(review): assumes insurance_data_cor rows line up positionally with the
# rows of df -- confirm.
insurance_data_cor[ats[1, 1], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9        1  40        1  11  50815        0       2       1
insurance_data_cor[ats[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3        0  48        0  11  52881        0       2       0
# ----------
# Search for the largest distance. The full symmetric matrix is scanned, so
# each maximal pair is listed twice: (i, j) and (j, i).
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
##   row col
## 7   7   2
## 2   2   7
# biggest distance between elements (first listed pair only):
insurance_data_cor[did[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2        0  43        0  11  91449   257252       0       0
insurance_data_cor[did[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 10        0  44        2  12  43486        0       0       0

Manhattan distance

# Manhattan distance

# Pairwise distances between the rows of df. The variable names dist_euc and
# dist_eu are kept as they are, although they hold Manhattan distances here.
dist_euc <- as.matrix(dist(df, method = "manhattan"))
dist_eu <- as.data.frame(dist_euc)

# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in one vectorised step instead of rbind()-ing inside a loop; the
# columns of dist_eu are stacked in order, so V1 cycles fastest.
obj_ids <- seq_len(ilgis)
temp <- data.frame(
  V1 = factor(rep(obj_ids, times = ilgis), levels = obj_ids),
  V2 = factor(rep(obj_ids, each = ilgis), levels = obj_ids),
  Distance = unlist(dist_eu[obj_ids], use.names = FALSE)
)

# Heatmap of the distance matrix; tile colour encodes the distance.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between elements", low = "yellow", high = "green")

# Smallest distance
D <- dist_euc
D <- (D + t(D)) / 2 # make sure the matrix is exactly symmetric

# Mask the diagonal and the upper triangle with Inf so that every pair is
# considered once and the zero self-distances cannot win the minimum.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)

# Least distance between elements:
ats
##   row col
## 6   6   3
# Show the first minimal pair. Indexing with [1, 1] / [1, 2] stays correct when
# several pairs tie and `ats` has more than one row.
# NOTE(review): assumes insurance_data_cor rows line up positionally with the
# rows of df -- confirm.
insurance_data_cor[ats[1, 1], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9        1  40        1  11  50815        0       2       1
insurance_data_cor[ats[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3        0  48        0  11  52881        0       2       0
# ----------
# Search for the largest distance. The full symmetric matrix is scanned, so
# each maximal pair is listed twice: (i, j) and (j, i).
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
##   row col
## 7   7   2
## 2   2   7
# biggest distance between elements (first listed pair only):
insurance_data_cor[did[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2        0  43        0  11  91449   257252       0       0
insurance_data_cor[did[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 10        0  44        2  12  43486        0       0       0

Chebyshev distance

# Chebyshev (maximum) distance

# Pairwise distances between the rows of df. The variable names dist_euc and
# dist_eu are kept as they are, although they hold Chebyshev distances here.
dist_euc <- as.matrix(dist(df, method = "maximum"))
dist_eu <- as.data.frame(dist_euc)

# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in one vectorised step instead of rbind()-ing inside a loop; the
# columns of dist_eu are stacked in order, so V1 cycles fastest.
obj_ids <- seq_len(ilgis)
temp <- data.frame(
  V1 = factor(rep(obj_ids, times = ilgis), levels = obj_ids),
  V2 = factor(rep(obj_ids, each = ilgis), levels = obj_ids),
  Distance = unlist(dist_eu[obj_ids], use.names = FALSE)
)

# Heatmap of the distance matrix; tile colour encodes the distance.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "yellow", high = "darkorchid")

# Smallest distance
D <- dist_euc
D <- (D + t(D)) / 2 # make sure the matrix is exactly symmetric

# Mask the diagonal and the upper triangle with Inf so that every pair is
# considered once and the zero self-distances cannot win the minimum.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)

# Least distance between elements:
ats
##   row col
## 6   6   3
# Show the first minimal pair. Indexing with [1, 1] / [1, 2] stays correct when
# several pairs tie and `ats` has more than one row.
# NOTE(review): assumes insurance_data_cor rows line up positionally with the
# rows of df -- confirm.
insurance_data_cor[ats[1, 1], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9        1  40        1  11  50815        0       2       1
insurance_data_cor[ats[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3        0  48        0  11  52881        0       2       0
# ----------
# Search for the largest distance. The full symmetric matrix is scanned, so
# each maximal pair is listed twice: (i, j) and (j, i). With the maximum
# metric several pairs share the same largest distance (see the listing).
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
##    row col
## 2    2   1
## 1    1   2
## 3    3   2
## 5    5   2
## 6    6   2
## 7    7   2
## 8    8   2
## 9    9   2
## 10  10   2
## 13  13   2
## 16  16   2
## 2    2   3
## 2    2   5
## 2    2   6
## 2    2   7
## 2    2   8
## 2    2   9
## 2    2  10
## 2    2  13
## 2    2  16
# biggest distance between elements (first of the tied pairs only):
insurance_data_cor[did[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1        0  60        0  11  67349        0       3       1
insurance_data_cor[did[1, 1], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2        0  43        0  11  91449   257252       0       0

Normalized data

# Min-max rescale a numeric vector to the [0, 1] interval.
#
# Args:
#   x: numeric vector.
#
# Returns:
#   A numeric vector of the same length as x, where min(x) maps to 0 and
#   max(x) maps to 1. A constant vector maps to all zeros, and an empty vector
#   returns numeric(0). If x contains NA, every element of the result is NA
#   because min() and max() are taken without na.rm.
normalize <- function(x) {
  if (length(x) == 0L) {
    return(numeric(0))
  }
  lo <- min(x)
  span <- max(x) - lo
  # A constant vector has zero range; dividing by it would give 0 / 0 = NaN,
  # which then poisons every distance computed from that column.
  if (!is.na(span) && span == 0) {
    span <- 1
  }
  (x - lo) / span
}


# Rescale every attribute with normalize() so that all columns share the same
# scale before distances are recomputed.
x <- normalize(x)
y <- normalize(y)
j <- normalize(j)
k <- normalize(k)
l <- normalize(l)
m <- normalize(m)
r <- normalize(r)

# Rebuild the data frame and its matrix form from the rescaled vectors.
df <- data.frame(x = x, y = y, j = j, k = k, l = l, m = m, r = r)
data <- as.matrix(df)

df
##    x       y         j         k         l         m         r
## 1  0 1.00000 0.0000000 0.7857143 0.5149203 0.0000000 0.1525424
## 2  0 0.46875 0.0000000 0.7857143 0.6991781 1.0000000 0.2881356
## 3  0 0.62500 0.0000000 0.7857143 0.4043044 0.0000000 0.3559322
## 4  0 0.21875 0.3333333 0.7142857 0.1226270 0.4827601 0.0000000
## 5  0 0.18750 0.3333333 0.8571429 0.9579953 0.0000000 0.6949153
## 6  1 0.37500 0.3333333 0.7857143 0.3885087 0.0000000 0.2711864
## 7  0 0.50000 0.6666667 0.8571429 0.3324745 0.0000000 0.4237288
## 8  0 0.18750 0.0000000 0.7142857 0.4815016 0.0000000 0.4915254
## 9  0 0.68750 0.0000000 0.5000000 0.8177071 0.0000000 0.7288136
## 10 0 0.78125 0.0000000 1.0000000 0.5894721 0.0000000 0.1694915
## 11 0 0.46875 0.0000000 0.3571429 0.4024772 0.8162036 0.5254237
## 12 0 0.84375 0.0000000 0.7857143 0.4523262 0.7006049 0.3389831
## 13 0 0.78125 0.0000000 0.7857143 1.0000000 0.0000000 1.0000000
## 14 0 0.53125 0.0000000 0.0000000 0.0000000 0.4153865 0.7288136
## 15 0 0.34375 1.0000000 0.8571429 0.3966818 0.7033998 0.6440678
## 16 0 0.96875 0.0000000 0.8571429 0.6686800 0.0000000 0.6779661
## 17 0 0.43750 0.0000000 0.7857143 0.4583126 0.7471545 0.6271186
## 18 0 0.18750 1.0000000 0.9285714 0.2600940 0.3571401 0.3728814
## 19 0 0.09375 0.6666667 0.8571429 0.1445239 0.3646114 0.7288136
## 20 0 0.00000 0.3333333 0.9285714 0.3369930 0.6631552 0.4067797

Euclidean distance

# Euclidean distance

# Pairwise distances between the rows of df, as a full square matrix and as a
# data frame with one column per object.
dist_euc <- as.matrix(dist(df, method = "euclidean"))
dist_eu <- as.data.frame(dist_euc)

# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in one vectorised step instead of rbind()-ing inside a loop; the
# columns of dist_eu are stacked in order, so V1 cycles fastest.
obj_ids <- seq_len(ilgis)
temp <- data.frame(
  V1 = factor(rep(obj_ids, times = ilgis), levels = obj_ids),
  V2 = factor(rep(obj_ids, each = ilgis), levels = obj_ids),
  Distance = unlist(dist_eu[obj_ids], use.names = FALSE)
)

# Heatmap of the distance matrix; tile colour encodes the distance.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "pink", high = "purple")

# Smallest distance
D <- dist_euc
D <- (D + t(D)) / 2 # make sure the matrix is exactly symmetric

# Mask the diagonal and the upper triangle with Inf so that every pair is
# considered once and the zero self-distances cannot win the minimum.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)

# Least distance between elements:
ats
##    row col
## 10  10   1
# Show the first minimal pair. Indexing with [1, 1] / [1, 2] stays correct when
# several pairs tie and `ats` has more than one row.
# NOTE(review): assumes insurance_data_cor rows line up positionally with the
# rows of df -- confirm.
insurance_data_cor[ats[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 14        0  53        0  14  77100        0       0       0
insurance_data_cor[ats[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1        0  60        0  11  67349        0       3       1
# ----------
# Search for the largest distance. The full symmetric matrix is scanned, so
# each maximal pair is listed twice: (i, j) and (j, i).
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
##    row col
## 18  18  13
## 13  13  18
# biggest distance between elements (first listed pair only):
insurance_data_cor[did[1, 2], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 17        0  53        0  11 130795        0       3       0
insurance_data_cor[did[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 23        0  34        3  13  34019    91875       0       0

Manhattan distance

# Manhattan distance

# Pairwise distances between the rows of df. The variable names dist_euc and
# dist_eu are kept as they are, although they hold Manhattan distances here.
dist_euc <- as.matrix(dist(df, method = "manhattan"))
dist_eu <- as.data.frame(dist_euc)

# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in one vectorised step instead of rbind()-ing inside a loop; the
# columns of dist_eu are stacked in order, so V1 cycles fastest.
obj_ids <- seq_len(ilgis)
temp <- data.frame(
  V1 = factor(rep(obj_ids, times = ilgis), levels = obj_ids),
  V2 = factor(rep(obj_ids, each = ilgis), levels = obj_ids),
  Distance = unlist(dist_eu[obj_ids], use.names = FALSE)
)

# Heatmap of the distance matrix; tile colour encodes the distance.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between elements", low = "yellow", high = "green")

# Smallest distance
D <- dist_euc
D <- (D + t(D)) / 2 # make sure the matrix is exactly symmetric

# Mask the diagonal and the upper triangle with Inf so that every pair is
# considered once and the zero self-distances cannot win the minimum.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)

# Least distance between elements:
ats
##    row col
## 10  10   1
# Show the first minimal pair. Indexing with [1, 1] / [1, 2] stays correct when
# several pairs tie and `ats` has more than one row.
# NOTE(review): assumes insurance_data_cor rows line up positionally with the
# rows of df -- confirm.
insurance_data_cor[ats[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 14        0  53        0  14  77100        0       0       0
insurance_data_cor[ats[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1        0  60        0  11  67349        0       3       1
# ----------
# Search for the largest distance. The full symmetric matrix is scanned, so
# each maximal pair is listed twice: (i, j) and (j, i).
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
##    row col
## 14  14   6
## 6    6  14
# biggest distance between elements (first listed pair only):
insurance_data_cor[did[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9        1  40        1  11  50815        0       2       1
insurance_data_cor[did[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 19        0  45        0   0      0   106859       3       0

Chebyshev distance

# Chebyshev (maximum) distance

# Pairwise distances between the rows of df. The variable names dist_euc and
# dist_eu are kept as they are, although they hold Chebyshev distances here.
dist_euc <- as.matrix(dist(df, method = "maximum"))
dist_eu <- as.data.frame(dist_euc)

# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in one vectorised step instead of rbind()-ing inside a loop; the
# columns of dist_eu are stacked in order, so V1 cycles fastest.
obj_ids <- seq_len(ilgis)
temp <- data.frame(
  V1 = factor(rep(obj_ids, times = ilgis), levels = obj_ids),
  V2 = factor(rep(obj_ids, each = ilgis), levels = obj_ids),
  Distance = unlist(dist_eu[obj_ids], use.names = FALSE)
)

# Heatmap of the distance matrix; tile colour encodes the distance.
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient("Distance between\nelements", low = "yellow", high = "darkorchid")

# Smallest distance
D <- dist_euc
D <- (D + t(D)) / 2 # make sure the matrix is exactly symmetric

# Mask the diagonal and the upper triangle with Inf so that every pair is
# considered once and the zero self-distances cannot win the minimum.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)

# Least distance between elements:
ats
##    row col
## 10  10   3
# Show the first minimal pair. Indexing with [1, 1] / [1, 2] stays correct when
# several pairs tie and `ats` has more than one row.
# NOTE(review): assumes insurance_data_cor rows line up positionally with the
# rows of df -- confirm.
insurance_data_cor[ats[1, 1], ]
##    KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 14        0  53        0  14  77100        0       0       0
insurance_data_cor[ats[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3        0  48        0  11  52881        0       2       0
# ----------
# Search for the largest distance. The full symmetric matrix is scanned, so
# each maximal pair is listed twice: (i, j) and (j, i). With the maximum
# metric many pairs can share the same largest distance.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
##    row col
## 2    2   1
## 6    6   1
## 15  15   1
## 18  18   1
## 20  20   1
## 1    1   2
## 3    3   2
## 5    5   2
## 6    6   2
## 7    7   2
## 8    8   2
## 9    9   2
## 10  10   2
## 13  13   2
## 15  15   2
## 16  16   2
## 18  18   2
## 2    2   3
## 6    6   3
## 15  15   3
## 18  18   3
## 6    6   4
## 13  13   4
## 2    2   5
## 6    6   5
## 1    1   6
## 2    2   6
## 3    3   6
## 4    4   6
## 5    5   6
## 7    7   6
## 8    8   6
## 9    9   6
## 10  10   6
## 11  11   6
## 12  12   6
## 13  13   6
## 14  14   6
## 15  15   6
## 16  16   6
## 17  17   6
## 18  18   6
## 19  19   6
## 20  20   6
## 2    2   7
## 6    6   7
## 2    2   8
## 6    6   8
## 15  15   8
## 18  18   8
## 2    2   9
## 6    6   9
## 15  15   9
## 18  18   9
## 2    2  10
## 6    6  10
## 14  14  10
## 15  15  10
## 18  18  10
## 6    6  11
## 15  15  11
## 18  18  11
## 6    6  12
## 15  15  12
## 18  18  12
## 2    2  13
## 4    4  13
## 6    6  13
## 14  14  13
## 15  15  13
## 18  18  13
## 6    6  14
## 10  10  14
## 13  13  14
## 15  15  14
## 18  18  14
## 1    1  15
## 2    2  15
## 3    3  15
## 6    6  15
## 8    8  15
## 9    9  15
## 10  10  15
## 11  11  15
## 12  12  15
## 13  13  15
## 14  14  15
## 16  16  15
## 17  17  15
## 2    2  16
## 6    6  16
## 15  15  16
## 18  18  16
## 6    6  17
## 15  15  17
## 18  18  17
## 1    1  18
## 2    2  18
## 3    3  18
## 6    6  18
## 8    8  18
## 9    9  18
## 10  10  18
## 11  11  18
## 12  12  18
## 13  13  18
## 14  14  18
## 16  16  18
## 17  17  18
## 6    6  19
## 1    1  20
## 6    6  20
# Biggest distance between elements: `did` lists many tied pairs, so only the
# first one is shown here (the column-side object, then the row-side object).
insurance_data_cor[did[1, 2], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1        0  60        0  11  67349        0       3       1
insurance_data_cor[did[1, 1], ]
##   KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2        0  43        0  11  91449   257252       0       0