Goal: calculate distances/similarities of multidimensional objects using the following techniques: a. Manhattan
b. Euclidean
c. Cosine
d. SMC
e. Jaccard
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1 0 60 0 11 67349 0 3 1
## 2 0 43 0 11 91449 257252 0 0
## 3 0 48 0 11 52881 0 2 0
## 4 0 35 1 10 16039 124191 3 1
## 7 0 34 1 12 125301 0 0 0
## 9 1 40 1 11 50815 0 2 1
# Number of objects (rows) included in the distance analysis.
ilgis <- 20
# Take the first `ilgis` observations of each attribute. The row count comes
# from `ilgis` instead of a repeated literal, because the heatmap code further
# down sizes itself from `ilgis` and the two must always agree.
x <- insurance_data$KIDSDRIV[seq_len(ilgis)]
y <- insurance_data$AGE[seq_len(ilgis)]
j <- insurance_data$HOMEKIDS[seq_len(ilgis)]
k <- insurance_data$YOJ[seq_len(ilgis)]
l <- insurance_data$INCOME[seq_len(ilgis)]
m <- insurance_data$HOME_VAL[seq_len(ilgis)]
r <- insurance_data$TRAVTIME[seq_len(ilgis)]
# One row per object, one column per attribute; this is the input to dist().
df <- data.frame(x, y, j, k, l, m, r)
# Matrix copy of the same values.
data <- as.matrix(df)
df
## x y j k l m r
## 1 0 60 0 11 67349 0 14
## 2 0 43 0 11 91449 257252 22
## 3 0 48 0 11 52881 0 26
## 4 0 35 1 10 16039 124191 5
## 5 0 34 1 12 125301 0 46
## 6 1 40 1 11 50815 0 21
## 7 0 44 2 12 43486 0 30
## 8 0 34 0 10 62978 0 34
## 9 0 50 0 7 106952 0 48
## 10 0 53 0 14 77100 0 15
## 11 0 43 0 5 52642 209970 36
## 12 0 55 0 11 59162 180232 25
## 13 0 53 0 11 130795 0 64
## 14 0 45 0 0 0 106859 48
## 15 0 39 3 12 51884 180951 43
## 16 0 59 0 12 87460 0 45
## 17 0 42 0 11 59945 192207 42
## 18 0 34 3 13 34019 91875 27
## 19 0 31 2 12 18903 93797 48
## 20 0 28 1 13 44077 170598 29
# Euclidean distance ----
# Full symmetric matrix of pairwise Euclidean distances between the rows of df.
dist_euc <- as.matrix(dist(df, method = "euclidean"))
dist_eu <- as.data.frame(dist_euc)
# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in a single vectorised step instead of growing a matrix with rbind()
# inside a loop. as.vector() unrolls the matrix column by column, so V1 (the
# row index) cycles fastest and V2 (the column index) changes every `ilgis`
# rows.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient(
    "Distance between\nelements",
    low = "pink", high = "purple"
  )
# Smallest distance ----
D <- dist_euc
# The matrix is already symmetric with a zero diagonal. Mask the diagonal and
# the upper triangle with Inf so that every unordered pair is considered once
# and an object is never matched with itself.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)
# Least distance between elements:
ats
## row col
## 6 6 3
# Show both members of the closest pair. Indexing by name takes the row and
# the column of the first match even when several pairs tie.
# NOTE(review): df was built from insurance_data, but the rows are displayed
# from insurance_data_cor; this assumes both share the same row order.
insurance_data_cor[ats[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9 1 40 1 11 50815 0 2 1
insurance_data_cor[ats[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3 0 48 0 11 52881 0 2 0
# ----------
# Largest distance ----
# The full matrix is searched, so each pair is reported twice (once per
# triangle); only the first match is displayed below.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
## row col
## 7 7 2
## 2 2 7
# biggest distance between elements:
insurance_data_cor[did[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2 0 43 0 11 91449 257252 0 0
insurance_data_cor[did[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 10 0 44 2 12 43486 0 0 0
# Manhattan distance ----
# Full symmetric matrix of pairwise Manhattan distances between the rows of
# df. The variable names dist_euc / dist_eu are reused from the Euclidean
# section; here they hold Manhattan distances.
dist_euc <- as.matrix(dist(df, method = "manhattan"))
dist_eu <- as.data.frame(dist_euc)
# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in a single vectorised step instead of growing a matrix with rbind()
# inside a loop. as.vector() unrolls the matrix column by column, so V1 (the
# row index) cycles fastest and V2 (the column index) changes every `ilgis`
# rows.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient(
    "Distance between elements",
    low = "yellow", high = "green"
  )
# Smallest distance ----
D <- dist_euc
# The matrix is already symmetric with a zero diagonal. Mask the diagonal and
# the upper triangle with Inf so that every unordered pair is considered once
# and an object is never matched with itself.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)
# Least distance between elements:
ats
## row col
## 6 6 3
# Show both members of the closest pair. Indexing by name takes the row and
# the column of the first match even when several pairs tie.
# NOTE(review): df was built from insurance_data, but the rows are displayed
# from insurance_data_cor; this assumes both share the same row order.
insurance_data_cor[ats[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9 1 40 1 11 50815 0 2 1
insurance_data_cor[ats[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3 0 48 0 11 52881 0 2 0
# ----------
# Largest distance ----
# The full matrix is searched, so each pair is reported twice (once per
# triangle); only the first match is displayed below.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
## row col
## 7 7 2
## 2 2 7
# biggest distance between elements:
insurance_data_cor[did[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2 0 43 0 11 91449 257252 0 0
insurance_data_cor[did[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 10 0 44 2 12 43486 0 0 0
# Chebyshev (maximum) distance ----
# Full symmetric matrix of pairwise maximum-coordinate distances between the
# rows of df. The variable names dist_euc / dist_eu are reused from the
# Euclidean section; here they hold Chebyshev distances.
dist_euc <- as.matrix(dist(df, method = "maximum"))
dist_eu <- as.data.frame(dist_euc)
# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in a single vectorised step instead of growing a matrix with rbind()
# inside a loop. as.vector() unrolls the matrix column by column, so V1 (the
# row index) cycles fastest and V2 (the column index) changes every `ilgis`
# rows.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient(
    "Distance between\nelements",
    low = "yellow", high = "darkorchid"
  )
# Smallest distance ----
D <- dist_euc
# The matrix is already symmetric with a zero diagonal. Mask the diagonal and
# the upper triangle with Inf so that every unordered pair is considered once
# and an object is never matched with itself.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)
# Least distance between elements:
ats
## row col
## 6 6 3
# Show both members of the closest pair. Indexing by name takes the row and
# the column of the first match even when several pairs tie.
# NOTE(review): df was built from insurance_data, but the rows are displayed
# from insurance_data_cor; this assumes both share the same row order.
insurance_data_cor[ats[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9 1 40 1 11 50815 0 2 1
insurance_data_cor[ats[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3 0 48 0 11 52881 0 2 0
# ----------
# Largest distance ----
# The full matrix is searched, so each pair is reported twice (once per
# triangle). Several pairs tie for the maximum here; only the first match is
# displayed below.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
## row col
## 2 2 1
## 1 1 2
## 3 3 2
## 5 5 2
## 6 6 2
## 7 7 2
## 8 8 2
## 9 9 2
## 10 10 2
## 13 13 2
## 16 16 2
## 2 2 3
## 2 2 5
## 2 2 6
## 2 2 7
## 2 2 8
## 2 2 9
## 2 2 10
## 2 2 13
## 2 2 16
# biggest distance between elements:
insurance_data_cor[did[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1 0 60 0 11 67349 0 3 1
insurance_data_cor[did[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2 0 43 0 11 91449 257252 0 0
#' Min-max scale a numeric vector to the range [0, 1]
#'
#' @param x A numeric vector.
#' @param na.rm If `TRUE`, missing values are ignored when finding the range
#'   and stay `NA` in the result. The default `FALSE` keeps the earlier
#'   behaviour: any missing value in `x` makes the whole result `NA`.
#' @return A numeric vector of the same length as `x`. A constant vector maps
#'   to all zeros rather than `NaN`, and empty input returns an empty vector.
normalize <- function(x, na.rm = FALSE) {
  n_missing <- sum(is.na(x))
  # Nothing to scale: empty input, all values missing, or missing values
  # present while na.rm is FALSE.
  if (n_missing == length(x) || (!na.rm && n_missing > 0L)) {
    return(rep(NA_real_, length(x)))
  }
  lowest <- min(x, na.rm = na.rm)
  span <- max(x, na.rm = na.rm) - lowest
  # A constant vector has zero span; dividing would give 0 / 0 = NaN, so
  # return the zero offsets directly. isTRUE() guards against a NaN span,
  # which arises when every value is infinite.
  if (isTRUE(span == 0)) {
    return(as.numeric(x - lowest))
  }
  (x - lowest) / span
}
# Apply normalize() to every attribute and rebuild the data frame from the
# rescaled columns, so that no single attribute dominates the distances.
df <- data.frame(
  x = normalize(x),
  y = normalize(y),
  j = normalize(j),
  k = normalize(k),
  l = normalize(l),
  m = normalize(m),
  r = normalize(r)
)
# Keep the standalone vectors in sync with the rescaled columns.
x <- df$x
y <- df$y
j <- df$j
k <- df$k
l <- df$l
m <- df$m
r <- df$r
# Matrix copy of the same values.
data <- as.matrix(df)
df
## x y j k l m r
## 1 0 1.00000 0.0000000 0.7857143 0.5149203 0.0000000 0.1525424
## 2 0 0.46875 0.0000000 0.7857143 0.6991781 1.0000000 0.2881356
## 3 0 0.62500 0.0000000 0.7857143 0.4043044 0.0000000 0.3559322
## 4 0 0.21875 0.3333333 0.7142857 0.1226270 0.4827601 0.0000000
## 5 0 0.18750 0.3333333 0.8571429 0.9579953 0.0000000 0.6949153
## 6 1 0.37500 0.3333333 0.7857143 0.3885087 0.0000000 0.2711864
## 7 0 0.50000 0.6666667 0.8571429 0.3324745 0.0000000 0.4237288
## 8 0 0.18750 0.0000000 0.7142857 0.4815016 0.0000000 0.4915254
## 9 0 0.68750 0.0000000 0.5000000 0.8177071 0.0000000 0.7288136
## 10 0 0.78125 0.0000000 1.0000000 0.5894721 0.0000000 0.1694915
## 11 0 0.46875 0.0000000 0.3571429 0.4024772 0.8162036 0.5254237
## 12 0 0.84375 0.0000000 0.7857143 0.4523262 0.7006049 0.3389831
## 13 0 0.78125 0.0000000 0.7857143 1.0000000 0.0000000 1.0000000
## 14 0 0.53125 0.0000000 0.0000000 0.0000000 0.4153865 0.7288136
## 15 0 0.34375 1.0000000 0.8571429 0.3966818 0.7033998 0.6440678
## 16 0 0.96875 0.0000000 0.8571429 0.6686800 0.0000000 0.6779661
## 17 0 0.43750 0.0000000 0.7857143 0.4583126 0.7471545 0.6271186
## 18 0 0.18750 1.0000000 0.9285714 0.2600940 0.3571401 0.3728814
## 19 0 0.09375 0.6666667 0.8571429 0.1445239 0.3646114 0.7288136
## 20 0 0.00000 0.3333333 0.9285714 0.3369930 0.6631552 0.4067797
# Euclidean distance (normalized data) ----
# Full symmetric matrix of pairwise Euclidean distances between the rows of df.
dist_euc <- as.matrix(dist(df, method = "euclidean"))
dist_eu <- as.data.frame(dist_euc)
# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in a single vectorised step instead of growing a matrix with rbind()
# inside a loop. as.vector() unrolls the matrix column by column, so V1 (the
# row index) cycles fastest and V2 (the column index) changes every `ilgis`
# rows.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient(
    "Distance between\nelements",
    low = "pink", high = "purple"
  )
# Smallest distance ----
D <- dist_euc
# The matrix is already symmetric with a zero diagonal. Mask the diagonal and
# the upper triangle with Inf so that every unordered pair is considered once
# and an object is never matched with itself.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)
# Least distance between elements:
ats
## row col
## 10 10 1
# Show both members of the closest pair. Indexing by name takes the row and
# the column of the first match even when several pairs tie.
# NOTE(review): df was built from insurance_data, but the rows are displayed
# from insurance_data_cor; this assumes both share the same row order.
insurance_data_cor[ats[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 14 0 53 0 14 77100 0 0 0
insurance_data_cor[ats[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1 0 60 0 11 67349 0 3 1
# ----------
# Largest distance ----
# The full matrix is searched, so each pair is reported twice (once per
# triangle); only the first match is displayed below.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
## row col
## 18 18 13
## 13 13 18
# biggest distance between elements:
insurance_data_cor[did[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 17 0 53 0 11 130795 0 3 0
insurance_data_cor[did[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 23 0 34 3 13 34019 91875 0 0
# Manhattan distance (normalized data) ----
# Full symmetric matrix of pairwise Manhattan distances between the rows of
# df. The variable names dist_euc / dist_eu are reused from the Euclidean
# section; here they hold Manhattan distances.
dist_euc <- as.matrix(dist(df, method = "manhattan"))
dist_eu <- as.data.frame(dist_euc)
# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in a single vectorised step instead of growing a matrix with rbind()
# inside a loop. as.vector() unrolls the matrix column by column, so V1 (the
# row index) cycles fastest and V2 (the column index) changes every `ilgis`
# rows.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient(
    "Distance between elements",
    low = "yellow", high = "green"
  )
# Smallest distance ----
D <- dist_euc
# The matrix is already symmetric with a zero diagonal. Mask the diagonal and
# the upper triangle with Inf so that every unordered pair is considered once
# and an object is never matched with itself.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)
# Least distance between elements:
ats
## row col
## 10 10 1
# Show both members of the closest pair. Indexing by name takes the row and
# the column of the first match even when several pairs tie.
# NOTE(review): df was built from insurance_data, but the rows are displayed
# from insurance_data_cor; this assumes both share the same row order.
insurance_data_cor[ats[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 14 0 53 0 14 77100 0 0 0
insurance_data_cor[ats[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1 0 60 0 11 67349 0 3 1
# ----------
# Largest distance ----
# The full matrix is searched, so each pair is reported twice (once per
# triangle); only the first match is displayed below.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
## row col
## 14 14 6
## 6 6 14
# biggest distance between elements:
insurance_data_cor[did[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 9 1 40 1 11 50815 0 2 1
insurance_data_cor[did[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 19 0 45 0 0 0 106859 3 0
# Chebyshev (maximum) distance (normalized data) ----
# Full symmetric matrix of pairwise maximum-coordinate distances between the
# rows of df. The variable names dist_euc / dist_eu are reused from the
# Euclidean section; here they hold Chebyshev distances.
dist_euc <- as.matrix(dist(df, method = "maximum"))
dist_eu <- as.data.frame(dist_euc)
# Long-format table for the heatmap: one row per (V1, V2) pair of objects.
# Built in a single vectorised step instead of growing a matrix with rbind()
# inside a loop. as.vector() unrolls the matrix column by column, so V1 (the
# row index) cycles fastest and V2 (the column index) changes every `ilgis`
# rows.
temp <- data.frame(
  V1 = factor(rep(seq_len(ilgis), times = ilgis), levels = seq_len(ilgis)),
  V2 = factor(rep(seq_len(ilgis), each = ilgis), levels = seq_len(ilgis)),
  Distance = as.vector(dist_euc)
)
ggplot(temp, aes(V1, V2, fill = Distance)) +
  labs(
    title = "Distance between elements",
    x = "Object Number", y = "Object Number"
  ) +
  geom_tile() +
  scale_fill_gradient(
    "Distance between\nelements",
    low = "yellow", high = "darkorchid"
  )
# Smallest distance ----
D <- dist_euc
# The matrix is already symmetric with a zero diagonal. Mask the diagonal and
# the upper triangle with Inf so that every unordered pair is considered once
# and an object is never matched with itself.
D[upper.tri(D, diag = TRUE)] <- Inf
ats <- which(D == min(D), arr.ind = TRUE)
# Least distance between elements:
ats
## row col
## 10 10 3
# Show both members of the closest pair. Indexing by name takes the row and
# the column of the first match even when several pairs tie.
# NOTE(review): df was built from insurance_data, but the rows are displayed
# from insurance_data_cor; this assumes both share the same row order.
insurance_data_cor[ats[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 14 0 53 0 14 77100 0 0 0
insurance_data_cor[ats[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 3 0 48 0 11 52881 0 2 0
# ----------
# Largest distance ----
# The full matrix is searched, so each pair is reported twice (once per
# triangle). Many pairs tie for the maximum, as the listing shows.
did <- which(dist_euc == max(dist_euc), arr.ind = TRUE)
did
## row col
## 2 2 1
## 6 6 1
## 15 15 1
## 18 18 1
## 20 20 1
## 1 1 2
## 3 3 2
## 5 5 2
## 6 6 2
## 7 7 2
## 8 8 2
## 9 9 2
## 10 10 2
## 13 13 2
## 15 15 2
## 16 16 2
## 18 18 2
## 2 2 3
## 6 6 3
## 15 15 3
## 18 18 3
## 6 6 4
## 13 13 4
## 2 2 5
## 6 6 5
## 1 1 6
## 2 2 6
## 3 3 6
## 4 4 6
## 5 5 6
## 7 7 6
## 8 8 6
## 9 9 6
## 10 10 6
## 11 11 6
## 12 12 6
## 13 13 6
## 14 14 6
## 15 15 6
## 16 16 6
## 17 17 6
## 18 18 6
## 19 19 6
## 20 20 6
## 2 2 7
## 6 6 7
## 2 2 8
## 6 6 8
## 15 15 8
## 18 18 8
## 2 2 9
## 6 6 9
## 15 15 9
## 18 18 9
## 2 2 10
## 6 6 10
## 14 14 10
## 15 15 10
## 18 18 10
## 6 6 11
## 15 15 11
## 18 18 11
## 6 6 12
## 15 15 12
## 18 18 12
## 2 2 13
## 4 4 13
## 6 6 13
## 14 14 13
## 15 15 13
## 18 18 13
## 6 6 14
## 10 10 14
## 13 13 14
## 15 15 14
## 18 18 14
## 1 1 15
## 2 2 15
## 3 3 15
## 6 6 15
## 8 8 15
## 9 9 15
## 10 10 15
## 11 11 15
## 12 12 15
## 13 13 15
## 14 14 15
## 16 16 15
## 17 17 15
## 2 2 16
## 6 6 16
## 15 15 16
## 18 18 16
## 6 6 17
## 15 15 17
## 18 18 17
## 1 1 18
## 2 2 18
## 3 3 18
## 6 6 18
## 8 8 18
## 9 9 18
## 10 10 18
## 11 11 18
## 12 12 18
## 13 13 18
## 14 14 18
## 16 16 18
## 17 17 18
## 6 6 19
## 1 1 20
## 6 6 20
# biggest distance between elements:
# Display both members of the first pair listed in `did`: the object given by
# its column index first, then the object given by its row index.
insurance_data_cor[did[1, "col"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 1 0 60 0 11 67349 0 3 1
insurance_data_cor[did[1, "row"], ]
## KIDSDRIV AGE HOMEKIDS YOJ INCOME HOME_VAL MVR_PTS REPEAT5
## 2 0 43 0 11 91449 257252 0 0