Package Ready

library(gsheet)
library(dplyr)

Start with Toy Example

# Toy data: four observations (rows) measured on two variables (columns).
# Each c(...) below is one row of the resulting 4 x 2 numeric matrix.
df <- rbind(
  c(4.1, 4.1),
  c(4.1, 2.7),
  c(2.7, 4.1),
  c(2.5, 2.5)
)
df
##      [,1] [,2]
## [1,]  4.1  4.1
## [2,]  4.1  2.7
## [3,]  2.7  4.1
## [4,]  2.5  2.5

Plot

# Scatter plot of the toy data, with one arrow from the origin to each row of
# df so the angle between any two observations is visible.
# Base graphics draw as a side effect and return nothing useful, so the calls
# are issued as separate statements: chaining them with `+` is ggplot2 syntax
# and only "worked" by adding empty return values together.
plot(df,
     xlim = c(0, 4.5),
     ylim = c(0, 4.5),
     xlab = "Econ",
     ylab = "Poli")
arrows(0, 0, df[1, 1], df[1, 2], length = 0.1, lwd = 2)
arrows(0, 0, df[2, 1], df[2, 2], length = 0.1, col = "red", lwd = 2)
arrows(0, 0, df[3, 1], df[3, 2], length = 0.1, col = "blue", lwd = 2)
arrows(0, 0, df[4, 1], df[4, 2], length = 0.1, col = "green", lwd = 3)

Write a function to calculate the similarity

\[ \cos(\theta) = \frac{\sum_{i=1}^n a_i\cdot b_i} {\sqrt{\sum_{i=1}^n a_i\cdot a_i}\cdot\sqrt{\sum_{i=1}^n b_i\cdot b_i}} \]

#' Cosine similarity between two numeric vectors
#'
#' Computes `sum(x * y) / (sqrt(sum(x * x)) * sqrt(sum(y * y)))`, i.e. the
#' cosine of the angle between `x` and `y`.
#'
#' @param x A numeric vector.
#' @param y A numeric vector with the same length as `x`.
#' @return A single number. The result is `NaN` when either vector has zero
#'   norm, because the denominator is then 0.
cosine <- function(x, y) {
  # Without this guard R would silently recycle the shorter vector and
  # return a number that is not a cosine similarity at all.
  stopifnot(length(x) == length(y))
  sum(x * y) / (sqrt(sum(x * x)) * sqrt(sum(y * y)))
}

# Sanity check: two perpendicular vectors have cosine similarity 0.
cosine((c(0,1)),c(1,0))
## [1] 0
# Cosine similarity for every pair of rows of df; the line after each call
# is the recorded result.
cosine(df[1,],df[2,])
## [1] 0.979457
cosine(df[1,],df[3,])
## [1] 0.979457
# Rows 1 and 4 point in the same direction (both have equal coordinates),
# so their similarity is 1 even though their magnitudes differ.
cosine(df[1,],df[4,])
## [1] 1
cosine(df[2,],df[3,])
## [1] 0.9186722
cosine(df[2,],df[4,])
## [1] 0.979457
cosine(df[3,],df[4,])
## [1] 0.979457

Matrix form calculation to get similarity

# Cosine similarity between every pair of rows, computed in one shot.
Matrix <- as.matrix(df)
# Euclidean length of each row.
row_norms <- sqrt(rowSums(Matrix * Matrix))
# Divide row i by its length so every row becomes a unit vector.
sim <- sweep(Matrix, 1, row_norms, "/")
# Dot products of unit vectors are the pairwise cosines.
sim <- sim %*% t(sim)
sim
##          [,1]      [,2]      [,3]     [,4]
## [1,] 1.000000 0.9794570 0.9794570 1.000000
## [2,] 0.979457 1.0000000 0.9186722 0.979457
## [3,] 0.979457 0.9186722 1.0000000 0.979457
## [4,] 1.000000 0.9794570 0.9794570 1.000000

How about Correlation?

# cor() on a matrix correlates its columns, so this is the 2 x 2 correlation
# between the two variables of df, not a row-by-row comparison like the
# cosine similarity matrix.
cor(df)
##           [,1]      [,2]
## [1,] 1.0000000 0.1365639
## [2,] 0.1365639 1.0000000

Get the data

# Alternative source: pick a local CSV file interactively.
# data <- read.csv(choose.files(), header = TRUE)
# Read the shared Google Sheet into a table, then show its size and contents.
sheet_url <- "https://docs.google.com/spreadsheets/d/1GHX-CtPwHbfwcX4wvs4m24WqfbwJ1ghVC7jgnwVWJ1U/edit?usp=sharing"
data <- gsheet2tbl(sheet_url)
print(dim(data))
## [1]  6 10
data
## # A tibble: 6 x 10
##   univ    policy  econ business   law   etc   num tuition ratio1 ratio2
##   <chr>    <dbl> <dbl>    <dbl> <dbl> <dbl> <dbl>   <dbl>  <dbl>  <dbl>
## 1 khu        0.4   0.3      0.2  0.05  0.05   120     300     10     20
## 2 yonsei     0.3   0.3      0.3  0.1   0      203     700     22     10
## 3 korea      0.4   0.4      0.2  0     0       80     400     17     25
## 4 hufs       0.5   0.3      0.1  0.1   0       45     300     20     20
## 5 hanyang    0.2   0.3      0.3  0.1   0.1     50     400     15     30
## 6 ewha       0.5   0.3      0.1  0.1   0       95     500     10     15

Matrix form calculation to get similarity

# Reshape the sheet so that every university becomes a column of `df` and
# every feature becomes a row.
# The first column of `data` holds the labels; all remaining columns are the
# measurements. Only the measurement columns are transposed: calling t() on
# the whole table would turn every number into text (because of the label
# column) and force a lossy text-to-number conversion afterwards.
features <- as.matrix(data[-1])
if (!is.numeric(features)) {
  stop("all columns after the first must be numeric", call. = FALSE)
}
rownames(features) <- as.character(data[[1]])
df <- as.data.frame(t(features))

# One row per column of df, so the similarity below is between df's columns.
Matrix <- t(as.matrix(df))
# Euclidean length of each row.
row_norms <- sqrt(rowSums(Matrix * Matrix))
# Scale every row to unit length.
sim <- sweep(Matrix, 1, row_norms, "/")
# Pairwise dot products of the unit rows give the cosine similarity matrix.
sim <- sim %*% t(sim)
sim
##               khu    yonsei     korea      hufs   hanyang      ewha
## khu     1.0000000 0.9940354 0.9833101 0.9728614 0.9674651 0.9809516
## yonsei  0.9940354 1.0000000 0.9952219 0.9891772 0.9857535 0.9953643
## korea   0.9833101 0.9952219 1.0000000 0.9985303 0.9972546 0.9992097
## hufs    0.9728614 0.9891772 0.9985303 1.0000000 0.9992560 0.9975280
## hanyang 0.9674651 0.9857535 0.9972546 0.9992560 1.0000000 0.9968427
## ewha    0.9809516 0.9953643 0.9992097 0.9975280 0.9968427 1.0000000

Correlation

# cor() correlates the columns of df. The columns here are the universities
# (see the labels in the recorded output), so this is a
# university-by-university correlation matrix.
cor(df)
##               khu    yonsei     korea      hufs   hanyang      ewha
## khu     1.0000000 0.9935781 0.9802924 0.9675563 0.9621013 0.9790002
## yonsei  0.9935781 1.0000000 0.9941584 0.9868130 0.9829980 0.9947214
## korea   0.9802924 0.9941584 1.0000000 0.9982190 0.9968986 0.9993288
## hufs    0.9675563 0.9868130 0.9982190 1.0000000 0.9992354 0.9971981
## hanyang 0.9621013 0.9829980 0.9968986 0.9992354 1.0000000 0.9962173
## ewha    0.9790002 0.9947214 0.9993288 0.9971981 0.9962173 1.0000000