library(dplyr)
library(reshape2)
library(gapminder)
# Find pairs of countries whose life expectancy over time is correlated.
cors <- gapminder %>%
  # spread to a wide matrix: one row per year, one column per country
  acast(year ~ country, value.var = "lifeExp") %>%
  # cor() correlates the columns pairwise -> country-by-country matrix
  cor() %>%
  # melt back to long form: one row per (country1, country2) pair
  melt(varnames = c("country1", "country2"), value.name = "correlation") %>%
  # as_tibble() replaces tbl_df(), which is deprecated since dplyr 1.0.0
  as_tibble()
cors
## Source: local data frame [20,164 x 3]
##
## country1 country2 correlation
## (fctr) (fctr) (dbl)
## 1 Afghanistan Afghanistan 1.0000000
## 2 Albania Afghanistan 0.9656953
## 3 Algeria Afghanistan 0.9868220
## 4 Angola Afghanistan 0.9855294
## 5 Argentina Afghanistan 0.9705203
## 6 Australia Afghanistan 0.9393751
## 7 Austria Afghanistan 0.9557228
## 8 Bahrain Afghanistan 0.9956190
## 9 Bangladesh Afghanistan 0.9466653
## 10 Belgium Afghanistan 0.9626319
## .. ... ... ...
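This pipeline hopped from one tidy format (one row per country and year) to another (one row per pair of countries), passing through a wide matrix in between, which allows easy manipulation: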
# Rank country pairs from most to least correlated. A country paired
# with itself always has correlation 1, so those rows are dropped first.
cross_country_cors <- filter(cors, country1 != country2)
arrange(cross_country_cors, desc(correlation))
## Source: local data frame [20,022 x 3]
##
## country1 country2 correlation
## (fctr) (fctr) (dbl)
## 1 Mauritania Indonesia 0.9996291
## 2 Indonesia Mauritania 0.9996291
## 3 Senegal Morocco 0.9995515
## 4 Morocco Senegal 0.9995515
## 5 West Bank and Gaza Saudi Arabia 0.9995156
## 6 Saudi Arabia West Bank and Gaza 0.9995156
## 7 France Brazil 0.9994246
## 8 Brazil France 0.9994246
## 9 Reunion Bahrain 0.9993649
## 10 Bahrain Reunion 0.9993649
## .. ... ... ...
(I realize that since life expectancy in almost all countries is just going up linearly over time, this isn’t a particularly interesting correlation metric, but there are many datasets where such between-group correlations are interesting.)