This vignette describes the composition and calculation of a summary statistic that we have referred to as the product of ranks. Others have called it the rank product (Breitling 2004; Eisinga 2013).

For \(N\) units having \(M\) metrics, \(i = 1 \dots N\), it is defined as:

\[ {RP}_i = \prod_{j=1}^{M}{{R_{ij}} \over {N_{j}}} \]

… where the rank \(R_{ij}\) of value \(X_{ij}\) (for unit \(i\) and metric \(j\)) is defined such that ranks vary from 1 (highest) to N (lowest), assuming no ties and no missing data. The terms of the cumulative product are the rank fractions, which under the same assumptions vary from \(\frac{1}{N}\) (highest) to \(1\) (lowest).

Details

Example code

suppressPackageStartupMessages({
  library(testthat)         # install.packages("testthat")
  library(dplyr)            # install.packages("dplyr")
  library(tidyr)            # install.packages("tidyr")
  library(CalEnviroScreen)  # devtools::install_github("baaqmd/CalEnviroScreen")
  })
data(CES2, package = "CalEnviroScreen")
CES2_tbl <- CES2_data %>% 
  gather(Variable, Value, -FIPS) %>%
  filter(!is.na(Value)) %>%
  group_by(Variable) %>%  
  mutate(Rank = rank(-Value, ties.method = "min"),  # rank: 1 (highest) to N (lowest)
         Frac = Rank / n())                         # rank fraction: 1/N (highest) to 1.0 (lowest)
cut_quantile <- function (x, n = 20, ..., na.rm = TRUE) {
  q <- quantile(x, seq(0, 1, len=n+1), na.rm = na.rm)
  cut(x, breaks = q, labels = names(q)[-length(q)], include.lowest = TRUE, ordered_result = TRUE, na.rm = na.rm)
}
CES2_rankprod <- CES2_tbl %>%
  group_by(FIPS) %>%
  summarize(k = n(), Score = mean(-log(Frac))) %>% # yields positive & increasing scores
  mutate(Range = cut_quantile(Score))

show(CES2_rankprod)
## Source: local data frame [8,035 x 4]
## 
##           FIPS  k  Score Range
## 1  06001400100 19 0.8014   25%
## 2  06001400200 19 0.7087   15%
## 3  06001400300 19 0.7087   15%
## 4  06001400400 19 0.8386   30%
## 5  06001400500 19 0.9633   45%
## 6  06001400600 19 0.9527   45%
## 7  06001400700 19 1.2256   70%
## 8  06001400800 19 1.6038   90%
## 9  06001400900 19 1.4199   80%
## 10 06001401000 19 1.5626   85%
## ..         ... ..    ...   ...
with(CES2_rankprod, table(Range) %>% rev() %>% cumsum() %>% head())
##  95%  90%  85%  80%  75%  70% 
##  402  804 1206 1607 2009 2411
with(CES2_rankprod, density(exp(-Score)) %>% plot(bty='n'))

plot of chunk density_plot

write.csv(CES2_rankprod, file = "CES2_rankprod.csv", row.names = FALSE)

References


Appendix

Equivalent approach to calculating ranks

rank_nonmissing <- function (x, ...) {
  i <- which(!is.na(x))
  as.integer(replace(x, i, rank(x[i], ...)))
}

rank1 <- CES2_tbl %>% select(FIPS, Variable, Rank) %>% spread(Variable, Rank)
rank2 <- CES2_data %>% mutate_each(funs(rank = rank_nonmissing(-1 * ., ties.method="min")), -FIPS)

show(rank2)
## Source: local data frame [8,035 x 20]
## 
##           FIPS Ozone PM25 DieselPM DrinkWat PestUse ToxRel Traffic Cleanup
## 1  06001400100  6258 6245     2663     7731    3012   3704    2634    3127
## 2  06001400200  6258 6263      584     7731    3012   3874    1459    5446
## 3  06001400300  6258 6269      589     7731    3012   3953    1487    4785
## 4  06001400400  6258 6284      591     7731    3012   3875    1750    5446
## 5  06001400500  6258 6306      584     7731    3012   3873    3210    3524
## 6  06001400600  6258 6296      602     7731    3012   3984     840    4162
## 7  06001400700  6258 6317      600     7731    3012   3995    3012    1661
## 8  06001400800  6258 6348      743     7731    3012   3989    4667      93
## 9  06001400900  6258 6338      686     7731    3012   4103    5708     214
## 10 06001401000  6258 6308      362     7731    3012   4135    1520     920
## ..         ...   ...  ...      ...      ...     ...    ...     ...     ...
## Variables not shown: GndWat (int), HazWaste (int), ImpWat (int), SolWaste
##   (int), Age (int), Asthma (int), LBW (int), Edu (int), LingIso (int),
##   Poverty (int), Unemp (int)
expect_equal(rank1, rank2)

Equivalent approach to calculating rank fractions

frac1 <- CES2_tbl %>% select(FIPS, Variable, Frac) %>% spread(Variable, Frac) 
frac2 <- rank2 %>% mutate_each(funs(frac = . / sum(!is.na(.))), -FIPS) 

show(frac2)
## Source: local data frame [8,035 x 20]
## 
##           FIPS  Ozone   PM25 DieselPM DrinkWat PestUse ToxRel Traffic
## 1  06001400100 0.7852 0.7861  0.33143   0.9664  0.3749 0.4617  0.3278
## 2  06001400200 0.7852 0.7884  0.07268   0.9664  0.3749 0.4829  0.1816
## 3  06001400300 0.7852 0.7891  0.07330   0.9664  0.3749 0.4927  0.1851
## 4  06001400400 0.7852 0.7910  0.07355   0.9664  0.3749 0.4830  0.2178
## 5  06001400500 0.7852 0.7938  0.07268   0.9664  0.3749 0.4827  0.3995
## 6  06001400600 0.7852 0.7925  0.07492   0.9664  0.3749 0.4966  0.1045
## 7  06001400700 0.7852 0.7952  0.07467   0.9664  0.3749 0.4979  0.3749
## 8  06001400800 0.7852 0.7991  0.09247   0.9664  0.3749 0.4972  0.5808
## 9  06001400900 0.7852 0.7978  0.08538   0.9664  0.3749 0.5114  0.7104
## 10 06001401000 0.7852 0.7941  0.04505   0.9664  0.3749 0.5154  0.1892
## ..         ...    ...    ...      ...      ...     ...    ...     ...
## Variables not shown: Cleanup (dbl), GndWat (dbl), HazWaste (dbl), ImpWat
##   (dbl), SolWaste (dbl), Age (dbl), Asthma (dbl), LBW (dbl), Edu (dbl),
##   LingIso (dbl), Poverty (dbl), Unemp (dbl)
expect_equal(frac1, frac2)

Equivalent approach to calculating scores

GM <- function (x) prod(na.omit(x)) ^ (1 / sum(!is.na(x)))
GM2 <- function (x) exp(mean(log(na.omit(x))))

score_GM <- function (.data) .data %>% select(-FIPS) %>% apply(GM, MARGIN = 1)
score_GM2 <- function (.data) .data %>% select(-FIPS) %>% apply(GM2, MARGIN = 1)

head(frac2 %>% score_GM)
## [1] 0.4487 0.4923 0.4923 0.4323 0.3816 0.3857
expect_equal(frac2 %>% score_GM, frac2 %>% score_GM2)
expect_equal(frac2 %>% score_GM, CES2_rankprod$Score %>% function (x) exp(-x))
expect_equal(-1 * log(frac2 %>% score_GM), CES2_rankprod$Score)

logGM <- function (x) log(prod(x, na.rm = TRUE)) / sum(!is.na(x))
score_logGM <- function (.data) .data %>% select(-FIPS) %>% apply(logGM, MARGIN = 1)
head(frac2 %>% score_logGM %>% exp())
## [1] 0.4487 0.4923 0.4923 0.4323 0.3816 0.3857