Sys.setlocale("LC_CTYPE","russian")
library(data.table)
library(readr)
library(dplyr)
library(stringr)
library(scales)
library(cowplot)

Intro

The idea of UpSet diagrams https://vcg.github.io/upset is brilliant and the R package https://github.com/hms-dbmi/UpSetR/ provides a wide set of option, yet there is always something that is missing. I am a real newbie in programming, so instead of looking into the code of UpSet and doing some magic cloning via Git, I decided to re-build the UpsetR diagram using a basic ggplot/cowplot functionality.

Data

Assume that there are many objects measured with 4 metrics - a, b, c, d.

Let’s generate some data (no matter how weird is my method):

dataw<- data.frame(obj=paste0("object_",1:50),
  a=ceiling(sample(0:1,50, replace=TRUE)*sample(0:50,50, replace=TRUE)),
  b=ceiling(sample(0:1,50, replace=TRUE)*sample(0:100,50, replace=TRUE)),
  c=ceiling(sample(0:1,50, replace=TRUE)*sample(0:50,50, replace=TRUE)),
  d=ceiling(sample(0:1,50, replace=TRUE)*sample(0:100,50, replace=TRUE)))
head(dataw)
##        obj  a  b  c  d
## 1 object_1 45  0 24 81
## 2 object_2 40  0  7 80
## 3 object_3  9  0  0  0
## 4 object_4 37 44 19  0
## 5 object_5  0 90  0  0
## 6 object_6  0  0 48 33

Now let’s tidy up our data:

dataw<- dataw %>% mutate(pp=1) %>%
  ## pp states for a valid (existing) observation, its value is always 1
  mutate(aa=as.integer(ifelse(a==0,0,1))) %>%
  mutate(bb=as.integer(ifelse(b==0,0,1))) %>%
  mutate(cc=as.integer(ifelse(c==0,0,1))) %>%
  mutate(dd=as.integer(ifelse(d==0,0,1))) %>%  
  # and let's code it via p (metric's value>0) and n (metric's value = 0)
  mutate(code=as.character(paste0(pp,aa,bb,cc,dd))) %>% 
  mutate(code=gsub("1","p",code)) %>% 
  mutate(code=gsub("0","n",code))
head(dataw)
##        obj  a  b  c  d pp aa bb cc dd  code
## 1 object_1 45  0 24 81  1  1  0  1  1 ppnpp
## 2 object_2 40  0  7 80  1  1  0  1  1 ppnpp
## 3 object_3  9  0  0  0  1  1  0  0  0 ppnnn
## 4 object_4 37 44 19  0  1  1  1  1  0 ppppn
## 5 object_5  0 90  0  0  1  0  1  0  0 pnpnn
## 6 object_6  0  0 48 33  1  0  0  1  1 pnnpp

I also have to convert the data into narrow format and prepare the dataframes fo visualization. Even though the pipes are very convenient, I do like to produce separate dataframes.

datan<- dataw %>% melt(measure.vars=c("pp","aa","bb","cc","dd"), variable.name="metric_name", value.name="metric")
head(datan)
##        obj  a  b  c  d  code metric_name metric
## 1 object_1 45  0 24 81 ppnpp          pp      1
## 2 object_2 40  0  7 80 ppnpp          pp      1
## 3 object_3  9  0  0  0 ppnnn          pp      1
## 4 object_4 37 44 19  0 ppppn          pp      1
## 5 object_5  0 90  0  0 pnpnn          pp      1
## 6 object_6  0  0 48 33 pnnpp          pp      1
data1<-datan %>% 
  group_by(code, metric_name, metric) %>% summarize(n=n_distinct(obj)) %>% 
  arrange(n)
head(data1)
## # A tibble: 6 x 4
## # Groups:   code, metric_name [6]
##   code  metric_name metric     n
##   <chr> <fct>        <dbl> <int>
## 1 pnnnp pp               1     1
## 2 pnnnp aa               0     1
## 3 pnnnp bb               0     1
## 4 pnnnp cc               0     1
## 5 pnnnp dd               1     1
## 6 pnppn pp               1     1
data2<-datan %>% 
  group_by(metric_name) %>% summarize(n=sum(metric))
head(data2)
## # A tibble: 5 x 2
##   metric_name     n
##   <fct>       <dbl>
## 1 pp             50
## 2 aa             24
## 3 bb             19
## 4 cc             31
## 5 dd             27
data4<-datan %>% 
  group_by(code) %>% summarize(n=n_distinct(obj)) %>% 
  arrange(desc(n))
head(data4)
## # A tibble: 6 x 2
##   code      n
##   <chr> <int>
## 1 ppnpp     7
## 2 pnnpn     6
## 3 pnnpp     6
## 4 pnppp     4
## 5 ppnnn     4
## 6 pppnp     4
data3<-dataw %>% left_join(.,data4)
## Joining, by = "code"
head(data3)
##        obj  a  b  c  d pp aa bb cc dd  code n
## 1 object_1 45  0 24 81  1  1  0  1  1 ppnpp 7
## 2 object_2 40  0  7 80  1  1  0  1  1 ppnpp 7
## 3 object_3  9  0  0  0  1  1  0  0  0 ppnnn 4
## 4 object_4 37 44 19  0  1  1  1  1  0 ppppn 3
## 5 object_5  0 90  0  0  1  0  1  0  0 pnpnn 2
## 6 object_6  0  0 48 33  1  0  0  1  1 pnnpp 6
codes<-data4 %>% arrange(desc(n)) 
scale_p<-as.character(codes$code)

cols <- c("0" = "white", "1" = "dark blue")  

point <- scales::format_format(big.mark = " ", decimal.mark = ".", scientific = FALSE, digits=0)  # cosy wrapper for format

Data

Now we are ready for visualization. There are 8 separate graphs to be later combined with cowplot function.

Chart1. Bar with a total number of objects/observations

emptybar<-ggplot(data3[1,])+
  geom_text(aes(x=-3, y=9, label="TOTAL\nno of objects"), 
            hjust=0, size=8)+
  geom_text(aes(x=-3, y=7.5, label=as.character(point(NROW(data3)))), 
            hjust=0, size=15, color="brown", fontface="bold")+
  scale_y_continuous(limits=c(-1,10),
                     breaks=c(0,1,3,5,7,9),
                     position="right")+
  scale_x_continuous(limits=c(-3,1.5))+  
  theme_minimal()+
  theme(legend.position = "none", 
        axis.title = element_blank(),
        axis.text = element_blank(),
        panel.grid = element_blank(),
        plot.margin = margin(0,0,0,0,"cm"))
emptybar

Chart2. Totals for intersections

bars_f<-
  ggplot(data1, aes(x=reorder(factor(code),-n), y=n))+
  geom_col(fill="brown", position="dodge")+
  geom_text(aes(label=point(n)),
            size=7,
            position = position_dodge(0.9),
            hjust=0.5, vjust=-0.25)+
  scale_x_discrete(limits=scale_p)+
  scale_y_continuous(expand = expand_scale(mult = c(0, .1)))+
  theme_minimal()+
  theme(legend.position = "none", 
        axis.title = element_blank(),
        axis.text = element_blank(),
        panel.grid = element_blank(),
        plot.margin = margin(0,0,0,0,"cm"))
bars_f

Chart 3. Totals for metrics

bars_metric_f<-
  ggplot(data2, aes(x=metric_name, y=n))+
  geom_col(fill="dark blue", position="dodge")+
  geom_text(aes(label=point(n)),
            position = position_dodge(0.9),
            size=7, hjust=1.1, vjust=0.5)+
  scale_x_discrete(position="top",
                   limits=c("dd","cc","bb","aa","pp"))+
  scale_y_reverse(labels=scales::format_format(big.mark = " ", decimal.mark = ".", scientific = FALSE, digits=0),expand = expand_scale(mult = c(0.6, 0)))+
  coord_flip()+
  theme_minimal()+
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size=18),
        panel.grid = element_blank(),
        plot.margin = margin(0,0,0,0,"cm"))
bars_metric_f

Chart 4. The Intersections itself

dots1_f<-
  ggplot(data1, aes(y=metric_name, x=reorder(factor(code),-n)))+
  geom_point(shape=21, size=6, colour="black", aes(fill=factor(metric)))+
  scale_fill_manual(values = cols)+
  scale_x_discrete(limits=scale_p)+
  scale_y_discrete(limits=c("dd","cc","bb","aa","pp"))+
  theme_minimal()+
  labs(x="", y="")+
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text = element_blank(),
        plot.margin = margin(0,0,0,0,"cm"))
dots1_f

Additional boxplots

Chart 5. Metric 1, right section

box1_f<-
  ggplot(data3, aes(x=reorder(factor(code),-n), y=a))+
  geom_boxplot(fill="coral", alpha=0.9, outlier.alpha = 0)+
  scale_y_continuous(limits=c(-1,max(data3$a)))+
  scale_x_discrete(limits=scale_p)+
  theme_minimal()+
  labs(x="", y="")+
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.line = element_line(),
        plot.margin = margin(0,0,0,0,"cm"))
box1_f

Chart 6. Metric 1, left section

box1_e<-
  ggplot(data3, aes(x=1, y=a))+
  geom_boxplot(fill="coral", outlier.alpha = 0)+
  geom_text(aes(x=-3, y=0.9*max(data3$a), label="TOTAL\n(metric a)"), 
            hjust=0, size=8)+
  geom_text(aes(x=-3, y=0.75*max(data3$a), label=as.character(point(sum(data3$a)))), 
            hjust=0, size=15, color="coral", fontface="bold")+
  scale_y_continuous(limits=c(-1,max(data3$a)),
                     breaks=pretty_breaks(n=5),
                     position="right")+

  scale_x_continuous(limits=c(-3,1.5))+  
  theme_minimal()+
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size=18, hjust=0),
        axis.ticks.y =element_line(),
        axis.line = element_blank(),
        panel.grid = element_blank(),
        plot.margin = margin(0,0,0,0,"cm"))
box1_e

Chart 7. Metric 2, right section

box2_f<-
  ggplot(data3, aes(x=reorder(factor(code),-n), y=b))+
  geom_boxplot(fill="green", outlier.alpha = 0)+
  scale_y_continuous(limits=c(-1,max(data3$b)))+
  scale_x_discrete(limits=scale_p)+
  theme_minimal()+
  labs(x="", y="")+
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text = element_blank(),
        axis.line = element_line(),
        plot.margin = margin(0,0,0,0,"cm"))
box2_f

Chart 8. Metric 2, left section

box2_e<-ggplot(data3, aes(x=1, y=b))+
  geom_boxplot(fill="green", outlier.alpha = 0)+
  geom_text(aes(x=-3, y=0.9*max(data3$b), label="TOTAL\n(metric b)"), 
            hjust=0, size=8)+
  geom_text(aes(x=-3, y=0.75*max(data3$b), label=as.character(point(sum(data3$b)))), 
            hjust=0, size=15, color="green", fontface="bold")+
  scale_y_continuous(limits=c(-1,max(data3$b)),
                     breaks=pretty_breaks(n=5),
                     position="right")+
  scale_x_continuous(limits=c(-3,1.5))+  
  theme_minimal()+
  theme(legend.position = "none",
        axis.title = element_blank(),
        axis.text.x = element_blank(),
        axis.text.y = element_text(size=18, hjust=1),
        axis.line.y = element_blank(),
        axis.ticks.y =element_line(), 
        panel.grid = element_blank(),
        plot.margin = margin(0,0,0,0,"cm"))
box2_e

UpSet

So now we can build it

p2s<- plot_grid(emptybar, bars_f, 
            bars_metric_f, dots1_f,
            box1_e,box1_f,
            box2_e, box2_f,
            labels=c("A","","","","B","","C",""),
            label_size = 24,
            ncol=2, align="h", 
            rel_widths = c(1,3.5, 1,3.5), rel_heights = c(2,1,2,2))

p2s