Sys.setlocale("LC_CTYPE","russian")
library(data.table)
library(readr)
library(dplyr)
library(stringr)
library(scales)
library(cowplot)
The idea of UpSet diagrams https://vcg.github.io/upset is brilliant and the R package https://github.com/hms-dbmi/UpSetR/ provides a wide set of option, yet there is always something that is missing. I am a real newbie in programming, so instead of looking into the code of UpSet and doing some magic cloning via Git, I decided to re-build the UpsetR diagram using a basic ggplot/cowplot functionality.
Assume that there are many objects measured with 4 metrics - a, b, c, d.
Let’s generate some data (no matter how weird is my method):
dataw<- data.frame(obj=paste0("object_",1:50),
a=ceiling(sample(0:1,50, replace=TRUE)*sample(0:50,50, replace=TRUE)),
b=ceiling(sample(0:1,50, replace=TRUE)*sample(0:100,50, replace=TRUE)),
c=ceiling(sample(0:1,50, replace=TRUE)*sample(0:50,50, replace=TRUE)),
d=ceiling(sample(0:1,50, replace=TRUE)*sample(0:100,50, replace=TRUE)))
head(dataw)
## obj a b c d
## 1 object_1 45 0 24 81
## 2 object_2 40 0 7 80
## 3 object_3 9 0 0 0
## 4 object_4 37 44 19 0
## 5 object_5 0 90 0 0
## 6 object_6 0 0 48 33
Now let’s tidy up our data:
dataw<- dataw %>% mutate(pp=1) %>%
## pp states for a valid (existing) observation, its value is always 1
mutate(aa=as.integer(ifelse(a==0,0,1))) %>%
mutate(bb=as.integer(ifelse(b==0,0,1))) %>%
mutate(cc=as.integer(ifelse(c==0,0,1))) %>%
mutate(dd=as.integer(ifelse(d==0,0,1))) %>%
# and let's code it via p (metric's value>0) and n (metric's value = 0)
mutate(code=as.character(paste0(pp,aa,bb,cc,dd))) %>%
mutate(code=gsub("1","p",code)) %>%
mutate(code=gsub("0","n",code))
head(dataw)
## obj a b c d pp aa bb cc dd code
## 1 object_1 45 0 24 81 1 1 0 1 1 ppnpp
## 2 object_2 40 0 7 80 1 1 0 1 1 ppnpp
## 3 object_3 9 0 0 0 1 1 0 0 0 ppnnn
## 4 object_4 37 44 19 0 1 1 1 1 0 ppppn
## 5 object_5 0 90 0 0 1 0 1 0 0 pnpnn
## 6 object_6 0 0 48 33 1 0 0 1 1 pnnpp
I also have to convert the data into narrow format and prepare the dataframes fo visualization. Even though the pipes are very convenient, I do like to produce separate dataframes.
datan<- dataw %>% melt(measure.vars=c("pp","aa","bb","cc","dd"), variable.name="metric_name", value.name="metric")
head(datan)
## obj a b c d code metric_name metric
## 1 object_1 45 0 24 81 ppnpp pp 1
## 2 object_2 40 0 7 80 ppnpp pp 1
## 3 object_3 9 0 0 0 ppnnn pp 1
## 4 object_4 37 44 19 0 ppppn pp 1
## 5 object_5 0 90 0 0 pnpnn pp 1
## 6 object_6 0 0 48 33 pnnpp pp 1
data1<-datan %>%
group_by(code, metric_name, metric) %>% summarize(n=n_distinct(obj)) %>%
arrange(n)
head(data1)
## # A tibble: 6 x 4
## # Groups: code, metric_name [6]
## code metric_name metric n
## <chr> <fct> <dbl> <int>
## 1 pnnnp pp 1 1
## 2 pnnnp aa 0 1
## 3 pnnnp bb 0 1
## 4 pnnnp cc 0 1
## 5 pnnnp dd 1 1
## 6 pnppn pp 1 1
data2<-datan %>%
group_by(metric_name) %>% summarize(n=sum(metric))
head(data2)
## # A tibble: 5 x 2
## metric_name n
## <fct> <dbl>
## 1 pp 50
## 2 aa 24
## 3 bb 19
## 4 cc 31
## 5 dd 27
data4<-datan %>%
group_by(code) %>% summarize(n=n_distinct(obj)) %>%
arrange(desc(n))
head(data4)
## # A tibble: 6 x 2
## code n
## <chr> <int>
## 1 ppnpp 7
## 2 pnnpn 6
## 3 pnnpp 6
## 4 pnppp 4
## 5 ppnnn 4
## 6 pppnp 4
data3<-dataw %>% left_join(.,data4)
## Joining, by = "code"
head(data3)
## obj a b c d pp aa bb cc dd code n
## 1 object_1 45 0 24 81 1 1 0 1 1 ppnpp 7
## 2 object_2 40 0 7 80 1 1 0 1 1 ppnpp 7
## 3 object_3 9 0 0 0 1 1 0 0 0 ppnnn 4
## 4 object_4 37 44 19 0 1 1 1 1 0 ppppn 3
## 5 object_5 0 90 0 0 1 0 1 0 0 pnpnn 2
## 6 object_6 0 0 48 33 1 0 0 1 1 pnnpp 6
codes<-data4 %>% arrange(desc(n))
scale_p<-as.character(codes$code)
cols <- c("0" = "white", "1" = "dark blue")
point <- scales::format_format(big.mark = " ", decimal.mark = ".", scientific = FALSE, digits=0) # cosy wrapper for format
Now we are ready for visualization. There are 8 separate graphs to be later combined with cowplot function.
emptybar<-ggplot(data3[1,])+
geom_text(aes(x=-3, y=9, label="TOTAL\nno of objects"),
hjust=0, size=8)+
geom_text(aes(x=-3, y=7.5, label=as.character(point(NROW(data3)))),
hjust=0, size=15, color="brown", fontface="bold")+
scale_y_continuous(limits=c(-1,10),
breaks=c(0,1,3,5,7,9),
position="right")+
scale_x_continuous(limits=c(-3,1.5))+
theme_minimal()+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text = element_blank(),
panel.grid = element_blank(),
plot.margin = margin(0,0,0,0,"cm"))
emptybar
bars_f<-
ggplot(data1, aes(x=reorder(factor(code),-n), y=n))+
geom_col(fill="brown", position="dodge")+
geom_text(aes(label=point(n)),
size=7,
position = position_dodge(0.9),
hjust=0.5, vjust=-0.25)+
scale_x_discrete(limits=scale_p)+
scale_y_continuous(expand = expand_scale(mult = c(0, .1)))+
theme_minimal()+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text = element_blank(),
panel.grid = element_blank(),
plot.margin = margin(0,0,0,0,"cm"))
bars_f
bars_metric_f<-
ggplot(data2, aes(x=metric_name, y=n))+
geom_col(fill="dark blue", position="dodge")+
geom_text(aes(label=point(n)),
position = position_dodge(0.9),
size=7, hjust=1.1, vjust=0.5)+
scale_x_discrete(position="top",
limits=c("dd","cc","bb","aa","pp"))+
scale_y_reverse(labels=scales::format_format(big.mark = " ", decimal.mark = ".", scientific = FALSE, digits=0),expand = expand_scale(mult = c(0.6, 0)))+
coord_flip()+
theme_minimal()+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size=18),
panel.grid = element_blank(),
plot.margin = margin(0,0,0,0,"cm"))
bars_metric_f
dots1_f<-
ggplot(data1, aes(y=metric_name, x=reorder(factor(code),-n)))+
geom_point(shape=21, size=6, colour="black", aes(fill=factor(metric)))+
scale_fill_manual(values = cols)+
scale_x_discrete(limits=scale_p)+
scale_y_discrete(limits=c("dd","cc","bb","aa","pp"))+
theme_minimal()+
labs(x="", y="")+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text = element_blank(),
plot.margin = margin(0,0,0,0,"cm"))
dots1_f
box1_f<-
ggplot(data3, aes(x=reorder(factor(code),-n), y=a))+
geom_boxplot(fill="coral", alpha=0.9, outlier.alpha = 0)+
scale_y_continuous(limits=c(-1,max(data3$a)))+
scale_x_discrete(limits=scale_p)+
theme_minimal()+
labs(x="", y="")+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text = element_blank(),
axis.line = element_line(),
plot.margin = margin(0,0,0,0,"cm"))
box1_f
box1_e<-
ggplot(data3, aes(x=1, y=a))+
geom_boxplot(fill="coral", outlier.alpha = 0)+
geom_text(aes(x=-3, y=0.9*max(data3$a), label="TOTAL\n(metric a)"),
hjust=0, size=8)+
geom_text(aes(x=-3, y=0.75*max(data3$a), label=as.character(point(sum(data3$a)))),
hjust=0, size=15, color="coral", fontface="bold")+
scale_y_continuous(limits=c(-1,max(data3$a)),
breaks=pretty_breaks(n=5),
position="right")+
scale_x_continuous(limits=c(-3,1.5))+
theme_minimal()+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size=18, hjust=0),
axis.ticks.y =element_line(),
axis.line = element_blank(),
panel.grid = element_blank(),
plot.margin = margin(0,0,0,0,"cm"))
box1_e
box2_f<-
ggplot(data3, aes(x=reorder(factor(code),-n), y=b))+
geom_boxplot(fill="green", outlier.alpha = 0)+
scale_y_continuous(limits=c(-1,max(data3$b)))+
scale_x_discrete(limits=scale_p)+
theme_minimal()+
labs(x="", y="")+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text = element_blank(),
axis.line = element_line(),
plot.margin = margin(0,0,0,0,"cm"))
box2_f
box2_e<-ggplot(data3, aes(x=1, y=b))+
geom_boxplot(fill="green", outlier.alpha = 0)+
geom_text(aes(x=-3, y=0.9*max(data3$b), label="TOTAL\n(metric b)"),
hjust=0, size=8)+
geom_text(aes(x=-3, y=0.75*max(data3$b), label=as.character(point(sum(data3$b)))),
hjust=0, size=15, color="green", fontface="bold")+
scale_y_continuous(limits=c(-1,max(data3$b)),
breaks=pretty_breaks(n=5),
position="right")+
scale_x_continuous(limits=c(-3,1.5))+
theme_minimal()+
theme(legend.position = "none",
axis.title = element_blank(),
axis.text.x = element_blank(),
axis.text.y = element_text(size=18, hjust=1),
axis.line.y = element_blank(),
axis.ticks.y =element_line(),
panel.grid = element_blank(),
plot.margin = margin(0,0,0,0,"cm"))
box2_e
So now we can build it
p2s<- plot_grid(emptybar, bars_f,
bars_metric_f, dots1_f,
box1_e,box1_f,
box2_e, box2_f,
labels=c("A","","","","B","","C",""),
label_size = 24,
ncol=2, align="h",
rel_widths = c(1,3.5, 1,3.5), rel_heights = c(2,1,2,2))
p2s