This document contains the analysis for the paper “The minimum information for the reporting of an ontology (MIRO) guidelines”, submitted to the Journal of Biomedical Semantics (JBMS) in March 2017.
Demographics
# Base font size used by the demographic charts further down.
fontsize <- 16
## Role Analysis
# One frequency table per role column. In the recorded output the first row
# has an empty value -- presumably respondents who did not select that role.
(count(df_results[["role_user"]]))
## x freq
## 1 18
## 2 U 92
(count(df_results[["role_developer"]]))
## x freq
## 1 29
## 2 D 81
(count(df_results[["role_review"]]))
## x freq
## 1 48
## 2 W 62
(count(df_results[["role_reader"]]))
## x freq
## 1 17
## 2 R 93
(count(df_results[["role_author"]]))
## x freq
## 1 35
## 2 A 75
# Per-role totals: column sums of the r_* columns (presumably 0/1 flags --
# confirm against the data preparation step), relabelled for display.
colctrole <- colSums(
  df_results[c("r_user", "r_developer", "r_review", "r_reader", "r_author")]
)
names(colctrole) <- c("User", "Developer", "Reviewer", "Reader", "Author")
# Reshape to a data frame and expose the role label as a plottable column.
colctrole <- melt(colctrole)
colctrole$id <- rownames(colctrole)
# Horizontal bar chart of the totals.
ggplot(colctrole, aes(id, value)) +
  geom_bar(stat = "identity") +
  xlab("") +
  ylab("") +
  theme(text = element_text(size = fontsize)) +
  coord_flip() +
  theme(plot.margin = unit(c(0, 0, -0.5, -0.5), "cm"))

# ggsave() without a plot argument writes the most recently displayed plot.
ggsave(paste0(chartdir, "role.pdf"), width = 3, height = 2.5)
### Cross-correlation matrix for role
# Give the role columns display labels, correlate every pair of columns
# (cor() with its default method) and print the matrix as a table.
names(df_role) <- c("Developer", "User", "Reader", "Author", "Reviewer")
corMat <- cor(df_role)
kable(corMat)
| Developer |
1.0000000 |
0.2930597 |
0.3149787 |
0.4329147 |
0.2223921 |
| User |
0.2930597 |
1.0000000 |
0.6266743 |
0.2781847 |
0.3045090 |
| Reader |
0.3149787 |
0.6266743 |
1.0000000 |
0.4098792 |
0.4352007 |
| Author |
0.4329147 |
0.2781847 |
0.4098792 |
1.0000000 |
0.4615392 |
| Reviewer |
0.2223921 |
0.3045090 |
0.4352007 |
0.4615392 |
1.0000000 |
# Long format (Var1, Var2, value): one row per pair of roles, i.e. one tile.
melted_cormat <- melt(corMat)
# Grey-scale heat map of the correlations (white = low, black = high).
ggplot(data = melted_cormat, aes(x = Var1, y = Var2, fill = value)) +
  geom_tile() +
  xlab("") +
  ylab("") +
  scale_fill_gradient(low = "white", high = "black") +
  theme(text = element_text(size = fontsize)) +
  theme(plot.margin = unit(c(0, 0, -0.5, 0), "cm"))

ggsave(paste0(chartdir, "role_cormat.pdf"), width = 7, height = 2.5)
### Combinations of ontology roles
# Frequency of each distinct value of the combined role column.
ct_r <- count(df_results[["role"]])
# Share of all respondents, as a percentage rounded to two decimals.
ct_r[["pc"]] <- round(ct_r[["freq"]] / nrow(df_results) * 100, 2)
# Most frequent combination first.
kable(ct_r[order(-ct_r[["freq"]]), ])
| 11 |
DURAW |
48 |
43.64 |
| 10 |
DURA |
13 |
11.82 |
| 9 |
DUR |
9 |
8.18 |
| 1 |
|
5 |
4.55 |
| 17 |
UR |
5 |
4.55 |
| 20 |
URW |
5 |
4.55 |
| 19 |
URAW |
4 |
3.64 |
| 3 |
D |
3 |
2.73 |
| 16 |
U |
3 |
2.73 |
| 2 |
A |
2 |
1.82 |
| 5 |
DRA |
2 |
1.82 |
| 15 |
RAW |
2 |
1.82 |
| 18 |
URA |
2 |
1.82 |
| 4 |
DA |
1 |
0.91 |
| 6 |
DRW |
1 |
0.91 |
| 7 |
DU |
1 |
0.91 |
| 8 |
DUA |
1 |
0.91 |
| 12 |
DURW |
1 |
0.91 |
| 13 |
DW |
1 |
0.91 |
| 14 |
R |
1 |
0.91 |
## TDL Analysis
# Frequency of each value in the tdl column, sorted by descending count.
ct_tdl <- count(df_results[["tdl"]])
ct_tdl <- ct_tdl[order(-ct_tdl[["freq"]]), ]
# Bars are ordered along the axis by ascending frequency via reorder().
ggplot(ct_tdl, aes(reorder(x, freq), freq)) +
  geom_bar(stat = "identity") +
  xlab("") +
  ylab("") +
  theme(text = element_text(size = fontsize)) +
  theme(plot.margin = unit(c(0, 0, -0.5, 0), "cm"))

ggsave(paste0(chartdir, "tdl.pdf"), width = 6, height = 2.5)
## Job analysis
# Frequency of each value in the job column, drawn as horizontal bars
# ordered by count.
ct <- count(df_results[["job"]])
ggplot(ct, aes(x = reorder(x, freq), y = freq)) +
  geom_bar(stat = "identity") +
  ylab("") +
  xlab("") +
  coord_flip() +
  theme(text = element_text(size = fontsize)) +
  theme(plot.margin = unit(c(0, 0, -0.5, -0.5), "cm"))

ggsave(paste0(chartdir, "job.pdf"), width = 4, height = 2.5)
# Export the role correlation matrix computed earlier; the file is written
# relative to the current working directory.
write.csv(corMat, file = "correlation_matrix_usergroups.csv")
Overall ranking
## Create ranking by mean
# Keep only the columns whose name starts with "o_" (the rated items),
# plus the respondent id needed for reshaping and joining.
df <- df_results[, grepl("^o_", names(df_results))]
df$id <- df_results$id
# One row per (respondent, item) rating.
df_long <- melt(df, id.vars = "id")
# Attach each respondent's role columns.
df_long <- merge(
  df_long,
  df_results[, c("id", "r_developer", "r_user", "r_author", "r_review", "r_reader", "role")],
  by = "id"
)
# Attach the display label of each item (dfl maps column name -> label).
df_long <- merge(
  df_long,
  dfl[c("col", "label")],
  by.x = "variable",
  by.y = "col"
)
df_long$label <- as.factor(df_long$label)
# Mean rating per label, ignoring missing answers, highest mean first.
df_long_agg <- aggregate(
  df_long$value,
  by = list(df_long$label),
  FUN = mean,
  na.rm = TRUE
)
df_long_agg <- df_long_agg[order(-df_long_agg$x), ]
# Re-level the labels in that order so plots list the items by mean rating.
df_long$label <- factor(df_long$label, levels = df_long_agg$Group.1)
#head(df_long[c("variable","value")])
### Ordered mean rating of information items across all participants
# Bar per information item showing its mean rating, with reference lines at
# 3.0, 3.5 and 5 and a text label for each rating band.
#
# The band labels are added with annotate() rather than geom_text(aes(...)):
# a geom_text layer with constant aesthetics inherits df_long and draws the
# same label once per data row, which overplots the text and bloats the PDF.
# NOTE(review): `fun.y` was renamed to `fun` in ggplot2 3.3.0; it is kept
# here to stay compatible with the ggplot2 version this report was built with.
ggplot(df_long, aes(x = label, y = value)) +
  stat_summary(fun.y = mean, geom = "bar") +
  geom_hline(yintercept = 5) +
  ylab("Mean Rating") +
  xlab("") +
  geom_hline(yintercept = 3.0) +
  geom_hline(yintercept = 3.5) +
  # Zoom the rating axis without dropping data (coord limits, not scale limits).
  coord_flip(ylim = c(2.6, 5)) +
  theme(text = element_text(size = 10)) +
  theme(plot.margin = unit(c(0, 0, -0.5, -0.5), "cm")) +
  annotate("text", x = 29.81, y = 3.1, label = "SHOULD",
           hjust = 0, vjust = 0, size = 2.8) +
  # White label positioned below the visible range, as in the original chart.
  annotate("text", x = 29.81, y = 2.5, label = "OPTIONAL",
           hjust = 0, vjust = 0, size = 2.8, colour = "white") +
  annotate("text", x = 29.81, y = 4.1, label = "MUST",
           hjust = 0, vjust = 0, size = 2.8)
## Warning: Removed 38 rows containing non-finite values (stat_summary).

ggsave(paste0(chartdir, "overall_mean_results.pdf"), width = 6, height = 6)
## Warning: Removed 38 rows containing non-finite values (stat_summary).
## Rankings across roles
# aggregate_by_role() is defined elsewhere; the columns used below
# (ontology_feature, All, mean.all, median.all, sd.all) come from its result.
df_long_agg <- aggregate_by_role(df_long)
write.csv(df_long_agg, file = "priority_by_group.csv")
# Strongly prefer fixed over scientific notation in printed numbers.
options(scipen = 100)
# List the items from lowest to highest standard deviation.
df_long_agg <- df_long_agg[order(df_long_agg[["sd.all"]]), ]
kable(
  df_long_agg[c("ontology_feature", "All", "mean.all", "median.all", "sd.all")]
)
| 1 |
Basics: Ontology URL |
1 |
4.715596 |
5 |
0.6816611 |
| 2 |
Basics: Ontology name |
2 |
4.706422 |
5 |
0.6979169 |
| 4 |
Basics: Ontology license |
4 |
4.504587 |
5 |
0.7890906 |
| 6 |
SRD: Scope and coverage |
6 |
4.148148 |
4 |
0.8407253 |
| 25 |
SRD: Development community |
25 |
3.768518 |
4 |
0.8604116 |
| 3 |
Basics: Ontology owner |
3 |
4.527778 |
5 |
0.8696152 |
| 7 |
Content: Ontology relationships |
7 |
4.128440 |
4 |
0.8829760 |
| 9 |
Content: Incorporation of other ontologies |
9 |
4.091743 |
4 |
0.9481101 |
| 13 |
Motivation: Target audience |
13 |
3.944954 |
4 |
0.9606601 |
| 24 |
Content: Axiom patterns |
24 |
3.796296 |
4 |
0.9644064 |
| 5 |
QA: Examples of use |
5 |
4.192661 |
5 |
0.9857979 |
| 14 |
KA: Knowledge acquisition methodology |
14 |
3.926605 |
4 |
0.9879498 |
| 16 |
Content: Entity metadata policy |
16 |
3.889908 |
4 |
1.0214319 |
| 8 |
Content: KR language |
8 |
4.110092 |
4 |
1.0304570 |
| 17 |
Content: Upper ontology |
17 |
3.880734 |
4 |
1.0339136 |
| 22 |
Change: Versioning policy |
23 |
3.798165 |
4 |
1.0344064 |
| 18 |
QA: Testing |
18 |
3.871560 |
4 |
1.0372767 |
| 28 |
KA: Content selection |
28 |
3.379630 |
4 |
1.0386691 |
| 26 |
Content: Entity naming convention |
26 |
3.743119 |
4 |
1.0399757 |
| 10 |
Basics: Ontology repository |
10 |
4.009174 |
4 |
1.0407922 |
| 21 |
Change: Entity deprecation strategy |
21 |
3.834862 |
4 |
1.0673068 |
| 12 |
Motivation: Competition |
12 |
3.963303 |
4 |
1.0708823 |
| 20 |
Motivation: Need |
20 |
3.851852 |
4 |
1.0835630 |
| 19 |
Content: Identifier generation policy |
19 |
3.862385 |
4 |
1.0841270 |
| 11 |
QA: Evaluation |
11 |
3.990826 |
4 |
1.0843620 |
| 23 |
SRD: Communication |
22 |
3.798165 |
4 |
1.0867878 |
| 15 |
Change: Sustainability plan |
15 |
3.889908 |
4 |
1.0915454 |
| 29 |
KA: Source knowledge location |
29 |
3.357798 |
3 |
1.0931007 |
| 27 |
Content: Ontology metrics |
27 |
3.422018 |
3 |
1.1808156 |
| 30 |
Content: Development environment |
30 |
2.878505 |
3 |
1.3010176 |
#print(xtable(df_long_agg[c("ontology_feature","All","mean.all","median.all","sd.all")],digits=c(0,0,0,2,0,2)),include.rownames=FALSE)
### Correlation of standard deviation and mean
# 2x2 correlation matrix between the per-item standard deviation and mean.
kable(
  cor(df_long_agg[c("sd.all", "mean.all")])
)
| sd.all |
1.0000000 |
-0.8458595 |
| mean.all |
-0.8458595 |
1.0000000 |
### Ranking table
# Keep the item label plus the overall and per-role rank columns,
# ordered by the overall rank.
dfl_i <- df_long_agg[
  c("ontology_feature", "All", "Author", "Developer", "Reviewer", "User", "Reader")
]
dfl_i <- dfl_i[order(dfl_i[["All"]]), ]
kable(dfl_i)
| 1 |
Basics: Ontology URL |
1 |
2 |
2 |
2 |
2 |
2 |
| 2 |
Basics: Ontology name |
2 |
1 |
1 |
1 |
1 |
1 |
| 3 |
Basics: Ontology owner |
3 |
3 |
3 |
3 |
3 |
3 |
| 4 |
Basics: Ontology license |
4 |
4 |
4 |
4 |
4 |
4 |
| 5 |
QA: Examples of use |
5 |
5 |
8 |
5 |
5 |
5 |
| 6 |
SRD: Scope and coverage |
6 |
8 |
6 |
6 |
8 |
6 |
| 7 |
Content: Ontology relationships |
7 |
7 |
7 |
7 |
7 |
7 |
| 8 |
Content: KR language |
8 |
6 |
5 |
9 |
9 |
8 |
| 9 |
Content: Incorporation of other ontologies |
9 |
10 |
9 |
8 |
6 |
9 |
| 10 |
Basics: Ontology repository |
10 |
14 |
12 |
10 |
10 |
11 |
| 11 |
QA: Evaluation |
11 |
9 |
14 |
11 |
11 |
12 |
| 12 |
Motivation: Competition |
12 |
12 |
11 |
13 |
13 |
14 |
| 13 |
Motivation: Target audience |
13 |
15 |
13 |
12 |
15 |
13 |
| 14 |
KA: Knowledge acquisition methodology |
14 |
21 |
16 |
20 |
17 |
20 |
| 15 |
Change: Sustainability plan |
15 |
13 |
10 |
16 |
12 |
10 |
| 16 |
Content: Entity metadata policy |
16 |
17 |
17 |
17 |
18 |
21 |
| 17 |
Content: Upper ontology |
17 |
11 |
20 |
18 |
21 |
15 |
| 18 |
QA: Testing |
18 |
16 |
24 |
14 |
16 |
19 |
| 19 |
Content: Identifier generation policy |
19 |
25 |
18 |
19 |
19 |
17 |
| 20 |
Motivation: Need |
20 |
20 |
22 |
15 |
22 |
18 |
| 21 |
Change: Entity deprecation strategy |
21 |
18 |
15 |
23 |
14 |
16 |
| 23 |
SRD: Communication |
22 |
22 |
19 |
22 |
23 |
24 |
| 22 |
Change: Versioning policy |
23 |
24 |
21 |
24 |
20 |
22 |
| 24 |
Content: Axiom patterns |
24 |
23 |
25 |
21 |
24 |
23 |
| 25 |
SRD: Development community |
25 |
19 |
23 |
25 |
25 |
25 |
| 26 |
Content: Entity naming convention |
26 |
26 |
26 |
26 |
26 |
26 |
| 27 |
Content: Ontology metrics |
27 |
28 |
28 |
27 |
27 |
27 |
| 28 |
KA: Content selection |
28 |
29 |
27 |
28 |
28 |
28 |
| 29 |
KA: Source knowledge location |
29 |
27 |
29 |
29 |
29 |
29 |
| 30 |
Content: Development environment |
30 |
30 |
30 |
30 |
30 |
30 |
# Threshold referenced only by the disabled categorisation kept below.
thresh <- 4
dfl_i_dev <- dfl_i
#dfl_i_dev$author_d<-ifelse((dfl_i_dev$All-dfl_i_dev$Author)>=thresh,"less",ifelse((dfl_i_dev$All-dfl_i_dev$Author)<=-thresh,"more",""))
#dfl_i_dev$developer_d<-ifelse((dfl_i_dev$All-dfl_i_dev$Developer)>=thresh,"less",ifelse((dfl_i_dev$All-dfl_i_dev$Developer)<=-thresh,"more",""))
#dfl_i_dev$reviewer_d<-ifelse((dfl_i_dev$All-dfl_i_dev$Reviewer)>=thresh,"less",ifelse((dfl_i_dev$All-dfl_i_dev$Reviewer)<=-thresh,"more",""))
#dfl_i_dev$user_d<-ifelse((dfl_i_dev$All-dfl_i_dev$User)>=thresh,"less",ifelse((dfl_i_dev$All-dfl_i_dev$User)<=-thresh,"more",""))
#dfl_i_dev$reader_d<-ifelse((dfl_i_dev$All-dfl_i_dev$Reader)>=thresh,"less",ifelse((dfl_i_dev$All-dfl_i_dev$Reader)<=-thresh,"more",""))
# Signed rank difference per role: overall rank minus the role's rank.
# A positive value means the role ranks the item higher (smaller rank number)
# than the overall ranking does.
dfl_i_dev[["author_d"]] <- dfl_i_dev[["All"]] - dfl_i_dev[["Author"]]
dfl_i_dev[["developer_d"]] <- dfl_i_dev[["All"]] - dfl_i_dev[["Developer"]]
dfl_i_dev[["reviewer_d"]] <- dfl_i_dev[["All"]] - dfl_i_dev[["Reviewer"]]
dfl_i_dev[["user_d"]] <- dfl_i_dev[["All"]] - dfl_i_dev[["User"]]
dfl_i_dev[["reader_d"]] <- dfl_i_dev[["All"]] - dfl_i_dev[["Reader"]]
kable(dfl_i_dev)
| 1 |
Basics: Ontology URL |
1 |
2 |
2 |
2 |
2 |
2 |
-1 |
-1 |
-1 |
-1 |
-1 |
| 2 |
Basics: Ontology name |
2 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
1 |
| 3 |
Basics: Ontology owner |
3 |
3 |
3 |
3 |
3 |
3 |
0 |
0 |
0 |
0 |
0 |
| 4 |
Basics: Ontology license |
4 |
4 |
4 |
4 |
4 |
4 |
0 |
0 |
0 |
0 |
0 |
| 5 |
QA: Examples of use |
5 |
5 |
8 |
5 |
5 |
5 |
0 |
-3 |
0 |
0 |
0 |
| 6 |
SRD: Scope and coverage |
6 |
8 |
6 |
6 |
8 |
6 |
-2 |
0 |
0 |
-2 |
0 |
| 7 |
Content: Ontology relationships |
7 |
7 |
7 |
7 |
7 |
7 |
0 |
0 |
0 |
0 |
0 |
| 8 |
Content: KR language |
8 |
6 |
5 |
9 |
9 |
8 |
2 |
3 |
-1 |
-1 |
0 |
| 9 |
Content: Incorporation of other ontologies |
9 |
10 |
9 |
8 |
6 |
9 |
-1 |
0 |
1 |
3 |
0 |
| 10 |
Basics: Ontology repository |
10 |
14 |
12 |
10 |
10 |
11 |
-4 |
-2 |
0 |
0 |
-1 |
| 11 |
QA: Evaluation |
11 |
9 |
14 |
11 |
11 |
12 |
2 |
-3 |
0 |
0 |
-1 |
| 12 |
Motivation: Competition |
12 |
12 |
11 |
13 |
13 |
14 |
0 |
1 |
-1 |
-1 |
-2 |
| 13 |
Motivation: Target audience |
13 |
15 |
13 |
12 |
15 |
13 |
-2 |
0 |
1 |
-2 |
0 |
| 14 |
KA: Knowledge acquisition methodology |
14 |
21 |
16 |
20 |
17 |
20 |
-7 |
-2 |
-6 |
-3 |
-6 |
| 15 |
Change: Sustainability plan |
15 |
13 |
10 |
16 |
12 |
10 |
2 |
5 |
-1 |
3 |
5 |
| 16 |
Content: Entity metadata policy |
16 |
17 |
17 |
17 |
18 |
21 |
-1 |
-1 |
-1 |
-2 |
-5 |
| 17 |
Content: Upper ontology |
17 |
11 |
20 |
18 |
21 |
15 |
6 |
-3 |
-1 |
-4 |
2 |
| 18 |
QA: Testing |
18 |
16 |
24 |
14 |
16 |
19 |
2 |
-6 |
4 |
2 |
-1 |
| 19 |
Content: Identifier generation policy |
19 |
25 |
18 |
19 |
19 |
17 |
-6 |
1 |
0 |
0 |
2 |
| 20 |
Motivation: Need |
20 |
20 |
22 |
15 |
22 |
18 |
0 |
-2 |
5 |
-2 |
2 |
| 21 |
Change: Entity deprecation strategy |
21 |
18 |
15 |
23 |
14 |
16 |
3 |
6 |
-2 |
7 |
5 |
| 23 |
SRD: Communication |
22 |
22 |
19 |
22 |
23 |
24 |
0 |
3 |
0 |
-1 |
-2 |
| 22 |
Change: Versioning policy |
23 |
24 |
21 |
24 |
20 |
22 |
-1 |
2 |
-1 |
3 |
1 |
| 24 |
Content: Axiom patterns |
24 |
23 |
25 |
21 |
24 |
23 |
1 |
-1 |
3 |
0 |
1 |
| 25 |
SRD: Development community |
25 |
19 |
23 |
25 |
25 |
25 |
6 |
2 |
0 |
0 |
0 |
| 26 |
Content: Entity naming convention |
26 |
26 |
26 |
26 |
26 |
26 |
0 |
0 |
0 |
0 |
0 |
| 27 |
Content: Ontology metrics |
27 |
28 |
28 |
27 |
27 |
27 |
-1 |
-1 |
0 |
0 |
0 |
| 28 |
KA: Content selection |
28 |
29 |
27 |
28 |
28 |
28 |
-1 |
1 |
0 |
0 |
0 |
| 29 |
KA: Source knowledge location |
29 |
27 |
29 |
29 |
29 |
29 |
2 |
0 |
0 |
0 |
0 |
| 30 |
Content: Development environment |
30 |
30 |
30 |
30 |
30 |
30 |
0 |
0 |
0 |
0 |
0 |
### Ranking of information items broken down by roles
# One row per (item, ranking column); the item factor keeps the order of
# dfl_i so the chart lists the items in overall-rank order.
df_long_agg_long <- melt(dfl_i, id.vars = "ontology_feature")
df_long_agg_long$ontology_feature <- factor(
  df_long_agg_long$ontology_feature,
  levels = dfl_i$ontology_feature
)
# Side-by-side (dodged) bars of the rank per ranking column, drawn horizontally.
ggplot(
  df_long_agg_long,
  aes(x = ontology_feature, y = value, group = variable, fill = variable)
) +
  geom_bar(stat = "identity", position = "dodge") +
  coord_flip()

ggsave(paste0(chartdir, "overall_ranking_by_role.pdf"), width = 4, height = 2.5)
Systematic review of MIRO compliance
# Split the paper table into descriptive metadata columns and the remaining
# (coding) columns.
paper_metadata <- c("Name", "URL", "Domain", "Language", "License", "Repository")
df_paper_metadata <- df_paper_results[paper_metadata]
df_paper_coding <- df_paper_results[, !names(df_paper_results) %in% paper_metadata]
latex_paper_metadata <- df_paper_metadata[c("Name", "Domain", "URL")]
#print(xtable(latex_paper_metadata,digits=c(0,0,0,0)),include.rownames=FALSE)
# One row per (paper Code, coded item); the code is a factor for the bar charts.
df_coding_melt <- melt(df_paper_coding, id.vars = "Code")
df_coding_melt$value <- as.factor(df_coding_melt$value)
# First chart: only the three items listed here (displayed, not saved).
ggplot(
  df_coding_melt[df_coding_melt$variable %in% c("Ontology.name", "Ontology.owner", "OBO.Principles"), ],
  aes(x = value, fill = value)
) +
  geom_bar() +
  facet_wrap("variable") +
  theme_bw() +
  scale_fill_grey()

# Second chart: every other item, three facets per row (this one is saved).
ggplot(
  df_coding_melt[!(df_coding_melt$variable %in% c("Ontology.name", "Ontology.owner", "OBO.Principles")), ],
  aes(x = value, fill = value)
) +
  geom_bar() +
  facet_wrap("variable", ncol = 3) +
  theme_bw() +
  scale_fill_grey()

ggsave(paste0(chartdir, "results_coding.pdf"), width = 8, height = 10)
# Back to numbers (via character, so the factor labels are used, not the
# internal level indices) to average the codes per item.
df_coding_melt$value <- as.numeric(as.character(df_coding_melt$value))
agg_compliance <- aggregate(
  df_coding_melt$value,
  by = list(df_coding_melt$variable),
  FUN = mean
)
# Ontology.name and Ontology.owner are halved -- presumably because they are
# coded on a 0-2 scale; confirm against the coding scheme.
agg_compliance$compliance <- ifelse(
  agg_compliance$Group.1 %in% c("Ontology.name", "Ontology.owner"),
  agg_compliance$x / 2,
  agg_compliance$x
)
# Express compliance as a percentage with two decimals.
agg_compliance$compliance <- round(agg_compliance$compliance * 100, 2)
names(agg_compliance) <- c("MIRO item", "x", "Compliance")
write.csv(agg_compliance, file = "agg_compliance.csv")
# AT THIS POINT, A DOCUMENT WAS CREATED MANUALLY WITH THE VALUES FOR COMPLIANCE AND RATINGS, AS AUTOMATICALLY MATCHING THEM TURNED OUT TO BE IMPOSSIBLE. FOR THAT, agg_compliance AND priority_by_group (THE mean.all COLUMN ONLY) WERE MERGED TOGETHER.
df_cvr <- read.csv(
  file = ratingvcompliance_f,
  header = TRUE,
  sep = ",",
  stringsAsFactors = FALSE
)
Compliance vs Ratings
# Bucket the mean rating: below 3 "Optional", below 3.5 "Should", else "Must".
df_cvr$rating_cat <- ifelse(
  df_cvr$rating < 3,
  "Optional",
  ifelse(df_cvr$rating < 3.5, "Should", "Must")
)
# Bucket the compliance percentage: <20 "Very Low", <50 "Low", <80 "Medium",
# otherwise "High".
df_cvr$compliance_cat <- ifelse(
  df_cvr$compliance < 20,
  "Very Low",
  ifelse(
    df_cvr$compliance < 50,
    "Low",
    ifelse(df_cvr$compliance < 80, "Medium", "High")
  )
)
# Two-letter code: first letter of the rating band + first letter of the
# compliance band (e.g. "MH" = Must / High). Built before the factor
# conversion below, while both columns are still character.
df_cvr$cat <- paste(
  substring(df_cvr$rating_cat, 1, 1),
  substring(df_cvr$compliance_cat, 1, 1),
  sep = ""
)
# Ordered levels so the charts show the bands from low to high.
df_cvr$compliance_cat <- factor(
  df_cvr$compliance_cat,
  levels = c("Very Low", "Low", "Medium", "High")
)
df_cvr$rating_cat <- factor(
  df_cvr$rating_cat,
  levels = c("Optional", "Should", "Must")
)
# Table for the paper: rated items only, highest compliance (then rating) first.
d_paper <- df_cvr[!is.na(df_cvr$rating), ][c("miro_item_rating", "rating", "compliance", "cat")]
d_paper <- d_paper[order(-d_paper$compliance, -d_paper$rating), ]
#print(xtable(d_paper,digits=c(0,2,2,2,0)),include.rownames=FALSE)
# Share of rated items per two-letter category, in percent.
ct_cat <- plyr::count(d_paper$cat)
ct_cat$pc <- round((ct_cat$freq / nrow(d_paper)) * 100, 2)
# Display names for the first seven columns. Only those positions are
# replaced: assigning a 7-element vector to names() of a wider frame would
# silently give every remaining column (here the helper column `cat`) an NA
# name. NOTE(review): the renaming is positional -- confirm the CSV column
# order matches these labels.
names(df_cvr)[seq_len(7)] <- c(
  "miro_item_rating", "Rating", "miro_item_comp", "mean_compliance",
  "Compliance", "Rating category", "Compliance category"
)
# Scatter of mean rating against compliance, labelled by item name.
ggplot(
  df_cvr,
  aes(y = Rating, x = Compliance, label = miro_item_comp, colour = `Rating category`)
) +
  geom_point() +
  geom_text(aes(label = miro_item_comp), hjust = 0.1, vjust = -0.6, size = 3)
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_text).

ggsave(paste0(chartdir, "scatter_compvrate.pdf"), width = 12, height = 4)
## Warning: Removed 6 rows containing missing values (geom_point).
## Warning: Removed 6 rows containing missing values (geom_text).
# Stacked bars: number of rated items per compliance band, split by rating band.
ggplot(
  df_cvr[!is.na(df_cvr$Rating), ],
  aes(x = `Compliance category`, fill = `Rating category`)
) +
  geom_bar()

ggsave(paste0(chartdir, "bar_cat_compvrate.pdf"), width = 8, height = 2.5)
Comment Analysis