Measuring Expert Performance at Manually Classifying Domain Entities under Upper Ontology Classes: Analysis
Measuring Expert Performance at Manually Classifying Domain Entities under Upper Ontology Classes: Analysis
- Demographics
- Rating Analysis
- Fatigue analysis
- User answers by concept
- Answer analysis
- Inter-rater Agreement
- Correctness according to Experimenter Verdict
- Impact analysis of BFO classes across OBO ontologies
- BFO: Class hierarchy by expert agreement
- BFO: Class hierarchy by expert agreement: Alt 1
- BFO Concept Analysis: Overview Table
- Comments for concepts with low EEC
- Difficult subparts analysis
Demographics
Gender
| x | freq |
|---|---|
| Female | 4 |
| Male | 4 |
Age and Expertise
| Min. | X1st.Qu. | Median | Mean | X3rd.Qu. | Max. |
|---|---|---|---|---|---|
| 37 | 39.5 | 43 | 44.14286 | 48 | 54 |
| x | freq |
|---|---|
| Expertise = 4 | 3 |
| Expertise = 5 | 5 |
Expertise by Gender
| bfo11expertise | gender | freq |
|---|---|---|
| Expertise = 4 | Female | 3 |
| Expertise = 5 | Female | 1 |
| Expertise = 5 | Male | 4 |
Rating Analysis
# Prepare the rating data: keep the "question" rows whose id lies below the
# fatigue cut-off, normalise missing answers, and split off the confidence rows.
df_answers_raw <- dq[dq$variable == "question" & dq$id < fatiquecutoff, ]

# Blank, single-space and "don't know" answers are all recoded as "Not given".
df_answers_raw$answer <- ifelse(
  df_answers_raw$answer == "" |
    df_answers_raw$answer == " " |
    df_answers_raw$answer == "don't know",
  "Not given",
  df_answers_raw$answer
)
df_answers_raw$variable <- NULL

# After the recoding above, "Not given" is the only missing marker that can
# still occur, so it is the only value that has to be tested for here.
df_answers_raw$empty <- ifelse(
  df_answers_raw$answer == "Not given",
  "EMPTY",
  "OK"
)

# Full set of answers, and the same set without the rows of pid "0".
df_answers_all <- df_answers_raw
df_answers_no_0 <- df_answers_all[df_answers_all$pid != "0", ]

# Confidence rows are taken from dq unfiltered (no cut-off, no pid filter).
df_confidence <- dq[dq$variable == "confidence", ]
asis_output("### Questions per participant")Questions per participant
kable(plyr::count(df_answers_no_0$pid),row.names = FALSE,digits = c(2))| x | freq |
|---|---|
| 1 | 46 |
| 2 | 46 |
| 3 | 46 |
| 4 | 46 |
| 5 | 46 |
| 6 | 46 |
| 7 | 46 |
| 8 | 46 |
#questions per participant
asis_output("### Participants with number of missing answers")Participants with number of missing answers
# Frequency of each (pid, empty) combination, i.e. how many answers every
# participant gave in the "EMPTY" and in the "OK" category.
x <- plyr::count(df_answers_no_0[, c("pid", "empty")])
kable(plyr::count(x[x$empty=="EMPTY",]$freq),row.names = FALSE,digits = c(2))| x | freq |
|---|---|
| 1 | 2 |
| 2 | 1 |
| 3 | 1 |
| 6 | 1 |
| 7 | 1 |
asis_output("### Missing answers by concept")Missing answers by concept
# Frequency of each (concept, empty) combination, reduced to the concepts that
# have at least one "EMPTY" answer.
x <- plyr::count(df_answers_no_0[, c("concept", "empty")])
x <- x[x$freq >= 1 & x$empty == "EMPTY", ]
kable(x[order(-x$freq),],row.names = FALSE,digits = c(2))| concept | empty | freq |
|---|---|---|
| situation | EMPTY | 4 |
| why I had problems sleeping | EMPTY | 4 |
| air space tomorrow | EMPTY | 3 |
| time and place | EMPTY | 3 |
| area where the hotel will be built | EMPTY | 1 |
| distance | EMPTY | 1 |
| my hotel room | EMPTY | 1 |
| surface of pool table | EMPTY | 1 |
| tan line | EMPTY | 1 |
| warmth | EMPTY | 1 |
# Distribution of different ratings
asis_output("### Number of different answers (DA)")Number of different answers (DA)
# DA: number of distinct answers recorded for each concept ("Not given" is an
# answer value at this point and is counted like any other).
df_different_answers <- aggregate(
  df_answers_no_0$answer,
  by = list(concept = df_answers_no_0$concept),
  FUN = function(answers) length(unique(answers))
)
names(df_different_answers)[2] <- "DA"

# Concepts with the most distinct answers first.
df_different_answers <- df_different_answers[order(-df_different_answers$DA), ]

# How many concepts share each DA value.
agg_da <- plyr::count(df_different_answers$DA)
kable(agg_da,row.names = FALSE,digits = c(2))| x | freq |
|---|---|
| 1 | 10 |
| 2 | 12 |
| 3 | 10 |
| 4 | 12 |
| 5 | 2 |
asis_output("### Confidence (CON)")Confidence (CON)
# CON: mean confidence per concept. Answers are converted with as.numeric();
# NA values (including entries as.numeric() cannot parse) are dropped from the
# mean.
agg_con <- aggregate(
  as.numeric(df_confidence$answer),
  by = list(concept = df_confidence$concept),
  FUN = mean,
  na.rm = TRUE
)
names(agg_con)[2] <- "CON"
asis_output("### Fleiss' Kappa (IER)")Fleiss’ Kappa (IER)
# Wide answer matrices: one row per concept and one answer.<pid> column per
# pid. The first excludes pid "0"; the second keeps every pid.
df_answers_matrix_no_0 <- reshape(
  df_answers_no_0[, c("pid", "concept", "answer")],
  idvar = "concept",
  timevar = "pid",
  direction = "wide"
)
df_answers_matrix <- reshape(
  df_answers_all[, c("pid", "concept", "answer")],
  idvar = "concept",
  timevar = "pid",
  direction = "wide"
)
kappam.fleiss(df_answers_matrix_no_0[!(names(df_answers_matrix_no_0) %in% c("concept"))]) Fleiss' Kappa for m Raters
Subjects = 46
Raters = 8
Kappa = 0.519
z = 69
p-value = 0
## Analyse difference from expert
# Long format: one row per (concept, rater column), with the reference answer
# (answer.0) carried alongside as an id variable.
df_answers_matrix_melt_0 <- melt(
  df_answers_matrix,
  id.vars = c("concept", "answer.0")
)

# 1 when the rater's answer equals answer.0, 0 otherwise.
df_answers_matrix_melt_0$same <- with(
  df_answers_matrix_melt_0,
  ifelse(value == answer.0, 1, 0)
)

# Look up the similarity of each (answer.0, value) pair in df_bfo_sim; all
# answer rows are kept, and pairs without an entry get similarity 0.
df_answers_bfo <- merge(
  df_answers_matrix_melt_0,
  df_bfo_sim,
  by.x = c("answer.0", "value"),
  by.y = c("c1_l", "c2_l"),
  all.x = TRUE
)
df_answers_bfo$similarity <- replace(
  df_answers_bfo$similarity,
  is.na(df_answers_bfo$similarity),
  0
)
asis_output("### Experimenter-expert agreement based on ontological similarity (EES)")Experimenter-expert agreement based on ontological similarity (EES)
# EES: mean similarity to answer.0 per concept, ignoring NA values.
agg_ees <- aggregate(
  df_answers_bfo$similarity,
  by = list(concept = df_answers_bfo$concept),
  FUN = mean,
  na.rm = TRUE
)
names(agg_ees)[2] <- "EES"
asis_output("### Experimenter-expert Agreement (EEC)")Experimenter-expert Agreement (EEC)
# EEC: share of answers per concept that equal answer.0 (mean of the 0/1
# `same` flag), ignoring NA values.
agg_eec <- aggregate(
  df_answers_bfo$same,
  by = list(concept = df_answers_bfo$concept),
  FUN = mean,
  na.rm = TRUE
)
names(agg_eec)[2] <- "EEC"
asis_output("### Shannons Entropy (SE)")Shannons Entropy (SE)
# SE: apply entropy() to every row of the answer columns (all columns except
# "concept"). entropy() is defined outside this section.
df_answers_matrix_no_0$entropy <- apply(
  df_answers_matrix_no_0[!(names(df_answers_matrix_no_0) %in% "concept")],
  MARGIN = 1,
  FUN = entropy
)
asis_output("### Inter-expert agreement based on ontological similarity (IES)")Inter-expert agreement based on ontological similarity (IES)
# Majority answer per concept, computed twice: first with "Not given" counted
# like any other answer (to find the concepts where "Not given" has the highest
# count), then with "Not given" rows excluded (the majority that is reported).
# compute majority including missing answers
dmost<-plyr::count(df_answers_no_0[c("concept","answer")])
dmost<-dmost[order(dmost$concept,-dmost$freq),]
# which.max() returns the first maximum, so a tie is resolved in favour of
# whichever tied row comes first after the sort above.
dmost <- by(dmost, dmost$concept, function(X) X[which.max(X$freq),])
dmost<-do.call("rbind", dmost)
names(dmost)<-c("concept","majority","freq")
#names(which.max(table(myvector)))
# Concepts whose most frequent answer is "Not given"; they get an asterisk below.
conceptsnotgivenmajority<-dmost[dmost$majority=="Not given",c("concept")]
# compute majority excluding missing answers
# dct (counts per concept/answer without "Not given") is reused later on.
dct<-plyr::count(df_answers_no_0[df_answers_no_0$answer!="Not given",c("concept","answer")])
dmost<-dct
dmost<-dmost[order(dmost$concept,-dmost$freq),]
#dmost_max<-aggregate(unique(dmost[c("concept","freq")])$freq,by=list(unique(dmost[c("concept","freq")])$concept),FUN=max)
#names(dmost_max)<-c("concept","max")
#dmost<-merge(dmost,dmost_max)
#dmost<-dmost[dmost$freq==dmost$max,c("concept","answer","freq")]
#dmost<-reshape(dmost,direction = "wide",timevar = "answer",idvar = "concept")
#dmost_ct<-plyr::count(dmost$concept)
#head(dmost_ct[dmost_ct$freq>1,])
# Same first-maximum selection as above, so ties are again broken by row order.
dmost <- by(dmost, dmost$concept, function(X) X[which.max(X$freq),])
dmost<-do.call("rbind", dmost)
names(dmost)<-c("concept","majority","freq")
# Compute the majority similarity before appending an asterisk to the majority
# string, so the helper still receives the unmodified labels.
# NOTE(review): majoritysimilarity() is defined elsewhere; it presumably returns
# one similarity value per answer relative to the concept's majority - confirm.
d0simpair<-majoritysimilarity(df_answers_no_0,dmost)
# Flag majorities of concepts where "Not given" was the most frequent answer.
dmost$majority<-ifelse(dmost$concept %in% conceptsnotgivenmajority,paste(dmost$majority,"*",sep=""),dmost$majority)
#plyr::count(d0simpair$concept)
# IES: mean of the similarity values per concept.
agg_ies<-aggregate(d0simpair$similarity,by=list(d0simpair$concept),FUN=mean)
names(agg_ies)<-c("concept","IES")
asis_output("### Proportion Analysis (???)")Proportion Analysis (???)
# Proportion of each (concept, answer) count relative to n_participants, then
# the mean, standard deviation and maximum of those proportions per concept.
dct$pc <- dct$freq / n_participants
dct_agg <- dplyr::summarise(
  group_by(dct, concept),
  mean = mean(pc),
  sd = sd(pc),
  max = max(pc)
)

# A concept with a single distinct answer has only one proportion, and sd() of
# a single value is NA (not NaN). is.nan() therefore never matched and left NA
# in the table; is.na() covers both NA and NaN and sets the spread to 0.
dct_agg$sd <- ifelse(is.na(dct_agg$sd), 0, dct_agg$sd)
#kable(dct_agg)
#kable(d[d$different_responses>1,],row.names = FALSE)
#kable(dkappfleis,row.names = FALSE)
asis_output("### Creating final dataframe")Creating final dataframe
# Assemble the per-concept result table: join every intermediate table on
# "concept", left to right, in the order listed.
x <- Reduce(
  function(left, right) merge(left, right, by = "concept"),
  list(
    df_different_answers,
    as.data.frame(dct_agg),
    df_answers_matrix_no_0,
    df_answers_matrix[c("concept", "answer.0")],
    agg_eec,
    agg_ees,
    agg_ies,
    agg_con,
    dmost[c("concept", "majority")]
  )
)

# Store the entropy as a non-negative value.
x$entropy <- abs(x$entropy)
df_full<-x[order(-x$entropy),]Fatigue analysis
# Attach the EES, IES and CON metrics to df_fatigue by concept and order the
# rows by id.
df_fat <- merge(
  df_fatigue,
  df_full[, c("concept", "EES", "IES", "CON")],
  by = "concept"
)
df_fat <- df_fat[order(df_fat$id), ]
#df_fat$ies_dist_mean<-df_fat$IES-mean(df_fat$IES)
#View(df_fat)
asis_output("### Cut-off 26 (drop 20 out of 46)")Cut-off 26 (drop 20 out of 46)
# Rows with id below 26 in long format (one row per concept, id and metric),
# plus the mean of every metric.
x <- melt(df_fat[df_fat$id < 26, ], id.vars = c("concept", "id"))
y <- summarise(group_by(x, variable), mean = mean(value))
ggplot(x,aes(id,value)) +geom_point() + geom_smooth()+ geom_hline(data = y, aes(yintercept=mean)) + facet_wrap('variable',scales='free_y')asis_output("### Cut-off 31 (drop 15 out of 46)")Cut-off 31 (drop 15 out of 46)
# Rows with id below 31 in long format (one row per concept, id and metric),
# plus the mean of every metric.
x1 <- melt(df_fat[df_fat$id < 31, ], id.vars = c("concept", "id"))
y1 <- summarise(group_by(x1, variable), mean = mean(value))
ggplot(x1,aes(id,value)) +geom_point() + geom_smooth()+ geom_hline(data = y1, aes(yintercept=mean)) + facet_wrap('variable',scales='free_y')asis_output("### Cut-off 36 (drop 10 out of 46)")Cut-off 36 (drop 10 out of 46)
# Rows with id below 36 in long format (one row per concept, id and metric),
# plus the mean of every metric.
x2 <- melt(df_fat[df_fat$id < 36, ], id.vars = c("concept", "id"))
y2 <- summarise(group_by(x2, variable), mean = mean(value))
ggplot(x2,aes(id,value)) +geom_point() + geom_smooth()+ geom_hline(data = y2, aes(yintercept=mean)) + facet_wrap('variable',scales='free_y')asis_output("### Drop nothing")Drop nothing
# All rows of df_fat in long format (one row per concept, id and metric), plus
# the mean of every metric.
x3 <- melt(df_fat, id.vars = c("concept", "id"))
y3 <- summarise(group_by(x3, variable), mean = mean(value))
ggplot(x3,aes(id,value)) +geom_point() + geom_smooth()+ geom_hline(data = y3, aes(yintercept=mean)) + facet_wrap('variable',scales='free_y')mean(df_fat$EES)[1] 0.7034604
mean(df_fat[1:as.integer(nrow(df_fat)/2),]$EES)[1] 0.8143353
mean(df_fat[as.integer(nrow(df_fat)/2):nrow(df_fat),]$EES)[1] 0.609561
cor.test(df_fat$EES,df_fat$id)
Pearson's product-moment correlation
data: df_fat$EES and df_fat$id
t = -4.9871, df = 44, p-value = 1.006e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.7588514 -0.3762792
sample estimates:
cor
-0.6009352
cor.test(df_fat$IES,df_fat$id)
Pearson's product-moment correlation
data: df_fat$IES and df_fat$id
t = -4.872, df = 44, p-value = 1.47e-05
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.7528828 -0.3642641
sample estimates:
cor
-0.5919648
theme_bfo20(ggplot(df_full,aes(EES,CON)) + geom_point() + geom_smooth() + xlab("Experimenter-expert classification similarity (EES)")+ ylab("Self-rated confidence (CON)"))ggsave("cor_con_ees.jpg",width = 3,height = 2)
theme_bfo20(ggplot(df_full,aes(IES,CON)) + geom_point() + geom_smooth() + xlab("Mean inter-expert alignment similarity (IES)")+ ylab("Self-rated confidence (CON)"))ggsave("cor_con_ies.jpg",width = 3,height = 2)
theme_bfo20(ggplot(df_full,aes(max,EEC)) + geom_point() + geom_smooth() + xlab("Mean inter-expert agreement (MAC)")+ ylab("Experimenter-expert class. agreement (EEC)"))#ggsave("cor_mac_eec.jpg",width = 3,height = 2)
# Scatter plot of EES against IES. The axis labels name the two metrics that
# are actually plotted (they previously repeated the MAC/EEC labels of the
# preceding plot).
theme_bfo20(ggplot(df_full,aes(IES,EES)) + geom_point() + geom_smooth() + xlab("Mean inter-expert alignment similarity (IES)")+ ylab("Experimenter-expert classification similarity (EES)"))#ggsave("cor_mac_eec.jpg",width = 3,height = 2)User answers by concept
answer.0 is the answer determined through expert review.
| concept | answer.2 | answer.6 | answer.3 | answer.4 | answer.5 | answer.1 | answer.7 | answer.8 | answer.0 |
|---|---|---|---|---|---|---|---|---|---|
| moment | spatiotemporal instant | temporal interval | temporal instant | site | temporal instant | fiat process part | temporal instant | spatiotemporal instant | spatiotemporal instant |
| tan line | one dimensional region | fiat object part | object boundary | site | one dimensional region | object boundary | Not given | object boundary | one dimensional region |
| air space tomorrow | three dimensional region | three dimensional region | Not given | Not given | site | site | Not given | object | spatiotemporal interval |
| border | fiat object part | two dimensional region | object boundary | site | fiat object part | site | fiat object part | object boundary | object boundary |
| time and place | spatiotemporal instant | Not given | spatiotemporal interval | Not given | spatiotemporal interval | fiat process part | Not given | spatiotemporal interval | spatiotemporal instant |
| area where the hotel will be built | two dimensional region | site | two dimensional region | Not given | three dimensional region | site | site | site | three dimensional region |
| hotel beach | site | fiat object part | object | site | role | site | site | fiat object part | fiat object part |
| line of lattitude | one dimensional region | one dimensional region | object boundary | site | one dimensional region | site | one dimensional region | two dimensional region | one dimensional region |
| patches on the floor | two dimensional region | object aggregate | object aggregate | object aggregate | fiat object part | object aggregate | site | two dimensional region | scattered spatiotemporal region |
| situation | Not given | process aggregate | Not given | Not given | processual context | process | Not given | processual context | processual context |
| surface of pool table | Not given | object boundary | fiat object part | object | object boundary | object boundary | object boundary | fiat object part | two dimensional region |
| why I had problems sleeping | Not given | process | generically dependent continuant | Not given | processual context | Not given | Not given | processual context | processual context |
| uncovered parts of the beach | two dimensional region | fiat object part | fiat object part | site | fiat object part | site | site | two dimensional region | scattered spatiotemporal region |
| my hotel room | site | site | three dimensional region | Not given | site | site | site | object | site |
| vacation weekend | temporal interval | fiat process part | temporal interval | temporal interval | temporal interval | process | temporal interval | spatiotemporal interval | temporal interval |
| air space | three dimensional region | three dimensional region | three dimensional region | site | three dimensional region | site | site | object aggregate | three dimensional region |
| distance | quality | quality | quality | quality | one dimensional region | one dimensional region | Not given | one dimensional region | quality |
| kilometre zero | zero dimensional region | zero dimensional region | generically dependent continuant | site | zero dimensional region | site | zero dimensional region | site | zero dimensional region |
| take off time | temporal instant | temporal interval | temporal instant | temporal interval | temporal interval | fiat process part | temporal instant | temporal instant | temporal interval |
| place on wall where the postcard is put | two dimensional region | site | fiat object part | site | two dimensional region | site | site | site | two dimensional region |
| point | zero dimensional region | zero dimensional region | zero dimensional region | site | zero dimensional region | site | zero dimensional region | one dimensional region | zero dimensional region |
| timetable | scattered temporal region | generically dependent continuant | scattered temporal region | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | object | generically dependent continuant |
| bay | three dimensional region | site | site | site | site | site | site | object | site |
| vacation location | three dimensional region | site | site | site | spatiotemporal interval | site | site | site | spatial region |
| space | three dimensional region | site | three dimensional region | site | three dimensional region | site | site | site | spatial region |
| to wash | process | process | process | process | process | function | function | function | function |
| airplane flight | process | process | processual context | process | process | process | process | process | process |
| bus | object | object | object | object | object | function | object | object | object |
| clubbing | process | process | object aggregate | process | process | process | process | process | process aggregate |
| digital photograph | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | object | generically dependent continuant |
| dining | process | process | process | process | process aggregate | process | process | process | process aggregate |
| edge | fiat object part | object boundary | object boundary | object boundary | object boundary | object boundary | object boundary | object boundary | object boundary |
| flood plain | two dimensional region | site | site | site | site | site | site | site | spatiotemporal interval |
| tourist area | site | site | site | site | role | site | site | site | fiat object part |
| vacation brochure | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | generically dependent continuant | object aggregate | generically dependent continuant |
| warmth | quality | quality | quality | Not given | quality | quality | quality | quality | quality |
| bus driver | role | role | role | role | role | role | role | role | role |
| deckchairs on the beach | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate |
| drinking a beer | process | process | process | process | process | process | process | process | process |
| end of cooking | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary |
| end of flying | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary | process boundary |
| person | object | object | object | object | object | object | object | object | object |
| tendency to be mosquito bitten | disposition | disposition | disposition | disposition | disposition | disposition | disposition | disposition | disposition |
| tendency to defiate | disposition | disposition | disposition | disposition | disposition | disposition | disposition | disposition | disposition |
| tour party | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate | object aggregate |
| tourist | role | role | role | role | role | role | role | role | role |
Answer analysis
Metrics breakdown:
- SE: Shannon's entropy, a measure of noise (default table sorting)
- EEC: Proportion of ratings in line with expert verdict
- EES: Average concept similarity of the given answer to the expert verdict. Differentiates better than exp_aggr.
- DA: Number of different unique answers for concept
- MAC: Largest proportion of raters voting for the same term. For example, if 4/8 raters have voted concept A (0.5 ratio), and 2/8 concept B (0.25 ratio), that value would be 0.5.
- rat_mean: mean proportion size (perhaps not so nice, but, the larger, the better)
- rat_sd: standard deviation of proportion size.
- CON: Mean confidence of the rater for answering the question.
- IES: Pairwise similarity between answers given (based on ontological similarity)
Inter-rater Agreement
| concept | MAC | MACEEC | IES | SE | DA | CON |
|---|---|---|---|---|---|---|
| bus driver | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.38 |
| deckchairs on the beach | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.50 |
| drinking a beer | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.50 |
| end of cooking | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.50 |
| end of flying | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.62 |
| person | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.25 |
| tendency to be mosquito bitten | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.00 |
| tendency to defiate | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.50 |
| tour party | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.38 |
| tourist | 1.00 | 0.00 | 1.00 | 1.00 | 1 | 4.38 |
| airplane flight | 0.88 | 0.00 | 0.96 | 0.82 | 2 | 4.38 |
| dining | 0.88 | 0.75 | 0.96 | 0.82 | 2 | 4.38 |
| edge | 0.88 | 0.00 | 0.95 | 0.82 | 2 | 3.88 |
| flood plain | 0.88 | 0.88 | 0.93 | 0.82 | 2 | 3.14 |
| digital photograph | 0.88 | 0.00 | 0.92 | 0.82 | 2 | 4.50 |
| vacation brochure | 0.88 | 0.00 | 0.92 | 0.82 | 2 | 4.50 |
| tourist area | 0.88 | 0.88 | 0.92 | 0.82 | 2 | 3.88 |
| bus | 0.88 | 0.00 | 0.91 | 0.82 | 2 | 4.62 |
| clubbing | 0.88 | 0.88 | 0.90 | 0.82 | 2 | 4.25 |
| bay | 0.75 | 0.00 | 0.88 | 0.65 | 3 | 3.62 |
| warmth | 0.88 | 0.00 | 0.88 | 0.82 | 2 | 4.14 |
| vacation location | 0.75 | 0.75 | 0.83 | 0.65 | 3 | 3.88 |
| point | 0.62 | 0.00 | 0.82 | 0.57 | 3 | 4.38 |
| take off time | 0.50 | 0.12 | 0.81 | 0.53 | 3 | 4.50 |
| place on wall where the postcard is put | 0.62 | 0.38 | 0.80 | 0.57 | 3 | 3.71 |
| space | 0.62 | 0.62 | 0.79 | 0.68 | 2 | 4.12 |
| vacation weekend | 0.62 | 0.00 | 0.76 | 0.48 | 4 | 4.38 |
| hotel beach | 0.50 | 0.25 | 0.76 | 0.42 | 4 | 3.88 |
| patches on the floor | 0.50 | 0.50 | 0.75 | 0.42 | 4 | 3.38 |
| my hotel room | 0.62 | 0.00 | 0.75 | 0.48 | 4 | 4.25 |
| line of lattitude | 0.50 | 0.00 | 0.74 | 0.42 | 4 | 3.88 |
| timetable | 0.62 | 0.00 | 0.73 | 0.57 | 3 | 3.88 |
| surface of pool table | 0.50 | 0.50 | 0.71 | 0.42 | 4 | 3.75 |
| kilometre zero | 0.50 | 0.00 | 0.71 | 0.53 | 3 | 3.86 |
| border | 0.38 | 0.12 | 0.71 | 0.36 | 4 | 3.62 |
| air space | 0.50 | 0.00 | 0.71 | 0.53 | 3 | 4.12 |
| to wash | 0.62 | 0.25 | 0.70 | 0.68 | 2 | 4.38 |
| uncovered parts of the beach | 0.38 | 0.38 | 0.68 | 0.48 | 3 | 3.50 |
| area where the hotel will be built | 0.50 | 0.38 | 0.66 | 0.42 | 4 | 4.14 |
| distance | 0.50 | 0.00 | 0.64 | 0.53 | 3 | 4.25 |
| tan line | 0.38 | 0.12 | 0.64 | 0.28 | 5 | 3.62 |
| moment | 0.38 | 0.12 | 0.62 | 0.28 | 5 | 3.14 |
| time and place | 0.38 | 0.25 | 0.51 | 0.40 | 4 | 3.29 |
| air space tomorrow | 0.25 | 0.25 | 0.43 | 0.36 | 4 | 3.17 |
| situation | 0.25 | 0.00 | 0.42 | 0.42 | 4 | 3.00 |
| why I had problems sleeping | 0.25 | 0.00 | 0.36 | 0.42 | 4 | 3.00 |
Correctness according to Experimenter Verdict
| concept | EES | EEC | expert | majority |
|---|---|---|---|---|
| bus driver | 1.00 | 1.00 | role | role |
| deckchairs on the beach | 1.00 | 1.00 | object aggregate | object aggregate |
| drinking a beer | 1.00 | 1.00 | process | process |
| end of cooking | 1.00 | 1.00 | process boundary | process boundary |
| end of flying | 1.00 | 1.00 | process boundary | process boundary |
| person | 1.00 | 1.00 | object | object |
| tendency to be mosquito bitten | 1.00 | 1.00 | disposition | disposition |
| tendency to defiate | 1.00 | 1.00 | disposition | disposition |
| tour party | 1.00 | 1.00 | object aggregate | object aggregate |
| tourist | 1.00 | 1.00 | role | role |
| airplane flight | 0.96 | 0.88 | process | process |
| edge | 0.95 | 0.88 | object boundary | object boundary |
| digital photograph | 0.92 | 0.88 | generically dependent continuant | generically dependent continuant |
| vacation brochure | 0.92 | 0.88 | generically dependent continuant | generically dependent continuant |
| bus | 0.91 | 0.88 | object | object |
| bay | 0.88 | 0.75 | site | site |
| warmth | 0.88 | 0.88 | quality | quality |
| point | 0.82 | 0.62 | zero dimensional region | zero dimensional region |
| take off time | 0.78 | 0.38 | temporal interval | temporal instant |
| vacation weekend | 0.76 | 0.62 | temporal interval | temporal interval |
| my hotel room | 0.75 | 0.62 | site | site |
| line of lattitude | 0.74 | 0.50 | one dimensional region | one dimensional region |
| timetable | 0.73 | 0.62 | generically dependent continuant | generically dependent continuant |
| kilometre zero | 0.71 | 0.50 | zero dimensional region | zero dimensional region |
| dining | 0.71 | 0.12 | process aggregate | process |
| air space | 0.71 | 0.50 | three dimensional region | three dimensional region |
| border | 0.68 | 0.25 | object boundary | fiat object part |
| hotel beach | 0.66 | 0.25 | fiat object part | site |
| distance | 0.64 | 0.50 | quality | quality |
| space | 0.61 | 0.00 | spatial region | site |
| clubbing | 0.61 | 0.00 | process aggregate | process |
| place on wall where the postcard is put | 0.56 | 0.25 | two dimensional region | site |
| tourist area | 0.54 | 0.00 | fiat object part | site |
| tan line | 0.51 | 0.25 | one dimensional region | object boundary |
| vacation location | 0.51 | 0.00 | spatial region | site |
| area where the hotel will be built | 0.51 | 0.12 | three dimensional region | site |
| to wash | 0.50 | 0.38 | function | process |
| moment | 0.49 | 0.25 | spatiotemporal instant | temporal instant |
| time and place | 0.44 | 0.12 | spatiotemporal instant | spatiotemporal interval* |
| situation | 0.42 | 0.25 | processual context | processual context* |
| why I had problems sleeping | 0.36 | 0.25 | processual context | processual context* |
| surface of pool table | 0.35 | 0.00 | two dimensional region | object boundary |
| uncovered parts of the beach | 0.24 | 0.00 | scattered spatiotemporal region | fiat object part |
| patches on the floor | 0.23 | 0.00 | scattered spatiotemporal region | object aggregate |
| flood plain | 0.22 | 0.00 | spatiotemporal interval | site |
| air space tomorrow | 0.14 | 0.00 | spatiotemporal interval | site* |
Key Metrics Summary:
| measure | mean | min | median | max | sd |
|---|---|---|---|---|---|
| EEC | 0.5081522 | 0.0000000 | 0.5000000 | 1.000 | 0.3831541 |
| EES | 0.7034604 | 0.1361111 | 0.7243304 | 1.000 | 0.2528527 |
| IES | 0.8081683 | 0.3645833 | 0.8151042 | 1.000 | 0.1697295 |
| MAC | 0.6902174 | 0.2500000 | 0.6250000 | 1.000 | 0.2468399 |
| SE | 0.6679079 | 0.2814536 | 0.6462406 | 1.000 | 0.2375013 |
| DA | 2.6521739 | 1.0000000 | 3.0000000 | 5.000 | 1.2150426 |
| CON | 4.0075052 | 3.0000000 | 4.1339286 | 4.625 | 0.4707099 |
Correlations
x<-df_full_res
ggplot(x,aes(IES,EES)) + geom_point()z<-df_full_res[c("EEC","EES","MAC","IES","SE","DA","CON")]
# Pairwise correlation matrix of the metric columns in z (cor() with its
# default method), rounded to two decimals.
cormat <- round(cor(z), digits = 2)
cor.test(z$EEC,z$EES)
Pearson's product-moment correlation
data: z$EEC and z$EES
t = 14.906, df = 44, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.8483096 0.9515455
sample estimates:
cor
0.9136177
cor.test(z$EEC,z$MAC)
Pearson's product-moment correlation
data: z$EEC and z$MAC
t = 6.2275, df = 44, p-value = 1.562e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4918959 0.8131786
sample estimates:
cor
0.684456
cor.test(z$EEC,z$CON)
Pearson's product-moment correlation
data: z$EEC and z$CON
t = 5.5508, df = 44, p-value = 1.536e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4319321 0.7856841
sample estimates:
cor
0.6417604
cor.test(z$EEC,z$IES)
Pearson's product-moment correlation
data: z$EEC and z$IES
t = 5.3463, df = 44, p-value = 3.049e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4123534 0.7763937
sample estimates:
cor
0.6275325
cor.test(z$EEC,z$DA)
Pearson's product-moment correlation
data: z$EEC and z$DA
t = -5.6753, df = 44, p-value = 1.01e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.7911050 -0.4435122
sample estimates:
cor
-0.6501084
cor.test(z$EEC,z$SE)
Pearson's product-moment correlation
data: z$EEC and z$SE
t = 6.3651, df = 44, p-value = 9.797e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5032210 0.8182152
sample estimates:
cor
0.6923742
cor.test(z$EES,z$MAC)
Pearson's product-moment correlation
data: z$EES and z$MAC
t = 7.5732, df = 44, p-value = 1.667e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5910092 0.8556833
sample estimates:
cor
0.7522454
cor.test(z$EES,z$CON)
Pearson's product-moment correlation
data: z$EES and z$CON
t = 8.252, df = 44, p-value = 1.759e-10
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.6321365 0.8723312
sample estimates:
cor
0.7794071
cor.test(z$EES,z$IES)
Pearson's product-moment correlation
data: z$EES and z$IES
t = 7.5314, df = 44, p-value = 1.917e-09
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5883009 0.8545674
sample estimates:
cor
0.7504373
cor.test(z$EES,z$DA)
Pearson's product-moment correlation
data: z$EES and z$DA
t = -6.1691, df = 44, p-value = 1.904e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.8109882 -0.4870045
sample estimates:
cor
-0.6810221
cor.test(z$EES,z$SE)
Pearson's product-moment correlation
data: z$EES and z$SE
t = 6.9238, df = 44, p-value = 1.478e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5463195 0.8369502
sample estimates:
cor
0.7220963
cor.test(z$MAC,z$CON)
Pearson's product-moment correlation
data: z$MAC and z$CON
t = 6.3962, df = 44, p-value = 8.819e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5057347 0.8193266
sample estimates:
cor
0.6941255
cor.test(z$MAC,z$IES)
Pearson's product-moment correlation
data: z$MAC and z$IES
t = 20.305, df = 44, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9119025 0.9725029
sample estimates:
cor
0.9505639
cor.test(z$MAC,z$DA)
Pearson's product-moment correlation
data: z$MAC and z$DA
t = -14.062, df = 44, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.9462756 -0.8327739
sample estimates:
cor
-0.9044249
cor.test(z$MAC,z$SE)
Pearson's product-moment correlation
data: z$MAC and z$SE
t = 22.292, df = 44, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.9257525 0.9769407
sample estimates:
cor
0.9584679
cor.test(z$CON,z$IES)
Pearson's product-moment correlation
data: z$CON and z$IES
t = 6.7683, df = 44, p-value = 2.501e-08
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.5347717 0.8319963
sample estimates:
cor
0.7141957
cor.test(z$CON,z$DA)
Pearson's product-moment correlation
data: z$CON and z$DA
t = -5.7065, df = 44, p-value = 9.096e-07
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.7924353 -0.4463720
sample estimates:
cor
-0.6521623
cor.test(z$CON,z$SE)
Pearson's product-moment correlation
data: z$CON and z$SE
t = 5.6595, df = 44, p-value = 1.066e-06
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.4420572 0.7904268
sample estimates:
cor
0.6490622
cor.test(z$IES,z$DA)
Pearson's product-moment correlation
data: z$IES and z$DA
t = -9.9037, df = 44, p-value = 8.998e-13
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.9032850 -0.7123804
sample estimates:
cor
-0.8308554
cor.test(z$IES,z$SE)
Pearson's product-moment correlation
data: z$IES and z$SE
t = 11.905, df = 44, p-value = 2.37e-15
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
0.7814032 0.9284099
sample estimates:
cor
0.8735462
cor.test(z$DA,z$SE)
Pearson's product-moment correlation
data: z$DA and z$SE
t = -32.89, df = 44, p-value < 2.2e-16
alternative hypothesis: true correlation is not equal to 0
95 percent confidence interval:
-0.9890955 -0.9644035
sample estimates:
cor
-0.9802628
kable(cormat,row.names = TRUE)| EEC | EES | MAC | IES | SE | DA | CON | |
|---|---|---|---|---|---|---|---|
| EEC | 1.00 | 0.91 | 0.68 | 0.63 | 0.69 | -0.65 | 0.64 |
| EES | 0.91 | 1.00 | 0.75 | 0.75 | 0.72 | -0.68 | 0.78 |
| MAC | 0.68 | 0.75 | 1.00 | 0.95 | 0.96 | -0.90 | 0.69 |
| IES | 0.63 | 0.75 | 0.95 | 1.00 | 0.87 | -0.83 | 0.71 |
| SE | 0.69 | 0.72 | 0.96 | 0.87 | 1.00 | -0.98 | 0.65 |
| DA | -0.65 | -0.68 | -0.90 | -0.83 | -0.98 | 1.00 | -0.65 |
| CON | 0.64 | 0.78 | 0.69 | 0.71 | 0.65 | -0.65 | 1.00 |
#print(xtable(cormat,digits=c(2)),include.rownames=TRUE)
Mean entropy by expert ratings
The table is sorted by mean entropy, which can be read as the degree of disagreement among the experts. Note that disposition, object aggregate, process boundary and role were always classified correctly by all reviewers, while scattered spatiotemporal region, spatiotemporal instant and one dimensional region were very far from being classified correctly.
| expert_opinion | mean_entropy |
|---|---|
| spatiotemporal instant | 1.98 |
| one dimensional region | 1.95 |
| processual context | 1.75 |
| scattered spatiotemporal region | 1.66 |
| three dimensional region | 1.58 |
| two dimensional region | 1.52 |
| temporal interval | 1.48 |
| zero dimensional region | 1.35 |
| site | 1.31 |
| object boundary | 1.22 |
| spatiotemporal interval | 1.22 |
| fiat object part | 1.15 |
| spatial region | 1.01 |
| quality | 0.97 |
| function | 0.95 |
| generically dependent continuant | 0.80 |
| process aggregate | 0.54 |
| object | 0.27 |
| process | 0.27 |
| disposition | 0.00 |
| object aggregate | 0.00 |
| process boundary | 0.00 |
| role | 0.00 |
Impact analysis of BFO classes across OBO ontologies
BFO Concept coverage
Coverage, my favourite metric, is the number of ontologies in the corpus that mention a particular entity. It is given here both as an absolute number and as a percentage of all ontologies in the corpus.
| BFO 1.1 Concept | Nr. ontologies | Coverage (%) |
|---|---|---|
| processual entity | 39 | 28.68 |
| quality | 38 | 27.94 |
| disposition | 36 | 26.47 |
| independent continuant | 35 | 25.74 |
| specifically dependent continuant | 33 | 24.26 |
| role | 32 | 23.53 |
| occurrent | 29 | 21.32 |
| continuant | 27 | 19.85 |
| entity | 27 | 19.85 |
| function | 27 | 19.85 |
| realizable entity | 26 | 19.12 |
| generically dependent continuant | 25 | 18.38 |
| site | 20 | 14.71 |
| spatial region | 19 | 13.97 |
| connected temporal region | 18 | 13.24 |
| object | 18 | 13.24 |
| object aggregate | 18 | 13.24 |
| scattered temporal region | 18 | 13.24 |
| temporal region | 18 | 13.24 |
| one dimensional region | 17 | 12.50 |
| three dimensional region | 17 | 12.50 |
| two dimensional region | 17 | 12.50 |
| zero dimensional region | 17 | 12.50 |
| process boundary | 16 | 11.76 |
| spatiotemporal region | 16 | 11.76 |
| fiat object part | 14 | 10.29 |
| object boundary | 14 | 10.29 |
| material entity | 2 | 1.47 |
| process | 2 | 1.47 |
| connected spatiotemporal region | 1 | 0.74 |
| dependent continuant | 1 | 0.74 |
| fiat process part | 1 | 0.74 |
| process aggregate | 1 | 0.74 |
| processual context | 1 | 0.74 |
| scattered spatiotemporal region | 1 | 0.74 |
| spatiotemporal instant | 1 | 0.74 |
| spatiotemporal interval | 1 | 0.74 |
| temporal instant | 1 | 0.74 |
| temporal interval | 1 | 0.74 |
Coverage graph
BFO Concept impact
Impact is the average proportion of axioms containing the entity across the corpus. For example, if ‘continuant’ was used in 10 axioms in ontology O, and O has 1000 axioms, the impact of the entity would be 1 %. The overall BFO impact is the mean impact across all ontologies in OBO (136 in the current snapshot).
| BFO 1.1 Concept | Impact (mean %) |
|---|---|
| processual entity | 0.463 |
| disposition | 0.338 |
| independent continuant | 0.260 |
| role | 0.258 |
| specifically dependent continuant | 0.245 |
| quality | 0.234 |
| occurrent | 0.222 |
| continuant | 0.191 |
| spatial region | 0.187 |
| realizable entity | 0.152 |
| temporal region | 0.140 |
| function | 0.137 |
| entity | 0.131 |
| generically dependent continuant | 0.128 |
| object boundary | 0.114 |
| connected temporal region | 0.091 |
| three dimensional region | 0.083 |
| site | 0.078 |
| object aggregate | 0.076 |
| zero dimensional region | 0.061 |
| spatiotemporal region | 0.059 |
| scattered temporal region | 0.054 |
| one dimensional region | 0.054 |
| two dimensional region | 0.054 |
| process boundary | 0.053 |
| object | 0.034 |
| fiat object part | 0.027 |
| material entity | 0.010 |
| process | 0.008 |
| dependent continuant | 0.006 |
| process aggregate | 0.005 |
| fiat process part | 0.005 |
| processual context | 0.005 |
| connected spatiotemporal region | 0.005 |
| spatiotemporal interval | 0.002 |
| scattered spatiotemporal region | 0.002 |
| temporal interval | 0.002 |
| temporal instant | 0.002 |
| spatiotemporal instant | 0.002 |
Impact graph
BFO Concept usage
Usage is similar to impact, but it is not normalised by ontology: it is simply the number of axioms containing the entity across the whole corpus divided by the number of all axioms in the corpus (i.e. the sum of all ontology sizes). It is the most biased of the three metrics, as a single ontology with an idiosyncratic modelling style can dominate the metric (imagine one ontology with 360 axioms using processual entity).
| BFO 1.1 Concept | Nr. axioms | Usage (%) |
|---|---|---|
| processual entity | 360 | 0.008 |
| role | 342 | 0.007 |
| continuant | 275 | 0.006 |
| independent continuant | 301 | 0.006 |
| occurrent | 284 | 0.006 |
| quality | 276 | 0.006 |
| function | 227 | 0.005 |
| disposition | 172 | 0.004 |
| generically dependent continuant | 183 | 0.004 |
| specifically dependent continuant | 204 | 0.004 |
| entity | 164 | 0.003 |
| realizable entity | 149 | 0.003 |
| spatial region | 160 | 0.003 |
| connected temporal region | 75 | 0.002 |
| object aggregate | 77 | 0.002 |
| object boundary | 72 | 0.002 |
| temporal region | 104 | 0.002 |
| object | 30 | 0.001 |
| one dimensional region | 42 | 0.001 |
| process boundary | 53 | 0.001 |
| scattered temporal region | 38 | 0.001 |
| site | 64 | 0.001 |
| spatiotemporal region | 71 | 0.001 |
| three dimensional region | 54 | 0.001 |
| two dimensional region | 44 | 0.001 |
| zero dimensional region | 50 | 0.001 |
| connected spatiotemporal region | 6 | 0.000 |
| dependent continuant | 7 | 0.000 |
| fiat object part | 21 | 0.000 |
| fiat process part | 6 | 0.000 |
| material entity | 11 | 0.000 |
| process | 8 | 0.000 |
| process aggregate | 6 | 0.000 |
| processual context | 6 | 0.000 |
| scattered spatiotemporal region | 3 | 0.000 |
| spatiotemporal instant | 3 | 0.000 |
| spatiotemporal interval | 3 | 0.000 |
| temporal instant | 3 | 0.000 |
| temporal interval | 3 | 0.000 |
Usage graph
Survey comments
Number of comments by question

| concept | freq | EES | CON |
|---|---|---|---|
| border | 5 | 0.6845238 | 3.625000 |
| bay | 4 | 0.8750000 | 3.625000 |
| hotel beach | 4 | 0.6625000 | 3.875000 |
| take off time | 4 | 0.7790179 | 4.500000 |
| tan line | 4 | 0.5111607 | 3.625000 |
| tourist area | 4 | 0.5375000 | 3.875000 |
| vacation location | 4 | 0.5062500 | 3.875000 |
| why I had problems sleeping | 4 | 0.3645833 | 3.000000 |
| air space tomorrow | 3 | 0.1361111 | 3.166667 |
| clubbing | 3 | 0.6111111 | 4.250000 |
| digital photograph | 3 | 0.9218750 | 4.500000 |
| edge | 3 | 0.9464286 | 3.875000 |
| person | 3 | 1.0000000 | 4.250000 |
| place on wall where the postcard is put | 3 | 0.5647321 | 3.714286 |
| situation | 3 | 0.4166667 | 3.000000 |
| space | 3 | 0.6125000 | 4.125000 |
| time and place | 3 | 0.4397321 | 3.285714 |
| timetable | 3 | 0.7343750 | 3.875000 |
| vacation brochure | 3 | 0.9218750 | 4.500000 |
| airplane flight | 2 | 0.9583333 | 4.375000 |
| area where the hotel will be built | 2 | 0.5059524 | 4.142857 |
| bus | 2 | 0.9125000 | 4.625000 |
| bus driver | 2 | 1.0000000 | 4.375000 |
| deckchairs on the beach | 2 | 1.0000000 | 4.500000 |
| dining | 2 | 0.7083333 | 4.375000 |
| distance | 2 | 0.6406250 | 4.250000 |
| drinking a beer | 2 | 1.0000000 | 4.500000 |
| kilometre zero | 2 | 0.7142857 | 3.857143 |
| line of lattitude | 2 | 0.7440476 | 3.875000 |
| my hotel room | 2 | 0.7500000 | 4.250000 |
| patches on the floor | 2 | 0.2326389 | 3.375000 |
| point | 2 | 0.8154762 | 4.375000 |
| surface of pool table | 2 | 0.3549107 | 3.750000 |
| tendency to be mosquito bitten | 2 | 1.0000000 | 4.000000 |
| to wash | 2 | 0.5000000 | 4.375000 |
| tourist | 2 | 1.0000000 | 4.375000 |
| vacation weekend | 2 | 0.7604167 | 4.375000 |
| warmth | 2 | 0.8750000 | 4.142857 |
| air space | 1 | 0.7075893 | 4.125000 |
| end of cooking | 1 | 1.0000000 | 4.500000 |
| end of flying | 1 | 1.0000000 | 4.625000 |
| flood plain | 1 | 0.2222222 | 3.142857 |
| moment | 1 | 0.4913194 | 3.142857 |
| tour party | 1 | 1.0000000 | 4.375000 |
| uncovered parts of the beach | 1 | 0.2395833 | 3.500000 |
Comment length by BFO concept
| Group.1 | x |
|---|---|
| quality | 1820 |
| object boundary | 1493 |
| processual context | 1398 |
| spatiotemporal instant | 1392 |
| generically dependent continuant | 1349 |
| spatial region | 1333 |
| temporal interval | 1006 |
| two dimensional region | 993 |
| one dimensional region | 855 |
| zero dimensional region | 665 |
| spatiotemporal interval | 585 |
| process | 572 |
| process aggregate | 569 |
| object | 535 |
| role | 522 |
| three dimensional region | 467 |
| disposition | 445 |
| scattered spatiotemporal region | 416 |
| fiat object part | 411 |
| site | 409 |
| process boundary | 213 |
| object aggregate | 186 |
| function | 149 |
Number of comments by participant
| pid | freq |
|---|---|
| 4 | 33 |
| 7 | 33 |
| 6 | 16 |
| 5 | 12 |
| 1 | 11 |
| 2 | 6 |
Number of comments per theme
| x | freq |
|---|---|
| A | 20 |
| B | 52 |
| C | 12 |
| D | 18 |
| E | 26 |
| F | 4 |
Number of comments per concept
| x | freq |
|---|---|
| border | 5 |
| bay | 4 |
| hotel beach | 4 |
| take off time | 4 |
| tan line | 4 |
| tourist area | 4 |
| vacation location | 4 |
| why I had problems sleeping | 4 |
| airspace tomorrow | 3 |
| clubbing | 3 |
| edge | 3 |
| person | 3 |
| place on wall where the postcard is put | 3 |
| situation | 3 |
| time and place | 3 |
| timetable | 3 |
| vacation brochure | 3 |
| airplane flight | 2 |
| area where the hotel will be built | 2 |
| bus | 2 |
| bus driver | 2 |
| deckchairs on the beach | 2 |
| digital photograph | 2 |
| dining | 2 |
| distance | 2 |
| drinking a beer | 2 |
| kilometre zero | 2 |
| line of lattitude | 2 |
| my hotel room | 2 |
| patches on the floor | 2 |
| point | 2 |
| space | 2 |
| surface of pool table | 2 |
| tendency to be mosquito bitten | 2 |
| to wash | 2 |
| tourist | 2 |
| vacation weekend | 2 |
| warmth | 2 |
| airspace | 1 |
| end of cooking | 1 |
| end of flying | 1 |
| flood plain | 1 |
| moment | 1 |
| tour party | 1 |
| uncovered parts of the beach | 1 |
BFO: Class hierarchy by expert agreement
BFO: Class hierarchy by expert agreement: Alt 1
The colour shading indicates mean similarity to expert (exp_sim), and the size of the node indicates coverage of the concept (as determined by a BioPortal survey).
# Select the concept-label and coverage columns of d_cov.
r1<-d_cov[c("BFO 1.1 Concept","Coverage (%)")]
# Count the rows of df_full_res per distinct value of its `expert` column
# (plyr::count returns that value plus a `freq` column).
r2<-plyr::count((df_full_res[c("expert")]))
# Left join (all.x = TRUE): every concept in r1 is kept; concepts with no
# matching `expert` value get NA in `freq`.
r3<-merge(r1,r2,by.x = "BFO 1.1 Concept",by.y="expert",all.x = TRUE)
# Concepts without any match are recorded as occurring zero times.
r3$freq<-ifelse(is.na(r3$freq),0,r3$freq)
nrow(r3[r3$freq==2,])
[1] 21
Correlation analysis
| | SE | EEC | EES | IES | DA | MAC | CON | COV |
|---|---|---|---|---|---|---|---|---|
| SE | 1.00 | 0.72 | 0.74 | 0.87 | -0.99 | 0.97 | 0.72 | 0.42 |
| EEC | 0.72 | 1.00 | 0.91 | 0.66 | -0.68 | 0.72 | 0.67 | 0.52 |
| EES | 0.74 | 0.91 | 1.00 | 0.79 | -0.68 | 0.80 | 0.80 | 0.49 |
| IES | 0.87 | 0.66 | 0.79 | 1.00 | -0.84 | 0.95 | 0.80 | 0.37 |
| DA | -0.99 | -0.68 | -0.68 | -0.84 | 1.00 | -0.93 | -0.73 | -0.45 |
| MAC | 0.97 | 0.72 | 0.80 | 0.95 | -0.93 | 1.00 | 0.78 | 0.44 |
| CON | 0.72 | 0.67 | 0.80 | 0.80 | -0.73 | 0.78 | 1.00 | 0.47 |
| COV | 0.42 | 0.52 | 0.49 | 0.37 | -0.45 | 0.44 | 0.47 | 1.00 |
BFO Concept Analysis: Overview Table
theme_bfo20(ggplot(do,aes(EEC,COV)) + geom_point() + geom_smooth() + xlab("Experimenter-expert class. agreement (EEC)")+ ylab("OBO Foundry Coverage (%)"))
ggsave("cor_bfocov_eec.jpg",width = 3,height = 2)
cor(do$EES,do$COV)
[1] 0.4874226
cormat2 <- round(cor(do[c("EEC","EES","MAC","IES","SE","DA","COV")]),2)
kable(cormat2,row.names = FALSE)
| EEC | EES | MAC | IES | SE | DA | COV |
|---|---|---|---|---|---|---|
| 1.00 | 0.91 | 0.72 | 0.66 | 0.72 | -0.68 | 0.52 |
| 0.91 | 1.00 | 0.80 | 0.79 | 0.74 | -0.68 | 0.49 |
| 0.72 | 0.80 | 1.00 | 0.95 | 0.97 | -0.93 | 0.44 |
| 0.66 | 0.79 | 0.95 | 1.00 | 0.87 | -0.84 | 0.37 |
| 0.72 | 0.74 | 0.97 | 0.87 | 1.00 | -0.99 | 0.42 |
| -0.68 | -0.68 | -0.93 | -0.84 | -0.99 | 1.00 | -0.45 |
| 0.52 | 0.49 | 0.44 | 0.37 | 0.42 | -0.45 | 1.00 |
kable(do,row.names = FALSE,digits = c(2))
| concept | EEC | EES | MAC | IES | SE | DA | CON | COV |
|---|---|---|---|---|---|---|---|---|
| quality | 0.69 | 0.76 | 0.69 | 0.76 | 0.68 | 2.50 | 4.20 | 27.94 |
| disposition | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 4.25 | 26.47 |
| role | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 4.38 | 23.53 |
| function | 0.38 | 0.50 | 0.62 | 0.70 | 0.68 | 2.00 | 4.38 | 19.85 |
| generically dependent continuant | 0.79 | 0.86 | 0.79 | 0.86 | 0.73 | 2.33 | 4.29 | 18.38 |
| site | 0.69 | 0.81 | 0.69 | 0.81 | 0.56 | 3.50 | 3.94 | 14.71 |
| spatial region | 0.00 | 0.56 | 0.69 | 0.81 | 0.66 | 2.50 | 4.00 | 13.97 |
| object | 0.94 | 0.96 | 0.94 | 0.96 | 0.91 | 1.50 | 4.44 | 13.24 |
| object aggregate | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 4.44 | 13.24 |
| one dimensional region | 0.38 | 0.63 | 0.44 | 0.69 | 0.35 | 4.50 | 3.75 | 12.50 |
| three dimensional region | 0.31 | 0.61 | 0.50 | 0.68 | 0.47 | 3.50 | 4.13 | 12.50 |
| two dimensional region | 0.12 | 0.46 | 0.56 | 0.76 | 0.49 | 3.50 | 3.73 | 12.50 |
| zero dimensional region | 0.56 | 0.76 | 0.56 | 0.76 | 0.55 | 3.00 | 4.12 | 12.50 |
| process boundary | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 1.00 | 4.56 | 11.76 |
| fiat object part | 0.12 | 0.60 | 0.69 | 0.84 | 0.62 | 3.00 | 3.88 | 10.29 |
| object boundary | 0.56 | 0.82 | 0.62 | 0.83 | 0.59 | 3.00 | 3.75 | 10.29 |
| process | 0.94 | 0.98 | 0.94 | 0.98 | 0.91 | 1.50 | 4.44 | 1.47 |
| process aggregate | 0.06 | 0.66 | 0.88 | 0.93 | 0.82 | 2.00 | 4.31 | 0.74 |
| processual context | 0.25 | 0.39 | 0.25 | 0.39 | 0.42 | 4.00 | 3.00 | 0.74 |
| scattered spatiotemporal region | 0.00 | 0.24 | 0.44 | 0.72 | 0.45 | 3.50 | 3.44 | 0.74 |
| spatiotemporal instant | 0.19 | 0.47 | 0.38 | 0.57 | 0.34 | 4.50 | 3.21 | 0.74 |
| spatiotemporal interval | 0.00 | 0.18 | 0.56 | 0.68 | 0.59 | 3.00 | 3.15 | 0.74 |
| temporal interval | 0.50 | 0.77 | 0.56 | 0.79 | 0.51 | 3.50 | 4.44 | 0.74 |
#print(xtable(do[c("concept","EEC","EES","MAC","IES","CON","COV")],digits=c(2)),include.rownames=FALSE)
Comments for concepts with low EEC
com<-merge(df_full_res[df_full_res$EEC==0,c("concept","EEC","expert")],df_commments[c("concept","answer.question","answer.comment")])
kable(com,row.names = FALSE,digits = c(2))
| concept | EEC | expert | answer.question | answer.comment |
|---|---|---|---|---|
| clubbing | 0 | process aggregate | process | I’m less confident in how people use the word clubbing , but I think there’s a good case to say that clubbing on a particular evening is a process with participants including the people and the clubs, and with bone fide boundaries such as leaving the home and returning to the home. |
| clubbing | 0 | process aggregate | process | Maybe a process aggregate? |
| clubbing | 0 | process aggregate | process | At least there is such a process. But what you are putting in quotes is somewhat distracting. |
| flood plain | 0 | spatiotemporal interval | site | The plain is a site. The part that’s flooded could also be a good site. To capture the example, I would try to work with an aggregate of those sites. |
| patches on the floor | 0 | scattered spatiotemporal region | object aggregate | Could be object aggregate under the interpretation that what is being referred to is the water parts of the patches. However another plausible interpretation is that the reference is to the combination of water and floor. So then an independent continuants that is a sum object and fiat parts |
| patches on the floor | 0 | scattered spatiotemporal region | site | In the example, there are bona fide boundaries. Otherwise, they can be fiat object parts. |
| space | 0 | spatial region | site | This is the most confusing question so far. A throne room is a site. It is located in a spatial region. The phrase used to be indicates that we’re talking about that spatial region across time, and so a spatiotemporal region. The use of the space makes me think of a currently existing room or building, which would be a new site that happens to be located in the spatial region (now) that corresponds to the spatial region in Hadrian’s time. But space alone would make me think of the spatial region itself. |
| space | 0 | spatial region | site | There could also be arguments in favour of using three-dimensional region here. I don’t see much use of making this distinction. |
| surface of pool table | 0 | two dimensional region | object boundary | in the sense of geometrical surface. If the material is meant, then it would be a fiat object part. |
| surface of pool table | 0 | two dimensional region | object | Language is vague. I see 3 possible interpretations. 1) The material that makes up what is called the surface, namely the felt, in some kinds of pool tables. 2) part of an object boundary. Actually I’m not sure whether all parts of an object boundary are object boundaries themselves. 3) a site. All 3 are independent continuants, so you could confidently assign that class. I don’t do that because I don’t find it particularly useful to know just that something is an independent continuant. |
| tourist area | 0 | fiat object part | role | A role that inheres in some object or site. |
| tourist area | 0 | fiat object part | site | Again, may be confounded with 3D region |
| tourist area | 0 | fiat object part | site | A particular area of a city is a site. |
| tourist area | 0 | fiat object part | site | should really be site with a role |
| uncovered parts of the beach | 0 | scattered spatiotemporal region | fiat object part | there might be arguments for site |
| vacation location | 0 | spatial region | site | Geographic locators tend to be ambiguous in language as to whether the site is being referred to or the building or other material occupiers of the site. It is a site rather than a spatial region because BFO’s notion of spatial region is that of a newtonian fixed space and on that view earth and places on earth move through different parts of space moment to moment. |
| vacation location | 0 | spatial region | site | There could also be arguments in favour of using three-dimensional region here. I don’t see much use of making this distinction. |
| vacation location | 0 | spatial region | site | Parts of the world that you would vacation to, such as Paris or a particular resort, are sites in BFO, but there’s plenty of room for confusion with BFO spatial regions. |
| vacation location | 0 | spatial region | three dimensional region | may be a site |
Difficult subparts analysis
# Per-concept scores: MAC, EEC and the derived score 1 - (MAC - EEC).
df_diff <- do[c("concept", "MAC", "EEC")]
df_diff$MACEECDIFF <- 1 - (df_diff$MAC - df_diff$EEC)

# Subclass / superclass label pairs of the class hierarchy.
df_ch <- df_obo_ch[c("sub_l", "super_l")]

# Append one self-pair per subclass (the class listed as its own superclass),
# so that a class's own score takes part in the averages computed below.
x <- df_ch
x$super_l <- x$sub_l
x <- unique(x[c("super_l", "sub_l")])
df_ch <- rbind(df_ch, x)

# Roll the scores up the hierarchy. Every pass joins the current scores onto
# the subclass side of the pairs and averages them per superclass, ignoring
# missing values. The first pass starts from the per-concept scores, relabelled
# to the column names that aggregate() produces (Group.1, V1, V2, V3) so that
# all four passes can share the same code.
df_diff_x <- df_diff
names(df_diff_x) <- c("Group.1", "V1", "V2", "V3")
for (level in seq_len(4L)) {
  df_diff_ch <- merge(
    df_ch, df_diff_x,
    by.x = "sub_l", by.y = "Group.1", all.x = TRUE
  )
  df_diff_x <- aggregate(
    cbind(df_diff_ch$V1, df_diff_ch$V2, df_diff_ch$V3),
    by = list(df_diff_ch$super_l),
    FUN = function(v) mean(v, na.rm = TRUE)
  )
}
#View(df_diff_x)

# Restore descriptive column names on the rolled-up result.
names(df_diff_x) <- c("concept", "MAC", "EEC", "MACEECDIFF")
asis_output("### EEC")
EEC
#pdf("graph_bfo_eec.pdf", width=10, height=7)
plot_hierarchy(plot_prepare_df(df_diff_x,vlabel = "concept",vvalue = "EEC"),df_obo_ch,circular = TRUE)
#dev.off()
asis_output("### MAC")
MAC
plot_hierarchy(plot_prepare_df(df_diff_x,vlabel = "concept",vvalue = "MAC"),df_obo_ch,circular = TRUE)
#pdf("graph_bfo_eecmac.pdf", width=10, height=7)
asis_output("### Difference of EEC and MAC")
Difference of EEC and MAC
plot_hierarchy(plot_prepare_df(df_diff_x,vlabel = "concept",vvalue = "MACEECDIFF"),df_obo_ch,circular = TRUE)
#dev.off()
Comment length by question
