knitr::opts_knit$set(root.dir = here::here())
library(tidyverse)
library(here)
library(jsonlite)
library(assertthat)
library(irr)
pak::pkg_install("hsci-r/gghsci") # install the gghsci theme package from GitHub
library(gghsci)
library(ggbeeswarm)
library(gt)
renv::snapshot() # record package versions in the renv lockfile
# Sørensen–Dice similarity between two sets of strings:
# 2|X∩Y| / (2|X∩Y| + |X\Y| + |Y\X|)
sorensen <- function(x,y){
  assert_that(is.character(x))
  assert_that(is.character(y))
  index <-
    2*(length(intersect(x,y)))/(2*(length(intersect(x,y)))+
    length(setdiff(x,y))+
    length(setdiff(y,x)))
  return(index)
}
# The corresponding dissimilarity
sorensen_dist <- function(x,y) {
  return(1-sorensen(x,y))
}
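As a quick sanity check of these helpers, a made-up example (the vectors below are illustrative only):

# Illustrative only: overlap of two small sets of category labels
a <- c("alcohol policy", "taxation", "advertising")
b <- c("alcohol policy", "taxation")
sorensen(a, b)      # 2*2 / (2*2 + 1 + 0) = 0.8
sorensen_dist(a, b) # 1 - 0.8 = 0.2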
# An article counts as "clear" if at least this proportion of annotators agree on the majority label
clarity_bound <- 0.75
# Lead-time decile (ntile of 10) bounds for labelling annotation decisions as quick/slow
quick_bound_centile <- 3
slow_bound_centile <- 7
# Character-count decile bounds for labelling articles as short/long
short_bound_centile <- 3
long_bound_centile <- 7
raw_data <- list.files(here("data/final_annotations"),full.names=T) %>%
setNames(.,str_replace(basename(.),"annotointi_(.*)\\.csv$","\\1")) %>%
map_dfr(~read_csv(.,show_col_types=F),.id="name") %>%
replace_na(list(theme="not alcohol policy")) %>%
mutate(document_id=as.numeric(factor(text))) %>%
group_by(name) %>%
mutate(lead_time_centile=ntile(lead_time,10),lead_time_quartile=ntile(lead_time,4)) %>%
ungroup() %>%
mutate(speed_type=case_when(
lead_time_centile>=slow_bound_centile ~ "slow",
lead_time_centile<=quick_bound_centile ~ "quick",
T ~ "in-between"
)) %>%
mutate(whole_article=is.na(answer)) %>% # no answer span recorded = the annotation applies to the whole article
mutate(part_length=map_if(answer,~!is.na(.),~fromJSON(.),.else=~list())) %>% # parse the JSON span annotations where present
mutate(part_length=map(part_length,~.$end-.$start)) %>% # length of each annotated span
mutate(part_length=map_dbl(part_length,~reduce(.,~.x+.y,.init=0))) %>% # total annotated length per annotation row
group_by(text)%>%
mutate(annotations=n()) %>%
ungroup() %>%
extract(text,into=c("url"),regex = "^([^ ]*)") %>%
rename(annotation_document_id=document_id)
md <- bind_rows(
read_csv(here("data/df_joined_individual.csv")),
read_csv(here("data/df_joined_interannotator.csv"))
) %>%
mutate(content_length_centile=ntile(nr_characters,10),content_length_quartile=ntile(nr_characters,4)) %>%
mutate(length_type=case_when(
content_length_centile<=short_bound_centile ~ "short",
content_length_centile>=long_bound_centile ~ "long",
T ~ "in-between"
))
raw_data <- raw_data %>%
inner_join(md)
Joining, by = "url"
assert_that(nrow(md)==1600)
[1] TRUE
assert_that(nrow(raw_data)==5500)
[1] TRUE
dl <- bind_rows(
raw_data %>% filter(str_sub(theme,1,1)!='{') %>% mutate(multiple=F),
raw_data %>% filter(str_sub(theme,1,1)=='{') %>%
mutate(theme=map(theme,~fromJSON(.) %>% as.data.frame())) %>%
unnest(theme) %>%
rename(theme=choices)
)
d <- dl %>%
group_by(across(-theme)) %>%
arrange(theme) %>%
summarize(theme=str_c(theme,collapse="|"),.groups="drop") %>%
group_by(text,theme) %>%
mutate(theme_annotations=n()) %>%
group_by(text,speed_type) %>%
mutate(speed_type_annotations=n()) %>%
group_by(text,whole_article) %>%
mutate(whole_article_annotations=n()) %>%
group_by(text) %>%
arrange(desc(theme_annotations),theme) %>%
mutate(theme_annotation_proportion=theme_annotations/n()) %>%
mutate(
  majority_theme=str_flatten(na.omit(unique(if_else(theme_annotations==max(theme_annotations),theme,NA_character_))),collapse="|"),
  majority_theme_annotations=first(theme_annotations),
  majority_theme_proportion=first(theme_annotations)/n()
) %>%
arrange(desc(speed_type_annotations),speed_type) %>%
mutate(speed_type_annotation_proportion=speed_type_annotations/n()) %>%
mutate(
  majority_speed_type=str_flatten(na.omit(unique(if_else(speed_type_annotations==max(speed_type_annotations),speed_type,NA_character_))),collapse="|"),
  majority_speed_type_annotations=first(speed_type_annotations),
  majority_speed_type_proportion=first(speed_type_annotations)/n()
) %>%
arrange(desc(whole_article_annotations),whole_article) %>%
mutate(whole_article_annotation_proportion=whole_article_annotations/n()) %>%
mutate(
  majority_whole_article=str_flatten(na.omit(unique(if_else(whole_article_annotations==max(whole_article_annotations),whole_article,NA))),collapse="|"),
  majority_whole_article_annotations=first(whole_article_annotations),
  majority_whole_article_proportion=first(whole_article_annotations)/n()
) %>%
ungroup() %>%
mutate(
decision_type=if_else(theme==majority_theme,"assent","dissent"),
clarity=if_else(majority_theme_proportion>=clarity_bound,"clear","unclear")
)
assert_that(nrow(d)==5500)
[1] TRUE
bd <- d %>%
mutate(theme=if_else(theme=="not alcohol policy",theme,"alcohol policy"))
bd <- bd %>%
group_by(text,theme) %>%
mutate(n=n()) %>%
group_by(text) %>%
arrange(desc(n),theme) %>%
mutate(prop=n/n()) %>%
mutate(
  majority_theme=str_flatten(na.omit(unique(if_else(n==max(n),theme,NA_character_))),collapse="|"),
  majority_theme_n=first(n),
  majority_theme_proportion=first(n)/n()
) %>%
ungroup() %>%
mutate(
decision_type=if_else(theme==majority_theme,"assent","dissent"),
clarity=if_else(majority_theme_proportion>=clarity_bound,"clear","unclear")
)
bmd <- md %>%
inner_join(bd %>% distinct(
  url,annotations,
  majority_theme,majority_theme_annotations,majority_theme_proportion,clarity,
  majority_speed_type,majority_speed_type_annotations,majority_speed_type_proportion,
  majority_whole_article
))
Joining, by = "url"
assert_that(nrow(bmd)==1600)
[1] TRUE
md <- md %>%
inner_join(d %>% distinct(
  url,annotations,
  majority_theme,majority_theme_annotations,majority_theme_proportion,clarity,
  majority_speed_type,majority_speed_type_annotations,majority_speed_type_proportion,
  majority_whole_article
))
Joining, by = "url"
assert_that(nrow(md)==1600)
[1] TRUE
mvd <- list.files(here("data/mv_annotations/"),full.names=T) %>%
setNames(.,str_replace(basename(.),"annotator_(.*)\\.csv$","\\1")) %>%
map_dfr(~read_csv(.,show_col_types=F),.id="name") %>%
replace_na(list(sentiment="empty")) %>%
filter(sentiment!="empty") %>%
mutate(document_id=as.numeric(factor(text))) %>%
group_by(name) %>%
mutate(lead_time_centile=ntile(lead_time,10),lead_time_quartile=ntile(lead_time,4)) %>%
ungroup() %>%
mutate(speed_type=case_when(
lead_time_centile>=slow_bound_centile ~ "slow",
lead_time_centile<=quick_bound_centile ~ "quick",
T ~ "in-between"
)) %>%
group_by(text,sentiment) %>%
mutate(sentiment_annotations=n()) %>%
group_by(text,speed_type) %>%
mutate(speed_type_annotations=n()) %>%
group_by(text) %>%
arrange(desc(sentiment_annotations)) %>%
mutate(sentiment_annotation_proportion=sentiment_annotations/n()) %>%
mutate(
  majority_sentiment=first(sentiment),
  majority_sentiment_annotations=first(sentiment_annotations),
  majority_sentiment_proportion=first(sentiment_annotations)/n()
) %>%
arrange(desc(speed_type_annotations),speed_type) %>%
mutate(speed_type_annotation_proportion=speed_type_annotations/n()) %>%
mutate(
  majority_speed_type=str_flatten(na.omit(unique(if_else(speed_type_annotations==max(speed_type_annotations),speed_type,NA_character_))),collapse="|"),
  majority_speed_type_annotations=first(speed_type_annotations),
  majority_speed_type_proportion=first(speed_type_annotations)/n()
) %>%
ungroup() %>%
mutate(
decision_type=if_else(sentiment==majority_sentiment,"assent","dissent"),
clarity=if_else(majority_sentiment_proportion>=clarity_bound,"clear","unclear")
)
mvmd <- read_csv(here("data/MV_data_final_links.csv"),col_select=c("content","link")) %>%
mutate(content_length=str_length(content)) %>%
mutate(content_length_centile=ntile(content_length,10),content_length_quartile=ntile(content_length,4)) %>%
mutate(length_type=case_when(
content_length_centile<=short_bound_centile ~ "short",
content_length_centile>=long_bound_centile ~ "long",
T ~ "in-between"
)) %>% rename(text=link)
mvd <- mvd %>% inner_join(mvmd)
Joining, by = "text"
mvmd <- mvmd %>% inner_join(mvd %>% distinct(
  text,
  majority_sentiment,majority_sentiment_annotations,majority_sentiment_proportion,clarity,
  majority_speed_type,majority_speed_type_annotations,majority_speed_type_proportion
))
Joining, by = "text"
assert_that(nrow(mvmd)==98)
[1] TRUE
|                                    |   N | Proportion |
|------------------------------------|----:|-----------:|
| not alcohol policy                 | 936 | 58.50%     |
| alcohol policy                     | 660 | 41.25%     |
| alcohol policy\|not alcohol policy |   4 | 0.25%      |
A roughly 60/40 split between non-AP and AP articles gives a reasonably balanced base.
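For reference, a distribution table like the one above could be produced roughly as follows (a sketch; the exact formatting of the original table may differ):

# Sketch: distribution of (binarised) majority themes over the articles
bmd %>%
  count(majority_theme, sort = TRUE) %>%
  mutate(Proportion = scales::percent(n / sum(n), accuracy = 0.01)) %>%
  rename(N = n) %>%
  gt()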
|                        |   N | Proportion |
|------------------------|----:|-----------:|
| **alcohol policy**     |     |            |
| clear                  | 445 | 67.42%     |
| unclear                | 215 | 32.58%     |
| **not alcohol policy** |     |            |
| clear                  | 757 | 80.88%     |
| unclear                | 179 | 19.12%     |
Proportionally, there are more clear decisions for not alcohol policy than for alcohol policy, i.e. it is easier to say that something is not AP than to say that it is AP.
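A sketch of the kind of cross-tabulation behind the table above (the handling of the few tied articles and the exact formatting may differ):

# Sketch: clarity of the majority decision within each majority theme
bmd %>%
  count(majority_theme, clarity) %>%
  group_by(majority_theme) %>%
  mutate(Proportion = scales::percent(n / sum(n), accuracy = 0.01)) %>%
  ungroup() %>%
  rename(N = n) %>%
  gt(groupname_col = "majority_theme")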
There is a subset of articles where it is quick to decide that they are not AP.
When an article deals with alcohol only in part, it is more likely not to be an alcohol policy article. It is also harder to decide clearly whether such a partially alcohol-related article is AP than it is for an article dealing wholly with alcohol.
Unsurprisingly, longer articles take longer to decide on. Short non-AP articles are particularly easy to categorize clearly, but there is also potential for error here, as evidenced by the high number of quick but "unclear" decisions among the short articles.
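The length/speed/clarity breakdown referred to here can be examined along these lines (a sketch, not the original figure code):

# Sketch: clarity by article length class and majority decision speed
md %>%
  count(length_type, majority_speed_type, clarity) %>%
  ggplot(aes(x = majority_speed_type, y = n, fill = clarity)) +
  geom_col(position = "fill") +
  facet_wrap(~length_type) +
  theme_hsci() +
  scale_y_continuous(labels = scales::percent) +
  xlab("Majority speed type") +
  ylab("Proportion of articles")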
The annotators show very clearly different task-time profiles.
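The per-annotator time profiles referred to here can be visualised roughly as follows (a sketch; the original figure may differ):

# Sketch: distribution of annotation lead times per annotator
raw_data %>%
  ggplot(aes(x = name, y = lead_time)) +
  geom_quasirandom(size = 0.5) +
  scale_y_log10() +
  theme_hsci() +
  xlab("Annotator") +
  ylab("Lead time (log scale)")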
p <- scales::percent_format(accuracy=1)
bd %>%
full_join(bd,by=c("text")) %>%
mutate(agree=theme.x==theme.y) %>%
group_by(name.x,name.y) %>%
summarize(agree=sum(agree),pagree=sum(agree)/n(),.groups="drop") %>%
ggplot(aes(x=name.x,y=name.y,fill=pagree)) +
geom_raster() +
theme_hsci() +
scale_fill_viridis_c(labels=scales::percent_format(accuracy=1)) +
geom_text(aes(label=p(pagree))) +
labs(fill="Agreement") +
xlab("Annotator 1") +
ylab("Annotator 2")
Kaisa stands out as an annotator with lower agreement with most of the other annotators, with the interesting exceptions of Sonja and Susanna.
Almost all annotators tend to spend more time on "problematic" articles, here proxied by their clarity (the amount of overall agreement on them). This suggests the annotators are doing a good job: most errors should be caused not by carelessness but by genuine decision differences and ambiguities in the guidelines.
At the same time, there is also a clearly visible effect where the difference in annotation time explains agreement, so there is an attentiveness component to the disagreements as well.
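One way to look at this relationship (a sketch, not the original plot code):

# Sketch: do annotators spend more time on articles with unclear majority decisions?
d %>%
  ggplot(aes(x = clarity, y = lead_time)) +
  geom_boxplot(outlier.size = 0.5) +
  scale_y_log10() +
  facet_wrap(~name) +
  theme_hsci() +
  xlab("Clarity of the article (majority agreement)") +
  ylab("Lead time (log scale)")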
kripp.alpha(bd %>%
pivot_wider(id_cols=name,names_from=document_id,values_from=theme) %>%
select(-name) %>%
as.matrix,
method="nominal")
Warning in kripp.alpha(bd %>% pivot_wider(id_cols = name, names_from = document_id, :
NAs introduced by coercion
Krippendorff's alpha
Subjects = 1600
Raters = 10
alpha = 0.657
kripp.alpha(bd %>%
filter(name!="Kaisa") %>%
pivot_wider(id_cols=name,names_from=document_id,values_from=theme) %>%
select(-name) %>%
as.matrix,
method="nominal")
Warning in kripp.alpha(bd %>% filter(name != "Kaisa") %>% pivot_wider(id_cols = name, :
NAs introduced by coercion
Krippendorff's alpha
Subjects = 1600
Raters = 9
alpha = 0.688
|                                                                                                           |  N | Proportion |
|-----------------------------------------------------------------------------------------------------------|---:|-----------:|
| Sisällön kopiointi valtamediasta (copying of content from mainstream media)                                | 69 | 70.41%     |
| Oman narratiivin rakentaminen lähdeviitteiden avulla (building one's own narrative with source references) | 24 | 24.49%     |
| Journalistisen median kritiikki (criticism of journalistic media)                                          |  5 | 5.10%      |
Around 70% of this data is copying of content from mainstream media. Twenty-four articles is probably enough to analyse the building of one's own narrative, but there are really too few "criticism of journalistic media" articles to draw conclusions from.
|                                                          |  N | Proportion |
|----------------------------------------------------------|---:|-----------:|
| **Journalistisen median kritiikki**                      |    |            |
| clear                                                    |  4 | 80.00%     |
| unclear                                                  |  1 | 20.00%     |
| **Oman narratiivin rakentaminen lähdeviitteiden avulla** |    |            |
| clear                                                    | 16 | 66.67%     |
| unclear                                                  |  8 | 33.33%     |
| **Sisällön kopiointi valtamediasta**                     |    |            |
| clear                                                    | 66 | 95.65%     |
| unclear                                                  |  3 | 4.35%      |
Identifying the copying of material from mainstream media seems quite clear-cut, in contrast to deciding whether an article is building its own narrative.
The confusion matrix between annotation pairs tells a bit more:
Here, it seems hard to draw boundaries between building one's own narrative and both journalistic critique and copying.
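Such a pairwise confusion matrix can be computed roughly as follows (a sketch; the original code may differ):

# Sketch: confusion matrix over all annotator pairs for the MV categories
mvd %>%
  inner_join(mvd, by = "text") %>%
  filter(name.x != name.y) %>%
  count(sentiment.x, sentiment.y) %>%
  ggplot(aes(x = sentiment.x, y = sentiment.y, fill = n)) +
  geom_raster() +
  geom_text(aes(label = n)) +
  scale_fill_viridis_c() +
  theme_hsci() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(fill = "Annotation pairs") +
  xlab("Annotator 1 category") +
  ylab("Annotator 2 category")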
Again, there are too few critique articles to draw conclusions from. Otherwise, it seems that the only clear category identified quickly is a subset of articles copying content from mainstream media. Even for clear cases of building one's own narrative, decisions take a long time to reach.
Here, even more categories lack the data to base conclusions on. What we can see is that copying can be identified relatively quickly in all length categories, and that own-narrative articles are often long and slow to parse.
Again, two different annotator profiles are evident.
No clear outlier annotators appear in this task:
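The same pairwise-agreement heatmap as computed above for the binary AP task can be applied to the MV annotations (a sketch mirroring that code):

# Sketch: pairwise agreement between annotators for the MV categories
mvd %>%
  full_join(mvd, by = "text") %>%
  mutate(agree = sentiment.x == sentiment.y) %>%
  group_by(name.x, name.y) %>%
  summarize(pagree = mean(agree), .groups = "drop") %>%
  ggplot(aes(x = name.x, y = name.y, fill = pagree)) +
  geom_raster() +
  theme_hsci() +
  scale_fill_viridis_c(labels = scales::percent_format(accuracy = 1)) +
  geom_text(aes(label = p(pagree))) +
  labs(fill = "Agreement") +
  xlab("Annotator 1") +
  ylab("Annotator 2")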
Here again, annotators tend to spend more time on "problematic" articles, proxied by their clarity (the amount of overall agreement on them). An interesting exception is the own-narrative category, where unclear decisions may have been made in haste.
Again, there is also a clearly visible effect where the difference in annotation time explains agreement.
The categorical data here is a bit too sparse to draw conclusions from, but it may further point to attentiveness differences explaining disagreement at the boundaries of the own-narrative category. Similarly, because the critique category requires picking up on individual sentence cues, the time taken may explain some of the differences for that category.