General Setup
Analysis table creation. Don’t run if already created. Instead skip
to “Analysis table loads”
# Rebuild the persistent `articles_cdp` table: the ids of articles treated as
# domestic general/political/economic news, decided by per-outlet section rules.
dbExecute(con, "DROP TABLE IF EXISTS articles_cdp")
articles_cdp <- articles %>%
  filter(case_when(
    media == "HS" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
    media == "IL" ~ subsection %in% c("kotimaa", "politiikka", "talous", "uutiset"),
    media == "STT" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
    # YLE: the subject must mention domestic news, politics or economy; a
    # foreign-news subject is accepted only if "Kotimaan uutiset" is also present.
    media == "YLE" ~ section == "Yle Uutiset" &
      str_detect(subject, "Kotimaan uutiset|politiikka|talous") &
      (!str_detect(subject, "Ulkomaat") | str_detect(subject, "Kotimaan uutiset")),
    # Any other outlet is excluded. Spelled out as TRUE/FALSE because T and F
    # are ordinary variables that can be reassigned.
    TRUE ~ FALSE
  )) %>%
  distinct(a_id) %>%
  compute(name = "articles_cdp", temporary = FALSE, unique_indexes = c("a_id"))
# Rebuild the persistent `articles_opinionated` table: one row per article that
# is recognised as opinionated content, with the kind of opinion piece.
# Within each outlet the rules are tried top to bottom and the first match
# wins; articles matching no rule get NA and are filtered out. STT has no rules.
dbExecute(con, "DROP TABLE IF EXISTS articles_opinionated")
articles_opinionated <- articles %>%
  mutate(opinionated = case_when(
    media == "HS" ~ case_when(
      str_to_lower(section) == "pääkirjoitus" &
        str_detect(str_to_lower(story_logo), "ieras") ~ "external editorial",
      str_to_lower(section) == "pääkirjoitus" &
        is.na(story_logo) ~ "editorial",
      str_to_lower(section) == "pääkirjoitus" &
        str_to_lower(story_logo) == "pääkirjoitus" ~ "editorial",
      str_to_lower(section) == "mielipide" |
        str_to_lower(story_logo) == "mielipide" ~ "external opinion",
      str_detect(str_to_lower(title), "analyysi:") |
        str_detect(str_to_lower(story_logo), "analyysi") ~ "analysis",
      str_detect(str_to_lower(title), "näkökulma:") |
        str_detect(str_to_lower(story_logo), "näkökulma") ~ "perspective",
      str_detect(str_to_lower(title), "kolumni:") |
        str_detect(str_to_lower(story_logo), "kolumni") ~ "column",
      str_detect(str_to_lower(title), "blogi:") |
        str_detect(str_to_lower(story_logo), "blog") ~ "blog"
    ),
    media == "IL" ~ case_when(
      subsection == "paakirjoitus" ~ "editorial",
      str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
      str_detect(str_to_lower(title), "analyysi:") ~ "analysis",
      str_detect(str_to_lower(title), "kolumni:") ~ "column",
      str_detect(str_to_lower(title), "näkökulma:") ~ "perspective"
    ),
    media == "YLE" ~ case_when(
      str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
      str_detect(str_to_lower(title), "analyysi:") |
        str_detect(subject, "Analyysit \\(Yle Uutiset\\)") ~ "analysis",
      str_detect(str_to_lower(title), "kolumni:") |
        str_detect(str_to_lower(subject), "kolumn") ~ "column",
      str_detect(str_to_lower(title), "näkökulma:") |
        str_detect(str_to_lower(subject), "näkökulm") ~ "perspective",
      str_detect(str_to_lower(title), "blogi:") |
        str_detect(str_to_lower(subject), "blog") ~ "blog"
    )
  )) %>%
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  compute(
    name = "articles_opinionated",
    temporary = FALSE,
    unique_indexes = c("a_id")
  )
# Manually labelled person lemmas; rows labelled 'adjektiivi' or 'ei' are
# discarded.
labels <- read_tsv(here("data/person_labels.tsv")) %>%
  filter(!category %in% c('adjektiivi', 'ei'))

# Lemmas to track: the labelled lemmas (uploaded to the database as table
# "labels") plus every lemma in `words` matching the "yhden#ve" or "tasa#arv"
# patterns, tagged with the corresponding equality category.
lemmas_of_interest <- labels %>%
  rename("lemma" = "name") %>%
  copy_to(con, ., name = "labels", overwrite = TRUE) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "yhden#ve")) %>%
      distinct(lemma) %>%
      mutate(category = "yhdenvertaisuus")
  ) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "tasa#arv")) %>%
      distinct(lemma) %>%
      mutate(category = "tasa-arvo")
  ) %>%
  distinct()
# Rebuild `words_of_interest`: the word forms in `words` whose lemma is one of
# the lemmas of interest, with their category and genus.
dbExecute(con, "DROP TABLE IF EXISTS words_of_interest")
words_of_interest <- words %>%
  inner_join(lemmas_of_interest) %>%
  distinct(w_id, lemma, category, genus) %>%
  compute(
    name = "words_of_interest",
    temporary = FALSE,
    indexes = c("w_id", "category", "genus")
  )

# Rebuild `corpus_of_interest`: the corpus tokens that are words of interest.
dbExecute(con, "DROP TABLE IF EXISTS corpus_of_interest")
corpus_of_interest <- corpus %>%
  inner_join(words_of_interest) %>%
  compute(
    name = "corpus_of_interest",
    temporary = FALSE,
    indexes = list(
      c("a_id", "par_id", "s_id", "pos"),
      c("w_id"),
      c("genus"),
      c("category")
    )
  )
# Rebuild `article_types`: assigns each article a single text genre.
# The rules are tried in order, so priority is: non-final STT versions ->
# opinion pieces -> domestic news -> culture -> local -> sports -> foreign ->
# "Other".
dbExecute(con, "DROP TABLE IF EXISTS article_types")
article_types <- articles %>%
  left_join(articles_cdp %>% mutate(cdp = TRUE)) %>%
  left_join(articles_opinionated) %>%
  mutate(type = case_when(
    # Only the final version ("Loppuversio") of an STT article is classified.
    media == "STT" & version != "Loppuversio" ~ "Other",
    !is.na(opinionated) & !str_detect(opinionated, "^external ") ~ "Journalistic opinion",
    !is.na(opinionated) ~ "External opinion",
    # `cdp` is NA for articles absent from `articles_cdp`, so they fall through.
    cdp ~ "Domestic general/political/economic news",
    media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "IL" & section == "viihde" ~ "Culture/entertainment",
    media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "YLE" & section == "Yle Uutiset" &
      str_detect(subject, "kulttuuri|musiikki|viihde") &
      !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",
    media == "HS" & section == "Kaupunki" ~ "Local news",
    media == "YLE" & section == "Yle Uutiset" & coverage == "local" ~ "Local news",
    media == "STT" & section == "Urheilu" ~ "Sports",
    media == "HS" & section == "Urheilu" ~ "Sports",
    # NOTE(review): section is spelled "YLE Urheilu" here but "Yle Uutiset"
    # elsewhere -- confirm the casing against the data.
    media == "YLE" & section == "YLE Urheilu" ~ "Sports",
    media == "IL" & section == "urheilu" ~ "Sports",
    media == "STT" & section == "Ulkomaat" ~ "Foreign news",
    media == "HS" & section == "Ulkomaat" ~ "Foreign news",
    media == "YLE" & section == "Yle Uutiset" &
      str_detect(subject, "Ulkomaat") &
      !str_detect(subject, "Kotimaan uutiset") ~ "Foreign news",
    media == "IL" & subsection == "ulkomaat" ~ "Foreign news",
    TRUE ~ "Other"
  )) %>%
  distinct(a_id, type) %>%
  compute(
    temporary = FALSE,
    name = "article_types",
    unique_indexes = list(c("a_id"), c("a_id", "type"), c("type", "a_id"))
  )
# Rebuild `articles_by_type_by_year`: the number of articles per outlet, year
# and text genre. Used as the denominator for the share-of-articles charts.
dbExecute(con, "DROP TABLE IF EXISTS articles_by_type_by_year")
articles_by_type_by_year <- articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type, name = "total_articles") %>%
  compute(
    unique_indexes = list(c("media", "year_created", "type")),
    temporary = FALSE,
    name = "articles_by_type_by_year"
  )
# Rebuild `articles_to_ref_categories`: which equality term(s) each article
# contains. Step 1 gives one row per article with ref_category "both",
# "yhdenvertaisuus" or "tasa-arvo".
dbExecute(con, "DROP TABLE IF EXISTS articles_to_ref_categories")
articles_to_ref_categories <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus) %>%
  group_by(a_id) %>%
  summarize(
    yv = max(category == "yhdenvertaisuus"),
    ta = max(category == "tasa-arvo"),
    .groups = "drop"
  ) %>%
  mutate(ref_category = case_when(
    yv == 1 & ta == 1 ~ "both",
    yv == 1 ~ "yhdenvertaisuus",
    ta == 1 ~ "tasa-arvo"
  )) %>%
  select(a_id, ref_category) %>%
  compute()

# Step 2: an article containing both terms is additionally listed under each
# single term, so it ends up with three rows ("both", "yhdenvertaisuus",
# "tasa-arvo"). Joining this table on `a_id` therefore multiplies rows for such
# articles.
articles_to_ref_categories <- articles_to_ref_categories %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "yhdenvertaisuus")
  ) %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "tasa-arvo")
  ) %>%
  compute(
    temporary = FALSE,
    name = "articles_to_ref_categories",
    indexes = c("a_id"),
    unique_indexes = list(c("a_id", "ref_category"), c("ref_category", "a_id"))
  )
# Corpus tokens that are a "tasa-arvo" or "yhdenvertaisuus" word (lazy query).
yv_ta_corpus <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus)

# Rebuild `yv_ta_paragraphs`: the paragraphs containing at least one such token.
dbExecute(con, "DROP TABLE IF EXISTS yv_ta_paragraphs")
yv_ta_paragraphs <- yv_ta_corpus %>%
  distinct(a_id, par_id) %>%
  compute(
    temporary = FALSE,
    name = "yv_ta_paragraphs",
    unique_indexes = list(c("a_id", "par_id"))
  )
Analysis table loads
# Attach lazy references to the analysis tables that already exist in the
# database, so the table-creation section above can be skipped.
articles_cdp               <- tbl(con, "articles_cdp")
articles_opinionated       <- tbl(con, "articles_opinionated")
article_types              <- tbl(con, "article_types")
articles_by_type_by_year   <- tbl(con, "articles_by_type_by_year")
words_of_interest          <- tbl(con, "words_of_interest")
corpus_of_interest         <- tbl(con, "corpus_of_interest")
articles_to_ref_categories <- tbl(con, "articles_to_ref_categories")
yv_ta_paragraphs           <- tbl(con, "yv_ta_paragraphs")
Named query definitions
# Tokens lying inside a quote span: from (start_s_id, start_pos) to
# (end_s_id, end_pos), both ends inclusive. One row per token and quote.
quotation_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Tokens lying in any sentence touched by a quote (whole sentences, not just
# the quoted span). One row per token and quote.
quotation_sentence_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND s_id >= start_s_id AND s_id <= end_s_id"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Tokens of interest, and the paragraphs containing them, per equality term.
yv_corpus <- corpus_of_interest %>%
  filter(category == "yhdenvertaisuus")
yv_paragraphs <- yv_corpus %>%
  distinct(a_id, par_id)

ta_corpus <- corpus_of_interest %>%
  filter(category == "tasa-arvo")
ta_paragraphs <- ta_corpus %>%
  distinct(a_id, par_id)
# Organisation category and author head per quote.
# `q_id` is kept on purpose: `quote_orgs` is attached with natural joins, and
# without `q_id` those joins match on `a_id` only, which gives every token of an
# article the organisation of every quote in that article and multiplies rows.
# NOTE(review): assumes the file has one row per quote (q_id) -- confirm.
quote_orgs <- read_tsv(here("data/q_id_to_orgs.tsv"), show_col_types = FALSE) %>%
  select(q_id, a_id, author_head, org_cat)
New names:
• `` -> `...1`
Rows: 19362 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): org_cat, author_head
dbl (3): ...1, q_id, a_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Token-level analysis table (lazy): every token of interest with its article
# metadata, text genre, reference category, quote flags and quote organisation.
#   in_quote          - token lies inside a quote span
#   in_quote_sentence - token lies in a sentence touched by a quote
#   in_quote_head     - token is in a quote sentence but outside the quote span
#   type2             - genre refined into quoted vs journalistic text
d <- corpus_of_interest %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  left_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote = TRUE)) %>%
  # Join on the token position only. A natural join would also match on the
  # `q_id` added by the previous join, which is NULL for tokens outside a quote
  # span, so `in_quote_sentence` could never be set for exactly the tokens
  # `in_quote_head` is meant to find. `distinct()` keeps one row per token when
  # several quotes share a sentence.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # `in_quote` is NULL (not FALSE) for tokens outside quotes; without the
  # coalesce the conjunction would be NULL instead of TRUE.
  mutate(in_quote_head = in_quote_sentence & !coalesce(in_quote, FALSE)) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  ))
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
Joining, by = c("a_id", "par_id", "s_id", "pos", "q_id")
Joining, by = "a_id"
# Same token-level table as `d`, restricted to tokens in paragraphs that
# mention tasa-arvo/yhdenvertaisuus and to articles with a reference category.
d2 <- corpus_of_interest %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote = TRUE)) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # Join on the token position only. A natural join would also match on the
  # `q_id` added by the quotation_corpus join, which is NULL for tokens outside
  # a quote span, so tokens in a quote sentence but outside the quote itself
  # would never be flagged. `distinct()` keeps one row per token.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  # `in_quote` is NULL (not FALSE) for tokens outside quotes; without the
  # coalesce the conjunction would be NULL instead of TRUE.
  mutate(in_quote_head = in_quote_sentence & !coalesce(in_quote, FALSE)) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  ))
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos", "q_id")
# Subject categories singled out in the speaker/subject analyses.
key_cats <- c(
  "potilas",
  "maahanmuutto",
  "etnos",
  "seksuaalisuus",
  "työsuhde"
)
# Main text-type labels.
main_types <- c(
  "core",
  "opinionated",
  "external opinion"
)
Analysis 1: development of yhdenvertaisuus/tasa-arvo in different
text genres
Master chart
# Per outlet, term, genre and year: number of distinct articles (and days)
# containing the term, joined with the total number of articles of that type.
my_d <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(
    articles_by_type_by_year,
    by = c("media", "type", "year_created")
  ) %>%
  collect()

# Same counts, restricted to articles in which BOTH terms occur within the
# same article / text-type combination.
my_d2 <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(a_id, media, type, type2, year_created) %>%
  filter(
    any(category == "yhdenvertaisuus"),
    any(category == "tasa-arvo")
  ) %>%
  group_by(media, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(
    articles_by_type_by_year,
    by = c("media", "type", "year_created")
  ) %>%
  collect()
Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning
This warning is displayed once every 8 hours.
# Stack the single-term series ("Word") with the both-terms series ("Both").
# The "Both" rows are added once under each term so that they show up in both
# facet rows. Factor levels are then ordered for plotting.
my_d <- my_d %>%
  mutate(word = "Word") %>%
  union_all(
    my_d2 %>%
      mutate(category = "tasa-arvo") %>%
      mutate(word = "Both")
  ) %>%
  union_all(
    my_d2 %>%
      mutate(category = "yhdenvertaisuus") %>%
      mutate(word = "Both")
  ) %>%
  mutate(
    word = fct_relevel(word, "Word"),
    category = fct_relevel(category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
# Share of "Other"-type articles containing each term, per outlet and year.
my_d %>%
  filter(type == "Other") %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )
  ) +
  geom_step() +
  geom_vline(xintercept = 2009, color = "red") +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  facet_grid(category ~ type2, scales = "free") +
  labs(color = "Media", linetype = "Signal") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word")
# Same chart for all non-"Other" types, split over three pages of 2 x 4 facets.
map(1:3, function(page_no) {
  my_d %>%
    filter(type != "Other") %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
[[2]]
[[3]]



Conclusions:
- IL lags behind the other media in the change, but follows the same trajectory
- For foreign news, STT (and therefore IL) do not increase tasa-arvo
terminology usage
- For local news, terminology usage increases only for HS (Helsinki)
as opposed to YLE (regional news)
- For culture/entertainment, other sources differ from IL. This is
probably due to category heterogeneity: for IL, this category contains
entertainment news, for others, these are more culture reviews etc.
Final graph to include in article
# Article figure: share of articles containing each term, by text type and
# year, pooled over outlets. The denominator is the total number of articles
# of the underlying type in that year, summed across media.
d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  filter(
    type %in% c(
      "Domestic general/political/economic news",
      "Journalistic opinion",
      "External opinion"
    )
  ) %>%
  left_join(
    articles_by_type_by_year %>%
      group_by(type, year_created) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop"),
    by = c("type", "year_created")
  ) %>%
  group_by(category, type2, year_created) %>%
  summarize(
    total_articles = min(total_articles),
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(type2 = fct_relevel(type2, "External opinion", "Journalistic opinion")) %>%
  mutate(category = fct_relevel(category, "yhdenvertaisuus")) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = type2)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 5)) +
  facet_wrap(~category, scales = "free_y") +
  labs(color = "Text type") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.02, 0.98),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  )

Analysis 2: distribution of language by speaker and subject
category
# Yearly number of equality-term tokens inside quotes, by the organisation
# category of the quote (politics vs. justice).
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% c("yhdenvertaisuus", "tasa-arvo"),
    in_quote == TRUE
  ) %>%
  # `d` holds several rows per token (e.g. one per reference category of the
  # article), so reduce to one row per token occurrence before counting;
  # otherwise n() is inflated by the join fan-out.
  distinct(org_cat, year_created, category, a_id, par_id, s_id, pos) %>%
  group_by(org_cat, year_created, category) %>%
  summarize(n = n(), .groups = "drop") %>%
  # Materialise the (small) aggregate locally before plotting.
  collect() %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

# Yearly number of key subject-category tokens inside quotes, by the
# organisation category of the quote (politics vs. justice).
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% key_cats,
    in_quote == TRUE
  ) %>%
  # `d` holds several rows per token (e.g. one per reference category of the
  # article), so reduce to one row per token occurrence before counting;
  # otherwise n() is inflated by the join fan-out.
  distinct(org_cat, year_created, category, a_id, par_id, s_id, pos) %>%
  group_by(org_cat, year_created, category) %>%
  summarize(n = n(), .groups = "drop") %>%
  # Materialise the (small) aggregate locally before plotting.
  collect() %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

Analysis 3: subject associations
# Yearly counts of articles in which the lemma "suku#puoli" (gender) occurs in
# a paragraph that also mentions tasa-arvo/yhdenvertaisuus, by outlet,
# reference category and text type. Collected into a local tibble.
sp <- yv_ta_paragraphs %>%
  inner_join(
    words %>%
      filter(lemma == "suku#puoli") %>%
      inner_join(corpus)
  ) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  )) %>%
  group_by(media, ref_category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
Joining, by = "w_id"
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
# Tokens of interest (excluding the lemma "suomalainen") that share a paragraph
# with tasa-arvo/yhdenvertaisuus, with article metadata, reference category
# and the quoted/journalistic text type. Lazy query.
my_cd <- corpus_of_interest %>%
  filter(lemma != "suomalainen") %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  ))
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
# Yearly article/week/day counts from `my_cd`, broken down by grammatical
# `genus` (cd2) and by subject `category` (cd). Both are collected locally and
# get the same factor ordering for plotting.
cd2 <- my_cd %>%
  group_by(ref_category, genus, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

cd <- my_cd %>%
  group_by(ref_category, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
Gender
# Gender counts for all non-"Other" types, with the yearly total number of
# articles per type (summed over media) attached as the denominator.
my_d <- sp %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()
Joining, by = c("type", "year_created")
# Gender chart, grid layout, three pages. Single-term articles form the "Word"
# series; articles with both terms are shown as "Both" in each term's row.
map(1:3, function(page_no) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      fct ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



# Gender chart again, using a wrapped (rather than grid) facet layout.
map(1:3, function(page_no) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      fct ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



Conclusions:
- Decline in “sukupuolten tasa-arvo” in STT for domestic news is
interesting. What could cause this?
- Otherwise, all sources seem to be following similar patterns.
# Gender mentions in domestic news only, pooled over outlets: share of
# articles per reference category, split into quoted vs journalistic text.
sp %>%
  filter(type == "Domestic general/political/economic news") %>%
  group_by(ref_category, type, type2, year_created) %>%
  summarize(articles = sum(articles), .groups = "drop") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = ref_category,
      linetype = type2
    )
  ) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2))
Joining, by = c("type", "year_created")

Conclusions:
- Gender equality discussion also gets a boost after 2014
- When “yhdenvertaisuus” is used in relation to gender, “tasa-arvo” is
almost always also mentioned!
# Genus-level counts for all non-"Other" types, with the yearly total number
# of articles per type (summed over media) attached as the denominator.
my_d <- cd2 %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()
Joining, by = c("type", "year_created")
# Share of articles per genus and text type, grid layout, three pages.
map(1:3, function(page_no) {
  my_d %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = ref_category
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      genus ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "ref-category", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



# Share of articles per genus and text type, wrapped layout, three pages.
map(1:3, function(page_no) {
  my_d %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = ref_category
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      genus ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "ref-category", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



Conclusions:
- Same behavior seen for “sukupuolten tasa-arvo/yhdenvertaisuus” is
also evident when looking at explicitly gendered words
(mies/nainen)
Subject topic
# Share of articles per subject category and text type, grid layout, three
# pages. Denominator: yearly article totals per type, summed over media.
map(1:3, function(page_no) {
  cd %>%
    filter(type != "Other") %>%
    filter(
      category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
    ) %>%
    inner_join(
      articles_by_type_by_year %>%
        group_by(year_created, type) %>%
        summarize(total_articles = sum(total_articles), .groups = "drop") %>%
        collect()
    ) %>%
    ggplot(
      aes(x = year_created, y = articles / total_articles, color = category)
    ) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      ref_category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page_no
    )
})
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



# Share of articles per subject category and text type, wrapped layout, three
# pages. Denominator: yearly article totals per type, summed over media.
map(1:3, function(page_no) {
  cd %>%
    filter(type != "Other") %>%
    filter(
      category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
    ) %>%
    inner_join(
      articles_by_type_by_year %>%
        group_by(year_created, type) %>%
        summarize(total_articles = sum(total_articles), .groups = "drop") %>%
        collect()
    ) %>%
    ggplot(
      aes(x = year_created, y = articles / total_articles, color = category)
    ) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      ref_category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page_no
    )
})
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



Conclusions:
- There don’t seem to be major discernible patterns between different
subjects for equality
Supporting auxiliary analyses
Subject topic graphs using different measures
# One plot per reference category: number of distinct days per year on which
# each subject category appears, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(grp, key) {
    grp %>%
      ggplot(aes(x = year_created, y = days, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(key$ref_category)
  })
[[1]]
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]



# One plot per reference category: number of distinct weeks per year in which
# each subject category appears, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(grp, key) {
    grp %>%
      ggplot(aes(x = year_created, y = weeks, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(key$ref_category)
  })
[[1]]
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]



# One plot per reference category: share of articles of each type that mention
# each subject category, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  group_by(ref_category) %>%
  group_map(function(grp, key) {
    grp %>%
      ggplot(
        aes(x = year_created, y = articles / total_articles, color = category)
      ) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(key$ref_category)
  })
Joining, by = c("type", "year_created")
[[1]]
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]



Do the subject topics combined capture the phenomenon?
# Combined subject-topic series: yearly article/week/day counts for paragraphs
# that mention tasa-arvo/yhdenvertaisuus together with ANY of the five subject
# categories, by reference category and text genre.
cd3 <- yv_ta_paragraphs %>%
  inner_join(corpus_of_interest) %>%
  filter(lemma != "suomalainen") %>%
  # Restrict to the subject categories BEFORE collapsing them into one label.
  # Without this, every category is pooled -- including the tasa-arvo and
  # yhdenvertaisuus tokens that define the paragraphs -- so the combined series
  # would trivially cover every equality article.
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  # Collapse all subject categories into a single series. The label "potilas"
  # is only a placeholder that passes the category filters applied to `cd3`
  # further down; it stands for "all subject categories combined".
  mutate(category = "potilas") %>%
  group_by(ref_category, category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect()
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
# One plot per reference category: distinct days per year for the combined
# subject series, faceted by text genre.
cd3 %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  # Move the less central genres to the end of the facet order. The level
  # names must match the values of `type` exactly ("Foreign news", "Sports",
  # "Other"); the earlier lower-case names matched nothing and only produced an
  # "Unknown levels" warning.
  mutate(type = fct_relevel(type, "Foreign news", "Sports", "Other", after = Inf)) %>%
  group_by(ref_category) %>%
  group_map(~ .x %>%
    ggplot(aes(x = year_created, y = days, color = category)) +
    geom_line() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
    facet_wrap(~type, scales = "free") +
    ggtitle(.y[1]$ref_category))
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]
[[2]]
[[3]]



# Distinct publication weeks per year for the pooled subject terms: one plot
# per reference category, one panel per article type.
cd3 %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Move these article types to the end of the facet order. The level names
  # must match the labels stored in article_types ("Foreign news", "Sports",
  # "Other"); the former lower-case names no longer exist and only triggered
  # an "Unknown levels" warning without reordering anything.
  mutate(type = fct_relevel(type, "Foreign news", "Sports", "Other", after = Inf)) %>%
  group_by(ref_category) %>%
  group_map(
    ~ .x %>%
      ggplot(aes(x = year_created, y = weeks, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type, scales = "free") +
      ggtitle(.y[1]$ref_category)
  )
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]
[[2]]
[[3]]



# Share of all articles (per article type and year) containing the pooled
# subject terms: one plot per reference category, one panel per article type.
cd3 %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  inner_join(
    # Yearly article totals per type, summed over the media.
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  # Move these article types to the end of the facet order. The level names
  # must match the labels stored in article_types ("Foreign news", "Sports",
  # "Other"); the former lower-case names no longer exist and only triggered
  # an "Unknown levels" warning without reordering anything.
  mutate(type = fct_relevel(type, "Foreign news", "Sports", "Other", after = Inf)) %>%
  group_by(ref_category) %>%
  group_map(
    ~ .x %>%
      ggplot(aes(x = year_created, y = articles / total_articles, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type, scales = "free") +
      ggtitle(.y[1]$ref_category)
  )
Joining, by = c("type", "year_created")
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]
[[2]]
[[3]]



Conclusions:
- While we saw no discernible patterns between subjects, we do seem to
capture the “whole” of the equality discussion by targeting them ->
we can conclude that everyone benefits.
Background Analyses
# Distinct publication days per year of articles that have a reference
# category, split by media (colour), reference category (rows) and type (columns).
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, y = days, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
Joining, by = "a_id"
Joining, by = "a_id"

# Distinct publication weeks per year of articles that have a reference
# category, split by media (colour), reference category (rows) and type (columns).
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, y = weeks, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
Joining, by = "a_id"
Joining, by = "a_id"

# Yearly share of articles that have a reference category, relative to the
# article totals joined in from articles_by_type_by_year.
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("media", "type", "year_created")

Auxiliary background analyses
# Number of articles per media and month; the x position is the first day of
# each month.
articles %>%
  mutate(
    year_created = year(date_created),
    month_created = month(date_created)
  ) %>%
  count(media, year_created, month_created) %>%
  ggplot(
    aes(
      x = as.Date(str_c(year_created, "-", month_created, "-01")),
      y = n,
      color = media
    )
  ) +
  geom_step() +
  theme_hsci_discrete()

# Number of articles per year and article type, one panel per media.
articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type) %>%
  ggplot(aes(x = year_created, y = n, color = type)) +
  geom_step() +
  facet_wrap(~media, scales = "free") +
  theme_hsci_discrete()
Joining, by = "a_id"

# IL articles per year and type, split by whether the author field is "STT".
articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(from_stt = author == "STT") %>%
  filter(media == "IL") %>%
  count(media, year_created, type, from_stt) %>%
  collect() %>%
  group_by(media) %>%
  # Keep the 11 types with the largest article counts and lump the rest.
  mutate(type = fct_lump_n(type, 11, w = n)) %>%
  # Re-aggregate so that the lumped types are summed together.
  count(media, year_created, type, from_stt, wt = n) %>%
  ggplot(aes(x = year_created, y = n, color = from_stt == 1)) +
  geom_step() +
  facet_wrap(type ~ media, scales = "free") +
  theme_hsci_discrete()
Joining, by = "a_id"

---
title: "FLOPO YV/TA analysis"
output:
  html_notebook:
    code_folding: hide
    toc: yes
---

# General Setup

```{r setup,include=FALSE}
# Notebook setup: fix the knit root directory, attach the packages used below,
# open the database connection and create lazy handles to the base tables.
knitr::opts_knit$set(root.dir = here::here())
library(here)
library(tidyverse)
library(DBI)
library(glue)
library(lubridate)
library(hms)
library(ggbeeswarm)
library(ggforce)
library(pak)
# Install gghsci from GitHub only when it is missing, so that knitting does
# not need network access once the package is available.
if (!requireNamespace("gghsci", quietly = TRUE)) {
  pkg_install("hsci-r/gghsci")
}
library(gghsci)
library(RMariaDB)

# Connection settings are read from environment variables (FLOPO_DB_HOST,
# FLOPO_DB_NAME, FLOPO_DB_USER, FLOPO_DB_PASSWORD). The previous literals are
# kept as fallbacks so the notebook still runs unchanged.
# NOTE(review): the fallback password is committed in plain text; rotate it and
# drop the fallback once the environment variables are set everywhere.
con <- DBI::dbConnect(
  RMariaDB::MariaDB(),
  host = Sys.getenv("FLOPO_DB_HOST", unset = "128.214.253.211"),
  dbname = Sys.getenv("FLOPO_DB_NAME", unset = "flopo"),
  user = Sys.getenv("FLOPO_DB_USER", unset = "root"),
  password = Sys.getenv("FLOPO_DB_PASSWORD", unset = "dhh17"),
  bigint = "integer",
  load_data_local_infile = TRUE,
  autocommit = TRUE,
  reconnect = TRUE)
# Tables created through this session use the Aria storage engine.
dbExecute(con, "SET SESSION storage_engine=aria")

# Lazy handles to the base tables. The a_sim handle used to be created and
# discarded; it is now kept like the others.
a_sim <- tbl(con, "a_sim")
actor_mentions <- tbl(con, "actor_mentions")
actor_org <- tbl(con, "actor_org")
actor_roles <- tbl(con, "actor_roles")
actors <- tbl(con, "actors")
articles <- tbl(con, "articles")
corpus <- tbl(con, "corpus")
misc <- tbl(con, "misc")
q_qa <- tbl(con, "q_qa")
quote_authors <- tbl(con, "quote_authors")
quotes <- tbl(con, "quotes")
words <- tbl(con, "words")
quote_author_names_to_canonical_names <- tbl(con, "quote_author_names_to_canonical_names")
actor_names_to_canonical_names <- tbl(con, "actor_names_to_canonical_names")
actor_roles_to_canonical_roles <- tbl(con, "actor_roles_to_canonical_roles")
actor_orgs_to_canonical_orgs <- tbl(con, "actor_orgs_to_canonical_orgs")
a_can_names_to_can_orgs_roles_by_t_created <- tbl(con, "a_can_names_to_can_orgs_roles_by_t_created")
```

# Analysis table creation. Don't run if already created. Instead skip to "Analysis table loads"

```{r,eval=FALSE}
# Rebuild the articles_cdp table: ids of the articles whose section, subsection
# or subject matches the per-media rules below.
dbExecute(con, "DROP TABLE IF EXISTS articles_cdp")
articles_cdp <- articles %>%
  filter(
    case_when(
      media == "HS" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      media == "IL" ~ subsection %in% c("kotimaa", "politiikka", "talous", "uutiset"),
      media == "STT" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      media == "YLE" ~ section == "Yle Uutiset" &
        str_detect(subject, "Kotimaan uutiset|politiikka|talous") &
        (!str_detect(subject, "Ulkomaat") | str_detect(subject, "Kotimaan uutiset")),
      # Articles from any other media are never selected.
      TRUE ~ FALSE
    )
  ) %>%
  distinct(a_id) %>%
  compute(name = "articles_cdp", temporary = FALSE, unique_indexes = c("a_id"))
```

```{r,eval=FALSE}
# Rebuild the articles_opinionated table: one opinion-genre label per article,
# derived per media from section, story logo, title prefix or subject.
# Articles matching none of the rules are dropped.
dbExecute(con, "DROP TABLE IF EXISTS articles_opinionated")
articles_opinionated <- articles %>%
  mutate(
    opinionated = case_when(
      media == "HS" ~ case_when(
        str_to_lower(section) == "pääkirjoitus" & str_detect(str_to_lower(story_logo), "ieras") ~ "external editorial",
        str_to_lower(section) == "pääkirjoitus" & is.na(story_logo) ~ "editorial",
        str_to_lower(section) == "pääkirjoitus" & str_to_lower(story_logo) == "pääkirjoitus" ~ "editorial",
        str_to_lower(section) == "mielipide" | str_to_lower(story_logo) == "mielipide" ~ "external opinion",
        str_detect(str_to_lower(title), "analyysi:") | str_detect(str_to_lower(story_logo), "analyysi") ~ "analysis",
        str_detect(str_to_lower(title), "näkökulma:") | str_detect(str_to_lower(story_logo), "näkökulma") ~ "perspective",
        str_detect(str_to_lower(title), "kolumni:") | str_detect(str_to_lower(story_logo), "kolumni") ~ "column",
        str_detect(str_to_lower(title), "blogi:") | str_detect(str_to_lower(story_logo), "blog") ~ "blog"
      ),
      media == "IL" ~ case_when(
        subsection == "paakirjoitus" ~ "editorial",
        str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
        str_detect(str_to_lower(title), "analyysi:") ~ "analysis",
        str_detect(str_to_lower(title), "kolumni:") ~ "column",
        str_detect(str_to_lower(title), "näkökulma:") ~ "perspective"
      ),
      media == "YLE" ~ case_when(
        str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
        str_detect(str_to_lower(title), "analyysi:") | str_detect(subject, "Analyysit \\(Yle Uutiset\\)") ~ "analysis",
        str_detect(str_to_lower(title), "kolumni:") | str_detect(str_to_lower(subject), "kolumn") ~ "column",
        str_detect(str_to_lower(title), "näkökulma:") | str_detect(str_to_lower(subject), "näkökulm") ~ "perspective",
        str_detect(str_to_lower(title), "blogi:") | str_detect(str_to_lower(subject), "blog") ~ "blog"
      )
    )
  ) %>%
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  compute(name = "articles_opinionated", temporary = FALSE, unique_indexes = c("a_id"))
```

```{r,eval=FALSE}
# Hand-made person labels, without the rows categorised as 'adjektiivi' or 'ei'.
labels <- read_tsv(here("data/person_labels.tsv")) %>%
  filter(!category %in% c("adjektiivi", "ei"))

# Lemmas of interest: the labels (copied into the database table "labels")
# plus every lemma matching "yhden#ve" or "tasa#arv", each tagged with its
# reference category.
lemmas_of_interest <- labels %>%
  rename("lemma" = "name") %>%
  copy_to(con, ., name = "labels", overwrite = TRUE) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "yhden#ve")) %>%
      distinct(lemma) %>%
      mutate(category = "yhdenvertaisuus")
  ) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "tasa#arv")) %>%
      distinct(lemma) %>%
      mutate(category = "tasa-arvo")
  ) %>%
  distinct()
```

```{r,eval=FALSE}
# Rebuild words_of_interest: the word ids whose lemma is in lemmas_of_interest,
# together with lemma, category and genus.
dbExecute(con, "DROP TABLE IF EXISTS words_of_interest")
words_of_interest <- words %>%
  inner_join(lemmas_of_interest) %>%
  distinct(w_id, lemma, category, genus) %>%
  compute(
    name = "words_of_interest",
    temporary = FALSE,
    indexes = c("w_id", "category", "genus")
  )
```

```{r,eval=FALSE}
# Rebuild corpus_of_interest: the corpus rows whose word is a word of
# interest, indexed by token position, word id, genus and category.
dbExecute(con, "DROP TABLE IF EXISTS corpus_of_interest")
corpus_of_interest <- corpus %>%
  inner_join(words_of_interest) %>%
  compute(
    name = "corpus_of_interest",
    temporary = FALSE,
    indexes = list(
      c("a_id", "par_id", "s_id", "pos"),
      c("w_id"),
      c("genus"),
      c("category")
    )
  ) # this closing parenthesis of compute() was missing, so the chunk did not parse
```

```{r,eval=FALSE}
# Rebuild article_types: one type label per article. The case_when rules are
# evaluated top-down, so earlier rules take precedence over later ones.
dbExecute(con, "DROP TABLE IF EXISTS article_types")
article_types <- articles %>%
  left_join(articles_cdp %>% mutate(cdp = TRUE)) %>%
  left_join(articles_opinionated) %>%
  mutate(
    type = case_when(
      # STT articles whose version is not "Loppuversio" are set aside as "Other".
      media == "STT" & version != "Loppuversio" ~ "Other",
      # Opinion genres; labels starting with "external " count as external opinion.
      !is.na(opinionated) & !str_detect(opinionated, "^external ") ~ "Journalistic opinion",
      !is.na(opinionated) ~ "External opinion",
      cdp ~ "Domestic general/political/economic news",
      media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
      media == "IL" & section == "viihde" ~ "Culture/entertainment",
      media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
      media == "YLE" & section == "Yle Uutiset" & str_detect(subject, "kulttuuri|musiikki|viihde") & !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",
      media == "HS" & section == "Kaupunki" ~ "Local news",
      media == "YLE" & section == "Yle Uutiset" & coverage == "local" ~ "Local news",
      media == "STT" & section == "Urheilu" ~ "Sports",
      media == "HS" & section == "Urheilu" ~ "Sports",
      media == "YLE" & section == "YLE Urheilu" ~ "Sports",
      media == "IL" & section == "urheilu" ~ "Sports",
      media == "STT" & section == "Ulkomaat" ~ "Foreign news",
      media == "HS" & section == "Ulkomaat" ~ "Foreign news",
      media == "YLE" & section == "Yle Uutiset" & str_detect(subject, "Ulkomaat") & !str_detect(subject, "Kotimaan uutiset") ~ "Foreign news",
      media == "IL" & subsection == "ulkomaat" ~ "Foreign news",
      TRUE ~ "Other"
    )
  ) %>%
  distinct(a_id, type) %>%
  compute(
    temporary = FALSE,
    name = "article_types",
    unique_indexes = list(c("a_id"), c("a_id", "type"), c("type", "a_id"))
  )
```

```{r,eval=FALSE}
# Rebuild articles_by_type_by_year: the number of articles per media, year of
# creation and article type (column total_articles).
dbExecute(con, "DROP TABLE IF EXISTS articles_by_type_by_year")
articles_by_type_by_year <- articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type, name = "total_articles") %>%
  compute(
    unique_indexes = list(c("media", "year_created", "type")),
    temporary = FALSE,
    name = "articles_by_type_by_year"
  )
```

```{r,eval=FALSE}
# Rebuild articles_to_ref_categories. Step 1 labels each article as
# "yhdenvertaisuus", "tasa-arvo" or "both" depending on which reference terms
# occur in it.
dbExecute(con, "DROP TABLE IF EXISTS articles_to_ref_categories")
articles_to_ref_categories <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus) %>%
  group_by(a_id) %>%
  summarize(
    yv = max(category == "yhdenvertaisuus"),
    ta = max(category == "tasa-arvo"),
    .groups = "drop"
  ) %>%
  mutate(
    ref_category = case_when(
      yv == 1 & ta == 1 ~ "both",
      yv == 1 ~ "yhdenvertaisuus",
      ta == 1 ~ "tasa-arvo"
    )
  ) %>%
  select(a_id, ref_category) %>%
  compute()

# Step 2: articles labelled "both" are additionally listed under each of the
# two single categories before the table is persisted.
articles_to_ref_categories <- articles_to_ref_categories %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "yhdenvertaisuus")
  ) %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "tasa-arvo")
  ) %>%
  compute(
    temporary = FALSE,
    name = "articles_to_ref_categories",
    indexes = c("a_id"),
    unique_indexes = list(c("a_id", "ref_category"), c("ref_category", "a_id"))
  )
```

```{r,eval=FALSE}
# Corpus rows whose word belongs to one of the two reference categories.
yv_ta_corpus <- inner_join(
  filter(words_of_interest, category %in% c("tasa-arvo", "yhdenvertaisuus")),
  corpus
)

# Rebuild yv_ta_paragraphs: the distinct (article, paragraph) pairs that
# contain at least one of those words.
dbExecute(con, "DROP TABLE IF EXISTS yv_ta_paragraphs")
yv_ta_paragraphs <- yv_ta_corpus %>%
  distinct(a_id, par_id) %>%
  compute(
    temporary = FALSE,
    name = "yv_ta_paragraphs",
    unique_indexes = list(c("a_id", "par_id"))
  )
```

# Analysis table loads

```{r}
# Lazy handles to the analysis tables persisted by the creation chunks above.
articles_cdp <- tbl(con, "articles_cdp")
articles_opinionated <- tbl(con, "articles_opinionated")
article_types <- tbl(con, "article_types")
articles_by_type_by_year <- tbl(con, "articles_by_type_by_year")
words_of_interest <- tbl(con, "words_of_interest")
corpus_of_interest <- tbl(con, "corpus_of_interest")
articles_to_ref_categories <- tbl(con, "articles_to_ref_categories")
yv_ta_paragraphs <- tbl(con, "yv_ta_paragraphs")
```

# Named query definitions

```{r}
# Corpus tokens lying between the start and end position of a quote
# (both ends inclusive), tagged with the quote id.
quotation_corpus <- inner_join(
  quotes,
  corpus,
  sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Corpus tokens of every sentence from the quote's first to its last sentence,
# tagged with the quote id.
quotation_sentence_corpus <- inner_join(
  quotes,
  corpus,
  sql_on = "LHS.a_id=RHS.a_id AND s_id >= start_s_id AND s_id <= end_s_id"
) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Tokens of interest in the "yhdenvertaisuus" category and the paragraphs
# that contain them.
yv_corpus <- filter(corpus_of_interest, category == "yhdenvertaisuus")

yv_paragraphs <- distinct(yv_corpus, a_id, par_id)

# The same for the "tasa-arvo" category.
ta_corpus <- filter(corpus_of_interest, category == "tasa-arvo")

ta_paragraphs <- distinct(ta_corpus, a_id, par_id)

# Organisation categories read from a local TSV file.
# NOTE(review): only author_head, a_id and org_cat are kept; if the file also
# has a q_id column (as its name suggests), dropping it means later joins
# cannot match on the individual quote -- confirm.
quote_orgs <- select(
  read_tsv(here("data/q_id_to_orgs.tsv")),
  author_head, a_id, org_cat
)

# Token-level master table: every token of interest with its article metadata,
# article type, reference category, quote flags, organisation category and the
# derived text genre (type2).
d <- corpus_of_interest %>%
  inner_join(articles %>% select(a_id,date_created,media)) %>%
  inner_join(article_types) %>%
  left_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote=T)) %>%
  # Match quote sentences on the token position only. A natural join would also
  # match on q_id, which the previous join added and which is NULL for tokens
  # outside a quote; such tokens could then never be flagged, making
  # in_quote_sentence a copy of in_quote.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  # NOTE(review): joined on whatever columns are shared with quote_orgs
  # (author_head, a_id, org_cat); presumably this matches per article rather
  # than per quote -- confirm.
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # Tokens in a quote sentence but outside the quote itself. The flags are NA
  # (not FALSE) when unmatched, so they are coalesced before being combined.
  mutate(in_quote_head = coalesce(in_quote_sentence, FALSE) & !coalesce(in_quote, FALSE)) %>%
  mutate(year_created=year(date_created)) %>%
  mutate(week_created=week(date_created)) %>%
  # Opinion types keep their name; other types are split into quoted and
  # journalistic text.
  mutate(type2=case_when(
    str_detect(type,"opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ",type),
    TRUE ~ str_c("Journalistic text in ",type)
  ))
  
# Like d, but restricted to tokens in paragraphs listed in yv_ta_paragraphs
# and to articles that have a reference category.
d2 <- corpus_of_interest %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id,date_created,media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote=T)) %>%
  # NOTE(review): joined on whatever columns are shared with quote_orgs
  # (author_head, a_id, org_cat); presumably this matches per article rather
  # than per quote -- confirm.
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # Match quote sentences on the token position only. A natural join would also
  # match on q_id, which the quotation_corpus join added and which is NULL for
  # tokens outside a quote; such tokens could then never be flagged, making
  # in_quote_sentence a copy of in_quote.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  # Tokens in a quote sentence but outside the quote itself. The flags are NA
  # (not FALSE) when unmatched, so they are coalesced before being combined.
  mutate(in_quote_head = coalesce(in_quote_sentence, FALSE) & !coalesce(in_quote, FALSE)) %>%
  mutate(year_created=year(date_created)) %>%
  mutate(week_created=week(date_created)) %>%
  # Opinion types keep their name; other types are split into quoted and
  # journalistic text.
  mutate(type2=case_when(
    str_detect(type,"opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ",type),
    TRUE ~ str_c("Journalistic text in ",type)
  ))
```

```{r}
# Subject categories singled out in the analyses below.
key_cats <- c("potilas", "maahanmuutto", "etnos", "seksuaalisuus", "työsuhde")
# NOTE(review): these values do not match the type labels assigned in
# article_types -- confirm whether this vector is still used.
main_types <- c("core", "opinionated", "external opinion")
```

# Analysis 1: development of yhdenvertaisuus/tasa-arvo in different text genres

## Master chart

```{r}
# Yearly counts per media, reference word and text genre of the articles that
# contain the word, with the per-media article totals joined in.
my_d <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year, by = c("media", "type", "year_created")) %>%
  collect()

# The same counts, restricted to groups in which both reference words occur.
my_d2 <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(a_id, media, type, type2, year_created) %>%
  filter(any(category == "yhdenvertaisuus"), any(category == "tasa-arvo")) %>%
  group_by(media, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year, by = c("media", "type", "year_created")) %>%
  collect()

# Stack the single-word series ("Word") with the both-words series ("Both"),
# the latter repeated under each category so it shows up in both facet rows,
# then fix the factor orders used by the plots.
my_d <- my_d %>%
  mutate(word = "Word") %>%
  union_all(
    my_d2 %>%
      mutate(category = "tasa-arvo") %>%
      mutate(word = "Both")
  ) %>%
  union_all(
    my_d2 %>%
      mutate(category = "yhdenvertaisuus") %>%
      mutate(word = "Both")
  ) %>%
  mutate(
    word = fct_relevel(word, "Word"),
    category = fct_relevel(category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
```

```{r,fig.width=28,fig.height=8,eval=FALSE}
# Master chart for the "Other" article type only (chunk is not evaluated).
my_d %>%
  filter(type == "Other") %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )
  ) +
  geom_step() +
  # Red reference line at the year 2009.
  geom_vline(xintercept = 2009, color = "red") +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  facet_grid(category ~ type2, scales = "free") +
  labs(
    color = "Media",
    linetype = "Signal",
    x = "Year",
    y = "Percentage of articles of type containing the word"
  )
```

```{r,fig.width=8,fig.height=6}
# Master chart for all types except "Other", drawn as three pages of
# 2 x 4 panels.
map(1:3, function(page) {
  my_d %>%
    filter(type != "Other") %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    # Red reference line at the year 2009.
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(category ~ type2, scales = "free", nrow = 2, ncol = 4, page = page) +
    labs(
      color = "Media",
      linetype = "Signal",
      x = "Year",
      y = "Percentage of articles of type containing the word"
    )
})
```

Conclusions:

- IL lags behind the other media in the change, but follows the same trajectory
- For foreign news, STT (and therefore IL) do not increase tasa-arvo terminology usage
- For local news, terminology usage increases only for HS (Helsinki) as opposed to YLE (regional news)
- For culture/entertainment, other sources differ from IL. This is probably due to category heterogeneity: for IL, this category contains entertainment news, for others, these are more culture reviews etc.

## Does change in media composition affect results?

```{r,fig.width=8,fig.height=6}
# Compares the share computed from HS and STT alone with the share computed
# from all media, drawn as three pages of 2 x 4 panels.
map(1:3, function(page) {
  my_d %>%
    filter(type != "Other") %>%
    mutate(
      # Counts restricted to HS/STT; rows of other media contribute zero.
      l_total_articles = if_else(media %in% c("HS", "STT"), total_articles, 0L),
      l_articles = if_else(media %in% c("HS", "STT"), articles, 0L)
    ) %>%
    group_by(year_created, category, type2, word) %>%
    summarize(
      `HS/STT` = sum(l_articles) / sum(l_total_articles),
      `All medias` = sum(articles) / sum(total_articles),
      .groups = "drop"
    ) %>%
    pivot_longer(`HS/STT`:`All medias`) %>%
    ggplot(aes(x = year_created, y = value, color = name, linetype = word)) +
    geom_step() +
    # Red reference line at the year 2009.
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(category ~ type2, scales = "free", nrow = 2, ncol = 4, page = page) +
    labs(
      color = "Media",
      x = "Year",
      y = "Percentage of articles of type containing the word"
    )
})
```

Conclusions:

 - If we mention the IL/STT behavior from above in the text, we can then drop the per-media breakdown from the graphs, as the individual media do not otherwise affect the results for the main categories of interest.

## Final graph to include in article

```{r}
# Final figure: yearly share of articles containing each reference word, per
# text genre, for the three main article types with all media pooled.
d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  filter(
    type %in% c(
      "Domestic general/political/economic news",
      "Journalistic opinion",
      "External opinion"
    )
  ) %>%
  left_join(
    # Yearly article totals per type, summed over the media.
    articles_by_type_by_year %>%
      group_by(type, year_created) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop"),
    by = c("type", "year_created")
  ) %>%
  group_by(category, type2, year_created) %>%
  summarize(
    total_articles = min(total_articles),
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(type2 = fct_relevel(type2, "External opinion", "Journalistic opinion")) %>%
  mutate(category = fct_relevel(category, "yhdenvertaisuus")) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = type2)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 5)) +
  facet_wrap(~category, scales = "free_y") +
  labs(
    color = "Text type",
    x = "Year",
    y = "Percentage of articles of type containing the word"
  ) +
  # Legend placed inside the panel area, near the top-left corner.
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.02, 0.98),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  )
```

# Analysis 2: distribution of language by speaker and subject category

```{r}

# Yearly number of in-quote tokens for the two reference words, for quotes
# with organisation category "politiikka" or "oikeus".
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% c("yhdenvertaisuus", "tasa-arvo"),
    in_quote == TRUE
  ) %>%
  group_by(org_cat, year_created, category) %>%
  summarize(n = n(), .groups = "drop") %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

# The same counts for the key subject categories.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% key_cats,
    in_quote == TRUE
  ) %>%
  group_by(org_cat, year_created, category) %>%
  summarize(n = n(), .groups = "drop") %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")
```

# Analysis 3: subject associations

```{r}
# Yearly counts, per media, reference category and text genre, of articles in
# which the lemma "suku#puoli" occurs in a paragraph listed in yv_ta_paragraphs.
sp <- yv_ta_paragraphs %>%
  inner_join(
    words %>%
      filter(lemma == "suku#puoli") %>%
      inner_join(corpus)
  ) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  # Opinion types keep their name; other types are split into quoted and
  # journalistic text.
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  ) %>%
  group_by(media, ref_category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
  
# Lazy table of the tokens of interest (lemma "suomalainen" excluded) that lie
# in paragraphs listed in yv_ta_paragraphs, with article metadata, article
# type, reference category and text genre (type2).
my_cd <- corpus_of_interest %>%
  filter(lemma != "suomalainen") %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  # Opinion types keep their name; other types are split into quoted and
  # journalistic text.
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )

# Yearly article/week/day counts from my_cd, broken down by the genus column.
cd2 <- my_cd %>%
  group_by(ref_category, genus, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

# The same counts, broken down by subject category instead.
cd <- my_cd %>%
  group_by(ref_category, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
```

## Gender

```{r,fig.width=8,fig.height=6}
# Gender-term counts for all types except "Other", with the yearly article
# totals per type (summed over the media) joined in. sp is already a local
# tibble, so no further collect() is needed.
# NOTE(review): the series are drawn per media but divided by totals summed
# over all media -- confirm that this denominator is intended.
my_d <- sp %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  )

# Plot data, built once instead of once per page: single-category rows are
# kept as "Word"; rows of the "both" category are repeated under each
# reference word as "Both" so they appear in both facet rows.
gender_plot_d <- my_d %>%
  filter(ref_category != "both") %>%
  mutate(fct = ref_category, word = "Word") %>%
  union_all(
    my_d %>%
      filter(ref_category == "both") %>%
      mutate(fct = "tasa-arvo", word = "Both")
  ) %>%
  union_all(
    my_d %>%
      filter(ref_category == "both") %>%
      mutate(fct = "yhdenvertaisuus", word = "Both")
  ) %>%
  mutate(word = fct_relevel(word, "Word"))

# Draws one page (2 x 4 panels) of the gender chart. `facet_paginate` is the
# ggforce paginating facet function to use (facet_grid_paginate or
# facet_wrap_paginate).
plot_gender_page <- function(page, facet_paginate) {
  gender_plot_d %>%
    ggplot(aes(x = year_created, y = articles / total_articles, color = media, linetype = word)) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_paginate(fct ~ type2, scales = "free", ncol = 4, nrow = 2, page = page) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
}

# Grid layout (shared axes per row/column), pages 1-3.
1:3 %>% map(~ plot_gender_page(.x, facet_grid_paginate))

# Wrapped layout (independent panels), pages 1-3.
1:3 %>% map(~ plot_gender_page(.x, facet_wrap_paginate))
```

Conclusions:

 - Decline in "sukupuolten tasa-arvo" in STT for domestic news is interesting. What could cause this?
 - Otherwise, all sources seem to be following similar patterns.

```{r}
# Gender-term share in domestic general/political/economic news, with the
# media pooled: one line per reference category and text genre.
sp %>%
  filter(type == "Domestic general/political/economic news") %>%
  group_by(ref_category, type, type2, year_created) %>%
  summarize(articles = sum(articles), .groups = "drop") %>%
  inner_join(
    # Yearly article totals per type, summed over the media.
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = ref_category,
      linetype = type2
    )
  ) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2))
```

Conclusions:

 - Gender equality discussion also gets a boost after 2014
 - When "yhdenvertaisuus" is used in relation to gender, "tasa-arvo" is almost always also mentioned!

```{r,fig.width=8,fig.height=6}
# Genus-level counts for all types except "Other", with the yearly article
# totals per type (summed over the media) joined in.
my_d <- cd2 %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()

# Grid layout, pages 1-3 of 2 x 4 panels.
map(1:3, function(page) {
  ggplot(my_d, aes(x = year_created, y = articles / total_articles, color = ref_category)) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(genus ~ type2, scales = "free", ncol = 4, nrow = 2, page = page) +
    labs(
      color = "ref-category",
      linetype = "Signal",
      x = "Year",
      y = "Percentage of articles of type containing the word"
    )
})

# Wrapped layout, pages 1-3 of 2 x 4 panels.
map(1:3, function(page) {
  ggplot(my_d, aes(x = year_created, y = articles / total_articles, color = ref_category)) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(genus ~ type2, scales = "free", ncol = 4, nrow = 2, page = page) +
    labs(
      color = "ref-category",
      linetype = "Signal",
      x = "Year",
      y = "Percentage of articles of type containing the word"
    )
})
```

Conclusions:

- Same behavior seen for "sukupuolten tasa-arvo/yhdenvertaisuus" is also evident when looking at explicitly gendered words (mies/nainen)

## Subject topic

```{r,fig.width=8,fig.height=6}
# Plot data for the key subject categories, built once. Previously the
# aggregated totals were queried from the database again for every page of
# both layouts (six times in total).
subject_plot_d <- cd %>%
  filter(type != "Other") %>%
  filter(category %in% key_cats) %>%
  inner_join(
    # Yearly article totals per type, summed over the media.
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  )

# Draws one page (2 x 4 panels) of the subject chart. `facet_paginate` is the
# ggforce paginating facet function to use (facet_grid_paginate or
# facet_wrap_paginate).
plot_subject_page <- function(page, facet_paginate) {
  subject_plot_d %>%
    ggplot(aes(x = year_created, y = articles / total_articles, color = category)) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_paginate(ref_category ~ type2, scales = "free", nrow = 2, ncol = 4, page = page)
}

# Grid layout, pages 1-3.
1:3 %>% map(~ plot_subject_page(.x, facet_grid_paginate))

# Wrapped layout, pages 1-3.
1:3 %>% map(~ plot_subject_page(.x, facet_wrap_paginate))
```

Conclusions:

 - There don't seem to be major discernible patterns between different subjects for equality

## Supporting auxiliary analyses

### Subject topic graphs using different measures

```{r}
# One line plot per ref_category: the `days` measure per year for the selected
# subject categories, faceted by type2 with free scales and titled with the
# ref_category of the group.
cd %>%
  filter(
    category %in%
      c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows, group_key) {
    ggplot(group_rows, aes(x = year_created, y = days, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(group_key[["ref_category"]])
  })

# One line plot per ref_category: the `weeks` measure per year for the
# selected subject categories, faceted by type2 with free scales and titled
# with the ref_category of the group.
cd %>%
  filter(
    category %in%
      c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows, group_key) {
    ggplot(group_rows, aes(x = year_created, y = weeks, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(group_key[["ref_category"]])
  })

# One line plot per ref_category: articles/total_articles per year for the
# selected subject categories, faceted by type2 with free scales.
# total_articles comes from articles_by_type_by_year summed per
# (year_created, type); the join relies on shared column names.
cd %>%
  filter(
    category %in%
      c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarise(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows, group_key) {
    ggplot(
      group_rows,
      aes(x = year_created, y = articles / total_articles, color = category)
    ) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(group_key[["ref_category"]])
  })
```

### Do the subject topics combined capture the phenomenon?

```{r}
# Yearly counts of distinct articles, weeks and dates per ref_category and
# article type, over all paragraphs joined to corpus_of_interest except those
# with the lemma "suomalainen". All joins rely on shared column names.
cd3 <- yv_ta_paragraphs %>%
  inner_join(corpus_of_interest) %>%
  filter(lemma != "suomalainen") %>%
  inner_join(select(articles, a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created),
    # NOTE(review): every row gets the single label "potilas"; presumably a
    # placeholder so the combined data passes the category filter used by the
    # plots below -- confirm.
    category = "potilas"
  ) %>%
  group_by(ref_category, category, type, year_created) %>%
  summarise(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect()

# One line plot per ref_category from cd3: distinct dates per year, faceted by
# article type (with "foreign", "sports", "other" moved to the end of the
# facet order) and titled with the ref_category of the group.
cd3 %>%
  filter(
    category %in%
      c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  mutate(
    type = fct_relevel(type, "foreign", "sports", "other", after = Inf)
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows, group_key) {
    ggplot(group_rows, aes(x = year_created, y = days, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type, scales = "free") +
      ggtitle(group_key[["ref_category"]])
  })

# One line plot per ref_category from cd3: distinct weeks per year, faceted by
# article type (with "foreign", "sports", "other" moved to the end of the
# facet order) and titled with the ref_category of the group.
cd3 %>%
  filter(
    category %in%
      c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  mutate(
    type = fct_relevel(type, "foreign", "sports", "other", after = Inf)
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows, group_key) {
    ggplot(group_rows, aes(x = year_created, y = weeks, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type, scales = "free") +
      ggtitle(group_key[["ref_category"]])
  })

# One line plot per ref_category from cd3: articles/total_articles per year,
# faceted by article type. The totals are joined in (on shared column names)
# before `type` is releveled, so the join sees the original `type` values.
cd3 %>%
  filter(
    category %in%
      c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarise(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  mutate(
    type = fct_relevel(type, "foreign", "sports", "other", after = Inf)
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows, group_key) {
    ggplot(
      group_rows,
      aes(x = year_created, y = articles / total_articles, color = category)
    ) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type, scales = "free") +
      ggtitle(group_key[["ref_category"]])
  })
```

Conclusions:

 - While we saw no discernible patterns between subjects, we do seem to be capturing the "whole" of the equality discussion by targeting them, so we can conclude that everyone benefits.

# Background Analyses

```{r}
# Distinct publication dates per year for each media, shown as step lines in
# a ref_category (rows) x type (columns) grid with free scales. Joins rely on
# shared column names.
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarise(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, y = days, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")

# Distinct publication weeks per year for each media, shown as step lines in
# a ref_category (rows) x type (columns) grid with free scales. Joins rely on
# shared column names.
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarise(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, y = weeks, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")

# Yearly articles/total_articles ratio for each media, shown as percent-labelled
# step lines in a ref_category (rows) x type (columns) grid with free scales.
# total_articles is supplied by the join with articles_by_type_by_year, which
# (like the other joins here) relies on shared column names.
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarise(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year) %>%
  ggplot(
    aes(x = year_created, y = articles / total_articles, color = media)
  ) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
```

# Auxiliary background analyses

```{r}
# Number of articles per media and calendar month, drawn as step lines.
ggplot(
  articles %>%
    mutate(
      year_created = year(date_created),
      month_created = month(date_created)
    ) %>%
    count(media, year_created, month_created),
  aes(
    # The x position is the first day of each (year, month) pair.
    x = as.Date(str_c(year_created, "-", month_created, "-01")),
    y = n,
    color = media
  )
) +
  geom_step() +
  theme_hsci_discrete()
```

```{r}
# Number of articles per year and article type, one free-scaled panel per
# media. The join with article_types relies on shared column names.
ggplot(
  articles %>%
    inner_join(article_types) %>%
    mutate(year_created = year(date_created)) %>%
    count(media, year_created, type),
  aes(x = year_created, y = n, color = type)
) +
  geom_step() +
  facet_wrap(~media, scales = "free") +
  theme_hsci_discrete()
```

```{r}
# Yearly IL article counts per article type, split by whether the author
# field equals "STT". Counting happens before collect(); afterwards types are
# lumped with fct_lump_n() (n = 11, weighted by count) and the counts are
# re-aggregated over the lumped types.
articles %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    from_stt = (author == "STT")
  ) %>%
  filter(media == "IL") %>%
  count(media, year_created, type, from_stt) %>%
  collect() %>%
  group_by(media) %>%
  mutate(type = fct_lump_n(type, 11, w = n)) %>%
  count(media, year_created, type, from_stt, wt = n) %>%
  ggplot(
    aes(x = year_created, y = n, color = from_stt == 1)
  ) +
  geom_step() +
  facet_wrap(type ~ media, scales = "free") +
  theme_hsci_discrete()
```
