General Setup
Analysis table creation. Don’t run if already created. Instead skip
to “Analysis table loads”
# Rebuild the persistent `articles_cdp` table: the ids of articles filed under
# each outlet's domestic / politics / economy sections.
dbExecute(con, "DROP TABLE IF EXISTS articles_cdp")
articles_cdp <- articles %>%
  filter(
    case_when(
      media == "HS" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      media == "IL" ~ subsection %in% c("kotimaa", "politiikka", "talous", "uutiset"),
      media == "STT" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      # YLE: the subject must name a domestic/politics/economy topic, and an
      # "Ulkomaat" subject is tolerated only next to "Kotimaan uutiset".
      media == "YLE" ~ section == "Yle Uutiset" &
        str_detect(subject, "Kotimaan uutiset|politiikka|talous") &
        (!str_detect(subject, "Ulkomaat") | str_detect(subject, "Kotimaan uutiset")),
      # Any other outlet is excluded.
      TRUE ~ FALSE
    )
  ) %>%
  distinct(a_id) %>%
  compute(
    name = "articles_cdp",
    temporary = FALSE,
    unique_indexes = c("a_id")
  )
# Rebuild the persistent `articles_opinionated` table: one opinion label per
# article, derived per outlet from section, story logo, subject and title.
# Articles that match no rule get NA and are dropped.
dbExecute(con, "DROP TABLE IF EXISTS articles_opinionated")
articles_opinionated <- articles %>%
  mutate(
    opinionated = case_when(
      media == "HS" ~ case_when(
        str_to_lower(section) == "pääkirjoitus" &
          str_detect(str_to_lower(story_logo), "ieras") ~ "external editorial",
        str_to_lower(section) == "pääkirjoitus" &
          is.na(story_logo) ~ "editorial",
        str_to_lower(section) == "pääkirjoitus" &
          str_to_lower(story_logo) == "pääkirjoitus" ~ "editorial",
        str_to_lower(section) == "mielipide" |
          str_to_lower(story_logo) == "mielipide" ~ "external opinion",
        str_detect(str_to_lower(title), "analyysi:") |
          str_detect(str_to_lower(story_logo), "analyysi") ~ "analysis",
        str_detect(str_to_lower(title), "näkökulma:") |
          str_detect(str_to_lower(story_logo), "näkökulma") ~ "perspective",
        str_detect(str_to_lower(title), "kolumni:") |
          str_detect(str_to_lower(story_logo), "kolumni") ~ "column",
        str_detect(str_to_lower(title), "blogi:") |
          str_detect(str_to_lower(story_logo), "blog") ~ "blog"
      ),
      media == "IL" ~ case_when(
        subsection == "paakirjoitus" ~ "editorial",
        str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
        str_detect(str_to_lower(title), "analyysi:") ~ "analysis",
        str_detect(str_to_lower(title), "kolumni:") ~ "column",
        str_detect(str_to_lower(title), "näkökulma:") ~ "perspective"
      ),
      media == "YLE" ~ case_when(
        str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
        str_detect(str_to_lower(title), "analyysi:") |
          str_detect(subject, "Analyysit \\(Yle Uutiset\\)") ~ "analysis",
        str_detect(str_to_lower(title), "kolumni:") |
          str_detect(str_to_lower(subject), "kolumn") ~ "column",
        str_detect(str_to_lower(title), "näkökulma:") |
          str_detect(str_to_lower(subject), "näkökulm") ~ "perspective",
        str_detect(str_to_lower(title), "blogi:") |
          str_detect(str_to_lower(subject), "blog") ~ "blog"
      )
    )
  ) %>%
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  compute(
    name = "articles_opinionated",
    temporary = FALSE,
    unique_indexes = c("a_id")
  )
# Hand-labelled lemmas, minus the rows labelled "adjektiivi" or "ei".
labels <- read_tsv(here("data/person_labels.tsv")) %>%
  filter(!category %in% c("adjektiivi", "ei"))

# Upload the labels to the database and append every lemma matching the
# "yhden#ve" / "tasa#arv" patterns, tagged with its reference category.
lemmas_of_interest <- labels %>%
  rename("lemma" = "name") %>%
  copy_to(con, ., name = "labels", overwrite = TRUE) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "yhden#ve")) %>%
      distinct(lemma) %>%
      mutate(category = "yhdenvertaisuus")
  ) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "tasa#arv")) %>%
      distinct(lemma) %>%
      mutate(category = "tasa-arvo")
  ) %>%
  distinct()
# Persist the word ids whose lemma is of interest, with category and genus.
dbExecute(con, "DROP TABLE IF EXISTS words_of_interest")
words_of_interest <- words %>%
  inner_join(lemmas_of_interest) %>%
  distinct(w_id, lemma, category, genus) %>%
  compute(
    name = "words_of_interest",
    temporary = FALSE,
    indexes = c("w_id", "category", "genus")
  )

# Persist the corpus rows that carry one of those words.
dbExecute(con, "DROP TABLE IF EXISTS corpus_of_interest")
corpus_of_interest <- corpus %>%
  inner_join(words_of_interest) %>%
  compute(
    name = "corpus_of_interest",
    temporary = FALSE,
    indexes = list(
      c("a_id", "par_id", "s_id", "pos"),
      c("w_id"),
      c("genus"),
      c("category")
    )
  )
# Persist one `type` label per article. Rules are evaluated top to bottom, so
# non-final STT versions and opinion pieces are decided before the
# section-based news categories; anything unmatched ends up as "Other".
dbExecute(con, "DROP TABLE IF EXISTS article_types")
article_types <- articles %>%
  left_join(articles_cdp %>% mutate(cdp = TRUE)) %>%
  left_join(articles_opinionated) %>%
  mutate(
    type = case_when(
      media == "STT" & version != "Loppuversio" ~ "Other",
      # Opinion labels starting with "external " are external opinion, the
      # remaining opinion labels are journalistic opinion.
      !is.na(opinionated) & !str_detect(opinionated, "^external ") ~ "Journalistic opinion",
      !is.na(opinionated) ~ "External opinion",
      cdp ~ "Domestic general/political/economic news",
      media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
      media == "IL" & section == "viihde" ~ "Culture/entertainment",
      media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
      media == "YLE" & section == "Yle Uutiset" &
        str_detect(subject, "kulttuuri|musiikki|viihde") &
        !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",
      media == "HS" & section == "Kaupunki" ~ "Local news",
      media == "YLE" & section == "Yle Uutiset" & coverage == "local" ~ "Local news",
      media == "STT" & section == "Urheilu" ~ "Sports",
      media == "HS" & section == "Urheilu" ~ "Sports",
      media == "YLE" & section == "YLE Urheilu" ~ "Sports",
      media == "IL" & section == "urheilu" ~ "Sports",
      media == "STT" & section == "Ulkomaat" ~ "Foreign news",
      media == "HS" & section == "Ulkomaat" ~ "Foreign news",
      media == "YLE" & section == "Yle Uutiset" &
        str_detect(subject, "Ulkomaat") &
        !str_detect(subject, "Kotimaan uutiset") ~ "Foreign news",
      media == "IL" & subsection == "ulkomaat" ~ "Foreign news",
      TRUE ~ "Other"
    )
  ) %>%
  distinct(a_id, type) %>%
  compute(
    temporary = FALSE,
    name = "article_types",
    unique_indexes = list(
      c("a_id"),
      c("a_id", "type"),
      c("type", "a_id")
    )
  )
# Persist the denominator table: number of articles per outlet, creation year
# and article type.
dbExecute(con, "DROP TABLE IF EXISTS articles_by_type_by_year")
articles_by_type_by_year <- articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type, name = "total_articles") %>%
  compute(
    unique_indexes = list(c("media", "year_created", "type")),
    temporary = FALSE,
    name = "articles_by_type_by_year"
  )
# Persist the mapping from article to reference category.
dbExecute(con, "DROP TABLE IF EXISTS articles_to_ref_categories")

# Step 1: classify each article as "yhdenvertaisuus", "tasa-arvo" or "both",
# depending on which reference words occur in it.
articles_to_ref_categories <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus) %>%
  group_by(a_id) %>%
  summarize(
    yv = max(category == "yhdenvertaisuus"),
    ta = max(category == "tasa-arvo"),
    .groups = "drop"
  ) %>%
  mutate(
    ref_category = case_when(
      yv == 1 & ta == 1 ~ "both",
      yv == 1 ~ "yhdenvertaisuus",
      ta == 1 ~ "tasa-arvo"
    )
  ) %>%
  select(a_id, ref_category) %>%
  compute()

# Step 2: list every "both" article under each single category as well, so
# such an article has three rows in the final table.
articles_to_ref_categories <- articles_to_ref_categories %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "yhdenvertaisuus")
  ) %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "tasa-arvo")
  ) %>%
  compute(
    temporary = FALSE,
    name = "articles_to_ref_categories",
    indexes = c("a_id"),
    unique_indexes = list(
      c("a_id", "ref_category"),
      c("ref_category", "a_id")
    )
  )
# Corpus rows whose word belongs to one of the two reference categories.
yv_ta_corpus <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus)

# Persist the distinct (article, paragraph) pairs containing such a word.
dbExecute(con, "DROP TABLE IF EXISTS yv_ta_paragraphs")
yv_ta_paragraphs <- yv_ta_corpus %>%
  distinct(a_id, par_id) %>%
  compute(
    temporary = FALSE,
    name = "yv_ta_paragraphs",
    unique_indexes = list(c("a_id", "par_id"))
  )
Analysis table loads
# Lazy handles to the persisted analysis tables created above.
articles_cdp               <- tbl(con, "articles_cdp")
articles_opinionated       <- tbl(con, "articles_opinionated")
article_types              <- tbl(con, "article_types")
articles_by_type_by_year   <- tbl(con, "articles_by_type_by_year")
words_of_interest          <- tbl(con, "words_of_interest")
corpus_of_interest         <- tbl(con, "corpus_of_interest")
articles_to_ref_categories <- tbl(con, "articles_to_ref_categories")
yv_ta_paragraphs           <- tbl(con, "yv_ta_paragraphs")
Named query definitions
# Corpus positions lying between a quote's start and end position
# (sentence id plus position within the sentence), with the quote id.
quotation_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Corpus positions in any sentence from a quote's first to its last sentence,
# regardless of the position within the sentence.
quotation_sentence_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND s_id >= start_s_id AND s_id <= end_s_id"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Per-category views of the corpus of interest and their paragraphs.
yv_corpus <- corpus_of_interest %>%
  filter(category == "yhdenvertaisuus")
yv_paragraphs <- yv_corpus %>%
  distinct(a_id, par_id)

ta_corpus <- corpus_of_interest %>%
  filter(category == "tasa-arvo")
ta_paragraphs <- ta_corpus %>%
  distinct(a_id, par_id)
# Organisation category and author head per quote.
#
# `q_id` is kept on purpose. The table is later joined onto token rows that
# already carry a `q_id`; without that key the join matches on `a_id` alone,
# and every token in an article receives the organisations of all quotes in
# the article, not just those of its own quote.
# NOTE(review): any other natural join against `quote_orgs` outside this
# section will now also match on `q_id` where the other side has it; confirm.
quote_orgs <- read_tsv(here("data/q_id_to_orgs.tsv")) %>%
  select(q_id, a_id, author_head, org_cat)
New names:
• `` -> `...1`
Rows: 19362 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): org_cat, author_head
dbl (3): ...1, q_id, a_id
ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Token-level analysis table: every word of interest with article metadata,
# article type, reference category and quotation flags.
#
# in_quote          : the token lies inside a quotation span
# in_quote_sentence : the token lies in a sentence that contains a quotation
# in_quote_head     : in such a sentence but outside the quotation itself
# type2             : article type, split for non-opinion types into quoted
#                     sentences and journalistic text
d <- corpus_of_interest %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  left_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote = TRUE)) %>%
  # Match quotation sentences on token position only. After the previous join
  # `q_id` is NULL for every token outside a quotation, so also matching on
  # `q_id` could never flag those tokens and `in_quote_sentence` collapsed
  # into `in_quote`. `distinct()` stops overlapping quotes duplicating rows.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # Both flags are NULL when their join found no match; treat that as FALSE so
  # the result is a real boolean instead of NULL.
  mutate(
    in_quote_head = coalesce(in_quote_sentence, FALSE) & !coalesce(in_quote, FALSE)
  ) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
Joining, by = c("a_id", "par_id", "s_id", "pos", "q_id")
Joining, by = "a_id"
# Same token-level table as `d`, restricted to paragraphs that contain a
# tasa-arvo / yhdenvertaisuus word and to articles with a reference category.
d2 <- corpus_of_interest %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote = TRUE)) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # Match quotation sentences on token position only. `q_id` is NULL for
  # tokens outside a quotation, so also matching on it would make
  # `in_quote_sentence` identical to `in_quote`. `distinct()` stops
  # overlapping quotes duplicating rows.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  # Unmatched flags are NULL; coalesce them so the head flag is a real
  # boolean instead of NULL.
  mutate(
    in_quote_head = coalesce(in_quote_sentence, FALSE) & !coalesce(in_quote, FALSE)
  ) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos", "q_id")
# Subject categories singled out in the analyses below.
key_cats <- c(
  "potilas",
  "maahanmuutto",
  "etnos",
  "seksuaalisuus",
  "työsuhde"
)
# NOTE(review): `main_types` is not referenced in the part of the notebook
# visible here, and its values differ from the `type` labels used elsewhere.
main_types <- c(
  "core",
  "opinionated",
  "external opinion"
)
Analysis 1: development of yhdenvertaisuus/tasa-arvo in different
text genres
Master chart
# Articles (and distinct days) per outlet, reference word, article type,
# text type and year, joined to the yearly article totals.
my_d <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(
    articles_by_type_by_year,
    by = c("media", "type", "year_created")
  ) %>%
  collect()

# Same counts, restricted to article / text-type groups in which both
# reference words occur.
my_d2 <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(a_id, media, type, type2, year_created) %>%
  filter(
    any(category == "yhdenvertaisuus"),
    any(category == "tasa-arvo")
  ) %>%
  group_by(media, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(
    articles_by_type_by_year,
    by = c("media", "type", "year_created")
  ) %>%
  collect()
Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning
This warning is displayed once every 8 hours.
# Stack the single-word series ("Word") and the co-occurrence series ("Both").
# The "Both" rows are repeated under each category so they show up in both
# facet rows. Factor levels are then ordered for plotting.
my_d <- my_d %>%
  mutate(word = "Word") %>%
  union_all(
    my_d2 %>%
      mutate(category = "tasa-arvo") %>%
      mutate(word = "Both")
  ) %>%
  union_all(
    my_d2 %>%
      mutate(category = "yhdenvertaisuus") %>%
      mutate(word = "Both")
  ) %>%
  mutate(
    word = fct_relevel(word, "Word"),
    category = fct_relevel(category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
# Share of "Other"-type articles containing each word, per outlet and year.
my_d %>%
  filter(type == "Other") %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )
  ) +
  geom_step() +
  geom_vline(xintercept = 2009, color = "red") +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  facet_grid(category ~ type2, scales = "free") +
  labs(color = "Media", linetype = "Signal") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word")
# Master chart for all types except "Other", three pages of 2 x 4 facets.
map(1:3, function(page_no) {
  my_d %>%
    filter(type != "Other") %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
[[2]]
[[3]]



Conclusions:
- IL is behind other media in change, but same trajectory
- For foreign news, STT (and therefore IL) do not increase tasa-arvo
terminology usage
- For local news, terminology usage increases only for HS (Helsinki)
as opposed to YLE (regional news)
- For culture/entertainment, other sources differ from IL. This is
probably due to category heterogeneity: for IL, this category contains
entertainment news, for others, these are more culture reviews etc.
Final graph to include in article
# Figure for the article: share of articles containing each reference word,
# by text type and year, for domestic news and the two opinion types, with
# outlets pooled (yearly totals are summed over outlets).
d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  filter(
    type %in% c(
      "Domestic general/political/economic news",
      "Journalistic opinion",
      "External opinion"
    )
  ) %>%
  left_join(
    articles_by_type_by_year %>%
      group_by(type, year_created) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop"),
    by = c("type", "year_created")
  ) %>%
  group_by(category, type2, year_created) %>%
  summarize(
    total_articles = min(total_articles),
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(type2 = fct_relevel(type2, "External opinion", "Journalistic opinion")) %>%
  mutate(category = fct_relevel(category, "yhdenvertaisuus")) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = type2)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 5)) +
  facet_wrap(~category, scales = "free_y") +
  labs(color = "Text type") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word") +
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.02, 0.98),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  )

Analysis 2: distribution of language by speaker and subject
category
# Quoted occurrences of the two reference words per year, for speakers in the
# "politiikka" and "oikeus" organisation categories.
#
# `d` can hold several rows for one token: an article with both reference
# words has three rows in `articles_to_ref_categories`, and the quote
# organisation join can fan out as well. Counting rows with `n()` inflated the
# totals, so each token position is reduced to one row per organisation
# category and word category before counting.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% c("yhdenvertaisuus", "tasa-arvo"),
    in_quote == TRUE
  ) %>%
  distinct(org_cat, year_created, category, a_id, par_id, s_id, pos) %>%
  count(org_cat, year_created, category, name = "n") %>%
  collect() %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

# Quoted occurrences of the key subject categories per year, for speakers in
# the "politiikka" and "oikeus" organisation categories.
#
# `d` can hold several rows for one token: an article with both reference
# words has three rows in `articles_to_ref_categories`, and the quote
# organisation join can fan out as well. Counting rows with `n()` inflated the
# totals, so each token position is reduced to one row per organisation
# category and word category before counting.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% key_cats,
    in_quote == TRUE
  ) %>%
  distinct(org_cat, year_created, category, a_id, par_id, s_id, pos) %>%
  count(org_cat, year_created, category, name = "n") %>%
  collect() %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

Analysis 3: subject associations
# Occurrences of the lemma "suku#puoli" inside paragraphs that contain a
# reference word, counted per outlet, reference category, article type,
# text type and year.
sp <- yv_ta_paragraphs %>%
  inner_join(
    words %>%
      filter(lemma == "suku#puoli") %>%
      inner_join(corpus)
  ) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  ) %>%
  group_by(media, ref_category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
Joining, by = "w_id"
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
# Words of interest (except the lemma "suomalainen") that occur in paragraphs
# containing a reference word, with article metadata and text type.
my_cd <- corpus_of_interest %>%
  filter(lemma != "suomalainen") %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
# Yearly article / week / day counts from `my_cd`, broken down by genus.
cd2 <- my_cd %>%
  group_by(ref_category, genus, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

# The same counts, broken down by subject category instead of genus.
cd <- my_cd %>%
  group_by(ref_category, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
Gender
# "suku#puoli" counts for all types except "Other", joined to the yearly
# article totals summed over outlets.
my_d <- sp %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()
Joining, by = c("type", "year_created")
# Gender chart, grid layout, three pages. Rows tagged "both" are repeated
# under each single reference category as the "Both" line type.
map(1:3, function(page_no) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      fct ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



# Gender chart again, using wrapped facets instead of a grid, three pages.
map(1:3, function(page_no) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      fct ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



Conclusions:
- Decline in “sukupuolten tasa-arvo” in STT for domestic news is
interesting. What could cause this?
- Otherwise, all sources seem to be following similar patterns.
# "suku#puoli" in domestic news: outlets pooled, share of articles per
# reference category and text type.
sp %>%
  filter(type == "Domestic general/political/economic news") %>%
  group_by(ref_category, type, type2, year_created) %>%
  summarize(articles = sum(articles), .groups = "drop") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = ref_category,
      linetype = type2
    )
  ) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2))
Joining, by = c("type", "year_created")

Conclusions:
- Gender equality discussion also gets a boost after 2014
- When “yhdenvertaisuus” is used in relation to gender, “tasa-arvo” is
almost always also mentioned!
# Genus-level counts for all types except "Other", joined to the yearly
# article totals summed over outlets.
my_d <- cd2 %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()
Joining, by = c("type", "year_created")
# Genus chart, grid layout, three pages of 2 x 4 facets.
map(1:3, function(page_no) {
  my_d %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = ref_category
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      genus ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "ref-category", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



# Genus chart again, using wrapped facets instead of a grid, three pages.
map(1:3, function(page_no) {
  my_d %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = ref_category
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      genus ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page_no
    ) +
    labs(color = "ref-category", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



Conclusions:
- Same behavior seen for “sukupuolten tasa-arvo/yhdenvertaisuus” is
also evident when looking at explicitly gendered words
(mies/nainen)
Subject topic
# Subject-topic chart, grid layout: share of articles per subject category,
# reference category and text type, three pages.
map(1:3, function(page_no) {
  cd %>%
    filter(type != "Other") %>%
    filter(
      category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
    ) %>%
    inner_join(
      articles_by_type_by_year %>%
        group_by(year_created, type) %>%
        summarize(total_articles = sum(total_articles), .groups = "drop") %>%
        collect()
    ) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = category
      )
    ) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      ref_category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page_no
    )
})
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



# Subject-topic chart again, using wrapped facets instead of a grid.
map(1:3, function(page_no) {
  cd %>%
    filter(type != "Other") %>%
    filter(
      category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
    ) %>%
    inner_join(
      articles_by_type_by_year %>%
        group_by(year_created, type) %>%
        summarize(total_articles = sum(total_articles), .groups = "drop") %>%
        collect()
    ) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = category
      )
    ) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      ref_category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page_no
    )
})
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?



Conclusions:
- There don’t seem to be major discernible patterns between different
subjects for equality
Supporting auxiliary analyses
Subject topic graphs using different measures
# One plot per reference category: distinct days per year on which each
# subject category occurs, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(rows, key) {
    ggplot(rows, aes(x = year_created, y = days, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(key[1]$ref_category)
  })
[[1]]
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]



# One plot per reference category: distinct week numbers per year in which
# each subject category occurs, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(rows, key) {
    ggplot(rows, aes(x = year_created, y = weeks, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(key[1]$ref_category)
  })
[[1]]
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]



# One plot per reference category: share of articles per year in which each
# subject category occurs, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  group_by(ref_category) %>%
  group_map(function(rows, key) {
    ggplot(rows, aes(x = year_created, y = articles / total_articles, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(key[1]$ref_category)
  })
Joining, by = c("type", "year_created")
[[1]]
[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
[[3]]



Do the subject topics combined capture the phenomenon?
# All subject words combined: yearly article / week / day counts of reference
# paragraphs that also contain at least one other word of interest.
#
# `corpus_of_interest` also contains the tasa-arvo / yhdenvertaisuus tokens
# themselves. Left in, they make every reference paragraph match itself, so
# the combined series trivially covers every reference article. They are
# removed before all remaining categories are collapsed into one label.
# NOTE(review): this keeps every labelled subject category, not only the five
# key ones plotted elsewhere; narrow the filter if only those are intended.
cd3 <- yv_ta_paragraphs %>%
  inner_join(corpus_of_interest) %>%
  filter(lemma != "suomalainen") %>%
  filter(!category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(week_created = week(date_created)) %>%
  # Collapse every remaining category into one label. "potilas" is reused so
  # the category filters in the plotting code below keep the rows.
  mutate(category = "potilas") %>%
  group_by(ref_category, category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect()
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
# One plot per reference category: distinct days per year for the combined
# subject words, faceted by article type.
cd3 %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  # Move the less central types to the end of the facet order. The level names
  # must match the `type` labels exactly; the previous "foreign" / "sports" /
  # "other" matched nothing and only produced an "Unknown levels" warning.
  mutate(type = fct_relevel(type, "Foreign news", "Sports", "Other", after = Inf)) %>%
  group_by(ref_category) %>%
  group_map(~ .x %>%
    ggplot(aes(x = year_created, y = days, color = category)) +
    geom_line() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
    facet_wrap(~type, scales = "free") +
    ggtitle(.y[1]$ref_category))
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]
[[2]]
[[3]]



# Number of distinct weeks per year with matching articles, one plot per
# ref_category, faceted by article type.
cd3 %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Move "foreign", "sports" and "other" to the end of the facet order, but
  # only those that actually occur in `type`. Naming absent levels directly
  # makes fct_relevel() warn ("Unknown levels in `f`") on every run.
  mutate(type=fct_relevel(
    type,
    function(lvls) intersect(c("foreign","sports","other"),lvls),
    after=Inf
  )) %>%
  group_by(ref_category) %>%
  group_map(~.x %>%
    ggplot(aes(x=year_created,y=weeks,color=category)) +
    geom_line() +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=2)) +
    facet_wrap(~type,scales="free") +
    # .y is the one-row tibble of group keys for the current group
    ggtitle(.y$ref_category))
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]
[[2]]
[[3]]



# Yearly share of matching articles relative to the total number of articles
# of the same type in that year, one plot per ref_category, faceted by type.
cd3 %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created,type) %>%
      # one total per (year, type), summed over the remaining columns
      summarize(total_articles=sum(total_articles),.groups="drop") %>%
      collect(),
    # spell out the keys instead of relying on shared column names
    by=c("type","year_created")
  ) %>%
  # Move "foreign", "sports" and "other" to the end of the facet order, but
  # only those that actually occur in `type`. Naming absent levels directly
  # makes fct_relevel() warn ("Unknown levels in `f`") on every run.
  mutate(type=fct_relevel(
    type,
    function(lvls) intersect(c("foreign","sports","other"),lvls),
    after=Inf
  )) %>%
  group_by(ref_category) %>%
  group_map(~.x %>%
    ggplot(aes(x=year_created,y=articles/total_articles,color=category)) +
    geom_line() +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=2)) +
    facet_wrap(~type,scales="free") +
    # .y is the one-row tibble of group keys for the current group
    ggtitle(.y$ref_category))
Joining, by = c("type", "year_created")
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]
[[2]]
[[3]]



Conclusions:
- While we saw no discernible patterns between the individual subjects,
targeting them together does seem to capture the “whole” of the equality
discussion, so we can conclude that everyone benefits.
Background Analyses
# Distinct days per year on which each media outlet published an article in
# each reference category, as a grid of ref_category (rows) by type (columns).
articles %>%
  # explicit keys: both lookups are matched on the article id only
  inner_join(articles_to_ref_categories,by="a_id") %>%
  inner_join(article_types,by="a_id") %>%
  mutate(
    year_created=year(date_created),
    week_created=week(date_created)
  ) %>%
  group_by(media,ref_category,type,year_created) %>%
  summarize(
    articles=n_distinct(a_id),
    weeks=n_distinct(week_created),
    days=n_distinct(date_created),
    .groups="drop"
  ) %>%
  ggplot(aes(x=year_created,y=days,color=media)) +
  geom_step() +
  theme_hsci_discrete(base_family="Arial") +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  facet_grid(ref_category~type,scales="free")
Joining, by = "a_id"
Joining, by = "a_id"

# Distinct weeks per year in which each media outlet published an article in
# each reference category, as a grid of ref_category (rows) by type (columns).
articles %>%
  # explicit keys: both lookups are matched on the article id only
  inner_join(articles_to_ref_categories,by="a_id") %>%
  inner_join(article_types,by="a_id") %>%
  mutate(
    year_created=year(date_created),
    week_created=week(date_created)
  ) %>%
  group_by(media,ref_category,type,year_created) %>%
  summarize(
    articles=n_distinct(a_id),
    weeks=n_distinct(week_created),
    days=n_distinct(date_created),
    .groups="drop"
  ) %>%
  ggplot(aes(x=year_created,y=weeks,color=media)) +
  geom_step() +
  theme_hsci_discrete(base_family="Arial") +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  facet_grid(ref_category~type,scales="free")
Joining, by = "a_id"
Joining, by = "a_id"

# Share of each media outlet's yearly articles (per type) that fall into each
# reference category, as a grid of ref_category (rows) by type (columns).
articles %>%
  # explicit keys: both lookups are matched on the article id only
  inner_join(articles_to_ref_categories,by="a_id") %>%
  inner_join(article_types,by="a_id") %>%
  mutate(
    year_created=year(date_created),
    week_created=week(date_created)
  ) %>%
  group_by(media,ref_category,type,year_created) %>%
  summarize(
    articles=n_distinct(a_id),
    days=n_distinct(date_created),
    .groups="drop"
  ) %>%
  # denominator: total articles for the same media, type and year
  inner_join(articles_by_type_by_year,by=c("media","type","year_created")) %>%
  ggplot(aes(x=year_created,y=articles/total_articles,color=media)) +
  geom_step() +
  theme_hsci_discrete(base_family="Arial") +
  scale_y_continuous(labels=scales::percent_format(accuracy=0.1)) +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  facet_grid(ref_category~type,scales="free")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("media", "type", "year_created")

Auxiliary background analyses
# Monthly article counts per media outlet, drawn as a step chart.
articles %>%
  mutate(
    year_created=year(date_created),
    month_created=month(date_created)
  ) %>%
  count(media,year_created,month_created) %>%
  ggplot(
    aes(
      # rebuild a date on the first of each month for the x axis
      x=as.Date(str_c(year_created,'-',month_created,'-01')),
      y=n,
      color=media
    )
  ) +
  geom_step() +
  theme_hsci_discrete()

# Yearly article counts per article type, one facet per media outlet.
articles %>%
  # explicit key: article types are matched on the article id only
  inner_join(article_types,by="a_id") %>%
  mutate(year_created=year(date_created)) %>%
  count(media,year_created,type) %>%
  ggplot(aes(x=year_created,y=n,color=type)) +
  geom_step() +
  facet_wrap(~media,scales="free") +
  theme_hsci_discrete()
Joining, by = "a_id"

# Yearly IL article counts per article type, split by whether the author
# field is exactly "STT". Types beyond the 11 largest (weighted by count)
# are lumped together.
articles %>%
  # explicit key: article types are matched on the article id only
  inner_join(article_types,by="a_id") %>%
  mutate(year_created=year(date_created)) %>%
  mutate(from_stt=author=="STT") %>%
  filter(media=="IL") %>%
  count(media,year_created,type,from_stt) %>%
  collect() %>%
  group_by(media) %>%
  mutate(type=fct_lump_n(type,11,w=n)) %>%
  # re-aggregate so the rows of the lumped types are summed together
  count(media,year_created,type,from_stt,wt=n) %>%
  # NOTE(review): `from_stt==1` presumably normalises a 0/1 or logical
  # flag returned by the database -- confirm against the backend.
  ggplot(aes(x=year_created,y=n,color=from_stt==1)) +
  geom_step() +
  facet_wrap(type~media,scales="free") +
  theme_hsci_discrete()
Joining, by = "a_id"

