General Setup

Analysis table creation. Don’t run this section if the tables have already been created; instead, skip to “Analysis table loads”.

# Rebuild the persistent `articles_cdp` table: the ids of articles that fall in
# the domestic / politics / economy sections of each outlet.
dbExecute(con, "DROP TABLE IF EXISTS articles_cdp")
articles_cdp <- articles %>%
  filter(
    case_when(
      media == "HS" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      media == "IL" ~ subsection %in% c("kotimaa", "politiikka", "talous", "uutiset"),
      media == "STT" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      # YLE rows are matched on the subject field; rows tagged "Ulkomaat" are
      # kept only when they are also tagged "Kotimaan uutiset".
      media == "YLE" ~
        section == "Yle Uutiset" &
          str_detect(subject, "Kotimaan uutiset|politiikka|talous") &
          (!str_detect(subject, "Ulkomaat") | str_detect(subject, "Kotimaan uutiset")),
      TRUE ~ FALSE
    )
  ) %>%
  distinct(a_id) %>%
  compute(
    name = "articles_cdp",
    temporary = FALSE,
    unique_indexes = c("a_id")
  )
# Rebuild the persistent `articles_opinionated` table: one row per article that
# is recognised as an opinion-style text, with the kind of opinion piece.
# Articles that match no rule get NA and are dropped before storing.
dbExecute(con, "DROP TABLE IF EXISTS articles_opinionated")
articles_opinionated <- articles %>%
  # Lower-cased helper columns so each rule below reads as a plain comparison;
  # they are discarded by the final distinct().
  mutate(
    lc_section = str_to_lower(section),
    lc_logo = str_to_lower(story_logo),
    lc_title = str_to_lower(title),
    lc_subject = str_to_lower(subject)
  ) %>%
  mutate(
    opinionated = case_when(
      media == "HS" ~ case_when(
        lc_section == "pääkirjoitus" & str_detect(lc_logo, "ieras") ~ "external editorial",
        lc_section == "pääkirjoitus" & is.na(story_logo) ~ "editorial",
        lc_section == "pääkirjoitus" & lc_logo == "pääkirjoitus" ~ "editorial",
        lc_section == "mielipide" | lc_logo == "mielipide" ~ "external opinion",
        str_detect(lc_title, "analyysi:") | str_detect(lc_logo, "analyysi") ~ "analysis",
        str_detect(lc_title, "näkökulma:") | str_detect(lc_logo, "näkökulma") ~ "perspective",
        str_detect(lc_title, "kolumni:") | str_detect(lc_logo, "kolumni") ~ "column",
        str_detect(lc_title, "blogi:") | str_detect(lc_logo, "blog") ~ "blog"
      ),
      media == "IL" ~ case_when(
        subsection == "paakirjoitus" ~ "editorial",
        str_detect(lc_title, "kommentti:") ~ "commentary",
        str_detect(lc_title, "analyysi:") ~ "analysis",
        str_detect(lc_title, "kolumni:") ~ "column",
        str_detect(lc_title, "näkökulma:") ~ "perspective"
      ),
      media == "YLE" ~ case_when(
        str_detect(lc_title, "kommentti:") ~ "commentary",
        # The analysis subject tag is matched case-sensitively on the raw subject.
        str_detect(lc_title, "analyysi:") |
          str_detect(subject, "Analyysit \\(Yle Uutiset\\)") ~ "analysis",
        str_detect(lc_title, "kolumni:") | str_detect(lc_subject, "kolumn") ~ "column",
        str_detect(lc_title, "näkökulma:") | str_detect(lc_subject, "näkökulm") ~ "perspective",
        str_detect(lc_title, "blogi:") | str_detect(lc_subject, "blog") ~ "blog"
      )
    )
  ) %>%
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  compute(
    name = "articles_opinionated",
    temporary = FALSE,
    unique_indexes = c("a_id")
  )
# Hand-made lemma labels; rows labelled 'adjektiivi' or 'ei' are excluded.
labels <- here("data/person_labels.tsv") %>%
  read_tsv() %>%
  filter(!category %in% c("adjektiivi", "ei"))

# Lemmas to track: the labelled lemmas (uploaded to the database as `labels`)
# plus every lemma in `words` matching the two equality stems, each tagged
# with its own category.
lemmas_of_interest <- labels %>%
  rename(lemma = name) %>%
  copy_to(con, ., name = "labels", overwrite = TRUE) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "yhden#ve")) %>%
      distinct(lemma) %>%
      mutate(category = "yhdenvertaisuus")
  ) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "tasa#arv")) %>%
      distinct(lemma) %>%
      mutate(category = "tasa-arvo")
  ) %>%
  distinct()
# Persistent table of word ids whose lemma is one of the lemmas of interest.
dbExecute(con, "DROP TABLE IF EXISTS words_of_interest")
words_of_interest <- words %>%
  inner_join(lemmas_of_interest) %>%
  distinct(w_id, lemma, category, genus) %>%
  compute(
    name = "words_of_interest",
    temporary = FALSE,
    indexes = c("w_id", "category", "genus")
  )

# Persistent table of corpus rows restricted to the words of interest.
dbExecute(con, "DROP TABLE IF EXISTS corpus_of_interest")
corpus_of_interest <- corpus %>%
  inner_join(words_of_interest) %>%
  compute(
    name = "corpus_of_interest",
    temporary = FALSE,
    indexes = list(
      c("a_id", "par_id", "s_id", "pos"),
      c("w_id"),
      c("genus"),
      c("category")
    )
  )
# Persistent table assigning every article exactly one `type`.
# The case_when() branches are order-sensitive: the first matching rule wins.
dbExecute(con, "DROP TABLE IF EXISTS article_types")
article_types <- articles %>%
  left_join(articles_cdp %>% mutate(cdp = TRUE)) %>%
  left_join(articles_opinionated) %>%
  mutate(
    type = case_when(
      # STT rows whose version is not "Loppuversio" are set aside first.
      media == "STT" & version != "Loppuversio" ~ "Other",

      # Opinion pieces: labels starting with "external " are external opinion,
      # every other label is journalistic opinion.
      !is.na(opinionated) & !str_detect(opinionated, "^external ") ~ "Journalistic opinion",
      !is.na(opinionated) ~ "External opinion",

      # `cdp` is TRUE for articles in articles_cdp and NA otherwise.
      cdp ~ "Domestic general/political/economic news",

      media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
      media == "IL" & section == "viihde" ~ "Culture/entertainment",
      media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
      media == "YLE" & section == "Yle Uutiset" &
        str_detect(subject, "kulttuuri|musiikki|viihde") &
        !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",

      media == "HS" & section == "Kaupunki" ~ "Local news",
      media == "YLE" & section == "Yle Uutiset" & coverage == "local" ~ "Local news",

      media == "STT" & section == "Urheilu" ~ "Sports",
      media == "HS" & section == "Urheilu" ~ "Sports",
      # NOTE(review): "YLE Urheilu" is capitalised differently from
      # "Yle Uutiset" above; confirm against the section values in the data.
      media == "YLE" & section == "YLE Urheilu" ~ "Sports",
      media == "IL" & section == "urheilu" ~ "Sports",

      media == "STT" & section == "Ulkomaat" ~ "Foreign news",
      media == "HS" & section == "Ulkomaat" ~ "Foreign news",
      media == "YLE" & section == "Yle Uutiset" &
        str_detect(subject, "Ulkomaat") &
        !str_detect(subject, "Kotimaan uutiset") ~ "Foreign news",
      media == "IL" & subsection == "ulkomaat" ~ "Foreign news",

      TRUE ~ "Other"
    )
  ) %>%
  distinct(a_id, type) %>%
  compute(
    name = "article_types",
    temporary = FALSE,
    unique_indexes = list(c("a_id"), c("a_id", "type"), c("type", "a_id"))
  )
# Persistent table with the number of articles per outlet, creation year and
# article type; used later as the denominator for shares.
dbExecute(con, "DROP TABLE IF EXISTS articles_by_type_by_year")
articles_by_type_by_year <- articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type, name = "total_articles") %>%
  compute(
    name = "articles_by_type_by_year",
    temporary = FALSE,
    unique_indexes = list(c("media", "year_created", "type"))
  )
# Persistent table mapping articles to the equality term(s) they contain.
dbExecute(con, "DROP TABLE IF EXISTS articles_to_ref_categories")

# Step 1: one row per article, labelled "yhdenvertaisuus", "tasa-arvo" or
# "both". max() over each comparison is 1 when any row of the article matches.
articles_to_ref_categories <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus) %>%
  group_by(a_id) %>%
  summarize(
    yv = max(category == "yhdenvertaisuus"),
    ta = max(category == "tasa-arvo"),
    .groups = "drop"
  ) %>%
  transmute(
    a_id,
    ref_category = case_when(
      yv == 1 & ta == 1 ~ "both",
      yv == 1 ~ "yhdenvertaisuus",
      ta == 1 ~ "tasa-arvo"
    )
  ) %>%
  compute()

# Step 2: articles labelled "both" are additionally listed under each single
# term, so such an article ends up with three rows in the stored table.
articles_to_ref_categories <- articles_to_ref_categories %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "yhdenvertaisuus")
  ) %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "tasa-arvo")
  ) %>%
  compute(
    name = "articles_to_ref_categories",
    temporary = FALSE,
    indexes = c("a_id"),
    unique_indexes = list(c("a_id", "ref_category"), c("ref_category", "a_id"))
  )
# Lazy query: corpus rows whose word belongs to either equality category.
yv_ta_corpus <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus)

# Persistent table of the (article, paragraph) pairs containing such a word.
dbExecute(con, "DROP TABLE IF EXISTS yv_ta_paragraphs")
yv_ta_paragraphs <- yv_ta_corpus %>%
  distinct(a_id, par_id) %>%
  compute(
    name = "yv_ta_paragraphs",
    temporary = FALSE,
    unique_indexes = list(c("a_id", "par_id"))
  )

Analysis table loads

# Lazy references to the persistent analysis tables created above, so the
# notebook can be resumed without rebuilding them.
articles_cdp               <- tbl(con, "articles_cdp")
articles_opinionated       <- tbl(con, "articles_opinionated")
article_types              <- tbl(con, "article_types")
articles_by_type_by_year   <- tbl(con, "articles_by_type_by_year")
words_of_interest          <- tbl(con, "words_of_interest")
corpus_of_interest         <- tbl(con, "corpus_of_interest")
articles_to_ref_categories <- tbl(con, "articles_to_ref_categories")
yv_ta_paragraphs           <- tbl(con, "yv_ta_paragraphs")

Named query definitions

# Corpus positions lying inside a quote span, i.e. between
# (start_s_id, start_pos) and (end_s_id, end_pos) inclusive.
quotation_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Corpus positions in any sentence that a quote touches, from its first to its
# last sentence inclusive.
quotation_sentence_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND s_id >= start_s_id AND s_id <= end_s_id"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Per-term subsets of the corpus of interest and the paragraphs they occur in.
yv_corpus <- corpus_of_interest %>%
  filter(category == "yhdenvertaisuus")

yv_paragraphs <- yv_corpus %>%
  distinct(a_id, par_id)

ta_corpus <- corpus_of_interest %>%
  filter(category == "tasa-arvo")

ta_paragraphs <- ta_corpus %>%
  distinct(a_id, par_id)

# Quote-to-organisation mapping read from the TSV file.
# `q_id` is kept on purpose: the later natural joins against quote-level data
# then match on (a_id, q_id), so an organisation category is attached only to
# the tokens of the quote it belongs to. Without `q_id` the join falls back to
# `a_id` alone, which attaches every organisation found in an article to every
# token of that article and multiplies rows.
quote_orgs <- read_tsv(here("data/q_id_to_orgs.tsv"), show_col_types = FALSE) %>%
  select(q_id, author_head, a_id, org_cat)
New names:
• `` -> `...1`
Rows: 19362 Columns: 5
── Column specification ────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────────
Delimiter: "\t"
chr (2): org_cat, author_head
dbl (3): ...1, q_id, a_id

ℹ Use `spec()` to retrieve the full column specification for this data.
ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Master lazy query: one row per occurrence of a word of interest, enriched
# with article metadata, article type, reference category, quote membership
# and quote organisation.
#
# Notes on row multiplicity: the joins against articles_to_ref_categories and
# quote_orgs can match several rows per token, so downstream code should count
# distinct ids rather than rows.
d <- corpus_of_interest %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  left_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote = TRUE)) %>%
  # Sentence-level flag: joined on the token position only. Joining on `q_id`
  # as well (which the natural join would do) restricts matches to tokens that
  # are already inside a quote, making this flag identical to `in_quote`.
  # distinct() keeps one row per token when several quotes share a sentence.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  mutate(
    # Both flags are NULL (not FALSE) for unmatched tokens; without coalesce()
    # the expression is NULL whenever `in_quote` is NULL and can never be TRUE.
    in_quote_head = coalesce(in_quote_sentence, FALSE) & !coalesce(in_quote, FALSE),
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
Joining, by = c("a_id", "par_id", "s_id", "pos", "q_id")
Joining, by = "a_id"
  
# Same enrichment as `d`, restricted to paragraphs containing an equality term
# and to articles that have a reference category.
#
# Notes on row multiplicity: the joins against articles_to_ref_categories and
# quote_orgs can match several rows per token, so downstream code should count
# distinct ids rather than rows.
d2 <- corpus_of_interest %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_corpus %>% mutate(in_quote = TRUE)) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # Sentence-level flag: joined on the token position only. Joining on `q_id`
  # as well (which the natural join would do) restricts matches to tokens that
  # are already inside a quote, making this flag identical to `in_quote`.
  # distinct() keeps one row per token when several quotes share a sentence.
  left_join(
    quotation_sentence_corpus %>%
      distinct(a_id, par_id, s_id, pos) %>%
      mutate(in_quote_sentence = TRUE),
    by = c("a_id", "par_id", "s_id", "pos")
  ) %>%
  mutate(
    # Both flags are NULL (not FALSE) for unmatched tokens; without coalesce()
    # the expression is NULL whenever `in_quote` is NULL and can never be TRUE.
    in_quote_head = coalesce(in_quote_sentence, FALSE) & !coalesce(in_quote, FALSE),
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos", "q_id")
# Subject categories singled out in the speaker analysis.
key_cats <- c(
  "potilas",
  "maahanmuutto",
  "etnos",
  "seksuaalisuus",
  "työsuhde"
)
# NOTE(review): these values do not match any `type` label produced for
# article_types; confirm where `main_types` is meant to be used.
main_types <- c(
  "core",
  "opinionated",
  "external opinion"
)

Analysis 1: development of yhdenvertaisuus/tasa-arvo in different text genres

Master chart

# Per outlet, term, text type and year: number of distinct articles (and
# distinct creation dates) containing the term, joined with the total number
# of articles of that outlet/type/year.
my_d <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(
    articles_by_type_by_year,
    by = c("media", "type", "year_created")
  ) %>%
  collect()

# Same counts, restricted to groups in which both terms occur: the grouped
# filter keeps an (article, outlet, type, text type, year) group only if it
# holds at least one row of each category.
my_d2 <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(a_id, media, type, type2, year_created) %>%
  filter(
    any(category == "yhdenvertaisuus"),
    any(category == "tasa-arvo")
  ) %>%
  group_by(media, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(
    articles_by_type_by_year,
    by = c("media", "type", "year_created")
  ) %>%
  collect()
Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning
This warning is displayed once every 8 hours.
# Stack the single-term counts ("Word") with the both-terms counts ("Both").
# The "Both" rows are added once under each term so they can be drawn in both
# facets, then the plotting factors are put in a fixed order.
my_d <- my_d %>%
  mutate(word = "Word") %>%
  union_all(my_d2 %>% mutate(category = "tasa-arvo", word = "Both")) %>%
  union_all(my_d2 %>% mutate(category = "yhdenvertaisuus", word = "Both")) %>%
  mutate(
    word = fct_relevel(word, "Word"),
    category = fct_relevel(category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
# Share of "Other"-type articles containing each term, per outlet and year.
my_d %>%
  filter(type == "Other") %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )
  ) +
  geom_step() +
  geom_vline(xintercept = 2009, color = "red") +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  facet_grid(category ~ type2, scales = "free") +
  labs(color = "Media", linetype = "Signal") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word")
# Master chart for all types except "Other", drawn as three pages of
# 2 x 4 facets (term by text type).
map(1:3, function(page) {
  my_d %>%
    filter(type != "Other") %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]

[[2]]

[[3]]

Conclusions:

  • IL lags behind the other media in the change, but follows the same trajectory
  • For foreign news, STT (and therefore IL) do not increase tasa-arvo terminology usage
  • For local news, terminology usage increases only for HS (Helsinki) as opposed to YLE (regional news)
  • For culture/entertainment, other sources differ from IL. This is probably due to category heterogeneity: for IL, this category contains entertainment news, for others, these are more culture reviews etc.

Does change in media composition affect results?

# Compare the pooled share over all outlets with the share computed from HS
# and STT only, to see whether the outlet mix drives the trend.
map(1:3, function(page) {
  my_d %>%
    filter(type != "Other") %>%
    mutate(
      # Zero out the other outlets so the sums below cover HS and STT only.
      l_total_articles = if_else(media %in% c("HS", "STT"), total_articles, 0L),
      l_articles = if_else(media %in% c("HS", "STT"), articles, 0L)
    ) %>%
    group_by(year_created, category, type2, word) %>%
    summarize(
      `HS/STT` = sum(l_articles) / sum(l_total_articles),
      `All medias` = sum(articles) / sum(total_articles),
      .groups = "drop"
    ) %>%
    pivot_longer(`HS/STT`:`All medias`) %>%
    ggplot(
      aes(
        x = year_created,
        y = value,
        color = name,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page
    ) +
    labs(color = "Media") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]

[[2]]

[[3]]

Conclusions:

  • If we mention the IL/STT behavior from above in the text, we can then drop the media from the graphs, as they do not otherwise affect the results for the main categories of interest.

Final graph to include in article

# Final figure: yearly share of articles containing each term, pooled over
# outlets, for domestic news and the two opinion types.
d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  filter(
    type %in% c(
      "Domestic general/political/economic news",
      "Journalistic opinion",
      "External opinion"
    )
  ) %>%
  # Denominator: total articles per type and year, summed over outlets.
  left_join(
    articles_by_type_by_year %>%
      group_by(type, year_created) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop"),
    by = c("type", "year_created")
  ) %>%
  group_by(category, type2, year_created) %>%
  summarize(
    total_articles = min(total_articles),
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(type2 = fct_relevel(type2, "External opinion", "Journalistic opinion")) %>%
  mutate(category = fct_relevel(category, "yhdenvertaisuus")) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = type2)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 5)) +
  facet_wrap(~category, scales = "free_y") +
  labs(color = "Text type") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word") +
  # Legend is drawn inside the panel, anchored at the top-left corner.
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.02, 0.98),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  )

Analysis 2: distribution of language by speaker and subject category


# Yearly number of in-quote occurrences of the two equality terms, for quotes
# attributed to the "politiikka" and "oikeus" organisation categories.
#
# `d` can hold several rows per token (its joins against
# articles_to_ref_categories and quote_orgs may match more than once), so
# counting rows with n() overstates the frequencies. Each token position is
# therefore reduced to one row per organisation category before counting.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% c("yhdenvertaisuus", "tasa-arvo"),
    in_quote == TRUE
  ) %>%
  distinct(org_cat, year_created, category, a_id, par_id, s_id, pos) %>%
  count(org_cat, year_created, category) %>%
  collect() %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")


# Same chart for the key subject categories listed in `key_cats`.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% key_cats,
    in_quote == TRUE
  ) %>%
  distinct(org_cat, year_created, category, a_id, par_id, s_id, pos) %>%
  count(org_cat, year_created, category) %>%
  collect() %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

Analysis 3: subject associations

# Occurrences of the lemma "suku#puoli" inside paragraphs that contain an
# equality term, summarised per outlet, reference category, text type and
# year into distinct article / week / day counts.
sp <- yv_ta_paragraphs %>%
  inner_join(
    words %>%
      filter(lemma == "suku#puoli") %>%
      inner_join(corpus)
  ) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  ) %>%
  group_by(media, ref_category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
Joining, by = "w_id"
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
  
# Lazy query: words of interest (except the lemma "suomalainen") occurring in
# paragraphs that contain an equality term, with article metadata, article
# type, reference category and a sentence-level quote flag.
my_cd <- corpus_of_interest %>%
  filter(lemma != "suomalainen") %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(articles %>% select(a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  left_join(quotation_sentence_corpus %>% mutate(in_quote_sentence = TRUE)) %>%
  mutate(
    type2 = case_when(
      str_detect(type, "opinion$") ~ type,
      in_quote_sentence ~ str_c("Quotes in ", type),
      TRUE ~ str_c("Journalistic text in ", type)
    )
  )
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("a_id", "par_id", "s_id", "pos")
# Co-occurrence counts by `genus`: distinct articles / weeks / days per
# reference category, genus, text type and year.
cd2 <- my_cd %>%
  group_by(ref_category, genus, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

# Co-occurrence counts by subject `category`, with the same measures.
cd <- my_cd %>%
  group_by(ref_category, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

Gender

# Gender ("suku#puoli") counts without the "Other" type, joined with the
# yearly article totals per type summed over outlets.
my_d <- sp %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()
Joining, by = c("type", "year_created")
# Gender chart as a paginated facet grid (three pages of 2 x 4 panels).
# Rows labelled "both" are shown under each term as the "Both" signal.
map(1:3, function(page) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      fct ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

# Same gender chart, laid out with a paginated facet wrap instead of a grid.
map(1:3, function(page) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = media,
        linetype = word
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      fct ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

Conclusions:

  • Decline in “sukupuolten tasa-arvo” in STT for domestic news is interesting. What could cause this?
  • Otherwise, all sources seem to be following similar patterns.
# Gender co-occurrence in domestic news only: article counts summed over
# outlets, divided by the yearly total of domestic-news articles.
sp %>%
  filter(type == "Domestic general/political/economic news") %>%
  group_by(ref_category, type, type2, year_created) %>%
  summarize(articles = sum(articles), .groups = "drop") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  ggplot(
    aes(
      x = year_created,
      y = articles / total_articles,
      color = ref_category,
      linetype = type2
    )
  ) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2))
Joining, by = c("type", "year_created")

Conclusions:

  • Gender equality discussion also gets a boost after 2014
  • When “yhdenvertaisuus” is used in relation to gender, “tasa-arvo” is almost always also mentioned!
# Genus-level counts without the "Other" type, joined with the yearly article
# totals per type summed over outlets.
my_d <- cd2 %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()
Joining, by = c("type", "year_created")
# Genus chart as a paginated facet grid (genus by text type), three pages.
map(1:3, function(page) {
  my_d %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = ref_category
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      genus ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page
    ) +
    labs(color = "ref-category", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

# Same genus chart, laid out with a paginated facet wrap instead of a grid.
map(1:3, function(page) {
  my_d %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = ref_category
      )
    ) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      genus ~ type2,
      scales = "free",
      ncol = 4,
      nrow = 2,
      page = page
    ) +
    labs(color = "ref-category", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

Conclusions:

  • Same behavior seen for “sukupuolten tasa-arvo/yhdenvertaisuus” is also evident when looking at explicitly gendered words (mies/nainen)

Subject topic

# Share of articles in which each key subject category co-occurs with an
# equality term, as a paginated facet grid (reference category by text type).
map(1:3, function(page) {
  cd %>%
    filter(type != "Other") %>%
    filter(
      category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
    ) %>%
    inner_join(
      articles_by_type_by_year %>%
        group_by(year_created, type) %>%
        summarize(total_articles = sum(total_articles), .groups = "drop") %>%
        collect()
    ) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = category
      )
    ) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      ref_category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page
    )
})
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

# Same subject-category chart, laid out with a paginated facet wrap.
map(1:3, function(page) {
  cd %>%
    filter(type != "Other") %>%
    filter(
      category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
    ) %>%
    inner_join(
      articles_by_type_by_year %>%
        group_by(year_created, type) %>%
        summarize(total_articles = sum(total_articles), .groups = "drop") %>%
        collect()
    ) %>%
    ggplot(
      aes(
        x = year_created,
        y = articles / total_articles,
        color = category
      )
    ) +
    geom_step() +
    theme_hsci_discrete(base_family = "Arial") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      ref_category ~ type2,
      scales = "free",
      nrow = 2,
      ncol = 4,
      page = page
    )
})
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
Joining, by = c("type", "year_created")
[[1]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

Conclusions:

  • There don’t seem to be major discernible patterns between different subjects for equality

Supporting auxiliary analyses

Subject topic graphs using different measures

# One plot per reference category: number of distinct days per year on which
# each key subject category co-occurs with the term, faceted by text type.
cd %>%
  filter(
    category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")
  ) %>%
  group_by(ref_category) %>%
  group_map(function(.x, .y) {
    .x %>%
      ggplot(aes(x = year_created, y = days, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      # The group key supplies the plot title.
      ggtitle(.y[1]$ref_category)
  })
[[1]]

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]

# One plot per reference category: number of distinct weeks per year in which
# each subject category occurs, faceted by text type.
cd %>%
  filter(category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")) %>%
  group_by(ref_category) %>%
  group_map(function(panel_rows, group_key) {
    ggplot(panel_rows, aes(x = year_created, y = weeks, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      # The group key holds the reference category of this plot.
      ggtitle(group_key$ref_category)
  })
[[1]]

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]

# One plot per reference category: yearly share of articles (of the same type,
# all media combined) in which each subject category occurs.
cd %>%
  filter(category %in% c("etnos", "potilas", "seksuaalisuus", "työsuhde", "maahanmuutto")) %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  group_by(ref_category) %>%
  group_map(function(panel_rows, group_key) {
    ggplot(panel_rows, aes(x = year_created, y = articles / total_articles, color = category)) +
      geom_line() +
      theme_hsci_discrete(base_family = "Arial") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
      facet_wrap(~type2, scales = "free") +
      ggtitle(group_key$ref_category)
  })
Joining, by = c("type", "year_created")
[[1]]

[[2]]
geom_path: Each group consists of only one observation. Do you need to adjust the group aesthetic?

[[3]]

Do the subject topics combined capture the phenomenon?

# Yearly counts (articles, weeks, days) of all words of interest, except the
# lemma "suomalainen", that occur in paragraphs mentioning
# yhdenvertaisuus/tasa-arvo, per reference category and article type.
cd3 <- yv_ta_paragraphs %>%
  inner_join(corpus_of_interest) %>%
  filter(lemma != "suomalainen") %>%
  inner_join(select(articles, a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created),
    # NOTE(review): every row is relabelled "potilas"; presumably a placeholder
    # that pools all subject categories into one series - confirm.
    category = "potilas"
  ) %>%
  group_by(ref_category, category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect()
Joining, by = c("a_id", "par_id")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = "a_id"
# One plot per reference category: number of distinct days per year on which
# the pooled subject series occurs, faceted by article type.
cd3 %>% 
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Move the less interesting article types to the end of the facet order.
  # The names must match the values of `type` exactly, otherwise fct_relevel()
  # only warns about unknown levels and leaves the order unchanged.
  mutate(type=fct_relevel(type,"Foreign news","Sports","Other",after=Inf)) %>%
  group_by(ref_category) %>%
  group_map(~.x %>%
    ggplot(aes(x=year_created,y=days,color=category)) +
    geom_line() +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=2)) +
    facet_wrap(~type,scales="free") +
    ggtitle(.y[1]$ref_category))
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]

[[2]]

[[3]]

# One plot per reference category: number of distinct weeks per year in which
# the pooled subject series occurs, faceted by article type.
cd3 %>% 
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Move the less interesting article types to the end of the facet order.
  # The names must match the values of `type` exactly, otherwise fct_relevel()
  # only warns about unknown levels and leaves the order unchanged.
  mutate(type=fct_relevel(type,"Foreign news","Sports","Other",after=Inf)) %>%
  group_by(ref_category) %>%
  group_map(~.x %>%
    ggplot(aes(x=year_created,y=weeks,color=category)) +
    geom_line() +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=2)) +
    facet_wrap(~type,scales="free") +
    ggtitle(.y[1]$ref_category))
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]

[[2]]

[[3]]

# One plot per reference category: yearly share of articles (of the same type,
# all media combined) in which the pooled subject series occurs.
cd3 %>% 
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Denominator: yearly article totals per type, summed over all media.
  inner_join(articles_by_type_by_year %>% 
               group_by(year_created,type) %>%
               summarize(total_articles=sum(total_articles),.groups="drop") %>% collect()) %>%
  # Move the less interesting article types to the end of the facet order.
  # The names must match the values of `type` exactly, otherwise fct_relevel()
  # only warns about unknown levels and leaves the order unchanged.
  mutate(type=fct_relevel(type,"Foreign news","Sports","Other",after=Inf)) %>%
  group_by(ref_category) %>%
  group_map(~.x %>%
    ggplot(aes(x=year_created,y=articles/total_articles,color=category)) +
    geom_line() +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=2)) +
    facet_wrap(~type,scales="free") +
    ggtitle(.y[1]$ref_category))
Joining, by = c("type", "year_created")
Warning: Problem while computing `type = fct_relevel(type, "foreign", "sports", "other", after = Inf)`.
ℹ Unknown levels in `f`: foreign, sports, other
[[1]]

[[2]]

[[3]]

Conclusions:

  • While we saw no discernible patterns between subjects, we do seem to be capturing the “whole” of the equality discussion by targeting them, so we can conclude that everyone benefits.

Background Analyses

# Background: per medium, number of distinct days per year with articles
# mentioning each reference category, by article type.
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, y = days, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
Joining, by = "a_id"
Joining, by = "a_id"

# Background: per medium, number of distinct weeks per year with articles
# mentioning each reference category, by article type.
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, y = weeks, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
Joining, by = "a_id"
Joining, by = "a_id"

# Background: per medium, yearly share of articles of each type that mention
# each reference category (denominator: per-medium totals by type and year).
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, ref_category, type, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = media)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  facet_grid(ref_category ~ type, scales = "free")
Joining, by = "a_id"
Joining, by = "a_id"
Joining, by = c("media", "type", "year_created")

Auxiliary background analyses

# Number of articles per medium and calendar month.
articles %>%
  mutate(
    year_created = year(date_created),
    month_created = month(date_created)
  ) %>%
  count(media, year_created, month_created) %>%
  ggplot(aes(
    # Rebuild a date (first day of the month) for the x axis.
    x = as.Date(str_c(year_created, '-', month_created, '-01')),
    y = n,
    color = media
  )) +
  geom_step() +
  theme_hsci_discrete()

# Number of articles per year and article type, one panel per medium.
articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type) %>%
  ggplot(aes(x = year_created, y = n, color = type)) +
  geom_step() +
  facet_wrap(~media, scales = "free") +
  theme_hsci_discrete()
Joining, by = "a_id"

# IL only: yearly article counts per article type, split by whether the
# author field equals "STT".
articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  mutate(from_stt = author == "STT") %>%
  filter(media == "IL") %>%
  count(media, year_created, type, from_stt) %>%
  collect() %>%
  group_by(media) %>%
  # Keep the 11 types with the highest article counts and lump the rest.
  mutate(type = fct_lump_n(type, 11, w = n)) %>%
  # Re-aggregate, since lumping can merge several types into one.
  count(media, year_created, type, from_stt, wt = n) %>%
  ggplot(aes(x = year_created, y = n, color = from_stt == 1)) +
  geom_step() +
  facet_wrap(type ~ media, scales = "free") +
  theme_hsci_discrete()
Joining, by = "a_id"

---
title: "FLOPO YV/TA analysis"
output:
  html_notebook:
    code_folding: hide
    toc: yes
---

# General Setup

```{r setup,include=FALSE}
# Notebook setup: resolve paths from the project root, attach packages, open
# the database connection and create lazy handles to the base tables.
knitr::opts_knit$set(root.dir = here::here())
library(here)
library(tidyverse)
library(DBI)
library(glue)
library(lubridate)
library(hms)
library(ggbeeswarm)
library(ggforce)
library(pak)
# gghsci is installed from GitHub rather than CRAN.
pkg_install("hsci-r/gghsci")
library(gghsci)
library(RMariaDB)

# Connection settings can be overridden through the FLOPO_DB_HOST,
# FLOPO_DB_USER and FLOPO_DB_PASSWORD environment variables; the fallbacks
# keep the notebook running unchanged when they are not set.
# NOTE(review): the fallback credentials are committed in plain text - rotate
# them and drop the fallbacks once the environment variables are in use.
con <- DBI::dbConnect(
  RMariaDB::MariaDB(), 
  host = Sys.getenv("FLOPO_DB_HOST", unset = "128.214.253.211"), 
  dbname = "flopo", 
  user = Sys.getenv("FLOPO_DB_USER", unset = "root"), 
  password = Sys.getenv("FLOPO_DB_PASSWORD", unset = "dhh17"),
  bigint = "integer",
  load_data_local_infile = TRUE,
  autocommit = TRUE,
  reconnect = TRUE)
# Tables created during this session use the Aria storage engine.
dbExecute(con, "SET SESSION storage_engine=aria")

# Lazy table handles; nothing is fetched until a query is collected.
a_sim <- tbl(con, "a_sim")
actor_mentions <- tbl(con, "actor_mentions")
actor_org <- tbl(con, "actor_org")
actor_roles <- tbl(con, "actor_roles")
actors <- tbl(con, "actors")
articles <- tbl(con, "articles")
corpus <- tbl(con, "corpus")
misc <- tbl(con, "misc")
q_qa <- tbl(con, "q_qa")
quote_authors <- tbl(con, "quote_authors")
quotes <- tbl(con, "quotes")
words <- tbl(con, "words")
quote_author_names_to_canonical_names <- tbl(con, "quote_author_names_to_canonical_names")
actor_names_to_canonical_names <- tbl(con, "actor_names_to_canonical_names")
actor_roles_to_canonical_roles <- tbl(con, "actor_roles_to_canonical_roles")
actor_orgs_to_canonical_orgs <- tbl(con, "actor_orgs_to_canonical_orgs")
a_can_names_to_can_orgs_roles_by_t_created <- tbl(con, "a_can_names_to_can_orgs_roles_by_t_created")
```

# Analysis table creation. Don't run if already created. Instead skip to "Analysis table loads"

```{r,eval=FALSE}
# (Re)create the table of core domestic/political/economic news articles.
dbExecute(con, "DROP TABLE IF EXISTS articles_cdp")
articles_cdp <- articles %>%
  filter(
    # Each medium marks its domestic, politics and economy sections differently.
    case_when(
      media == "HS" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      media == "IL" ~ subsection %in% c("kotimaa", "politiikka", "talous", "uutiset"),
      media == "STT" ~ section %in% c("Kotimaa", "Politiikka", "Talous"),
      # YLE: a matching subject is required; a foreign-news subject is only
      # accepted when the article is also tagged as domestic news.
      media == "YLE" ~ section == "Yle Uutiset" &
        str_detect(subject, "Kotimaan uutiset|politiikka|talous") &
        (!str_detect(subject, "Ulkomaat") | str_detect(subject, "Kotimaan uutiset")),
      TRUE ~ FALSE
    )
  ) %>%
  distinct(a_id) %>%
  compute(name = "articles_cdp", temporary = FALSE, unique_indexes = c("a_id"))
```

```{r,eval=FALSE}
# (Re)create the table of opinionated articles with their opinion genre.
dbExecute(con,"DROP TABLE IF EXISTS articles_opinionated")
# Within each medium the first matching rule wins, so rule order is the
# precedence. Articles that match no rule (including every medium without a
# branch below) get NA and are dropped by the filter at the end.
articles_opinionated <- articles %>% mutate(opinionated=case_when(
  # HS: genre from section, story_logo and title prefixes.
  media == "HS" ~ case_when(
    str_to_lower(section) == "pääkirjoitus" & str_detect(str_to_lower(story_logo),"ieras") ~ "external editorial",
    str_to_lower(section) == "pääkirjoitus" & is.na(story_logo) ~ "editorial",
    str_to_lower(section) == "pääkirjoitus" & str_to_lower(story_logo) == "pääkirjoitus" ~ "editorial",
    str_to_lower(section) == "mielipide" | str_to_lower(story_logo) == "mielipide" ~ "external opinion",
    str_detect(str_to_lower(title),"analyysi:") | str_detect(str_to_lower(story_logo),"analyysi")  ~ "analysis",
    str_detect(str_to_lower(title),"näkökulma:") | str_detect(str_to_lower(story_logo),"näkökulma") ~ "perspective",
    str_detect(str_to_lower(title),"kolumni:") | str_detect(str_to_lower(story_logo),"kolumni") ~ "column",
    str_detect(str_to_lower(title),"blogi:") | str_detect(str_to_lower(story_logo),"blog") ~ "blog"    
  ),
  # IL: genre from the editorial subsection or title prefixes.
  media == "IL" ~ case_when(
    subsection == "paakirjoitus" ~ "editorial",
    str_detect(str_to_lower(title),"kommentti:") ~ "commentary",
    str_detect(str_to_lower(title),"analyysi:") ~ "analysis",
    str_detect(str_to_lower(title),"kolumni:") ~ "column",
    str_detect(str_to_lower(title),"näkökulma:") ~ "perspective"
  ),
  # YLE: genre from title prefixes or subject tags.
  media == "YLE" ~ case_when(
    str_detect(str_to_lower(title),"kommentti:") ~ "commentary",
    str_detect(str_to_lower(title),"analyysi:") | str_detect(subject,"Analyysit \\(Yle Uutiset\\)") ~ "analysis",
    str_detect(str_to_lower(title),"kolumni:") | str_detect(str_to_lower(subject),"kolumn") ~ "column",
    str_detect(str_to_lower(title),"näkökulma:")  | str_detect(str_to_lower(subject),"näkökulm") ~ "perspective",
    str_detect(str_to_lower(title),"blogi:")  | str_detect(str_to_lower(subject),"blog") ~ "blog"
  )
)) %>%
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  # Persist as a permanent table with a unique index on a_id.
  compute(name="articles_opinionated",temporary=F,unique_indexes=c("a_id"))
```

```{r,eval=FALSE}
# Hand-labelled person lemmas, without the 'adjektiivi' and 'ei' categories.
labels <- read_tsv(here("data/person_labels.tsv")) %>%
  filter(!category %in% c('adjektiivi', 'ei'))

# Lemmas of interest: the labelled lemmas (uploaded to the database as table
# "labels") plus every lemma in `words` matching "yhden#ve" (yhdenvertaisuus)
# or "tasa#arv" (tasa-arvo).
lemmas_of_interest <- labels %>%
  rename(lemma = name) %>%
  copy_to(con, ., name = "labels", overwrite = TRUE) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "yhden#ve")) %>%
      distinct(lemma) %>%
      mutate(category = "yhdenvertaisuus")
  ) %>%
  union_all(
    words %>%
      filter(str_detect(lemma, "tasa#arv")) %>%
      distinct(lemma) %>%
      mutate(category = "tasa-arvo")
  ) %>%
  distinct()
```

```{r,eval=FALSE}
# (Re)create the table of word forms whose lemma is a lemma of interest.
dbExecute(con, "DROP TABLE IF EXISTS words_of_interest")
words_of_interest <- words %>%
  inner_join(lemmas_of_interest) %>%
  distinct(w_id, lemma, category, genus) %>%
  compute(
    name = "words_of_interest",
    temporary = FALSE,
    indexes = c("w_id", "category", "genus")
  )
```

```{r,eval=FALSE}
# (Re)create the table of corpus tokens that are words of interest.
dbExecute(con, "DROP TABLE IF EXISTS corpus_of_interest")
corpus_of_interest <- corpus %>%
  inner_join(words_of_interest) %>%
  # Persist as a permanent table; each element of `indexes` becomes one index.
  # The closing parenthesis of compute() is required for the chunk to parse.
  compute(
    name = "corpus_of_interest",
    temporary = FALSE,
    indexes = list(c("a_id", "par_id", "s_id", "pos"), c("w_id"), c("genus"), c("category"))
  )
```

```{r,eval=FALSE}
# (Re)create the table assigning exactly one type to every article.
dbExecute(con,"DROP TABLE IF EXISTS article_types")
article_types <- articles %>% 
  left_join(articles_cdp %>% mutate(cdp=T)) %>%
  left_join(articles_opinionated) %>%
  # The first matching rule wins, so rule order is the precedence:
  # non-final STT versions, then opinion pieces, then core domestic news,
  # then the section-based types, and finally "Other" as the fallback.
  mutate(type=case_when(
    media == "STT" & version != "Loppuversio" ~ "Other",
    # Opinion genres not prefixed "external " count as journalistic opinion.
    !is.na(opinionated) & !str_detect(opinionated,"^external ") ~ "Journalistic opinion",
    !is.na(opinionated) ~ "External opinion",
    cdp ~ "Domestic general/political/economic news",
    media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "IL" & section == "viihde" ~ "Culture/entertainment",
    media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "YLE" & section == "Yle Uutiset" & str_detect(subject,"kulttuuri|musiikki|viihde") & !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",
    media == "HS" & section == "Kaupunki" ~ "Local news",
    media == "YLE" & section == "Yle Uutiset" & coverage=="local" ~ "Local news",
    media == "STT" & section == "Urheilu" ~ "Sports",
    media == "HS" & section == "Urheilu" ~ "Sports",
    media == "YLE" & section == "YLE Urheilu" ~ "Sports",
    media == "IL" & section == "urheilu" ~ "Sports",
    media == "STT" & section == "Ulkomaat" ~ "Foreign news",
    media == "HS" & section == "Ulkomaat" ~ "Foreign news",
    media == "YLE" & section == "Yle Uutiset" & str_detect(subject,"Ulkomaat") & !str_detect(subject,"Kotimaan uutiset") ~ "Foreign news",
    media == "IL" & subsection == "ulkomaat" ~ "Foreign news",
    T ~ "Other"
  )) %>%
  distinct(a_id,type) %>%
  # Persist as a permanent table; the unique index on a_id enforces one type per article.
  compute(temporary=F,name="article_types",unique_indexes=list(c("a_id"),c("a_id","type"),c("type","a_id")))
```

```{r,eval=FALSE}
# (Re)create the table of article totals per medium, year and article type;
# these totals are the denominators of the share plots.
dbExecute(con, "DROP TABLE IF EXISTS articles_by_type_by_year")
articles_by_type_by_year <- articles %>%
  inner_join(article_types) %>%
  mutate(year_created = year(date_created)) %>%
  count(media, year_created, type, name = "total_articles") %>%
  compute(
    name = "articles_by_type_by_year",
    temporary = FALSE,
    unique_indexes = list(c("media", "year_created", "type"))
  )
```

```{r,eval=FALSE}
# (Re)create the mapping from articles to the reference categories they mention.
dbExecute(con, "DROP TABLE IF EXISTS articles_to_ref_categories")

# Step 1 (temporary table): one row per article, labelled "both" when the
# article contains both terms, otherwise with the single term it contains.
articles_to_ref_categories <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus) %>%
  group_by(a_id) %>%
  summarize(
    yv = max(category == "yhdenvertaisuus"),
    ta = max(category == "tasa-arvo"),
    .groups = "drop"
  ) %>%
  mutate(ref_category = case_when(
    yv == 1 & ta == 1 ~ "both",
    yv == 1 ~ "yhdenvertaisuus",
    ta == 1 ~ "tasa-arvo"
  )) %>%
  select(a_id, ref_category) %>%
  compute()

# Step 2 (permanent table): articles labelled "both" additionally get one row
# for each of the two single categories.
articles_to_ref_categories <- articles_to_ref_categories %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "yhdenvertaisuus")
  ) %>%
  union_all(
    articles_to_ref_categories %>%
      filter(ref_category == "both") %>%
      mutate(ref_category = "tasa-arvo")
  ) %>%
  compute(
    name = "articles_to_ref_categories",
    temporary = FALSE,
    indexes = c("a_id"),
    unique_indexes = list(c("a_id", "ref_category"), c("ref_category", "a_id"))
  )
```

```{r,eval=FALSE}
# Corpus tokens that are a yhdenvertaisuus or tasa-arvo word.
yv_ta_corpus <- words_of_interest %>%
  filter(category %in% c("tasa-arvo", "yhdenvertaisuus")) %>%
  inner_join(corpus)

# (Re)create the table of paragraphs containing at least one such token.
dbExecute(con, "DROP TABLE IF EXISTS yv_ta_paragraphs")
yv_ta_paragraphs <- yv_ta_corpus %>%
  distinct(a_id, par_id) %>%
  compute(
    name = "yv_ta_paragraphs",
    temporary = FALSE,
    unique_indexes = list(c("a_id", "par_id"))
  )
```

# Analysis table loads

```{r}
# Lazy handles to the analysis tables persisted by the creation chunks.
articles_cdp               <- tbl(con, "articles_cdp")
articles_opinionated       <- tbl(con, "articles_opinionated")
article_types              <- tbl(con, "article_types")
articles_by_type_by_year   <- tbl(con, "articles_by_type_by_year")
words_of_interest          <- tbl(con, "words_of_interest")
corpus_of_interest         <- tbl(con, "corpus_of_interest")
articles_to_ref_categories <- tbl(con, "articles_to_ref_categories")
yv_ta_paragraphs           <- tbl(con, "yv_ta_paragraphs")
```

# Named query definitions

```{r}
# Tokens lying inside a quote: from (start_s_id, start_pos) to
# (end_s_id, end_pos), both ends inclusive.
quotation_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Tokens in any sentence touched by a quote (whole sentences, not only the
# quoted span).
quotation_sentence_corpus <- quotes %>%
  inner_join(
    corpus,
    sql_on = "LHS.a_id=RHS.a_id AND s_id >= start_s_id AND s_id <= end_s_id"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, q_id)

# Tokens and paragraphs per reference category.
yv_corpus <- filter(corpus_of_interest, category == "yhdenvertaisuus")
yv_paragraphs <- distinct(yv_corpus, a_id, par_id)

ta_corpus <- filter(corpus_of_interest, category == "tasa-arvo")
ta_paragraphs <- distinct(ta_corpus, a_id, par_id)

# Organisation category of quote authors, read from a local TSV file.
quote_orgs <- read_tsv(here("data/q_id_to_orgs.tsv")) %>%
  select(author_head, a_id, org_cat)

# d: every token of interest with article metadata, article type, reference
# category (where available), quotation flags and quote-author organisation.
d <- corpus_of_interest %>%
  inner_join(select(articles, a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  left_join(articles_to_ref_categories) %>%
  left_join(mutate(quotation_corpus, in_quote = TRUE)) %>%
  left_join(mutate(quotation_sentence_corpus, in_quote_sentence = TRUE)) %>%
  # quote_orgs is a local tibble, so it is copied to the database for the join.
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  # In a sentence containing a quote, but outside the quoted span itself.
  mutate(in_quote_head = in_quote_sentence & !in_quote) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  # Text genre: opinion types are kept as they are; the other types are split
  # into quote sentences and journalistic text.
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  ))

# d2: as d, but restricted to paragraphs that mention yhdenvertaisuus/tasa-arvo
# and to articles that have a reference category.
d2 <- corpus_of_interest %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(select(articles, a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(mutate(quotation_corpus, in_quote = TRUE)) %>%
  left_join(quote_orgs, copy = TRUE, auto_index = TRUE) %>%
  left_join(mutate(quotation_sentence_corpus, in_quote_sentence = TRUE)) %>%
  mutate(in_quote_head = in_quote_sentence & !in_quote) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  ))
```

```{r}
# Subject categories examined in the analyses.
key_cats <- c("potilas", "maahanmuutto", "etnos", "seksuaalisuus", "työsuhde")
# NOTE(review): these values do not match the `type` labels stored in
# article_types - confirm where main_types is used.
main_types <- c("core", "opinionated", "external opinion")
```

# Analysis 1: development of yhdenvertaisuus/tasa-arvo in different text genres

## Master chart

```{r}
# Per medium, term, text genre and year: number of articles and days on which
# the term occurs, joined with the per-medium article totals.
my_d <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(media, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year, by = c("media", "type", "year_created")) %>%
  collect()

# The same counts, restricted to groups (article x genre) containing both terms.
my_d2 <- d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  group_by(a_id, media, type, type2, year_created) %>%
  filter(any(category == "yhdenvertaisuus"), any(category == "tasa-arvo")) %>%
  group_by(media, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  inner_join(articles_by_type_by_year, by = c("media", "type", "year_created")) %>%
  collect()

# Stack the single-term series ("Word") with the both-terms series ("Both");
# the latter is repeated under each term so it shows in both facet rows.
my_d <- my_d %>%
  mutate(word = "Word") %>%
  union_all(mutate(my_d2, category = "tasa-arvo", word = "Both")) %>%
  union_all(mutate(my_d2, category = "yhdenvertaisuus", word = "Both")) %>%
  mutate(
    word = fct_relevel(word, "Word"),
    category = fct_relevel(category, "tasa-arvo", "yhdenvertaisuus"),
    # Fixed facet order for the text genres.
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
```

```{r,fig.width=28,fig.height=8,eval=FALSE}
# Master chart for the "Other" article type only: share of articles containing
# each term, per medium.
my_d %>%
  filter(type == "Other") %>%
  ggplot(aes(
    x = year_created,
    y = articles / total_articles,
    color = media,
    linetype = word
  )) +
  geom_step() +
  # Red reference line at the year 2009.
  geom_vline(xintercept = 2009, color = "red") +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  facet_grid(category ~ type2, scales = "free") +
  labs(color = "Media", linetype = "Signal") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word")
```

```{r,fig.width=8,fig.height=6}
# Master chart over three pages of 2x4 panels: share of articles containing
# each term, per medium and text genre ("Other" type excluded).
map(1:3, function(page_no) {
  my_d %>%
    filter(type != "Other") %>%
    ggplot(aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )) +
    geom_step() +
    # Red reference line at the year 2009.
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      category ~ type2,
      scales = "free", nrow = 2, ncol = 4, page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
```

Conclusions:

- IL lags behind the other media in adopting the change, but follows the same trajectory
- For foreign news, STT (and therefore IL) does not increase its usage of tasa-arvo terminology
- For local news, terminology usage increases only for HS (Helsinki), as opposed to YLE (regional news)
- For culture/entertainment, the other sources differ from IL. This is probably due to category heterogeneity: for IL, this category contains entertainment news, whereas for the others it consists more of culture reviews etc.

## Does change in media composition affect results?

```{r,fig.width=8,fig.height=6}
# Compare the pooled share over all media with the share over HS and STT only,
# to see whether the changing media mix drives the trends.
map(1:3, function(page_no) {
  my_d %>%
    filter(type != "Other") %>%
    mutate(
      # Zero out the counts of media other than HS/STT so that the sums below
      # give the HS/STT-only numerator and denominator.
      l_total_articles = if_else(media %in% c("HS", "STT"), total_articles, 0L),
      l_articles = if_else(media %in% c("HS", "STT"), articles, 0L)
    ) %>%
    group_by(year_created, category, type2, word) %>%
    summarize(
      `HS/STT` = sum(l_articles) / sum(l_total_articles),
      `All medias` = sum(articles) / sum(total_articles),
      .groups = "drop"
    ) %>%
    pivot_longer(`HS/STT`:`All medias`) %>%
    ggplot(aes(x = year_created, y = value, color = name, linetype = word)) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      category ~ type2,
      scales = "free", nrow = 2, ncol = 4, page = page_no
    ) +
    labs(color = "Media") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
```

Conclusions:

 - If we mention the IL/STT behavior noted above in the text, we can drop the per-media breakdown from the graphs, as the media do not otherwise affect the results for the main categories of interest.

## Final graph to include in article

```{r}
# Final figure: yearly share of articles containing each term, all media
# pooled, for the three main article types.
d %>%
  filter(category %in% c("yhdenvertaisuus", "tasa-arvo")) %>%
  filter(type %in% c(
    "Domestic general/political/economic news",
    "Journalistic opinion",
    "External opinion"
  )) %>%
  # Denominator: yearly article totals per type, summed over all media.
  left_join(
    articles_by_type_by_year %>%
      group_by(type, year_created) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop"),
    by = c("type", "year_created")
  ) %>%
  group_by(category, type2, year_created) %>%
  summarize(
    # min() just picks the joined total for the group.
    total_articles = min(total_articles),
    articles = n_distinct(a_id),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(type2 = fct_relevel(type2, "External opinion", "Journalistic opinion")) %>%
  mutate(category = fct_relevel(category, "yhdenvertaisuus")) %>%
  ggplot(aes(x = year_created, y = articles / total_articles, color = type2)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 5)) +
  facet_wrap(~category, scales = "free_y") +
  labs(color = "Text type") +
  xlab("Year") +
  ylab("Percentage of articles of type containing the word") +
  # Place the legend inside the plot area, anchored at the top-left corner.
  theme(
    legend.justification = c(0, 1),
    legend.position = c(0.02, 0.98),
    legend.background = element_blank(),
    legend.box.just = "bottom",
    legend.key = element_blank(),
    legend.box = "horizontal"
  )
```

# Analysis 2: distribution of language by speaker and subject category

```{r}

# Quoted tokens per year for the two reference terms, for quote authors in the
# "politiikka" and "oikeus" organisation categories.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% c("yhdenvertaisuus", "tasa-arvo"),
    in_quote == TRUE
  ) %>%
  group_by(org_cat, year_created, category) %>%
  summarize(n = n(), .groups = "drop") %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")

# The same view for the key subject categories.
d %>%
  filter(
    org_cat %in% c("politiikka", "oikeus"),
    category %in% key_cats,
    in_quote == TRUE
  ) %>%
  group_by(org_cat, year_created, category) %>%
  summarize(n = n(), .groups = "drop") %>%
  ggplot(aes(x = year_created, y = n, color = category)) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  facet_grid(~org_cat, scales = "free")
```

# Analysis 3: subject associations

```{r}
# sp: yearly counts of the lemma "suku#puoli" in paragraphs that mention
# yhdenvertaisuus/tasa-arvo, per medium, reference category and text genre.
sp <- yv_ta_paragraphs %>%
  inner_join(
    words %>%
      filter(lemma == "suku#puoli") %>%
      inner_join(corpus)
  ) %>%
  inner_join(select(articles, a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  left_join(mutate(quotation_sentence_corpus, in_quote_sentence = TRUE)) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  )) %>%
  group_by(media, ref_category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

# my_cd (lazy): words of interest, except the lemma "suomalainen", in
# paragraphs that mention yhdenvertaisuus/tasa-arvo, with article metadata
# and text genre.
my_cd <- corpus_of_interest %>%
  filter(lemma != "suomalainen") %>%
  inner_join(yv_ta_paragraphs) %>%
  inner_join(select(articles, a_id, date_created, media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(
    year_created = year(date_created),
    week_created = week(date_created)
  ) %>%
  left_join(mutate(quotation_sentence_corpus, in_quote_sentence = TRUE)) %>%
  mutate(type2 = case_when(
    str_detect(type, "opinion$") ~ type,
    in_quote_sentence ~ str_c("Quotes in ", type),
    TRUE ~ str_c("Journalistic text in ", type)
  ))

# cd2: yearly counts per genus.
cd2 <- my_cd %>%
  group_by(ref_category, genus, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )

# cd: yearly counts per subject category.
cd <- my_cd %>%
  group_by(ref_category, category, type, type2, year_created) %>%
  summarize(
    articles = n_distinct(a_id),
    weeks = n_distinct(week_created),
    days = n_distinct(date_created),
    .groups = "drop"
  ) %>%
  collect() %>%
  mutate(
    ref_category = fct_relevel(ref_category, "tasa-arvo", "yhdenvertaisuus"),
    type2 = fct_relevel(
      type2,
      "External opinion",
      "Journalistic opinion",
      "Journalistic text in Domestic general/political/economic news",
      "Quotes in Domestic general/political/economic news",
      "Journalistic text in Local news",
      "Quotes in Local news",
      "Journalistic text in Foreign news",
      "Quotes in Foreign news",
      "Journalistic text in Culture/entertainment",
      "Quotes in Culture/entertainment",
      "Journalistic text in Sports",
      "Quotes in Sports"
    )
  )
```

## Gender

```{r,fig.width=8,fig.height=6}
# Gender ("suku#puoli") counts joined with yearly article totals per type
# (summed over all media); the "Other" article type is dropped.
my_d <- sp %>%
  filter(type != "Other") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  collect()

# Grid layout, three pages of 2x4 panels. Single-term rows are drawn as "Word";
# rows of articles containing both terms are repeated under each term as "Both".
map(1:3, function(page_no) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_grid_paginate(
      fct ~ type2,
      scales = "free", ncol = 4, nrow = 2, page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})

# The same plots with wrapped instead of grid facets.
map(1:3, function(page_no) {
  my_d %>%
    filter(ref_category != "both") %>%
    mutate(fct = ref_category, word = "Word") %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "tasa-arvo", word = "Both")
    ) %>%
    union_all(
      my_d %>%
        filter(ref_category == "both") %>%
        mutate(fct = "yhdenvertaisuus", word = "Both")
    ) %>%
    mutate(word = fct_relevel(word, "Word")) %>%
    ggplot(aes(
      x = year_created,
      y = articles / total_articles,
      color = media,
      linetype = word
    )) +
    geom_step() +
    geom_vline(xintercept = 2009, color = "red") +
    theme_hsci_discrete(base_family = "Arial") +
    scale_y_continuous(labels = scales::percent_format(accuracy = 0.1)) +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    facet_wrap_paginate(
      fct ~ type2,
      scales = "free", ncol = 4, nrow = 2, page = page_no
    ) +
    labs(color = "Media", linetype = "Signal") +
    xlab("Year") +
    ylab("Percentage of articles of type containing the word")
})
```

Conclusions:

 - Decline in "sukupuolten tasa-arvo" in STT for domestic news is interesting. What could cause this?
 - Otherwise, all sources seem to be following similar patterns.

```{r}
# Domestic news only: yearly share of articles mentioning gender, per reference
# category and text genre, with all media pooled.
sp %>%
  filter(type == "Domestic general/political/economic news") %>%
  group_by(ref_category, type, type2, year_created) %>%
  # Pool the per-medium article counts.
  summarize(articles = sum(articles), .groups = "drop") %>%
  inner_join(
    articles_by_type_by_year %>%
      group_by(year_created, type) %>%
      summarize(total_articles = sum(total_articles), .groups = "drop") %>%
      collect()
  ) %>%
  ggplot(aes(
    x = year_created,
    y = articles / total_articles,
    color = ref_category,
    linetype = type2
  )) +
  geom_step() +
  theme_hsci_discrete(base_family = "Arial") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2))
```

Conclusions:

 - Gender equality discussion also gets a boost after 2014
 - When "yhdenvertaisuus" is used in relation to gender, "tasa-arvo" is almost always also mentioned!

```{r,fig.width=8,fig.height=6}
# Plot data for the pages below: cd2 without type "Other", with the yearly
# per-type article totals attached as the denominator.
my_d <- inner_join(
  cd2 %>%
    filter(type!="Other"),
  articles_by_type_by_year %>%
    group_by(year_created,type) %>%
    summarize(total_articles=sum(total_articles),.groups="drop") %>%
    collect()
) %>%
  collect()

# Grid-faceted yearly shares (genus rows x type2 columns), pages 1-3 of a
# 2 x 4 layout; one step line per ref_category, red vertical line at 2009.
map(1:3,function(page_no) {
  ggplot(my_d,aes(x=year_created,y=articles/total_articles,color=ref_category)) +
    geom_step() +
    geom_vline(xintercept=2009,color="red") +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    scale_y_continuous(labels=scales::percent_format(accuracy=0.1)) +
    facet_grid_paginate(genus~type2,scales="free",ncol=4,nrow=2,page=page_no) +
    labs(x="Year",
         y="Percentage of articles of type containing the word",
         color="ref-category",
         linetype="Signal")
})

# Same yearly shares as a wrapped facet layout (one panel per genus/type2
# combination), pages 1-3 of a 2 x 4 layout; red vertical line at 2009.
map(1:3,function(page_no) {
  ggplot(my_d,aes(x=year_created,y=articles/total_articles,color=ref_category)) +
    geom_step() +
    geom_vline(xintercept=2009,color="red") +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    scale_y_continuous(labels=scales::percent_format(accuracy=0.1)) +
    facet_wrap_paginate(genus~type2,scales="free",ncol=4,nrow=2,page=page_no) +
    labs(x="Year",
         y="Percentage of articles of type containing the word",
         color="ref-category",
         linetype="Signal")
})
```

Conclusions:

- The same behavior seen for "sukupuolten tasa-arvo/yhdenvertaisuus" is also evident when looking at explicitly gendered words (mies/nainen).

## Subject topic

```{r,fig.width=8,fig.height=6}
# Yearly share of articles per subject category, faceted as a grid
# (ref_category rows x type2 columns) and paginated over three pages.
# local() keeps the intermediate data out of the notebook's global environment
# while still returning the list of plots for printing.
local({
  # Build the plot data once: the filters, the collect() of the yearly totals
  # and the join do not depend on the page and were previously repeated for
  # each of the three pages.
  subject_shares <- cd %>% 
    filter(type!="Other") %>%
    filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
    inner_join(articles_by_type_by_year %>% 
                 group_by(year_created,type) %>%
                 summarize(total_articles=sum(total_articles),.groups="drop") %>% collect())

  map(1:3,function(page_no) {
    subject_shares %>%
      ggplot(aes(x=year_created,y=articles/total_articles,color=category)) +
      geom_step() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=4)) +
      facet_grid_paginate(ref_category~type2,scales="free",nrow=2,ncol=4,page=page_no)
  })
})

# Yearly share of articles per subject category as a wrapped facet layout
# (one panel per ref_category/type2 combination), paginated over three pages.
# local() keeps the intermediate data out of the notebook's global environment
# while still returning the list of plots for printing.
local({
  # Build the plot data once: the filters, the collect() of the yearly totals
  # and the join do not depend on the page and were previously repeated for
  # each of the three pages.
  subject_shares <- cd %>% 
    filter(type!="Other") %>%
    filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
    inner_join(articles_by_type_by_year %>% 
                 group_by(year_created,type) %>%
                 summarize(total_articles=sum(total_articles),.groups="drop") %>% collect())

  map(1:3,function(page_no) {
    subject_shares %>%
      ggplot(aes(x=year_created,y=articles/total_articles,color=category)) +
      geom_step() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=4)) +
      facet_wrap_paginate(ref_category~type2,scales="free",nrow=2,ncol=4,page=page_no)
  })
})
```

Conclusions:

 - There do not seem to be any major discernible patterns that differ between the subject topics of the equality discussion.

## Supporting auxiliary analyses

### Subject topic graphs using different measures

```{r}
# One plot per ref_category: the `days` measure per year for the selected
# subject categories, faceted by type2. The plot title is the group's
# ref_category.
cd %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows,group_key) {
    ggplot(group_rows,aes(x=year_created,y=days,color=category)) +
      geom_line() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=2)) +
      facet_wrap(~type2,scales="free") +
      ggtitle(group_key$ref_category)
  })

# One plot per ref_category: the `weeks` measure per year for the selected
# subject categories, faceted by type2. The plot title is the group's
# ref_category.
cd %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows,group_key) {
    ggplot(group_rows,aes(x=year_created,y=weeks,color=category)) +
      geom_line() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=2)) +
      facet_wrap(~type2,scales="free") +
      ggtitle(group_key$ref_category)
  })

# One plot per ref_category: articles as a share of all articles of the same
# type and year, for the selected subject categories, faceted by type2.
cd %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  inner_join(
    # Yearly article totals per type, used as the denominator.
    articles_by_type_by_year %>%
      group_by(year_created,type) %>%
      summarize(total_articles=sum(total_articles),.groups="drop") %>%
      collect()
  ) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows,group_key) {
    ggplot(group_rows,aes(x=year_created,y=articles/total_articles,color=category)) +
      geom_line() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=2)) +
      facet_wrap(~type2,scales="free") +
      ggtitle(group_key$ref_category)
  })
```

### Do the subject topics combined capture the phenomenon?

```{r}
# Per ref_category, type and year: distinct articles, distinct weeks and
# distinct dates among paragraphs joined to corpus_of_interest, leaving out
# the lemma "suomalainen".
cd3 <- yv_ta_paragraphs %>%
  inner_join(corpus_of_interest) %>%
  filter(lemma!="suomalainen") %>%
  inner_join(articles %>% select(a_id,date_created,media)) %>%
  inner_join(article_types) %>%
  inner_join(articles_to_ref_categories) %>%
  mutate(
    year_created=year(date_created),
    week_created=week(date_created),
    # NOTE(review): every row gets the fixed label "potilas"; presumably a
    # placeholder so the category filter in the plots below still matches.
    # Confirm this is intended.
    category="potilas"
  ) %>%
  group_by(ref_category,category,type,year_created) %>%
  summarize(
    articles=n_distinct(a_id),
    weeks=n_distinct(week_created),
    days=n_distinct(date_created),
    .groups="drop"
  ) %>%
  collect()

# One plot per ref_category: distinct days per year in cd3, faceted by type.
cd3 %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Move these types to the end of the facet order.
  # NOTE(review): other chunks compare type against "Other"; confirm that
  # "foreign"/"sports"/"other" are actual values of type here.
  mutate(type=fct_relevel(type,"foreign","sports","other",after=Inf)) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows,group_key) {
    ggplot(group_rows,aes(x=year_created,y=days,color=category)) +
      geom_line() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=2)) +
      facet_wrap(~type,scales="free") +
      ggtitle(group_key$ref_category)
  })

# One plot per ref_category: distinct weeks per year in cd3, faceted by type.
cd3 %>%
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Move these types to the end of the facet order.
  # NOTE(review): other chunks compare type against "Other"; confirm that
  # "foreign"/"sports"/"other" are actual values of type here.
  mutate(type=fct_relevel(type,"foreign","sports","other",after=Inf)) %>%
  group_by(ref_category) %>%
  group_map(function(group_rows,group_key) {
    ggplot(group_rows,aes(x=year_created,y=weeks,color=category)) +
      geom_line() +
      theme_hsci_discrete(base_family="Arial") +
      scale_x_continuous(breaks=seq(2000,2020,by=2)) +
      facet_wrap(~type,scales="free") +
      ggtitle(group_key$ref_category)
  })

# One plot per ref_category: cd3 articles as a share of all articles of the
# same type and year, faceted by type.
cd3 %>% 
  filter(category %in% c("etnos","potilas","seksuaalisuus","työsuhde","maahanmuutto")) %>%
  # Denominator: yearly article totals per type. cd3 and the summarised totals
  # share exactly year_created and type, so the keys are spelled out instead of
  # relying on the implicit natural join (and its "Joining with" message).
  inner_join(articles_by_type_by_year %>% 
               group_by(year_created,type) %>%
               summarize(total_articles=sum(total_articles),.groups="drop") %>% collect(),
             by=c("year_created","type")) %>%
  # Move these types to the end of the facet order.
  # NOTE(review): other chunks compare type against "Other"; confirm that
  # "foreign"/"sports"/"other" are actual values of type here.
  mutate(type=fct_relevel(type,"foreign","sports","other",after=Inf)) %>%
  group_by(ref_category) %>%
  group_map(~.x %>%
    ggplot(aes(x=year_created,y=articles/total_articles,color=category)) +
    geom_line() +
    theme_hsci_discrete(base_family="Arial") +
    scale_x_continuous(breaks=seq(2000,2020,by=2)) +
    facet_wrap(~type,scales="free") +
    ggtitle(.y$ref_category))
```

Conclusions:

 - While we saw no discernible patterns between subjects, we do seem to be capturing the "whole" of the equality discussion by targeting them, so we can conclude that everyone benefits.

# Background Analyses

```{r}
# Distinct publication dates per year for each media, in a grid of
# ref_category (rows) by article type (columns).
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created=year(date_created),
    week_created=week(date_created)
  ) %>%
  group_by(media,ref_category,type,year_created) %>%
  summarize(
    articles=n_distinct(a_id),
    weeks=n_distinct(week_created),
    days=n_distinct(date_created),
    .groups="drop"
  ) %>%
  ggplot(aes(x=year_created,y=days,color=media)) +
  geom_step() +
  facet_grid(ref_category~type,scales="free") +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  theme_hsci_discrete(base_family="Arial")

# Distinct publication weeks per year for each media, in a grid of
# ref_category (rows) by article type (columns).
articles %>%
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  mutate(
    year_created=year(date_created),
    week_created=week(date_created)
  ) %>%
  group_by(media,ref_category,type,year_created) %>%
  summarize(
    articles=n_distinct(a_id),
    weeks=n_distinct(week_created),
    days=n_distinct(date_created),
    .groups="drop"
  ) %>%
  ggplot(aes(x=year_created,y=weeks,color=media)) +
  geom_step() +
  facet_grid(ref_category~type,scales="free") +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  theme_hsci_discrete(base_family="Arial")

# Share of articles per media, ref_category, type and year, relative to the
# totals in articles_by_type_by_year, in a grid of ref_category (rows) by
# article type (columns).
articles %>% 
  inner_join(articles_to_ref_categories) %>%
  inner_join(article_types) %>%
  # Only the year is needed here: week_created used to be derived as well but
  # was never aggregated by the summarize below, so it is no longer computed.
  mutate(year_created=year(date_created)) %>%
  group_by(media,ref_category,type,year_created) %>%
  summarize(articles=n_distinct(a_id),days=n_distinct(date_created),.groups="drop") %>%
  inner_join(articles_by_type_by_year) %>%
  ggplot(aes(x=year_created,y=articles/total_articles,color=media)) +
  geom_step() +
  theme_hsci_discrete(base_family="Arial") +
  scale_y_continuous(labels=scales::percent_format(accuracy=0.1)) +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  facet_grid(ref_category~type,scales="free")
```

# Auxiliary background analyses

```{r}
# Number of articles per media and calendar month, drawn against the first day
# of each month.
ggplot(
  articles %>%
    mutate(
      year_created=year(date_created),
      month_created=month(date_created)
    ) %>%
    count(media,year_created,month_created),
  aes(x=as.Date(str_c(year_created,"-",month_created,"-01")),y=n,color=media)
) +
  geom_step() +
  theme_hsci_discrete()
```

```{r}
# Number of articles per year and article type, one panel per media with
# independent axis scales.
ggplot(
  articles %>%
    inner_join(article_types) %>%
    mutate(year_created=year(date_created)) %>%
    count(media,year_created,type),
  aes(x=year_created,y=n,color=type)
) +
  geom_step() +
  theme_hsci_discrete() +
  facet_wrap(~media,scales="free")
```

```{r}
# IL articles per year and type, split by whether the author field is "STT".
# Types beyond the 11 largest (weighted by article count) are lumped together.
articles %>%
  inner_join(article_types) %>%
  filter(media=="IL") %>%
  mutate(
    year_created=year(date_created),
    from_stt=author=="STT"
  ) %>%
  count(media,year_created,type,from_stt) %>%
  collect() %>%
  group_by(media) %>%
  mutate(type=fct_lump_n(type,11,w=n)) %>%
  # Re-aggregate after lumping so each lumped type has one row per year and flag.
  count(media,year_created,type,from_stt,wt=n) %>%
  ggplot(aes(x=year_created,y=n,color=from_stt==1)) +
  geom_step() +
  theme_hsci_discrete() +
  facet_wrap(type~media,scales="free")
```
