Setup

Analysis table creation (not run)

# Label articles that belong to an opinionated genre, using outlet-specific
# cues (section, story logo, subsection, subject, title prefix). Within each
# outlet the first matching rule wins; articles matching no rule get NA and
# are filtered out. The result keeps the distinct (a_id, opinionated) pairs
# and is handed to compute_c() (project helper, not visible here).
articles_opinionated_c <- articles_c %>%
  mutate(
    opinionated = case_when(
      # Helsingin Sanomat: section and story logo, then title prefixes.
      media == "HS" ~ case_when(
        str_to_lower(section) == "pääkirjoitus" &
          str_detect(str_to_lower(story_logo), "ieras") ~ "external editorial",
        str_to_lower(section) == "pääkirjoitus" &
          is.na(story_logo) ~ "editorial",
        str_to_lower(section) == "pääkirjoitus" &
          str_to_lower(story_logo) == "pääkirjoitus" ~ "editorial",
        str_to_lower(section) == "mielipide" |
          str_to_lower(story_logo) == "mielipide" ~ "external opinion",
        str_detect(str_to_lower(title), "analyysi:") |
          str_detect(str_to_lower(story_logo), "analyysi") ~ "analysis",
        str_detect(str_to_lower(title), "näkökulma:") |
          str_detect(str_to_lower(story_logo), "näkökulma") ~ "perspective",
        str_detect(str_to_lower(title), "kolumni:") |
          str_detect(str_to_lower(story_logo), "kolumni") ~ "column",
        str_detect(str_to_lower(title), "blogi:") |
          str_detect(str_to_lower(story_logo), "blog") ~ "blog"
      ),
      # Iltalehti: editorial subsection, otherwise title prefixes only.
      media == "IL" ~ case_when(
        subsection == "paakirjoitus" ~ "editorial",
        str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
        str_detect(str_to_lower(title), "analyysi:") ~ "analysis",
        str_detect(str_to_lower(title), "kolumni:") ~ "column",
        str_detect(str_to_lower(title), "näkökulma:") ~ "perspective"
      ),
      # YLE: title prefixes or subject keywords.
      media == "YLE" ~ case_when(
        str_detect(str_to_lower(title), "kommentti:") ~ "commentary",
        str_detect(str_to_lower(title), "analyysi:") |
          str_detect(subject, "Analyysit \\(Yle Uutiset\\)") ~ "analysis",
        str_detect(str_to_lower(title), "kolumni:") |
          str_detect(str_to_lower(subject), "kolumn") ~ "column",
        str_detect(str_to_lower(title), "näkökulma:") |
          str_detect(str_to_lower(subject), "näkökulm") ~ "perspective",
        str_detect(str_to_lower(title), "blogi:") |
          str_detect(str_to_lower(subject), "blog") ~ "blog"
      )
    )
  ) %>%
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  compute_c(name = "articles_opinionated_c")

# Assign each article a coarse type. Rules are evaluated top to bottom and
# the first match wins:
#   1. non-final STT versions are "Other";
#   2. opinionated articles are split into journalistic vs. external opinion
#      (labels starting with "external " are external);
#   3. outlet-specific section/subsection/subject rules map the rest to news
#      categories;
#   4. everything else falls through to "Other".
# NOTE(review): the join can yield several rows per a_id if an article has
# more than one opinionated label; distinct(a_id, type) would then keep more
# than one type for that article. Confirm whether that can occur.
article_types_c <- articles_c %>%
  left_join(articles_opinionated_c) %>%
  mutate(type = case_when(
    media == "STT" & version != "Loppuversio" ~ "Other",
    !is.na(opinionated) & !str_detect(opinionated, "^external ") ~ "Journalistic opinion",
    !is.na(opinionated) ~ "External opinion",
    # Domestic general/political/economic news
    media == "HS" & section %in% c("Kotimaa", "Politiikka", "Talous") ~ "Domestic general/political/economic news",
    media == "IL" & subsection %in% c("kotimaa", "politiikka", "talous", "uutiset") ~ "Domestic general/political/economic news",
    media == "STT" & section %in% c("Kotimaa", "Politiikka", "Talous") ~ "Domestic general/political/economic news",
    media == "YLE" & section == "Yle Uutiset" &
      str_detect(subject, "Kotimaan uutiset|politiikka|talous") &
      (!str_detect(subject, "Ulkomaat") | str_detect(subject, "Kotimaan uutiset")) ~ "Domestic general/political/economic news",
    # Culture/entertainment
    media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "IL" & section == "viihde" ~ "Culture/entertainment",
    media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "YLE" & section == "Yle Uutiset" &
      str_detect(subject, "kulttuuri|musiikki|viihde") &
      !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",
    # Local news
    media == "HS" & section == "Kaupunki" ~ "Local news",
    media == "YLE" & section == "Yle Uutiset" & coverage == "local" ~ "Local news",
    # Sports
    media == "STT" & section == "Urheilu" ~ "Sports",
    media == "HS" & section == "Urheilu" ~ "Sports",
    media == "YLE" & section == "YLE Urheilu" ~ "Sports",
    media == "IL" & section == "urheilu" ~ "Sports",
    # Foreign news
    media == "STT" & section == "Ulkomaat" ~ "Foreign news",
    media == "HS" & section == "Ulkomaat" ~ "Foreign news",
    media == "YLE" & section == "Yle Uutiset" &
      str_detect(subject, "Ulkomaat") &
      !str_detect(subject, "Kotimaan uutiset") ~ "Foreign news",
    media == "IL" & subsection == "ulkomaat" ~ "Foreign news",
    # Catch-all; TRUE rather than T, because T is an ordinary, reassignable
    # variable.
    TRUE ~ "Other"
  )) %>%
  distinct(a_id, type) %>%
  compute_c(name = "article_types")

# Per-article quotation statistics, split by the quotes' `direct` flag.
# A corpus token is matched to a quote when its (s_id, pos) position lies
# within the quote's [start, end] span in the same article.
article_quotes_stats_c <- corpus_c %>%
  inner_join(
    quotes_c,
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  group_by(a_id = a_id.x, direct) %>%
  summarize(
    # 10000L * s_id + pos folds sentence id and token position into one key,
    # so a token matched by several quotes is counted once.
    # Assumes pos < 10000 within a sentence — TODO confirm.
    nr_quote_tokens = n_distinct(10000L * s_id + pos),
    nr_sentences_with_quotes = n_distinct(s_id),
    nr_paragraphs_with_quotes = n_distinct(par_id),
    nr_quotes = n_distinct(q_id),
    .groups = "drop"
  ) %>%
  compute_c(name = "article_quotes_stats_c")

# Article-level size statistics (tokens, sentences, paragraphs) joined with
# three sets of quotation statistics: direct quotes, indirect quotes, and all
# quotes together. Missing quotation statistics are replaced by zero.
article_stats_c <- corpus_c %>%
  group_by(a_id) %>%
  summarize(
    nr_tokens = n(),
    nr_sentences = n_distinct(s_id),
    nr_paragraphs = n_distinct(par_id),
    .groups = "drop"
  ) %>%
  compute_c() %>%
  # Direct-quote statistics.
  left_join(
    article_quotes_stats_c %>%
      filter(direct == TRUE) %>%
      select(
        a_id,
        nr_direct_quote_tokens = nr_quote_tokens,
        nr_sentences_with_direct_quotes = nr_sentences_with_quotes,
        nr_paragraphs_with_direct_quotes = nr_paragraphs_with_quotes,
        nr_direct_quotes = nr_quotes
      )
  ) %>%
  # Indirect-quote statistics.
  left_join(
    article_quotes_stats_c %>%
      filter(direct == FALSE) %>%
      select(
        a_id,
        nr_indirect_quote_tokens = nr_quote_tokens,
        nr_sentences_with_indirect_quotes = nr_sentences_with_quotes,
        nr_paragraphs_with_indirect_quotes = nr_paragraphs_with_quotes,
        nr_indirect_quotes = nr_quotes
      )
  ) %>%
  # Statistics over all quotes, recomputed from the corpus/quote join without
  # grouping by `direct`.
  left_join(
    corpus_c %>%
      inner_join(
        quotes_c,
        sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
      ) %>%
      group_by(a_id = a_id.x) %>%
      summarize(
        nr_quote_tokens = n_distinct(10000L * s_id + pos),
        nr_sentences_with_quotes = n_distinct(s_id),
        nr_paragraphs_with_quotes = n_distinct(par_id),
        nr_quotes = n_distinct(q_id),
        .groups = "drop"
      ) %>%
      compute_c()
  ) %>%
  replace_na(list(
    nr_direct_quote_tokens = 0L,
    nr_sentences_with_direct_quotes = 0L,
    nr_paragraphs_with_direct_quotes = 0L,
    nr_direct_quotes = 0L,
    nr_indirect_quote_tokens = 0L,
    nr_sentences_with_indirect_quotes = 0L,
    nr_paragraphs_with_indirect_quotes = 0L,
    nr_indirect_quotes = 0L,
    nr_quote_tokens = 0L,
    nr_sentences_with_quotes = 0L,
    nr_paragraphs_with_quotes = 0L,
    nr_quotes = 0L
  )) %>%
  compute_c(name = "article_stats_c")

# Adjective counts per article and article part. Each adjective token is
# attributed to "direct quotation" / "indirect quotation" when it falls inside
# a quote span, and to "journalistic text" otherwise.
adjective_counts_c <- words_c %>%
  filter(upos == "ADJ") %>%
  inner_join(corpus_c) %>%
  left_join(
    quotes_c %>%
      mutate(article_part = if_else(direct, "direct quotation", "indirect quotation")),
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  select(a_id = a_id.x, par_id, s_id, pos, article_part) %>%
  # Tokens with no matching quote have no article_part yet.
  replace_na(list(article_part = "journalistic text")) %>%
  group_by(a_id, article_part) %>%
  summarize(
    nr_adjectives = n_distinct(10000L * s_id + pos),
    nr_sentences_with_adjectives = n_distinct(s_id),
    nr_paragraphs_with_adjectives = n_distinct(par_id),
    .groups = "drop"
  ) %>%
  compute_c(name = "adjective_counts_c")

# Hedging marker lexicon. The columns read in as ...1, ...2 and ...3 are
# renamed to type, value and class.
hedging_markers <- here("data/input/hedging_markers.csv") %>%
  read_csv() %>%
  rename(type = ...1, value = ...2, class = ...3)

# Hedging-marker counts per article and article part. Words are matched by
# lemma against the markers whose class is "Adverbs" or "Adjective"; each hit
# is attributed to a quotation type or to "journalistic text".
# NOTE(review): the class labels mix plural and singular ("Adverbs" vs.
# "Adjective") — confirm they match the CSV exactly.
hedging_counts_c <- words_c %>%
  inner_join(
    hedging_markers %>%
      filter(class %in% c("Adverbs", "Adjective")) %>%
      select(lemma = value) %>%
      copy_to_c(con)
  ) %>%
  inner_join(corpus_c) %>%
  left_join(
    quotes_c %>%
      mutate(article_part = if_else(direct, "direct quotation", "indirect quotation")),
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  replace_na(list(article_part = "journalistic text")) %>%
  select(a_id = a_id.x, par_id, s_id, pos, article_part) %>%
  group_by(a_id, article_part) %>%
  summarize(
    nr_hedgings = n_distinct(10000L * s_id + pos),
    nr_sentences_with_hedgings = n_distinct(s_id),
    nr_paragraphs_with_hedgings = n_distinct(par_id),
    .groups = "drop"
  ) %>%
  compute_c(name = "hedging_counts_c")

Analysis table loads

# Open the previously computed analysis tables by name on connection `con`.
article_stats_c        <- tbl(con, "article_stats_c")
article_quotes_stats_c <- tbl(con, "article_quotes_stats_c")
adjective_counts_c     <- tbl(con, "adjective_counts_c")
hedging_counts_c       <- tbl(con, "hedging_counts_c")

# Long-format article statistics: one row per article and article part.
# "journalistic text" is the article total minus the quotation counts;
# "direct quotation" / "indirect quotation" rows are only produced for
# articles with at least one such quoted token.
article_stats_long_c <- article_stats_c %>%
  mutate(
    nr_tokens = nr_tokens - nr_quote_tokens,
    nr_sentences = nr_sentences - nr_sentences_with_quotes,
    nr_paragraphs = nr_paragraphs - nr_paragraphs_with_quotes
  ) %>%
  # The range selection relies on nr_tokens..nr_paragraphs being adjacent.
  select(a_id, nr_tokens:nr_paragraphs) %>%
  mutate(article_part = "journalistic text") %>%
  union_all(
    article_stats_c %>%
      filter(nr_direct_quote_tokens > 0) %>%
      select(
        a_id,
        nr_tokens = nr_direct_quote_tokens,
        nr_sentences = nr_sentences_with_direct_quotes,
        nr_paragraphs = nr_paragraphs_with_direct_quotes
      ) %>%
      mutate(article_part = "direct quotation")
  ) %>%
  union_all(
    article_stats_c %>%
      filter(nr_indirect_quote_tokens > 0) %>%
      select(
        a_id,
        nr_tokens = nr_indirect_quote_tokens,
        nr_sentences = nr_sentences_with_indirect_quotes,
        nr_paragraphs = nr_paragraphs_with_indirect_quotes
      ) %>%
      mutate(article_part = "indirect quotation")
  )

Analysis

Opinionated genres

# Yearly share of articles typed "Journalistic opinion", per outlet, among
# all articles whose type is not "Other".
article_types_a %>%
  filter(type != "Other") %>%
  # Explicit key avoids the "Joining, by = ..." message.
  inner_join(articles_a %>% select(a_id, year_created, media), by = "a_id") %>%
  group_by(year_created, media) %>%
  summarize(
    # na.rm = TRUE matches what SQL aggregation does anyway and silences the
    # corresponding dbplyr warning.
    n = sum(type == "Journalistic opinion", na.rm = TRUE) / n(),
    .groups = "drop"
  ) %>%
  ggplot(aes(x = year_created, color = media, y = n)) +
  geom_step() +
  theme_hsci_discrete() +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.01),
    labels = scales::percent_format(accuracy = 1)
  ) +
  scale_x_continuous(breaks = seq(2000, 2020, by = 2)) +
  labs(x = "Year", y = "Percentage of articles being journalistic opinion")
Joining, by = "a_id"Warning: Missing values are always removed in SQL aggregation functions.
Use `na.rm = TRUE` to silence this warning

Interpretations:

  • The huge bump in opinionated articles for HS comes from a huge increase in columns. Will need to check whether we are picking up columns accurately before 2013. However, the general trend towards having more opinionated texts is clear across the outlets.

Article length development

# Article size statistics with outlet, year and article type, collected into
# a local data frame for the length analyses.
ld <- article_stats_c %>%
  inner_join(
    articles_c %>%
      select(a_id, media, year_created)
  ) %>%
  inner_join(article_types_c) %>%
  select(
    media, type, year_created,
    nr_tokens, nr_sentences, nr_paragraphs
  ) %>%
  collect()
Joining, by = "a_id"Joining, by = "a_id"
# Per year, outlet and type: first quartile (q1), mean (q2) and third
# quartile (q3) of each length measure, reshaped to long format with one row
# per measure and statistic.
ld2 <- ld %>%
  group_by(year_created, media, type) %>%
  summarize_all(
    list(
      q1 = ~quantile(.x, 0.25)[[1]],
      q2 = mean,
      q3 = ~quantile(.x, 0.75)[[1]]
    )
  ) %>%
  ungroup() %>%
  pivot_longer(nr_tokens_q1:nr_paragraphs_q3) %>%
  # Names look like "nr_tokens_q1": drop the "nr" piece, keep measure and
  # statistic.
  separate(name, c(NA, "measure", "quantile"), sep = "_") %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    linetype = if_else(quantile == "q2", "mean", "1st/3rd quartile"),
    linetype = fct_relevel(linetype, "mean"),
    type = fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    ),
    measure = recode(
      measure,
      "tokens" = "Words",
      "sentences" = "Sentences",
      "paragraphs" = "Paragraphs"
    )
  )
# One faceted step plot per length measure, excluding the "Other" type.
# group_map() passes the group key as a one-row tibble in `.y`; the y-axis
# label is taken from its `measure` column explicitly instead of passing the
# whole tibble to labs().
ld2 %>%
  group_by(measure) %>%
  group_map(~ .x %>%
    filter(!str_detect(type, "Other")) %>%
    ggplot(aes(
      x = year_created, y = value, color = media,
      group = interaction(quantile, media), linetype = linetype
    )) +
    geom_step() +
    facet_wrap(~type, scales = "free") +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    # scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
    labs(x = "Year", y = .y$measure)
  )
[[1]]

[[2]]

[[3]]

# Yearly totals of tokens, sentences and paragraphs per outlet and type,
# used for the ratio plots.
ld3 <- ld %>%
  group_by(year_created, media, type) %>%
  summarize_all(sum) %>%
  ungroup() %>%
  mutate(
    type = fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    )
  )
# Sentences per paragraph over time, by outlet, faceted by article type.
ld3 %>%
  filter(!str_detect(type, "Other")) %>%
  ggplot(aes(
    x = year_created,
    y = nr_sentences / nr_paragraphs,
    color = media
  )) +
  geom_step() +
  facet_wrap(~type, scales = "free") +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  # scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
  labs(x = "Year", y = "Sentences/paragraph")
# Words per sentence over time, by outlet, faceted by article type.
ld3 %>%
  filter(!str_detect(type, "Other")) %>%
  ggplot(aes(
    x = year_created,
    y = nr_tokens / nr_sentences,
    color = media
  )) +
  geom_step() +
  facet_wrap(~type, scales = "free") +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  # scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
  labs(x = "Year", y = "Words/sentence")

Interpretations:

  • After 2012, stories get longer across the board, both in number of sentences/paragraphs as well as words per sentence
  • Before 2012 there has been a modest drive toward shorter articles. The cause of this is unclear. However, this needs to be kept in mind when interpreting the following main analyses, which often also show diminishing signals before 2012

Proportion of articles being quotations

# Share of each article's tokens that lie inside any quote, with outlet,
# year and article type, collected locally.
qd <- article_stats_c %>%
  mutate(quote_proportion = nr_quote_tokens / nr_tokens) %>%
  inner_join(
    articles_c %>%
      select(a_id, media, year_created)
  ) %>%
  inner_join(article_types_c) %>%
  select(media, type, year_created, quote_proportion) %>%
  collect()
Joining, by = "a_id"Joining, by = "a_id"
# First quartile (q1), mean (q2) and third quartile (q3) of the quote
# proportion per year, outlet and type, in long format.
qd2 <- qd %>%
  group_by(year_created, media, type) %>%
  # With a single summarised column the outputs are named q1, q2, q3, which
  # the pivot below relies on.
  summarize_all(
    list(
      q1 = ~quantile(.x, 0.25)[[1]],
      q2 = mean,
      q3 = ~quantile(.x, 0.75)[[1]]
    )
  ) %>%
  ungroup() %>%
  pivot_longer(q1:q3) %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    linetype = if_else(name == "q2", "mean", "1st/3rd quartile"),
    linetype = fct_relevel(linetype, "mean"),
    type = fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    )
  )

# Quote proportion per article split by the `direct` flag: one row per
# article and quote kind present in article_quotes_stats_c.
qd3 <- article_stats_c %>%
  select(a_id, nr_tokens) %>%
  inner_join(article_quotes_stats_c) %>%
  mutate(quote_proportion = nr_quote_tokens / nr_tokens) %>%
  inner_join(
    articles_c %>%
      select(a_id, media, year_created)
  ) %>%
  inner_join(article_types_c) %>%
  select(media, direct, type, year_created, quote_proportion) %>%
  collect()
Joining, by = "a_id"Joining, by = "a_id"Joining, by = "a_id"
# Same summary as qd2, additionally split by the `direct` flag.
qd4 <- qd3 %>%
  group_by(direct, year_created, media, type) %>%
  summarize_all(
    list(
      q1 = ~quantile(.x, 0.25)[[1]],
      q2 = mean,
      q3 = ~quantile(.x, 0.75)[[1]]
    )
  ) %>%
  ungroup() %>%
  pivot_longer(q1:q3) %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    linetype = if_else(name == "q2", "mean", "1st/3rd quartile"),
    linetype = fct_relevel(linetype, "mean"),
    type = fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    )
  )

# For each year, outlet and type: the share p of qd3 rows with each value of
# `direct`. The result is ungrouped so later verbs do not silently operate
# per group.
# NOTE(review): qd3 holds one row per article and quote kind, so n counts
# articles containing that kind of quote, not individual quotes. Confirm this
# is the intended basis for "proportion of quotes that are direct".
qd5 <- qd3 %>%
  count(direct, year_created, media, type) %>%
  group_by(year_created, media, type) %>%
  mutate(p = n / sum(n)) %>%
  ungroup()

Overall

# Quote proportion (mean and quartiles) over time per outlet, faceted by
# article type; the "Other" type is left out.
qd2 %>%
  filter(type != "Other") %>%
  ggplot(aes(
    x = year_created,
    y = value,
    color = media,
    group = interaction(name, media),
    linetype = linetype
  )) +
  geom_step() +
  facet_wrap(~type) +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(x = "Year", y = "Proportion of quotes in articles")

Proportion of quotes that are direct

# Share of direct quotations over time per outlet, faceted by article type.
# The "Other" type is dropped here as in the other figures of this analysis.
qd5 %>%
  filter(direct == 1, type != "Other") %>%
  ggplot(aes(x = year_created, y = p, color = media)) +
  geom_step() +
  facet_wrap(~type) +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom") +
  scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
  scale_y_continuous(
    breaks = seq(0, 1, by = 0.1),
    labels = scales::percent_format(accuracy = 1)
  ) +
  labs(x = "Year", y = "Proportion of quotes that are direct")

Interpretations:

  • Usage of quotation constructs is increasing
  • Particularly direct quotes are used more

Adjective proportions

# Adjective measures per article, computed both per article part within a
# type ("<article_part> in <type>") and for the article as a whole (type2 is
# the plain type, obtained by summing over the article's parts).
d <- article_stats_long_c %>%
  left_join(adjective_counts_c) %>%
  replace_na(list(
    nr_adjectives = 0,
    nr_sentences_with_adjectives = 0,
    nr_paragraphs_with_adjectives = 0
  )) %>%
  inner_join(article_types_c) %>%
  mutate(type2 = str_c(article_part, " in ", type)) %>%
  union_all(
    # Same rows again, but keyed by the plain type so that the summarise
    # below adds the parts up to whole-article figures.
    article_stats_long_c %>%
      left_join(adjective_counts_c) %>%
      replace_na(list(
        nr_adjectives = 0,
        nr_sentences_with_adjectives = 0,
        nr_paragraphs_with_adjectives = 0
      )) %>%
      inner_join(article_types_c %>% rename(type2 = type))
  ) %>%
  group_by(a_id, type2) %>%
  summarize(
    nr_tokens = sum(nr_tokens),
    nr_adjectives = sum(nr_adjectives),
    nr_sentences = sum(nr_sentences),
    nr_paragraphs = sum(nr_paragraphs),
    nr_sentences_with_adjectives = sum(nr_sentences_with_adjectives),
    nr_paragraphs_with_adjectives = sum(nr_paragraphs_with_adjectives),
    .groups = "drop"
  ) %>%
  inner_join(articles_c %>% select(a_id, media, year_created)) %>%
  mutate(
    apw = nr_adjectives / nr_tokens,
    aps = nr_adjectives / nr_sentences,
    app = nr_adjectives / nr_paragraphs,
    apa = nr_adjectives,
    asps = nr_sentences_with_adjectives / nr_sentences,
    aspp = nr_sentences_with_adjectives / nr_paragraphs,
    aspa = nr_sentences_with_adjectives,
    appp = nr_paragraphs_with_adjectives / nr_paragraphs,
    appa = nr_paragraphs_with_adjectives,
    aapa = if_else(nr_adjectives > 0, 1, 0)
  ) %>%
  select(year_created, media, type2, apw:aapa) %>%
  collect()
Joining, by = c("a_id", "article_part")Joining, by = "a_id"Joining, by = c("a_id", "article_part")Joining, by = "a_id"Joining, by = "a_id"Warning in for (Cl in classes) if (is(object, Cl)) return(object) :
  call dbDisconnect() when finished working with a connection
# First quartile (q1), mean (q2) and third quartile (q3) of every adjective
# measure per year, outlet and type2, in long format with readable measure
# labels. Missing ratios are treated as 0 before summarising.
d2 <- d %>%
  replace_na(list(
    apw = 0, aps = 0, app = 0, apa = 0, asps = 0,
    aspp = 0, aspa = 0, appp = 0, appa = 0, aapa = 0
  )) %>%
  group_by(year_created, media, type2) %>%
  summarize_at(
    vars(apw:aapa),
    list(
      q1 = ~quantile(.x, 0.25)[[1]],
      q2 = mean,
      q3 = ~quantile(.x, 0.75)[[1]]
    )
  ) %>%
  ungroup() %>%
  pivot_longer(apw_q1:aapa_q3) %>%
  separate(name, c("measure", "quantile"), sep = "_") %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    # Labels previously misspelled "paragrahs".
    measure = recode(
      measure,
      "apw" = "Adjectives/words",
      "aps" = "Adjectives/sentences",
      "app" = "Adjectives/paragraphs",
      "apa" = "Adjectives/articles",
      "asps" = "Sentences containing adjectives/sentences",
      "aspp" = "Sentences containing adjectives/paragraphs",
      "aspa" = "Sentences containing adjectives/articles",
      "appp" = "Paragraphs containing adjectives/paragraphs",
      "appa" = "Paragraphs containing adjectives/articles",
      "aapa" = "Articles containing adjectives/articles"
    ),
    # Same legend wording as the other quartile-based figures.
    linetype = if_else(quantile == "q2", "mean", "1st/3rd quartile"),
    linetype = fct_relevel(linetype, "mean"),
    type2 = fct_relevel(
      type2,
      "Domestic general/political/economic news",
      "journalistic text in Domestic general/political/economic news",
      "indirect quotation in Domestic general/political/economic news",
      "direct quotation in Domestic general/political/economic news",
      "Local news",
      "journalistic text in Local news",
      "indirect quotation in Local news",
      "direct quotation in Local news",
      "Foreign news",
      "journalistic text in Foreign news",
      "indirect quotation in Foreign news",
      "direct quotation in Foreign news",
      "Culture/entertainment",
      "journalistic text in Culture/entertainment",
      "indirect quotation in Culture/entertainment",
      "direct quotation in Culture/entertainment",
      "Sports",
      "journalistic text in Sports",
      "indirect quotation in Sports",
      "direct quotation in Sports",
      "External opinion",
      "Journalistic opinion"
    )
  )
# All adjective measures for the rows whose type2 mentions "Other", one facet
# per measure and type2.
d2 %>%
  filter(str_detect(type2, "Other")) %>%
  ggplot(aes(
    x = year_created,
    y = value,
    color = media,
    group = interaction(measure, quantile, media),
    linetype = linetype
  )) +
  geom_step() +
  facet_wrap(measure ~ type2, scales = "free") +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom")
# One faceted step plot per adjective measure, excluding "Other" types.
# group_map() passes the group key as a one-row tibble in `.y`; the plot
# title is taken from its `measure` column explicitly instead of passing the
# whole tibble to ggtitle().
d2 %>%
  group_by(measure) %>%
  group_map(~ .x %>%
    filter(!str_detect(type2, "Other")) %>%
    ggplot(aes(
      x = year_created, y = value, color = media,
      group = interaction(quantile, media), linetype = linetype
    )) +
    geom_step() +
    facet_wrap(~type2, scales = "free") +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    # scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
    labs(x = "Year", y = "Number of adjectives") +
    ggtitle(.y$measure)
  )

Due to the lengthening of article texts causing complexities for analyses, all the following adjective proportions were calculated and evaluated:

  • Adjectives/words
  • Adjectives/sentences
  • Adjectives/paragraphs
  • Adjectives/articles
  • Sentences containing adjectives/sentences
  • Sentences containing adjectives/paragraphs
  • Sentences containing adjectives/articles
  • Paragraphs containing adjectives/paragraphs
  • Paragraphs containing adjectives/articles
  • Articles containing adjectives/articles

Out of these, “Adjectives/words” and “Adjectives/sentences” were retained for final analysis:

Adjectives/words

# Adjectives per word, paginated into two pages of up to 4 x 3 facets.
# "Other" types and the per-part breakdowns of the opinion types are left out.
map(1:2, function(page) {
  d2 %>%
    filter(measure == "Adjectives/words") %>%
    filter(
      !str_detect(type2, "Other"),
      !str_detect(type2, "in Journalistic opinion"),
      !str_detect(type2, "in External opinion")
    ) %>%
    ggplot(aes(
      x = year_created,
      y = value,
      color = media,
      group = interaction(quantile, media),
      linetype = linetype
    )) +
    geom_step() +
    facet_wrap_paginate(~type2, ncol = 4, nrow = 3, page = page) +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    labs(x = "Year", y = "Adjectives/words")
})
[[1]]

[[2]]

Adjectives/sentences

# Adjectives per sentence, paginated into two pages of up to 4 x 3 facets.
# "Other" types and the per-part breakdowns of the opinion types are left out.
map(1:2, function(page) {
  d2 %>%
    filter(measure == "Adjectives/sentences") %>%
    filter(
      !str_detect(type2, "Other"),
      !str_detect(type2, "in Journalistic opinion"),
      !str_detect(type2, "in External opinion")
    ) %>%
    ggplot(aes(
      x = year_created,
      y = value,
      color = media,
      group = interaction(quantile, media),
      linetype = linetype
    )) +
    geom_step() +
    facet_wrap_paginate(~type2, ncol = 4, nrow = 3, page = page) +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    labs(x = "Year", y = "Adjectives/sentences")
})
[[1]]

[[2]]

Interpretations:

  • In general, adjective use seems to decline. In quotes, this happens throughout.
  • However, in the case of journalistic writing, there are the following intriguing aberrations from this:
    • adjective use increases significantly after 2010 in HS & IL sports journalistic writing
    • adjective use increases after 2010 in the journalistic portion of domestic general/political/economic news texts in all outlets
    • adjective use increases in HS local reporting (Helsinki), but not in YLE regional reporting
  • Altogether, these increases in journalistic writing lead to similar increases in the articles overall.

Hedging proportions

# Hedging measures per article, computed both per article part within a type
# ("<article_part> in <type>") and for the article as a whole (type2 is the
# plain type, obtained by summing over the article's parts).
hd <- article_stats_long_c %>%
  inner_join(article_types_c) %>%
  mutate(type2 = str_c(article_part, " in ", type)) %>%
  union_all(
    article_stats_long_c %>%
      inner_join(article_types_c %>% rename(type2 = type))
  ) %>%
  left_join(hedging_counts_c) %>%
  replace_na(list(
    nr_hedgings = 0,
    nr_sentences_with_hedgings = 0,
    nr_paragraphs_with_hedgings = 0
  )) %>%
  group_by(a_id, type2) %>%
  summarize(
    nr_tokens = sum(nr_tokens),
    nr_hedgings = sum(nr_hedgings),
    nr_sentences = sum(nr_sentences),
    nr_paragraphs = sum(nr_paragraphs),
    nr_sentences_with_hedgings = sum(nr_sentences_with_hedgings),
    nr_paragraphs_with_hedgings = sum(nr_paragraphs_with_hedgings),
    .groups = "drop"
  ) %>%
  inner_join(articles_c %>% select(a_id, media, year_created)) %>%
  mutate(
    hpw = nr_hedgings / nr_tokens,
    hps = nr_hedgings / nr_sentences,
    hpp = nr_hedgings / nr_paragraphs,
    hpa = nr_hedgings,
    hsps = nr_sentences_with_hedgings / nr_sentences,
    hspp = nr_sentences_with_hedgings / nr_paragraphs,
    hspa = nr_sentences_with_hedgings,
    hppp = nr_paragraphs_with_hedgings / nr_paragraphs,
    hppa = nr_paragraphs_with_hedgings,
    hapa = if_else(nr_hedgings > 0, 1, 0)
  ) %>%
  select(year_created, media, type2, hpw:hapa) %>%
  collect()
Joining, by = "a_id"Joining, by = "a_id"Joining, by = c("a_id", "article_part")Joining, by = "a_id"
# 0.1 quantile (q1), mean (q2) and 0.9 quantile (q3) of every hedging measure
# per year, outlet and type2, in long format with readable measure labels.
# Missing ratios are treated as 0 before summarising.
hd2 <- hd %>%
  replace_na(list(
    hpw = 0, hps = 0, hpp = 0, hpa = 0, hsps = 0,
    hspp = 0, hspa = 0, hppp = 0, hppa = 0, hapa = 0
  )) %>%
  group_by(year_created, media, type2) %>%
  summarize_at(
    vars(hpw:hapa),
    list(
      q1 = ~quantile(.x, 0.1)[[1]],
      q2 = mean,
      q3 = ~quantile(.x, 0.9)[[1]]
    )
  ) %>%
  ungroup() %>%
  pivot_longer(hpw_q1:hapa_q3) %>%
  separate(name, c("measure", "quantile"), sep = "_") %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    # Labels previously misspelled "paragrahs".
    measure = recode(
      measure,
      "hpw" = "Hedgings/words",
      "hps" = "Hedgings/sentences",
      "hpp" = "Hedgings/paragraphs",
      "hpa" = "Hedgings/articles",
      "hsps" = "Sentences containing hedgings/sentences",
      "hspp" = "Sentences containing hedgings/paragraphs",
      "hspa" = "Sentences containing hedgings/articles",
      "hppp" = "Paragraphs containing hedgings/paragraphs",
      "hppa" = "Paragraphs containing hedgings/articles",
      "hapa" = "Articles containing hedgings/articles"
    ),
    # The 0.1 and 0.9 quantiles are the 1st and 9th deciles (not centiles).
    linetype = if_else(quantile == "q2", "mean", "1st/9th decile"),
    linetype = fct_relevel(linetype, "mean"),
    type2 = fct_relevel(
      type2,
      "journalistic text in Domestic general/political/economic news",
      "indirect quotation in Domestic general/political/economic news",
      "direct quotation in Domestic general/political/economic news",
      "journalistic text in Local news",
      "indirect quotation in Local news",
      "direct quotation in Local news",
      "journalistic text in Foreign news",
      "indirect quotation in Foreign news",
      "direct quotation in Foreign news",
      "journalistic text in Culture/entertainment",
      "indirect quotation in Culture/entertainment",
      "direct quotation in Culture/entertainment",
      "journalistic text in Sports",
      "indirect quotation in Sports",
      "direct quotation in Sports",
      "External opinion",
      "Journalistic opinion"
    )
  )
# All hedging measures for the rows whose type2 mentions "Other", one facet
# per measure and type2.
hd2 %>%
  filter(str_detect(type2, "Other")) %>%
  ggplot(aes(
    x = year_created,
    y = value,
    color = media,
    group = interaction(measure, quantile, media),
    linetype = linetype
  )) +
  geom_step() +
  facet_wrap(measure ~ type2, scales = "free") +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom")
# One faceted step plot per hedging measure, excluding "Other" types.
# group_map() passes the group key as a one-row tibble in `.y`; the plot
# title is taken from its `measure` column explicitly instead of passing the
# whole tibble to ggtitle().
hd2 %>%
  group_by(measure) %>%
  group_map(~ .x %>%
    filter(!str_detect(type2, "Other")) %>%
    ggplot(aes(
      x = year_created, y = value, color = media,
      group = interaction(quantile, media), linetype = linetype
    )) +
    geom_step() +
    facet_wrap(~type2, scales = "free") +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    # scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
    labs(x = "Year", y = "Number of hedgings") +
    ggtitle(.y$measure)
  )

Due to the lengthening of article texts causing complexities for analyses, all the following hedging proportions were calculated and evaluated:

  • Hedgings/words
  • Hedgings/sentences
  • Hedgings/paragraphs
  • Hedgings/articles
  • Sentences containing hedgings/sentences
  • Sentences containing hedgings/paragraphs
  • Sentences containing hedgings/articles
  • Paragraphs containing hedgings/paragraphs
  • Paragraphs containing hedgings/articles
  • Articles containing hedgings/articles

Out of these, “Hedgings/words” and “Hedgings/sentences” were retained for final analysis:

Hedgings/words

# Hedgings per word, paginated into two pages of up to 4 x 3 facets.
# "Other" types and the per-part breakdowns of the opinion types are left out.
map(1:2, function(page) {
  hd2 %>%
    filter(measure == "Hedgings/words") %>%
    filter(
      !str_detect(type2, "Other"),
      !str_detect(type2, "in Journalistic opinion"),
      !str_detect(type2, "in External opinion")
    ) %>%
    ggplot(aes(
      x = year_created,
      y = value,
      color = media,
      group = interaction(quantile, media),
      linetype = linetype
    )) +
    geom_step() +
    facet_wrap_paginate(~type2, ncol = 4, nrow = 3, page = page) +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    labs(x = "Year", y = "Hedgings/words")
})
[[1]]

[[2]]

Hedgings/sentences

# Hedgings per sentence, paginated into two pages of up to 4 x 3 facets.
# "Other" types and the per-part breakdowns of the opinion types are left out.
map(1:2, function(page) {
  hd2 %>%
    filter(measure == "Hedgings/sentences") %>%
    filter(
      !str_detect(type2, "Other"),
      !str_detect(type2, "in Journalistic opinion"),
      !str_detect(type2, "in External opinion")
    ) %>%
    ggplot(aes(
      x = year_created,
      y = value,
      color = media,
      group = interaction(quantile, media),
      linetype = linetype
    )) +
    geom_step() +
    facet_wrap_paginate(~type2, ncol = 4, nrow = 3, page = page) +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    labs(x = "Year", y = "Hedgings/sentences")
})
[[1]]

[[2]]

Interpretations:

  • Again, one can see an increase in the use of hedgings in the journalistic text of domestic general/political/economic news and HS local news (Kaupunki), and this time also in foreign news. In addition, the increased use of direct quotes adds further hedging to the articles as a whole.

Usage of selected conjunctions

The following conjunctions were tracked:

  • subordinate conjunctions jotta, koska, kun, jos, vaikka, kunnes, mikäli
  • sillä, mutta
# Counts of the tracked conjunctions per article and article part. Each hit
# is attributed to a quotation type when it falls inside a quote span, and to
# "journalistic text" otherwise.
conjunction_counts_c <- words_c %>%
  filter(
    lemma %in% c(
      "jotta", "koska", "kun", "jos", "vaikka",
      "kunnes", "mikäli", "sillä", "mutta"
    )
  ) %>%
  inner_join(corpus_c) %>%
  left_join(
    quotes_c %>%
      mutate(article_part = if_else(direct, "direct quotation", "indirect quotation")),
    sql_on = "LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))"
  ) %>%
  replace_na(list(article_part = "journalistic text")) %>%
  select(a_id = a_id.x, par_id, s_id, pos, article_part) %>%
  group_by(a_id, article_part) %>%
  summarize(
    nr_conjunctions = n_distinct(10000L * s_id + pos),
    nr_sentences_with_conjunctions = n_distinct(s_id),
    nr_paragraphs_with_conjunctions = n_distinct(par_id),
    .groups = "drop"
  ) %>%
  compute_c(name = "conjunction_counts_c")
# Re-open the stored table by name on the connection.
conjunction_counts_c <- tbl(con, "conjunction_counts_c")
# Conjunction measures per article, computed both per article part within a
# type ("<article_part> in <type>") and for the article as a whole (type2 is
# the plain type, obtained by summing over the article's parts).
# The join against hedging_counts_c that used to sit here was a leftover from
# the hedging pipeline: none of its columns are used below, so it is dropped.
cd <- article_stats_long_c %>%
  inner_join(article_types_c) %>%
  mutate(type2 = str_c(article_part, " in ", type)) %>%
  union_all(
    article_stats_long_c %>%
      inner_join(article_types_c %>% rename(type2 = type))
  ) %>%
  left_join(conjunction_counts_c) %>%
  replace_na(list(
    nr_conjunctions = 0,
    nr_sentences_with_conjunctions = 0,
    nr_paragraphs_with_conjunctions = 0
  )) %>%
  group_by(a_id, type2) %>%
  summarize(
    nr_tokens = sum(nr_tokens),
    nr_conjunctions = sum(nr_conjunctions),
    nr_sentences = sum(nr_sentences),
    nr_paragraphs = sum(nr_paragraphs),
    nr_sentences_with_conjunctions = sum(nr_sentences_with_conjunctions),
    nr_paragraphs_with_conjunctions = sum(nr_paragraphs_with_conjunctions),
    .groups = "drop"
  ) %>%
  inner_join(articles_c %>% select(a_id, media, year_created)) %>%
  mutate(
    cpw = nr_conjunctions / nr_tokens,
    cps = nr_conjunctions / nr_sentences,
    cpp = nr_conjunctions / nr_paragraphs,
    cpa = nr_conjunctions,
    csps = nr_sentences_with_conjunctions / nr_sentences,
    cspp = nr_sentences_with_conjunctions / nr_paragraphs,
    cspa = nr_sentences_with_conjunctions,
    cppp = nr_paragraphs_with_conjunctions / nr_paragraphs,
    cppa = nr_paragraphs_with_conjunctions,
    capa = if_else(nr_conjunctions > 0, 1, 0)
  ) %>%
  select(year_created, media, type2, cpw:capa) %>%
  collect()

# First quartile (q1), mean (q2) and third quartile (q3) of every conjunction
# measure per year, outlet and type2, in long format with readable measure
# labels. Missing ratios are treated as 0 before summarising.
cd2 <- cd %>%
  replace_na(list(
    cpw = 0, cps = 0, cpp = 0, cpa = 0, csps = 0,
    cspp = 0, cspa = 0, cppp = 0, cppa = 0, capa = 0
  )) %>%
  group_by(year_created, media, type2) %>%
  summarize_at(
    vars(cpw:capa),
    list(
      q1 = ~quantile(.x, 0.25)[[1]],
      q2 = mean,
      q3 = ~quantile(.x, 0.75)[[1]]
    )
  ) %>%
  ungroup() %>%
  pivot_longer(cpw_q1:capa_q3) %>%
  separate(name, c("measure", "quantile"), sep = "_") %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    # Labels previously misspelled "paragrahs".
    measure = recode(
      measure,
      "cpw" = "Conjunctions/words",
      "cps" = "Conjunctions/sentences",
      "cpp" = "Conjunctions/paragraphs",
      "cpa" = "Conjunctions/articles",
      "csps" = "Sentences containing conjunctions/sentences",
      "cspp" = "Sentences containing conjunctions/paragraphs",
      "cspa" = "Sentences containing conjunctions/articles",
      "cppp" = "Paragraphs containing conjunctions/paragraphs",
      "cppa" = "Paragraphs containing conjunctions/articles",
      "capa" = "Articles containing conjunctions/articles"
    ),
    linetype = if_else(quantile == "q2", "mean", "1st/3rd quartile"),
    linetype = fct_relevel(linetype, "mean"),
    type2 = fct_relevel(
      type2,
      "Domestic general/political/economic news",
      "journalistic text in Domestic general/political/economic news",
      "indirect quotation in Domestic general/political/economic news",
      "direct quotation in Domestic general/political/economic news",
      "Local news",
      "journalistic text in Local news",
      "indirect quotation in Local news",
      "direct quotation in Local news",
      "Foreign news",
      "journalistic text in Foreign news",
      "indirect quotation in Foreign news",
      "direct quotation in Foreign news",
      "Culture/entertainment",
      "journalistic text in Culture/entertainment",
      "indirect quotation in Culture/entertainment",
      "direct quotation in Culture/entertainment",
      "Sports",
      "journalistic text in Sports",
      "indirect quotation in Sports",
      "direct quotation in Sports",
      "External opinion",
      "Journalistic opinion"
    )
  )
# Diagnostic plot: all conjunction measures, restricted to the "Other" article types.
ggplot(
  filter(cd2, str_detect(type2,"Other")),
  aes(
    x=year_created,
    y=value,
    color=media,
    group=interaction(measure,quantile,media),
    linetype=linetype
  )
) +
  geom_step() +
  facet_wrap(measure~type2,scales="free") +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom")
# One faceted plot per conjunction measure, leaving out the "Other" article types.
cd2 %>% 
  group_by(measure) %>%
  group_map(~.x %>%
    filter(!str_detect(type2,"Other")) %>%
    ggplot(aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)) +
    geom_step() +
    facet_wrap(~type2,scales="free") +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    labs(x="Year",y="Number of conjunctions") +
    # .y is the one-row tibble of group keys; pass the measure name itself
    # instead of relying on implicit coercion of the tibble to a label.
    ggtitle(.y$measure)
  )

Due to the lengthening of article texts causing complexities for analyses, all the following conjunction proportions were calculated and evaluated:

  • Conjunctions/words
  • Conjunctions/sentences
  • Conjunctions/paragraphs
  • Conjunctions/articles
  • Sentences containing conjunctions/sentences
  • Sentences containing conjunctions/paragraphs
  • Sentences containing conjunctions/articles
  • Paragraphs containing conjunctions/paragraphs
  • Paragraphs containing conjunctions/articles
  • Articles containing conjunctions/articles

Out of these, “Sentences containing conjunctions/sentences” was retained for final analysis:

Sentences containing conjunctions/sentences

# Draw the retained measure as two pages of facets (4 columns x 3 rows per page),
# leaving out "Other" and the per-part breakdowns of the opinion types.
map(1:2, function(page_nr) {
  retained <- cd2 %>%
    filter(measure=="Sentences containing conjunctions/sentences") %>%
    filter(
      !str_detect(type2,"Other"),
      !str_detect(type2,"in Journalistic opinion"),
      !str_detect(type2,"in External opinion")
    )
  ggplot(
    retained,
    aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)
  ) +
    geom_step() +
    facet_wrap_paginate(~type2,ncol=4,nrow=3,page=page_nr) +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    labs(x="Year",y="Sentences containing conjunctions/sentences")
})
[[1]]

[[2]]

Interpretations:

  • There is an increase in sentence constructions with conjunctions denoting interpretive subclauses, particularly in domestic news reporting and HS Kaupunki. The overall increase comes mostly from journalistic text and not from an increase in quotes.
---
title: "Development of interpretativeness in journalism"
author: "Eetu Mäkelä"
date: "`r Sys.Date()`"
output: 
  html_notebook:
    code_folding: hide
    toc: yes
---

# Setup

```{r setup,include=FALSE}
knitr::opts_knit$set(root.dir = here::here())

library(here)
source(here("code/common_basis.R"), local = knitr::knit_global())

library(tidyverse)
library(lubridate)
library(hms)
library(ggbeeswarm)
library(gghsci)
library(ggforce)
library(gt)

# Format a number for display: fixed (non-scientific) notation with a comma
# as the thousands separator.
p <- function(number) {
  format(number, big.mark = ",", scientific = FALSE)
}
# Format a proportion as a percentage string; `accuracy` is handed on
# unchanged to scales::percent().
pp <- function(percentage, accuracy = 0.01) {
  scales::percent(percentage, accuracy = accuracy)
}

```

## Analysis table creation (not run)

```{r,eval=FALSE}
# Tag articles that belong to an opinionated genre. Each outlet has its own
# nested case_when(); within one, the first matching rule wins. Articles that
# match no rule (or whose outlet has no rules here) get NA and are filtered out.
# compute_c() is defined outside this notebook — presumably it materialises the
# result as the named database table; confirm.
articles_opinionated_c <- articles_c %>% mutate(opinionated=case_when(
  # HS: section and story_logo identify editorials and opinion pieces; the
  # remaining genres are detected from a title prefix or the story logo.
  media == "HS" ~ case_when(
    str_to_lower(section) == "pääkirjoitus" & str_detect(str_to_lower(story_logo),"ieras") ~ "external editorial",
    str_to_lower(section) == "pääkirjoitus" & is.na(story_logo) ~ "editorial",
    str_to_lower(section) == "pääkirjoitus" & str_to_lower(story_logo) == "pääkirjoitus" ~ "editorial",
    str_to_lower(section) == "mielipide" | str_to_lower(story_logo) == "mielipide" ~ "external opinion",
    str_detect(str_to_lower(title),"analyysi:") | str_detect(str_to_lower(story_logo),"analyysi")  ~ "analysis",
    str_detect(str_to_lower(title),"näkökulma:") | str_detect(str_to_lower(story_logo),"näkökulma") ~ "perspective",
    str_detect(str_to_lower(title),"kolumni:") | str_detect(str_to_lower(story_logo),"kolumni") ~ "column",
    str_detect(str_to_lower(title),"blogi:") | str_detect(str_to_lower(story_logo),"blog") ~ "blog"    
  ),
  # IL: editorials come from the subsection, other genres from a title prefix.
  media == "IL" ~ case_when(
    subsection == "paakirjoitus" ~ "editorial",
    str_detect(str_to_lower(title),"kommentti:") ~ "commentary",
    str_detect(str_to_lower(title),"analyysi:") ~ "analysis",
    str_detect(str_to_lower(title),"kolumni:") ~ "column",
    str_detect(str_to_lower(title),"näkökulma:") ~ "perspective"
  ),
  # YLE: genres come from a title prefix or from keywords in the subject field.
  media == "YLE" ~ case_when(
    str_detect(str_to_lower(title),"kommentti:") ~ "commentary",
    str_detect(str_to_lower(title),"analyysi:") | str_detect(subject,"Analyysit \\(Yle Uutiset\\)") ~ "analysis",
    str_detect(str_to_lower(title),"kolumni:") | str_detect(str_to_lower(subject),"kolumn") ~ "column",
    str_detect(str_to_lower(title),"näkökulma:")  | str_detect(str_to_lower(subject),"näkökulm") ~ "perspective",
    str_detect(str_to_lower(title),"blogi:")  | str_detect(str_to_lower(subject),"blog") ~ "blog"
  )
)) %>%
  # Keep only articles that matched a rule, one row per article and genre.
  filter(!is.na(opinionated)) %>%
  distinct(a_id, opinionated) %>%
  compute_c(name="articles_opinionated_c")

# Assign every article one coarse type. case_when() takes the first matching
# rule, so the order matters: STT articles whose version is not "Loppuversio"
# become "Other", then opinionated genres are split into journalistic vs.
# external opinion, then the outlet-specific section/subject rules apply.
# Anything left over is "Other".
# NOTE(review): the table is stored as "article_types", without the _c suffix
# used for the other tables created in this chunk — confirm this is the name
# the loading code expects.
article_types_c <- articles_c %>% 
  left_join(articles_opinionated_c) %>%
  mutate(type=case_when(
    media == "STT" & version != "Loppuversio" ~ "Other",
    # Genres whose label starts with "external " are written by outsiders.
    !is.na(opinionated) & !str_detect(opinionated,"^external ") ~ "Journalistic opinion",
    !is.na(opinionated) ~ "External opinion",
    media == "HS" & section %in% c("Kotimaa", "Politiikka", "Talous") ~ "Domestic general/political/economic news",
    media == "IL" & subsection %in% c("kotimaa","politiikka","talous","uutiset") ~ "Domestic general/political/economic news",
    media == "STT" & section %in% c("Kotimaa","Politiikka","Talous") ~ "Domestic general/political/economic news",
    # YLE subjects can mix domestic and foreign tags; an explicit
    # "Kotimaan uutiset" tag overrides "Ulkomaat".
    media == "YLE" & section == "Yle Uutiset" & str_detect(subject,"Kotimaan uutiset|politiikka|talous") & (!str_detect(subject,"Ulkomaat") | str_detect(subject,"Kotimaan uutiset")) ~ "Domestic general/political/economic news",
    media == "HS" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "IL" & section == "viihde" ~ "Culture/entertainment",
    media == "STT" & section == "Kulttuuri" ~ "Culture/entertainment",
    media == "YLE" & section == "Yle Uutiset" & str_detect(subject,"kulttuuri|musiikki|viihde") & !str_detect(subject, "Ulkomaat") ~ "Culture/entertainment",
    media == "HS" & section == "Kaupunki" ~ "Local news",
    media == "YLE" & section == "Yle Uutiset" & coverage=="local" ~ "Local news",
    media == "STT" & section == "Urheilu" ~ "Sports",
    media == "HS" & section == "Urheilu" ~ "Sports",
    media == "YLE" & section == "YLE Urheilu" ~ "Sports",
    media == "IL" & section == "urheilu" ~ "Sports",
    media == "STT" & section == "Ulkomaat" ~ "Foreign news",
    media == "HS" & section == "Ulkomaat" ~ "Foreign news",
    media == "YLE" & section == "Yle Uutiset" & str_detect(subject,"Ulkomaat") & !str_detect(subject,"Kotimaan uutiset") ~ "Foreign news",
    media == "IL" & subsection == "ulkomaat" ~ "Foreign news",
    # Use the TRUE constant, not the reassignable shorthand T.
    TRUE ~ "Other"
  )) %>%
  distinct(a_id,type) %>%
  compute_c(name="article_types")

# Per article and quote kind (`direct`), count the tokens, sentences,
# paragraphs and quotes covered by quotations.
# The SQL join keeps each corpus token whose (s_id, pos) lies between a
# quote's start and end positions, both ends inclusive.
article_quotes_stats_c <- corpus_c %>% 
  inner_join(quotes_c,sql_on="LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))") %>%
  group_by(a_id=a_id.x, direct) %>%
  summarize(
    # 10000*s_id+pos folds sentence id and position into one token key;
    # presumably pos is always below 10000 — TODO confirm.
    nr_quote_tokens=n_distinct(10000L*s_id+pos),
    nr_sentences_with_quotes=n_distinct(s_id),
    nr_paragraphs_with_quotes=n_distinct(par_id),
    nr_quotes=n_distinct(q_id),
    .groups="drop") %>%
  compute_c(name="article_quotes_stats_c")

# One row per article: total token/sentence/paragraph counts plus the
# corresponding counts for direct quotes, indirect quotes and all quotes.
article_stats_c <- 
  corpus_c %>%
  group_by(a_id) %>%
  summarize(
    nr_tokens=n(),
    nr_sentences=n_distinct(s_id),
    nr_paragraphs=n_distinct(par_id),
    .groups="drop") %>%
  compute_c() %>%
  # Direct-quote statistics, renamed with a "direct" infix.
  left_join(
    article_quotes_stats_c %>%
      filter(direct==TRUE) %>%
      select(
        a_id,
        nr_direct_quote_tokens=nr_quote_tokens,
        nr_sentences_with_direct_quotes=nr_sentences_with_quotes,
        nr_paragraphs_with_direct_quotes=nr_paragraphs_with_quotes,
        nr_direct_quotes=nr_quotes)
  ) %>%
  # Indirect-quote statistics, renamed with an "indirect" infix.
  left_join(
    article_quotes_stats_c %>%
      filter(direct==FALSE) %>%
      select(
        a_id,
        nr_indirect_quote_tokens=nr_quote_tokens,
        nr_sentences_with_indirect_quotes=nr_sentences_with_quotes,
        nr_paragraphs_with_indirect_quotes=nr_paragraphs_with_quotes,
        nr_indirect_quotes=nr_quotes)
  ) %>% 
  # Statistics over all quotes, recomputed with distinct counts per article
  # rather than derived from the per-kind rows above.
  left_join(
    corpus_c %>% 
    inner_join(quotes_c,sql_on="LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))") %>%
    group_by(a_id=a_id.x) %>%
    summarize(
      nr_quote_tokens=n_distinct(10000L*s_id+pos),
      nr_sentences_with_quotes=n_distinct(s_id),
      nr_paragraphs_with_quotes=n_distinct(par_id),
      nr_quotes=n_distinct(q_id),
      .groups="drop") %>%
    compute_c()
  ) %>%
  # Articles without a match in the left joins get zero counts.
  replace_na(list(
    nr_direct_quote_tokens=0L,nr_sentences_with_direct_quotes=0L,nr_paragraphs_with_direct_quotes=0L,nr_direct_quotes=0L,
    nr_indirect_quote_tokens=0L,nr_sentences_with_indirect_quotes=0L,nr_paragraphs_with_indirect_quotes=0L,nr_indirect_quotes=0L,
    nr_quote_tokens=0L,nr_sentences_with_quotes=0L,nr_paragraphs_with_quotes=0L,nr_quotes=0L
    )) %>%
  compute_c(name="article_stats_c")

# Per article and article part, count adjective tokens and the sentences and
# paragraphs that contain at least one adjective.
adjective_counts_c <- words_c %>% 
  filter(upos=="ADJ") %>%
  inner_join(corpus_c) %>%
  # Label each token with the kind of quote it falls inside, if any; the SQL
  # condition tests whether (s_id, pos) lies within the quote's span.
  left_join(quotes_c %>% mutate(article_part=if_else(direct,"direct quotation","indirect quotation")),sql_on="LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))") %>%
  select(a_id=a_id.x,par_id,s_id,pos,article_part) %>%
  # Tokens that matched no quote belong to the journalist's own text.
  replace_na(list(article_part="journalistic text")) %>%
  group_by(a_id,article_part) %>%
  summarize(nr_adjectives=n_distinct(10000L*s_id+pos),nr_sentences_with_adjectives=n_distinct(s_id),nr_paragraphs_with_adjectives=n_distinct(par_id),.groups="drop") %>%
  compute_c(name="adjective_counts_c")

# Load the hedging marker list and give its three auto-named columns
# (...1, ...2, ...3) meaningful names.
hedging_markers <- rename(
  read_csv(here("data/input/hedging_markers.csv")),
  type=...1,
  value=...2,
  class=...3
)

# Per article and article part, count hedging tokens and the sentences and
# paragraphs that contain at least one. A token counts as a hedging when its
# lemma matches a marker value of the selected classes.
hedging_counts_c <- words_c %>% 
  inner_join(hedging_markers %>% 
               # NOTE(review): "Adverbs" is plural but "Adjective" singular —
               # confirm both spellings match the class values in the CSV.
               filter(class %in% c("Adverbs","Adjective")) %>%
               select(lemma=value) %>% copy_to_c(con)) %>%
  inner_join(corpus_c) %>%
  # Label each token with the kind of quote it falls inside, if any.
  left_join(quotes_c %>% mutate(article_part=if_else(direct,"direct quotation","indirect quotation")),sql_on="LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))") %>%
  # Tokens that matched no quote belong to the journalist's own text.
  replace_na(list(article_part="journalistic text")) %>%
  select(a_id=a_id.x,par_id,s_id,pos, article_part) %>%
  group_by(a_id,article_part) %>%
  summarize(nr_hedgings=n_distinct(10000L*s_id+pos),nr_sentences_with_hedgings=n_distinct(s_id),nr_paragraphs_with_hedgings=n_distinct(par_id),.groups="drop") %>%
  compute_c(name="hedging_counts_c")
```

## Analysis table loads

```{r}
# Reference the analysis tables built in the (not run) chunk above by name
# through the `con` connection.
article_stats_c <- tbl(con,"article_stats_c")
article_quotes_stats_c <- tbl(con,"article_quotes_stats_c")
adjective_counts_c <- tbl(con,"adjective_counts_c")
hedging_counts_c <- tbl(con,"hedging_counts_c")

# Long-format article statistics: up to three rows per article, one for each
# article part (journalistic text, direct quotation, indirect quotation),
# each with its own nr_tokens / nr_sentences / nr_paragraphs.
article_stats_long_c <-
  article_stats_c %>%
  # Journalistic text = article totals minus everything covered by any quote.
  mutate(nr_tokens=nr_tokens-nr_quote_tokens,nr_sentences=nr_sentences-nr_sentences_with_quotes,nr_paragraphs=nr_paragraphs-nr_paragraphs_with_quotes) %>%
  # The range selection assumes nr_tokens, nr_sentences and nr_paragraphs are
  # adjacent columns in the stored table — TODO confirm.
  select(a_id,nr_tokens:nr_paragraphs) %>%
  mutate(article_part='journalistic text') %>%
  # Direct quotation rows, only for articles that contain direct quotes.
  union_all(
    article_stats_c %>%
      filter(nr_direct_quote_tokens>0) %>%
      select(a_id,nr_tokens=nr_direct_quote_tokens,nr_sentences=nr_sentences_with_direct_quotes,nr_paragraphs=nr_paragraphs_with_direct_quotes) %>% 
      mutate(article_part='direct quotation')
  ) %>%
  # Indirect quotation rows, only for articles that contain indirect quotes.
  union_all(
    article_stats_c %>%
      filter(nr_indirect_quote_tokens>0) %>%
      select(a_id,nr_tokens=nr_indirect_quote_tokens,nr_sentences=nr_sentences_with_indirect_quotes,nr_paragraphs=nr_paragraphs_with_indirect_quotes) %>% 
      mutate(article_part='indirect quotation')
  )  
```


# Analysis

## Opinionated genres

```{r}
# Yearly share of journalistic opinion pieces among all classified
# (non-"Other") articles, one step line per outlet.
# NOTE(review): article_types_a / articles_a are not defined in the chunks
# shown here (the rest of the notebook uses the *_c tables) — presumably they
# come from code/common_basis.R; confirm.
article_types_a %>%
  filter(type!="Other") %>%
  inner_join(select(articles_a, a_id, year_created, media)) %>%
  group_by(year_created,media) %>%
  # .groups="drop" already returns an ungrouped result.
  summarize(share=sum(type=="Journalistic opinion")/n(),.groups="drop") %>%
  ggplot(aes(x=year_created,y=share,color=media)) +
  geom_step() +
  theme_hsci_discrete() +
  scale_x_continuous(breaks=seq(2000,2020,by=2)) +
  scale_y_continuous(breaks=seq(0,1,by=0.01),labels=scales::percent_format(accuracy=1)) +
  labs(x="Year",y="Percentage of articles being journalistic opinion")
```

Interpretations:

 - The huge bump in opinionated articles for HS comes from a huge increase in columns. Will need to check whether we are picking up columns accurately before 2013. However, general trend towards having more opinionated texts is clear across the outlets.

## Article length development

```{r}
# Per-article length data (tokens, sentences, paragraphs) with outlet, year
# and article type attached; collect() brings the result into local memory.
ld <- article_stats_c %>% 
  inner_join(articles_c %>% select(a_id,media,year_created)) %>%
  inner_join(article_types_c) %>%
  select(media,type,year_created,nr_tokens,nr_sentences,nr_paragraphs) %>% 
  collect()

# Yearly length statistics per outlet and article type in long format:
# 1st quartile, mean (stored as "q2") and 3rd quartile of each length measure.
ld2 <- ld %>%
  group_by(year_created,media,type) %>%
  summarize_all(list(
    q1=function(v) quantile(v,0.25)[[1]],
    q2=mean,
    q3=function(v) quantile(v,0.75)[[1]]
  )) %>%
  ungroup() %>%
  pivot_longer(nr_tokens_q1:nr_paragraphs_q3) %>%
  # Names look like "nr_tokens_q1"; the leading "nr" piece is discarded.
  separate(name,c(NA,"measure","quantile"),sep="_") %>%
  mutate(
    media=fct_relevel(media, "HS","STT"),
    linetype=fct_relevel(
      if_else(quantile=="q2","mean","1st/3rd quartile"),
      "mean"
    ),
    type=fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    ),
    measure=recode(
      measure,
      "tokens" = "Words",
      "sentences" = "Sentences",
      "paragraphs" = "Paragraphs"
    )
  )

```

```{r,fig.width=8}
# One faceted plot per length measure (words, sentences, paragraphs),
# leaving out the "Other" article type.
ld2 %>% 
  group_by(measure) %>%
  group_map(~.x %>%
    filter(!str_detect(type,"Other")) %>%
    ggplot(aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)) +
    geom_step() +
    facet_wrap(~type,scales="free") +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
#    scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
    # .y is the one-row tibble of group keys; use the measure name itself as
    # the axis label instead of relying on implicit coercion of the tibble.
    labs(x="Year",y=.y$measure)
  )
```


```{r}
# Yearly totals of tokens, sentences and paragraphs per outlet and article
# type; the ratio plots that follow divide these totals.
ld3 <- ld %>%
  group_by(year_created,media,type) %>%
  summarize_all(sum) %>%
  ungroup() %>%
  mutate(
    type=fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    )
  )
```


```{r,eval=FALSE}
# Sentences per paragraph by year, outlet and article type ("Other" excluded).
ggplot(
  filter(ld3, !str_detect(type,"Other")),
  aes(x=year_created,y=nr_sentences/nr_paragraphs,color=media)
) +
  geom_step() +
  facet_wrap(~type,scales="free") +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom") +
  scale_x_continuous(breaks=seq(2000,2020,by=4)) +
  #scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
  labs(x="Year",y="Sentences/paragraph")
```


```{r,fig.width=8}
# Words per sentence by year, outlet and article type ("Other" excluded).
ggplot(
  filter(ld3, !str_detect(type,"Other")),
  aes(x=year_created,y=nr_tokens/nr_sentences,color=media)
) +
  geom_step() +
  facet_wrap(~type,scales="free") +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom") +
  scale_x_continuous(breaks=seq(2000,2020,by=4)) +
  #scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
  labs(x="Year",y="Words/sentence")
```

Interpretations:

 - After 2012, stories get longer across the board, both in number of sentences/paragraphs as well as words per sentence
 - Before 2012 there has been a modest drive toward shorter articles. The cause of this is unclear. However, this needs to be kept in mind when interpreting the following main analyses, which often also show diminishing signals before 2012

## Proportion of articles being quotations
 
```{r}
# Per-article share of tokens that fall inside any quote, with outlet, year
# and article type attached; collected into local memory.
# NOTE(review): the division runs before collect(), i.e. presumably in the
# database — confirm the backend does not apply integer division here.
qd <- article_stats_c %>% 
  mutate(quote_proportion=nr_quote_tokens/nr_tokens) %>%
  inner_join(articles_c %>% select(a_id,media,year_created)) %>%
  inner_join(article_types_c) %>%
  select(media,type,year_created,quote_proportion) %>% 
  collect()

# Yearly quote-proportion statistics per outlet and article type in long
# format: 1st quartile, mean (stored as "q2") and 3rd quartile.
qd2 <- qd %>%
  group_by(year_created,media,type) %>%
  summarize_all(list(
    q1=function(v) quantile(v,0.25)[[1]],
    q2=mean,
    q3=function(v) quantile(v,0.75)[[1]]
  )) %>%
  ungroup() %>%
  pivot_longer(q1:q3) %>%
  mutate(
    media=fct_relevel(media, "HS","STT"),
    linetype=fct_relevel(
      if_else(name=="q2","mean","1st/3rd quartile"),
      "mean"
    ),
    type=fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    )
  )

# Like qd, but split by quote kind: one row per article and value of `direct`
# (only articles that have quotes of that kind appear, because of the inner
# join with article_quotes_stats_c). Collected into local memory.
qd3 <- article_stats_c %>% 
  select(a_id,nr_tokens) %>%
  inner_join(article_quotes_stats_c) %>%
  mutate(quote_proportion=nr_quote_tokens/nr_tokens) %>%
  inner_join(articles_c %>% select(a_id,media,year_created)) %>%
  inner_join(article_types_c) %>%
  select(media,direct,type,year_created,quote_proportion) %>% 
  collect()

# Yearly quote-proportion statistics per quote kind, outlet and article type
# in long format: 1st quartile, mean (stored as "q2") and 3rd quartile.
qd4 <- qd3 %>%
  group_by(direct,year_created,media,type) %>%
  summarize_all(list(
    q1=function(v) quantile(v,0.25)[[1]],
    q2=mean,
    q3=function(v) quantile(v,0.75)[[1]]
  )) %>%
  ungroup() %>%
  pivot_longer(q1:q3) %>%
  mutate(
    media=fct_relevel(media, "HS","STT"),
    linetype=fct_relevel(
      if_else(name=="q2","mean","1st/3rd quartile"),
      "mean"
    ),
    type=fct_relevel(
      type,
      "External opinion",
      "Journalistic opinion",
      "Domestic general/political/economic news",
      "Local news",
      "Foreign news",
      "Culture/entertainment",
      "Sports"
    )
  )

# Share of direct vs. indirect rows per year, outlet and article type.
# NOTE(review): qd3 holds one row per article and quote kind, so n counts
# articles containing at least one quote of that kind, not individual
# quotes — keep this in mind when labelling plots of p.
qd5 <- qd3 %>%
  count(direct,year_created,media,type) %>%
  group_by(year_created,media,type) %>%
  mutate(p=n/sum(n)) %>%
  # Drop the grouping so later verbs on qd5 are not silently applied per group.
  ungroup()
```
 
### Overall

```{r,fig.width=8}
# Quote proportion (mean and quartiles) by year, outlet and article type,
# with the "Other" type left out.
ggplot(
  filter(qd2, type!="Other"),
  aes(
    x=year_created,
    y=value,
    color=media,
    group=interaction(name,media),
    linetype=linetype
  )
) +
  geom_step() +
  facet_wrap(~type) +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom") +
  scale_x_continuous(breaks=seq(2000,2020,by=4)) +
  scale_y_continuous(breaks=seq(0,1,by=0.1),labels=scales::percent_format(accuracy=1)) +
  labs(x="Year",y="Proportion of quotes in articles")
```

### Proportion of quotes that are direct

```{r,fig.width=8}
# Yearly share p of the direct (direct == 1) rows of qd5, per outlet and
# article type.
ggplot(
  filter(qd5, direct==1),
  aes(x=year_created,y=p,color=media)
) +
  geom_step() +
  facet_wrap(~type) +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom") +
  scale_x_continuous(breaks=seq(2000,2020,by=4)) +
  scale_y_continuous(breaks=seq(0,1,by=0.1),labels=scales::percent_format(accuracy=1)) +
  labs(x="Year",y="Proportion of quotes that are direct")
```

 Interpretations:
 
  - Usage of quotation constructs is increasing
  - Particularly direct quotes are used more
  
## Adjective proportions

```{r}
# Per-article adjective measures, collected into local memory.
# Two sets of rows are combined: one per article part, labelled
# "<article_part> in <type>", and (via the union) the same part rows labelled
# with the plain article type, so that summing per (a_id, type2) also yields
# whole-article totals.
d <- article_stats_long_c %>% 
  left_join(adjective_counts_c) %>%
  # Parts without any adjective get zero counts.
  replace_na(list(nr_adjectives=0,nr_sentences_with_adjectives=0,nr_paragraphs_with_adjectives=0)) %>%
  inner_join(article_types_c) %>%
  mutate(type2=str_c(article_part," in ",type)) %>%
  union_all(
    article_stats_long_c %>% 
    left_join(adjective_counts_c) %>%
    replace_na(list(nr_adjectives=0,nr_sentences_with_adjectives=0,nr_paragraphs_with_adjectives=0)) %>%
    inner_join(article_types_c %>% rename(type2=type))
  ) %>%
  group_by(a_id,type2) %>%
  summarize(nr_tokens=sum(nr_tokens),nr_adjectives=sum(nr_adjectives),nr_sentences=sum(nr_sentences),nr_paragraphs=sum(nr_paragraphs),nr_sentences_with_adjectives=sum(nr_sentences_with_adjectives),nr_paragraphs_with_adjectives=sum(nr_paragraphs_with_adjectives),.groups="drop") %>%
  inner_join(articles_c %>% select(a_id,media,year_created)) %>%
  # Measure codes: a = adjectives, as = sentences with adjectives,
  # ap = paragraphs with adjectives, aa = articles with adjectives;
  # the suffix pw/ps/pp/pa means per word/sentence/paragraph/article.
  mutate(
    apw=nr_adjectives/nr_tokens,
    aps=nr_adjectives/nr_sentences,
    app=nr_adjectives/nr_paragraphs,
    apa=nr_adjectives,
    asps=nr_sentences_with_adjectives/nr_sentences,
    aspp=nr_sentences_with_adjectives/nr_paragraphs,
    aspa=nr_sentences_with_adjectives,
    appp=nr_paragraphs_with_adjectives/nr_paragraphs,
    appa=nr_paragraphs_with_adjectives,
    aapa=if_else(nr_adjectives>0,1,0)
  ) %>%
  select(year_created,media,type2,apw:aapa) %>%
  collect()

# Summarise the per-article adjective measures in `d` into yearly statistics
# (1st quartile, mean, 3rd quartile) per outlet and article type, reshaped to
# long format for plotting.
d2 <- d %>%
  # Missing measure values are treated as 0.
  replace_na(list(apw=0,aps=0,app=0,apa=0,asps=0,aspp=0,aspa=0,appp=0,appa=0,aapa=0)) %>%
  group_by(year_created,media,type2) %>% 
  # NB: despite its name, q2 is the mean, not the median.
  summarize_at(vars(apw:aapa),list(q1=~quantile(.x,0.25)[[1]],q2=mean,q3=~quantile(.x,0.75)[[1]])) %>%
  ungroup() %>%
  pivot_longer(apw_q1:aapa_q3) %>%
  # Column names look like "apw_q1": split into measure code and statistic.
  separate(name,c("measure","quantile"),sep="_") %>%
  mutate(
    media=fct_relevel(media, "HS","STT"),
    # Human-readable measure names (used as plot titles and in filters below).
    measure=recode(measure,
      "apw" = "Adjectives/words",
      "aps" = "Adjectives/sentences",
      "app" = "Adjectives/paragraphs",
      "apa" = "Adjectives/articles",
      "asps" = "Sentences containing adjectives/sentences",
      "aspp" = "Sentences containing adjectives/paragraphs",
      "aspa" = "Sentences containing adjectives/articles",
      "appp" = "Paragraphs containing adjectives/paragraphs",
      "appa" = "Paragraphs containing adjectives/articles",
      "aapa" = "Articles containing adjectives/articles"
    ),
    # Same legend wording as the other quartile-based summaries.
    linetype=if_else(quantile=="q2","mean","1st/3rd quartile"),
    linetype=fct_relevel(linetype,"mean"),
    # Facet order: each article type followed by its three article parts.
    type2=fct_relevel(type2,
      "Domestic general/political/economic news",
      "journalistic text in Domestic general/political/economic news",
      "indirect quotation in Domestic general/political/economic news",
      "direct quotation in Domestic general/political/economic news",
      "Local news",
      "journalistic text in Local news",
      "indirect quotation in Local news",
      "direct quotation in Local news",
      "Foreign news",
      "journalistic text in Foreign news",
      "indirect quotation in Foreign news",
      "direct quotation in Foreign news",
      "Culture/entertainment",
      "journalistic text in Culture/entertainment",
      "indirect quotation in Culture/entertainment",
      "direct quotation in Culture/entertainment",
      "Sports",
      "journalistic text in Sports",
      "indirect quotation in Sports",
      "direct quotation in Sports",
      "External opinion",
      "Journalistic opinion")
    )
```

```{r,eval=FALSE}
# Diagnostic plot: all adjective measures, restricted to the "Other" article types.
ggplot(
  filter(d2, str_detect(type2,"Other")),
  aes(
    x=year_created,
    y=value,
    color=media,
    group=interaction(measure,quantile,media),
    linetype=linetype
  )
) +
  geom_step() +
  facet_wrap(measure~type2,scales="free") +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom")
```

```{r,fig.width=8,fig.height=11,eval=FALSE}
# One faceted plot per adjective measure, leaving out the "Other" article types.
d2 %>% 
  group_by(measure) %>%
  group_map(~.x %>%
    filter(!str_detect(type2,"Other")) %>%
    ggplot(aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)) +
    geom_step() +
    facet_wrap(~type2,scales="free") +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
#    scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
    labs(x="Year",y="Number of adjectives") +
    # .y is the one-row tibble of group keys; pass the measure name itself
    # instead of relying on implicit coercion of the tibble to a label.
    ggtitle(.y$measure)
  )
```
Due to the lengthening of article texts causing complexities for analyses, all the following adjective proportions were calculated and evaluated:
 
 - Adjectives/words
 - Adjectives/sentences
 - Adjectives/paragraphs
 - Adjectives/articles
 - Sentences containing adjectives/sentences
 - Sentences containing adjectives/paragraphs
 - Sentences containing adjectives/articles
 - Paragraphs containing adjectives/paragraphs
 - Paragraphs containing adjectives/articles
 - Articles containing adjectives/articles
 
Out of these, "Adjectives/words" and "Adjectives/sentences" were retained for final analysis:

### Adjectives/words
```{r,fig.width=8,fig.height=11}
# Draw "Adjectives/words" as two pages of facets (4 columns x 3 rows per page),
# leaving out "Other" and the per-part breakdowns of the opinion types.
map(1:2, function(page_nr) {
  retained <- d2 %>%
    filter(measure=="Adjectives/words") %>%
    filter(
      !str_detect(type2,"Other"),
      !str_detect(type2,"in Journalistic opinion"),
      !str_detect(type2,"in External opinion")
    )
  ggplot(
    retained,
    aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)
  ) +
    geom_step() +
    facet_wrap_paginate(~type2,ncol=4,nrow=3,page=page_nr) +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    labs(x="Year",y="Adjectives/words")
})
```

### Adjectives/sentences
```{r,fig.width=8,fig.height=11}
# Draw "Adjectives/sentences" as two pages of facets (4 columns x 3 rows per
# page), leaving out "Other" and the per-part breakdowns of the opinion types.
map(1:2, function(page_nr) {
  retained <- d2 %>%
    filter(measure=="Adjectives/sentences") %>%
    filter(
      !str_detect(type2,"Other"),
      !str_detect(type2,"in Journalistic opinion"),
      !str_detect(type2,"in External opinion")
    )
  ggplot(
    retained,
    aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)
  ) +
    geom_step() +
    facet_wrap_paginate(~type2,ncol=4,nrow=3,page=page_nr) +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    labs(x="Year",y="Adjectives/sentences")
})
```

Interpretations:

 - In general, adjective use seems to decline. In quotes, this happens throughout.
 - However, in the case of journalistic writing, there are the following intriguing aberrations from this:
   - adjective use increases significantly after 2010 in HS & IL sports journalistic writing
   - adjective use increases after 2010 in the journalistic portion of domestic general/political/economic news texts in all outlets
   - adjective use increases in HS local reporting (Helsinki), but not in YLE regional reporting
 - Altogether, these increases in journalistic writing lead to similar increases in the articles overall.
   
## Hedging proportions

```{r}
# Per-article hedging measures, collected into local memory.
# Two sets of rows are combined: one per article part, labelled
# "<article_part> in <type>", and (via the union) the same part rows labelled
# with the plain article type, so that summing per (a_id, type2) also yields
# whole-article totals.
hd <- article_stats_long_c %>% 
  inner_join(article_types_c) %>%
  mutate(type2=str_c(article_part," in ",type)) %>%
  union_all(
    article_stats_long_c %>% 
    inner_join(article_types_c %>% rename(type2=type))
  ) %>%
  # Natural join on the shared columns (presumably a_id and article_part —
  # TODO confirm); parts without any hedging get zero counts.
  left_join(hedging_counts_c) %>%
  replace_na(list(nr_hedgings=0,nr_sentences_with_hedgings=0,nr_paragraphs_with_hedgings=0)) %>%
  group_by(a_id,type2) %>%
  summarize(nr_tokens=sum(nr_tokens),nr_hedgings=sum(nr_hedgings),nr_sentences=sum(nr_sentences),nr_paragraphs=sum(nr_paragraphs),nr_sentences_with_hedgings=sum(nr_sentences_with_hedgings),nr_paragraphs_with_hedgings=sum(nr_paragraphs_with_hedgings),.groups="drop") %>%
  inner_join(articles_c %>% select(a_id,media,year_created)) %>%
  # Measure codes: h = hedgings, hs = sentences with hedgings,
  # hp = paragraphs with hedgings, ha = articles with hedgings;
  # the suffix pw/ps/pp/pa means per word/sentence/paragraph/article.
  mutate(
    hpw=nr_hedgings/nr_tokens,
    hps=nr_hedgings/nr_sentences,
    hpp=nr_hedgings/nr_paragraphs,
    hpa=nr_hedgings,
    hsps=nr_sentences_with_hedgings/nr_sentences,
    hspp=nr_sentences_with_hedgings/nr_paragraphs,
    hspa=nr_sentences_with_hedgings,
    hppp=nr_paragraphs_with_hedgings/nr_paragraphs,
    hppa=nr_paragraphs_with_hedgings,
    hapa=if_else(nr_hedgings>0,1,0)
  ) %>%
  select(year_created,media,type2,hpw:hapa) %>%
  collect()

# Summarise the per-article hedging measures in `hd` into yearly statistics
# per outlet and article type, reshaped to long format for plotting.
# Unlike the adjective summary, the band here is the 10%/90% quantile range.
hd2 <- hd %>%
  # Missing measure values are treated as 0.
  replace_na(list(hpw=0,hps=0,hpp=0,hpa=0,hsps=0,hspp=0,hspa=0,hppp=0,hppa=0,hapa=0)) %>%
  group_by(year_created,media,type2) %>% 
  # NB: q1/q3 hold the 0.1 and 0.9 quantiles here, and q2 is the mean.
  summarize_at(vars(hpw:hapa),list(q1=~quantile(.x,0.1)[[1]],q2=mean,q3=~quantile(.x,0.9)[[1]])) %>%
  ungroup() %>%
  pivot_longer(hpw_q1:hapa_q3) %>%
  # Column names look like "hpw_q1": split into measure code and statistic.
  separate(name,c("measure","quantile"),sep="_") %>%
  mutate(
    media=fct_relevel(media, "HS","STT"),
    # Human-readable measure names (used as plot titles and in filters below).
    measure=recode(measure,
      "hpw" = "Hedgings/words",
      "hps" = "Hedgings/sentences",
      "hpp" = "Hedgings/paragraphs",
      "hpa" = "Hedgings/articles",
      "hsps" = "Sentences containing hedgings/sentences",
      "hspp" = "Sentences containing hedgings/paragraphs",
      "hspa" = "Sentences containing hedgings/articles",
      "hppp" = "Paragraphs containing hedgings/paragraphs",
      "hppa" = "Paragraphs containing hedgings/articles",
      "hapa" = "Articles containing hedgings/articles"
    ),
    # The 0.1 and 0.9 quantiles are the 1st and 9th deciles.
    linetype=if_else(quantile=="q2","mean","1st/9th decile"),
    linetype=fct_relevel(linetype,"mean"),
    # Facet order: each article type followed by its three article parts.
    type2=fct_relevel(type2,
      "Domestic general/political/economic news",
      "journalistic text in Domestic general/political/economic news",
      "indirect quotation in Domestic general/political/economic news",
      "direct quotation in Domestic general/political/economic news",
      "Local news",
      "journalistic text in Local news",
      "indirect quotation in Local news",
      "direct quotation in Local news",
      "Foreign news",
      "journalistic text in Foreign news",
      "indirect quotation in Foreign news",
      "direct quotation in Foreign news",
      "Culture/entertainment",
      "journalistic text in Culture/entertainment",
      "indirect quotation in Culture/entertainment",
      "direct quotation in Culture/entertainment",
      "Sports",
      "journalistic text in Sports",
      "indirect quotation in Sports",
      "direct quotation in Sports",
      "External opinion",
      "Journalistic opinion")
    )
```

```{r,eval=FALSE}
# Diagnostic plot: all hedging measures, restricted to the "Other" article types.
ggplot(
  filter(hd2, str_detect(type2,"Other")),
  aes(
    x=year_created,
    y=value,
    color=media,
    group=interaction(measure,quantile,media),
    linetype=linetype
  )
) +
  geom_step() +
  facet_wrap(measure~type2,scales="free") +
  theme_hsci_discrete(base_family="Arial") +
  theme(legend.position="bottom")
```

```{r,fig.width=8,fig.height=11,eval=FALSE}
# One faceted plot per hedging measure, leaving out the "Other" article types.
hd2 %>% 
  group_by(measure) %>%
  group_map(~.x %>%
    filter(!str_detect(type2,"Other")) %>%
    ggplot(aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)) +
    geom_step() +
    facet_wrap(~type2,scales="free") +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
#    scale_y_continuous(breaks=seq(0,1,by=0.02), labels=scales::percent_format(accuracy=1)) +
    labs(x="Year",y="Number of hedgings") +
    # .y is the one-row tibble of group keys; pass the measure name itself
    # instead of relying on implicit coercion of the tibble to a label.
    ggtitle(.y$measure)
  )
```
Due to the lengthening of article texts causing complexities for analyses, all the following hedging proportions were calculated and evaluated:
 
 - Hedgings/words
 - Hedgings/sentences
 - Hedgings/paragraphs
 - Hedgings/articles
 - Sentences containing hedgings/sentences
 - Sentences containing hedgings/paragraphs
 - Sentences containing hedgings/articles
 - Paragraphs containing hedgings/paragraphs
 - Paragraphs containing hedgings/articles
 - Articles containing hedgings/articles
 
Out of these, "Hedgings/words" and "Hedgings/sentences" were retained for final analysis:

### Hedgings/words
```{r,fig.width=8,fig.height=11}
# Draw "Hedgings/words" as two pages of facets (4 columns x 3 rows per page),
# leaving out "Other" and the per-part breakdowns of the opinion types.
map(1:2, function(page_nr) {
  retained <- hd2 %>%
    filter(measure=="Hedgings/words") %>%
    filter(
      !str_detect(type2,"Other"),
      !str_detect(type2,"in Journalistic opinion"),
      !str_detect(type2,"in External opinion")
    )
  ggplot(
    retained,
    aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)
  ) +
    geom_step() +
    facet_wrap_paginate(~type2,ncol=4,nrow=3,page=page_nr) +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    labs(x="Year",y="Hedgings/words")
})
```

### Hedgings/sentences
```{r,fig.width=8,fig.height=11}
# Draw "Hedgings/sentences" as two pages of facets (4 columns x 3 rows per
# page), leaving out "Other" and the per-part breakdowns of the opinion types.
map(1:2, function(page_nr) {
  retained <- hd2 %>%
    filter(measure=="Hedgings/sentences") %>%
    filter(
      !str_detect(type2,"Other"),
      !str_detect(type2,"in Journalistic opinion"),
      !str_detect(type2,"in External opinion")
    )
  ggplot(
    retained,
    aes(x=year_created,y=value,color=media,group=interaction(quantile,media),linetype=linetype)
  ) +
    geom_step() +
    facet_wrap_paginate(~type2,ncol=4,nrow=3,page=page_nr) +
    theme_hsci_discrete(base_family="Arial") +
    theme(legend.position="bottom") +
    scale_x_continuous(breaks=seq(2000,2020,by=4)) +
    labs(x="Year",y="Hedgings/sentences")
})
```

Interpretations:

 - Again, one can see an increase in the use of hedgings in the journalistic text of domestic general/political/economic news and HS local news (Kaupunki), and this time also in foreign news. In addition, the increased use of direct quotes adds further hedging to the articles as a whole.
 
## Usage of selected conjunctions 

The following conjunctions were tracked:

 - subordinate conjunctions jotta, koska, kun, jos, vaikka, kunnes, mikäli
 - sillä, mutta

```{r,eval=FALSE}
# Count occurrences of the tracked conjunctions per article and article part
# (direct quotation / indirect quotation / journalistic text).
conjunction_counts_c <- words_c %>%
  # Keep only tokens whose lemma is one of the tracked conjunctions.
  filter(lemma %in% c("jotta","koska","kun","jos","vaikka","kunnes","mikäli","sillä","mutta")) %>%
  # Natural join on the shared columns; presumably restricts the tokens to the
  # articles of the analysis corpus -- TODO confirm against corpus_c.
  inner_join(corpus_c) %>%
  # Attach a quote (labelled direct/indirect) to every token of the same article
  # whose (s_id, pos) position lies between the quote's start and end positions.
  left_join(quotes_c %>% mutate(article_part=if_else(direct,"direct quotation","indirect quotation")),sql_on="LHS.a_id=RHS.a_id AND (s_id > start_s_id OR (s_id=start_s_id AND pos >= start_pos)) AND (s_id < end_s_id OR (s_id=end_s_id AND pos <= end_pos))") %>%
  # Tokens that matched no quote belong to the journalistic text.
  replace_na(list(article_part="journalistic text")) %>%
  select(a_id=a_id.x,par_id,s_id,pos, article_part) %>%
  group_by(a_id,article_part) %>%
  # 10000L*s_id+pos folds sentence id and position into one key for counting
  # distinct tokens; assumes pos stays below 10000 -- TODO confirm.
  summarize(nr_conjunctions=n_distinct(10000L*s_id+pos),nr_sentences_with_conjunctions=n_distinct(s_id),nr_paragraphs_with_conjunctions=n_distinct(par_id),.groups="drop") %>%
  # compute_c is a project helper; presumably stores the result as a table with
  # the given name -- verify against its definition.
  compute_c(name="conjunction_counts_c")
```

```{r}
# Reference the stored "conjunction_counts_c" table through the connection `con`.
conjunction_counts_c <- con %>% tbl("conjunction_counts_c")
```

```{r}
# Per-article conjunction measures, computed both per article part within an
# article type ("<part> in <type>") and for the article type as a whole.
cd <- article_stats_long_c %>%
  inner_join(article_types_c) %>%
  mutate(type2 = str_c(article_part, " in ", type)) %>%
  union_all(
    # The same rows again, labelled with the bare article type, so that the
    # sums below also produce totals over all parts of an article.
    inner_join(article_stats_long_c, rename(article_types_c, type2 = type))
  ) %>%
  # NOTE(review): no hedging column is used further down; this join looks like
  # a leftover from the hedging analysis -- confirm before removing.
  left_join(hedging_counts_c) %>%
  left_join(conjunction_counts_c) %>%
  # Rows without a match in conjunction_counts_c count as zero conjunctions.
  replace_na(list(
    nr_conjunctions = 0,
    nr_sentences_with_conjunctions = 0,
    nr_paragraphs_with_conjunctions = 0
  )) %>%
  group_by(a_id, type2) %>%
  summarize(
    nr_tokens = sum(nr_tokens),
    nr_conjunctions = sum(nr_conjunctions),
    nr_sentences = sum(nr_sentences),
    nr_paragraphs = sum(nr_paragraphs),
    nr_sentences_with_conjunctions = sum(nr_sentences_with_conjunctions),
    nr_paragraphs_with_conjunctions = sum(nr_paragraphs_with_conjunctions),
    .groups = "drop"
  ) %>%
  inner_join(select(articles_c, a_id, media, year_created)) %>%
  # Naming scheme: c = conjunctions, cs = sentences with conjunctions,
  # cp = paragraphs with conjunctions, ca = article has conjunctions;
  # the suffix pw/ps/pp/pa = per word/sentence/paragraph/article.
  mutate(
    cpw = nr_conjunctions / nr_tokens,
    cps = nr_conjunctions / nr_sentences,
    cpp = nr_conjunctions / nr_paragraphs,
    cpa = nr_conjunctions,
    csps = nr_sentences_with_conjunctions / nr_sentences,
    cspp = nr_sentences_with_conjunctions / nr_paragraphs,
    cspa = nr_sentences_with_conjunctions,
    cppp = nr_paragraphs_with_conjunctions / nr_paragraphs,
    cppa = nr_paragraphs_with_conjunctions,
    capa = if_else(nr_conjunctions > 0, 1, 0)
  ) %>%
  select(year_created, media, type2, cpw:capa) %>%
  collect()

# Yearly summary of every conjunction measure per media and `type2`, reshaped
# to long form (one row per year/media/type2/measure/statistic) for plotting.
# Note that "q2" holds the mean, not the median; q1 and q3 are the quartiles.
cd2 <- cd %>%
  # Missing measure values are treated as 0 (presumably they stem from zero
  # denominators in the ratios -- confirm).
  replace_na(list(
    cpw = 0, cps = 0, cpp = 0, cpa = 0,
    csps = 0, cspp = 0, cspa = 0,
    cppp = 0, cppa = 0,
    capa = 0
  )) %>%
  group_by(year_created, media, type2) %>%
  summarize(
    across(
      cpw:capa,
      list(
        q1 = ~ quantile(.x, 0.25)[[1]],
        q2 = mean,
        q3 = ~ quantile(.x, 0.75)[[1]]
      )
    ),
    .groups = "drop"
  ) %>%
  pivot_longer(cpw_q1:capa_q3) %>%
  # Column names look like "<measure>_<statistic>", e.g. "cpw_q1".
  separate(name, c("measure", "quantile"), sep = "_") %>%
  mutate(
    media = fct_relevel(media, "HS", "STT"),
    measure = recode(measure,
      "cpw" = "Conjunctions/words",
      "cps" = "Conjunctions/sentences",
      "cpp" = "Conjunctions/paragraphs",
      "cpa" = "Conjunctions/articles",
      "csps" = "Sentences containing conjunctions/sentences",
      "cspp" = "Sentences containing conjunctions/paragraphs",
      "cspa" = "Sentences containing conjunctions/articles",
      "cppp" = "Paragraphs containing conjunctions/paragraphs",
      "cppa" = "Paragraphs containing conjunctions/articles",
      "capa" = "Articles containing conjunctions/articles"
    ),
    linetype = if_else(quantile == "q2", "mean", "1st/3rd quartile"),
    linetype = fct_relevel(linetype, "mean"),
    # Fix the facet order: each news type followed by its three article parts,
    # then the two opinion types.
    type2 = fct_relevel(
      type2,
      "Domestic general/political/economic news",
      "journalistic text in Domestic general/political/economic news",
      "indirect quotation in Domestic general/political/economic news",
      "direct quotation in Domestic general/political/economic news",
      "Local news",
      "journalistic text in Local news",
      "indirect quotation in Local news",
      "direct quotation in Local news",
      "Foreign news",
      "journalistic text in Foreign news",
      "indirect quotation in Foreign news",
      "direct quotation in Foreign news",
      "Culture/entertainment",
      "journalistic text in Culture/entertainment",
      "indirect quotation in Culture/entertainment",
      "direct quotation in Culture/entertainment",
      "Sports",
      "journalistic text in Sports",
      "indirect quotation in Sports",
      "direct quotation in Sports",
      "External opinion",
      "Journalistic opinion"
    )
  )
```

```{r,eval=FALSE}
# All conjunction measures for the "Other" article types only, with one
# free-scaled facet per measure/type2 combination.
cd2 %>%
  filter(str_detect(type2, "Other")) %>%
  ggplot(
    mapping = aes(
      x = year_created,
      y = value,
      color = media,
      group = interaction(measure, quantile, media),
      linetype = linetype
    )
  ) +
  geom_step() +
  facet_wrap(measure ~ type2, scales = "free") +
  theme_hsci_discrete(base_family = "Arial") +
  theme(legend.position = "bottom")

```{r,fig.width=8,fig.height=11,eval=FALSE}
# One figure per conjunction measure: yearly values per media as step lines,
# faceted by `type2`, leaving out the "Other" categories.
cd2 %>%
  group_by(measure) %>%
  group_map(function(measure_data, key) {
    measure_data %>%
      filter(!str_detect(type2, "Other")) %>%
      ggplot(aes(
        x = year_created,
        y = value,
        color = media,
        group = interaction(quantile, media),
        linetype = linetype
      )) +
      geom_step() +
      facet_wrap(~type2, scales = "free") +
      theme_hsci_discrete(base_family = "Arial") +
      theme(legend.position = "bottom") +
      scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
      labs(x = "Year", y = "Number of conjunctions") +
      # `key` is the one-row tibble of grouping keys that group_map() hands to
      # the callback; use the measure name itself as the title instead of
      # relying on the whole tibble being coerced to text.
      ggtitle(as.character(key$measure))
  })
```
Because article texts have grown longer over time, which complicates the analysis, all of the following conjunction proportions were calculated and evaluated:
 
 - Conjunctions/words
 - Conjunctions/sentences
 - Conjunctions/paragraphs
 - Conjunctions/articles
 - Sentences containing conjunctions/sentences
 - Sentences containing conjunctions/paragraphs
 - Sentences containing conjunctions/articles
 - Paragraphs containing conjunctions/paragraphs
 - Paragraphs containing conjunctions/articles
 - Articles containing conjunctions/articles
 
Out of these, "Sentences containing conjunctions/sentences" was retained for final analysis:

### Sentences containing conjunctions/sentences
```{r,fig.width=8,fig.height=11}
# "Sentences containing conjunctions/sentences" over time, split over two pages
# of up to 4 x 3 facets. The "Other" types and the per-part breakdowns of the
# opinion types are left out.
map(1:2, function(page_nr) {
  cd2 %>%
    filter(
      measure == "Sentences containing conjunctions/sentences",
      !str_detect(type2, "Other"),
      !str_detect(type2, "in Journalistic opinion"),
      !str_detect(type2, "in External opinion")
    ) %>%
    ggplot(aes(
      x = year_created,
      y = value,
      color = media,
      group = interaction(quantile, media),
      linetype = linetype
    )) +
    geom_step() +
    facet_wrap_paginate(~type2, ncol = 4, nrow = 3, page = page_nr) +
    theme_hsci_discrete(base_family = "Arial") +
    theme(legend.position = "bottom") +
    scale_x_continuous(breaks = seq(2000, 2020, by = 4)) +
    labs(x = "Year", y = "Sentences containing conjunctions/sentences")
})
```

Interpretations:

 - There is an increase in sentence constructions with conjunctions denoting interpretive subclauses, particularly in domestic news reporting and HS Kaupunki. The overall increase comes mostly from journalistic text and not from an increase in quotes.