---
title: "Quotation analysis"
author: "Eetu Mäkelä"
date: "`r Sys.Date()`"
format:
html:
code-fold: true
code-tools: true
---
```{r setup,include=FALSE}
knitr:: opts_chunk$ set (message= FALSE ,dpi= 72 ,fig.retina= 2 ,fig.width= 8 )
source (here:: here ("code/common_basis.R" ), local = knitr:: knit_global ())
top_quotes_categorized_local <- read_sheet (ss= "1Lx2nhdwdgJsxrn047Wnre1S3GsrQAQ1OFZ3UOwTVQfE" ,sheet= "Top quoted Categorized" ) %>%
rename (group= ` group 1 ` ,subgroup= ` group 1.1 ` ) %>%
mutate (group= case_when (
! is.na (status) & status== "Pois" ~ "Pois" ,
str_to_lower (organisation) %in% c ("yle" ,"sanoma" ,"stt" ,"il" ,"iltalehti" ) ~ "Media" ,
str_to_lower (canonical_name) %in% c ("yle" ,"sanoma" ,"stt" ,"il" ,"iltalehti" ) ~ "Media" ,
str_to_lower (organisation)== "poliisi" ~ "Poliisi" ,
str_to_lower (canonical_name)== "poliisi" ~ "Poliisi" ,
T~ group),subgroup= case_when (
! is.na (status) & status== "Pois" ~ "Pois" ,
str_to_lower (organisation) %in% c ("yle" ,"sanoma" ,"stt" ,"il" ,"iltalehti" ) ~ "Media" ,
str_to_lower (canonical_name) %in% c ("yle" ,"sanoma" ,"stt" ,"il" ,"iltalehti" ) ~ "Media" ,
str_to_lower (organisation)== "poliisi" ~ "Poliisi" ,
str_to_lower (canonical_name)== "poliisi" ~ "Poliisi" ,
T~ subgroup)) %>%
mutate (subgroup= coalesce (subgroup,group)) %>%
filter (! is.na (group))
random_quotes_categorized_local <- read_sheet (ss= "1Lx2nhdwdgJsxrn047Wnre1S3GsrQAQ1OFZ3UOwTVQfE" ,sheet= "random quotes" ) %>%
union_all (read_sheet (ss= "1Lx2nhdwdgJsxrn047Wnre1S3GsrQAQ1OFZ3UOwTVQfE" ,sheet= "random quotes 2" )) %>%
rename (group= ` group 1 ` ,subgroup= ` group 1.1 ` ) %>%
mutate (group= case_when (
! is.na (status) & status== "Pois" ~ "Pois" ,
str_to_lower (organisation) %in% c ("yle" ,"sanoma" ,"stt" ,"il" ,"iltalehti" ) ~ "Media" ,
str_to_lower (organisation)== "poliisi" ~ "Poliisi" ,
T~ group),subgroup= case_when (
! is.na (status) & status== "Pois" ~ "Pois" ,
str_to_lower (organisation) %in% c ("yle" ,"sanoma" ,"stt" ,"il" ,"iltalehti" ) ~ "Media" ,
str_to_lower (organisation)== "poliisi" ~ "Poliisi" ,
T~ subgroup)) %>%
mutate (subgroup= coalesce (subgroup,group)) %>%
filter (! is.na (group))
top_quotes_categorized_a <- top_quotes_categorized_local %>%
copy_to_a (con, unique_indexes= list (c ("canonical_name" ,"after" ,"before" )))
top_quotes_categorized_c <- top_quotes_categorized_a %>%
compute_c ()
random_quotes_categorized_a <- random_quotes_categorized_local %>%
copy_to_a (con)
random_quotes_categorized_c <- random_quotes_categorized_a %>%
compute_c ()
random_quotes_categorized_local <- random_quotes_categorized_a %>%
left_join (quotes_a %>% select (q_id,a_id)) %>%
left_join (article_types_a) %>%
collect ()
```
# Concentration of quote sources through time
```{r}
qdby <- tbl (con,sql ("
SELECT
media,
type,
year_created,
direct,
canonical_name,
COUNT(*) AS n
FROM flopo.quotes_c
INNER JOIN flopo.q_qa_c USING (q_id)
INNER JOIN flopo.quote_authors_c USING (qa_id)
INNER JOIN flopo.quote_author_names_to_canonical_names_c USING (name)
INNER JOIN flopo.articles_c USING (a_id)
INNER JOIN article_types_c USING (a_id)
GROUP BY media,type,year_created,direct,canonical_name
" )) %>%
anti_join (top_quotes_categorized_c %>% filter (group== "Pois" ) %>% select (canonical_name)) %>%
select (- canonical_name) %>%
collect ()
```
```{r,fig.width=8}
library (ineq)
qdby %>%
filter (type== "Domestic general/political/economic news" ) %>%
filter (! (type %in% c ("Cruft" ,"Other" )),! str_detect (type,"opinion" )) %>%
group_by (media,type,direct,year_created) %>%
summarize (G= Gini (n),.groups= "drop" ) %>%
mutate (direct= if_else (direct== 1 ,"direct" ,"indirect" )) %>%
ggplot (aes (x= year_created,y= G,color= media)) +
geom_line () +
facet_grid (direct~ type) +
theme_hsci_discrete () +
ylab ("Gini coefficient" ) +
xlab ("Year" )
```
Interpretations:
* More diversity in direct quote sources than in indirect quote source
* Diversity is mostly decreasing. YLE does best here (but there may be lingering data problems ATM)
# Top sources in each subgroup
```{r}
yearly_top_quotes_local <- top_quotes_categorized_c %>%
select (- c (indirect: canonical_joined_role_all)) %>%
inner_join (quote_author_names_to_canonical_names_c) %>%
inner_join (quote_authors_c) %>%
inner_join (q_qa_c) %>%
right_join (
article_types_c %>%
filter (type== "Domestic general/political/economic news" ) %>%
inner_join (quotes_c)) %>%
inner_join (articles_c) %>%
filter (is.na (after) | year_created>= after, is.na (before) | year_created< before) %>%
count (canonical_name,group,subgroup,direct,media,year_created) %>%
ungroup () %>%
collect () %>%
mutate (direct= if_else (direct== 1 ,"direct" ,"indirect" ))
```
```{r}
yearly_top_quotes_local %>%
group_by (canonical_name,subgroup,direct) %>%
tally (wt= n) %>%
# filter(n>1000) %>%
group_by (subgroup,direct) %>%
arrange (desc (n)) %>%
slice_head (n= 3 ) %>%
gt ()
```
# Proportions of quotes categorized in the different categories in domestic general/political/economic news
```{r,fig.width=8,fig.height=11}
yearly_top_quotes_local %>%
group_by (group,media,direct,year_created) %>%
summarize (n= sum (n),.groups= "drop" ) %>%
group_by (media,direct,year_created) %>%
mutate (prop= n/ sum (n)) %>%
ungroup () %>%
filter (! is.na (group)) %>%
ggplot (aes (x= year_created,y= prop,color= group)) +
geom_step () +
facet_grid (media~ direct) +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position = 'bottom' )
```
Interpretation:
* No clear patterns apart from the police massively gaining prominence and the rise in media self-references
# Proportions of quotes categorized in the different subcategories in domestic general/political/economic news.
**Largest subgroups of Politiikka,Poliisi,Media and Pois removed**
```{r,fig.width=8,fig.height=11}
yearly_top_quotes_local %>%
filter (subgroup!= "Politiikka" ,subgroup!= "Poliisi" ,subgroup!= "Media" ,subgroup!= "Pois" ) %>%
group_by (subgroup,media,direct,year_created) %>%
summarize (n= sum (n),.groups= "drop" ) %>%
group_by (media,direct,year_created) %>%
mutate (prop= n/ sum (n)) %>%
ungroup () %>%
filter (! is.na (subgroup)) %>%
ggplot (aes (x= year_created,y= prop,color= subgroup)) +
geom_step () +
facet_grid (media~ direct) +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position = 'bottom' )
```
Interpretation:
* Even with the police removed, "Viranomainen/julkistoimija" gains prominence in indirect quotes, mostly consisting of "oikeus" and "pelastuslaitos"
# Overall proportions
```{r,fig.width=8,fig.height=5}
d <- random_quotes_categorized_local %>%
mutate (period= as_factor (2009 + 5 * floor ((year (time_created)- 2009 )/ 5 )), direct= if_else (direct== 0 ,"indirect" ,"direct" )) %>%
count (group,media,direct,period) %>%
group_by (media,direct,period) %>%
group_modify (~ bind_cols (.x,scimp_bmde (.x$ n, p= 1 ))) %>%
mutate (prop= n/ sum (n),rest= TRUE ) %>%
ungroup () %>%
full_join (yearly_top_quotes_local %>%
mutate (period= as_factor (2009 + 5 * floor ((year_created-2009 )/ 5 ))) %>%
group_by (group,media,direct,period) %>%
summarize (n= sum (n),.groups= "drop" ) %>%
group_by (media,direct,period) %>%
mutate (prop= n/ sum (n)) %>%
ungroup () %>%
mutate (rest= is.na (group)),by= c ("media" ,"direct" ,"period" ,"rest" )) %>%
mutate (group= coalesce (group.x,group.y), prop= if_else (is.na (group.y),prop.y* prop.x,prop.y),upper_limit= if_else (is.na (group.y),prop.y* upper_limit,prop.y), lower_limit= if_else (is.na (group.y),prop.y* lower_limit,prop.y)) %>%
group_by (media,direct,period,group) %>%
summarize (prop= sum (prop),upper_limit= sum (upper_limit),lower_limit= sum (lower_limit),.groups= "drop" ) %>%
filter (group!= "Pois" ) %>%
group_by (media,direct,period) %>%
mutate (prop= prop/ sum (prop), upper_limit= upper_limit/ sum (upper_limit), lower_limit= lower_limit/ sum (lower_limit)) %>%
ungroup () %>%
# mutate(period=if_else(period==FALSE,"1999-2008","2009-2018")) %>%
identity ()
d2 <- random_quotes_categorized_local %>%
mutate (period= as_factor (2009 + 5 * floor ((year (time_created)- 2009 )/ 5 )), direct= if_else (direct== 0 ,"indirect" ,"direct" )) %>%
count (subgroup,media,direct,period) %>%
group_by (media,direct,period) %>%
group_modify (~ bind_cols (.x,scimp_bmde (.x$ n, p= 1 ))) %>%
mutate (prop= n/ sum (n),rest= TRUE ) %>%
ungroup () %>%
full_join (yearly_top_quotes_local %>%
mutate (period= as_factor (2009 + 5 * floor ((year_created-2009 )/ 5 ))) %>%
group_by (subgroup,media,direct,period) %>%
summarize (n= sum (n),.groups= "drop" ) %>%
group_by (media,direct,period) %>%
mutate (prop= n/ sum (n)) %>%
ungroup () %>%
mutate (rest= is.na (subgroup)),by= c ("media" ,"direct" ,"period" ,"rest" )) %>%
mutate (subgroup= coalesce (subgroup.x,subgroup.y), prop= if_else (is.na (subgroup.y),prop.y* prop.x,prop.y),upper_limit= if_else (is.na (subgroup.y),prop.y* upper_limit,prop.y), lower_limit= if_else (is.na (subgroup.y),prop.y* lower_limit,prop.y)) %>%
group_by (media,direct,period,subgroup) %>%
summarize (prop= sum (prop),upper_limit= sum (upper_limit),lower_limit= sum (lower_limit),.groups= "drop" ) %>%
filter (subgroup!= "Pois" ) %>%
group_by (media,direct,period) %>%
mutate (prop= prop/ sum (prop), upper_limit= upper_limit/ sum (upper_limit), lower_limit= lower_limit/ sum (lower_limit)) %>%
ungroup () %>%
# mutate(period=if_else(period==FALSE,"1999-2008","2009-2018")) %>%
identity ()
```
## By group
```{r,fig.width=8,fig.height=8}
d %>%
ggplot (aes (x= group,color= period)) +
geom_point (aes (y= prop),position= position_dodge (width= 1 )) +
geom_errorbar (aes (ymin= lower_limit,ymax= upper_limit),position= position_dodge (width= 1 )) +
facet_grid (media~ direct) +
coord_flip () +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position= "bottom" ) +
xlab ("Group" ) +
ylab ("Proportion" )
```
```{r,fig.width=8,fig.height=8}
d %>%
ggplot (aes (x= group,group= media,color= media)) +
geom_point (aes (y= prop),position= position_dodge (width= 1 )) +
geom_errorbar (aes (ymin= lower_limit,ymax= upper_limit),position= position_dodge (width= 1 )) +
facet_grid (period~ direct) +
coord_flip () +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position= "bottom" ) +
xlab ("Group" ) +
ylab ("Proportion" )
```
```{r,fig.width=8,fig.height=11}
d %>%
ggplot (aes (x= group,group= media,color= direct)) +
geom_point (aes (y= prop),position= position_dodge (width= 1 )) +
geom_errorbar (aes (ymin= lower_limit,ymax= upper_limit),position= position_dodge (width= 1 )) +
facet_grid (media~ period) +
coord_flip () +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position= "bottom" ) +
xlab ("Subgroup" ) +
ylab ("Proportion" )
```
## By subgroup
```{r,fig.width=8,fig.height=11}
d2 %>%
ggplot (aes (x= subgroup,color= period)) +
geom_point (aes (y= prop),position= position_dodge (width= 1 )) +
geom_errorbar (aes (ymin= lower_limit,ymax= upper_limit),position= position_dodge (width= 1 )) +
facet_grid (media~ direct) +
coord_flip () +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position= "bottom" ) +
xlab ("Subgroup" ) +
ylab ("Proportion" )
```
```{r,fig.width=8,fig.height=11}
d2 %>%
ggplot (aes (x= subgroup,group= media,color= media)) +
geom_point (aes (y= prop),position= position_dodge (width= 1 )) +
geom_errorbar (aes (ymin= lower_limit,ymax= upper_limit),position= position_dodge (width= 1 )) +
facet_grid (period~ direct) +
coord_flip () +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position= "bottom" ) +
xlab ("Subgroup" ) +
ylab ("Proportion" )
```
```{r,fig.width=8,fig.height=11}
d2 %>%
ggplot (aes (x= subgroup,group= media,color= direct)) +
geom_point (aes (y= prop),position= position_dodge (width= 1 )) +
geom_errorbar (aes (ymin= lower_limit,ymax= upper_limit),position= position_dodge (width= 1 )) +
facet_grid (media~ period) +
coord_flip () +
theme_hsci_discrete (base_family= "Arial" ) +
scale_y_continuous (labels= scales:: percent_format (accuracy= 1 )) +
theme (legend.position= "bottom" ) +
xlab ("Subgroup" ) +
ylab ("Proportion" )
```