Code
library(tidyverse)
library(here)
library(fst)
library(DT)
library(topicmodels)
library(tm)library(tidyverse)
library(here)
library(fst)
library(DT)
library(topicmodels)
library(tm)A separate process pulls posts data from the Crowdtangle (CT) API. We read a “flattened” version of the JSON output.
zzz <- read_fst(here("data/parsed.fst"))
zzz <- zzz %>%
distinct(platformId,name,date,postUrl,score,id,.keep_all = TRUE) %>%
group_by(platformId,name,date,postUrl) %>%
filter(score==max(score)) %>%
ungroup() %>%
group_by(platformId) %>%
slice(1) %>%
ungroup()
## merge on some other account info
load(file=here("data/accounts.RData"))
zzz <- zzz %>%
left_join(accounts %>%
select(list_id,list_title=title,id,pageAdminTopCountry),
by="id")
## merge on federal etc information
fed_info <- read_csv(file=here("data/fb_federal.csv"))
zzz <- zzz %>%
left_join(fed_info %>% select(id,pol_type),by="id") %>%
mutate(pol_type = if_else(
pageCategory %in% c("POLITICAL_PARTY","POLITICAL_ORGANIZATION"),
"federal",
pol_type)
)The data span:
political <- c("POLITICIAN",
"POLITICAL_PARTY",
"POLITICAL_ORGANIZATION",
"POLITICAL_CANDIDATE")
political_table <- zzz %>%
filter(pageCategory %in% political | list_title=="Independents") %>%
group_by(name) %>%
mutate(n=n()) %>%
slice(1) %>%
ungroup() %>%
select(name,handle,cat=pageCategory,pol_type,n) %>%
arrange(desc(n))Posts by politicians and political parties, groups etc can be flagged with the CT pageCategory flag. We also include C200-supported independent candidates, C200 itself, and various “Voices of …” Facebook accounts.
We also code politicians and political entities by whether they were active in the 2022 federal election (we manually coded this, needs automation for scaling to broader set of political entities).
library(janitor)
xt <- zzz %>%
select(id,pol_type,pageCategory) %>%
distinct() %>%
tabyl(pageCategory, pol_type) %>%
adorn_totals("row")
knitr::kable(xt)| pageCategory | federal | former | state | NA_ |
|---|---|---|---|---|
| ARMED_FORCES | 0 | 0 | 0 | 1 |
| ART_GALLERY | 0 | 0 | 0 | 1 |
| ART_MUSEUM | 0 | 0 | 0 | 1 |
| ARTIST | 0 | 0 | 0 | 1 |
| AUDIOLOGIST | 0 | 0 | 0 | 1 |
| BLOGGER | 0 | 0 | 0 | 1 |
| BOTANICAL_GARDEN | 0 | 0 | 0 | 1 |
| BROADCASTING_MEDIA_PRODUCTION | 0 | 0 | 0 | 6 |
| BUSINESS_CONSULTANT | 0 | 0 | 0 | 1 |
| COMMUNITY | 1 | 0 | 0 | 7 |
| COMMUNITY_ORGANIZATION | 4 | 0 | 0 | 2 |
| CONSULATE_EMBASSY | 0 | 0 | 0 | 15 |
| CULTURAL_CENTER | 0 | 0 | 0 | 1 |
| EDU_SITE | 0 | 0 | 0 | 1 |
| EDUCATION_COMPANY | 0 | 0 | 0 | 3 |
| ENTERTAINMENT_SITE | 0 | 0 | 0 | 1 |
| ENTREPRENEUR | 0 | 0 | 0 | 1 |
| FESTIVAL | 0 | 0 | 0 | 1 |
| GOVERNMENT_OFFICIAL | 0 | 0 | 0 | 13 |
| GOVERNMENT_ORGANIZATION | 0 | 0 | 0 | 79 |
| HISTORY_MUSEUM | 0 | 0 | 0 | 2 |
| KIDS_SITE | 0 | 0 | 0 | 1 |
| LAW_ENFORCEMENT | 0 | 0 | 0 | 1 |
| LOCAL | 0 | 0 | 0 | 1 |
| MAGAZINE | 0 | 0 | 0 | 1 |
| MEDIA | 0 | 0 | 0 | 3 |
| MEDIA_NEWS_COMPANY | 0 | 0 | 0 | 14 |
| NEWS_PERSONALITY | 0 | 0 | 0 | 1 |
| NEWS_SITE | 0 | 0 | 0 | 13 |
| NGO | 1 | 0 | 0 | 0 |
| NON_PROFIT | 0 | 0 | 0 | 3 |
| PERSON | 3 | 0 | 0 | 42 |
| POLITICAL_CANDIDATE | 6 | 0 | 3 | 0 |
| POLITICAL_ORGANIZATION | 8 | 0 | 0 | 0 |
| POLITICAL_PARTY | 23 | 0 | 0 | 0 |
| POLITICIAN | 169 | 14 | 139 | 0 |
| PUBLIC_SERVICES_GOVERNMENT | 0 | 0 | 0 | 4 |
| RADIO_STATION | 0 | 0 | 0 | 1 |
| REGIONAL_SITE | 0 | 0 | 0 | 1 |
| SCIENCE_MUSEUM | 0 | 0 | 0 | 1 |
| TOPIC_ARTS_ENTERTAINMENT | 0 | 0 | 0 | 1 |
| TOPIC_ISLAND | 0 | 0 | 0 | 1 |
| TOPIC_LANDMARK | 0 | 0 | 0 | 1 |
| TOPIC_LIBRARY | 0 | 0 | 0 | 2 |
| TOPIC_MUSEUM | 0 | 0 | 0 | 2 |
| TOPIC_NEWSPAPER | 0 | 0 | 0 | 7 |
| TV_CHANNEL | 0 | 0 | 0 | 1 |
| TV_SHOW | 0 | 0 | 0 | 1 |
| UNIVERSITY | 0 | 0 | 0 | 1 |
| VISUAL_ARTS | 0 | 0 | 0 | 1 |
| WEBSITE | 0 | 0 | 0 | 2 |
| YOUTH_ORGANIZATION | 0 | 0 | 0 | 1 |
| NA | 1 | 0 | 0 | 0 |
| Total | 216 | 14 | 142 | 248 |
datatable(
political_table,
extensions = c("Buttons","RowGroup"),
class=c("display","row_grouped"),
options=list(dom="Bft",
paging=FALSE,
scrollY="800px",
scrollCollapse=TRUE,
buttons=c("copy","csv","excel")
)
) %>%
formatRound(columns="n",digits=0)Many of these are not Federal politicians or candidates, so we engage in some matching/filtering to lists of incumbents/candidates in the 2022 federal election.
media <- c("BROADCASTING_MEDIA_PRODUCTION",
"MAGAZINE",
"MEDIA",
"MEDIA_NEWS_COMPANY",
"NEWS_PERSONALITY",
"NEWS_SITE",
"RADIO_STATION",
"TOPIC_NEWSPAPER",
"TV_CHANNEL",
"TV_SHOW")
media_table <- zzz %>%
filter(pageCategory %in% media) %>%
group_by(name) %>%
mutate(n=n()) %>%
slice(1) %>%
ungroup() %>%
select(name,handle,cat=pageCategory,country=pageAdminTopCountry,n) %>%
arrange(desc(n))datatable(media_table,
extensions = c("Buttons","RowGroup"),
class=c("display","row_grouped"),
options=list(dom="Bft",
paging=FALSE,
scrollY="800px",
scrollCollapse=TRUE,
buttons=c("copy","csv","excel"))) %>%
formatRound(columns="n",digits=0)zzz <- zzz %>%
mutate(type = case_when(
pageCategory %in% media ~ "media",
pageCategory %in% political | list_title == "Independents" ~ "political",
TRUE ~ "other")
) %>%
mutate(
class = case_when(
pol_type=="federal" ~ "Political: Federal",
type=="political" ~ "Political: Other",
type=="media" ~ "Media",
TRUE ~ "Other")
) zzz %>%
mutate(d = lubridate::as_date(date),
day=strftime(d,"%a"),
day = if_else(day %in% c("Mon","Wed","Fri"),
str_sub(day,1,1),
str_sub(day,1,2))
) %>%
group_by(d) %>%
summarise(n=n(),day=day[1]) %>%
ungroup() %>%
ggplot(.,
aes(x=d,y=n,label=day)) +
geom_bar(stat="identity") +
geom_text(color="white",size=1.33,vjust=1) +
scale_x_date("",expand=c(0,0))zzz %>%
mutate(d = lubridate::as_date(date),
day=strftime(d,"%a"),
day = if_else(day %in% c("Mon","Wed","Fri"),
str_sub(day,1,1),
str_sub(day,1,2))
) %>%
mutate(
class = case_when(
pol_type=="federal" ~ "Political: Federal",
type=="political" ~ "Political: Other",
type=="media" ~ "Media",
TRUE ~ "Other")
) %>%
group_by(d,class,.drop = FALSE) %>%
summarise(n=n(),day=day[1]) %>%
ungroup() %>%
ggplot(.,
aes(x=d,y=n,
label=day,
group=class,
color=class)) +
geom_line() +
scale_x_date("",expand=c(0,0))Tidying up and tokenizing contents of each post’s message.
library(tidytext)
library(stringr)
## remove some bog standard phrases and sentences
remove_reg <- "&|<|>"
tidy_fb <- zzz %>%
rename(text=message) %>%
mutate(text = str_remove(text,pattern="Watch the full episode at www.skynews.com.au/flash")) %>%
mutate(text = str_remove(text,pattern="7news at 6pm.")) %>%
mutate(text = str_remove(text,pattern="More local news.*$")) %>%
mutate(text = str_remove(text,pattern="Read more news online here: https://www.theage.com.au/")) %>%
mutate(text = if_else(is.na(text),description,text)) %>%
filter(!is.na(text)) %>%
mutate(text = str_remove_all(text, remove_reg)) %>%
unnest_tweets(word, text, strip_url=TRUE,strip_punct = FALSE) %>%
filter(!word %in% stop_words$word,
!word %in% str_remove_all(stop_words$word, "'"),
str_detect(word, "[a-z]")) %>%
mutate(word = str_remove(word,pattern="\\.$|,$"))We remove URLs and hashtags etc from the word list:
badWords <- c("news","7news","watch","flash","00pm","6pm")
tidy_fb <- tidy_fb %>%
filter(
!grepl(pattern="\\.au$",word),
!grepl(pattern="^#",word),
!grepl(pattern="^www",word),
!grepl(pattern="[0-9]{1,}[a|p]m",word),
!grepl(pattern="[0-9]{1,}st",word),
!grepl(pattern="[0-9]{1,}nd",word),
!grepl(pattern="[0-9]{1,}rd",word),
!grepl(pattern="[0-9]{1,}th",word),
!(word %in% badWords),
str_count(word,pattern="\\.") < 1
)tidy_fb <- tidy_fb %>%
mutate(word_stem=stemDocument(word)) %>%
filter(word_stem !="it'") %>%
mutate(word_stem = if_else(word_stem == "australian","australia",word_stem),
word_stem = if_else(word_stem == "victorian","victoria",word_stem))We have 92842 unique stemmed words. We’ll subset down to words appearing at least 100 times.
word_ratios <- tidy_fb %>%
filter(class %in% c("Political: Federal","Media")) %>%
mutate(class = if_else(class=="Political: Federal",
"federal",
tolower(class))
) %>%
count(class,word_stem) %>%
group_by(word_stem) %>%
filter(sum(n) >= 100) %>%
ungroup() %>%
pivot_wider(names_from = class,values_from = n,values_fill = 0) %>%
mutate(across(where(is.numeric),sum,.names = "{.col}_sum")) %>%
mutate(media_norm = (media+1)/(media_sum),
federal_norm = (federal+1)/(federal_sum)) %>%
mutate(logratio = log(federal_norm / media_norm)) %>%
arrange(desc(logratio))library(DT)
datatable(word_ratios %>%
select(word_stem,federal,media,logratio),
rownames = FALSE) %>%
formatRound(columns="logratio",2)word_political <- tidy_fb %>%
filter(class %in% c("Political: Federal")) %>%
group_by(word_stem) %>%
mutate(word_total=n()) %>%
filter(word_total>=100)
political_dtm <- word_political %>%
unite(post,name,platformId) %>%
count(post,word_stem) %>%
cast_dtm(post,word_stem,n)m <- LDA(political_dtm,k=6)m %>%
tidy() %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()phat <- posterior(m)
phat <- as_tibble(phat$topics) %>%
mutate(lab = dimnames(phat$topics)[[1]]) %>%
mutate(platformId = gsub(lab,pattern="^.*_(.*_.*)$",replacement="\\1")) %>%
left_join(zzz %>%
select(id,platformId,postUrl),
by="platformId")word_media <- tidy_fb %>%
filter(type=="media") %>%
group_by(word_stem) %>%
mutate(word_total=n()) %>%
filter(word_total>=100)
media_dtm <- word_media %>%
unite(post,name,platformId) %>%
count(post,word_stem) %>%
cast_dtm(post,word_stem,n)m_media <- LDA(political_dtm,k=12)m_media %>%
tidy() %>%
group_by(topic) %>%
slice_max(beta, n = 10) %>%
ungroup() %>%
mutate(term = reorder_within(term, beta, topic)) %>%
ggplot(aes(beta, term, fill = factor(topic))) +
geom_col(show.legend = FALSE) +
facet_wrap(~ topic, scales = "free") +
scale_y_reordered()