The election campaign on FB

Authors

Affiliations

Professor Simon Jackman

University of Sydney

Associate Professor Andrea Carson

La Trobe University

Published

2:19PM 2 July 2022

Code

library(tidyverse)
library(here)
library(fst)
library(DT)
library(topicmodels)
library(tm)

Read data

A separate process pulls posts data from the Crowdtangle (CT) API. We read a “flattened” version of the JSON output.

Code

zzz <- read_fst(here("data/parsed.fst"))
zzz <- zzz %>% 
  distinct(platformId,name,date,postUrl,score,id,.keep_all = TRUE) %>% 
  group_by(platformId,name,date,postUrl) %>% 
  filter(score==max(score)) %>% 
  ungroup() %>% 
  group_by(platformId) %>% 
  slice(1) %>% 
  ungroup()


## merge on some other account info
load(file=here("data/accounts.RData"))
zzz <- zzz %>% 
  left_join(accounts %>%
              select(list_id,list_title=title,id,pageAdminTopCountry),
            by="id")

## merge on federal etc information
fed_info <- read_csv(file=here("data/fb_federal.csv"))
zzz <- zzz %>% 
  left_join(fed_info %>% select(id,pol_type),by="id") %>% 
  mutate(pol_type = if_else(
    pageCategory %in% c("POLITICAL_PARTY","POLITICAL_ORGANIZATION"),
    "federal",
    pol_type)
  )

The data span:

250,555 unique FB posts
2022-01-01 00:00:00 to 2022-05-21 23:59:45
620 unique accounts/posting-entities

Code

political <- c("POLITICIAN",
               "POLITICAL_PARTY",
               "POLITICAL_ORGANIZATION",
               "POLITICAL_CANDIDATE")

political_table <- zzz %>% 
    filter(pageCategory %in% political | list_title=="Independents") %>% 
    group_by(name) %>%
    mutate(n=n()) %>%
    slice(1) %>% 
    ungroup() %>% 
    select(name,handle,cat=pageCategory,pol_type,n) %>% 
    arrange(desc(n))

372 political entities

Posts by politicians and political parties, groups etc can be flagged with the CT pageCategory flag. We also include C200-supported independent candidates, C200 itself, and various “Voices of …” Facebook accounts.

We also code politicians and political entities by whether they were active in the 2022 federal election (we manually coded this, needs automation for scaling to broader set of political entities).

Code

library(janitor)
xt <- zzz %>%
  select(id,pol_type,pageCategory) %>%
  distinct() %>%
  tabyl(pageCategory, pol_type) %>%
  adorn_totals("row") 
knitr::kable(xt)

pageCategory	federal	former	state	NA_
ARMED_FORCES	0	0	0	1
ART_GALLERY	0	0	0	1
ART_MUSEUM	0	0	0	1
ARTIST	0	0	0	1
AUDIOLOGIST	0	0	0	1
BLOGGER	0	0	0	1
BOTANICAL_GARDEN	0	0	0	1
BROADCASTING_MEDIA_PRODUCTION	0	0	0	6
BUSINESS_CONSULTANT	0	0	0	1
COMMUNITY	1	0	0	7
COMMUNITY_ORGANIZATION	4	0	0	2
CONSULATE_EMBASSY	0	0	0	15
CULTURAL_CENTER	0	0	0	1
EDU_SITE	0	0	0	1
EDUCATION_COMPANY	0	0	0	3
ENTERTAINMENT_SITE	0	0	0	1
ENTREPRENEUR	0	0	0	1
FESTIVAL	0	0	0	1
GOVERNMENT_OFFICIAL	0	0	0	13
GOVERNMENT_ORGANIZATION	0	0	0	79
HISTORY_MUSEUM	0	0	0	2
KIDS_SITE	0	0	0	1
LAW_ENFORCEMENT	0	0	0	1
LOCAL	0	0	0	1
MAGAZINE	0	0	0	1
MEDIA	0	0	0	3
MEDIA_NEWS_COMPANY	0	0	0	14
NEWS_PERSONALITY	0	0	0	1
NEWS_SITE	0	0	0	13
NGO	1	0	0	0
NON_PROFIT	0	0	0	3
PERSON	3	0	0	42
POLITICAL_CANDIDATE	6	0	3	0
POLITICAL_ORGANIZATION	8	0	0	0
POLITICAL_PARTY	23	0	0	0
POLITICIAN	169	14	139	0
PUBLIC_SERVICES_GOVERNMENT	0	0	0	4
RADIO_STATION	0	0	0	1
REGIONAL_SITE	0	0	0	1
SCIENCE_MUSEUM	0	0	0	1
TOPIC_ARTS_ENTERTAINMENT	0	0	0	1
TOPIC_ISLAND	0	0	0	1
TOPIC_LANDMARK	0	0	0	1
TOPIC_LIBRARY	0	0	0	2
TOPIC_MUSEUM	0	0	0	2
TOPIC_NEWSPAPER	0	0	0	7
TV_CHANNEL	0	0	0	1
TV_SHOW	0	0	0	1
UNIVERSITY	0	0	0	1
VISUAL_ARTS	0	0	0	1
WEBSITE	0	0	0	2
YOUTH_ORGANIZATION	0	0	0	1
NA	1	0	0	0
Total	216	14	142	248

Code

datatable(
  political_table,
  extensions = c("Buttons","RowGroup"),
  class=c("display","row_grouped"),
  options=list(dom="Bft",
               paging=FALSE,
               scrollY="800px",
               scrollCollapse=TRUE,
               buttons=c("copy","csv","excel")
               )
  ) %>%
  formatRound(columns="n",digits=0)

Many of these are not Federal politicians or candidates, so we engage in some matching/filtering to lists of incumbents/candidates in the 2022 federal election.

Code

media <- c("BROADCASTING_MEDIA_PRODUCTION",
           "MAGAZINE",
           "MEDIA",
           "MEDIA_NEWS_COMPANY",
           "NEWS_PERSONALITY",
           "NEWS_SITE",
           "RADIO_STATION",
           "TOPIC_NEWSPAPER",
           "TV_CHANNEL",
           "TV_SHOW")

media_table <- zzz %>% 
    filter(pageCategory %in% media) %>% 
     group_by(name) %>%
    mutate(n=n()) %>%
    slice(1) %>% 
    ungroup() %>% 
    select(name,handle,cat=pageCategory,country=pageAdminTopCountry,n) %>% 
    arrange(desc(n))

48 Media entities

Code

datatable(media_table,
          extensions = c("Buttons","RowGroup"),
          class=c("display","row_grouped"),
          options=list(dom="Bft",
                       paging=FALSE,
                       scrollY="800px",
                       scrollCollapse=TRUE,
                       buttons=c("copy","csv","excel"))) %>%
  formatRound(columns="n",digits=0)

Code

zzz <- zzz %>% 
  mutate(type = case_when(
    pageCategory %in% media ~ "media",
    pageCategory %in% political | list_title == "Independents" ~ "political",
    TRUE ~ "other")
  ) %>% 
  mutate(
    class = case_when(
      pol_type=="federal" ~ "Political: Federal",
      type=="political" ~ "Political: Other",
      type=="media" ~ "Media",
      TRUE ~ "Other")
  )

Count of posts by day

Code

zzz %>% 
  mutate(d = lubridate::as_date(date),
         day=strftime(d,"%a"),
         day = if_else(day %in% c("Mon","Wed","Fri"),
                       str_sub(day,1,1),
                       str_sub(day,1,2))
         ) %>% 
  group_by(d) %>% 
  summarise(n=n(),day=day[1]) %>% 
  ungroup() %>% 
  ggplot(.,
         aes(x=d,y=n,label=day)) + 
  geom_bar(stat="identity") + 
  geom_text(color="white",size=1.33,vjust=1) + 
  scale_x_date("",expand=c(0,0))

Code

zzz %>% 
  mutate(d = lubridate::as_date(date),
         day=strftime(d,"%a"),
         day = if_else(day %in% c("Mon","Wed","Fri"),
                       str_sub(day,1,1),
                       str_sub(day,1,2))
         ) %>% 
  mutate(
    class = case_when(
      pol_type=="federal" ~ "Political: Federal",
      type=="political" ~ "Political: Other",
      type=="media" ~ "Media",
      TRUE ~ "Other")
    ) %>% 
  group_by(d,class,.drop = FALSE) %>% 
  summarise(n=n(),day=day[1]) %>% 
  ungroup() %>% 
  ggplot(.,
         aes(x=d,y=n,
             label=day,
             group=class,
             color=class)) + 
  geom_line() + 
  scale_x_date("",expand=c(0,0))

Text pre-processing

Tidying up and tokenizing contents of each post’s message.

Code

library(tidytext)
library(stringr)

## remove some bog standard phrases and sentences

remove_reg <- "&amp;|&lt;|&gt;"
tidy_fb <- zzz %>%
  rename(text=message) %>% 
  mutate(text = str_remove(text,pattern="Watch the full episode at www.skynews.com.au/flash")) %>% 
  mutate(text = str_remove(text,pattern="7news at 6pm.")) %>% 
  mutate(text = str_remove(text,pattern="More local news.*$")) %>% 
  mutate(text = str_remove(text,pattern="Read more news online here: https://www.theage.com.au/")) %>% 
  mutate(text = if_else(is.na(text),description,text)) %>% 
  filter(!is.na(text)) %>% 
  mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tweets(word, text, strip_url=TRUE,strip_punct = FALSE) %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]")) %>% 
  mutate(word = str_remove(word,pattern="\\.$|,$"))

We remove URLs and hashtags etc from the word list:

Code

badWords <- c("news","7news","watch","flash","00pm","6pm")
tidy_fb <- tidy_fb %>% 
  filter(
    !grepl(pattern="\\.au$",word),
    !grepl(pattern="^#",word),
    !grepl(pattern="^www",word),
    !grepl(pattern="[0-9]{1,}[a|p]m",word),
    !grepl(pattern="[0-9]{1,}st",word),
    !grepl(pattern="[0-9]{1,}nd",word),
    !grepl(pattern="[0-9]{1,}rd",word),
    !grepl(pattern="[0-9]{1,}th",word),
    !(word %in% badWords),
    str_count(word,pattern="\\.") < 1
  )

Stemming

Code

tidy_fb <- tidy_fb %>% 
  mutate(word_stem=stemDocument(word)) %>% 
  filter(word_stem !="it'") %>% 
  mutate(word_stem = if_else(word_stem == "australian","australia",word_stem),
         word_stem = if_else(word_stem == "victorian","victoria",word_stem))

We have 92842 unique stemmed words. We’ll subset down to words appearing at least 100 times.

Differences in word usage between posts from political and media accounts

Code

word_ratios <- tidy_fb %>%
  filter(class %in% c("Political: Federal","Media")) %>%
  mutate(class = if_else(class=="Political: Federal",
                         "federal",
                         tolower(class))
         ) %>% 
  count(class,word_stem) %>% 
  group_by(word_stem) %>% 
  filter(sum(n) >= 100) %>% 
  ungroup() %>% 
  pivot_wider(names_from = class,values_from = n,values_fill = 0) %>% 
  mutate(across(where(is.numeric),sum,.names = "{.col}_sum")) %>%
  mutate(media_norm = (media+1)/(media_sum),
         federal_norm = (federal+1)/(federal_sum)) %>% 
  mutate(logratio = log(federal_norm / media_norm)) %>%
  arrange(desc(logratio))

Code

library(DT)
datatable(word_ratios %>%
            select(word_stem,federal,media,logratio),
          rownames = FALSE) %>%
  formatRound(columns="logratio",2)

Topic model fit to posts by federal political entities

Code

word_political <- tidy_fb %>% 
  filter(class %in% c("Political: Federal")) %>%
  group_by(word_stem) %>% 
  mutate(word_total=n()) %>% 
  filter(word_total>=100)

political_dtm <- word_political %>% 
  unite(post,name,platformId) %>% 
  count(post,word_stem) %>% 
  cast_dtm(post,word_stem,n)

Code

m <- LDA(political_dtm,k=6)

Code

m %>%
  tidy() %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

Code

phat <- posterior(m)
phat <- as_tibble(phat$topics) %>% 
  mutate(lab = dimnames(phat$topics)[[1]]) %>% 
  mutate(platformId = gsub(lab,pattern="^.*_(.*_.*)$",replacement="\\1")) %>% 
  left_join(zzz %>% 
              select(id,platformId,postUrl),
            by="platformId")

Topic model fit to posts by media entities

Code

word_media <- tidy_fb %>% 
  filter(type=="media") %>% 
  group_by(word_stem) %>% 
  mutate(word_total=n()) %>% 
  filter(word_total>=100)

media_dtm <- word_media %>% 
  unite(post,name,platformId) %>% 
  count(post,word_stem) %>% 
  cast_dtm(post,word_stem,n)

Code

m_media <- LDA(political_dtm,k=12)

Code

m_media %>%
  tidy() %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

--- title: "The election campaign on FB" author: - name: Professor Simon Jackman affiliation: University of Sydney - name: Associate Professor Andrea Carson affiliation: La Trobe University date: now date-format: "h:mmA D MMMM YYYY" format: html: theme: - cosmo - custom.scss mainfont: Avenir fontsize: 14px toc: true fig-width: 8 fig-height: 6 echo: true code-tools: true code-fold: true smooth-scroll: true self-contained: true tbl-cap-location: bottom crossref: tbl-title: Table knitr: opts_knit: echo: TRUE warnings: FALSE message: FALSE execute: keep-md: true warning: false error: false echo: true --- ```{r packages} library(tidyverse) library(here) library(fst) library(DT) library(topicmodels) library(tm) ``` # Read data A separate process pulls posts data from the Crowdtangle (CT) API. We read a "flattened" version of the `JSON` output. ```{r read-data-de-duplicate} zzz <- read_fst(here("data/parsed.fst")) zzz <- zzz %>% distinct(platformId,name,date,postUrl,score,id,.keep_all = TRUE) %>% group_by(platformId,name,date,postUrl) %>% filter(score==max(score)) %>% ungroup() %>% group_by(platformId) %>% slice(1) %>% ungroup() ## merge on some other account info load(file=here("data/accounts.RData")) zzz <- zzz %>% left_join(accounts %>% select(list_id,list_title=title,id,pageAdminTopCountry), by="id") ## merge on federal etc information fed_info <- read_csv(file=here("data/fb_federal.csv")) zzz <- zzz %>% left_join(fed_info %>% select(id,pol_type),by="id") %>% mutate(pol_type = if_else( pageCategory %in% c("POLITICAL_PARTY","POLITICAL_ORGANIZATION"), "federal", pol_type) ) ``` The data span: - `r format(nrow(zzz),big.mark=",")` unique FB posts - `r min(zzz$date)` to `r max(zzz$date)` - `r format(length(unique(zzz$id)),big.mark=",")` unique accounts/posting-entities ```{r make-political-table} political <- c("POLITICIAN", "POLITICAL_PARTY", "POLITICAL_ORGANIZATION", "POLITICAL_CANDIDATE") political_table <- zzz %>% filter(pageCategory %in% political | list_title=="Independents") %>% group_by(name) %>% mutate(n=n()) %>% slice(1) %>% ungroup() %>% select(name,handle,cat=pageCategory,pol_type,n) %>% arrange(desc(n)) ``` ## `r nrow(political_table)` political entities Posts by politicians and political parties, groups etc can be flagged with the CT `pageCategory` flag. We also include C200-supported independent candidates, C200 itself, and various "Voices of ..." Facebook accounts. We also code politicians and political entities by whether they were active in the 2022 `federal` election (we manually coded this, needs automation for scaling to broader set of political entities). ```{r} library(janitor) xt <- zzz %>% select(id,pol_type,pageCategory) %>% distinct() %>% tabyl(pageCategory, pol_type) %>% adorn_totals("row") knitr::kable(xt) ``` ```{r political-table} datatable( political_table, extensions = c("Buttons","RowGroup"), class=c("display","row_grouped"), options=list(dom="Bft", paging=FALSE, scrollY="800px", scrollCollapse=TRUE, buttons=c("copy","csv","excel") ) ) %>% formatRound(columns="n",digits=0) ``` Many of these are not Federal politicians or candidates, so we engage in some matching/filtering to lists of incumbents/candidates in the 2022 federal election. ```{r match-to-federal-politicians} ``` ```{r media-table} media <- c("BROADCASTING_MEDIA_PRODUCTION", "MAGAZINE", "MEDIA", "MEDIA_NEWS_COMPANY", "NEWS_PERSONALITY", "NEWS_SITE", "RADIO_STATION", "TOPIC_NEWSPAPER", "TV_CHANNEL", "TV_SHOW") media_table <- zzz %>% filter(pageCategory %in% media) %>% group_by(name) %>% mutate(n=n()) %>% slice(1) %>% ungroup() %>% select(name,handle,cat=pageCategory,country=pageAdminTopCountry,n) %>% arrange(desc(n)) ``` ## `r nrow(media_table)` Media entities ```{r media} datatable(media_table, extensions = c("Buttons","RowGroup"), class=c("display","row_grouped"), options=list(dom="Bft", paging=FALSE, scrollY="800px", scrollCollapse=TRUE, buttons=c("copy","csv","excel"))) %>% formatRound(columns="n",digits=0) ``` ```{r code-categories} zzz <- zzz %>% mutate(type = case_when( pageCategory %in% media ~ "media", pageCategory %in% political | list_title == "Independents" ~ "political", TRUE ~ "other") ) %>% mutate( class = case_when( pol_type=="federal" ~ "Political: Federal", type=="political" ~ "Political: Other", type=="media" ~ "Media", TRUE ~ "Other") ) ``` ## Count of posts by day ```{r,post-by-day,fig.width=9} zzz %>% mutate(d = lubridate::as_date(date), day=strftime(d,"%a"), day = if_else(day %in% c("Mon","Wed","Fri"), str_sub(day,1,1), str_sub(day,1,2)) ) %>% group_by(d) %>% summarise(n=n(),day=day[1]) %>% ungroup() %>% ggplot(., aes(x=d,y=n,label=day)) + geom_bar(stat="identity") + geom_text(color="white",size=1.33,vjust=1) + scale_x_date("",expand=c(0,0)) ``` ```{r,post-by-day-bytype,fig.width=9} zzz %>% mutate(d = lubridate::as_date(date), day=strftime(d,"%a"), day = if_else(day %in% c("Mon","Wed","Fri"), str_sub(day,1,1), str_sub(day,1,2)) ) %>% mutate( class = case_when( pol_type=="federal" ~ "Political: Federal", type=="political" ~ "Political: Other", type=="media" ~ "Media", TRUE ~ "Other") ) %>% group_by(d,class,.drop = FALSE) %>% summarise(n=n(),day=day[1]) %>% ungroup() %>% ggplot(., aes(x=d,y=n, label=day, group=class, color=class)) + geom_line() + scale_x_date("",expand=c(0,0)) ``` # Text pre-processing Tidying up and tokenizing contents of each post's `message`. ```{r tidy-and-token} library(tidytext) library(stringr) ## remove some bog standard phrases and sentences remove_reg <- "&|<|>" tidy_fb <- zzz %>% rename(text=message) %>% mutate(text = str_remove(text,pattern="Watch the full episode at www.skynews.com.au/flash")) %>% mutate(text = str_remove(text,pattern="7news at 6pm.")) %>% mutate(text = str_remove(text,pattern="More local news.*$")) %>% mutate(text = str_remove(text,pattern="Read more news online here: https://www.theage.com.au/")) %>% mutate(text = if_else(is.na(text),description,text)) %>% filter(!is.na(text)) %>% mutate(text = str_remove_all(text, remove_reg)) %>% unnest_tweets(word, text, strip_url=TRUE,strip_punct = FALSE) %>% filter(!word %in% stop_words$word, !word %in% str_remove_all(stop_words$word, "'"), str_detect(word, "[a-z]")) %>% mutate(word = str_remove(word,pattern="\\.$|,$")) ``` We remove URLs and hashtags etc from the word list: ```{r badWords-removal} badWords <- c("news","7news","watch","flash","00pm","6pm") tidy_fb <- tidy_fb %>% filter( !grepl(pattern="\\.au$",word), !grepl(pattern="^#",word), !grepl(pattern="^www",word), !grepl(pattern="[0-9]{1,}[a|p]m",word), !grepl(pattern="[0-9]{1,}st",word), !grepl(pattern="[0-9]{1,}nd",word), !grepl(pattern="[0-9]{1,}rd",word), !grepl(pattern="[0-9]{1,}th",word), !(word %in% badWords), str_count(word,pattern="\\.") < 1 ) ``` ## Stemming ```{r stemming} tidy_fb <- tidy_fb %>% mutate(word_stem=stemDocument(word)) %>% filter(word_stem !="it'") %>% mutate(word_stem = if_else(word_stem == "australian","australia",word_stem), word_stem = if_else(word_stem == "victorian","victoria",word_stem)) ``` We have `r length(unique(tidy_fb$word_stem))` unique stemmed words. We'll subset down to words appearing at least 100 times. ## Differences in word usage between posts from political and media accounts ```{r differences-in-word-usage} word_ratios <- tidy_fb %>% filter(class %in% c("Political: Federal","Media")) %>% mutate(class = if_else(class=="Political: Federal", "federal", tolower(class)) ) %>% count(class,word_stem) %>% group_by(word_stem) %>% filter(sum(n) >= 100) %>% ungroup() %>% pivot_wider(names_from = class,values_from = n,values_fill = 0) %>% mutate(across(where(is.numeric),sum,.names = "{.col}_sum")) %>% mutate(media_norm = (media+1)/(media_sum), federal_norm = (federal+1)/(federal_sum)) %>% mutate(logratio = log(federal_norm / media_norm)) %>% arrange(desc(logratio)) ``` ```{r} library(DT) datatable(word_ratios %>% select(word_stem,federal,media,logratio), rownames = FALSE) %>% formatRound(columns="logratio",2) ``` # Topic model fit to posts by federal political entities ```{r political-dtm} #| eval: true word_political <- tidy_fb %>% filter(class %in% c("Political: Federal")) %>% group_by(word_stem) %>% mutate(word_total=n()) %>% filter(word_total>=100) political_dtm <- word_political %>% unite(post,name,platformId) %>% count(post,word_stem) %>% cast_dtm(post,word_stem,n) ``` ```{r topic-model-political-entities} #| eval: true m <- LDA(political_dtm,k=6) ``` ```{r topic-model-visualise} #| eval: true m %>% tidy() %>% group_by(topic) %>% slice_max(beta, n = 10) %>% ungroup() %>% mutate(term = reorder_within(term, beta, topic)) %>% ggplot(aes(beta, term, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + scale_y_reordered() ``` ```{r} phat <- posterior(m) phat <- as_tibble(phat$topics) %>% mutate(lab = dimnames(phat$topics)[[1]]) %>% mutate(platformId = gsub(lab,pattern="^.*_(.*_.*)$",replacement="\\1")) %>% left_join(zzz %>% select(id,platformId,postUrl), by="platformId") ``` # Topic model fit to posts by media entities ```{r media-dtm} #| eval: false word_media <- tidy_fb %>% filter(type=="media") %>% group_by(word_stem) %>% mutate(word_total=n()) %>% filter(word_total>=100) media_dtm <- word_media %>% unite(post,name,platformId) %>% count(post,word_stem) %>% cast_dtm(post,word_stem,n) ``` ```{r topic-model-media-entities} #| eval: false m_media <- LDA(political_dtm,k=12) ``` ```{r} #| eval: false m_media %>% tidy() %>% group_by(topic) %>% slice_max(beta, n = 10) %>% ungroup() %>% mutate(term = reorder_within(term, beta, topic)) %>% ggplot(aes(beta, term, fill = factor(topic))) + geom_col(show.legend = FALSE) + facet_wrap(~ topic, scales = "free") + scale_y_reordered() ```