The election campaign on Facebook

Authors and affiliations

Professor Simon Jackman, University of Sydney
Associate Professor Andrea Carson, La Trobe University

Published

2:19PM 2 July 2022

Code
library(tidyverse)
library(here)
library(fst)
library(DT)
library(topicmodels)
library(tm)

Read data

A separate process pulls posts data from the CrowdTangle (CT) API. Here we read a “flattened” version of the JSON output.
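That upstream pull-and-flatten step is not part of this document; roughly, and with hypothetical file paths, it looks like the sketch below, which produces the parsed.fst file read in the next chunk. The result$posts nesting reflects our understanding of how the CT posts response is structured when saved verbatim.

Code
## minimal sketch of the flatten step (hypothetical input path; assumes the raw
## CT posts response was saved as JSON, with posts nested under result$posts)
library(jsonlite)
library(dplyr)
library(here)
library(fst)

raw   <- fromJSON(here("data/ct_posts.json"), flatten = TRUE)
posts <- as_tibble(raw$result$posts)        ## flatten=TRUE unnests nested data-frame columns (e.g. post statistics)
write_fst(posts, here("data/parsed.fst"))   ## the file read in the chunk below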

Code
zzz <- read_fst(here("data/parsed.fst"))

## de-duplicate: keep a single row per post (platformId),
## preferring the highest-scoring duplicate
zzz <- zzz %>% 
  distinct(platformId,name,date,postUrl,score,id,.keep_all = TRUE) %>% 
  group_by(platformId,name,date,postUrl) %>% 
  filter(score==max(score)) %>% 
  ungroup() %>% 
  group_by(platformId) %>% 
  slice(1) %>% 
  ungroup()


## merge on some other account info
load(file=here("data/accounts.RData"))
zzz <- zzz %>% 
  left_join(accounts %>%
              select(list_id,list_title=title,id,pageAdminTopCountry),
            by="id")

## merge on federal etc information
fed_info <- read_csv(file=here("data/fb_federal.csv"))
zzz <- zzz %>% 
  left_join(fed_info %>% select(id,pol_type),by="id") %>% 
  mutate(pol_type = if_else(
    pageCategory %in% c("POLITICAL_PARTY","POLITICAL_ORGANIZATION"),
    "federal",
    pol_type)
  )

The data span:

  • 250,555 unique FB posts
  • 2022-01-01 00:00:00 to 2022-05-21 23:59:45
  • 620 unique accounts/posting-entities
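These figures can be recomputed directly from the de-duplicated posts; a minimal check, assuming accounts are keyed on the CT account id:

Code
## summary figures quoted above
zzz %>% 
  summarise(posts    = n_distinct(platformId),
            first    = min(date),
            last     = max(date),
            accounts = n_distinct(id))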
Code
political <- c("POLITICIAN",
               "POLITICAL_PARTY",
               "POLITICAL_ORGANIZATION",
               "POLITICAL_CANDIDATE")

political_table <- zzz %>% 
    filter(pageCategory %in% political | list_title=="Independents") %>% 
    group_by(name) %>%
    mutate(n=n()) %>%
    slice(1) %>% 
    ungroup() %>% 
    select(name,handle,cat=pageCategory,pol_type,n) %>% 
    arrange(desc(n))

372 political entities

Posts by politicians, political parties, political groups etc. can be flagged using the CT pageCategory field. We also include C200-supported independent candidates, C200 itself, and various “Voices of …” Facebook accounts.

We also code politicians and political entities by whether they were active in the 2022 federal election. This coding was done manually; it would need to be automated to scale to a broader set of political entities.

Code
library(janitor)
xt <- zzz %>%
  select(id,pol_type,pageCategory) %>%
  distinct() %>%
  tabyl(pageCategory, pol_type) %>%
  adorn_totals("row") 
knitr::kable(xt)
pageCategory federal former state NA_
ARMED_FORCES 0 0 0 1
ART_GALLERY 0 0 0 1
ART_MUSEUM 0 0 0 1
ARTIST 0 0 0 1
AUDIOLOGIST 0 0 0 1
BLOGGER 0 0 0 1
BOTANICAL_GARDEN 0 0 0 1
BROADCASTING_MEDIA_PRODUCTION 0 0 0 6
BUSINESS_CONSULTANT 0 0 0 1
COMMUNITY 1 0 0 7
COMMUNITY_ORGANIZATION 4 0 0 2
CONSULATE_EMBASSY 0 0 0 15
CULTURAL_CENTER 0 0 0 1
EDU_SITE 0 0 0 1
EDUCATION_COMPANY 0 0 0 3
ENTERTAINMENT_SITE 0 0 0 1
ENTREPRENEUR 0 0 0 1
FESTIVAL 0 0 0 1
GOVERNMENT_OFFICIAL 0 0 0 13
GOVERNMENT_ORGANIZATION 0 0 0 79
HISTORY_MUSEUM 0 0 0 2
KIDS_SITE 0 0 0 1
LAW_ENFORCEMENT 0 0 0 1
LOCAL 0 0 0 1
MAGAZINE 0 0 0 1
MEDIA 0 0 0 3
MEDIA_NEWS_COMPANY 0 0 0 14
NEWS_PERSONALITY 0 0 0 1
NEWS_SITE 0 0 0 13
NGO 1 0 0 0
NON_PROFIT 0 0 0 3
PERSON 3 0 0 42
POLITICAL_CANDIDATE 6 0 3 0
POLITICAL_ORGANIZATION 8 0 0 0
POLITICAL_PARTY 23 0 0 0
POLITICIAN 169 14 139 0
PUBLIC_SERVICES_GOVERNMENT 0 0 0 4
RADIO_STATION 0 0 0 1
REGIONAL_SITE 0 0 0 1
SCIENCE_MUSEUM 0 0 0 1
TOPIC_ARTS_ENTERTAINMENT 0 0 0 1
TOPIC_ISLAND 0 0 0 1
TOPIC_LANDMARK 0 0 0 1
TOPIC_LIBRARY 0 0 0 2
TOPIC_MUSEUM 0 0 0 2
TOPIC_NEWSPAPER 0 0 0 7
TV_CHANNEL 0 0 0 1
TV_SHOW 0 0 0 1
UNIVERSITY 0 0 0 1
VISUAL_ARTS 0 0 0 1
WEBSITE 0 0 0 2
YOUTH_ORGANIZATION 0 0 0 1
NA 1 0 0 0
Total 216 14 142 248
Code
datatable(
  political_table,
  extensions = c("Buttons","RowGroup"),
  class=c("display","row_grouped"),
  options=list(dom="Bft",
               paging=FALSE,
               scrollY="800px",
               scrollCollapse=TRUE,
               buttons=c("copy","csv","excel")
               )
  ) %>%
  formatRound(columns="n",digits=0)

Many of these are not federal politicians or candidates, so we match and filter against lists of incumbents and candidates in the 2022 federal election.
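A sketch of that matching step, assuming a hand-assembled lookup of 2022 incumbents and candidates keyed on the CT account id; the file name and its contents are hypothetical, not the actual lists used.

Code
## hypothetical lookup: one row per CT account id contesting (or holding) a federal seat in 2022
candidates_2022 <- read_csv(here("data/federal_candidates_2022.csv"))

federal_entities <- zzz %>% 
  filter(pageCategory %in% political | list_title == "Independents") %>% 
  semi_join(candidates_2022, by = "id") %>%   ## keep only accounts on the federal list
  distinct(id, name)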

Code
media <- c("BROADCASTING_MEDIA_PRODUCTION",
           "MAGAZINE",
           "MEDIA",
           "MEDIA_NEWS_COMPANY",
           "NEWS_PERSONALITY",
           "NEWS_SITE",
           "RADIO_STATION",
           "TOPIC_NEWSPAPER",
           "TV_CHANNEL",
           "TV_SHOW")

media_table <- zzz %>% 
    filter(pageCategory %in% media) %>% 
     group_by(name) %>%
    mutate(n=n()) %>%
    slice(1) %>% 
    ungroup() %>% 
    select(name,handle,cat=pageCategory,country=pageAdminTopCountry,n) %>% 
    arrange(desc(n))

48 media entities

Code
datatable(media_table,
          extensions = c("Buttons","RowGroup"),
          class=c("display","row_grouped"),
          options=list(dom="Bft",
                       paging=FALSE,
                       scrollY="800px",
                       scrollCollapse=TRUE,
                       buttons=c("copy","csv","excel"))) %>%
  formatRound(columns="n",digits=0)
Code
zzz <- zzz %>% 
  mutate(type = case_when(
    pageCategory %in% media ~ "media",
    pageCategory %in% political | list_title == "Independents" ~ "political",
    TRUE ~ "other")
  ) %>% 
  mutate(
    class = case_when(
      pol_type=="federal" ~ "Political: Federal",
      type=="political" ~ "Political: Other",
      type=="media" ~ "Media",
      TRUE ~ "Other")
  ) 

Count of posts by day

Code
zzz %>% 
  mutate(d = lubridate::as_date(date),
         day=strftime(d,"%a"),
         day = if_else(day %in% c("Mon","Wed","Fri"),
                       str_sub(day,1,1),
                       str_sub(day,1,2))
         ) %>% 
  group_by(d) %>% 
  summarise(n=n(),day=day[1]) %>% 
  ungroup() %>% 
  ggplot(.,
         aes(x=d,y=n,label=day)) + 
  geom_bar(stat="identity") + 
  geom_text(color="white",size=1.33,vjust=1) + 
  scale_x_date("",expand=c(0,0))

Code
zzz %>% 
  mutate(d = lubridate::as_date(date),
         day=strftime(d,"%a"),
         day = if_else(day %in% c("Mon","Wed","Fri"),
                       str_sub(day,1,1),
                       str_sub(day,1,2))
         ) %>% 
  mutate(
    class = case_when(
      pol_type=="federal" ~ "Political: Federal",
      type=="political" ~ "Political: Other",
      type=="media" ~ "Media",
      TRUE ~ "Other")
    ) %>% 
  group_by(d,class,.drop = FALSE) %>% 
  summarise(n=n(),day=day[1]) %>% 
  ungroup() %>% 
  ggplot(.,
         aes(x=d,y=n,
             label=day,
             group=class,
             color=class)) + 
  geom_line() + 
  scale_x_date("",expand=c(0,0))

Text pre-processing

Tidying up and tokenizing the contents of each post’s message.

Code
library(tidytext)
library(stringr)

## remove some bog standard phrases and sentences

remove_reg <- "&amp;|&lt;|&gt;"
tidy_fb <- zzz %>%
  rename(text=message) %>% 
  mutate(text = str_remove(text,pattern="Watch the full episode at www.skynews.com.au/flash")) %>% 
  mutate(text = str_remove(text,pattern="7news at 6pm.")) %>% 
  mutate(text = str_remove(text,pattern="More local news.*$")) %>% 
  mutate(text = str_remove(text,pattern="Read more news online here: https://www.theage.com.au/")) %>% 
  mutate(text = if_else(is.na(text),description,text)) %>% 
  filter(!is.na(text)) %>% 
  mutate(text = str_remove_all(text, remove_reg)) %>%
  unnest_tweets(word, text, strip_url=TRUE,strip_punct = FALSE) %>%
  filter(!word %in% stop_words$word,
         !word %in% str_remove_all(stop_words$word, "'"),
         str_detect(word, "[a-z]")) %>% 
  mutate(word = str_remove(word,pattern="\\.$|,$"))

We remove URLs and hashtags etc from the word list:

Code
badWords <- c("news","7news","watch","flash","00pm","6pm")
tidy_fb <- tidy_fb %>% 
  filter(
    !grepl(pattern="\\.au$",word),
    !grepl(pattern="^#",word),
    !grepl(pattern="^www",word),
    !grepl(pattern="[0-9]{1,}[a|p]m",word),
    !grepl(pattern="[0-9]{1,}st",word),
    !grepl(pattern="[0-9]{1,}nd",word),
    !grepl(pattern="[0-9]{1,}rd",word),
    !grepl(pattern="[0-9]{1,}th",word),
    !(word %in% badWords),
    str_count(word,pattern="\\.") < 1
  )

Stemming

Code
tidy_fb <- tidy_fb %>% 
  mutate(word_stem=stemDocument(word)) %>% 
  filter(word_stem !="it'") %>% 
  mutate(word_stem = if_else(word_stem == "australian","australia",word_stem),
         word_stem = if_else(word_stem == "victorian","victoria",word_stem))

We have 92,842 unique stemmed words. We’ll subset down to words appearing at least 100 times.
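The frequency cut is applied inline in the chunks below; as a standalone step it would look like:

Code
## unique stems, and the subset appearing at least 100 times
n_distinct(tidy_fb$word_stem)

common_stems <- tidy_fb %>% 
  count(word_stem, name = "word_total") %>% 
  filter(word_total >= 100)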

Differences in word usage between posts from political and media accounts

Code
word_ratios <- tidy_fb %>%
  filter(class %in% c("Political: Federal","Media")) %>%
  mutate(class = if_else(class=="Political: Federal",
                         "federal",
                         tolower(class))
         ) %>% 
  count(class,word_stem) %>% 
  group_by(word_stem) %>% 
  filter(sum(n) >= 100) %>% 
  ungroup() %>% 
  pivot_wider(names_from = class,values_from = n,values_fill = 0) %>% 
  mutate(across(where(is.numeric),sum,.names = "{.col}_sum")) %>%
  mutate(media_norm = (media+1)/(media_sum),
         federal_norm = (federal+1)/(federal_sum)) %>% 
  mutate(logratio = log(federal_norm / media_norm)) %>%
  arrange(desc(logratio))
Code
library(DT)
datatable(word_ratios %>%
            select(word_stem,federal,media,logratio),
          rownames = FALSE) %>%
  formatRound(columns="logratio",2)

Topic model fit to posts by federal political entities

Code
word_political <- tidy_fb %>% 
  filter(class %in% c("Political: Federal")) %>%
  group_by(word_stem) %>% 
  mutate(word_total=n()) %>% 
  filter(word_total>=100)

political_dtm <- word_political %>% 
  unite(post,name,platformId) %>% 
  count(post,word_stem) %>% 
  cast_dtm(post,word_stem,n)
Code
m <- LDA(political_dtm,k=6)
Code
m %>%
  tidy() %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()

Code
phat <- posterior(m)
phat <- as_tibble(phat$topics) %>% 
  mutate(lab = dimnames(phat$topics)[[1]]) %>% 
  mutate(platformId = gsub(lab,pattern="^.*_(.*_.*)$",replacement="\\1")) %>% 
  left_join(zzz %>% 
              select(id,platformId,postUrl),
            by="platformId")

Topic model fit to posts by media entities

Code
word_media <- tidy_fb %>% 
  filter(type=="media") %>% 
  group_by(word_stem) %>% 
  mutate(word_total=n()) %>% 
  filter(word_total>=100)

media_dtm <- word_media %>% 
  unite(post,name,platformId) %>% 
  count(post,word_stem) %>% 
  cast_dtm(post,word_stem,n)
Code
m_media <- LDA(media_dtm,k=12)
Code
m_media %>%
  tidy() %>%
  group_by(topic) %>%
  slice_max(beta, n = 10) %>%
  ungroup() %>%
  mutate(term = reorder_within(term, beta, topic)) %>%
  ggplot(aes(beta, term, fill = factor(topic))) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~ topic, scales = "free") +
  scale_y_reordered()