Full R script:
# NOTE: the bearer token must be set in the environment; see ?set_bearer for details
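# One-time credential setup, sketched here commented out (TWITTER_BEARER is
# the environment variable academictwitteR reads from .Renviron):
# set_bearer()   # opens .Renviron so the token can be added as TWITTER_BEARER=<token>
# get_bearer()   # after restarting R, this should return the stored token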
if (!require("academictwitteR")) install.packages("academictwitteR")
if (!require("readr")) install.packages("readr")
if (!require("tidyverse")) install.packages("tidyverse")
library(academictwitteR)
library(readr)
library(lubridate)
require(academictwitteR)
require(tibble)
library(dplyr)
library(tidyverse)
###########################################
# Scrape Congress tweets
###########################################
taddresses <- read.csv("TwitterAddressesAllStates.csv")
tidlist <- as.list(taddresses$ID)
# Zero-row accumulator with the expected (all-character) columns; each
# account's tweets are appended to it with rbind() in the loop below
FetchedTweets <- data.frame(tweet_id = character(),
                            user_username = character(),
                            text = character(),
                            author_id = character(),
                            created_at = character(),
                            conversation_id = character(),
                            user_name = character(),
                            retweet_count = character(),
                            like_count = character(),
                            quote_count = character(),
                            sourcetweet_type = character(),
                            sourcetweet_id = character(),
                            sourcetweet_text = character(),
                            sourcetweet_author_id = character(),
                            ID = character())
keepvars <- c("tweet_id",
"user_username",
"text",
"author_id",
"created_at",
"conversation_id",
"user_name",
"retweet_count",
"like_count",
"quote_count",
"sourcetweet_type",
"sourcetweet_id",
"sourcetweet_text",
"sourcetweet_author_id")
folder <- "mydata"
if (dir.exists(folder)) {
  cat("The folder already exists\n")
} else {
  dir.create(folder)
}
for (ID in tidlist) {
  # Start each account with a clean slate: remove any JSON pages left over
  # from the previous iteration (get_all_tweets() recreates the folder)
  unlink("mydata", recursive = TRUE)
  my_query <- build_query(users = ID)
  Sys.sleep(3)  # brief pause between accounts to ease rate-limit pressure
  thisfetch <- get_all_tweets(
    query = my_query,
    start_tweets = "2021-01-02T00:00:00Z",
    end_tweets = "2022-10-15T00:00:00Z",
    bearer_token = get_bearer(),
    data_path = "mydata",
    n = Inf)
  colcount <- ncol(thisfetch)
  print(colcount)
  if (colcount == 0) {
    next  # no tweets returned for this account
  } else {
    thisfetch <- bind_tweets(data_path = "mydata/",
                             output_format = "tidy")
    thisfetch <- thisfetch[keepvars]
    thisfetch$ID <- ID
    FetchedTweets <- rbind(FetchedTweets, thisfetch)
  }
}
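# Optional sanity check after the loop: how many tweets and accounts came back?
# nrow(FetchedTweets); length(unique(FetchedTweets$ID))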
# created_at is UTC (note the trailing "Z"), so parse it as UTC and then
# convert to Eastern. Use OlsonNames() for a list of available time zones.
FetchedTweets$date <- with_tz(ymd_hms(FetchedTweets$created_at), tzone = "US/Eastern")
# The API leaves sourcetweet_type NA for original tweets; make that explicit
FetchedTweets$sourcetweet_type <- FetchedTweets$sourcetweet_type %>%
  replace_na("original")
# For retweets, `text` is truncated but `sourcetweet_text` holds the full
# source tweet. Rebuild the complete text by keeping the "RT @user" prefix
# from `text` and splicing in the untruncated source tweet.
FetchedTweets <- FetchedTweets %>%
  mutate(
    pre_text = if_else(sourcetweet_type == "retweeted",
                       sub(":.*", "", text),
                       ""),
    text_new = if_else(sourcetweet_type == "retweeted",
                       paste(pre_text, sourcetweet_text, sep = ": "),
                       text)
  )
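# For example (hypothetical tweet), a truncated
#   text             = "RT @colleague: We just passed the..."
#   sourcetweet_text = "We just passed the bill after months of work."
# yields
#   text_new         = "RT @colleague: We just passed the bill after months of work."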
Congress <- merge(FetchedTweets, taddresses, by = "ID")
Congress$URL <- paste0("https://twitter.com/user/status/", Congress$tweet_id)
Congress$text <- Congress$text_new
Congress <- subset(Congress,
                   select = -c(sourcetweet_text,
                               pre_text,
                               text_new,
                               full_name))
# Cleanup
rm(FetchedTweets,
   taddresses,
   thisfetch,
   tidlist,
   colcount,
   folder,
   ID,
   keepvars,
   my_query)
# Characterizing retweets (computed for manual inspection, then discarded)
Congress %>%
  group_by(party, sourcetweet_type) %>%
  summarise(n = n())
Counts <- Congress %>%
  group_by(party, user_name, sourcetweet_type) %>%
  summarise(n = n())
Retweets <- Congress[Congress$sourcetweet_type == "retweeted", ]
Retweet_sample <- slice_sample(Retweets, n = 50)  # sample_n() is superseded
rm(Counts, Retweet_sample, Retweets)
#Saving Congress dataframe as a .csv
write_csv(Congress,"Congress.csv")
###########################################
# Scrape media tweets
###########################################
taddresses <- read.csv("MediaTwitterSources.csv")
tidlist <- as.list(taddresses$ID)
# Zero-row accumulator, as in the Congress section above
FetchedTweets <- data.frame(tweet_id = character(),
                            user_username = character(),
                            text = character(),
                            author_id = character(),
                            created_at = character(),
                            conversation_id = character(),
                            user_name = character(),
                            retweet_count = character(),
                            like_count = character(),
                            quote_count = character(),
                            sourcetweet_type = character(),
                            sourcetweet_id = character(),
                            sourcetweet_text = character(),
                            sourcetweet_author_id = character(),
                            ID = character())
keepvars <- c("tweet_id",
"user_username",
"text",
"author_id",
"created_at",
"conversation_id",
"user_name",
"retweet_count",
"like_count",
"quote_count",
"sourcetweet_type",
"sourcetweet_id",
"sourcetweet_text",
"sourcetweet_author_id")
folder <- "mydata"
if (dir.exists(folder)) {
  cat("The folder already exists\n")
} else {
  dir.create(folder)
}
for (ID in tidlist) {
  unlink("mydata", recursive = TRUE)  # clear pages from the previous account
  my_query <- build_query(users = ID)
  Sys.sleep(3)  # brief pause between accounts to ease rate-limit pressure
  thisfetch <- get_all_tweets(
    query = my_query,
    start_tweets = "2021-01-02T00:00:00Z",
    end_tweets = "2022-10-15T00:00:00Z",
    bearer_token = get_bearer(),
    data_path = "mydata",
    n = Inf)
  colcount <- ncol(thisfetch)
  print(colcount)
  if (colcount == 0) {
    next  # no tweets returned for this account
  } else {
    thisfetch <- bind_tweets(data_path = "mydata/",
                             output_format = "tidy")
    thisfetch <- thisfetch[keepvars]
    thisfetch$ID <- ID
    FetchedTweets <- rbind(FetchedTweets, thisfetch)
  }
}
# As above: created_at is UTC, so parse as UTC and convert to Eastern
FetchedTweets$date <- with_tz(ymd_hms(FetchedTweets$created_at), tzone = "US/Eastern")
FetchedTweets$sourcetweet_type <- FetchedTweets$sourcetweet_type %>%
  replace_na("original")
# Rebuild full retweet text, as in the Congress section
FetchedTweets <- FetchedTweets %>%
  mutate(
    pre_text = if_else(sourcetweet_type == "retweeted",
                       sub(":.*", "", text),
                       ""),
    text_new = if_else(sourcetweet_type == "retweeted",
                       paste(pre_text, sourcetweet_text, sep = ": "),
                       text)
  )
Media <- merge(FetchedTweets, taddresses, by = "ID")
Media$URL <- paste0("https://twitter.com/user/status/", Media$tweet_id)
Media$text <- Media$text_new
Media <- subset(Media,
                select = -c(sourcetweet_text,
                            pre_text,
                            text_new,
                            full_name))
write_csv(Media,"MediaTweets.csv")
# Cleanup
rm(FetchedTweets,
   taddresses,
   thisfetch,
   tidlist,
   colcount,
   folder,
   ID,
   keepvars,
   my_query)
# Characterizing retweets (computed for manual inspection, then discarded)
Media %>%
  group_by(party, sourcetweet_type) %>%
  summarise(n = n())
Counts <- Media %>%
  group_by(party, user_name, sourcetweet_type) %>%
  summarise(n = n())
Retweets <- Media[Media$sourcetweet_type == "retweeted", ]
Retweet_sample <- slice_sample(Retweets, n = 50)
rm(Counts, Retweet_sample, Retweets)
# Saving the Media data frame as a .csv
write_csv(Media,"Media.csv")
# Getting data on retweeted users.
# A single bulk call frequently hits the rate limit; a throttled
# alternative is sketched below the call.
userlist <- unique(na.omit(Congress$sourcetweet_author_id))
userinfo <- get_user_profile(userlist, bearer_token = get_bearer())
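# If the bulk call above hits the rate limit, one option is to query in
# batches and pause between them (a sketch, not run here; the batch size of
# 100 ids and the 5-second pause are assumptions to tune):
# chunks <- split(userlist, ceiling(seq_along(userlist) / 100))
# userinfo <- purrr::map_dfr(chunks, function(ids) {
#   Sys.sleep(5)  # pause between batches
#   get_user_profile(ids, bearer_token = get_bearer())
# })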
vars2keep <- c("id","username", "name", "description")
retweetinfo <- userinfo[vars2keep]
retweetinfo <- retweetinfo %>%
rename(sourcetweet_author_id = id,
rt_username = username,
rt_name = name,
rt_description = description)
retweetinfo <- retweetinfo %>%
distinct(sourcetweet_author_id,
.keep_all = TRUE)
write_csv(retweetinfo,"Info_on_retweeted_users.csv")
if (!require("data.table")) install.packages("data.table")
library(data.table)
mydt <- setDT(Congress)
mylookup_dt <- setDT(retweetinfo)
joined_dt1 <- merge(mydt, mylookup_dt,
                    by = "sourcetweet_author_id",
                    all.x = TRUE, all.y = FALSE)
Congress <- as.data.frame(joined_dt1)
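# The equivalent left join with dplyr (row and column order may differ):
# Congress <- dplyr::left_join(Congress, retweetinfo, by = "sourcetweet_author_id")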
rm(userinfo,
   userlist,
   vars2keep,
   mydt,
   mylookup_dt,
   joined_dt1,
   retweetinfo)
retweeted_by_username <- Congress %>%
  group_by(rt_username, rt_name, rt_description) %>%
  summarise(n = n())
retweeted_by_party <- Congress %>%
  group_by(party, rt_username, rt_name, rt_description) %>%
  summarise(n = n())
write_csv(retweeted_by_party,"retweeted_by_party.csv")
rm(retweeted_by_party,
   retweeted_by_username)
###########################################
# Scrape partisan-preferred media tweets
###########################################
taddresses <- read.csv("PartisanPreferredMedia.csv")
tidlist <- as.list(taddresses$ID)
# Zero-row accumulator, as in the sections above
FetchedTweets <- data.frame(tweet_id = character(),
                            user_username = character(),
                            text = character(),
                            author_id = character(),
                            created_at = character(),
                            conversation_id = character(),
                            user_name = character(),
                            retweet_count = character(),
                            like_count = character(),
                            quote_count = character(),
                            sourcetweet_type = character(),
                            sourcetweet_id = character(),
                            sourcetweet_text = character(),
                            sourcetweet_author_id = character(),
                            ID = character())
keepvars <- c("tweet_id",
"user_username",
"text",
"author_id",
"created_at",
"conversation_id",
"user_name",
"retweet_count",
"like_count",
"quote_count",
"sourcetweet_type",
"sourcetweet_id",
"sourcetweet_text",
"sourcetweet_author_id")
folder <- "mydata"
if (dir.exists(folder)) {
  cat("The folder already exists\n")
} else {
  dir.create(folder)
}
for (ID in tidlist) {
  unlink("mydata", recursive = TRUE)  # clear pages from the previous account
  my_query <- build_query(users = ID)
  Sys.sleep(3)  # brief pause between accounts to ease rate-limit pressure
  thisfetch <- get_all_tweets(
    query = my_query,
    start_tweets = "2021-01-02T00:00:00Z",
    end_tweets = "2022-10-15T00:00:00Z",
    bearer_token = get_bearer(),
    data_path = "mydata",
    n = Inf)
  colcount <- ncol(thisfetch)
  print(colcount)
  if (colcount == 0) {
    next  # no tweets returned for this account
  } else {
    thisfetch <- bind_tweets(data_path = "mydata/",
                             output_format = "tidy")
    thisfetch <- thisfetch[keepvars]
    thisfetch$ID <- ID
    FetchedTweets <- rbind(FetchedTweets, thisfetch)
  }
}
# As above: created_at is UTC, so parse as UTC and convert to Eastern
FetchedTweets$date <- with_tz(ymd_hms(FetchedTweets$created_at), tzone = "US/Eastern")
FetchedTweets$sourcetweet_type <- FetchedTweets$sourcetweet_type %>%
  replace_na("original")
# Rebuild full retweet text, as in the Congress section
FetchedTweets <- FetchedTweets %>%
  mutate(
    pre_text = if_else(sourcetweet_type == "retweeted",
                       sub(":.*", "", text),
                       ""),
    text_new = if_else(sourcetweet_type == "retweeted",
                       paste(pre_text, sourcetweet_text, sep = ": "),
                       text)
  )
PPMedia <- merge(FetchedTweets, taddresses, by = "ID")
PPMedia$URL <- paste0("https://twitter.com/user/status/", PPMedia$tweet_id)
PPMedia$text <- PPMedia$text_new
PPMedia <- subset(PPMedia,
                  select = -c(sourcetweet_text,
                              pre_text,
                              text_new,
                              full_name))
write_csv(PPMedia,"PartisanPreferredMediaTweets.csv")
# Cleanup
rm(FetchedTweets,
   taddresses,
   thisfetch,
   tidlist,
   colcount,
   folder,
   ID,
   keepvars,
   my_query)
##################################################
# TF-IDF analysis
# Packages and libraries
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("dplyr")) install.packages("dplyr")
if (!require("readr")) install.packages("readr")
if (!require("tidytext")) install.packages("tidytext")
library(ggplot2)
library(dplyr)
library(readr)
library(tidytext)
library(stringr) # Part of the tidyverse package
# Tokenizing by word and counting word frequencies by party
tidy_text <- Congress %>%
  unnest_tokens(word, text) %>%
  count(party, word, sort = TRUE)
# Total words by party
total_words <- tidy_text %>%
  group_by(party) %>%
  summarize(total = sum(n))
# Adding the total-words column to the token counts
tidy_text <- left_join(tidy_text, total_words, by = "party")
# Deleting data frames that are no longer needed
rm("total_words")
# TF-IDF
tidy_text_tf_idf <- tidy_text %>%
  bind_tf_idf(word, party, n) %>%
  arrange(desc(tf_idf))
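# For reference, bind_tf_idf() treats each party as a "document" and computes
#   tf(w, p) = n(w, p) / total words tweeted by p
#   idf(w)   = ln(number of parties / number of parties whose tweets use w)
#   tf_idf   = tf * idf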
# Graphing high tf-idf words by party
library(forcats)
tidy_text_tf_idf %>%
  group_by(party) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = party)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~party, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
rm("tidy_text")
##################################################
# Combining the Congress and PPMedia data frames. Congress gained rt_*
# columns from the retweeted-user join above, so the media frames need the
# same columns (all NA) before rbind() can match them up
PPMedia['rt_username'] <- NA
PPMedia['rt_name'] <- NA
PPMedia['rt_description'] <- NA
CongressAndPPM <- rbind(Congress, PPMedia)
# Combining the Congress and Media data frames
Media['rt_username'] <- NA
Media['rt_name'] <- NA
Media['rt_description'] <- NA
CongressAndMedia <- rbind(Congress, Media)
### Using the grepl function to flag topic mentions ###
# Each pattern must stay on one line: a line break inside the string would
# embed a literal newline in one of the regex alternatives
imgterms <- "border|bidenbordercrisis|immigra|illegal aliens|alien"
CongressAndPPM$immigration <- ifelse(grepl(imgterms,
                                           CongressAndPPM$text,
                                           ignore.case = TRUE), 1, 0)
rm("imgterms")
abterms <- "abort|reproductive rights|unborn|right to life
|right to choose|roe v"
CongressAndPPM$abortion <- ifelse(grepl(abterms,
CongressAndPPM$text,
ignore.case = TRUE),1,0)
rm("abterms")
econterms <- "inflation|rising prices|economic|economy"
CongressAndPPM$economy <- ifelse(grepl(econterms,
CongressAndPPM$text,
ignore.case = TRUE),1,0)
rm("econterms")
Trumpterms <- "Trump|Donald Trump|MAGA"
CongressAndPPM$Trump <- ifelse(grepl(Trumpterms,
CongressAndPPM$text,
ignore.case = FALSE),1,0)
rm("Trumpterms")
Bidenterms <- "Biden|Joe Biden|Uncle Joe"
CongressAndPPM$Biden <- ifelse(grepl(Bidenterms,
CongressAndPPM$text,
ignore.case = TRUE),1,0)
rm("Bidenterms")
### Rounding dates off to the week ###
CongressAndPPM$Week <- round_date(
  CongressAndPPM$date,
  unit = "week",
  week_start = getOption("lubridate.week.start", 1)  # weeks start on Monday
)
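# Note: round_date() snaps each timestamp to the *nearest* Monday, so
# late-week tweets land in the following week. To bin each tweet with the
# Monday that precedes it, use floor_date() instead:
# CongressAndPPM$Week <- floor_date(CongressAndPPM$date, unit = "week", week_start = 1)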
### Getting weekly tweet counts by party ###
TweetsByPartyAndWeek <- CongressAndPPM %>%
  group_by(Week, party) %>%
  summarize(TweetCount = n()) %>%
  pivot_wider(
    names_from = "party",
    values_from = c("TweetCount"))
# Independents are not used in the percentage calculations below
TweetsByPartyAndWeek <- subset(TweetsByPartyAndWeek,
                               select = -c(Independent))
### Pivoting and collapsing the data ###
Data_Wide <- CongressAndPPM %>%
  pivot_wider(
    names_from = "party",
    values_from = c("immigration", "abortion", "economy", "Trump", "Biden"))
# Weekly topic-mention counts for each group: Republican and Democratic
# members plus the DPO/RPO partisan-preferred outlet categories
Data_Wide_Coll <- Data_Wide %>%
  group_by(Week) %>%
  summarize(Immigration_R = sum(immigration_Republican, na.rm = TRUE),
            Immigration_D = sum(immigration_Democrat, na.rm = TRUE),
            Immigration_DPO = sum(immigration_DPO, na.rm = TRUE),
            Immigration_RPO = sum(immigration_RPO, na.rm = TRUE),
            Abortion_R = sum(abortion_Republican, na.rm = TRUE),
            Abortion_D = sum(abortion_Democrat, na.rm = TRUE),
            Abortion_DPO = sum(abortion_DPO, na.rm = TRUE),
            Abortion_RPO = sum(abortion_RPO, na.rm = TRUE),
            Economy_R = sum(economy_Republican, na.rm = TRUE),
            Economy_D = sum(economy_Democrat, na.rm = TRUE),
            Economy_DPO = sum(economy_DPO, na.rm = TRUE),
            Economy_RPO = sum(economy_RPO, na.rm = TRUE),
            DTrump_R = sum(Trump_Republican, na.rm = TRUE),
            DTrump_D = sum(Trump_Democrat, na.rm = TRUE),
            DTrump_DPO = sum(Trump_DPO, na.rm = TRUE),
            DTrump_RPO = sum(Trump_RPO, na.rm = TRUE),
            JBiden_R = sum(Biden_Republican, na.rm = TRUE),
            JBiden_D = sum(Biden_Democrat, na.rm = TRUE),
            JBiden_DPO = sum(Biden_DPO, na.rm = TRUE),
            JBiden_RPO = sum(Biden_RPO, na.rm = TRUE),
            TweetCount = n())
# Converting counts to percentages of each group's weekly tweet volume
FinalData <- merge(Data_Wide_Coll, TweetsByPartyAndWeek, by = c("Week")) %>%
  mutate(Immigration_R_Pct = (Immigration_R / Republican) * 100,
         Immigration_D_Pct = (Immigration_D / Democrat) * 100,
         Immigration_DPO_Pct = (Immigration_DPO / DPO) * 100,
         Immigration_RPO_Pct = (Immigration_RPO / RPO) * 100,
         Abortion_R_Pct = (Abortion_R / Republican) * 100,
         Abortion_D_Pct = (Abortion_D / Democrat) * 100,
         Abortion_DPO_Pct = (Abortion_DPO / DPO) * 100,
         Abortion_RPO_Pct = (Abortion_RPO / RPO) * 100,
         Economy_R_Pct = (Economy_R / Republican) * 100,
         Economy_D_Pct = (Economy_D / Democrat) * 100,
         Economy_DPO_Pct = (Economy_DPO / DPO) * 100,
         Economy_RPO_Pct = (Economy_RPO / RPO) * 100,
         DTrump_R_Pct = (DTrump_R / Republican) * 100,
         DTrump_D_Pct = (DTrump_D / Democrat) * 100,
         DTrump_DPO_Pct = (DTrump_DPO / DPO) * 100,
         DTrump_RPO_Pct = (DTrump_RPO / RPO) * 100,
         JBiden_R_Pct = (JBiden_R / Republican) * 100,
         JBiden_D_Pct = (JBiden_D / Democrat) * 100,
         JBiden_DPO_Pct = (JBiden_DPO / DPO) * 100,
         JBiden_RPO_Pct = (JBiden_RPO / RPO) * 100)
write_csv(FinalData,"FinalData_PPM.csv")
# Cleanup
rm("Data_Wide",
"Data_Wide_Coll",
"TweetsByPartyAndWeek")