Full R script:
# NOTE: the bearer token must be set in the environment; see ?set_bearer for details
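# One-time credential setup, sketched here commented out (TWITTER_BEARER is
# the environment variable academictwitteR reads from .Renviron):
# set_bearer()   # opens .Renviron so the token can be added as TWITTER_BEARER=<token>
# get_bearer()   # after restarting R, this should return the stored token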
if (!require("academictwitteR")) install.packages("academictwitteR")
if (!require("readr")) install.packages("readr")
if (!require("tidyverse")) install.packages("tidyverse")
library(academictwitteR)
library(readr)
library(lubridate)
require(academictwitteR)
require(tibble)
library(dplyr)
library(tidyverse)
###########################################
# Scrape Congress tweets
###########################################
taddresses <- read.csv("TwitterAddressesAllStates.csv")
tidlist <- as.list(taddresses$ID)
# Zero-row accumulator with the expected (all-character) columns; each
# account's tweets are appended to it with rbind() in the loop below
FetchedTweets <- data.frame(tweet_id = character(),
                            user_username = character(),
                            text = character(),
                            author_id = character(),
                            created_at = character(),
                            conversation_id = character(),
                            user_name = character(),
                            retweet_count = character(),
                            like_count = character(),
                            quote_count = character(),
                            sourcetweet_type = character(),
                            sourcetweet_id = character(),
                            sourcetweet_text = character(),
                            sourcetweet_author_id = character(),
                            ID = character())
keepvars <- c("tweet_id",
"user_username",
"text",
"author_id",
"created_at",
"conversation_id",
"user_name",
"retweet_count",
"like_count",
"quote_count",
"sourcetweet_type",
"sourcetweet_id",
"sourcetweet_text",
"sourcetweet_author_id")
folder <- "mydata"
if (dir.exists(folder)) {
  cat("The folder already exists\n")
} else {
  dir.create(folder)
}
for (ID in tidlist) {
  # Start each account with a clean slate: remove any JSON pages left over
  # from the previous iteration (get_all_tweets() recreates the folder)
  unlink("mydata", recursive = TRUE)
  my_query <- build_query(users = ID)
  Sys.sleep(3)  # brief pause between accounts to ease rate-limit pressure
  thisfetch <- get_all_tweets(
    query = my_query,
    start_tweets = "2021-01-02T00:00:00Z",
    end_tweets = "2022-10-15T00:00:00Z",
    bearer_token = get_bearer(),
    data_path = "mydata",
    n = Inf)
  colcount <- ncol(thisfetch)
  print(colcount)
  if (colcount == 0) {
    next  # no tweets returned for this account
  } else {
    thisfetch <- bind_tweets(data_path = "mydata/",
                             output_format = "tidy")
    thisfetch <- thisfetch[keepvars]
    thisfetch$ID <- ID
    FetchedTweets <- rbind(FetchedTweets, thisfetch)
  }
}
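# Optional sanity check after the loop: how many tweets and accounts came back?
# nrow(FetchedTweets); length(unique(FetchedTweets$ID))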
# created_at is UTC (note the trailing "Z"), so parse it as UTC and then
# convert to Eastern. Use OlsonNames() for a list of available time zones.
FetchedTweets$date <- with_tz(ymd_hms(FetchedTweets$created_at), tzone = "US/Eastern")
# The API leaves sourcetweet_type NA for original tweets; make that explicit
FetchedTweets$sourcetweet_type <- FetchedTweets$sourcetweet_type %>%
  replace_na("original")
# For retweets, `text` is truncated but `sourcetweet_text` holds the full
# source tweet. Rebuild the complete text by keeping the "RT @user" prefix
# from `text` and splicing in the untruncated source tweet.
FetchedTweets <- FetchedTweets %>%
  mutate(
    pre_text = if_else(sourcetweet_type == "retweeted",
                       sub(":.*", "", text),
                       ""),
    text_new = if_else(sourcetweet_type == "retweeted",
                       paste(pre_text, sourcetweet_text, sep = ": "),
                       text)
  )
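# For example (hypothetical tweet), a truncated
#   text             = "RT @colleague: We just passed the..."
#   sourcetweet_text = "We just passed the bill after months of work."
# yields
#   text_new         = "RT @colleague: We just passed the bill after months of work."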
Congress <- merge(FetchedTweets, taddresses, by = "ID")
Congress$URL <- paste0("https://twitter.com/user/status/", Congress$tweet_id)
Congress$text <- Congress$text_new
Congress <- subset(Congress,
                   select = -c(sourcetweet_text,
                               pre_text,
                               text_new,
                               full_name))
# Cleanup
rm(FetchedTweets,
   taddresses,
   thisfetch,
   tidlist,
   colcount,
   folder,
   ID,
   keepvars,
   my_query)
# Characterizing retweets (computed for manual inspection, then discarded)
Congress %>%
  group_by(party, sourcetweet_type) %>%
  summarise(n = n())
Counts <- Congress %>%
  group_by(party, user_name, sourcetweet_type) %>%
  summarise(n = n())
Retweets <- Congress[Congress$sourcetweet_type == "retweeted", ]
Retweet_sample <- slice_sample(Retweets, n = 50)  # sample_n() is superseded
rm(Counts, Retweet_sample, Retweets)
#Saving Congress dataframe as a .csv
write_csv(Congress,"Congress.csv")
###########################################
# Scrape media tweets
###########################################
taddresses <- read.csv("MediaTwitterSources.csv")
tidlist <- as.list(taddresses$ID)
# Zero-row accumulator, as in the Congress section above
FetchedTweets <- data.frame(tweet_id = character(),
                            user_username = character(),
                            text = character(),
                            author_id = character(),
                            created_at = character(),
                            conversation_id = character(),
                            user_name = character(),
                            retweet_count = character(),
                            like_count = character(),
                            quote_count = character(),
                            sourcetweet_type = character(),
                            sourcetweet_id = character(),
                            sourcetweet_text = character(),
                            sourcetweet_author_id = character(),
                            ID = character())
keepvars <- c("tweet_id",
"user_username",
"text",
"author_id",
"created_at",
"conversation_id",
"user_name",
"retweet_count",
"like_count",
"quote_count",
"sourcetweet_type",
"sourcetweet_id",
"sourcetweet_text",
"sourcetweet_author_id")
folder <- "mydata"
if (dir.exists(folder)) {
  cat("The folder already exists\n")
} else {
  dir.create(folder)
}
for (ID in tidlist) {
  unlink("mydata", recursive = TRUE)  # clear pages from the previous account
  my_query <- build_query(users = ID)
  Sys.sleep(3)  # brief pause between accounts to ease rate-limit pressure
  thisfetch <- get_all_tweets(
    query = my_query,
    start_tweets = "2021-01-02T00:00:00Z",
    end_tweets = "2022-10-15T00:00:00Z",
    bearer_token = get_bearer(),
    data_path = "mydata",
    n = Inf)
  colcount <- ncol(thisfetch)
  print(colcount)
  if (colcount == 0) {
    next  # no tweets returned for this account
  } else {
    thisfetch <- bind_tweets(data_path = "mydata/",
                             output_format = "tidy")
    thisfetch <- thisfetch[keepvars]
    thisfetch$ID <- ID
    FetchedTweets <- rbind(FetchedTweets, thisfetch)
  }
}
# As above: created_at is UTC, so parse as UTC and convert to Eastern
FetchedTweets$date <- with_tz(ymd_hms(FetchedTweets$created_at), tzone = "US/Eastern")
FetchedTweets$sourcetweet_type <- FetchedTweets$sourcetweet_type %>%
  replace_na("original")
# Rebuild full retweet text, as in the Congress section
FetchedTweets <- FetchedTweets %>%
  mutate(
    pre_text = if_else(sourcetweet_type == "retweeted",
                       sub(":.*", "", text),
                       ""),
    text_new = if_else(sourcetweet_type == "retweeted",
                       paste(pre_text, sourcetweet_text, sep = ": "),
                       text)
  )
Media <- merge(FetchedTweets, taddresses, by = "ID")
Media$URL <- paste0("https://twitter.com/user/status/", Media$tweet_id)
Media$text <- Media$text_new
Media <- subset(Media,
                select = -c(sourcetweet_text,
                            pre_text,
                            text_new,
                            full_name))
write_csv(Media,"MediaTweets.csv")
# Cleanup
rm(FetchedTweets,
   taddresses,
   thisfetch,
   tidlist,
   colcount,
   folder,
   ID,
   keepvars,
   my_query)
# Characterizing retweets (computed for manual inspection, then discarded)
Media %>%
  group_by(party, sourcetweet_type) %>%
  summarise(n = n())
Counts <- Media %>%
  group_by(party, user_name, sourcetweet_type) %>%
  summarise(n = n())
Retweets <- Media[Media$sourcetweet_type == "retweeted", ]
Retweet_sample <- slice_sample(Retweets, n = 50)
rm(Counts, Retweet_sample, Retweets)
# Saving the Media data frame as a .csv
write_csv(Media,"Media.csv")
# Getting data on retweeted users.
# A single bulk call frequently hits the rate limit; a throttled
# alternative is sketched below the call.
userlist <- unique(na.omit(Congress$sourcetweet_author_id))
userinfo <- get_user_profile(userlist, bearer_token = get_bearer())
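# If the bulk call above hits the rate limit, one option is to query in
# batches and pause between them (a sketch, not run here; the batch size of
# 100 ids and the 5-second pause are assumptions to tune):
# chunks <- split(userlist, ceiling(seq_along(userlist) / 100))
# userinfo <- purrr::map_dfr(chunks, function(ids) {
#   Sys.sleep(5)  # pause between batches
#   get_user_profile(ids, bearer_token = get_bearer())
# })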
vars2keep <- c("id","username", "name", "description")
retweetinfo <- userinfo[vars2keep]
retweetinfo <- retweetinfo %>%
rename(sourcetweet_author_id = id,
rt_username = username,
rt_name = name,
rt_description = description)
retweetinfo <- retweetinfo %>%
distinct(sourcetweet_author_id,
.keep_all = TRUE)
write_csv(retweetinfo,"Info_on_retweeted_users.csv")
if (!require("data.table")) install.packages("data.table")
library(data.table)
mydt <- setDT(Congress)
mylookup_dt <- setDT(retweetinfo)
joined_dt1 <- merge(mydt, mylookup_dt,
                    by = "sourcetweet_author_id",
                    all.x = TRUE, all.y = FALSE)
Congress <- as.data.frame(joined_dt1)
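# The equivalent left join with dplyr (row and column order may differ):
# Congress <- dplyr::left_join(Congress, retweetinfo, by = "sourcetweet_author_id")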
rm(userinfo,
   userlist,
   vars2keep,
   mydt,
   mylookup_dt,
   joined_dt1,
   retweetinfo)
retweeted_by_username <- Congress %>%
  group_by(rt_username, rt_name, rt_description) %>%
  summarise(n = n())
retweeted_by_party <- Congress %>%
  group_by(party, rt_username, rt_name, rt_description) %>%
  summarise(n = n())
write_csv(retweeted_by_party,"retweeted_by_party.csv")
rm(retweeted_by_party,
   retweeted_by_username)
###########################################
# Scrape partisan-preferred media tweets
###########################################
taddresses <- read.csv("PartisanPreferredMedia.csv")
tidlist <- as.list(taddresses$ID)
# Zero-row accumulator, as in the sections above
FetchedTweets <- data.frame(tweet_id = character(),
                            user_username = character(),
                            text = character(),
                            author_id = character(),
                            created_at = character(),
                            conversation_id = character(),
                            user_name = character(),
                            retweet_count = character(),
                            like_count = character(),
                            quote_count = character(),
                            sourcetweet_type = character(),
                            sourcetweet_id = character(),
                            sourcetweet_text = character(),
                            sourcetweet_author_id = character(),
                            ID = character())
keepvars <- c("tweet_id",
"user_username",
"text",
"author_id",
"created_at",
"conversation_id",
"user_name",
"retweet_count",
"like_count",
"quote_count",
"sourcetweet_type",
"sourcetweet_id",
"sourcetweet_text",
"sourcetweet_author_id")
folder <- "mydata"
if (dir.exists(folder)) {
  cat("The folder already exists\n")
} else {
  dir.create(folder)
}
for (ID in tidlist) {
  unlink("mydata", recursive = TRUE)  # clear pages from the previous account
  my_query <- build_query(users = ID)
  Sys.sleep(3)  # brief pause between accounts to ease rate-limit pressure
  thisfetch <- get_all_tweets(
    query = my_query,
    start_tweets = "2021-01-02T00:00:00Z",
    end_tweets = "2022-10-15T00:00:00Z",
    bearer_token = get_bearer(),
    data_path = "mydata",
    n = Inf)
  colcount <- ncol(thisfetch)
  print(colcount)
  if (colcount == 0) {
    next  # no tweets returned for this account
  } else {
    thisfetch <- bind_tweets(data_path = "mydata/",
                             output_format = "tidy")
    thisfetch <- thisfetch[keepvars]
    thisfetch$ID <- ID
    FetchedTweets <- rbind(FetchedTweets, thisfetch)
  }
}
# As above: created_at is UTC, so parse as UTC and convert to Eastern
FetchedTweets$date <- with_tz(ymd_hms(FetchedTweets$created_at), tzone = "US/Eastern")
FetchedTweets$sourcetweet_type <- FetchedTweets$sourcetweet_type %>%
  replace_na("original")
# Rebuild full retweet text, as in the Congress section
FetchedTweets <- FetchedTweets %>%
  mutate(
    pre_text = if_else(sourcetweet_type == "retweeted",
                       sub(":.*", "", text),
                       ""),
    text_new = if_else(sourcetweet_type == "retweeted",
                       paste(pre_text, sourcetweet_text, sep = ": "),
                       text)
  )
PPMedia <- merge(FetchedTweets, taddresses, by = "ID")
PPMedia$URL <- paste0("https://twitter.com/user/status/", PPMedia$tweet_id)
PPMedia$text <- PPMedia$text_new
PPMedia <- subset(PPMedia,
                  select = -c(sourcetweet_text,
                              pre_text,
                              text_new,
                              full_name))
write_csv(PPMedia,"PartisanPreferredMediaTweets.csv")
# Cleanup
rm(FetchedTweets,
   taddresses,
   thisfetch,
   tidlist,
   colcount,
   folder,
   ID,
   keepvars,
   my_query)
##################################################
# TF-IDF analysis
# Packages and libraries
if (!require("ggplot2")) install.packages("ggplot2")
if (!require("dplyr")) install.packages("dplyr")
if (!require("readr")) install.packages("readr")
if (!require("tidytext")) install.packages("tidytext")
library(ggplot2)
library(dplyr)
library(readr)
library(tidytext)
library(stringr) # Part of the tidyverse package
# Tokenizing by word and counting word frequencies by party
tidy_text <- Congress %>%
  unnest_tokens(word, text) %>%
  count(party, word, sort = TRUE)
# Total words by party
total_words <- tidy_text %>%
  group_by(party) %>%
  summarize(total = sum(n))
# Adding the total-words column to the token counts
tidy_text <- left_join(tidy_text, total_words, by = "party")
# Deleting data frames that are no longer needed
rm("total_words")
# TF-IDF
tidy_text_tf_idf <- tidy_text %>%
  bind_tf_idf(word, party, n) %>%
  arrange(desc(tf_idf))
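# For reference, bind_tf_idf() treats each party as a "document" and computes
#   tf(w, p) = n(w, p) / total words tweeted by p
#   idf(w)   = ln(number of parties / number of parties whose tweets use w)
#   tf_idf   = tf * idf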
# Graphing high tf-idf words by party
library(forcats)
tidy_text_tf_idf %>%
  group_by(party) %>%
  slice_max(tf_idf, n = 12) %>%
  ungroup() %>%
  ggplot(aes(tf_idf, fct_reorder(word, tf_idf), fill = party)) +
  geom_col(show.legend = FALSE) +
  facet_wrap(~party, ncol = 2, scales = "free") +
  labs(x = "tf-idf", y = NULL)
rm("tidy_text")
##################################################
# Combining the Congress and PPMedia data frames. Congress gained rt_*
# columns from the retweeted-user join above, so the media frames need the
# same columns (all NA) before rbind() can match them up
PPMedia['rt_username'] <- NA
PPMedia['rt_name'] <- NA
PPMedia['rt_description'] <- NA
CongressAndPPM <- rbind(Congress, PPMedia)
# Combining the Congress and Media data frames
Media['rt_username'] <- NA
Media['rt_name'] <- NA
Media['rt_description'] <- NA
CongressAndMedia <- rbind(Congress, Media)
### Using the grepl function to flag topic mentions ###
# Each pattern must stay on one line: a line break inside the string would
# embed a literal newline in one of the regex alternatives
imgterms <- "border|bidenbordercrisis|immigra|illegal aliens|alien"
CongressAndPPM$immigration <- ifelse(grepl(imgterms,
                                           CongressAndPPM$text,
                                           ignore.case = TRUE), 1, 0)
rm("imgterms")
abterms <- "abort|reproductive rights|unborn|right to life
|right to choose|roe v"
CongressAndPPM$abortion <- ifelse(grepl(abterms,
CongressAndPPM$text,
ignore.case = TRUE),1,0)
rm("abterms")
econterms <- "inflation|rising prices|economic|economy"
CongressAndPPM$economy <- ifelse(grepl(econterms,
CongressAndPPM$text,
ignore.case = TRUE),1,0)
rm("econterms")
Trumpterms <- "Trump|Donald Trump|MAGA"
CongressAndPPM$Trump <- ifelse(grepl(Trumpterms,
CongressAndPPM$text,
ignore.case = FALSE),1,0)
rm("Trumpterms")
Bidenterms <- "Biden|Joe Biden|Uncle Joe"
CongressAndPPM$Biden <- ifelse(grepl(Bidenterms,
CongressAndPPM$text,
ignore.case = TRUE),1,0)
rm("Bidenterms")
### Rounding dates off to the week ###
CongressAndPPM$Week <- round_date(
  CongressAndPPM$date,
  unit = "week",
  week_start = getOption("lubridate.week.start", 1)  # weeks start on Monday
)
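# Note: round_date() snaps each timestamp to the *nearest* Monday, so
# late-week tweets land in the following week. To bin each tweet with the
# Monday that precedes it, use floor_date() instead:
# CongressAndPPM$Week <- floor_date(CongressAndPPM$date, unit = "week", week_start = 1)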
### Getting weekly tweet counts by party ###
TweetsByPartyAndWeek <- CongressAndPPM %>%
  group_by(Week, party) %>%
  summarize(TweetCount = n()) %>%
  pivot_wider(
    names_from = "party",
    values_from = c("TweetCount"))
# Independents are not used in the percentage calculations below
TweetsByPartyAndWeek <- subset(TweetsByPartyAndWeek,
                               select = -c(Independent))
### Pivoting and collapsing the data ###
Data_Wide <- CongressAndPPM %>%
  pivot_wider(
    names_from = "party",
    values_from = c("immigration", "abortion", "economy", "Trump", "Biden"))
# Weekly topic-mention counts for each group: Republican and Democratic
# members plus the DPO/RPO partisan-preferred outlet categories
Data_Wide_Coll <- Data_Wide %>%
  group_by(Week) %>%
  summarize(Immigration_R = sum(immigration_Republican, na.rm = TRUE),
            Immigration_D = sum(immigration_Democrat, na.rm = TRUE),
            Immigration_DPO = sum(immigration_DPO, na.rm = TRUE),
            Immigration_RPO = sum(immigration_RPO, na.rm = TRUE),
            Abortion_R = sum(abortion_Republican, na.rm = TRUE),
            Abortion_D = sum(abortion_Democrat, na.rm = TRUE),
            Abortion_DPO = sum(abortion_DPO, na.rm = TRUE),
            Abortion_RPO = sum(abortion_RPO, na.rm = TRUE),
            Economy_R = sum(economy_Republican, na.rm = TRUE),
            Economy_D = sum(economy_Democrat, na.rm = TRUE),
            Economy_DPO = sum(economy_DPO, na.rm = TRUE),
            Economy_RPO = sum(economy_RPO, na.rm = TRUE),
            DTrump_R = sum(Trump_Republican, na.rm = TRUE),
            DTrump_D = sum(Trump_Democrat, na.rm = TRUE),
            DTrump_DPO = sum(Trump_DPO, na.rm = TRUE),
            DTrump_RPO = sum(Trump_RPO, na.rm = TRUE),
            JBiden_R = sum(Biden_Republican, na.rm = TRUE),
            JBiden_D = sum(Biden_Democrat, na.rm = TRUE),
            JBiden_DPO = sum(Biden_DPO, na.rm = TRUE),
            JBiden_RPO = sum(Biden_RPO, na.rm = TRUE),
            TweetCount = n())
# Converting counts to percentages of each group's weekly tweet volume
FinalData <- merge(Data_Wide_Coll, TweetsByPartyAndWeek, by = c("Week")) %>%
  mutate(Immigration_R_Pct = (Immigration_R / Republican) * 100,
         Immigration_D_Pct = (Immigration_D / Democrat) * 100,
         Immigration_DPO_Pct = (Immigration_DPO / DPO) * 100,
         Immigration_RPO_Pct = (Immigration_RPO / RPO) * 100,
         Abortion_R_Pct = (Abortion_R / Republican) * 100,
         Abortion_D_Pct = (Abortion_D / Democrat) * 100,
         Abortion_DPO_Pct = (Abortion_DPO / DPO) * 100,
         Abortion_RPO_Pct = (Abortion_RPO / RPO) * 100,
         Economy_R_Pct = (Economy_R / Republican) * 100,
         Economy_D_Pct = (Economy_D / Democrat) * 100,
         Economy_DPO_Pct = (Economy_DPO / DPO) * 100,
         Economy_RPO_Pct = (Economy_RPO / RPO) * 100,
         DTrump_R_Pct = (DTrump_R / Republican) * 100,
         DTrump_D_Pct = (DTrump_D / Democrat) * 100,
         DTrump_DPO_Pct = (DTrump_DPO / DPO) * 100,
         DTrump_RPO_Pct = (DTrump_RPO / RPO) * 100,
         JBiden_R_Pct = (JBiden_R / Republican) * 100,
         JBiden_D_Pct = (JBiden_D / Democrat) * 100,
         JBiden_DPO_Pct = (JBiden_DPO / DPO) * 100,
         JBiden_RPO_Pct = (JBiden_RPO / RPO) * 100)
write_csv(FinalData,"FinalData_PPM.csv")
# Cleanup
rm("Data_Wide",
"Data_Wide_Coll",
"TweetsByPartyAndWeek")