GitHub repository
Source: Jose Camacho-Collados, Kiamehr Rezaee, Talayeh Riahi, Asahi Ushio, et al. “TweetNLP: Cutting-Edge Natural Language Processing for Social Media.” Proceedings of the 2022 Conference on Empirical Methods in Natural Language Processing (EMNLP): System Demonstrations.
# Load packages for data wrangling, HTTP requests, JSON parsing,
# lexicon-based sentiment analysis, and column-name cleaning
library(tidyverse)
library(httr)
library(jsonlite)
library(sentimentr)
library(janitor)
The code below connects to the Twitter API and returns posts from the last seven days that match a search query. The query below searches for recent tweets that contain the word “tariffs”.
# Retrieve bearer token from environment
bearer_token <- Sys.getenv("BEARER_TOKEN")
# Build the Authorization header (sprintf fills the token into the string, similar to paste0)
headers <- c(`Authorization` = sprintf('Bearer %s', bearer_token))
# Define parameters: up to 100 results, querying for the string "tariffs",
# and sorting results by relevance
params <- list(
  max_results = 100,
  query = "tariffs",
  sort_order = "relevancy"
)
# defining url_handle for the recent tweets url
url_handle <- "https://api.twitter.com/2/tweets/search/recent"
# Send the GET request and save the response object
res <- httr::GET(
  url = url_handle,
  httr::add_headers(.headers = headers),
  query = params
)
# Check response type
http_type(res)
# Check for errors
http_error(res)
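Since http_error() only returns TRUE or FALSE, it can help to stop early and surface the HTTP status when the request fails. A minimal sketch using httr::status_code():
# Abort with the HTTP status code so failures are easy to diagnose
if (http_error(res)) {
  stop(sprintf("Twitter API request failed [%s]", status_code(res)))
}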
# parsing the JSON response body
recent_search_body <- content(
  res,
  as = 'parsed',
  type = 'application/json',
  simplifyDataFrame = TRUE
)
# df of tweet id, edit history of tweets, and text
recent_text_body <- recent_search_body$data
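Before cleaning, it helps to confirm the shape of the parsed payload; a minimal check (the columns shown reflect the API's default fields):
# Peek at the columns (id, edit_history_tweet_ids, text) and a few raw tweets
glimpse(recent_text_body)
head(recent_text_body$text, 3)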
The code below connects to the Roberta Base Sentiment model (cardiffnlp/twitter-roberta-base-sentiment, documented on its Hugging Face model card). By defining the function get_sentiment, the sentiment of an individual tweet can be detected.
# Retrieve bearer token from environment for HF Model
hf_bearer_token <- Sys.getenv("HF_TOKEN")
# Define the get_sentiment function
get_sentiment <- function(tweet) {
  # Request sentiment predictions from the Roberta Base Sentiment model
  sent_res <- POST(
    url = "https://api-inference.huggingface.co/models/cardiffnlp/twitter-roberta-base-sentiment",
    add_headers("Authorization" = paste("Bearer", hf_bearer_token)),
    content_type("application/json"),
    body = toJSON(list(inputs = tweet), auto_unbox = TRUE)
  )
  # Parse the raw response and return the label/score pairs
  sent_content <- fromJSON(rawToChar(sent_res$content))
  return(sent_content)
}
The code below tests the sentiment for one tweet:
# example tweet
tweet <- "You know who doesn't have to pay tariffs"
# checking that the function above works
sentiment_result <- get_sentiment(tweet)
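For reference, the parsed result holds one score per label, where LABEL_0, LABEL_1, and LABEL_2 correspond to negative, neutral, and positive (this mapping comes from the model card). A quick inspection, with illustrative scores:
# Illustrative shape of the parsed result (scores are made up, not real output):
#     label score
# 1 LABEL_0  0.72   # negative
# 2 LABEL_1  0.23   # neutral
# 3 LABEL_2  0.05   # positive
str(sentiment_result)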
To analyze the tweets properly, the text has to be cleaned and normalized so that tweets can be compared on equal footing. To clean the tweets, the function clean_text is defined and applied to the ‘recent_text_body’ dataframe, creating a new column titled ‘clean_text’.
clean_text <- function(rawtext) {
  # convert to lower case
  rawtext <- tolower(rawtext)
  # remove the retweet marker "rt" as a standalone word
  # (a bare gsub("rt", ...) would also delete "rt" inside other words)
  rawtext <- gsub("\\brt\\b", "", rawtext)
  # remove @mentions
  rawtext <- gsub("@\\w+", "", rawtext)
  # remove links (before stripping punctuation, so URLs still match intact)
  rawtext <- gsub("https?://\\S+", "", rawtext)
  rawtext <- gsub("http\\w+", "", rawtext)
  # remove punctuation, numbers, and control characters
  rawtext <- gsub("[[:punct:]]", "", rawtext)
  rawtext <- gsub("[[:digit:]]", "", rawtext)
  rawtext <- gsub("[[:cntrl:]]", "", rawtext)
  # replace any remaining non-graphical characters with spaces
  rawtext <- str_replace_all(rawtext, "[^[:graph:]]", " ")
  # collapse runs of spaces/tabs and trim leading/trailing whitespace
  rawtext <- gsub("[ \t]{2,}", " ", rawtext)
  rawtext <- trimws(rawtext)
  return(rawtext)
}
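As a quick sanity check, clean_text can be run on a single made-up raw tweet (the example text below is hypothetical):
# Hypothetical raw tweet to check the cleaner
raw_example <- "RT @user123: Tariffs up 25%!! https://t.co/abc"
clean_text(raw_example)
# expected output (roughly): "tariffs up"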
# apply clean_text to each tweet and store the result in a new column
cleaned_recent_text <- recent_text_body |>
  mutate(clean_text = clean_text(text))
To get the sentiments for all tweets in the dataframe ‘cleaned_recent_text’, another function titled get_sentiments_batch is defined, which loops over each tweet and records its sentiment. A delay is incorporated into the function to ensure the Roberta Base Sentiment API is not overwhelmed.
# Create function with batching to apply get_sentiment() to all tweets
get_sentiments_batch <- function(tweets, ids, delay = 1) {
  # create empty result list
  results <- vector("list", length(tweets))
  # iterate through each tweet
  for (i in seq_along(tweets)) {
    tweet <- tweets[i]
    id <- ids[i]
    # get the sentiment of the tweet; on failure, fall back to NA scores
    # in the same label/score layout the model returns, so rbind still works
    sentiment <- tryCatch({
      get_sentiment(tweet)
    }, error = function(e) {
      data.frame(label = c("LABEL_0", "LABEL_1", "LABEL_2"), score = NA)
    })
    # record the sentiment results in the results list
    results[[i]] <- c(id = id, sentiment)
    # pause between requests so the API is not overwhelmed
    Sys.sleep(delay)
  }
  # bind the rows and return the results as a data frame
  return(do.call(rbind, lapply(results, as.data.frame)))
}
# incorporating a 1-second delay; LABEL_0 = negative, LABEL_1 = neutral, LABEL_2 = positive
sentiment_results <- get_sentiments_batch(
  tweets = cleaned_recent_text$clean_text,
  ids = cleaned_recent_text$id,
  delay = 1
)
Then, the dataframe is pivoted wider so that each observation is a tweet with the corresponding sentiment scores.
# pivot wider so each observation is a tweet with its corresponding sentiment scores
sentiment_results_wider <- sentiment_results |>
  pivot_wider(
    names_from = label,
    values_from = score
  ) |>
  left_join(recent_text_body |> select(id, text), by = "id") |>
  rename(
    "negative" = "LABEL_0",
    "neutral" = "LABEL_1",
    "positive" = "LABEL_2"
  )
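If the API returns softmax probabilities, as this model is expected to, the three scores for each tweet should sum to roughly 1. A quick check of that assumption:
# Verify negative + neutral + positive is close to 1 per tweet
# (assumes the scores are softmax probabilities)
sentiment_results_wider |>
  mutate(score_total = negative + neutral + positive) |>
  summarize(min_total = min(score_total, na.rm = TRUE),
            max_total = max(score_total, na.rm = TRUE))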
To better understand the sentiment scores as a whole, the summarize function can be used to compute the mean, median, standard deviation, minimum, maximum, and interquartile range of the negative, neutral, and positive scores.
# mean, median, sd, min, max, iqr for each sentiment type
sentiment_roberta_base_summary <- sentiment_results_wider |>
  summarize(
    mean_negative = mean(negative, na.rm = TRUE),
    median_negative = median(negative, na.rm = TRUE),
    sd_negative = sd(negative, na.rm = TRUE),
    min_negative = min(negative, na.rm = TRUE),
    max_negative = max(negative, na.rm = TRUE),
    iqr_negative = IQR(negative, na.rm = TRUE),
    mean_neutral = mean(neutral, na.rm = TRUE),
    median_neutral = median(neutral, na.rm = TRUE),
    sd_neutral = sd(neutral, na.rm = TRUE),
    min_neutral = min(neutral, na.rm = TRUE),
    max_neutral = max(neutral, na.rm = TRUE),
    iqr_neutral = IQR(neutral, na.rm = TRUE),
    mean_positive = mean(positive, na.rm = TRUE),
    median_positive = median(positive, na.rm = TRUE),
    sd_positive = sd(positive, na.rm = TRUE),
    min_positive = min(positive, na.rm = TRUE),
    max_positive = max(positive, na.rm = TRUE),
    iqr_positive = IQR(positive, na.rm = TRUE)
  )
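An equivalent, more compact version of this summary can be written with dplyr::across(); a behavior-preserving sketch that produces the same columns:
# Compute the same six statistics per sentiment column with across()
sentiment_results_wider |>
  summarize(across(
    c(negative, neutral, positive),
    list(mean = ~ mean(.x, na.rm = TRUE),
         median = ~ median(.x, na.rm = TRUE),
         sd = ~ sd(.x, na.rm = TRUE),
         min = ~ min(.x, na.rm = TRUE),
         max = ~ max(.x, na.rm = TRUE),
         iqr = ~ IQR(.x, na.rm = TRUE)),
    .names = "{.fn}_{.col}"
  ))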
The data is then pivoted longer so that each row records a sentiment type and the corresponding proportion.
# pivoting longer to analyze all sentiment types
sentiment_long <- sentiment_results_wider |>
  pivot_longer(cols = c(negative, neutral, positive),
               names_to = "sentiment_type",
               values_to = "proportion") |>
  mutate(sentiment_type = case_when(
    sentiment_type == "negative" ~ "Negative",
    sentiment_type == "neutral" ~ "Neutral",
    sentiment_type == "positive" ~ "Positive"
  )) |>
  clean_names("title")
To better understand the distribution of the sentiment types for the tweets, a histogram can be used to visualize the distribution by sentiment type.
ggplot(sentiment_long, aes(x = Proportion, fill = `Sentiment Type`)) +
  geom_histogram(binwidth = 0.05, color = "black", alpha = 0.7) +
  facet_wrap(~ `Sentiment Type`, ncol = 1, scales = "free_y") +
  theme_minimal() +
  labs(title = "Histogram of Sentiment Proportions",
       x = "Proportion", y = "Tweet Count")
A density plot can also be used to better understand the distribution of the sentiment detected for the sample.
ggplot(sentiment_long, aes(x = Proportion, color = `Sentiment Type`, fill = `Sentiment Type`)) +
  geom_density(alpha = 0.3) +
  theme_minimal() +
  labs(title = "Density Plot of Sentiment Proportions",
       x = "Proportion", y = "Density")
A boxplot may also be used to better understand the distribution; according to the plot below, neutral sentiment is the most frequently detected in the tweets.
ggplot(sentiment_long, aes(x = `Sentiment Type`, y = Proportion, fill = `Sentiment Type`)) +
  geom_boxplot(alpha = 0.7) +
  theme_minimal() +
  labs(title = "Boxplot of Sentiment Proportions",
       x = "Sentiment Type", y = "Proportion")
sentimentr package
Another way to measure sentiment is through the sentimentr package, using its built-in sentiment_by function. To compare the results from the sentimentr package and the Roberta Base Sentiment model, a left_join() is used to join the dataframes ‘sentimentr_results’ and ‘sentiment_results_wider’ by the tweet id. Then, a new column titled ‘roberta_net_sentiment’ is created by subtracting the negative sentiment value from the positive sentiment value.
# sentiment analysis with the sentimentr package
sentimentr_results <- sentiment_by(cleaned_recent_text$clean_text,
                                   by = cleaned_recent_text$id)
sentiment_compare <- sentimentr_results |>
  left_join(sentiment_results_wider, by = "id") |>
  mutate(roberta_net_sentiment = positive - negative) |>
  select(-sd)
To understand the relationship between the two sentiment measures, a scatterplot can be used to visualize the data:
ggplot(sentiment_compare, aes(x = roberta_net_sentiment, y = ave_sentiment)) +
  geom_point(alpha = 0.7) +
  geom_smooth(method = "lm", color = "blue", se = FALSE) +
  scale_y_continuous(limits = c(-1, 1)) +
  theme_minimal() +
  labs(title = "Comparison of Roberta and sentimentr Scores",
       x = "Roberta Base Sentiment Result",
       y = "sentimentr Sentiment Result")
To quantify the relationship, the slope of the best-fit line is calculated using the lm() function. The slope of a regression line indicates how much the response variable changes, on average, for every one-unit change in the predictor variable. It also shows the direction of the relationship: a positive slope means that as the predictor variable increases, the response variable tends to increase as well.
lm(data = sentiment_compare,
   formula = ave_sentiment ~ roberta_net_sentiment)
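To pull out just the slope from the fitted model, coef() can be used; a minimal sketch (the value depends on the tweets sampled):
# The named coefficient is the slope on roberta_net_sentiment
fit <- lm(ave_sentiment ~ roberta_net_sentiment, data = sentiment_compare)
coef(fit)["roberta_net_sentiment"]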
Another way to measure the relationship between the two variables is the correlation coefficient.
cor(sentiment_compare$roberta_net_sentiment,
    sentiment_compare$ave_sentiment,
    use = "complete.obs")
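For a fuller picture, cor.test() reports the same correlation along with a confidence interval and p-value; a quick sketch:
# Correlation with a 95% confidence interval and significance test
cor.test(sentiment_compare$roberta_net_sentiment,
         sentiment_compare$ave_sentiment)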
The results from the two methods are similar but not identical, which means further analysis and research should be conducted to better understand how the models differ and which result may be more accurate.