Introduction

The Twitter dataset "#ChatGPT 1000 Daily Tweets", which contains tweets related to “ChatGPT,” “GPT3,” or “GPT4,” is interesting and relevant for analysis due to its dynamic nature and potential insights into user engagement with these topics. In this report, we explore the dataset by posing research questions, wrangling the data, and presenting the results.

Why is the dataset interesting/relevant?

The dataset provides a snapshot of Twitter activity around the specified keywords, offering opportunities for analyzing trends, user engagement, and sentiment. Understanding the dynamics of discussions related to advanced language models like GPT3 and GPT4 on Twitter can be valuable for various purposes, including community sentiment analysis, tracking popularity, and exploring user interactions.

Research Questions

What are the most common languages in tweets related to ChatGPT, GPT3, or GPT4?
How does user engagement (likes, retweets, impressions) vary across different languages?

Audience

The target audience for this study is language researchers, students, and social workers.

Data Wrangling

The data wrangling process involves handling missing values, converting language codes into full language names, and preparing the data for analysis.

Analysis

The dataset encompasses a diverse range of 53 languages. The distribution of tweet counts across these languages serves as an indicative measure of awareness in various parts of the world regarding these widely discussed language models. The exploration of tweet activity across different languages provides valuable insights into the global awareness and engagement with these popular language models.

# Libraries
library(tidyverse)
library(dplyr)

# Load the raw tweet export.
tweets <- read.csv("dataset/chatgpt_daily_tweets.csv")

# Prepare a dataframe for labels: one row per language code with its tweet
# count in `n`.
# "und" (undefined) and "zxx" (no linguistic content) are not languages, and
# rows with a missing code are dropped as well. `is.na()` is the element-wise
# missing-value test; `is.null()` on a column is always FALSE, so the previous
# `!is.null(lang)` condition never filtered anything.
# count() on a grouped data frame keeps the result grouped by `lang`.
label_data <- tweets %>%
  filter(!is.na(lang), !lang %in% c("und", "zxx")) %>%
  group_by(lang) %>%
  count()

# Bar height: counts above 100 are divided by 10, smaller counts are kept
# as they are.
# NOTE(review): this scaling is not monotonic (101 tweets -> 10.1, which is
# shorter than a language with 100 tweets) -- confirm this is intended; a
# log scale would preserve the ordering.
label_data$n_normalized <- ifelse(
  label_data$n > 100,
  label_data$n / 10,
  label_data$n
)

# Replace Twitter `lang` codes with readable labels via a lookup table
# (names are the codes found in the data, values are the display labels).
lang_labels <- c(
  # NOTE(review): the four numeric values below are not language codes; they
  # look like values from malformed CSV rows. The labels are kept from the
  # original mapping -- confirm where they came from or drop these rows.
  "147.0" = "Bosnian",
  "1480.0" = "Croatian",
  "14918.0" = "Latvian",
  "6715.0" = "Azerbaijani",
  "ar" = "Arabic",
  "bg" = "Bulgarian",
  "ca" = "Catalan",
  "ckb" = "Sorani Kurdish",
  "cs" = "Czech",
  "cy" = "Welsh",
  "da" = "Danish",
  "de" = "German",
  "el" = "Greek",
  "en" = "English",
  "es" = "Spanish",
  "et" = "Estonian",
  "eu" = "Basque",
  "fa" = "Persian",
  "fi" = "Finnish",
  "fr" = "French",
  "gu" = "Gujarati",
  "hi" = "Hindi",
  "ht" = "Haitian Creole",
  "hu" = "Hungarian",
  "hy" = "Armenian",
  "in" = "Indonesian",
  "it" = "Italian",
  "iw" = "Hebrew",
  "ja" = "Japanese",
  "kn" = "Kannada",
  "ko" = "Korean",
  "lt" = "Lithuanian",
  "lv" = "Latvian",
  "ml" = "Malayalam",
  "mr" = "Marathi",
  "ne" = "Nepali",
  "nl" = "Dutch",
  "no" = "Norwegian",
  "pl" = "Polish",
  "pt" = "Portuguese",
  # "qam", "qht" and "qme" are Twitter pseudo-codes for tweets without
  # classifiable text, not ISO language codes (they were previously
  # mislabelled as Amharic, Haitian Creole and Khmer).
  "qam" = "Mentions only",
  "qht" = "Hashtags only",
  "qme" = "Media links only",
  "ro" = "Romanian",
  "ru" = "Russian",
  "sl" = "Slovenian",
  "sr" = "Serbian",
  "sv" = "Swedish",
  "ta" = "Tamil",
  "th" = "Thai",
  "tl" = "Tagalog",
  "tr" = "Turkish",
  "uk" = "Ukrainian",
  "ur" = "Urdu",
  "vi" = "Vietnamese",
  "zh" = "Chinese"
)

# as.character() guards against `lang` being a factor, which would index the
# table by integer level instead of by code.
lang_codes <- as.character(label_data$lang)
lang_names <- unname(lang_labels[lang_codes])

# Codes missing from the table keep their raw code instead of turning into an
# NA label.
label_data$lang <- ifelse(is.na(lang_names), lang_codes, lang_names)

# Order the bars by normalized count, smallest first (arrange() sorts in
# ascending order). The grouping left over from the counting step is dropped
# first; arrange() ignores groups by default, so the row order is unaffected.
label_data <- label_data %>%
  ungroup() %>%
  arrange(n_normalized)

# Number of empty placeholder rows appended after the last bar.
empty_bar <- 4

# Append `empty_bar` all-NA rows. bind_rows() matches columns by name and
# fills every column absent from the new rows with NA. Base rbind() requires
# identical columns and previously only worked here because dplyr's method
# for grouped data frames forwards to bind_rows().
label_data <- bind_rows(
  label_data,
  tibble(lang = rep(NA_character_, empty_bar))
)

# Calculate the ANGLE of the labels: each bar gets a position id, and the
# label sits at the centre of its slice (hence the 0.5 offset).
number_of_bar <- nrow(label_data)
label_data$id <- seq_len(number_of_bar)
angle <- 90 - 360 * (label_data$id - 0.5) / number_of_bar

# Calculate the alignment of labels: right-aligned (1) where the angle is
# below -90, left-aligned (0) otherwise.
label_data$hjust <- ifelse(angle < -90, 1, 0)

# Flip those same labels by 180 degrees to make them readable.
label_data$angle <- ifelse(angle < -90, angle + 180, angle)

# Circular bar plot: one bar per row of `label_data`, positioned by `id`,
# with bar height `n_normalized`.
p <- ggplot(label_data, aes(x = as.factor(id))) +

  # Bars in translucent sky blue; width = 1 makes neighbouring bars touch.
  # geom_col() is the direct form of geom_bar(stat = "identity").
  geom_col(aes(y = n_normalized), fill = alpha("skyblue", 0.7), width = 1) +

  # Limits of the plot: the negative lower limit leaves room for the inner
  # circle.
  ylim(-3000, 3000) +

  # Strip axes, grid and margins.
  theme_minimal() +
  theme(
    axis.text = element_blank(),
    axis.title = element_blank(),
    panel.grid = element_blank(),
    plot.margin = unit(rep(0, 4), "cm")
  ) +

  # Wrap the bars around a circle.
  coord_polar(start = 0) +

  # Labels just above each bar. Right-aligned labels read
  # "language - count", left-aligned ones "count - language".
  # Columns are referenced by bare name inside aes(); using `label_data$...`
  # there bypasses ggplot2's data masking and triggers a warning.
  geom_text(
    aes(
      x = as.factor(id),
      y = n_normalized + 5,
      label = ifelse(hjust == 1, paste(lang, "-", n), paste(n, "-", lang)),
      hjust = hjust
    ),
    color = "black",
    fontface = "bold",
    alpha = 0.8,
    size = 2.5,
    angle = label_data$angle,
    inherit.aes = FALSE
  ) +

  # Add title and explanation label
  ggtitle("Circular Bar Plot of Tweet Counts") +
  labs(subtitle = "The plot represents tweet counts in a specific language in language-count format.")

# Print the plot
print(p)

The popularity score was calculated as a composite metric, considering the number of likes, retweets, replies, and impressions for each tweet, with replies and retweets weighted twice as heavily as likes and impressions. This comprehensive approach provides a holistic measure of a tweet’s impact and reach within the Twitter community. The calculated popularity scores were then analyzed and plotted against the top languages in the dataset, allowing for a visual representation of the relative popularity and engagement with tweets related to ChatGPT, GPT3, and GPT4 across different language communities. This visualization is instrumental in identifying trends, assessing content effectiveness, and gaining insights into the nuanced reception and impact of discussions surrounding these language models on Twitter.

# Tweet subset with evaluation: keep the language code and the four
# engagement counters.
social_tweets <- tweets[
  c("lang", "impression_count", "reply_count", "like_count", "retweet_count")
]

# Replace Twitter `lang` codes with readable labels via a lookup table
# (names are the codes found in the data, values are the display labels).
lang_labels <- c(
  # NOTE(review): the four numeric values below are not language codes; they
  # look like values from malformed CSV rows. The labels are kept from the
  # original mapping -- confirm where they came from or drop these rows.
  "147.0" = "Bosnian",
  "1480.0" = "Croatian",
  "14918.0" = "Latvian",
  "6715.0" = "Azerbaijani",
  "ar" = "Arabic",
  "bg" = "Bulgarian",
  "ca" = "Catalan",
  "ckb" = "Sorani Kurdish",
  "cs" = "Czech",
  "cy" = "Welsh",
  "da" = "Danish",
  "de" = "German",
  "el" = "Greek",
  "en" = "English",
  "es" = "Spanish",
  "et" = "Estonian",
  "eu" = "Basque",
  "fa" = "Persian",
  "fi" = "Finnish",
  "fr" = "French",
  "gu" = "Gujarati",
  "hi" = "Hindi",
  "ht" = "Haitian Creole",
  "hu" = "Hungarian",
  "hy" = "Armenian",
  "in" = "Indonesian",
  "it" = "Italian",
  "iw" = "Hebrew",
  "ja" = "Japanese",
  "kn" = "Kannada",
  "ko" = "Korean",
  "lt" = "Lithuanian",
  "lv" = "Latvian",
  "ml" = "Malayalam",
  "mr" = "Marathi",
  "ne" = "Nepali",
  "nl" = "Dutch",
  "no" = "Norwegian",
  "pl" = "Polish",
  "pt" = "Portuguese",
  # "qam", "qht" and "qme" are Twitter pseudo-codes for tweets without
  # classifiable text, not ISO language codes (they were previously
  # mislabelled as Amharic, Haitian Creole and Khmer).
  "qam" = "Mentions only",
  "qht" = "Hashtags only",
  "qme" = "Media links only",
  "ro" = "Romanian",
  "ru" = "Russian",
  "sl" = "Slovenian",
  "sr" = "Serbian",
  "sv" = "Swedish",
  "ta" = "Tamil",
  "th" = "Thai",
  "tl" = "Tagalog",
  "tr" = "Turkish",
  "uk" = "Ukrainian",
  "ur" = "Urdu",
  "vi" = "Vietnamese",
  "zh" = "Chinese"
)

# Codes that are not in the table (for example "und" and "zxx") deliberately
# become NA here, exactly as before. as.character() guards against `lang`
# being a factor, which would index the table by integer level.
social_tweets$lang <- unname(lang_labels[as.character(social_tweets$lang)])

# Per-tweet popularity score: impressions and likes count once, replies and
# retweets count double (adjust the weights to change the emphasis).
social_tweets$popularity_score <-
  social_tweets$impression_count +
  2 * social_tweets$reply_count +
  social_tweets$like_count +
  2 * social_tweets$retweet_count

# Total score per language.
agg_data <- aggregate(popularity_score ~ lang, data = social_tweets, FUN = sum)

# Language whose total score is the largest.
best_row <- which.max(agg_data$popularity_score)
most_popular_language <- agg_data$lang[best_row]

# Uncomment to inspect the totals and the winner:
# print(agg_data)
# print(paste("Most Popular Language:", most_popular_language))

# Languages ranked from highest to lowest total score.
score_rank <- order(-agg_data$popularity_score)
sorted_agg_data <- agg_data[score_rank, ]

# Names of the ten best-ranked languages (fewer if less than ten exist).
top_10_languages <- sorted_agg_data$lang[
  seq_len(min(10, nrow(sorted_agg_data)))
]

# Individual tweets written in one of those languages.
top_10_data <- subset(social_tweets, lang %in% top_10_languages)

# Scatter plot: one point per tweet, popularity score on the y axis, with the
# top-10 languages along the x axis and as the point colour.
ggplot(
  data = top_10_data,
  mapping = aes(x = lang, y = popularity_score, color = lang)
) +
  geom_point(size = 3, alpha = 0.7) +
  ggtitle("Scatter Plot of Popularity Score for Top 10 Languages") +
  xlab("Language") +
  ylab("Popularity Score") +
  theme_minimal()

Twitter Dataset Analysis Report

Monira Heya

12/20/2023