The Twitter dataset #ChatGPT 1000 Daily Tweets, containing tweets that mention “ChatGPT,” “GPT3,” or “GPT4,” is interesting and relevant for analysis because of its dynamic nature and the insight it offers into user engagement with these topics. In this report, we explore the dataset, pose research questions, conduct data wrangling, and present the results.
The dataset provides a snapshot of Twitter activity around the specified keywords, offering opportunities for analyzing trends, user engagement, and sentiment. Understanding the dynamics of discussions related to advanced language models like GPT3 and GPT4 on Twitter can be valuable for various purposes, including community sentiment analysis, tracking popularity, and exploring user interactions.
The target audience for this study is language researchers, students, and social workers.
The data wrangling process involves handling missing values, converting language codes to full language names, and preparing the data for analysis.
# Install tidyverse, dplyr, and ggplot2 if they are not already installed
if (!requireNamespace("tidyverse", quietly = TRUE) ||
    !requireNamespace("dplyr", quietly = TRUE) ||
    !requireNamespace("ggplot2", quietly = TRUE)) {
  install.packages(c("tidyverse", "dplyr", "ggplot2"))
}
# Libraries (tidyverse already attaches dplyr and ggplot2; the explicit calls are kept for clarity)
library(tidyverse)
library(dplyr)
library(ggplot2)
# Load the dataset
tweets <- read.csv("dataset/chatgpt_daily_tweets.csv")
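# The wrangling overview above mentions handling missing values. As a minimal
# sketch (the column names are the ones this report uses later), inspect the
# NA counts and, if desired, drop incomplete rows before analysis:
colSums(is.na(tweets[, c("lang", "impression_count", "reply_count",
                         "like_count", "retweet_count")]))
# tweets <- tidyr::drop_na(tweets, impression_count, reply_count,
#                          like_count, retweet_count)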
# Replace language codes with full language names (the numeric strings are
# malformed codes that occur in the raw file). Codes with no match fall
# through to NA, including "und" (undefined), "zxx" (no linguistic content),
# and Twitter's internal placeholder codes "qam" (mention-only), "qht"
# (hashtag-only), and "qme" (media-only), which are not real languages.
tweets <- tweets %>%
  mutate(lang = case_when(
    lang == "147.0" ~ "Bosnian",
    lang == "1480.0" ~ "Croatian",
    lang == "14918.0" ~ "Latvian",
    lang == "6715.0" ~ "Azerbaijani",
    lang == "ar" ~ "Arabic",
    lang == "bg" ~ "Bulgarian",
    lang == "ca" ~ "Catalan",
    lang == "ckb" ~ "Sorani Kurdish",
    lang == "cs" ~ "Czech",
    lang == "cy" ~ "Welsh",
    lang == "da" ~ "Danish",
    lang == "de" ~ "German",
    lang == "el" ~ "Greek",
    lang == "en" ~ "English",
    lang == "es" ~ "Spanish",
    lang == "et" ~ "Estonian",
    lang == "eu" ~ "Basque",
    lang == "fa" ~ "Persian",
    lang == "fi" ~ "Finnish",
    lang == "fr" ~ "French",
    lang == "gu" ~ "Gujarati",
    lang == "hi" ~ "Hindi",
    lang == "ht" ~ "Haitian Creole",
    lang == "hu" ~ "Hungarian",
    lang == "hy" ~ "Armenian",
    lang == "in" ~ "Indonesian",
    lang == "it" ~ "Italian",
    lang == "iw" ~ "Hebrew",
    lang == "ja" ~ "Japanese",
    lang == "kn" ~ "Kannada",
    lang == "ko" ~ "Korean",
    lang == "lt" ~ "Lithuanian",
    lang == "lv" ~ "Latvian",
    lang == "ml" ~ "Malayalam",
    lang == "mr" ~ "Marathi",
    lang == "ne" ~ "Nepali",
    lang == "nl" ~ "Dutch",
    lang == "no" ~ "Norwegian",
    lang == "pl" ~ "Polish",
    lang == "pt" ~ "Portuguese",
    lang == "ro" ~ "Romanian",
    lang == "ru" ~ "Russian",
    lang == "sl" ~ "Slovenian",
    lang == "sr" ~ "Serbian",
    lang == "sv" ~ "Swedish",
    lang == "ta" ~ "Tamil",
    lang == "th" ~ "Thai",
    lang == "tl" ~ "Tagalog",
    lang == "tr" ~ "Turkish",
    lang == "uk" ~ "Ukrainian",
    lang == "ur" ~ "Urdu",
    lang == "vi" ~ "Vietnamese",
    lang == "zh" ~ "Chinese"
  ))
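# A more compact alternative (a sketch, not used below) is a named lookup
# vector, which keeps the whole mapping in one data structure:
#   lang_names <- c(en = "English", es = "Spanish", fr = "French")  # etc.
#   tweets$lang <- unname(lang_names[tweets$lang])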
# Count tweets per language, excluding NA (unmapped codes such as "und" and
# "zxx"), sort in descending order, and defensively drop zero counts
label_data <- tweets %>%
  filter(!is.na(lang)) %>%
  count(lang) %>%
  arrange(desc(n)) %>%
  filter(n > 0)
# Keep the ten most common languages (label_data is already sorted by n)
top_10_data <- head(label_data, 10)
# Lollipop plot for the top 10 languages, with counts labelled above the points
ggplot(top_10_data, aes(x = fct_reorder(lang, n), y = n)) +
  geom_segment(aes(xend = fct_reorder(lang, n), yend = 0), color = "skyblue") +
  geom_point(size = 3, color = "purple") +
  geom_text(aes(label = n), vjust = -0.5, color = "black", size = 3) +
  labs(title = "Top 10 Language Distribution",
       x = "Language",
       y = "Count") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1.1,
                                   margin = margin(b = 10)))
# Subset of the engagement-related columns
social_tweets <- subset(tweets, select = c("lang", "impression_count", "reply_count", "like_count", "retweet_count"))
# Popularity score: replies and retweets are weighted double; the weights can
# be tuned to reflect the importance of each metric
social_tweets$popularity_score <- with(social_tweets, impression_count + 2 * reply_count + like_count + 2 * retweet_count)
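# Worked example: a tweet with 1,000 impressions, 2 replies, 10 likes and
# 3 retweets scores 1000 + 2*2 + 10 + 2*3 = 1020. Raw impressions dominate
# the score unless the weights are rescaled; the doubling of replies and
# retweets here is an illustrative choice, not a standard metric.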
# Aggregate scores by language
agg_data <- aggregate(popularity_score ~ lang, data = social_tweets, sum)
# Find and report the language with the highest total popularity score
most_popular_language <- agg_data[which.max(agg_data$popularity_score), "lang"]
print(most_popular_language)
# Sort languages by total popularity score in descending order
sorted_agg_data <- agg_data[order(-agg_data$popularity_score), ]
# Select top 10 languages
top_10_languages <- head(sorted_agg_data$lang, 10)
# Filter data for the top 10 languages
top_10_data <- social_tweets[social_tweets$lang %in% top_10_languages, ]
# Scatter plot of per-tweet popularity scores for the top 10 languages
ggplot(top_10_data, aes(x = lang, y = popularity_score, color = lang)) +
  geom_point(size = 3, alpha = 0.7) +
  labs(title = "Scatter Plot of Popularity Score for Top 10 Languages",
       x = "Language", y = "Popularity Score") +
  theme_minimal() +
  theme(axis.text.x = element_text(angle = 45, hjust = 1, vjust = 1.1))
The analysis of the Twitter dataset has provided insights into user engagement, language distribution, and other trends related to ChatGPT, GPT3, and GPT4. The dataset serves as a valuable resource for understanding online discussions and sentiments surrounding advanced language models on Twitter.