Last compiled at 2023-01-15 23:08:44.
Importing libraries
library(dplyr) # data cleaning and processing
library(rtweet) # importing and processing Twitter data structure
library(igraph) # network visualization
myfiles <- list.files(getwd(), pattern = "", full.names = F)
## [1] "2023-01-13 23:27:42 EST"
## [1] "2023-01-15 22:51:39 EST"
NOTES: Due to Twitter API rate limits and unknown technical issues, it took 26 rounds of successful download and many other interrupted (failed) attempts to collect the data. The dataset was downloaded between 2023-01-13 23:27:42 and 2023-01-15 22:51:39.
nlist <- vector("list", length(myfiles)) # create an empty list that will contain all the data frames
for (n in 1:length(myfiles)) {
nlist[[n]] <- read_twitter_csv(myfiles[n])
}
rt118 <- do.call(what = "rbind", lapply(nlist, as.data.frame))
rt118 <- rt118[order(rt118$created_at),] # order the observations by time sequence
rt118 <- unique(rt118) # remove duplicated rows, if any
slist <- vector("list", length(rt_source))
for (n in 1: length(rt_source)) {
slist[[n]] <- read_twitter_csv(rt_source[n])
}
source118 <- do.call(what = "rbind", lapply(slist, as.data.frame))
source118 <- source118[order(source118$created_at),]
source118 <- unique(source118)
dim(source118)
## [1] 27491 90
ids <- unique(rt118$status_id) # unique tweet ids in the retweet (primary) dataset
refids <- unique(rt118$referenced_status_id[rt118$referenced_status_id %in% source118$status_id]) # referenced ids whose information are available in the source dataset
allids <- c(ids, refids) # combine the ids in the retweet (primary) dataset and the referenced ids
allids <- unique(allids) # remove duplicates
length(allids) # TOTAL TWEETS AVAILABLE FOR ANALYSIS
## [1] 354974
KEY NUMBERS:
The time range of the main retweets dataset is between 2021-01-03 and 2023-01-03.
The original tweets were posted between 2019-02-09 20:47:11 and 2023-01-03 22:16:29.
Combining the tweets in the retweet (primary) dataset and the original (source) tweets, the final number of tweets available for analysis is 354,974.
There are 328,760 observations (retweeted tweets) in the dataset.
ts_plot(rt118) +
ggtitle(label = paste(as.Date(range(rt118$created_at)[1]), " to ", as.Date(range(rt118$created_at)[2]), sep = ""), subtitle = paste("Total ", format(nrow(rt118), big.mark = ",", scientific = F), " tweets", sep = "")) +
theme_minimal() +
xlab("Time") + ylab("Daily tweets")
What are the variables?
This demonstration only uses several variables.
author_id
is a unique Twitter user identifier for
those who retweeted others’ tweets.
status_id
is a unique tweet identifier of a given
Twitter post.
referenced_user_id
is a Twitter user identifier for
the users whose tweets had been retweeted.
referenced_status_id
is a unique tweet identier of a
given original Twitter post that had been retweeted.
names(rt118)
## [1] "account_created_at" "author_id"
## [3] "conversation_id" "created_at"
## [5] "description" "hashtag"
## [7] "lang" "like_count"
## [9] "mention" "name"
## [11] "quote_count" "reference_created_at"
## [13] "referenced_status_id" "referenced_text"
## [15] "referenced_user_created_at" "referenced_user_description"
## [17] "referenced_user_followers_count" "referenced_user_following_count"
## [19] "referenced_user_id" "referenced_user_listed_count"
## [21] "referenced_user_location" "referenced_user_name"
## [23] "referenced_user_protected" "referenced_user_tweet_count"
## [25] "referenced_user_verified" "referenced_username"
## [27] "reply_count" "retweet_count"
## [29] "status_id" "text"
## [31] "type" "username"
How many unique original tweets? Are there any discrepancies? It is possible to have a small number of discrepancies. Some tweets are unavailable if they had been deleted at the time of data collection or have enhanced privacy setting (e.g., only available to followers).
length(unique(rt118$referenced_status_id))
## [1] 26248
length(unique(source118$status_id))
## [1] 26214
There’re 26248 original tweets that have been retweeted.
Which are the most retweeted tweet? How many times have they been retweeted? Both in the dataset and according to the Twitter statistics?
rt_freq <- rt118 %>%
group_by(referenced_status_id) %>%
summarize(Freq = n()) # Frequency table from the dataset
rt_stats <- source118[,c("status_id", "retweet_count")] # retweet count from Twitter
rt_stats <- unique(rt_stats)
rt_tabs <- merge(rt_freq, rt_stats, by.x = "referenced_status_id", by.y = "status_id")
rt_tabs <- rt_tabs[rev(order(rt_tabs$retweet_count)),]
rt_tabs$ratio <- rt_tabs$Freq/rt_tabs$retweet_count
rt_tabs$ratio[rt_tabs$retweet_count==0] <- 0
head(rt_tabs[,c("Freq", "retweet_count", "ratio")]) # expected high correlation between Freq and retweet_count, differences likely due to the original tweet was created earlier so that most its retweets weren't captured in this dataset.
## Freq retweet_count ratio
## 62 52 5891 0.0088270243
## 26 10 1501 0.0066622252
## 4947 1032 1158 0.8911917098
## 4946 1032 1157 0.8919619706
## 312 1 1101 0.0009082652
## 23582 941 1023 0.9198435973
hist(rt_tabs$ratio, xlab = "Freq/retweet_count", main = "Retweet Data Quality Evaluation") # This is to evaluate data quality: whether qualified retweets have been captured in our dataset. Ideally, they should be equal to 1.0. >1.0 indicates Twitter statistics error.
rm(rt_freq, rt_stats)
length(which(rt_tabs$ratio>1))/nrow(rt_tabs) # % of observations with actual Freq>Twitter reported retweet_count, indicating data quality problem.
## [1] 0.01908745
mean(rt_tabs$ratio)
## [1] 0.9373848
NOTES:
The ratio
of actual retweet count
(Freq
) should be equal or very close to the official
retweet count (retweet_count
). If the two counts are
greatly different, maybe the original tweet was posted
earlier than the queried time range of this
dataset.
Among the 26300 original tweets, there’re 502 tweets, or, 1.909% of the tweets that have actual retweet count higher than the official retweet count.
Gini coef of re-tweet distribution by tweets
rt_freq <- rt118 %>%
group_by(referenced_status_id) %>%
summarize(Freq = n()) # Frequency table from dataset
rt_freq <- rt_freq[order(rt_freq$Freq),]
rt_freq$pct <- rt_freq$Freq/sum(rt_freq$Freq)
# Auxiliary line
x1 = seq(round(.5*nrow(rt_freq)), nrow(rt_freq), length.out = 100)
y1 = seq(.5, 0, length.out = 100)
n = nrow(rt_freq)
Area_Bi <- rep(NA, n)
for (i in 1: n) {
Area_Bi[i] <- (n - i + 0.5)*rt_freq$pct[i]/n
}
sum(Area_Bi)
## [1] 0.1340361
Gini_coef = 1-2*sum(Area_Bi); Gini_coef
## [1] 0.7319278
\[Gini = \frac{A}{A+B}\]
plot(cumsum(rt_freq$pct), pch = ".", xaxt = "n", xlab = "cumulative % of number of tweets", ylab = "cumulative % of number of retweets", main = paste("Retweet Gini Coefficient = ",round(Gini_coef, 3), sep = ""), col = "blue", lwd = 2, type = "l")
axis(side = 1, at = c(0, .25*nrow(rt_freq), .5*nrow(rt_freq), .75*nrow(rt_freq), nrow(rt_freq)), labels = c("0","25%", "50%", "75%", "100%"))
lines(rbind(c(0,0), c(nrow(rt_freq),1)), type = "l", col = "red") # Line of Equality created between Point A (0,0) and Point B(nrow(df),1)
lines(rbind(c(0,0),c(nrow(rt_freq),0)), type = "l", lty = 2) # horizontal line
lines(rbind(c(nrow(rt_freq),0), c(nrow(rt_freq),1)), type = "l", lty = 2) # vertical line
text(x = .25*nrow(rt_freq), y = .3, "Perfectly equality line", srt = 45)
text(x = .75*nrow(rt_freq), y = cumsum(rt_freq$pct)[.8*nrow(rt_freq)]+0.05, "Lorenz Curve", srt = 45) # Lorenz Curve at 75%
#lines(rbind(c(nrow(rt_freq), 0), c(.5*nrow(rt_freq),.5)), type = "l", lty = 3) # auxiliary line
text(x = x1[15], y = y1[15], "A", cex = 3)
text(x = x1[85], y = y1[85], "B", cex = 3)
text(x = .2*nrow(rt_freq), y = .8, expression(Gini==frac(A, A+B)), cex= 1.5)
Dataset to calculate Gini coefficient
tail(rt_freq[,c("Freq", "pct")])
## # A tibble: 6 × 2
## Freq pct
## <int> <dbl>
## 1 571 0.00174
## 2 595 0.00181
## 3 635 0.00193
## 4 644 0.00196
## 5 941 0.00286
## 6 1032 0.00314
Retweet Network In-degree Gini Coefficient
ref_user <- rt118 %>%
group_by(referenced_user_id) %>%
summarize(Freq = n())
ref_user <- ref_user[order(ref_user$Freq), ]
ref_user$pct <- ref_user$Freq/sum(ref_user$Freq)
# Auxiliary line
x2 = seq(round(.5*nrow(ref_user)), nrow(ref_user), length.out = 100)
y2 = seq(.5, 0, length.out = 100)
n = nrow(ref_user)
Area_Bi <- rep(NA, n)
for ( i in 1:n) {
Area_Bi[i] <- (n - i + 0.5)*ref_user$pct[i]/n
}
sum(Area_Bi)
## [1] 0.04290992
1 - 2*sum(Area_Bi) # Gini Coef
## [1] 0.9141802
Gini_coef = 1 - 2*sum(Area_Bi)
plot(cumsum(ref_user$pct), pch = ".", xaxt = "n", xlab = "cumulative % of number of users", ylab = "cumulative % of number of retweet indegrees", main = paste("Network Indegree Gini Coefficient = ",round(Gini_coef, 3), sep = ""), col = "blue", lwd = 2, type = "l")
axis(side = 1, at = c(0, .25*nrow(ref_user), .5*nrow(ref_user), .75*nrow(ref_user), nrow(ref_user)), labels = c("0","25%", "50%", "75%", "100%"))
lines(rbind(c(0,0), c(nrow(ref_user),1)), type = "l", col = "red") # Line of Equality created between Point A (0,0) and Point B(nrow(df),1)
lines(rbind(c(0,0),c(nrow(ref_user),0)), type = "l", lty = 2) # horizontal line
lines(rbind(c(nrow(ref_user),0), c(nrow(ref_user),1)), type = "l", lty = 2) # vertical line
text(x = .25*nrow(ref_user), y = .3, "Perfectly equality line", srt = 45)
text(x = .75*nrow(ref_user), y = cumsum(ref_user$pct)[.9*nrow(ref_user)]+0.05, "Lorenz Curve", srt = 45) # Lorenz Curve at 75%
#lines(rbind(c(nrow(ref_user), 0), c(.5*nrow(ref_user),.5)), type = "l", lty = 3) # auxiliary line
text(x = x2[15], y = y2[15], "A", cex = 3)
text(x = x2[85], y = y2[85], "B", cex = 3)
text(x = .2*nrow(ref_user), y = .8, expression(Gini==frac(A, A+B)), cex= 1.5)
Visualize the retweet network that retains n % of the ties.
rt_edgelist <- rt118[,c("author_id", "referenced_user_id")]
rt_edgelist <- rt_edgelist[complete.cases(rt_edgelist), ]
indeg_user <- rt_edgelist %>%
group_by(referenced_user_id) %>%
summarise(Freq = n())
indeg_user <- indeg_user[rev(order(indeg_user$Freq)),]
indeg_user$cumsum <- cumsum(indeg_user$Freq)/sum(indeg_user$Freq)
totalusers <- length(unique(union_all(rt_edgelist$author_id, rt_edgelist$referenced_user_id))) # total users, served as the denominator to get % of selected indegree retweet recipients
# e.g. retain x% of total ties
toppct <- 0.9
threshold <- min(which(indeg_user$cumsum>toppct))
keeps <- indeg_user$referenced_user_id[1:threshold] # selected recipients whose indegree met threshold
rt_edgelist_reduced <- rt_edgelist[rt_edgelist$referenced_user_id %in% keeps,]
grt <- graph_from_data_frame(rt_edgelist_reduced)
grt <- simplify(grt, remove.multiple = F, remove.loops = T)
vcount(grt);ecount(grt)
## [1] 13477
## [1] 293427
set.seed(1234)
par(mar=c(1,.5,1.5,.5))
V(grt)$label = ""
V(grt)$size = log(degree(grt)+2)
summary(V(grt)$size) # node size distribution. Make sure the values are positive.
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 1.099 1.099 1.099 1.828 1.946 11.122
plottitle <- paste("Visualizing ", toppct*100, "% of ties", ", including ",length(keeps),", or ", paste(round(100*length(keeps)/totalusers,2),"%", sep = ""), " most retweeted users", sep = "")
plot(grt, edge.arrow.size = .1, edge.curved = 0,edge.width = .1, vertex.size = V(grt)$size,vertex.frame.color = "gray50", edge.color = "gray70", main = plottitle, vertex.color = "lightsteelblue")
This network has 13,477 nodes and 293,427 ties.
To validate the network reduction, confirm that the number of ties in
the retweet network (grt
) equals to or slightly higher than
the number of observations in the dataset. The subset of
[rt118$author_id != rt118$referenced_user_id,]
is to remove
self-retweets.
ecount(grt)/nrow(rt118[rt118$author_id != rt118$referenced_user_id,]) # They should be equal or slightly largher than the threshold number.
## [1] 0.9010779
Execution ended at 2023-01-15 23:10:12.