Last compiled at 2023-01-15 23:08:44.

Importing Libraries

library(dplyr) # data cleaning and processing
library(rtweet) # importing and processing Twitter data structures
library(ggplot2) # plotting; ggtitle(), theme_minimal(), xlab()/ylab() used below come from ggplot2
library(igraph) # network visualization

Importing Datasets

myfiles <- list.files(getwd(), pattern = "", full.names = F) # list every data file in the working directory (the empty pattern matches all files)

## [1] "2023-01-13 23:27:42 EST"
## [1] "2023-01-15 22:51:39 EST"

NOTES: Due to Twitter API rate limits and unknown technical issues, it took 26 successful download rounds, plus many other interrupted (failed) attempts, to collect the data. The dataset was downloaded between 2023-01-13 23:27:42 and 2023-01-15 22:51:39.

nlist <- vector("list", length(myfiles)) # create an empty list that will contain all the data frames
for (n in 1:length(myfiles)) {
  nlist[[n]] <- read_twitter_csv(myfiles[n])
}
rt118 <- do.call(what = "rbind", lapply(nlist, as.data.frame))
rt118 <- rt118[order(rt118$created_at),] # order the observations by time sequence
rt118 <- unique(rt118) # remove duplicated rows, if any
slist <- vector("list", length(rt_source)) # empty list for the source (original-tweet) data frames; rt_source is the corresponding list of source-tweet file names, built analogously to myfiles
for (n in 1:length(rt_source)) {
  slist[[n]] <- read_twitter_csv(rt_source[n])
}
source118 <- do.call(what = "rbind", lapply(slist, as.data.frame))
source118 <- source118[order(source118$created_at),]
source118 <- unique(source118)
dim(source118)
## [1] 27491    90
ids <- unique(rt118$status_id) # unique tweet ids in the retweet (primary) dataset
refids <- unique(rt118$referenced_status_id[rt118$referenced_status_id %in% source118$status_id]) # referenced ids whose information is available in the source dataset
allids <- c(ids, refids) # combine the ids in the retweet (primary) dataset and the referenced ids
allids <- unique(allids) # remove duplicates
length(allids) # TOTAL TWEETS AVAILABLE FOR ANALYSIS
## [1] 354974

KEY NUMBERS:

  1. The main retweet (primary) dataset spans 2021-01-03 to 2023-01-03 (see the sketch after this list for how both date ranges are obtained).

  2. The original tweets were posted between 2019-02-09 20:47:11 and 2023-01-03 22:16:29.

  3. Combining the tweets in the retweet (primary) dataset and the original (source) tweets, the final number of tweets available for analysis is 354,974.
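
Both date ranges can be read directly from the timestamp columns; a minimal sketch, assuming the original tweets' posting times are in source118$created_at and that the timestamps are ISO-formatted so range() orders them correctly:

range(rt118$created_at) # time range of the retweet (primary) dataset
range(source118$created_at) # posting time range of the original (source) tweets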

Exploratory Analysis

There are 328,760 observations (retweets) in the dataset.

ts_plot(rt118) +
  ggtitle(label = paste(as.Date(range(rt118$created_at)[1]), " to ", as.Date(range(rt118$created_at)[2]), sep = ""),
          subtitle = paste("Total ", format(nrow(rt118), big.mark = ",", scientific = F), " tweets", sep = "")) +
  theme_minimal() +
  xlab("Time") + ylab("Daily tweets")

What are the variables?

This demonstration uses only a few of the variables (a quick preview of them follows the list).

  1. author_id is a unique Twitter user identifier for those who retweeted others’ tweets.

  2. status_id is a unique tweet identifier of a given Twitter post.

  3. referenced_user_id is a Twitter user identifier for the users whose tweets had been retweeted.

  4. referenced_status_id is a unique tweet identifier of a given original Twitter post that had been retweeted.
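
Together, author_id and referenced_user_id define a directed retweet tie (retweeter → original author). A minimal preview sketch of just these four columns:

rt118 %>%
  select(author_id, status_id, referenced_user_id, referenced_status_id) %>%
  head() # each row is one retweet event: author_id retweeted referenced_user_id's tweet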

names(rt118)
##  [1] "account_created_at"              "author_id"                      
##  [3] "conversation_id"                 "created_at"                     
##  [5] "description"                     "hashtag"                        
##  [7] "lang"                            "like_count"                     
##  [9] "mention"                         "name"                           
## [11] "quote_count"                     "reference_created_at"           
## [13] "referenced_status_id"            "referenced_text"                
## [15] "referenced_user_created_at"      "referenced_user_description"    
## [17] "referenced_user_followers_count" "referenced_user_following_count"
## [19] "referenced_user_id"              "referenced_user_listed_count"   
## [21] "referenced_user_location"        "referenced_user_name"           
## [23] "referenced_user_protected"       "referenced_user_tweet_count"    
## [25] "referenced_user_verified"        "referenced_username"            
## [27] "reply_count"                     "retweet_count"                  
## [29] "status_id"                       "text"                           
## [31] "type"                            "username"

How many unique original tweets are there? Are there any discrepancies? A small number of discrepancies is possible: some tweets are unavailable because they had been deleted by the time of data collection or have enhanced privacy settings (e.g., visible only to followers).

length(unique(rt118$referenced_status_id))
## [1] 26248
length(unique(source118$status_id))
## [1] 26214

There are 26,248 original tweets that have been retweeted.
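
The small gap between the two counts can be inspected directly; a minimal sketch listing the referenced tweets that have no matching record in the source dataset (missing_refs is just an illustrative name):

missing_refs <- setdiff(unique(rt118$referenced_status_id), source118$status_id) # referenced tweets absent from the source dataset
length(missing_refs) # expected to be small (deleted or protected tweets)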

Which are the most retweeted tweets? How many times have they been retweeted, both in the dataset and according to Twitter's statistics?

rt_freq <- rt118 %>%
  group_by(referenced_status_id) %>%
  summarize(Freq = n()) # Frequency table from the dataset
rt_stats <- source118[,c("status_id", "retweet_count")] # retweet count from Twitter
rt_stats <- unique(rt_stats)
rt_tabs <- merge(rt_freq, rt_stats, by.x = "referenced_status_id", by.y = "status_id")
rt_tabs <- rt_tabs[rev(order(rt_tabs$retweet_count)),]
rt_tabs$ratio <- rt_tabs$Freq/rt_tabs$retweet_count
rt_tabs$ratio[rt_tabs$retweet_count==0] <- 0
head(rt_tabs[,c("Freq", "retweet_count", "ratio")]) # Freq and retweet_count should be highly correlated; large differences are likely because the original tweet was created earlier, so most of its retweets were not captured in this dataset.
##       Freq retweet_count        ratio
## 62      52          5891 0.0088270243
## 26      10          1501 0.0066622252
## 4947  1032          1158 0.8911917098
## 4946  1032          1157 0.8919619706
## 312      1          1101 0.0009082652
## 23582  941          1023 0.9198435973
hist(rt_tabs$ratio, xlab = "Freq/retweet_count", main = "Retweet Data Quality Evaluation") # Evaluates data quality: whether the qualifying retweets were captured in our dataset. Ideally the ratios should equal 1.0; values above 1.0 indicate an error in the Twitter statistics.

rm(rt_freq, rt_stats)
length(which(rt_tabs$ratio>1))/nrow(rt_tabs) # proportion of tweets whose actual Freq exceeds the Twitter-reported retweet_count, indicating a data quality problem.
## [1] 0.01908745
mean(rt_tabs$ratio)
## [1] 0.9373848

NOTES:

  1. The actual retweet count (Freq) should be equal, or very close, to the official retweet count (retweet_count). If the two counts differ greatly, the original tweet was likely posted before the queried time range of this dataset (see the sketch after these notes).

  2. Among the 26,300 original tweets in the merged table, 502 tweets (1.909%) have an actual retweet count higher than the official retweet count.
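
As a rough check of Note 1, the original posting time of each referenced tweet can be compared with its coverage ratio; a sketch, assuming reference_created_at is populated and parseable so that min() and ordering behave sensibly (ratio_vs_time is an illustrative name):

ratio_vs_time <- rt118 %>%
  group_by(referenced_status_id) %>%
  summarize(ref_created = min(reference_created_at)) %>%
  left_join(rt_tabs, by = "referenced_status_id") # attach Freq, retweet_count, and ratio
head(ratio_vs_time[order(ratio_vs_time$ratio), c("ref_created", "Freq", "retweet_count", "ratio")]) # low-ratio tweets should tend to predate the retweet collection window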

Gini Coefficient Calculation

Gini coefficient of the retweet distribution across tweets

rt_freq <- rt118 %>%
  group_by(referenced_status_id) %>%
  summarize(Freq = n()) # Frequency table from dataset
rt_freq <- rt_freq[order(rt_freq$Freq),]
rt_freq$pct <- rt_freq$Freq/sum(rt_freq$Freq)
# Auxiliary line
x1 = seq(round(.5*nrow(rt_freq)), nrow(rt_freq), length.out = 100)
y1 = seq(.5, 0, length.out = 100)

n = nrow(rt_freq)
Area_Bi <- rep(NA, n)
for (i in 1: n) {
  Area_Bi[i] <- (n - i + 0.5)*rt_freq$pct[i]/n
} 
sum(Area_Bi)
## [1] 0.1340361
Gini_coef = 1-2*sum(Area_Bi); Gini_coef
## [1] 0.7319278
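
As a sanity check, the same (uncorrected) Gini coefficient can be computed with an existing implementation, for example Gini() from the ineq package, assuming it is installed; the value should match the one above up to floating-point rounding:

library(ineq) # assumed available; install.packages("ineq") otherwise
Gini(rt_freq$Freq, corr = FALSE) # uncorrected Gini; should reproduce the ~0.732 above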

\[Gini = \frac{A}{A+B}\]
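
The 1 - 2*sum(Area_Bi) computation above follows from this definition: the whole area under the line of equality is A + B = 1/2, so Gini = 2A = 1 - 2B, and the loop approximates B, the area under the Lorenz curve, with a trapezoidal sum:

\[Gini = \frac{A}{A+B} = 2A = 1 - 2B, \qquad B \approx \frac{1}{n}\sum_{i=1}^{n}\left(n - i + \tfrac{1}{2}\right) p_i\]

where \(p_i\) is the (ascending-sorted) retweet share of the \(i\)-th tweet, as stored in rt_freq$pct.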

plot(cumsum(rt_freq$pct), pch = ".", xaxt = "n", xlab = "cumulative % of number of tweets", ylab = "cumulative % of number of retweets", main = paste("Retweet Gini Coefficient = ",round(Gini_coef, 3), sep = ""), col = "blue", lwd = 2, type = "l")
axis(side = 1, at = c(0, .25*nrow(rt_freq), .5*nrow(rt_freq), .75*nrow(rt_freq), nrow(rt_freq)), labels = c("0","25%", "50%", "75%", "100%"))
lines(rbind(c(0,0), c(nrow(rt_freq),1)), type = "l", col = "red") # Line of Equality created between Point A (0,0) and Point B(nrow(df),1)
lines(rbind(c(0,0),c(nrow(rt_freq),0)), type = "l", lty = 2) # horizontal line
lines(rbind(c(nrow(rt_freq),0), c(nrow(rt_freq),1)), type = "l", lty = 2) # vertical line
text(x = .25*nrow(rt_freq), y = .3, "Perfect equality line", srt = 45)
text(x = .75*nrow(rt_freq), y = cumsum(rt_freq$pct)[.8*nrow(rt_freq)]+0.05, "Lorenz Curve", srt = 45) # Lorenz Curve label
#lines(rbind(c(nrow(rt_freq), 0), c(.5*nrow(rt_freq),.5)), type = "l", lty = 3) # auxiliary line
text(x = x1[15], y = y1[15], "A", cex = 3)
text(x = x1[85], y = y1[85], "B", cex = 3)
text(x = .2*nrow(rt_freq), y = .8, expression(Gini==frac(A, A+B)), cex= 1.5)

Dataset used to calculate the Gini coefficient (the tail shows the most retweeted tweets)

tail(rt_freq[,c("Freq", "pct")])
## # A tibble: 6 × 2
##    Freq     pct
##   <int>   <dbl>
## 1   571 0.00174
## 2   595 0.00181
## 3   635 0.00193
## 4   644 0.00196
## 5   941 0.00286
## 6  1032 0.00314

Retweet Network In-degree Gini Coefficient

ref_user <- rt118 %>%
  group_by(referenced_user_id) %>%
  summarize(Freq = n())
ref_user <- ref_user[order(ref_user$Freq), ]
ref_user$pct <- ref_user$Freq/sum(ref_user$Freq)
# Auxiliary line
x2 = seq(round(.5*nrow(ref_user)), nrow(ref_user), length.out = 100)
y2 = seq(.5, 0, length.out = 100)

n = nrow(ref_user)
Area_Bi <- rep(NA, n)
for (i in 1:n) {
  Area_Bi[i] <- (n - i + 0.5)*ref_user$pct[i]/n
}
sum(Area_Bi)
## [1] 0.04290992
1 - 2*sum(Area_Bi) # Gini Coef
## [1] 0.9141802
Gini_coef = 1 - 2*sum(Area_Bi)
plot(cumsum(ref_user$pct), pch = ".", xaxt = "n", xlab = "cumulative % of number of users", ylab = "cumulative % of number of retweet indegrees", main = paste("Network Indegree Gini Coefficient = ",round(Gini_coef, 3), sep = ""), col = "blue", lwd = 2, type = "l")
axis(side = 1, at = c(0, .25*nrow(ref_user), .5*nrow(ref_user), .75*nrow(ref_user), nrow(ref_user)), labels = c("0","25%", "50%", "75%", "100%"))
lines(rbind(c(0,0), c(nrow(ref_user),1)), type = "l", col = "red") # Line of Equality created between Point A (0,0) and Point B(nrow(df),1)
lines(rbind(c(0,0),c(nrow(ref_user),0)), type = "l", lty = 2) # horizontal line
lines(rbind(c(nrow(ref_user),0), c(nrow(ref_user),1)), type = "l", lty = 2) # vertical line
text(x = .25*nrow(ref_user), y = .3, "Perfect equality line", srt = 45)
text(x = .75*nrow(ref_user), y = cumsum(ref_user$pct)[.9*nrow(ref_user)]+0.05, "Lorenz Curve", srt = 45) # Lorenz Curve label
#lines(rbind(c(nrow(ref_user), 0), c(.5*nrow(ref_user),.5)), type = "l", lty = 3) # auxiliary line
text(x = x2[15], y = y2[15], "A", cex = 3)
text(x = x2[85], y = y2[85], "B", cex = 3)
text(x = .2*nrow(ref_user), y = .8, expression(Gini==frac(A, A+B)), cex= 1.5)
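
Since the tweet-level and user-level calculations above repeat the same steps, they could be wrapped in a small helper; a minimal sketch (gini_from_counts is a hypothetical name, not part of the original script):

gini_from_counts <- function(counts) {
  counts <- sort(counts)                    # ascending order, as in the loops above
  n <- length(counts)
  p <- counts / sum(counts)                 # each unit's share of the total
  B <- sum((n - seq_len(n) + 0.5) * p) / n  # trapezoidal area under the Lorenz curve
  1 - 2 * B                                 # Gini = A / (A + B), with A + B = 1/2
}
gini_from_counts(rt_freq$Freq)  # should reproduce ~0.732
gini_from_counts(ref_user$Freq) # should reproduce ~0.914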

Visualizing Large Networks

Visualize the retweet network while retaining the most-retweeted recipients who together account for n% of the ties (here, toppct = 0.9, i.e., 90% of ties).

rt_edgelist <- rt118[,c("author_id", "referenced_user_id")]
rt_edgelist <- rt_edgelist[complete.cases(rt_edgelist), ]
indeg_user <- rt_edgelist %>%
  group_by(referenced_user_id) %>%
  summarise(Freq = n())
indeg_user <- indeg_user[rev(order(indeg_user$Freq)),]
indeg_user$cumsum <- cumsum(indeg_user$Freq)/sum(indeg_user$Freq)
totalusers <- length(unique(union_all(rt_edgelist$author_id, rt_edgelist$referenced_user_id))) # total users; serves as the denominator for the % of selected indegree retweet recipients
# e.g. retain x% of total ties
toppct <- 0.9
threshold <- min(which(indeg_user$cumsum>toppct))
keeps <- indeg_user$referenced_user_id[1:threshold] # selected recipients whose cumulative indegree share meets the threshold
rt_edgelist_reduced <- rt_edgelist[rt_edgelist$referenced_user_id %in% keeps,]
grt <- graph_from_data_frame(rt_edgelist_reduced)
grt <- simplify(grt, remove.multiple = F, remove.loops = T)
vcount(grt);ecount(grt)
## [1] 13477
## [1] 293427
set.seed(1234)
par(mar=c(1,.5,1.5,.5))
V(grt)$label = ""
V(grt)$size = log(degree(grt)+2)
summary(V(grt)$size) # node size distribution. Make sure the values are positive.
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   1.099   1.099   1.099   1.828   1.946  11.122
plottitle <- paste("Visualizing ", toppct*100, "% of ties", ", including ",length(keeps),", or ", paste(round(100*length(keeps)/totalusers,2),"%", sep = ""), " most retweeted users", sep = "")
plot(grt, edge.arrow.size = .1, edge.curved = 0,edge.width = .1, vertex.size = V(grt)$size,vertex.frame.color = "gray50", edge.color = "gray70", main = plottitle, vertex.color = "lightsteelblue")

This network has 13,477 nodes and 293,427 ties.

To validate the network reduction, confirm that the ratio of ties in the reduced retweet network (grt) to the number of observations in the dataset is equal to, or slightly higher than, the retention threshold (toppct). The subset [rt118$author_id != rt118$referenced_user_id, ] removes self-retweets from the denominator.

ecount(grt)/nrow(rt118[rt118$author_id != rt118$referenced_user_id,]) # The ratio should be equal to or slightly larger than the retention threshold (toppct).
## [1] 0.9010779

Execution ended at 2023-01-15 23:10:12.