Compiled on 2023-01-24 00:55:26
Join the following three tables: (1) tweets, (2) tweet/referenced tweet authors, and (3) referenced tweets
NOTE: The users table will be joined twice because it contains user information for both the original tweets and the referenced tweets.
time1 = Sys.time()
library(rtweet) # save_as_csv() that prepends numerical ids as characters
library(dplyr) # %>% convenient data cleaning
library(tibble) # enframe(table(data$variable)) frequency table, name, value
library(tidyr) # separate_rows
library(reshape2) # acast()
library(igraph) # plot
library(DT) # datatable
query <- c("gcbacklog", "greencardbacklog")
toupper(query)
## [1] "GCBACKLOG" "GREENCARDBACKLOG"
df_user <- read_twitter_csv("gcbcklog_11774.csv") # user
## Warning: `read_twitter_csv()` was deprecated in rtweet 1.0.0.
## ℹ Works on rtweet data saved before 1.0.0 version.
df_tweet <- read_twitter_csv("gcbcklog_all2011-2023_79997.csv") # tweet
df_ref <- read_twitter_csv("gcbcklog_ref2009to2023total30633obs.csv") # referenced tweets
Here are the columns (variables) of the three tables. We can use all of them or only a subset.
names(df_user)
## [1] "author_id" "account_created_at" "username"
## [4] "name" "description" "verified"
## [7] "protected" "followers_count" "tweet_count"
## [10] "following_count"
names(df_tweet)
## [1] "author_id" "conversation_id" "created_at"
## [4] "hashtag" "lang" "like_count"
## [7] "mention" "quote_count" "referenced_status_id"
## [10] "referenced_user_id" "reply_count" "retweet_count"
## [13] "status_id" "text" "type"
names(df_ref)
## [1] "referenced_author_id" "referenced_conversation_id"
## [3] "referenced_created_at" "referenced_hashtag"
## [5] "referenced_in_reply_to_user_id" "referenced_lang"
## [7] "referenced_like_count" "referenced_mention"
## [9] "referenced_mentioned_id" "referenced_quote_count"
## [11] "referenced_reply_count" "referenced_retweet_count"
## [13] "referenced_status_id" "referenced_text"
tweet + user
Which is the common variable?
intersect(names(df_user), names(df_tweet))
## [1] "author_id"
Join the tweet and the user tables. Exclude the protected column from the user table and the lang column from the tweet table.
df1 <- merge(df_user[ , names(df_user) != "protected"],
df_tweet[ , names(df_tweet) != "lang"],
by.x = "author_id", by.y = "author_id", all.y = TRUE)
dim(df1)
## [1] 80740 22
names(df1)
## [1] "author_id" "account_created_at" "username"
## [4] "name" "description" "verified"
## [7] "followers_count" "tweet_count" "following_count"
## [10] "conversation_id" "created_at" "hashtag"
## [13] "like_count" "mention" "quote_count"
## [16] "referenced_status_id" "referenced_user_id" "reply_count"
## [19] "retweet_count" "status_id" "text"
## [22] "type"
tweet + user + referenced
Which is the common column (variable)?
intersect(names(df1), names(df_ref))
## [1] "referenced_status_id"
Exclude a few less-used columns (variables)
df2 <- merge(df1,
df_ref[, !names(df_ref) %in%
c("referenced_in_reply_to_user_id",
"referenced_lang",
"referenced_mentioned_id")],
by = "referenced_status_id", all.x = TRUE)
dim(df2)
## [1] 80740 32
names(df2)
## [1] "referenced_status_id" "author_id"
## [3] "account_created_at" "username"
## [5] "name" "description"
## [7] "verified" "followers_count"
## [9] "tweet_count" "following_count"
## [11] "conversation_id" "created_at"
## [13] "hashtag" "like_count"
## [15] "mention" "quote_count"
## [17] "referenced_user_id" "reply_count"
## [19] "retweet_count" "status_id"
## [21] "text" "type"
## [23] "referenced_author_id" "referenced_conversation_id"
## [25] "referenced_created_at" "referenced_hashtag"
## [27] "referenced_like_count" "referenced_mention"
## [29] "referenced_quote_count" "referenced_reply_count"
## [31] "referenced_retweet_count" "referenced_text"
Sort the variables alphabetically, and then order the observations by time
df2 <- df2[, order(colnames(df2))]
df2 <- df2[order(df2$created_at), ]
names(df2)
## [1] "account_created_at" "author_id"
## [3] "conversation_id" "created_at"
## [5] "description" "followers_count"
## [7] "following_count" "hashtag"
## [9] "like_count" "mention"
## [11] "name" "quote_count"
## [13] "referenced_author_id" "referenced_conversation_id"
## [15] "referenced_created_at" "referenced_hashtag"
## [17] "referenced_like_count" "referenced_mention"
## [19] "referenced_quote_count" "referenced_reply_count"
## [21] "referenced_retweet_count" "referenced_status_id"
## [23] "referenced_text" "referenced_user_id"
## [25] "reply_count" "retweet_count"
## [27] "status_id" "text"
## [29] "tweet_count" "type"
## [31] "username" "verified"
tweet + user + referenced + referenced_user
To distinguish the authors of the original tweets from the authors of the referenced tweets, it is necessary to rename the columns of the user data.
names(df_user)
## [1] "author_id" "account_created_at" "username"
## [4] "name" "description" "verified"
## [7] "protected" "followers_count" "tweet_count"
## [10] "following_count"
prefix = "referenced_"
df_user_ref <- df_user
colnames(df_user_ref)[!grepl(pattern = prefix, colnames(df_user_ref), ignore.case = F)] <-
paste(prefix, colnames(df_user), sep = "")
colnames(df_user_ref)
## [1] "referenced_author_id" "referenced_account_created_at"
## [3] "referenced_username" "referenced_name"
## [5] "referenced_description" "referenced_verified"
## [7] "referenced_protected" "referenced_followers_count"
## [9] "referenced_tweet_count" "referenced_following_count"
Join the main data (tweet + user + referenced tweets) and the renamed user data by matching referenced_user_id to referenced_author_id.
df3 <- merge(
df2,
df_user_ref[, !colnames(df_user_ref) %in% c("referenced_protected", "referenced_following_count")],
by.x = "referenced_user_id", by.y = "referenced_author_id", all.x = TRUE
)
df3 <- df3[, order(colnames(df3))]
df3 <- df3[order(df3$created_at), ]
names(df3)
## [1] "account_created_at" "author_id"
## [3] "conversation_id" "created_at"
## [5] "description" "followers_count"
## [7] "following_count" "hashtag"
## [9] "like_count" "mention"
## [11] "name" "quote_count"
## [13] "referenced_account_created_at" "referenced_author_id"
## [15] "referenced_conversation_id" "referenced_created_at"
## [17] "referenced_description" "referenced_followers_count"
## [19] "referenced_hashtag" "referenced_like_count"
## [21] "referenced_mention" "referenced_name"
## [23] "referenced_quote_count" "referenced_reply_count"
## [25] "referenced_retweet_count" "referenced_status_id"
## [27] "referenced_text" "referenced_tweet_count"
## [29] "referenced_user_id" "referenced_username"
## [31] "referenced_verified" "reply_count"
## [33] "retweet_count" "status_id"
## [35] "text" "tweet_count"
## [37] "type" "username"
## [39] "verified"
The final data frame df3 has 39 columns containing information on (1) the original tweets, (2) the authors of the original tweets, (3) the referenced tweets, and (4) the authors of the referenced tweets.
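As a cross-check, the same three joins can be written as one dplyr pipeline. This is only a sketch; it should reproduce df3 up to row and column order, and recent dplyr versions may warn about many-to-many relationships where keys are duplicated.
df3_dplyr <- df_tweet %>%
  select(-lang) %>%
  left_join(select(df_user, -protected), by = "author_id") %>%
  left_join(select(df_ref, -referenced_in_reply_to_user_id,
                   -referenced_lang, -referenced_mentioned_id),
            by = "referenced_status_id") %>%
  left_join(select(df_user_ref, -referenced_protected,
                   -referenced_following_count),
            by = c("referenced_user_id" = "referenced_author_id"))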
An all-inclusive data frame has the advantage of making later analyses convenient. It is also forgetting-curve-friendly: over time we forget variable names and attributes, ad-hoc data-processing tips, the data storage location, file names, and so on.
Working with an all-inclusive data frame has disadvantages, too. First, the computational cost is substantial: storing and processing a larger data volume slows everything down, particularly for elaborate and complex analytical tasks. Second, if we later combine/merge the data frame with others, we must make sure the column names match exactly and that the data values are of the same class and measured on the same scale, which is a lot to keep track of. For projects that collect data over a long time period, this second disadvantage almost always creates data-quality problems.
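Before merging frames collected at different times, a minimal compatibility check like the following sketch can catch name and class mismatches early (df_a and df_b are hypothetical data frames):
common <- intersect(names(df_a), names(df_b)) # shared columns
setdiff(names(df_a), names(df_b)) # columns only in df_a
identical(sapply(df_a[common], class), sapply(df_b[common], class)) # same classes?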
If we have a specific question to answer, it is most efficient to subset the variables and observations. The following customized joined tables are the most common in social media network analysis.
The complete Twitter dataset has the potential to answer many research questions.
First, start with the overall longitudinal patterns, including the number of tweets and authors by year.
Second, study the communication patterns, including the mention, retweet, and reply networks. Who were the most mentioned accounts of all time? Who were the most retweeted? Were they the opinion leaders in the discussion? What were the attributes of the communication networks?
Third, investigate the tweet texts. Which hashtags, apart from the query terms, were the most frequent in each time period? Which hashtags co-occurred the most? Were there clusters or polarization in the hashtag networks?
Fourth, if the topic experienced external shocks (e.g., a policy change or a substantial advancement in the policy process), did the patterns of the communication networks and tweet texts differ from those of uneventful times?
ts_plot() generates a time series of daily tweet counts.
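A minimal sketch with rtweet's ts_plot(), assuming created_at parses with as.POSIXct() (adjust the format argument if needed) and ggplot2 is installed; a temporary copy keeps the character column used below intact:
tmp <- df_tweet
tmp$created_at <- as.POSIXct(tmp$created_at, tz = "UTC")
ts_plot(tmp, by = "days") # daily tweet counts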
Alternatively, we can subset the time stamp (created_at) and tweet id (status_id) columns and build the yearly counts ourselves:
tweet_ts <- df_tweet[, c("created_at", "status_id")] # get columns with time and id info
tweet_ts$year <- substr(df_tweet$created_at, 1, 4) # extract year from time
tweet_ts <- tweet_ts[order(tweet_ts$created_at), ] # sort obs by ascending time
tweet_ts <- unique(tweet_ts) # remove duplicated tweet ids
enframe(table(tweet_ts$year)) # frequency table by year
## # A tibble: 13 × 2
## name value
## <chr> <table[1d]>
## 1 2011 263
## 2 2012 35
## 3 2013 83
## 4 2014 18
## 5 2015 200
## 6 2016 112
## 7 2017 1254
## 8 2018 12180
## 9 2019 5152
## 10 2020 22058
## 11 2021 28292
## 12 2022 10326
## 13 2023 24
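The same yearly frequency table can also be produced with dplyr's count(), a sketch:
tweet_ts %>% count(year) # one row per year, n = tweets in that year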
plot(
enframe(table(tweet_ts$year[tweet_ts$year < max(tweet_ts$year)])),
type = "l", xlab = "year", ylab = "tweets",
main = paste(
as.numeric(min(tweet_ts$year)), " to ", as.numeric(max(tweet_ts$year))-1, " tweet count",
sep = ""
)
)
Merge selected variables from the user and tweet tables.
Remove rows without mention information.
Split the mention variable using separate_rows.
We only need a username and a mention column to show the mention relationship. Join the username variable from the user data and the mention variable from the tweet data by the shared variable author_id. Then transform the mention variable into multiple rows, splitting on "," so that each row represents one mention link. The number of rows increases greatly after splitting the mention column into the one-row-per-mention format.
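A toy example of the row expansion performed by separate_rows() (the names are made up):
toy <- data.frame(username = "alice", mention = "bob,carol,dave")
separate_rows(toy, mention, sep = ",") # three rows: alice-bob, alice-carol, alice-dave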
mention_df <- merge(df_user[, c("author_id", "username")],
df_tweet[,c("author_id", "mention", "created_at")],
by.x = "author_id", by.y = "author_id", all.y = TRUE)
# drop the author_id column and rows w/o mention
net_mention <- mention_df[!is.na(mention_df$mention),c("username", "mention", "created_at")]
# split the mention column into multiple rows, separated by ,
net_mention <- separate_rows(data = net_mention, mention, sep = ",", convert = TRUE)
Take a look at the resultant mention network data frame net_mention, with usernames partially hidden.
head(
data.frame(
cbind(
username = paste(substr(net_mention$username, 1, 3), "***", sep = ""),
mention = paste(substr(net_mention$mention, 1, 3), "***", sep = "")
)
)
)
## username mention
## 1 shi*** elo***
## 2 shi*** pat***
## 3 shi*** ash***
## 4 shi*** par***
## 5 abh*** SII***
## 6 abh*** Sen***
Fence-riders were mentioned more often than the known supporters/opponents.
mentioned_table <- enframe(table(net_mention$mention))
mentioned_table$value <- as.numeric(mentioned_table$value)
DT::datatable(mentioned_table[mentioned_table$value>2,])
Extract the year information as the time unit by which we split the data.
net_mention$year <- substr(net_mention$created_at, 1, 4)
net_mention$year <- factor(as.character(net_mention$year), ordered = TRUE)
net_mention <- net_mention[order(net_mention$created_at),]
levels(net_mention$year)
## [1] "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "2019" "2020"
## [11] "2021" "2022" "2023"
MOST MENTIONED USERS BY YEAR
Create frequency tables of the mentioned users for each year. Compared to the hashtag frequency tables, the mention tables are slightly simpler because (1) there is no need to convert usernames to upper case with toupper() and (2) there is no need to remove the strings used in the query.
The number of frequency tables generated equals the number of years. Each frequency table is stored as an element of a list.
Here are the steps to construct the frequency tables for each year:
Create an empty list mentioned_list and set its length to the number of years.
Assign the name of each element (still empty) as each year respectively.
Create an iteration loop to calculate the frequency table of each year.
mentioned_list <- vector(mode = "list", length = length(unique(net_mention$year))) # Step 1
names(mentioned_list) <- unique(net_mention$year) # Step 2
for (i in unique(net_mention$year)) # Step 3
{
mentioned_list[[i]] <- enframe(table(net_mention$mention[net_mention$year == i]))
mentioned_list[[i]]$value <- as.numeric(mentioned_list[[i]]$value)
mentioned_list[[i]] <- mentioned_list[[i]][order(mentioned_list[[i]]$value),] # non-desc
mentioned_list[[i]]$pct <- mentioned_list[[i]]$value/sum(mentioned_list[[i]]$value)
mentioned_list[[i]]$cumsum <- cumsum(mentioned_list[[i]]$value)/sum(mentioned_list[[i]]$value)
mentioned_list[[i]]$year <- i
print(tail(mentioned_list[[i]]))
}
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 MalignantNews 1 0.0417 0.458 2011
## 2 menghe 1 0.0417 0.5 2011
## 3 wadhwa 1 0.0417 0.542 2011
## 4 WhiteHouse 1 0.0417 0.583 2011
## 5 gauks 2 0.0833 0.667 2011
## 6 nytimes 8 0.333 1 2011
## # A tibble: 5 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 IndiaWest 1 0.0476 0.0476 2012
## 2 johntabin 1 0.0476 0.0952 2012
## 3 unmc 1 0.0476 0.143 2012
## 4 ShareThis 4 0.190 0.333 2012
## 5 KauffmanFDN 14 0.667 1 2012
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 RepGoodlatte 3 0.0566 0.509 2013
## 2 BarackObama 4 0.0755 0.585 2013
## 3 GOPWhip 4 0.0755 0.660 2013
## 4 HouseJudiciary 4 0.0755 0.736 2013
## 5 washingtonpost 4 0.0755 0.811 2013
## 6 Wharton 10 0.189 1 2013
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 SimonWDC 1 0.0714 0.357 2014
## 2 thehill 1 0.0714 0.429 2014
## 3 USCIS 1 0.0714 0.5 2014
## 4 WhiteHouse 1 0.0714 0.571 2014
## 5 YouTube 1 0.0714 0.643 2014
## 6 BarackObama 5 0.357 1 2014
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 TheToddSchulte 18 0.0552 0.638 2015
## 2 hwimmi 20 0.0613 0.699 2015
## 3 USCIS 21 0.0644 0.764 2015
## 4 WhiteHouse 22 0.0675 0.831 2015
## 5 DHSgov 26 0.0798 0.911 2015
## 6 fwd_us 29 0.0890 1 2015
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 RepDavid 3 0.0265 0.761 2016
## 2 TheToddSchulte 3 0.0265 0.788 2016
## 3 narendramodi 4 0.0354 0.823 2016
## 4 USCIS 6 0.0531 0.876 2016
## 5 fwd_us 7 0.0619 0.938 2016
## 6 POTUS 7 0.0619 1 2016
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 TomCottonAR 61 0.0209 0.740 2017
## 2 McCaskillOffice 91 0.0311 0.771 2017
## 3 SenTomCotton 100 0.0342 0.806 2017
## 4 HouseGOP 115 0.0394 0.845 2017
## 5 HouseDemocrats 116 0.0397 0.885 2017
## 6 SIIA_US 337 0.115 1 2017
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 SpeakerRyan 1504 0.0255 0.766 2018
## 2 HouseDemocrats 2580 0.0438 0.810 2018
## 3 SenateGOP 2693 0.0457 0.855 2018
## 4 SenateDems 2778 0.0471 0.902 2018
## 5 realDonaldTrump 2805 0.0476 0.950 2018
## 6 HouseGOP 2943 0.0499 1 2018
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 SenateGOP 352 0.0232 0.741 2019
## 2 realDonaldTrump 384 0.0253 0.766 2019
## 3 HouseDemocrats 495 0.0326 0.799 2019
## 4 SenateDems 610 0.0402 0.839 2019
## 5 SenMikeLee 705 0.0465 0.885 2019
## 6 SenatorDurbin 1739 0.115 1 2019
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 DickDurbin 2193 0.0324 0.545 2020
## 2 ChrisHartline 3431 0.0507 0.596 2020
## 3 CNN 4377 0.0647 0.661 2020
## 4 SenateDems 6098 0.0902 0.751 2020
## 5 SenRickScott 7145 0.106 0.856 2020
## 6 SenatorDurbin 9703 0.144 1 2020
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 SecMayorkas 2933 0.0371 0.660 2021
## 2 VP 3405 0.0431 0.703 2021
## 3 JudiciaryDems 4443 0.0563 0.759 2021
## 4 POTUS 4761 0.0603 0.820 2021
## 5 USCIS 6661 0.0843 0.904 2021
## 6 SenatorDurbin 7592 0.0961 1 2021
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 SenatorDurbin 726 0.0334 0.756 2022
## 2 AILANational 739 0.0340 0.790 2022
## 3 PabloReports 911 0.0419 0.832 2022
## 4 POTUS 928 0.0427 0.874 2022
## 5 USCISDirector 1177 0.0541 0.928 2022
## 6 USCIS 1560 0.0717 1 2022
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 USCISDirector 1 0.0303 0.515 2023
## 2 PramilaJayapal 2 0.0606 0.576 2023
## 3 RepZoeLofgren 2 0.0606 0.636 2023
## 4 TheDemocrats 2 0.0606 0.697 2023
## 5 USCIS 3 0.0909 0.788 2023
## 6 POTUS 7 0.212 1 2023
Extract the frequency tables from the list and rbind them into one data frame.
mentioned_year_df <- do.call(what = "rbind", args = lapply(mentioned_list, as.data.frame))
mentioned_year_df <- mentioned_year_df[order(mentioned_year_df$year, mentioned_year_df$value), ]
mentioned_year_df <- rapply(object = mentioned_year_df, f = round, classes = "numeric", how = "replace", digits = 5)
rownames(mentioned_year_df) <- 1:nrow(mentioned_year_df)
mentioned_year_N <- enframe(table(mentioned_year_df$year)) # number of mentioned accounts by year
years_gini <- mentioned_year_N$name[mentioned_year_N$value > 1000] # years with N > 1000
years_gini
## [1] "2018" "2019" "2020" "2021" "2022"
# Selectively display the frequency table: the top mentioned users who account for the upper 50% of total mentions;
# remove years in which the tweet count was extremely small
DT::datatable(mentioned_year_df[mentioned_year_df$cumsum >=0.5 & mentioned_year_df$year %in% years_gini, ])
Create nested loops (k iterating over the selected years, and i iterating over each row of the year-k data frame) to compute the Gini coefficients of the in-degree mention networks.
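For year k, with its N mentioned accounts sorted by ascending mention share p_i, the loop below approximates the area under the Lorenz curve with one strip per account and computes
\[G_k = 1 - 2\sum_{i=1}^{N}\frac{N - i + 0.5}{N}\, p_i\]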
N <- rep(NA, length(years_gini)) # number of mentioned accounts per year
Area_Bi <- vector(mode = "list", length = length(years_gini)) # Lorenz-curve strips per year
names(Area_Bi) <- years_gini
Gini_coef_mentioned <- rep(NA, length(years_gini))
for (k in years_gini) {
N[k] <- nrow(mentioned_year_df[mentioned_year_df$year == k,])
N <- N[!is.na(N)] # NAs generated in earlier iterations
for (i in 1:N[k]) {
## double [[]] to index a list element
Area_Bi[[k]][i] <- (N[k] - i + 0.5)*mentioned_year_df$pct[mentioned_year_df$year == k][i]/N[k]
}
Gini_coef_mentioned[k] <- 1 - 2*sum(Area_Bi[[k]])
Gini_coef_mentioned <- Gini_coef_mentioned[!is.na(Gini_coef_mentioned)]
Gini_coef_mentioned_df <- enframe(Gini_coef_mentioned)
colnames(Gini_coef_mentioned_df) <- c("year", "Gini_coef")
}
Gini_coef_mentioned_df
## # A tibble: 5 × 2
## year Gini_coef
## <chr> <dbl>
## 1 2018 0.874
## 2 2019 0.790
## 3 2020 0.933
## 4 2021 0.912
## 5 2022 0.804
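As an optional cross-check (this assumes the ineq package is installed; it is not loaded above), ineq::Gini() on the raw yearly counts should closely match the loop results:
library(ineq)
sapply(years_gini, function(k) Gini(mentioned_year_df$value[mentioned_year_df$year == k]))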
Select the relevant variables from all data frames to generate hashtag networks. Hashtag networks can be analyzed and visualized in many ways: we can study hashtags by author or by individual tweet. We can also study recipient-based hashtag networks; for example, which hashtags did a given recipient receive during a specific period? This approach is useful for studying online advocacy.
hashtag_df <- merge(df_tweet[,c("author_id", "status_id", "hashtag", "created_at")],
df_user[,c("author_id", "username")],
by = "author_id", all.x = TRUE)
hashtag_df <- unique(hashtag_df) # remove duplicated rows
Subset hashtag_df to retain the rows with non-blank hashtags. 67898 tweets, or 84.876% of the total, have at least one hashtag.
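The share quoted above is simply the number of hashtag-bearing tweets over all collected tweets (79997, per the source file name):
67898 / 79997 # ~= 0.84876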
hashtag_df <- hashtag_df[!is.na(hashtag_df$hashtag), ]
hashtag_df <- hashtag_df[order(hashtag_df$created_at), ]
Get hashtag co-occurrence based on each tweet
hashtag_tw <- separate_rows(data = hashtag_df, hashtag, sep = ",", convert = TRUE)
hashtag_tw$year <- substr(hashtag_tw$created_at, 1, 4)
enframe(table(hashtag_tw$year))
## # A tibble: 13 × 2
## name value
## <chr> <table[1d]>
## 1 2011 83
## 2 2012 26
## 3 2013 53
## 4 2014 8
## 5 2015 318
## 6 2016 85
## 7 2017 2001
## 8 2018 40463
## 9 2019 11608
## 10 2020 71843
## 11 2021 64891
## 12 2022 17556
## 13 2023 48
plot(enframe(table(hashtag_tw$year[hashtag_tw$year<max(hashtag_tw$year)])), type = "l",
main = "Hashtag count by year", xlab = "year", ylab = "hashtag")
hashtag_tw$upcase <- toupper(hashtag_tw$hashtag)
hashtag_tw$year <- factor(as.character(hashtag_tw$year), ordered = TRUE)
length(unique(hashtag_tw$year))
## [1] 13
hashtag_list <- vector(mode = "list", length = length(unique(hashtag_tw$year)))
names(hashtag_list) <- unique(hashtag_tw$year)
names(hashtag_list)
## [1] "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "2019" "2020"
## [11] "2021" "2022" "2023"
MOST FREQUENT HASHTAGS BY YEAR (the query terms GCBACKLOG and GREENCARDBACKLOG were removed from the tables)
for (i in unique(hashtag_tw$year) )
{
hashtag_list[[i]] <- enframe(table(hashtag_tw$upcase[hashtag_tw$year == i]))
hashtag_list[[i]]$value <- as.numeric(hashtag_list[[i]]$value) # coerce table to numeric
hashtag_list[[i]] <-
hashtag_list[[i]][order(hashtag_list[[i]]$value), ] # non-descending order
hashtag_list[[i]] <- hashtag_list[[i]][!hashtag_list[[i]]$name %in% toupper(query), ]
hashtag_list[[i]]$pct <-
hashtag_list[[i]]$value/sum(hashtag_list[[i]]$value) # pct
hashtag_list[[i]]$cumsum <-
cumsum(hashtag_list[[i]]$value)/sum(hashtag_list[[i]]$value) # cumsum
hashtag_list[[i]]$year <- i
print(tail(hashtag_list[[i]]))
}
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 NYTIMES 3 0.0366 0.512 2011
## 2 US 3 0.0366 0.549 2011
## 3 POLITICS 4 0.0488 0.598 2011
## 4 WORLD 5 0.0610 0.659 2011
## 5 FREENYT 6 0.0732 0.732 2011
## 6 IMMIGRATION 22 0.268 1 2011
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 LAWYER 1 0.0385 0.192 2012
## 2 STARTUPACT2 1 0.0385 0.231 2012
## 3 GREENCARD 2 0.0769 0.308 2012
## 4 HR3012 2 0.0769 0.385 2012
## 5 IMMIGRATION 7 0.269 0.654 2012
## 6 STARTUPACT 9 0.346 1 2012
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 TIMEISNOW 1 0.0189 0.264 2013
## 2 WANTTOBEHERE 1 0.0189 0.283 2013
## 3 WP 1 0.0189 0.302 2013
## 4 CIR 2 0.0377 0.340 2013
## 5 IMMIGRATION 10 0.189 0.528 2013
## 6 IMMIGRATIONREFORM 25 0.472 1 2013
## # A tibble: 5 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 CLEARGCBACKLOG 1 0.143 0.143 2014
## 2 GREEN 1 0.143 0.286 2014
## 3 H4EADRULE 1 0.143 0.429 2014
## 4 IMMIGRATIONREFORM 1 0.143 0.571 2014
## 5 IMMIGRATION 3 0.429 1 2014
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 AINF 22 0.0884 0.450 2015
## 2 IMMIGRATIONREFORM 22 0.0884 0.538 2015
## 3 TNTVOTE 22 0.0884 0.627 2015
## 4 HR213 25 0.100 0.727 2015
## 5 H4EAD 32 0.129 0.855 2015
## 6 IMMIGRATION 36 0.145 1 2015
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 H1B 3 0.0455 0.576 2016
## 2 LEGALIMMIGRANTS 3 0.0455 0.621 2016
## 3 I140EAD 5 0.0758 0.697 2016
## 4 IMMIGRATION 5 0.0758 0.773 2016
## 5 I140EADAP 7 0.106 0.879 2016
## 6 HR213 8 0.121 1 2016
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 LEGALIMMIGRANTS 47 0.0369 0.360 2017
## 2 H4EAD 60 0.0471 0.407 2017
## 3 S281 108 0.0848 0.492 2017
## 4 H4DREAMERS 146 0.115 0.606 2017
## 5 7MONTHSVS70YEARS 192 0.151 0.757 2017
## 6 HR392 309 0.243 1 2017
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 H4DREAMERS 1698 0.0541 0.592 2018
## 2 LEGALIMMIGRANTS 1853 0.0590 0.651 2018
## 3 H4EAD 1946 0.0620 0.713 2018
## 4 ISQUARED 2126 0.0677 0.781 2018
## 5 SAVEH4EAD 2191 0.0698 0.851 2018
## 6 HR392 4691 0.149 1 2018
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 H4DREAMERS 351 0.0399 0.527 2019
## 2 S386ISFAIR 371 0.0421 0.569 2019
## 3 PASSS386 385 0.0437 0.612 2019
## 4 YESS386 566 0.0643 0.677 2019
## 5 HR1044 856 0.0973 0.774 2019
## 6 S386 1989 0.226 1 2019
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 UNBLOCKS386 3431 0.0520 0.441 2020
## 2 PASSS386 3639 0.0551 0.497 2020
## 3 S386ISFAIR 4438 0.0672 0.564 2020
## 4 FAIRNESS 6281 0.0952 0.659 2020
## 5 YESS386 8026 0.122 0.781 2020
## 6 S386 14479 0.219 1 2020
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 REMOVECOUNTRYCAPS 1993 0.0392 0.658 2021
## 2 H4EADDELAYS 2537 0.0498 0.708 2021
## 3 HR3648 3265 0.0642 0.772 2021
## 4 EAGLEACT 3684 0.0724 0.845 2021
## 5 DURBINISRACIST 3893 0.0765 0.921 2021
## 6 DURBINHATESINDIANS 4006 0.0787 1 2021
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 HR3648 236 0.0247 0.581 2022
## 2 H1B 366 0.0383 0.619 2022
## 3 IMMIGRATION 495 0.0518 0.671 2022
## 4 GREENCARD 532 0.0556 0.726 2022
## 5 EAGLEACT 843 0.0882 0.815 2022
## 6 H4EADDELAYS 1773 0.185 1 2022
## # A tibble: 6 × 5
## name value pct cumsum year
## <chr> <dbl> <dbl> <dbl> <chr>
## 1 NEWYEAR 1 0.0333 0.7 2023
## 2 TECHCOMPANIES 1 0.0333 0.733 2023
## 3 US 1 0.0333 0.767 2023
## 4 WARONMERIT 1 0.0333 0.8 2023
## 5 H1B 2 0.0667 0.867 2023
## 6 EAGLEACT 4 0.133 1 2023
ALL HASHTAGS ALL YEARS IN ONE TABLE
hashtag_year_df <- do.call(what = "rbind", args = lapply(hashtag_list, as.data.frame))
hashtag_year_df <- hashtag_year_df[order(hashtag_year_df$year, hashtag_year_df$value), ]
hashtag_year_df <- rapply(object = hashtag_year_df, f = round, classes = "numeric", how = "replace", digits = 5)
rownames(hashtag_year_df) <- 1:nrow(hashtag_year_df)
DT::datatable(hashtag_year_df[hashtag_year_df$value>1, ])
length(unique(hashtag_tw$upcase)) # Total unique hashtags
## [1] 5535
length(unique(hashtag_tw$status_id)) # Total unique tweets
## [1] 67898
length(unique(hashtag_tw$author_id)) # Total unique users
## [1] 5389
hashtag_year <- hashtag_tw[, c("upcase", "year")]
hashtag_year <- unique(hashtag_year)
table(hashtag_year$year)
##
## 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023
## 32 10 19 6 51 36 116 1416 725 1446 2217 1485 28
plot(table(hashtag_year$year),
type = "l", ylab = "unique hashtag",
main = "Unique Hashtag Count by Year (ignore the most recent year)")
STATUS_ID BY HASHTAG COUNTS FOR ACTUAL HASHTAG CO-OCCURRENCE
acast() is the command that reshapes the edgelist into a matrix. The resultant matrix has tweet ids (status_id) in the rows and uppercase hashtags (upcase) in the columns. The dimension of the matrix is high, so it is strongly suggested to slice the edgelist into smaller time units!
In the tweet-by-hashtag matrix, each row sum is the number of hashtags appearing in one tweet, and each column sum is the total count of a hashtag across all tweets.
hashtag_tw_23 <- hashtag_tw[hashtag_tw$year == 2023,]
hashtag_tw_23 <- hashtag_tw_23[!hashtag_tw_23$upcase %in% toupper(query),]
hashtag_tw_23 <- hashtag_tw_23[!is.na(hashtag_tw_23$upcase), ]
head(hashtag_tw_23[, c("status_id", "upcase")] )
## # A tibble: 6 × 2
## status_id upcase
## <chr> <chr>
## 1 1609380347053211649 NEWYEAR
## 2 1609380347053211649 12YEARSASLAVE
## 3 1609380347053211649 DEPRESSION
## 4 1609380347053211649 FUCKTHEUSA
## 5 1609587076982358017 CONGRESS
## 6 1609587076982358017 2023NEWYEAR
tw_hashtag_m <- acast(data = hashtag_tw_23[, c("status_id", "upcase")],
formula = status_id ~ upcase,
length, value.var = "upcase")
head(tw_hashtag_m[1:10, 1:5])
## 12YEARSASLAVE 2023NEWYEAR AILA CHATGPT CONGRESS
## 1609380347053211649 1 0 0 0 0
## 1609587076982358017 0 1 0 0 1
## 1609593386167672835 0 0 0 0 0
## 1609625074058907649 0 0 0 0 0
## 1609680475320193024 0 0 0 0 0
## 1609681929619791872 0 0 0 0 0
class(tw_hashtag_m)
## [1] "matrix" "array"
colnames(tw_hashtag_m)
## [1] "12YEARSASLAVE" "2023NEWYEAR" "AILA"
## [4] "CHATGPT" "CONGRESS" "DECADES"
## [7] "DEPRESSION" "EAD" "EAGLEACT"
## [10] "F1VISA" "FINANCIALINSTITUTIONS" "FUCKTHEUSA"
## [13] "GREENCARD" "H1B" "H4"
## [16] "H4EADDELAYS" "HAPPYNEWYEAR" "I140"
## [19] "I485" "INDIA" "INDIAN"
## [22] "LCA" "NEWYEAR" "TECHCOMPANIES"
## [25] "US" "WARONMERIT"
dim(tw_hashtag_m)
## [1] 12 26
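A quick sketch to verify the interpretation of the matrix margins described above:
head(rowSums(tw_hashtag_m)) # hashtags per tweet
head(sort(colSums(tw_hashtag_m), decreasing = TRUE)) # most frequent 2023 hashtags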
\[M_{hashtag\_adj} = M_{tweet \times hashtag}^{\top} \, M_{tweet \times hashtag}\]
hashtag_adjm <- t(tw_hashtag_m) %*% tw_hashtag_m
diag(hashtag_adjm) <- 0 # set diagonal values to 0
hashtag_adjm[1:6,1:6] # Hashtag adjacency matrix is symmetric
## 12YEARSASLAVE 2023NEWYEAR AILA CHATGPT CONGRESS DECADES
## 12YEARSASLAVE 0 0 0 0 0 0
## 2023NEWYEAR 0 0 0 0 1 0
## AILA 0 0 0 1 0 0
## CHATGPT 0 0 1 0 0 0
## CONGRESS 0 1 0 0 0 0
## DECADES 0 0 0 0 0 0
dim(hashtag_adjm)
## [1] 26 26
This is a rudimentary network visualization. The network isn't large, so there is no need to set up many parameters before plotting it.
g_hashtag_adj <- graph_from_adjacency_matrix(hashtag_adjm)
vcount(g_hashtag_adj);ecount(g_hashtag_adj)
## [1] 26
## [1] 96
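Since hashtag_adjm is symmetric, an undirected weighted graph is a natural alternative; this sketch collapses reciprocal links into single weighted edges, so the edge count drops accordingly:
g_und <- graph_from_adjacency_matrix(hashtag_adjm, mode = "undirected", weighted = TRUE)
vcount(g_und); ecount(g_und)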
# rough ideas for choosing network visualization parameters
seq(from = 1, to = 40, length.out = vcount(g_hashtag_adj))
## [1] 1.00 2.56 4.12 5.68 7.24 8.80 10.36 11.92 13.48 15.04 16.60 18.16
## [13] 19.72 21.28 22.84 24.40 25.96 27.52 29.08 30.64 32.20 33.76 35.32 36.88
## [25] 38.44 40.00
set.seed(12345)
summary(degree(g_hashtag_adj))
## Min. 1st Qu. Median Mean 3rd Qu. Max.
## 0.000 2.000 7.000 7.385 14.000 16.000
V(g_hashtag_adj)$label <- V(g_hashtag_adj)$name
par(mar=c(2,.8,2,.8))
plot(g_hashtag_adj,
vertex.size = degree(g_hashtag_adj)+1, vertex.color = "snow",
vertex.shape = "circle",
vertex.label = V(g_hashtag_adj)$label,
vertex.label.color = "grey20",
vertex.label.font = 2, # bold
vertex.label.cex = 0.6 + log(degree(g_hashtag_adj)+1)*0.12,
vertex.frame.color = "grey90",
edge.arrow.size = 0,
edge.curved = 0,
main = "Words used in the query were removed from the network visualization")
time2 = Sys.time()
time2 - time1
## Time difference of 9.536939 secs