Compiled on 2023-01-24 00:55:26

Join the following three tables: (1) tweets, (2) tweet/referenced tweet authors, and (3) referenced tweets

NOTE: The users table will be joined twice because this table includes user information for both the original tweets and the referenced tweets.

time1 = Sys.time()
library(rtweet) # save_as_csv()/read_twitter_csv() keep numerical ids as characters
library(dplyr) # %>% convenient data cleaning
library(tibble) # enframe(table(data$variable)) frequency table, name, value
library(tidyr) # separate_rows
library(reshape2) # acast()
library(igraph) # plot
library(DT) # datatable
query <- c("gcbacklog", "greencardbacklog")
toupper(query)
## [1] "GCBACKLOG"        "GREENCARDBACKLOG"

Import Dataframes

df_user <- read_twitter_csv("gcbcklog_11774.csv") # user
## Warning: `read_twitter_csv()` was deprecated in rtweet 1.0.0.
## ℹ Works on rtweet data saved before 1.0.0 version.
df_tweet <- read_twitter_csv("gcbcklog_all2011-2023_79997.csv") # tweet
df_ref <- read_twitter_csv("gcbcklog_ref2009to2023total30633obs.csv") # referenced tweets
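read_twitter_csv() is deprecated as of rtweet 1.0.0 but, as the warning notes, still reads data saved with earlier versions. Should it ever be removed, a readr sketch that preserves the id columns as character (column specification assumed from the tables below):

library(readr)
df_user_alt <- read_csv("gcbcklog_11774.csv",
                        col_types = cols(author_id = col_character()))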

Join Tables

Here are the columns (variables) from the tables. We can use all or only some of the variables from the three tables.

names(df_user)
##  [1] "author_id"          "account_created_at" "username"          
##  [4] "name"               "description"        "verified"          
##  [7] "protected"          "followers_count"    "tweet_count"       
## [10] "following_count"
names(df_tweet)
##  [1] "author_id"            "conversation_id"      "created_at"          
##  [4] "hashtag"              "lang"                 "like_count"          
##  [7] "mention"              "quote_count"          "referenced_status_id"
## [10] "referenced_user_id"   "reply_count"          "retweet_count"       
## [13] "status_id"            "text"                 "type"
names(df_ref)
##  [1] "referenced_author_id"           "referenced_conversation_id"    
##  [3] "referenced_created_at"          "referenced_hashtag"            
##  [5] "referenced_in_reply_to_user_id" "referenced_lang"               
##  [7] "referenced_like_count"          "referenced_mention"            
##  [9] "referenced_mentioned_id"        "referenced_quote_count"        
## [11] "referenced_reply_count"         "referenced_retweet_count"      
## [13] "referenced_status_id"           "referenced_text"

tweet + user

Which is the common variable?

intersect(names(df_user), names(df_tweet))
## [1] "author_id"
  1. Join the tweet and the user tables.

  2. Exclude the protected column from the user table.

  3. Exclude the lang column from the tweet table.

df1 <- merge(df_user[ , names(df_user) != "protected"], 
             df_tweet[ , names(df_tweet) != "lang"], 
             by.x = "author_id", by.y = "author_id", all.y = TRUE)
dim(df1)
## [1] 80740    22
names(df1)
##  [1] "author_id"            "account_created_at"   "username"            
##  [4] "name"                 "description"          "verified"            
##  [7] "followers_count"      "tweet_count"          "following_count"     
## [10] "conversation_id"      "created_at"           "hashtag"             
## [13] "like_count"           "mention"              "quote_count"         
## [16] "referenced_status_id" "referenced_user_id"   "reply_count"         
## [19] "retweet_count"        "status_id"            "text"                
## [22] "type"
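The same join expressed with dplyr (a sketch; all.y = TRUE above corresponds to a left join onto df_tweet):

df1_dplyr <- df_tweet %>%
  select(-lang) %>%
  left_join(df_user %>% select(-protected), by = "author_id")
# same columns as df1, up to column order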

tweet + user + referenced

Which is the common column (variable)?

intersect(names(df1), names(df_ref))
## [1] "referenced_status_id"

Exclude a few less-used columns (variables)

df2 <- merge(df1, 
             df_ref[, !names(df_ref) %in% 
                      c("referenced_in_reply_to_user_id", 
                        "referenced_lang", 
                        "referenced_mentioned_id")], 
             by = "referenced_status_id", all.x = TRUE)
dim(df2)
## [1] 80740    32
names(df2)
##  [1] "referenced_status_id"       "author_id"                 
##  [3] "account_created_at"         "username"                  
##  [5] "name"                       "description"               
##  [7] "verified"                   "followers_count"           
##  [9] "tweet_count"                "following_count"           
## [11] "conversation_id"            "created_at"                
## [13] "hashtag"                    "like_count"                
## [15] "mention"                    "quote_count"               
## [17] "referenced_user_id"         "reply_count"               
## [19] "retweet_count"              "status_id"                 
## [21] "text"                       "type"                      
## [23] "referenced_author_id"       "referenced_conversation_id"
## [25] "referenced_created_at"      "referenced_hashtag"        
## [27] "referenced_like_count"      "referenced_mention"        
## [29] "referenced_quote_count"     "referenced_reply_count"    
## [31] "referenced_retweet_count"   "referenced_text"

Sort the variables alphabetically, and then order the observations by time

df2 <- df2[, order(colnames(df2))]
df2 <- df2[order(df2$created_at), ]
names(df2)
##  [1] "account_created_at"         "author_id"                 
##  [3] "conversation_id"            "created_at"                
##  [5] "description"                "followers_count"           
##  [7] "following_count"            "hashtag"                   
##  [9] "like_count"                 "mention"                   
## [11] "name"                       "quote_count"               
## [13] "referenced_author_id"       "referenced_conversation_id"
## [15] "referenced_created_at"      "referenced_hashtag"        
## [17] "referenced_like_count"      "referenced_mention"        
## [19] "referenced_quote_count"     "referenced_reply_count"    
## [21] "referenced_retweet_count"   "referenced_status_id"      
## [23] "referenced_text"            "referenced_user_id"        
## [25] "reply_count"                "retweet_count"             
## [27] "status_id"                  "text"                      
## [29] "tweet_count"                "type"                      
## [31] "username"                   "verified"

tweet + user + referenced + referenced_user

To differentiate the Twitter users who authored the original tweets from those who authored the referenced tweets, it is necessary to rename the columns of the user data.

names(df_user)
##  [1] "author_id"          "account_created_at" "username"          
##  [4] "name"               "description"        "verified"          
##  [7] "protected"          "followers_count"    "tweet_count"       
## [10] "following_count"
prefix  = "referenced_"
df_user_ref <- df_user
colnames(df_user_ref)[!grepl(pattern = prefix, colnames(df_user_ref), ignore.case = F)] <- 
  paste(prefix, colnames(df_user), sep = "")
colnames(df_user_ref)
##  [1] "referenced_author_id"          "referenced_account_created_at"
##  [3] "referenced_username"           "referenced_name"              
##  [5] "referenced_description"        "referenced_verified"          
##  [7] "referenced_protected"          "referenced_followers_count"   
##  [9] "referenced_tweet_count"        "referenced_following_count"
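Because none of the original user columns contains the prefix, the grepl() guard above selects every column; a simpler one-liner gives the same result (a sketch):

df_user_ref2 <- setNames(df_user, paste0(prefix, names(df_user)))
# identical(names(df_user_ref2), names(df_user_ref)) # should be TRUE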

Join the main data (tweet + user + referenced tweets), keyed by referenced_user_id, with the referenced user data, keyed by referenced_author_id.

df3 <- merge(
  df2, 
  df_user_ref[, !colnames(df_user_ref) %in% c("referenced_protected", "referenced_following_count")], 
  by.x = "referenced_user_id", by.y = "referenced_author_id", all.x = TRUE
)
df3 <- df3[, order(colnames(df3))]
df3 <- df3[order(df3$created_at), ]
names(df3)
##  [1] "account_created_at"            "author_id"                    
##  [3] "conversation_id"               "created_at"                   
##  [5] "description"                   "followers_count"              
##  [7] "following_count"               "hashtag"                      
##  [9] "like_count"                    "mention"                      
## [11] "name"                          "quote_count"                  
## [13] "referenced_account_created_at" "referenced_author_id"         
## [15] "referenced_conversation_id"    "referenced_created_at"        
## [17] "referenced_description"        "referenced_followers_count"   
## [19] "referenced_hashtag"            "referenced_like_count"        
## [21] "referenced_mention"            "referenced_name"              
## [23] "referenced_quote_count"        "referenced_reply_count"       
## [25] "referenced_retweet_count"      "referenced_status_id"         
## [27] "referenced_text"               "referenced_tweet_count"       
## [29] "referenced_user_id"            "referenced_username"          
## [31] "referenced_verified"           "reply_count"                  
## [33] "retweet_count"                 "status_id"                    
## [35] "text"                          "tweet_count"                  
## [37] "type"                          "username"                     
## [39] "verified"

The final data frame df3 has 39 columns containing information on (1) the original tweets, (2) the authors of the original tweets, (3) the referenced tweets, and (4) the authors of the referenced tweets.

An all-inclusive data frame has the advantage of making later analyses convenient. It is forgetting-curve-friendly: months later we may no longer remember the variable names and attributes, the ad-hoc data-processing steps, the data storage location, the file names, and so on.

Working with an all-inclusive data frame has disadvantages, too. First, the computational cost is substantial: storing and processing a larger data volume slows everything down, particularly for elaborate and complex analytical tasks. Second, if we later combine/merge the data frame with others, we must make sure the column names match exactly and that the data values are of the same class and measured on the same scale, which is a lot to keep track of. For projects that collect data over a long period, this second disadvantage almost always creates data quality problems.
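A quick pre-merge sanity check helps catch such mismatches early (a sketch with hypothetical data frames df_a and df_b):

shared <- intersect(names(df_a), names(df_b)) # columns both tables share
data.frame(column  = shared,
           class_a = sapply(df_a[shared], function(x) class(x)[1]),
           class_b = sapply(df_b[shared], function(x) class(x)[1]))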

If we have a specific question to answer, it is most efficient to subset the variables and observations. The following customized joined tables are the most common in social media network analysis.


Customized Data Construction and Analyses

The complete Twitter dataset has the potential to answer many research questions.

First, start with the overall longitudinal patterns, including the number of tweets and authors by year.

Second, study the communication patterns, including the mention, retweet, and reply networks. Who were the all-time most mentioned users? Who were the most retweeted? Were they the opinion leaders in the discussion? What were the attributes of the communication networks?

Third, investigate the tweet texts. Which were the most frequent hashtags in each time period, excluding the query terms? Which hashtags co-occurred the most? Were there clusters or polarization in the hashtag networks?

Fourth, if the issue experienced external shocks (e.g., a policy change or a substantial advancement in the policy process), did the patterns of the communication networks and tweet texts differ from those in uneventful times?

Longitudinal Patterns

ts_plot() generates a time series of daily tweet counts. Alternatively, we can subset the columns of the time stamp (created_at) and tweet id (status_id).
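For reference, a minimal ts_plot() sketch (assuming created_at parses with as.POSIXct(); otherwise supply a format string):

ts_plot(dplyr::mutate(df_tweet, created_at = as.POSIXct(created_at)), by = "years")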

tweet_ts <- df_tweet[, c("created_at", "status_id")] # get columns with time and id info
tweet_ts$year <- substr(df_tweet$created_at, 1, 4) # extract year from time
tweet_ts <- tweet_ts[order(tweet_ts$created_at), ] # sort obs by ascending time
tweet_ts <- unique(tweet_ts) # remove duplicated tweet ids
enframe(table(tweet_ts$year)) # frequency table by year
## # A tibble: 13 × 2
##    name  value      
##    <chr> <table[1d]>
##  1 2011    263      
##  2 2012     35      
##  3 2013     83      
##  4 2014     18      
##  5 2015    200      
##  6 2016    112      
##  7 2017   1254      
##  8 2018  12180      
##  9 2019   5152      
## 10 2020  22058      
## 11 2021  28292      
## 12 2022  10326      
## 13 2023     24
plot(
  enframe(table(tweet_ts$year[tweet_ts$year < max(tweet_ts$year)])), 
     type = "l", xlab = "year", ylab = "tweets", 
     main = paste(
       as.numeric(min(tweet_ts$year)), " to ", as.numeric(max(tweet_ts$year))-1, " tweet count",
       sep = ""
       ) 
     )


Mention Network

  1. Merge selected variables from the user and tweet tables

  2. Remove rows without mention information

  3. Split the mention variable using separate_rows

We only need a username and a mention column to show the mention relationship. Join the username variable from the user data and the mention variable from the tweet data by the shared variable author_id.

Then transform the mention variable into multiple rows, splitting on the comma (,), so that each row represents one mention link.

The number of rows increases greatly after splitting the mention column into the one-row-per-mention format.

mention_df <- merge(df_user[, c("author_id", "username")], 
                    df_tweet[,c("author_id", "mention", "created_at")], 
                    by.x = "author_id", by.y = "author_id", all.y = TRUE)
# drop the author_id column and rows w/o mention
net_mention <- mention_df[!is.na(mention_df$mention),c("username", "mention", "created_at")]
# split the mention column into multiple rows, separated by ,
net_mention <- separate_rows(data = net_mention, mention, sep = ",", convert = TRUE)

Take a look at the resulting mention network data frame net_mention, with usernames partially hidden.

head(
  data.frame(
    cbind(
      username = paste(substr(net_mention$username, 1, 3), "***", sep = ""),
      mention = paste(substr(net_mention$mention, 1, 3), "***", sep = "")
    )
  )
)
##   username mention
## 1   shi***  elo***
## 2   shi***  pat***
## 3   shi***  ash***
## 4   shi***  par***
## 5   abh***  SII***
## 6   abh***  Sen***


Most Mentioned Users

Fence-riders were mentioned more often than the known supporters and opponents.

mentioned_table <- enframe(table(net_mention$mention))
mentioned_table$value <- as.numeric(mentioned_table$value)
DT::datatable(mentioned_table[mentioned_table$value>2,])


Most Mentioned Users by Year (loop)

Extract the year as the time unit by which we split the data.

net_mention$year <- substr(net_mention$created_at, 1, 4)
net_mention$year <- factor(as.character(net_mention$year), ordered = TRUE)
net_mention <- net_mention[order(net_mention$created_at),]
levels(net_mention$year)
##  [1] "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "2019" "2020"
## [11] "2021" "2022" "2023"

MOST MENTIONED USERS BY YEAR

Create frequency tables of the mentioned users for each year. Compared to the hashtag frequency tables, the mention tables are slightly simpler because (1) there is no need to convert usernames to upper case with toupper() and (2) there is no need to remove the strings used in the query.

The number of frequency tables generated equals the number of years. Each frequency table is stored as an element of a list.

Here are the steps to construct the frequency table for each year:

  1. Create an empty list mentioned_list and set its length to the number of years.

  2. Assign the name of each element (still empty) as each year respectively.

  3. Create an iteration loop to calculate the frequency table of each year.

mentioned_list <- vector(mode = "list", length = length(unique(net_mention$year))) # Step 1
names(mentioned_list) <- unique(net_mention$year) # Step 2
for (i in unique(net_mention$year)) # Step 3
{
  mentioned_list[[i]] <- enframe(table(net_mention$mention[net_mention$year == i]))
  mentioned_list[[i]]$value <- as.numeric(mentioned_list[[i]]$value)
  mentioned_list[[i]] <- mentioned_list[[i]][order(mentioned_list[[i]]$value),] # non-desc
  mentioned_list[[i]]$pct <- mentioned_list[[i]]$value/sum(mentioned_list[[i]]$value)
  mentioned_list[[i]]$cumsum <- cumsum(mentioned_list[[i]]$value)/sum(mentioned_list[[i]]$value)
  mentioned_list[[i]]$year <- i
  print(tail(mentioned_list[[i]]))
}
## # A tibble: 6 × 5
##   name          value    pct cumsum year 
##   <chr>         <dbl>  <dbl>  <dbl> <chr>
## 1 MalignantNews     1 0.0417  0.458 2011 
## 2 menghe            1 0.0417  0.5   2011 
## 3 wadhwa            1 0.0417  0.542 2011 
## 4 WhiteHouse        1 0.0417  0.583 2011 
## 5 gauks             2 0.0833  0.667 2011 
## 6 nytimes           8 0.333   1     2011 
## # A tibble: 5 × 5
##   name        value    pct cumsum year 
##   <chr>       <dbl>  <dbl>  <dbl> <chr>
## 1 IndiaWest       1 0.0476 0.0476 2012 
## 2 johntabin       1 0.0476 0.0952 2012 
## 3 unmc            1 0.0476 0.143  2012 
## 4 ShareThis       4 0.190  0.333  2012 
## 5 KauffmanFDN    14 0.667  1      2012 
## # A tibble: 6 × 5
##   name           value    pct cumsum year 
##   <chr>          <dbl>  <dbl>  <dbl> <chr>
## 1 RepGoodlatte       3 0.0566  0.509 2013 
## 2 BarackObama        4 0.0755  0.585 2013 
## 3 GOPWhip            4 0.0755  0.660 2013 
## 4 HouseJudiciary     4 0.0755  0.736 2013 
## 5 washingtonpost     4 0.0755  0.811 2013 
## 6 Wharton           10 0.189   1     2013 
## # A tibble: 6 × 5
##   name        value    pct cumsum year 
##   <chr>       <dbl>  <dbl>  <dbl> <chr>
## 1 SimonWDC        1 0.0714  0.357 2014 
## 2 thehill         1 0.0714  0.429 2014 
## 3 USCIS           1 0.0714  0.5   2014 
## 4 WhiteHouse      1 0.0714  0.571 2014 
## 5 YouTube         1 0.0714  0.643 2014 
## 6 BarackObama     5 0.357   1     2014 
## # A tibble: 6 × 5
##   name           value    pct cumsum year 
##   <chr>          <dbl>  <dbl>  <dbl> <chr>
## 1 TheToddSchulte    18 0.0552  0.638 2015 
## 2 hwimmi            20 0.0613  0.699 2015 
## 3 USCIS             21 0.0644  0.764 2015 
## 4 WhiteHouse        22 0.0675  0.831 2015 
## 5 DHSgov            26 0.0798  0.911 2015 
## 6 fwd_us            29 0.0890  1     2015 
## # A tibble: 6 × 5
##   name           value    pct cumsum year 
##   <chr>          <dbl>  <dbl>  <dbl> <chr>
## 1 RepDavid           3 0.0265  0.761 2016 
## 2 TheToddSchulte     3 0.0265  0.788 2016 
## 3 narendramodi       4 0.0354  0.823 2016 
## 4 USCIS              6 0.0531  0.876 2016 
## 5 fwd_us             7 0.0619  0.938 2016 
## 6 POTUS              7 0.0619  1     2016 
## # A tibble: 6 × 5
##   name            value    pct cumsum year 
##   <chr>           <dbl>  <dbl>  <dbl> <chr>
## 1 TomCottonAR        61 0.0209  0.740 2017 
## 2 McCaskillOffice    91 0.0311  0.771 2017 
## 3 SenTomCotton      100 0.0342  0.806 2017 
## 4 HouseGOP          115 0.0394  0.845 2017 
## 5 HouseDemocrats    116 0.0397  0.885 2017 
## 6 SIIA_US           337 0.115   1     2017 
## # A tibble: 6 × 5
##   name            value    pct cumsum year 
##   <chr>           <dbl>  <dbl>  <dbl> <chr>
## 1 SpeakerRyan      1504 0.0255  0.766 2018 
## 2 HouseDemocrats   2580 0.0438  0.810 2018 
## 3 SenateGOP        2693 0.0457  0.855 2018 
## 4 SenateDems       2778 0.0471  0.902 2018 
## 5 realDonaldTrump  2805 0.0476  0.950 2018 
## 6 HouseGOP         2943 0.0499  1     2018 
## # A tibble: 6 × 5
##   name            value    pct cumsum year 
##   <chr>           <dbl>  <dbl>  <dbl> <chr>
## 1 SenateGOP         352 0.0232  0.741 2019 
## 2 realDonaldTrump   384 0.0253  0.766 2019 
## 3 HouseDemocrats    495 0.0326  0.799 2019 
## 4 SenateDems        610 0.0402  0.839 2019 
## 5 SenMikeLee        705 0.0465  0.885 2019 
## 6 SenatorDurbin    1739 0.115   1     2019 
## # A tibble: 6 × 5
##   name          value    pct cumsum year 
##   <chr>         <dbl>  <dbl>  <dbl> <chr>
## 1 DickDurbin     2193 0.0324  0.545 2020 
## 2 ChrisHartline  3431 0.0507  0.596 2020 
## 3 CNN            4377 0.0647  0.661 2020 
## 4 SenateDems     6098 0.0902  0.751 2020 
## 5 SenRickScott   7145 0.106   0.856 2020 
## 6 SenatorDurbin  9703 0.144   1     2020 
## # A tibble: 6 × 5
##   name          value    pct cumsum year 
##   <chr>         <dbl>  <dbl>  <dbl> <chr>
## 1 SecMayorkas    2933 0.0371  0.660 2021 
## 2 VP             3405 0.0431  0.703 2021 
## 3 JudiciaryDems  4443 0.0563  0.759 2021 
## 4 POTUS          4761 0.0603  0.820 2021 
## 5 USCIS          6661 0.0843  0.904 2021 
## 6 SenatorDurbin  7592 0.0961  1     2021 
## # A tibble: 6 × 5
##   name          value    pct cumsum year 
##   <chr>         <dbl>  <dbl>  <dbl> <chr>
## 1 SenatorDurbin   726 0.0334  0.756 2022 
## 2 AILANational    739 0.0340  0.790 2022 
## 3 PabloReports    911 0.0419  0.832 2022 
## 4 POTUS           928 0.0427  0.874 2022 
## 5 USCISDirector  1177 0.0541  0.928 2022 
## 6 USCIS          1560 0.0717  1     2022 
## # A tibble: 6 × 5
##   name           value    pct cumsum year 
##   <chr>          <dbl>  <dbl>  <dbl> <chr>
## 1 USCISDirector      1 0.0303  0.515 2023 
## 2 PramilaJayapal     2 0.0606  0.576 2023 
## 3 RepZoeLofgren      2 0.0606  0.636 2023 
## 4 TheDemocrats       2 0.0606  0.697 2023 
## 5 USCIS              3 0.0909  0.788 2023 
## 6 POTUS              7 0.212   1     2023

Extract the frequency tables from the list and rbind them into one data frame.

mentioned_year_df <- do.call(what = "rbind", args = lapply(mentioned_list, as.data.frame))
mentioned_year_df <- mentioned_year_df[order(mentioned_year_df$year, mentioned_year_df$value), ]
mentioned_year_df <- rapply(object = mentioned_year_df, f = round, classes = "numeric", how = "replace", digits = 5)
rownames(mentioned_year_df) <- 1:nrow(mentioned_year_df)
mentioned_year_N <- enframe(table(mentioned_year_df$year)) # total mentions by year
years_gini <- mentioned_year_N$name[mentioned_year_N$value > 1000] # years N>1000
years_gini
## [1] "2018" "2019" "2020" "2021" "2022"
# Selectively display the frequency table: top mentioned users who contribute to 50% of total mentions, and
# Remove years in which the count of tweets was extremely small
DT::datatable(mentioned_year_df[mentioned_year_df$cumsum >=0.5 & mentioned_year_df$year %in% years_gini, ])


Mention Network Gini Coefficients by Year (nested loops)

Create nested loops (k iterates over the selected years; i iterates over the rows of that year's frequency table) to compute the Gini coefficient of each year's mention in-degree distribution.
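The loop implements the standard midpoint formula for a Gini coefficient computed on shares sorted in non-descending order:

\[G = 1 - \frac{2}{N}\sum_{i=1}^{N}\left(N - i + \frac{1}{2}\right)p_{i}\]

where \(p_i\) is the mention share of the \(i\)-th least-mentioned user and \(N\) is the number of mentioned users in that year.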

N <- rep(NA, length(years_gini))
Area_Bi <- vector(mode = "list", length = length(years_gini)) # per-year Lorenz-curve areas
names(Area_Bi) <- years_gini
Gini_coef_mentioned <- rep(NA, length(years_gini))
for (k in years_gini) {
  N[k] <- nrow(mentioned_year_df[mentioned_year_df$year == k,]) # appends a named element
  N <- N[!is.na(N)] # drop the unnamed NA placeholders from earlier iterations
  for (i in 1:N[k]) {
    ## double [[]] to index a list element
    Area_Bi[[k]][i] <- (N[k] - i + 0.5)*mentioned_year_df$pct[mentioned_year_df$year == k][i]/N[k]
  }
  Gini_coef_mentioned[k] <- 1 - 2*sum(Area_Bi[[k]])
  Gini_coef_mentioned <- Gini_coef_mentioned[!is.na(Gini_coef_mentioned)]
  Gini_coef_mentioned_df <- enframe(Gini_coef_mentioned)
  colnames(Gini_coef_mentioned_df) <- c("year", "Gini_coef")
}
Gini_coef_mentioned_df
## # A tibble: 5 × 2
##   year  Gini_coef
##   <chr>     <dbl>
## 1 2018      0.874
## 2 2019      0.790
## 3 2020      0.933
## 4 2021      0.912
## 5 2022      0.804
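The same numbers can be obtained without nested loops (a vectorized sketch over the frequency table built above):

gini_by_year <- sapply(years_gini, function(k) {
  p <- mentioned_year_df$pct[mentioned_year_df$year == k] # already non-descending
  n <- length(p)
  1 - 2 * sum((n - seq_len(n) + 0.5) * p / n)
})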


Hashtag (Network) Investigation

Select the relevant variables from all data frames to generate hashtag networks. Hashtag networks can be analyzed and visualized in many ways: we can study hashtags by author or by tweet (an author-based sketch appears after the split below), and we can study recipient-based hashtag networks, too. For example, which hashtags did a given recipient receive during a specific period? This approach is useful for studying online advocacy.

hashtag_df <- merge(df_tweet[,c("author_id", "status_id", "hashtag", "created_at")],
                    df_user[,c("author_id", "username")],
                   by = "author_id", all.x = TRUE)
hashtag_df <- unique(hashtag_df) # remove duplicated rows

Subset hashtag_df to retain the rows with non-blank hashtags: 67898 tweets, or 84.876%, have at least one hashtag.

hashtag_df <- hashtag_df[!is.na(hashtag_df$hashtag), ]
hashtag_df <- hashtag_df[order(hashtag_df$created_at), ]


Split Hashtag Column into Tweet-Hashtag Edgelist

Get hashtag co-occurrence based on each tweet

hashtag_tw <- separate_rows(data = hashtag_df, hashtag, sep = ",", convert = TRUE)
hashtag_tw$year <- substr(hashtag_tw$created_at, 1, 4)
enframe(table(hashtag_tw$year))
## # A tibble: 13 × 2
##    name  value      
##    <chr> <table[1d]>
##  1 2011     83      
##  2 2012     26      
##  3 2013     53      
##  4 2014      8      
##  5 2015    318      
##  6 2016     85      
##  7 2017   2001      
##  8 2018  40463      
##  9 2019  11608      
## 10 2020  71843      
## 11 2021  64891      
## 12 2022  17556      
## 13 2023     48
plot(enframe(table(hashtag_tw$year[hashtag_tw$year<max(hashtag_tw$year)])), type = "l",
     main = "Hashtag count by year", xlab = "year", ylab = "hashtag")

hashtag_tw$upcase <- toupper(hashtag_tw$hashtag)
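For the author-based variant mentioned earlier, reshape by username instead of status_id (a sketch; across all years the matrix is high-dimensional, so slice by year first, as done below for tweets):

au_hashtag_m <- acast(data = hashtag_tw[, c("username", "upcase")], 
                      formula = username ~ upcase, 
                      length, value.var = "upcase")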


hashtag_tw$year <- factor(as.character(hashtag_tw$year), ordered = TRUE)
length(unique(hashtag_tw$year))
## [1] 13
hashtag_list <- vector(mode = "list", length = length(unique(hashtag_tw$year)))
names(hashtag_list) <- unique(hashtag_tw$year)
names(hashtag_list)
##  [1] "2011" "2012" "2013" "2014" "2015" "2016" "2017" "2018" "2019" "2020"
## [11] "2021" "2022" "2023"

Hashtags by Year and Counts

MOST FREQUENT HASHTAGS BY YEAR (the query words GCBACKLOG and GREENCARDBACKLOG were removed from the tables)

for (i in unique(hashtag_tw$year) ) 
{
 hashtag_list[[i]] <- enframe(table(hashtag_tw$upcase[hashtag_tw$year == i]))
 hashtag_list[[i]]$value <- as.numeric(hashtag_list[[i]]$value) # coerce table to numeric
 hashtag_list[[i]] <- 
   hashtag_list[[i]][order(hashtag_list[[i]]$value), ] # non-descending order
 hashtag_list[[i]] <- hashtag_list[[i]][!hashtag_list[[i]]$name %in% toupper(query), ]
 hashtag_list[[i]]$pct <- 
   hashtag_list[[i]]$value/sum(hashtag_list[[i]]$value) # pct
 hashtag_list[[i]]$cumsum <- 
   cumsum(hashtag_list[[i]]$value)/sum(hashtag_list[[i]]$value) # cumsum
 hashtag_list[[i]]$year <- i
 print(tail(hashtag_list[[i]]))
}
## # A tibble: 6 × 5
##   name        value    pct cumsum year 
##   <chr>       <dbl>  <dbl>  <dbl> <chr>
## 1 NYTIMES         3 0.0366  0.512 2011 
## 2 US              3 0.0366  0.549 2011 
## 3 POLITICS        4 0.0488  0.598 2011 
## 4 WORLD           5 0.0610  0.659 2011 
## 5 FREENYT         6 0.0732  0.732 2011 
## 6 IMMIGRATION    22 0.268   1     2011 
## # A tibble: 6 × 5
##   name        value    pct cumsum year 
##   <chr>       <dbl>  <dbl>  <dbl> <chr>
## 1 LAWYER          1 0.0385  0.192 2012 
## 2 STARTUPACT2     1 0.0385  0.231 2012 
## 3 GREENCARD       2 0.0769  0.308 2012 
## 4 HR3012          2 0.0769  0.385 2012 
## 5 IMMIGRATION     7 0.269   0.654 2012 
## 6 STARTUPACT      9 0.346   1     2012 
## # A tibble: 6 × 5
##   name              value    pct cumsum year 
##   <chr>             <dbl>  <dbl>  <dbl> <chr>
## 1 TIMEISNOW             1 0.0189  0.264 2013 
## 2 WANTTOBEHERE          1 0.0189  0.283 2013 
## 3 WP                    1 0.0189  0.302 2013 
## 4 CIR                   2 0.0377  0.340 2013 
## 5 IMMIGRATION          10 0.189   0.528 2013 
## 6 IMMIGRATIONREFORM    25 0.472   1     2013 
## # A tibble: 5 × 5
##   name              value   pct cumsum year 
##   <chr>             <dbl> <dbl>  <dbl> <chr>
## 1 CLEARGCBACKLOG        1 0.143  0.143 2014 
## 2 GREEN                 1 0.143  0.286 2014 
## 3 H4EADRULE             1 0.143  0.429 2014 
## 4 IMMIGRATIONREFORM     1 0.143  0.571 2014 
## 5 IMMIGRATION           3 0.429  1     2014 
## # A tibble: 6 × 5
##   name              value    pct cumsum year 
##   <chr>             <dbl>  <dbl>  <dbl> <chr>
## 1 AINF                 22 0.0884  0.450 2015 
## 2 IMMIGRATIONREFORM    22 0.0884  0.538 2015 
## 3 TNTVOTE              22 0.0884  0.627 2015 
## 4 HR213                25 0.100   0.727 2015 
## 5 H4EAD                32 0.129   0.855 2015 
## 6 IMMIGRATION          36 0.145   1     2015 
## # A tibble: 6 × 5
##   name            value    pct cumsum year 
##   <chr>           <dbl>  <dbl>  <dbl> <chr>
## 1 H1B                 3 0.0455  0.576 2016 
## 2 LEGALIMMIGRANTS     3 0.0455  0.621 2016 
## 3 I140EAD             5 0.0758  0.697 2016 
## 4 IMMIGRATION         5 0.0758  0.773 2016 
## 5 I140EADAP           7 0.106   0.879 2016 
## 6 HR213               8 0.121   1     2016 
## # A tibble: 6 × 5
##   name             value    pct cumsum year 
##   <chr>            <dbl>  <dbl>  <dbl> <chr>
## 1 LEGALIMMIGRANTS     47 0.0369  0.360 2017 
## 2 H4EAD               60 0.0471  0.407 2017 
## 3 S281               108 0.0848  0.492 2017 
## 4 H4DREAMERS         146 0.115   0.606 2017 
## 5 7MONTHSVS70YEARS   192 0.151   0.757 2017 
## 6 HR392              309 0.243   1     2017 
## # A tibble: 6 × 5
##   name            value    pct cumsum year 
##   <chr>           <dbl>  <dbl>  <dbl> <chr>
## 1 H4DREAMERS       1698 0.0541  0.592 2018 
## 2 LEGALIMMIGRANTS  1853 0.0590  0.651 2018 
## 3 H4EAD            1946 0.0620  0.713 2018 
## 4 ISQUARED         2126 0.0677  0.781 2018 
## 5 SAVEH4EAD        2191 0.0698  0.851 2018 
## 6 HR392            4691 0.149   1     2018 
## # A tibble: 6 × 5
##   name       value    pct cumsum year 
##   <chr>      <dbl>  <dbl>  <dbl> <chr>
## 1 H4DREAMERS   351 0.0399  0.527 2019 
## 2 S386ISFAIR   371 0.0421  0.569 2019 
## 3 PASSS386     385 0.0437  0.612 2019 
## 4 YESS386      566 0.0643  0.677 2019 
## 5 HR1044       856 0.0973  0.774 2019 
## 6 S386        1989 0.226   1     2019 
## # A tibble: 6 × 5
##   name        value    pct cumsum year 
##   <chr>       <dbl>  <dbl>  <dbl> <chr>
## 1 UNBLOCKS386  3431 0.0520  0.441 2020 
## 2 PASSS386     3639 0.0551  0.497 2020 
## 3 S386ISFAIR   4438 0.0672  0.564 2020 
## 4 FAIRNESS     6281 0.0952  0.659 2020 
## 5 YESS386      8026 0.122   0.781 2020 
## 6 S386        14479 0.219   1     2020 
## # A tibble: 6 × 5
##   name               value    pct cumsum year 
##   <chr>              <dbl>  <dbl>  <dbl> <chr>
## 1 REMOVECOUNTRYCAPS   1993 0.0392  0.658 2021 
## 2 H4EADDELAYS         2537 0.0498  0.708 2021 
## 3 HR3648              3265 0.0642  0.772 2021 
## 4 EAGLEACT            3684 0.0724  0.845 2021 
## 5 DURBINISRACIST      3893 0.0765  0.921 2021 
## 6 DURBINHATESINDIANS  4006 0.0787  1     2021 
## # A tibble: 6 × 5
##   name        value    pct cumsum year 
##   <chr>       <dbl>  <dbl>  <dbl> <chr>
## 1 HR3648        236 0.0247  0.581 2022 
## 2 H1B           366 0.0383  0.619 2022 
## 3 IMMIGRATION   495 0.0518  0.671 2022 
## 4 GREENCARD     532 0.0556  0.726 2022 
## 5 EAGLEACT      843 0.0882  0.815 2022 
## 6 H4EADDELAYS  1773 0.185   1     2022 
## # A tibble: 6 × 5
##   name          value    pct cumsum year 
##   <chr>         <dbl>  <dbl>  <dbl> <chr>
## 1 NEWYEAR           1 0.0333  0.7   2023 
## 2 TECHCOMPANIES     1 0.0333  0.733 2023 
## 3 US                1 0.0333  0.767 2023 
## 4 WARONMERIT        1 0.0333  0.8   2023 
## 5 H1B               2 0.0667  0.867 2023 
## 6 EAGLEACT          4 0.133   1     2023


ALL HASHTAGS ALL YEARS IN ONE TABLE

hashtag_year_df <- do.call(what = "rbind", args = lapply(hashtag_list, as.data.frame))
hashtag_year_df <- hashtag_year_df[order(hashtag_year_df$year, hashtag_year_df$value), ]
hashtag_year_df <- rapply(object = hashtag_year_df, f = round, classes = "numeric", how = "replace", digits = 5)
rownames(hashtag_year_df) <- 1:nrow(hashtag_year_df)
DT::datatable(hashtag_year_df[hashtag_year_df$value>1, ])


length(unique(hashtag_tw$upcase)) # Total unique hashtags
## [1] 5535
length(unique(hashtag_tw$status_id)) # Total unique tweets
## [1] 67898
length(unique(hashtag_tw$author_id)) # Total unique users
## [1] 5389
hashtag_year <- hashtag_tw[, c("upcase", "year")]
hashtag_year <- unique(hashtag_year)
table(hashtag_year$year)
## 
## 2011 2012 2013 2014 2015 2016 2017 2018 2019 2020 2021 2022 2023 
##   32   10   19    6   51   36  116 1416  725 1446 2217 1485   28
plot(table(hashtag_year$year), 
     type = "l", ylab = "unique hashtag", 
     main = "Unique Hashtag Count by Year (ignore the most recent year)")

Tweet-Hashtag Matrix (high-dimensional)

For ACTUAL HASHTAG CO-OCCURRENCE we need status_id by hashtag counts, and acast() is the command to reshape the edgelist into a matrix. The resultant matrix has tweet ids (status_id) in the rows and upper-case hashtags (upcase) in the columns. The dimension of the matrix is high, so it is strongly suggested to slice the edgelist into smaller time units!

In the tweet-by-hashtag matrix, the row sum is the number of hashtags appearing in one tweet, and the column sum is the total count of a hashtag across all the tweets.

hashtag_tw_23 <- hashtag_tw[hashtag_tw$year == 2023,]
hashtag_tw_23 <- hashtag_tw_23[!hashtag_tw_23$upcase %in% toupper(query),]
hashtag_tw_23 <- hashtag_tw_23[!is.na(hashtag_tw_23$upcase), ]
head(hashtag_tw_23[, c("status_id", "upcase")] )
## # A tibble: 6 × 2
##   status_id           upcase       
##   <chr>               <chr>        
## 1 1609380347053211649 NEWYEAR      
## 2 1609380347053211649 12YEARSASLAVE
## 3 1609380347053211649 DEPRESSION   
## 4 1609380347053211649 FUCKTHEUSA   
## 5 1609587076982358017 CONGRESS     
## 6 1609587076982358017 2023NEWYEAR
tw_hashtag_m <- acast(data = hashtag_tw_23[, c("status_id", "upcase")], 
                      formula = status_id ~ upcase, 
                      length, value.var = "upcase")
head(tw_hashtag_m[1:10, 1:5])
##                     12YEARSASLAVE 2023NEWYEAR AILA CHATGPT CONGRESS
## 1609380347053211649             1           0    0       0        0
## 1609587076982358017             0           1    0       0        1
## 1609593386167672835             0           0    0       0        0
## 1609625074058907649             0           0    0       0        0
## 1609680475320193024             0           0    0       0        0
## 1609681929619791872             0           0    0       0        0
class(tw_hashtag_m)
## [1] "matrix" "array"
colnames(tw_hashtag_m)
##  [1] "12YEARSASLAVE"         "2023NEWYEAR"           "AILA"                 
##  [4] "CHATGPT"               "CONGRESS"              "DECADES"              
##  [7] "DEPRESSION"            "EAD"                   "EAGLEACT"             
## [10] "F1VISA"                "FINANCIALINSTITUTIONS" "FUCKTHEUSA"           
## [13] "GREENCARD"             "H1B"                   "H4"                   
## [16] "H4EADDELAYS"           "HAPPYNEWYEAR"          "I140"                 
## [19] "I485"                  "INDIA"                 "INDIAN"               
## [22] "LCA"                   "NEWYEAR"               "TECHCOMPANIES"        
## [25] "US"                    "WARONMERIT"
dim(tw_hashtag_m)
## [1] 12 26
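As a check on this interpretation, the margins can be computed directly (output not shown):

sort(rowSums(tw_hashtag_m), decreasing = TRUE)[1:5] # hashtags per tweet
sort(colSums(tw_hashtag_m), decreasing = TRUE)[1:5] # tweet count per hashtag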

Hashtag Co-occurrence Adjacency Matrix

\[M_{\text{hashtag adj}} = M_{\text{tweet} \times \text{hashtag}}^{T} \, M_{\text{tweet} \times \text{hashtag}}\]

hashtag_adjm <- t(tw_hashtag_m) %*% tw_hashtag_m
diag(hashtag_adjm) <- 0 # set diagonal values to 0
hashtag_adjm[1:6,1:6] # Hashtag adjacency matrix is symmetric
##               12YEARSASLAVE 2023NEWYEAR AILA CHATGPT CONGRESS DECADES
## 12YEARSASLAVE             0           0    0       0        0       0
## 2023NEWYEAR               0           0    0       0        1       0
## AILA                      0           0    0       1        0       0
## CHATGPT                   0           0    1       0        0       0
## CONGRESS                  0           1    0       0        0       0
## DECADES                   0           0    0       0        0       0
dim(hashtag_adjm)
## [1] 26 26

Hashtag Co-occurrence Visualization

This is a rudimentary network visualization. The network isn't large, so there's no need to set up many parameters before visualizing it.

g_hashtag_adj <- graph_from_adjacency_matrix(hashtag_adjm)
vcount(g_hashtag_adj);ecount(g_hashtag_adj)
## [1] 26
## [1] 96
# rough ideas for setting network visualization parameters
seq(from = 1, to = 40, length.out = vcount(g_hashtag_adj)) 
##  [1]  1.00  2.56  4.12  5.68  7.24  8.80 10.36 11.92 13.48 15.04 16.60 18.16
## [13] 19.72 21.28 22.84 24.40 25.96 27.52 29.08 30.64 32.20 33.76 35.32 36.88
## [25] 38.44 40.00
set.seed(12345)
summary(degree(g_hashtag_adj))
##    Min. 1st Qu.  Median    Mean 3rd Qu.    Max. 
##   0.000   2.000   7.000   7.385  14.000  16.000
V(g_hashtag_adj)$label <- V(g_hashtag_adj)$name
par(mar=c(2,.8,2,.8))
plot(g_hashtag_adj, 
     vertex.size = degree(g_hashtag_adj)+1, vertex.color = "snow", 
     vertex.shape = "circle",
     vertex.label = V(g_hashtag_adj)$label, 
     vertex.label.color = "grey20",
     vertex.label.font = 2, # bold
     vertex.label.cex = 0.6 + log(degree(g_hashtag_adj)+1)*0.12,
     vertex.frame.color = "grey90",
     edge.arrow.size = 0, 
     edge.curved = 0, 
     main = "Words used in the query were removed from the network visualization")

time2 = Sys.time()
time2 - time1
## Time difference of 9.536939 secs
