This script visualizes hashtag co-occurrence networks for any given time range (e.g., a 14-day interval). Each tie in the network represents two hashtags co-occurring in the same tweet. The color of a tie indicates the policy position (supporter or opponent) of the Twitter user who posted the tweet.

Hashtags

Supporters

# Keep only the supporter tweets that carry at least one hashtag.
df_pro_hashtag <- df_pro[!is.na(df_pro$hashtag), ]
nrow(df_pro_hashtag)

## [1] 11513

# Proportion of supporter rows retained after dropping hashtag-less tweets.
nrow(df_pro_hashtag) / nrow(df_pro)

## [1] 0.6752493

# Expand to one row per hashtag; from here on a status_id repeats once for
# every hashtag contained in its tweet.
df_pro_hashtag <- separate_rows(df_pro_hashtag, hashtag)
nrow(df_pro_hashtag)

## [1] 31982

# Restrict to the (start_time, end_time] window, then discard hashtags that
# are one character long or shorter.
df_pro_hashtag_timerange <- df_pro_hashtag[
  df_pro_hashtag$created_at > start_time &
    df_pro_hashtag$created_at <= end_time,
]
df_pro_hashtag_timerange <- df_pro_hashtag_timerange[
  nchar(df_pro_hashtag_timerange$hashtag) > 1,
]
nrow(df_pro_hashtag_timerange)

## [1] 332

range(df_pro_hashtag_timerange$created_at)

## [1] "2020-12-18T00:21:42.000Z" "2020-12-31T21:13:34.000Z"

# Supporter edge list: one row per (tweet, user, hashtag). Hashtags are
# lower-cased so that case variants of the same tag are counted together.
edgelist_pro_hashtag_timerange <- df_pro_hashtag_timerange[, c("status_id", "username", "hashtag")]
edgelist_pro_hashtag_timerange$hashtag <- tolower(edgelist_pro_hashtag_timerange$hashtag)
# Hashtags that can optionally be excluded; the filter below is currently
# disabled.
dropcolumns <- c("s386", "hr1044", "hr392", "s281", "hr3012")
#edgelist_pro_hashtag_timerange <- edgelist_pro_hashtag_timerange[!edgelist_pro_hashtag_timerange$hashtag%in% dropcolumns,]
# User-by-hashtag count matrix (rows: username, columns: hashtag).
user_by_hashtag_pro_timerange <- acast(
  edgelist_pro_hashtag_timerange,
  username ~ hashtag,
  fun.aggregate = length,
  value.var = "hashtag"
)
# Keep hashtags with a total frequency of at least 2. drop = FALSE keeps the
# result a matrix even when only a single hashtag passes the threshold.
user_by_hashtag_pro_timerange <- user_by_hashtag_pro_timerange[
  , colSums(user_by_hashtag_pro_timerange, na.rm = TRUE) > 1, drop = FALSE
]

# Tweet-by-hashtag count matrix (rows: status_id, columns: hashtag). The
# actual hashtag co-occurrence counts are derived from this matrix.
tweet_by_hashtag_pro_timerange <- acast(
  edgelist_pro_hashtag_timerange,
  status_id ~ hashtag,
  fun.aggregate = length,
  value.var = "hashtag"
)
class(tweet_by_hashtag_pro_timerange)

## [1] "matrix" "array"

# Keep hashtags that occur at least twice. drop = FALSE keeps the result a
# matrix even if only one hashtag remains, so the steps below still work.
tweet_by_hashtag_pro_timerange <- tweet_by_hashtag_pro_timerange[
  , colSums(tweet_by_hashtag_pro_timerange, na.rm = TRUE) > 1, drop = FALSE
]
# Hashtag-by-hashtag co-occurrence counts, i.e. t(X) %*% X of the tweet-level
# matrix. Multiply the tweet-hashtag matrix, not the user-hashtag matrix, so
# that the co-occurrence counts are not inflated.
hashtag_m_pro_timerange <- crossprod(tweet_by_hashtag_pro_timerange)
# A hashtag's co-occurrence with itself is not a tie.
diag(hashtag_m_pro_timerange) <- 0
# Remove hashtags that never co-occur with another hashtag (all-zero columns,
# then all-zero rows), again without collapsing the matrix to a vector.
hashtag_m_pro_timerange <- hashtag_m_pro_timerange[
  , colSums(hashtag_m_pro_timerange, na.rm = TRUE) > 0, drop = FALSE
]
hashtag_m_pro_timerange <- hashtag_m_pro_timerange[
  rowSums(hashtag_m_pro_timerange, na.rm = TRUE) > 0, , drop = FALSE
]
# At this point the network contains no isolates.

# Build the supporter hashtag graph from the co-occurrence count matrix.
# NOTE(review): graph_from_adjacency_matrix() is called with its defaults;
# presumably this treats the symmetric count matrix as directed and creates
# one edge per count in each direction, which would explain the edge total
# below. Confirm whether an undirected and/or weighted graph is intended.
g.hashtag_pro_timerange <- graph_from_adjacency_matrix(hashtag_m_pro_timerange)
ecount(g.hashtag_pro_timerange)

## [1] 722

# Label-only network plot; the seed is fixed so the layout is reproducible.
par(mar = c(.8, .8, 2, .8))
set.seed(1234)
plot(
  g.hashtag_pro_timerange,
  vertex.shape = "none",
  edge.arrow.size = 0,
  edge.width = .5,
  vertex.label.cex = 1,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  edge.color = alpha("steelblue", 0.5),
  layout = layout_with_lgl
)
title(main = paste("Hashtag Cooccurrence", "_pro_", filename_suffix))

ecount(g.hashtag_pro_timerange)

## [1] 722

vcount(g.hashtag_pro_timerange)

## [1] 24

# Export the graph as a two-column edge list (columns X1 and X2).
# as_edgelist() replaces the deprecated get.edgelist().
edgelist_adj_pro_hashtag_timerange <- data.frame(as_edgelist(g.hashtag_pro_timerange))
dim(edgelist_adj_pro_hashtag_timerange)

## [1] 722   2

# Tag every supporter edge with its position label for the combined network.
edgelist_adj_pro_hashtag_timerange$X3 <- "_pro_"

Opponents

# Keep only the opponent tweets that carry at least one hashtag.
df_con_hashtag <- df_con[!is.na(df_con$hashtag), ]
nrow(df_con_hashtag)

## [1] 6071

# Proportion of opponent rows retained after dropping hashtag-less tweets.
nrow(df_con_hashtag) / nrow(df_con)

## [1] 0.8294849

# Expand to one row per hashtag; from here on a status_id repeats once for
# every hashtag contained in its tweet.
df_con_hashtag <- separate_rows(df_con_hashtag, hashtag)
nrow(df_con_hashtag)

## [1] 48525

# Restrict to the (start_time, end_time] window, then discard hashtags that
# are one character long or shorter.
df_con_hashtag_timerange <- df_con_hashtag[
  df_con_hashtag$created_at > start_time &
    df_con_hashtag$created_at <= end_time,
]
df_con_hashtag_timerange <- df_con_hashtag_timerange[
  nchar(df_con_hashtag_timerange$hashtag) > 1,
]
nrow(df_con_hashtag_timerange)

## [1] 89

range(df_con_hashtag_timerange$created_at)

## [1] "2020-12-18T00:05:36.000Z" "2020-12-21T20:37:51.000Z"

# Opponent edge list: one row per (tweet, user, hashtag). Hashtags are
# lower-cased so that case variants of the same tag are counted together.
edgelist_con_hashtag_timerange <- df_con_hashtag_timerange[, c("status_id", "username", "hashtag")]
edgelist_con_hashtag_timerange$hashtag <- tolower(edgelist_con_hashtag_timerange$hashtag)
# Hashtags that can optionally be excluded; the filter below is currently
# disabled.
dropcolumns <- c("s386", "hr1044", "hr392", "s281", "hr3012")
#edgelist_con_hashtag_timerange <- edgelist_con_hashtag_timerange[!edgelist_con_hashtag_timerange$hashtag%in% dropcolumns,]
# User-by-hashtag count matrix (rows: username, columns: hashtag).
user_by_hashtag_con_timerange <- acast(
  edgelist_con_hashtag_timerange,
  username ~ hashtag,
  fun.aggregate = length,
  value.var = "hashtag"
)
# Keep hashtags with a total frequency of at least 2. drop = FALSE keeps the
# result a matrix even when only a single hashtag passes the threshold.
user_by_hashtag_con_timerange <- user_by_hashtag_con_timerange[
  , colSums(user_by_hashtag_con_timerange, na.rm = TRUE) > 1, drop = FALSE
]
# Tweet-by-hashtag count matrix (rows: status_id, columns: hashtag). The
# actual hashtag co-occurrence counts are derived from this matrix.
tweet_by_hashtag_con_timerange <- acast(
  edgelist_con_hashtag_timerange,
  status_id ~ hashtag,
  fun.aggregate = length,
  value.var = "hashtag"
)
class(tweet_by_hashtag_con_timerange)

## [1] "matrix" "array"

# Keep hashtags that occur at least twice. drop = FALSE keeps the result a
# matrix even if only one hashtag remains, so the steps below still work.
tweet_by_hashtag_con_timerange <- tweet_by_hashtag_con_timerange[
  , colSums(tweet_by_hashtag_con_timerange, na.rm = TRUE) > 1, drop = FALSE
]
# Hashtag-by-hashtag co-occurrence counts, i.e. t(X) %*% X of the tweet-level
# matrix. Multiply the tweet-hashtag matrix, not the user-hashtag matrix, so
# that the co-occurrence counts are not inflated.
hashtag_m_con_timerange <- crossprod(tweet_by_hashtag_con_timerange)
# A hashtag's co-occurrence with itself is not a tie.
diag(hashtag_m_con_timerange) <- 0
# Remove hashtags that never co-occur with another hashtag (all-zero columns,
# then all-zero rows), again without collapsing the matrix to a vector.
hashtag_m_con_timerange <- hashtag_m_con_timerange[
  , colSums(hashtag_m_con_timerange, na.rm = TRUE) > 0, drop = FALSE
]
hashtag_m_con_timerange <- hashtag_m_con_timerange[
  rowSums(hashtag_m_con_timerange, na.rm = TRUE) > 0, , drop = FALSE
]
# At this point the network contains no isolates.
# Build the opponent hashtag graph from the co-occurrence count matrix.
# NOTE(review): graph_from_adjacency_matrix() is called with its defaults;
# presumably this treats the symmetric count matrix as directed and creates
# one edge per count in each direction, which would explain the edge total
# below. Confirm whether an undirected and/or weighted graph is intended.
g.hashtag_con_timerange <- graph_from_adjacency_matrix(hashtag_m_con_timerange)
ecount(g.hashtag_con_timerange)

## [1] 180

# Label-only network plot; the seed is fixed so the layout is reproducible.
par(mar = c(.8, .8, 2, .8))
set.seed(1234)
plot(
  g.hashtag_con_timerange,
  vertex.shape = "none",
  edge.arrow.size = 0,
  edge.width = .5,
  vertex.label.cex = 1,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  edge.color = alpha("tomato", 0.5),
  layout = layout_with_lgl
)
title(main = paste("Hashtag Cooccurrence", "_con_", filename_suffix))

ecount(g.hashtag_con_timerange)

## [1] 180

vcount(g.hashtag_con_timerange)

## [1] 11

# Export the graph as a two-column edge list (columns X1 and X2).
# as_edgelist() replaces the deprecated get.edgelist().
edgelist_adj_con_hashtag_timerange <- data.frame(as_edgelist(g.hashtag_con_timerange))
dim(edgelist_adj_con_hashtag_timerange)

## [1] 180   2

# Tag every opponent edge with its position label for the combined network.
edgelist_adj_con_hashtag_timerange$X3 <- "_con_"

Combined

# Stack the supporter and opponent edge lists; column X3 holds the position
# label ("_pro_" or "_con_") of each edge.
edgelist_adj_all_hashtag_timerange <- rbind(
  edgelist_adj_pro_hashtag_timerange,
  edgelist_adj_con_hashtag_timerange
)
#edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[!edgelist_adj_all_hashtag_timerange$X1 %in% dropcolumns,]
#edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[!edgelist_adj_all_hashtag_timerange$X2 %in% dropcolumns,]
# Drop edges whose endpoint label (either end) is one character or shorter.
edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[
  nchar(edgelist_adj_all_hashtag_timerange$X1) > 1,
]
edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[
  nchar(edgelist_adj_all_hashtag_timerange$X2) > 1,
]

# Combined graph built from the stacked edge list.
g.hashtag_all_timerange <- graph_from_data_frame(
  edgelist_adj_all_hashtag_timerange
)
ecount(g.hashtag_all_timerange)

## [1] 902

# Combined plot 1: layout chosen by layout_nicely; edges are colored by the
# position of the group they come from.
par(mar = c(.8, .8, 2, .8))
set.seed(1234)
E(g.hashtag_all_timerange)$position <- edgelist_adj_all_hashtag_timerange$X3
# Map each position label to its translucent edge color.
E(g.hashtag_all_timerange)$color <- unname(
  c(
    "_pro_" = alpha("steelblue", 0.4),
    "_con_" = alpha("tomato", 0.4)
  )[E(g.hashtag_all_timerange)$position]
)
plot(
  g.hashtag_all_timerange,
  vertex.shape = "circle",
  vertex.color = "white",
  vertex.size = 1,
  vertex.frame.color = "gray90",
  edge.arrow.size = 0,
  edge.width = .4,
  vertex.label.cex = 1,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  layout = layout_nicely
)
title(main = paste("Hashtag Cooccurrence", "_all_", filename_suffix))
legend(
  "topright",
  legend = c("supporters", "opponents"),
  lty = 1,
  col = c("steelblue", "tomato")
)

# Combined plot 2: same graph drawn on a circular layout with smaller labels.
set.seed(1234)
E(g.hashtag_all_timerange)$position <- edgelist_adj_all_hashtag_timerange$X3
# Map each position label to its translucent edge color.
E(g.hashtag_all_timerange)$color <- unname(
  c(
    "_pro_" = alpha("steelblue", 0.4),
    "_con_" = alpha("tomato", 0.4)
  )[E(g.hashtag_all_timerange)$position]
)
plot(
  g.hashtag_all_timerange,
  vertex.shape = "circle",
  vertex.color = "white",
  vertex.size = 1,
  vertex.frame.color = "gray90",
  edge.arrow.size = 0,
  edge.width = .5,
  vertex.label.cex = 0.7,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  layout = layout_in_circle
)
title(main = paste("Hashtag Cooccurrence", "_all_", filename_suffix))
legend(
  "topright",
  legend = c("supporters", "opponents"),
  lty = 1,
  col = c("steelblue", "tomato")
)

Hashtag Frequency Tables

Supporters

# Frequency table of lower-cased supporter hashtags within the time window,
# rendered with datatable().
datatable(
  as.data.frame(
    table(tolower(df_pro_hashtag_timerange[["hashtag"]]))
  )
)

Opponents

# Frequency table of lower-cased opponent hashtags within the time window,
# rendered with datatable().
datatable(
  as.data.frame(
    table(tolower(df_con_hashtag_timerange[["hashtag"]]))
  )
)

Hashtag Co-Occurrence Networks

Tiangeng Lu

Last compiled on 12/09/2021

Preparation—Set Time Range

Hashtags

Supporters

Opponents

Combined

Hashtag Frequency Tables

Supporters

Opponents

Plots Output