S.386 Debate - 7/21/20
This script visualizes the hashtag co-occurrence networks for any given time range (e.g., a 14-day interval). Each network tie represents the co-occurrence of two hashtags in the same tweet. The color of a tie represents the policy position (supporter or opponent) of the Twitter user.
Data
Set the time range of investigation
# Analysis window: a start date plus the 14 days that follow it.
# as.Date() keeps only the calendar-date part of the timestamp string.
start_time <- as.Date("2020-07-21T00:06:00.000Z")
end_time <- start_time + 14
# Human-readable label for the window; appended to output file names and plot titles.
filename_suffix <- paste(format(c(start_time, end_time)), collapse = " to ")
filename_suffix
## [1] "2020-07-21 to 2020-08-04"
The time range of the following analyses is 2020-07-21 to 2020-08-04.
# Supporter ("pro") tweets: keep rows that carry at least one hashtag, put each
# hashtag on its own row, then restrict the rows to the analysis window.
df_pro_hashtag <- df_pro[!is.na(df_pro$hashtag), ] # rows that contain at least one hashtag
nrow(df_pro_hashtag) / nrow(df_pro) # share of rows retained
## [1] 0.6752493
# One row per hashtag; after this step status_id is duplicated for tweets with several hashtags.
df_pro_hashtag <- separate_rows(df_pro_hashtag, hashtag)
# NOTE(review): start_time/end_time are Date values and the range echoed below
# starts on 2020-07-22, so the window appears to be applied at day granularity as
# (start_time, end_time], i.e. tweets dated start_time itself are excluded.
# Confirm that this is the intended window.
df_pro_hashtag_timerange <- df_pro_hashtag[df_pro_hashtag$created_at > start_time & df_pro_hashtag$created_at <= end_time, ]
# Discard empty and single-character hashtag tokens.
df_pro_hashtag_timerange <- df_pro_hashtag_timerange[nchar(df_pro_hashtag_timerange$hashtag) > 1, ]
nrow(df_pro_hashtag_timerange)
## [1] 362
range(df_pro_hashtag_timerange$created_at)
## [1] "2020-07-22T00:19:10.000Z" "2020-08-04T18:43:52.000Z"
# Long-format table (tweet id, user, hashtag) for the supporter side. Hashtags
# are lower-cased so that differently cased spellings are counted together.
edgelist_pro_hashtag_timerange <- df_pro_hashtag_timerange[, c("status_id", "username", "hashtag")]
edgelist_pro_hashtag_timerange$hashtag <- tolower(edgelist_pro_hashtag_timerange$hashtag)
# Hashtags listed for optional exclusion; only used by the disabled filter below.
dropcolumns <- c("s386","hr1044","hr392","s281","hr3012")
#edgelist_pro_hashtag_timerange <- edgelist_pro_hashtag_timerange[!edgelist_pro_hashtag_timerange$hashtag%in% dropcolumns,]
## user by hashtag counts
user_by_hashtag_pro_timerange <- acast(edgelist_pro_hashtag_timerange, formula = username ~ hashtag, fun.aggregate = length, value.var = "hashtag")
# Keep hashtags with a total frequency of at least 2. drop = FALSE keeps the
# result a matrix even when only a single hashtag passes the threshold.
user_by_hashtag_pro_timerange <- user_by_hashtag_pro_timerange[, colSums(user_by_hashtag_pro_timerange, na.rm = TRUE) > 1, drop = FALSE]
## status_id by hashtag counts for ACTUAL HASHTAG COOCURRENCE
tweet_by_hashtag_pro_timerange <- acast(edgelist_pro_hashtag_timerange, formula = status_id ~ hashtag, fun.aggregate = length, value.var = "hashtag")
class(tweet_by_hashtag_pro_timerange)
## [1] "matrix" "array"
# Keep hashtags with a total frequency of at least 2. This matrix is only an
# intermediate step towards the co-occurrence matrix. drop = FALSE keeps it a
# matrix even if a single hashtag remains, so the product below stays valid.
tweet_by_hashtag_pro_timerange <- tweet_by_hashtag_pro_timerange[, colSums(tweet_by_hashtag_pro_timerange, na.rm = TRUE) > 1, drop = FALSE]
##
# Hashtag-by-hashtag co-occurrence counts, t(X) %*% X on the tweet-by-hashtag
# matrix. The tweet-level matrix is used rather than the user-level one so that
# the co-occurrence counts are not inflated.
hashtag_m_pro_timerange <- crossprod(tweet_by_hashtag_pro_timerange)
diag(hashtag_m_pro_timerange) <- 0 # a hashtag does not co-occur with itself
# Remove hashtags without any co-occurrence (all-zero columns, then all-zero rows).
hashtag_m_pro_timerange <- hashtag_m_pro_timerange[, colSums(hashtag_m_pro_timerange, na.rm = TRUE) > 0, drop = FALSE] # delete column sum 0s
hashtag_m_pro_timerange <- hashtag_m_pro_timerange[rowSums(hashtag_m_pro_timerange, na.rm = TRUE) > 0, , drop = FALSE] # delete row sum 0s
## as of now, no isolates in the network
# Build the supporter co-occurrence graph from the count matrix.
# NOTE(review): graph_from_adjacency_matrix() defaults to a directed graph, so
# each co-occurrence in the symmetric matrix presumably yields edges in both
# directions (arrows are hidden in the plot). Consider mode = "undirected" if
# edge counts should reflect co-occurrences once - confirm intent.
g.hashtag_pro_timerange <- graph_from_adjacency_matrix(hashtag_m_pro_timerange)
ecount(g.hashtag_pro_timerange)
## [1] 1200
par(mar = c(.8, .8, 2, .8))
set.seed(1234) # seed set right before plotting, presumably to keep the layout reproducible
plot(
  g.hashtag_pro_timerange,
  vertex.shape = "none",
  edge.arrow.size = 0,
  edge.width = .4,
  vertex.label.cex = 1,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  edge.color = alpha("steelblue", 0.3),
  layout = layout_with_lgl
)
title(main = paste("Hashtag Cooccurrence", "_pro_", filename_suffix))
ecount(g.hashtag_pro_timerange)
## [1] 1200
vcount(g.hashtag_pro_timerange)
## [1] 41
# Export the edges as a two-column data frame (X1, X2); as_edgelist() is the
# current name of the deprecated get.edgelist().
edgelist_adj_pro_hashtag_timerange <- data.frame(as_edgelist(g.hashtag_pro_timerange))
dim(edgelist_adj_pro_hashtag_timerange)
## [1] 1200 2
# Tag every edge with the policy position; rep() also works when there are no edges.
edgelist_adj_pro_hashtag_timerange$X3 <- rep("_pro_", nrow(edgelist_adj_pro_hashtag_timerange)) # data frame
# Opponent ("con") tweets: keep rows that carry at least one hashtag, put each
# hashtag on its own row, then restrict the rows to the analysis window.
df_con_hashtag <- df_con[!is.na(df_con$hashtag), ] # rows that contain at least one hashtag
nrow(df_con_hashtag) / nrow(df_con) # share of rows retained
## [1] 0.8294849
# One row per hashtag; after this step status_id is duplicated for tweets with several hashtags.
df_con_hashtag <- separate_rows(df_con_hashtag, hashtag)
# NOTE(review): start_time/end_time are Date values and the range echoed below
# starts on 2020-07-22, so the window appears to be applied at day granularity as
# (start_time, end_time], i.e. tweets dated start_time itself are excluded.
# Confirm that this is the intended window.
df_con_hashtag_timerange <- df_con_hashtag[df_con_hashtag$created_at > start_time & df_con_hashtag$created_at <= end_time, ]
# Discard empty and single-character hashtag tokens.
df_con_hashtag_timerange <- df_con_hashtag_timerange[nchar(df_con_hashtag_timerange$hashtag) > 1, ]
nrow(df_con_hashtag_timerange)
## [1] 40
range(df_con_hashtag_timerange$created_at)
## [1] "2020-07-22T00:41:39.000Z" "2020-08-04T21:34:00.000Z"
# Long-format table (tweet id, user, hashtag) for the opponent side. Hashtags
# are lower-cased so that differently cased spellings are counted together.
edgelist_con_hashtag_timerange <- df_con_hashtag_timerange[, c("status_id", "username", "hashtag")]
edgelist_con_hashtag_timerange$hashtag <- tolower(edgelist_con_hashtag_timerange$hashtag)
# Hashtags listed for optional exclusion; only used by the disabled filter below.
dropcolumns <- c("s386","hr1044","hr392","s281","hr3012")
#edgelist_con_hashtag_timerange <- edgelist_con_hashtag_timerange[!edgelist_con_hashtag_timerange$hashtag%in% dropcolumns,]
## user by hashtag counts
user_by_hashtag_con_timerange <- acast(edgelist_con_hashtag_timerange, formula = username ~ hashtag, fun.aggregate = length, value.var = "hashtag")
# Keep hashtags with a total frequency of at least 2. drop = FALSE keeps the
# result a matrix even when only a single hashtag passes the threshold.
user_by_hashtag_con_timerange <- user_by_hashtag_con_timerange[, colSums(user_by_hashtag_con_timerange, na.rm = TRUE) > 1, drop = FALSE]
## status_id by hashtag counts for ACTUAL HASHTAG COOCURRENCE
tweet_by_hashtag_con_timerange <- acast(edgelist_con_hashtag_timerange, formula = status_id ~ hashtag, fun.aggregate = length, value.var = "hashtag")
class(tweet_by_hashtag_con_timerange)
## [1] "matrix" "array"
# Keep hashtags with a total frequency of at least 2. This matrix is only an
# intermediate step towards the co-occurrence matrix. drop = FALSE keeps it a
# matrix even if a single hashtag remains, so the product below stays valid.
tweet_by_hashtag_con_timerange <- tweet_by_hashtag_con_timerange[, colSums(tweet_by_hashtag_con_timerange, na.rm = TRUE) > 1, drop = FALSE]
##
# Hashtag-by-hashtag co-occurrence counts, t(X) %*% X on the tweet-by-hashtag
# matrix. The tweet-level matrix is used rather than the user-level one so that
# the co-occurrence counts are not inflated.
hashtag_m_con_timerange <- crossprod(tweet_by_hashtag_con_timerange)
diag(hashtag_m_con_timerange) <- 0 # a hashtag does not co-occur with itself
# Remove hashtags without any co-occurrence (all-zero columns, then all-zero rows).
hashtag_m_con_timerange <- hashtag_m_con_timerange[, colSums(hashtag_m_con_timerange, na.rm = TRUE) > 0, drop = FALSE] # delete column sum 0s
hashtag_m_con_timerange <- hashtag_m_con_timerange[rowSums(hashtag_m_con_timerange, na.rm = TRUE) > 0, , drop = FALSE] # delete row sum 0s
## as of now, no isolates in the network
# Build the opponent co-occurrence graph from the count matrix.
# NOTE(review): graph_from_adjacency_matrix() defaults to a directed graph, so
# each co-occurrence in the symmetric matrix presumably yields edges in both
# directions (arrows are hidden in the plot). Consider mode = "undirected" if
# edge counts should reflect co-occurrences once - confirm intent.
g.hashtag_con_timerange <- graph_from_adjacency_matrix(hashtag_m_con_timerange)
ecount(g.hashtag_con_timerange)
## [1] 34
par(mar = c(.8, .8, 2, .8))
set.seed(1234) # seed set right before plotting, presumably to keep the layout reproducible
plot(
  g.hashtag_con_timerange,
  vertex.shape = "none",
  edge.arrow.size = 0,
  edge.width = .5,
  vertex.label.cex = 1,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  edge.color = alpha("tomato", 0.5),
  layout = layout_with_lgl
)
title(main = paste("Hashtag Cooccurrence", "_con_", filename_suffix))
ecount(g.hashtag_con_timerange)
## [1] 34
vcount(g.hashtag_con_timerange)
## [1] 4
# Export the edges as a two-column data frame (X1, X2); as_edgelist() is the
# current name of the deprecated get.edgelist().
edgelist_adj_con_hashtag_timerange <- data.frame(as_edgelist(g.hashtag_con_timerange))
dim(edgelist_adj_con_hashtag_timerange)
## [1] 34 2
# Tag every edge with the policy position; rep() also works when there are no edges.
edgelist_adj_con_hashtag_timerange$X3 <- rep("_con_", nrow(edgelist_adj_con_hashtag_timerange)) # data frame
# Stack the supporter and opponent edge lists; column X3 records which side
# each edge came from.
edgelist_adj_all_hashtag_timerange <- rbind(
  edgelist_adj_pro_hashtag_timerange,
  edgelist_adj_con_hashtag_timerange
)
#edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[!edgelist_adj_all_hashtag_timerange$X1 %in% dropcolumns,]
#edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[!edgelist_adj_all_hashtag_timerange$X2 %in% dropcolumns,]
# Keep only edges whose two endpoints are both longer than one character.
edgelist_adj_all_hashtag_timerange <- edgelist_adj_all_hashtag_timerange[
  with(edgelist_adj_all_hashtag_timerange, nchar(X1) > 1 & nchar(X2) > 1),
]
# The first two columns become the edge endpoints; X3 is carried along as an edge attribute.
g.hashtag_all_timerange <- graph_from_data_frame(edgelist_adj_all_hashtag_timerange)
ecount(g.hashtag_all_timerange); vcount(g.hashtag_all_timerange)
## [1] 1234
## [1] 44
## combined 1
# Combined network: edges coloured by policy position.
par(mar = c(.8, .8, 2, .8))
E(g.hashtag_all_timerange)$position <- edgelist_adj_all_hashtag_timerange$X3
# Look up each edge's colour by its position label; labels other than the two below stay NA.
edge_palette <- c("_pro_" = alpha("steelblue", 0.3), "_con_" = alpha("tomato", 0.4))
E(g.hashtag_all_timerange)$color <- unname(edge_palette[E(g.hashtag_all_timerange)$position])
set.seed(1234)
plot(
  g.hashtag_all_timerange,
  vertex.shape = "circle",
  vertex.color = "white",
  vertex.size = 1,
  vertex.frame.color = "gray90",
  edge.arrow.size = 0,
  edge.width = .3,
  vertex.label.cex = 0.9,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  layout = layout_nicely
)
title(main = paste("Hashtag Cooccurrence", "_all_", filename_suffix))
legend(
  "topright",
  legend = c("supporters", "opponents"),
  lty = 1,
  col = c("steelblue", "tomato"),
  bty = "n"
)
## combined 2
# Same combined network drawn on a circular layout.
E(g.hashtag_all_timerange)$position <- edgelist_adj_all_hashtag_timerange$X3
# Look up each edge's colour by its position label; labels other than the two below stay NA.
edge_palette <- c("_pro_" = alpha("steelblue", 0.3), "_con_" = alpha("tomato", 0.4))
E(g.hashtag_all_timerange)$color <- unname(edge_palette[E(g.hashtag_all_timerange)$position])
set.seed(1234)
plot(
  g.hashtag_all_timerange,
  vertex.shape = "circle",
  vertex.color = "white",
  vertex.size = 1,
  vertex.frame.color = "gray90",
  edge.arrow.size = 0,
  edge.width = .5,
  vertex.label.cex = 0.7,
  vertex.label.font = 2,
  vertex.label.color = "black",
  edge.curved = 0.1,
  layout = layout_in_circle
)
title(main = paste("Hashtag Cooccurrence", "_all_", filename_suffix))
legend(
  "topright",
  legend = c("supporters", "opponents"),
  lty = 1,
  col = c("steelblue", "tomato"),
  bty = "n"
)
# Interactive frequency tables of the lower-cased hashtags within the window,
# first for supporters, then for opponents.
pro_hashtag_freq <- as.data.frame(table(tolower(df_pro_hashtag_timerange$hashtag)))
datatable(pro_hashtag_freq)
con_hashtag_freq <- as.data.frame(table(tolower(df_con_hashtag_timerange$hashtag)))
datatable(con_hashtag_freq)