Install new libraries
# One-time setup: networkD3 provides the interactive network visualization,
# igraph provides the network analysis.
install.packages(c("networkD3", "igraph"))
Load required libraries
library(twitteR)
library(dplyr)
library(qdap)
library(networkD3)
library(igraph)
library(stringr)
Collect tweets through Twitter API
# Authenticate against the Twitter API. Replace the four placeholder strings
# with your own application credentials before running.
setup_twitter_oauth("enter consumer key",
                    "enter consumer secret",
                    access_token = "xxxxx",
                    access_secret = "xxxx")

# Collect up to 5000 recent tweets containing #MAGA. The remaining search
# filters (lang, since, until, locale, geocode, sinceID, maxID, resultType)
# are left at their NULL defaults, so they are not spelled out.
alltweets <- twListToDF(
  searchTwitter("#MAGA", n = 5000, retryOnRateLimit = 120)
)

# Save tweets as .csv. row.names = FALSE keeps the row index out of the file;
# otherwise read.csv() brings it back as a spurious extra column named "X".
write.csv(alltweets, "alltweets.csv", row.names = FALSE)
Alternatively, you can use pre-saved data
# Load the pre-saved tweets. stringsAsFactors = FALSE keeps tweet text and
# screen names as character vectors on every R version (the default only
# became FALSE in R 4.0).
alltweets <- read.csv("alltweets.csv", header = TRUE, stringsAsFactors = FALSE)
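Extract network information from retweets. If A retweets B, the code below creates a link from B (the original author, stored as sender) to A (the retweeting account, stored as receiver), i.e. the link follows the direction in which the tweet spreads.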
# Network analysis and visualization is computationally intensive, so the demo
# keeps at most the first 500 tweets. head() never pads the result with NA rows
# when fewer than 500 tweets are available (alltweets[1:500, ] would).
alltweets <- head(alltweets, 500)

# Split the data into two sets: retweets (sp[["TRUE"]]) feed the retweet
# network, the remaining tweets (sp[["FALSE"]]) feed the mention network.
sp <- split(alltweets, alltweets$isRetweet)

# The retweeted handle is read from character 5 of the text (just past a
# leading "RT @") up to the character before the first colon.
rt <- mutate(sp[["TRUE"]], sender = substr(text, 5, regexpr(":", text) - 1))

# Edge list for the retweet network: sender is the original author, receiver
# is the account that retweeted. Handles are lower-cased so the same account
# is not split into several vertices by capitalisation.
el <- data.frame(
  sender = tolower(rt$sender),
  receiver = tolower(rt$screenName),
  stringsAsFactors = FALSE
)

# Collapse duplicate sender/receiver pairs; the count column n is the edge
# weight.
el <- count(el, sender, receiver)
head(el, 5) # show the first 5 edges in the edge list
Based on the edge-list, create a retweet network.
# Build a directed igraph object from the edge list. The first two columns of
# el are the edge endpoints; the remaining column (n) is kept as an edge
# attribute. TRUE is spelled out because T is an ordinary, reassignable name.
rt_graph <- graph_from_data_frame(d = el, directed = TRUE)
Visualize the retweet network.
# Compute a Fruchterman-Reingold (force-directed) layout and actually use it
# for the plot; previously the layout was computed but never passed to plot().
# layout_with_fr() replaces the deprecated layout.fruchterman.reingold().
glay <- layout_with_fr(rt_graph)
plot(rt_graph, layout = glay)
Isn’t it too messy? Let’s beautify it.
# Styled plot of the retweet network: dark background, no labels, vertices
# sized by in-degree and edges sized by their weight.
# layout_with_fr() replaces the deprecated layout.fruchterman.reingold().
glay <- layout_with_fr(rt_graph)

# par() returns the previous settings, so they can be restored after plotting
# instead of leaking the dark theme into every later plot.
old_par <- par(bg = "gray15", mar = c(1, 1, 1, 1))
plot(rt_graph,
     layout = glay,
     vertex.color = "gray25",
     vertex.size = degree(rt_graph, mode = "in"), # sized by in-degree centrality
     vertex.label = NA,
     edge.arrow.size = 0.8,
     edge.arrow.width = 0.5,
     edge.width = edge_attr(rt_graph)$n / 10, # sized by edge weight
     edge.color = hsv(h = .95, s = 1, v = .7, alpha = 0.5))
title("Retweet Network", cex.main = 1, col.main = "gray95")
par(old_par)
The above network visualization does not show vertex labels. Let’s add a few lines to make the graph more informative. igraph is very versatile, allowing a high degree of customization for visualization.
# Same styled plot as before, now with vertex labels scaled by in-degree.
# layout_with_fr() replaces the deprecated layout.fruchterman.reingold().
glay <- layout_with_fr(rt_graph)

# par() returns the previous settings, so they can be restored after plotting
# instead of leaking the dark theme into every later plot.
old_par <- par(bg = "gray15", mar = c(1, 1, 1, 1))
plot(rt_graph,
     layout = glay,
     vertex.color = "gray25",
     vertex.size = degree(rt_graph, mode = "in"), # sized by in-degree centrality
     vertex.label.family = "sans",
     vertex.shape = "circle", # can also try "square", "rectangle", etc. More in igraph manual
     vertex.label.color = hsv(h = 0, s = 0, v = .95, alpha = 0.5),
     # NOTE(review): with the low in-degrees of this 500-tweet sample the
     # divisor 300 gives very small labels; tune it to the data at hand.
     vertex.label.cex = degree(rt_graph, mode = "in") / 300, # sized by in-degree centrality
     edge.arrow.size = 0.8,
     edge.arrow.width = 0.5,
     edge.width = edge_attr(rt_graph)$n / 10, # sized by edge weight
     edge.color = hsv(h = .95, s = 1, v = .7, alpha = 0.5))
title("Retweet Network", cex.main = 1, col.main = "gray95")
par(old_par)
Create an interactive visualization for the retweet network
# Walktrap community detection supplies one group id per vertex.
wc <- cluster_walktrap(rt_graph)
members <- membership(wc)

# Convert the igraph object to the links/nodes data frames networkD3 expects,
# then render the force-directed widget with the community id as node group.
d3_rt <- igraph_to_networkD3(rt_graph, group = members)
forceNetwork(
  Links = d3_rt[["links"]],
  Nodes = d3_rt[["nodes"]],
  Source = "source",
  Target = "target",
  NodeID = "name",
  Group = "group"
)
Calculate some network-level statistics for the retweet network
# Size of the retweet network: number of edges, then number of vertices.
gsize(rt_graph)
## [1] 388
gorder(rt_graph)
## [1] 460
# List the first 50 edges.
E(rt_graph)[seq_len(50)]
## + 50/388 edges (vertex names):
## [1] 1611paul ->sonsmary aaronmblake_ ->newspeakislies
## [3] albert1776 ->jazzyjo20 alexisinnh ->desimc46
## [5] alexisinnh ->elvisinoregon alexisinnh ->libertpartarian
## [7] alwaysintegrity->alangib35785534 alwaysintegrity->tj1701a
## [9] andrewluck23 ->donnas0818 antipcpatriot ->gridironbeastyo
## [11] barbinut ->zakerforce7 bfraser747 ->abqnicole
## [13] bfraser747 ->davejscorona bfraser747 ->deborahhunnicu2
## [15] bfraser747 ->exx_treme bfraser747 ->healthyheartfit
## [17] bfraser747 ->hinz_dale bfraser747 ->icufromhere
## [19] bfraser747 ->rainestopper bfraser747 ->rocofagi1
## + ... omitted several edges
# List the first 50 vertex ids.
V(rt_graph)[seq_len(50)]
## + 50/460 vertices, named:
## [1] 1611paul aaronmblake_ albert1776 alexisinnh
## [5] alwaysintegrity andrewluck23 antipcpatriot barbinut
## [9] bfraser747 biggunz1965 blinkz04 brothervet
## [13] carrieksada carriepetty cbgbmd charlie31200514
## [17] christichat christinakb citizenanalyst constancequeen8
## [21] culturewarcrack danscavino danslezak2 davidtheroux
## [25] dbargen debsellsslc djtrumpit drdaveorts
## [29] dreamedofdust elwood_jack ericshapiro3 eyerighton18
## [33] freedombill_ca funnyanimals gdltothepdx gerfingerpoken
## [37] gerfingerpoken2 ggander2 greeneyes0084 hardcase726
## + ... omitted several vertices
# Density: the proportion of present edges out of all possible edges.
# rt_graph is directed, so edge_density() already uses the directed
# denominator n * (n - 1); the manual formula below is the same quantity
# computed by hand (both recorded outputs are identical).
edge_density(rt_graph, loops = FALSE) # built-in
## [1] 0.001837643
ecount(rt_graph) / (vcount(rt_graph) * (vcount(rt_graph) - 1)) # by hand, directed network
## [1] 0.001837643
# Reciprocity: the proportion of reciprocated ties (for a directed network).
reciprocity(rt_graph)
## [1] 0
# Graph-level in-degree centralization.
centr_degree(
  rt_graph,
  mode = "in",
  loops = TRUE,
  normalized = TRUE
)[["centralization"]]
## [1] 0.004698304
# Local transitivity: the probability that the neighbors of a vertex are
# connected, reported per vertex.
transitivity(rt_graph, type = "local")
## [1] NaN NaN NaN 0 0 NaN NaN NaN 0 NaN NaN NaN 0 0 NaN NaN 0
## [18] 0 0 0 NaN 0 0 NaN NaN 0 NaN 0 0 NaN NaN NaN 0 NaN
## [35] NaN 0 NaN NaN 0 0 NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN
## [52] 0 NaN 0 0 NaN NaN NaN 0 0 0 0 NaN 0 NaN 0 NaN 0
## [69] 0 NaN 0 NaN NaN 0 NaN NaN NaN 0 NaN NaN 0 0 NaN NaN 0
## [86] 0 0 0 NaN NaN 0 0 NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN
## [103] NaN NaN NaN 0 NaN NaN 0 0 NaN 0 0 NaN NaN NaN 0 NaN NaN
## [120] NaN NaN NaN 0 NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN 0 NaN NaN
## [137] NaN 0 NaN NaN NaN NaN NaN 0 NaN NaN NaN 0 NaN NaN 0 NaN NaN
## [154] 0 NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN 0 NaN NaN NaN NaN NaN
## [171] NaN 0 NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN
## [188] 0 NaN NaN 0 NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN 0
## [205] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN 0 NaN 0 NaN NaN
## [222] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN
## [239] NaN NaN NaN NaN NaN NaN NaN NaN NaN 0 NaN NaN NaN 0 NaN NaN NaN
## [256] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [273] NaN NaN 0 NaN 0 NaN 0 NaN 0 NaN NaN NaN NaN NaN NaN NaN 0
## [290] NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [307] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [324] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [341] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [358] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [375] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [392] NaN NaN NaN NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [409] 0 NaN 0 NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [426] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [443] NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN NaN
## [460] NaN
# Diameter: the length of the longest shortest path (geodesic) between any two
# vertices. Edge direction and edge weights are ignored here.
diameter(rt_graph, directed = FALSE, weights = NA)
## [1] 12
Some vertex-level statistics (based on the retweet network)
# In-degree centrality: incoming edges per vertex, largest first.
indegree <- rt_graph %>%
  degree(mode = "in") %>%
  sort(decreasing = TRUE)
indegree[seq_len(20)] # show the top vertices by in-degree
## elvisinoregon truthstation susihoffman5 jamesedgriffin
## 3 3 3 3
## sonsmary zakerforce7 hinz_dale elisabethphili1
## 2 2 2 2
## gconsidder jkelley2836 odcusa branumsr
## 2 2 2 2
## joycep7777 crowebar_67 starra78811933 jwaters142
## 2 2 2 2
## mllnola deplorabletexa1 trailschap cookingconquest
## 2 2 2 2
# Out-degree centrality: outgoing edges per vertex, largest first.
outdegree <- rt_graph %>%
  degree(mode = "out") %>%
  sort(decreasing = TRUE)
outdegree[seq_len(20)] # show the top vertices by out-degree
## lindasuhler loudobbs sandratxas leahr77
## 62 54 19 16
## johnkstahlusa bfraser747 christichat wdfx2eu7
## 14 12 11 11
## danscavino constancequeen8 danslezak2 greeneyes0084
## 10 9 7 7
## hardcase726 lvnancy realscotternst votetrumppics
## 5 5 5 5
## marilynkj miami4trump realdonaldtrump alexisinnh
## 4 4 4 3
# Betweenness centrality, treating the graph as undirected and unweighted,
# sorted from highest to lowest. FALSE is spelled out because F is an
# ordinary, reassignable name.
bt <- sort(
  betweenness(rt_graph, directed = FALSE, weights = NA),
  decreasing = TRUE
)
bt[1:20] # show the top vertices by betweenness centrality
## lindasuhler loudobbs christichat jkelley2836 sandratxas
## 20567.833 13871.533 11100.167 9763.533 6905.500
## susihoffman5 odcusa bfraser747 leahr77 danscavino
## 6717.167 4590.833 4474.667 3871.967 3618.667
## erengwam jc3me2 johnkstahlusa hinz_dale flexi215
## 3328.000 3328.000 2918.667 2808.667 2698.000
## phil200269 axlfoley1 starra78811933 wdfx2eu7 jamesedgriffin
## 2596.000 2530.000 2371.000 2355.000 1871.000
# Closeness centrality: reflects how many steps are required to reach every
# other vertex from a given vertex. Direction and weights are ignored.
cc <- rt_graph %>%
  closeness(mode = "all", weights = NA) %>%
  sort(decreasing = TRUE)
cc[seq_len(20)] # show the top vertices by closeness centrality
## lindasuhler susihoffman5 odcusa christichat
## 9.891295e-06 9.886405e-06 9.883865e-06 9.880056e-06
## erengwam jc3me2 rhinonewt txchiks4trump
## 9.874105e-06 9.874105e-06 9.873910e-06 9.873910e-06
## flexi215 jkelley2836 elvisinoregon dwulke
## 9.873033e-06 9.872935e-06 9.870986e-06 9.868454e-06
## lawdawgseven mikehotchky deplorabletexa1 noisyhill1
## 9.868454e-06 9.868454e-06 9.868064e-06 9.868064e-06
## loudobbs aadvantagepaint absabella adboa95
## 9.867967e-06 9.867869e-06 9.867869e-06 9.867869e-06
# Eigenvector centrality: connectivity with highly connected neighbors.
ec <- eigen_centrality(rt_graph, directed = TRUE, weights = NA)
# Sort in DESCENDING order so the listing really shows the top vertices.
# The previous ascending sort listed the 20 lowest scores instead, which is
# why the recorded output that follows in this document is all zeros; it
# predates this fix.
sort(ec$vector, decreasing = TRUE)[1:20] # show the top vertices by eigenvector centrality
## 1611paul aaronmblake_ albert1776 alexisinnh
## 0 0 0 0
## alwaysintegrity andrewluck23 antipcpatriot barbinut
## 0 0 0 0
## bfraser747 biggunz1965 blinkz04 brothervet
## 0 0 0 0
## carrieksada carriepetty cbgbmd charlie31200514
## 0 0 0 0
## christichat christinakb citizenanalyst constancequeen8
## 0 0 0 0
Find hubs and authorities. Hubs: vertices with many outgoing edges; authorities: vertices with many incoming edges.
# Hub and authority scores per vertex (edge weights ignored).
hs <- hub_score(rt_graph, weights = NA)[["vector"]]
as <- authority_score(rt_graph, weights = NA)[["vector"]]
# Show the top 20 vertices by hub score.
sort(hs, decreasing = TRUE)[seq_len(20)]
## lindasuhler sandratxas leahr77 johnkstahlusa
## 1.000000e+00 4.607487e-02 4.308313e-02 4.245399e-02
## christichat alexisinnh magnifier661 ninjah__
## 3.978244e-02 1.753206e-02 1.681807e-02 1.681807e-02
## suzost phil200269 eyerighton18 johnfromcranber
## 1.681807e-02 1.653990e-02 1.627078e-02 1.627078e-02
## loudobbs wdfx2eu7 bfraser747 lvnancy
## 4.744485e-03 8.953555e-04 8.710917e-04 7.649553e-04
## ritzybacon2 barbinut danscavino greeneyes0084
## 7.620737e-04 7.496742e-04 1.070784e-04 8.696554e-05
# Show the top 20 vertices by authority score.
sort(as, decreasing = TRUE)[seq_len(20)]
## susihoffman5 elvisinoregon erengwam jc3me2
## 1.0000000 0.9794404 0.9665863 0.9665863
## rhinonewt txchiks4trump odcusa dwulke
## 0.9638219 0.9638219 0.9607720 0.9395526
## lawdawgseven mikehotchky flexi215 deplorabletexa1
## 0.9395526 0.9395526 0.9392956 0.9390469
## noisyhill1 aadvantagepaint absabella adboa95
## 0.9390469 0.9240125 0.9240125 0.9240125
## agree1967 az49ersfan azblacksheep blawatid
## 0.9240125 0.9240125 0.9240125 0.9240125
Find clusters
# Enumerate the cliques of the graph and show the first 10 of them.
cliques(rt_graph)[seq_len(10)]
## [[1]]
## + 1/460 vertex, named:
## [1] 1611paul
##
## [[2]]
## + 1/460 vertex, named:
## [1] aaronmblake_
##
## [[3]]
## + 1/460 vertex, named:
## [1] albert1776
##
## [[4]]
## + 1/460 vertex, named:
## [1] alexisinnh
##
## [[5]]
## + 1/460 vertex, named:
## [1] alwaysintegrity
##
## [[6]]
## + 1/460 vertex, named:
## [1] andrewluck23
##
## [[7]]
## + 1/460 vertex, named:
## [1] antipcpatriot
##
## [[8]]
## + 1/460 vertex, named:
## [1] barbinut
##
## [[9]]
## + 1/460 vertex, named:
## [1] bfraser747
##
## [[10]]
## + 1/460 vertex, named:
## [1] biggunz1965
# Size of every clique. vapply() pins the result to an integer vector, whereas
# sapply() silently returns a list when there are no cliques at all.
vapply(cliques(rt_graph), length, integer(1))
## [1] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [36] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [71] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [106] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [141] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [176] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [211] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [246] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [281] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [316] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [351] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [386] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [421] 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1 1
## [456] 1 1 1 1 1 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [491] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [526] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [561] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [596] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [631] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [666] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [701] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [736] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [771] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [806] 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2 2
## [841] 2 2 2 2 2 2
# List at most the first 20 of the largest cliques (each entry is one clique,
# not one vertex). head() avoids NULL padding when fewer than 20 exist.
head(largest_cliques(rt_graph), 20)
## [[1]]
## + 2/460 vertices, named:
## [1] ericlevitsky youngdems4trump
##
## [[2]]
## + 2/460 vertices, named:
## [1] anthonyzafarino youngdems4trump
##
## [[3]]
## + 2/460 vertices, named:
## [1] rottiefreek yankeefanno3
##
## [[4]]
## + 2/460 vertices, named:
## [1] solomonofzion wdfx2eu7
##
## [[5]]
## + 2/460 vertices, named:
## [1] rosegalwines wdfx2eu7
##
## [[6]]
## + 2/460 vertices, named:
## [1] republic2016 wdfx2eu7
##
## [[7]]
## + 2/460 vertices, named:
## [1] nylovestrump wdfx2eu7
##
## [[8]]
## + 2/460 vertices, named:
## [1] nanamaryto8 wdfx2eu7
##
## [[9]]
## + 2/460 vertices, named:
## [1] francisortis_2 wdfx2eu7
##
## [[10]]
## + 2/460 vertices, named:
## [1] customjewel wdfx2eu7
##
## [[11]]
## + 2/460 vertices, named:
## [1] chase4trump wdfx2eu7
##
## [[12]]
## + 2/460 vertices, named:
## [1] carminezozzora wdfx2eu7
##
## [[13]]
## + 2/460 vertices, named:
## [1] asterixny wdfx2eu7
##
## [[14]]
## + 2/460 vertices, named:
## [1] tlfhappy votetrumppics
##
## [[15]]
## + 2/460 vertices, named:
## [1] mackette52 votetrumppics
##
## [[16]]
## + 2/460 vertices, named:
## [1] kjw49419 votetrumppics
##
## [[17]]
## + 2/460 vertices, named:
## [1] appsame votetrumppics
##
## [[18]]
## + 2/460 vertices, named:
## [1] italo_renda vipernthetemple
##
## [[19]]
## + 2/460 vertices, named:
## [1] lexsicle tyler_tortoise
##
## [[20]]
## + 2/460 vertices, named:
## [1] tinasvue trumpsuperpac
# Community detection based on edge betweenness (Newman-Girvan).
ceb <- cluster_edge_betweenness(rt_graph)
# Number of communities found.
length(ceb)
## [1] 120
# Community id of the first 20 vertices.
membership(ceb)[seq_len(20)]
## 1611paul aaronmblake_ albert1776 alexisinnh
## 1 2 3 4
## alwaysintegrity andrewluck23 antipcpatriot barbinut
## 5 6 7 8
## bfraser747 biggunz1965 blinkz04 brothervet
## 9 10 11 12
## carrieksada carriepetty cbgbmd charlie31200514
## 13 14 15 16
## christichat christinakb citizenanalyst constancequeen8
## 17 18 19 20
# Modularity score of the edge-betweenness community partition stored in ceb.
modularity(ceb)
## [1] 0.7058202
In the above, we have visualized and analyzed the retweet network. Can you follow the previous steps to work on the mention network? Use the following to create an edge-list from Twitter mentions.
# Extract senders and receivers in MENTIONS.
# orig holds the tweets that are not retweets.
orig <- sp[["FALSE"]]

# For every tweet, collect the handles it mentions (without the leading "@").
# Twitter handles consist of letters, digits and underscores, so the pattern
# matches exactly that set; the previous pattern excluded all punctuation and
# therefore cut handles short at the first underscore. Tweets without any
# mention yield character(0).
mentioned <- lapply(as.character(orig$text), function(tx) {
  hits <- regmatches(tx, gregexpr("@[A-Za-z0-9_]+", tx))[[1]]
  substring(hits, 2)
})

# One row per (tweet author, mentioned handle) pair. The author is repeated
# once per mention, so tweets without mentions drop out on their own and no
# length-sensitive if() test is needed (the old `if (mentioned[[i]] == '')`
# fails in R >= 4.2 for tweets with several mentions). Handles are lower-cased
# and the columns are named sender/receiver/n, matching the retweet edge list.
mentionEL <- data.frame(
  sender = tolower(rep(as.character(orig$screenName), lengths(mentioned))),
  receiver = tolower(unlist(mentioned)),
  stringsAsFactors = FALSE
) %>%
  count(sender, receiver)