This is an R Markdown Notebook. When you execute code within the notebook, the results appear beneath the code.

Try executing this chunk by clicking the Run button within the chunk or by placing your cursor inside it and pressing Cmd+Shift+Enter.

Install the necessary packages, then comment out the install commands so they are not re-run each time the notebook is executed

Load and process data

# Load the saved tweets and keep only what the network analysis needs:
# a running row id, the poster's screen name and the tweet text.
mydata <- read_rds("YCombinator.RDS")
n <- nrow(mydata)
tData <- mydata[, c("screenName", "text")]
# seq_len() stays empty when there are no rows (1:n would yield c(1, 0) and
# make cbind() fail); the id column is named explicitly instead of inheriting
# the deparsed expression as its name. createList() renames columns by
# position, so it is unaffected by this name.
tData <- cbind(id = seq_len(n), tData)
# Force plain character columns (they may have been stored as factors).
tData$screenName <- as.character(tData$screenName)
tData$text <- as.character(tData$text)

Function to create an edge list of poster-retweeter pairs

# Builds a weighted edge list linking each tweet's poster to the first
# account mentioned after an "RT"/"via" marker in the tweet text.
#
# Args:
#   tData: three-column data frame holding, in this order, a row id, the
#     poster's screen name and the tweet text. Columns are renamed by
#     position, so the incoming column names do not matter.
#
# Returns:
#   A tibble with one row per unordered pair of accounts:
#     from, to - the two screen names, `from` being the one that orders first
#     weight   - number of tweets in which that pair occurs
#   Tweets without an RT/via mention, or whose mention contains no handle
#   made of [a-zA-Z0-9_], are dropped, so `from`/`to` never contain NA.
createList <- function(tData) {

  # "RT" or "via" followed by one or more @mentions
  retweet_pattern <- "(RT|via)((?:\\b\\W*@\\w+)+)"
  # A single handle inside that match
  handle_pattern <- "@[a-zA-Z0-9_]{1,}"

  # Reads data
  nData <-
    tData %>%
    set_colnames(c("id", "screenname", "tweet")) %>%
    dplyr::as_tibble()

  # Extracts the mentioned account for every tweet at once; the stringi
  # functions are vectorised, so no row-wise iteration is needed.
  retweeterPoster <-
    nData %>%
    mutate(
      who_post = as.character(screenname),
      who_retweet =
        stri_extract_first_regex(as.character(tweet), retweet_pattern) %>%
        stri_extract_first_regex(handle_pattern) %>%
        stri_replace_all_fixed("@", "")
    ) %>%
    # Non-retweets (and unusable handles) come back as NA; drop them here
    filter(!is.na(who_post), !is.na(who_retweet)) %>%
    # Order each pair so that A-B and B-A fall into the same group
    mutate(
      from = pmin(who_post, who_retweet),
      to = pmax(who_post, who_retweet),
      combi = stri_c(from, to, sep = " ")
    ) %>%
    group_by(combi) %>%
    dplyr::summarize(from = dplyr::first(from),
                     to = dplyr::first(to),
                     weight = n()) %>%
    ungroup() %>%
    select(-combi)

  # Returns results
  retweeterPoster
}

# Build the edge list and discard any pair with a missing endpoint.
retweeterPoster <- na.omit(createList(tData))

Create graph from retweeterPoster

#library(influenceR)
#g = graph_from_edgelist(as.matrix(retweeterPoster[, 1:2]), directed = FALSE)
#betweenness(g, snap = T)
#bridging(g)
#constraint(g)
#ens(g)
# identify the top ten key players
# This function implements KPP-Pos, a metric intended to identify k nodes which optimize resource diffusion through the network.
#keyplayer(g, 10, prob = 0, tol = 1e-04, maxsec = 120, roundsec = 30)

# Weighted adjacency matrix built from the (from, to) edge list, plus the
# same network as a graphNEL object.
m <- ftM2adjM(
  ft = as.matrix(retweeterPoster[, 1:2]),
  W = retweeterPoster$weight,
  edgemode = "directed"
)
g1 <- as(m, "graphNEL")

# NOTE(review): createList() stores every pair as from = min(), to = max(),
# so the edge direction used here appears to reflect name ordering rather
# than who retweeted whom - confirm that a directed graph is intended before
# relying on the betweenness values.

# Per-node centrality scores, then the 20 nodes with the highest degree
node <- data.frame(nodes(g1))
node[["betweenness"]] <- sna::betweenness(m)
node[["degree"]] <- sna::degree(m)
sortlist <- node[order(-node[["degree"]]), ]
head(sortlist, 20)
##           nodes.g1. betweenness degree
## 155        garrytan        1797     83
## 309      TechCrunch          75     30
## 322     ycombinator           0     25
## 271          Recode          60     18
## 359          ugodre           0     17
## 220 KateClarkTweets         146     15
## 242       MaxCRoser          24     14
## 205       JijoSunny         290     12
## 362      techcrunch           0     10
## 367     VentureBeat           0      8
## 268      prosunjoyi          12      7
## 297      StartupLou          10      7
## 291   sheetal_sonar           3      6
## 350      thedatainc           0      6
## 464      WSJmarkets           0      6
## 1      _alexguillot           0      5
## 64           Bitfi6           0      5
## 151        ForbesME           9      5
## 230     LibbyMClark           6      5
## 375        tianhuil           0      5
# Clusters nodes into N groups on log(degree) using a Gaussian mixture
# (Mclust); a log of -Inf (degree 0) is replaced by 1 beforehand.
N <- 2
node %<>%
  mutate(
    size = log(degree),
    size = ifelse(size == -Inf, 1, size),
    group = Mclust(size, G = N)$classification
  )

Visualize the network with visNetwork

library(visNetwork)
## Warning: package 'visNetwork' was built under R version 3.5.3

# Node table for visNetwork: one row per account, styled by cluster.
gnode <- setNames(data.frame(node$nodes.g1.), "id")
gnode[["shape"]] <- "dot"
gnode[["shadow"]] <- TRUE                # nodes drop a shadow
gnode[["title"]] <- node$nodes.g1.       # account name used as the node title
gnode[["label"]] <- node$degree          # label shows degree centrality
gnode[["group"]] <- node$group
gnode[["size"]] <- gnode$group * 3       # node size scales with cluster number
gnode[["color.background"]] <- c("slategrey", "tomato", "gold")[gnode$group]

# Interactive network: highlight neighbours on selection, filter by cluster.
visOptions(
  visNetwork(
    nodes = gnode,
    edges = setNames(retweeterPoster, c("from", "to", "weight"))
  ),
  highlightNearest = TRUE,
  selectedBy = "group"
)

Based on the betweenness and degree metrics in the above data frame, several prominent influencers can be identified. At the top of the list is user “garrytan”, with a large number of direct connections in their network (83) and an exceptional betweenness score (1797), meaning they have significant influence over the flow of information and knowledge in their cluster.

Below garrytan would be user “JijoSunny”, based on their betweenness score of 290. In terms of influencers and their sway over a cluster/network, a high betweenness score seems more indicative of ‘influence’ than direct connections (degrees), as their thoughts, opinions, and endorsements reach a higher number of people in different circles/clusters.