Objective

The objective is to analyze a dataset derived from LinkedIn users in order to study social-networking behavior.

#Required Libraries
library("readr")
library("ggplot2")
library("igraph")
library("poweRlaw")
library("scales")
library("cowplot")
library("psych")
library("dplyr")
library('RColorBrewer')
# Point the session at the project folder so the relative CSV paths resolve.
# NOTE(review): setwd() ties the script to one machine's layout; it is kept
# because the later relative read_csv() call depends on it.
setwd(dir = "~/Desktop/Network Analytics")
# Talent-flow edge list (columns: from, to, migration_count).
df_talent_flows <- read_csv(file = "talent_flows.csv")
## Rows: 81114 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): from, to
## dbl (1): migration_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Company metadata, keyed by company_id.
df_company <- read_csv(file = "linkedin_company_metadata.csv")
## Rows: 473 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_id, name, industry, city, country, hq, overview
## dbl (2): founded, emp_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of both inputs.
df_talent_flows %>% head()
df_company %>% head()

1. Create a dataframe

# The talent-flow table is used as the edge list for the rest of the analysis.
df_edges <- df_talent_flows

2. Getting Familiar with the data

# Directed graph built from the edge list; migration_count is carried along as
# an edge attribute.
igrapgh <- graph_from_data_frame(d = df_edges, directed = TRUE)
# Print the graph.
print(igrapgh, e = TRUE, v = TRUE)
## IGRAPH 2cf45b8 DN-- 473 81114 -- 
## + attr: name (v/c), migration_count (e/n)
## + edges from 2cf45b8 (vertex names):
##  [1] at&t                ->oracle               
##  [2] colgate-palmolive   ->nike                 
##  [3] agilent-technologies->stryker              
##  [4] ebay                ->expedia              
##  [5] comcast             ->republic-services-inc
##  [6] aon                 ->aig                  
##  [7] costco-wholesale    ->apple                
##  [8] facebook            ->cisco                
## + ... omitted several edges

2.2 Calculate the in-degree and out-degree for each firm.

# Identify every node in the edge list: each firm that appears as a source or
# as a target. union() keeps first-appearance order (sources first, then firms
# that only ever appear as targets).
df_edges_nodes_all <- data.frame(
  label = union(df_edges$from, df_edges$to),
  stringsAsFactors = FALSE
)

# Count how many edges touch each firm at one end of the edge list.
#   endpoint: vector of firm ids, one entry per edge (df_edges$to for
#             in-degree, df_edges$from for out-degree).
# Returns a data frame with columns `name` (firm id) and `x` (edge count).
count_edges_by_firm <- function(endpoint) {
  aggregate(
    rep(1L, length(endpoint)),
    by = list(name = endpoint),
    FUN = sum
  )
}

# Attach the counts to the full node list. A firm with no edge at this end has
# degree 0, so the NA produced by the join is replaced with 0 instead of being
# left missing (NA rows would silently drop out of later summaries and models).
complete_degree_table <- function(degree_counts) {
  degree_counts %>%
    right_join(df_edges_nodes_all, by = c("name" = "label")) %>%
    mutate(x = coalesce(x, 0L))
}

# The `n` firms with the largest degree, largest first.
top_firms_by_degree <- function(degree_data, n = 10) {
  head(degree_data[order(degree_data$x, decreasing = TRUE), ], n = n)
}

# Look up display name, industry, location and head count for a set of firms.
describe_firms <- function(firms) {
  firms %>%
    inner_join(df_company, by = c("name" = "company_id")) %>%
    select(name.y, industry, city, country, emp_count) %>%
    rename("company name" = "name.y", "employee count" = "emp_count")
}

# In-degree: number of edges pointing at each firm.
df_edges_in_deg <- count_edges_by_firm(df_edges$to)
df_edges_in_deg_data <- complete_degree_table(df_edges_in_deg)

# Out-degree: number of edges leaving each firm.
df_edges_out_deg <- count_edges_by_firm(df_edges$from)
df_edges_out_deg_data <- complete_degree_table(df_edges_out_deg)

# Top 10 firms by in-degree, with their metadata.
top10_in_deg_firms <- top_firms_by_degree(df_edges_in_deg_data)
top10_in_deg_firms_list <- describe_firms(top10_in_deg_firms)

# Top 10 firms by out-degree, with their metadata.
top10_out_deg_firms <- top_firms_by_degree(df_edges_out_deg_data)
top10_out_deg_firms_list <- describe_firms(top10_out_deg_firms)

# Show the metadata table built for the ten firms with the highest in-degree.
print("Top 10 firms with the highest in-degree are shown below : ")
## [1] "Top 10 firms with the highest in-degree are shown below : "
print(top10_in_deg_firms_list)
##                  company name                            industry
## 1                         IBM information technology and services
## 2                   Accenture information technology and services
## 3  Hewlett Packard Enterprise information technology and services
## 4                        AT&T                  telecommunications
## 5                      Amazon                            internet
## 6             Bank of America                             banking
## 7                 Wells Fargo                  financial services
## 8        JPMorgan Chase & Co.                  financial services
## 9                   Microsoft                   computer software
## 10                       Citi                  financial services
##                city       country employee count
## 1  Armonk, New York United States         771986
## 2          Dublin 2       Ireland         480235
## 3         Palo Alto United States         506236
## 4            Dallas United States         321692
## 5           Seattle United States         139917
## 6         Charlotte United States         326301
## 7     San Francisco United States         302434
## 8          New York United States         270061
## 9           Redmond United States         302297
## 10         New York United States         323488
# Show the metadata table built for the ten firms with the highest out-degree.
print("The top 10 firms with the highest out-degree are shown below : ")
## [1] "The top 10 firms with the highest out-degree are shown below : "
print(top10_out_deg_firms_list)
##                  company name                            industry
## 1                         IBM information technology and services
## 2                        AT&T                  telecommunications
## 3  Hewlett Packard Enterprise information technology and services
## 4        JPMorgan Chase & Co.                  financial services
## 5             Bank of America                             banking
## 6                   Accenture information technology and services
## 7                          GE electrical/electronic manufacturing
## 8                 Wells Fargo                  financial services
## 9                        Citi                  financial services
## 10                     Target                              retail
##                city       country employee count
## 1  Armonk, New York United States         771986
## 2            Dallas United States         321692
## 3         Palo Alto United States         506236
## 4          New York United States         270061
## 5         Charlotte United States         326301
## 6          Dublin 2       Ireland         480235
## 7            Boston United States         142190
## 8     San Francisco United States         302434
## 9          New York United States         323488
## 10      Minneapolis United States         233172

2.3 Explain Correlation

There are several possible reasons we see this type of correlation.

  • Larger firms are more recognized brands, so their employees carry the signal of coming from a more reputable organization.
  • Employee churn is much higher at larger organizations which leads to the possibility of more available talent.
  • There may be previous success with employees who previously worked at larger companies.
  • Larger companies tend to operate at a higher capacity, which may lead to employees being better trained and prepared.

2.4 Statistical Test

Linear Regression for in-degree

# Regression inputs: each firm's degree next to its head count from the
# company metadata. select() renames the two columns while picking them.
df_edges_in_deg_data_reg <- inner_join(
  df_edges_in_deg_data,
  df_company,
  by = c("name" = "company_id")
) %>%
  select("in degree" = x, "employee count" = emp_count)

df_edges_out_deg_data_reg <- inner_join(
  df_edges_out_deg_data,
  df_company,
  by = c("name" = "company_id")
) %>%
  select("out degree" = x, "employee count" = emp_count)

# Simple linear regression of in-degree on employee count.
# NOTE(review): the formula addresses the columns through `$`, so `data` is
# effectively unused; the terms are left as-is so the printed call and
# coefficient labels do not change.
lr_in_deg <- lm(
  formula = df_edges_in_deg_data_reg$`in degree` ~ df_edges_in_deg_data_reg$`employee count`,
  data = df_edges_in_deg_data_reg
)
# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)
summary(lr_in_deg)
## 
## Call:
## lm(formula = df_edges_in_deg_data_reg$`in degree` ~ df_edges_in_deg_data_reg$`employee count`, 
##     data = df_edges_in_deg_data_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -456.46  -42.82   -3.25   47.08  143.81 
## 
## Coefficients:
##                                               Estimate   Std. Error t value
## (Intercept)                               136.62554978   3.40651346   40.11
## df_edges_in_deg_data_reg$`employee count`   0.00095446   0.00004365   21.87
##                                                      Pr(>|t|)    
## (Intercept)                               <0.0000000000000002 ***
## df_edges_in_deg_data_reg$`employee count` <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 65.47 on 471 degrees of freedom
## Multiple R-squared:  0.5038, Adjusted R-squared:  0.5027 
## F-statistic: 478.2 on 1 and 471 DF,  p-value: < 0.00000000000000022

Linear Regression for out-degree

# Simple linear regression of out-degree on employee count.
# NOTE(review): as with the in-degree model, the formula addresses the columns
# through `$`; the terms are left as-is so the printed call does not change.
lr_out_deg <- lm(
  formula = df_edges_out_deg_data_reg$`out degree` ~ df_edges_out_deg_data_reg$`employee count`,
  data = df_edges_out_deg_data_reg
)
# Print numbers in fixed notation rather than scientific notation.
options(scipen = 999)
summary(lr_out_deg)
## 
## Call:
## lm(formula = df_edges_out_deg_data_reg$`out degree` ~ df_edges_out_deg_data_reg$`employee count`, 
##     data = df_edges_out_deg_data_reg)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -507.64  -49.69   -2.16   56.89  134.65 
## 
## Coefficients:
##                                                Estimate   Std. Error t value
## (Intercept)                                132.94148993   3.68439167   36.08
## df_edges_out_deg_data_reg$`employee count`   0.00105532   0.00004721   22.36
##                                                       Pr(>|t|)    
## (Intercept)                                <0.0000000000000002 ***
## df_edges_out_deg_data_reg$`employee count` <0.0000000000000002 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 70.81 on 471 degrees of freedom
## Multiple R-squared:  0.5148, Adjusted R-squared:  0.5138 
## F-statistic: 499.8 on 1 and 471 DF,  p-value: < 0.00000000000000022

2.5 Report and interpret

Key takeaways

  • The p-values for both regressions are less than 0.05, which means both relationships are statistically significant.
  • For every one-unit increase in a company's employee count, in-degree increases by 0.00095446 units and out-degree increases by 0.00105532 units.

3. Weighted Edges

3.1 Calculate the weight for each edge, and add this as a new column called weight to the existing df_edges dataframe.

# Edge weight = migration_count divided by the head count of the source firm.
# Only the head count is needed from the metadata, so the company table is
# trimmed before the join rather than after it.
df_edges_weighted <- df_edges %>%
  inner_join(
    select(df_company, company_id, emp_count),
    by = c("from" = "company_id")
  ) %>%
  mutate(weight = migration_count / emp_count, .after = to)

df_edges_weighted

3.2 Creating a directed graph (with weighted edges) using the df_edges dataframe.

# Keep the two endpoint columns plus weight, so that weight is the only edge
# attribute carried into the graph.
df_edges_weighted <- df_edges_weighted[c("from", "to", "weight")]

# Directed graph with weighted edges.
igrapgh_weighted <- graph_from_data_frame(d = df_edges_weighted, directed = TRUE)

# Print the graph.
print(igrapgh_weighted, e = TRUE, v = TRUE)
## IGRAPH e5911e2 DNW- 473 81114 -- 
## + attr: name (v/c), weight (e/n)
## + edges from e5911e2 (vertex names):
##  [1] at&t                ->oracle               
##  [2] colgate-palmolive   ->nike                 
##  [3] agilent-technologies->stryker              
##  [4] ebay                ->expedia              
##  [5] comcast             ->republic-services-inc
##  [6] aon                 ->aig                  
##  [7] costco-wholesale    ->apple                
##  [8] facebook            ->cisco                
## + ... omitted several edges
# Inspect the weighted graph: vertex sequence first.
V(graph = igrapgh_weighted)
## + 473/473 vertices, named, from e5911e2:
##   [1] at&t                                 colgate-palmolive                   
##   [3] agilent-technologies                 ebay                                
##   [5] comcast                              aon                                 
##   [7] costco-wholesale                     facebook                            
##   [9] john-deere                           ross-stores                         
##  [11] american-express                     target                              
##  [13] cme-group                            jpmorgan-chase                      
##  [15] united-airlines                      the-home-depot                      
##  [17] xerox                                wellsfargo                          
##  [19] boeing                               jefferies                           
## + ... omitted several vertices
# Number of vertices (vcount() is igraph's other name for gorder()).
vcount(igrapgh_weighted)
## [1] 473
# Edge sequence of the weighted graph.
E(graph = igrapgh_weighted)
## + 81114/81114 edges from e5911e2 (vertex names):
##  [1] at&t                ->oracle               
##  [2] colgate-palmolive   ->nike                 
##  [3] agilent-technologies->stryker              
##  [4] ebay                ->expedia              
##  [5] comcast             ->republic-services-inc
##  [6] aon                 ->aig                  
##  [7] costco-wholesale    ->apple                
##  [8] facebook            ->cisco                
##  [9] john-deere          ->ge                   
## [10] ross-stores         ->walmart              
## + ... omitted several edges
# Number of edges (ecount() is igraph's other name for gsize()).
ecount(igrapgh_weighted)
## [1] 81114
# Confirm the graph is directed. is_directed() replaces the deprecated dotted
# alias is.directed().
is_directed(igrapgh_weighted)
## [1] TRUE

3.3 Take the top 10 edges

# The ten edges with the largest weight, heaviest first.
top10_df_edges_weighted <- df_edges_weighted %>%
  arrange(desc(weight)) %>%
  head(n = 10) %>%
  as.data.frame()

# Small directed graph containing only those ten edges.
igrapgh_top10_df_edges_weighted <-
  graph_from_data_frame(top10_df_edges_weighted, directed = TRUE)

# Plot the ten edges, labelling each with its rounded weight.
# - layout_with_kk replaces the deprecated alias layout.kamada.kawai.
# - Edges are drawn at a constant width; the earlier per-edge `width`
#   attribute was dropped because the explicit edge.width overrode it anyway.
plot(
  igrapgh_top10_df_edges_weighted,
  layout = layout_with_kk,
  edge.arrow.size = 0.2,
  edge.color = "#CB4335",
  edge.width = 1,
  edge.label = round(E(igrapgh_top10_df_edges_weighted)$weight, 2),
  vertex.color = "#56B489",
  vertex.label.color = "#17202A",
  vertex.label.cex = 0.9
)

3.4 Interpret the graph

If an edge has a high weight, the two nodes are strongly connected and there is a lot of movement between them on the network graph. There also seem to be edges between companies that are closely related to each other, such as eBay and PayPal, Cisco and Juniper Networks, HP and Hewlett-Packard, and Marriott and Host Hotels & Resorts.

4) Page rank

The random surfer model suggests that the chance that a surfer on the web lands on a specific web page is the sum of the probabilities of all the pages that the same user visited prior to landing on that page. The model assumes that, since the surfer does not know the quality of the landing page prior to clicking on it, the choice of which link to click is random. For example, if a web surfer is clicking links at random and the quality of the landing page is irrelevant, then PageRank plays a bigger role: the higher a page's PageRank, the more likely the random surfer is to land on it. The weighted-edges approach, on the other hand, reflects how many other web pages reference that specific page: the more inbound links reference the page, the more likely a user is to click through to it.

4.2 Find the top 10 nodes with the highest unweighted page rank

# Directed graph whose edges carry the `weight` attribute.
df_edges_weighted_igraph <- graph_from_data_frame(df_edges_weighted, directed = TRUE)

# Weighted PageRank. The weights are edge attributes, so they must be read
# with E(graph)$weight; `graph$weight` looks up a *graph* attribute and yields
# NULL, which only worked by falling back to the default weight handling.
pagerank_weighted <- page_rank(
  df_edges_weighted_igraph,
  weights = E(df_edges_weighted_igraph)$weight
)$vector

# Unweighted PageRank: weights = NA tells page_rank() to ignore the `weight`
# edge attribute.
pagerank_unweighted <- page_rank(df_edges_weighted_igraph, weights = NA)$vector

# One-column data frames of the scores, used for ranking and plotting.
pagerank_weighted_df <- as.data.frame(pagerank_weighted)
pagerank_unweighted_df <- as.data.frame(pagerank_unweighted)

# Sort the weighted scores in descending order and keep the top 10.
# top_n() is called without `wt`, so it picks the ranking column itself (see
# the "Selecting by" message below).
pagerank_weighted_df_top10 = pagerank_weighted_df %>%
  arrange(desc(pagerank_weighted)) %>%
  top_n(10)
## Selecting by pagerank_weighted
# Same ranking for the unweighted scores.
pagerank_unweighted_df_top10 = pagerank_unweighted_df %>%
  arrange(desc(pagerank_unweighted)) %>%
  top_n(10)
## Selecting by pagerank_unweighted
# Print both top-10 tables, each preceded by a caption line.
print("Top 10 nodes with the highest weighted PageRank are shown below")
## [1] "Top 10 nodes with the highest weighted PageRank are shown below"
pagerank_weighted_df_top10
print("Top 10 nodes with the highest unweighted PageRank are shown below")
## [1] "Top 10 nodes with the highest unweighted PageRank are shown below"
pagerank_unweighted_df_top10

4.3 Plot the distribution (histogram) of weighted and unweighted PageRanks. Comment on the differences, and explain why these might be different.

# Shared look for every chart in the report: one accent colour and a minimal
# theme (no panel border, no grid lines, no tick marks, grey axis lines).
chartcolor <- "#AE4371"
mychartattributes <- theme_bw() +
  theme(
    text = element_text(size = 10),
    panel.border = element_blank(),
    panel.grid.major = element_blank(),
    panel.grid.minor = element_blank(),
    axis.line = element_line(color = "gray"),
    axis.ticks.x = element_blank(),
    axis.ticks.y = element_blank()
  )

# Histogram of one PageRank score column with the bin count printed above
# each bar.
#   scores:   data frame holding the scores
#   column:   name of the score column, as a string
#   title:    plot title
#   x_label:  x-axis label
#   binwidth: bin width, shared by the bars and the count labels so they align
# Returns a ggplot object.
plot_pagerank_histogram <- function(scores, column, title, x_label,
                                    binwidth = 0.001) {
  ggplot(scores, aes(x = .data[[column]])) +
    geom_histogram(binwidth = binwidth, fill = chartcolor, color = "white") +
    # after_stat(count) replaces the deprecated `..count..` notation.
    stat_bin(
      binwidth = binwidth,
      aes(y = after_stat(count), label = after_stat(count)),
      geom = "text",
      vjust = -0.5
    ) +
    labs(
      title = title,
      caption = "Source: LinkedIn_company_metadata.csv",
      x = x_label,
      y = "Frequency"
    ) +
    mychartattributes +
    scale_y_continuous(breaks = seq(0, 300, by = 50))
}

weighted_hist <- plot_pagerank_histogram(
  pagerank_weighted_df,
  column = "pagerank_weighted",
  title = "Distribution of weighted pagerank",
  x_label = "Weighted PageRank"
)

unweighted_hist <- plot_pagerank_histogram(
  pagerank_unweighted_df,
  column = "pagerank_unweighted",
  title = "Distribution of unweighted pagerank",
  x_label = "Unweighted PageRank"
)

# Show the two histograms side by side.
plot_grid(weighted_hist, unweighted_hist)

5. Community Detection

In this section, we will use graph structure to detect communities of firms that are similar to each other. Use the graph with weighted edges for this question.

5.1 Walktrap is an algorithm for community detection that is computationally very efficient and that we discussed briefly in class. Use the walktrap.community command in igraph to detect communities of firms. You can play around with steps argument to get different communities, the default steps=4 seems to produce good results.

# Detect communities with the walktrap algorithm, using the edge weights and
# random walks of length 4.
community_cluster <- cluster_walktrap(
  graph = df_edges_weighted_igraph,
  weights = E(df_edges_weighted_igraph)$weight,
  steps = 4,
  merges = TRUE,
  modularity = TRUE,
  membership = TRUE
)

# Report how many communities were found.
print(paste("Number of communities observed:", length(community_cluster)))
## [1] "Number of communities observed: 10"
# Draw the graph with the detected communities, using a Fruchterman-Reingold
# layout and no vertex labels.
plot(
  community_cluster,
  df_edges_weighted_igraph,
  layout = layout_with_fr,
  vertex.color = "#F8C471",
  vertex.label = "",
  vertex.size = 8,
  edge.color = "#00FF00",
  edge.arrow.size = .1,
  edge.width = .8
)

# Community id of every vertex.
community_members <- membership(community_cluster)

# One row per firm: its id and the community it was assigned to.
community_members_df <- data.frame(
  company_id = as_ids(V(df_edges_weighted_igraph)),
  community_cluster = community_cluster$membership,
  stringsAsFactors = FALSE
)

# Attach company metadata. The join key is given through `by`; the previous
# `company_id = company_id` argument was not a join specification at all, it
# was swallowed by `...`, which newer dplyr versions reject.
community_members_list <- community_members_df %>%
  inner_join(df_company, by = "company_id") %>%
  select(name, community_cluster, industry, emp_count) %>%
  arrange(community_cluster)


# Horizontal bar chart of employee count by industry for one community.
#   members:      rows of community_members_list belonging to one community
#   community_id: label of that community, shown in the plot title
# Returns a ggplot object. Firms in the same industry stack into one bar.
plot_community_industries <- function(members, community_id) {
  ggplot(members, aes(x = industry, y = emp_count)) +
    geom_col(fill = chartcolor) +
    labs(
      title = paste("Community", community_id),
      x = "Industry",
      y = "Employee Count"
    ) +
    mychartattributes +
    scale_y_continuous(labels = scales::comma) +
    coord_flip()
}

# Split the member table by community instead of hard-coding ten filters, so
# the code follows however many communities walktrap actually returned.
community_members_by_cluster <- split(
  community_members_list,
  community_members_list$community_cluster
)

# One plot per community, in community-id order.
community_plots <- Map(
  plot_community_industries,
  community_members_by_cluster,
  names(community_members_by_cluster)
)

# ggplot objects are not auto-printed inside a loop, so print them explicitly.
for (community_plot in community_plots) {
  print(community_plot)
}

5.2 Inspect the members of each community. What commonalities do you observe within each community?

Firms in the same community tend to be in the same industry. The communities are grouped by: oil and energy, healthcare, travel and tourism, financial firms, and tech.

6. Assortativity

A network is said to display assortative mixing if the nodes that have many connections tend to be connected to other nodes that also have many connections.

# Graph with each firm's industry attached as a vertex attribute. Only the id
# and industry columns are passed as vertex data: the first column supplies
# the vertex id, and the metadata's own `name` column is left out so it cannot
# collide with igraph's `name` vertex attribute. This also makes a separate
# industry assignment unnecessary.
igraph_homophily <- graph_from_data_frame(
  df_edges,
  vertices = df_company %>% select(company_id, industry),
  directed = TRUE
)

# Assortativity by industry. Categories are passed as explicit integer codes.
# assortativity_nominal() replaces the deprecated assortativity.nominal().
assortativity_coefficient <- assortativity_nominal(
  igraph_homophily,
  types = as.integer(factor(V(igraph_homophily)$industry)),
  directed = TRUE
)

# Assortativity by vertex degree. assortativity_degree() replaces the
# deprecated assortativity.degree().
assortativity_deg <- assortativity_degree(igraph_homophily, directed = TRUE)

print(paste("The level of assortative mixing for industry in the network is: ",
            round(assortativity_coefficient, 6)))
## [1] "The level of assortative mixing for industry in the network is:  0.030048"
print(paste("The Assortativity Degree of the network is: ",
            round(assortativity_deg, 6)))
## [1] "The Assortativity Degree of the network is:  -0.198701"