The objective is to analyze a dataset derived from LinkedIn users in order to study social-networking behavior.
#Required Libraries
library("readr")
library("ggplot2")
library("igraph")
library("poweRlaw")
library("scales")
library("cowplot")
library("psych")
library("dplyr")
library('RColorBrewer')
# Point the session at the project folder, then read the talent-flow edge
# list; the relative file name is resolved against that working directory.
setwd(dir = "~/Desktop/Network Analytics")
df_talent_flows <- read_csv(file = "talent_flows.csv")
## Rows: 81114 Columns: 3
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (2): from, to
## dbl (1): migration_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Read the per-company metadata table (joined on company_id further down).
df_company <- read_csv(file = "linkedin_company_metadata.csv")
## Rows: 473 Columns: 9
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (7): company_id, name, industry, city, country, hq, overview
## dbl (2): founded, emp_count
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Preview the first rows of both inputs.
head(df_talent_flows)
head(df_company)

# Build a directed graph from the edge list: `from` and `to` become the edge
# endpoints and `migration_count` is carried along as an edge attribute.
df_edges <- df_talent_flows
igrapgh <- graph_from_data_frame(d = df_edges, directed = TRUE)

# Print and view the graph. The attribute flags are written out in full
# instead of relying on partial argument matching (`e =`, `v =`).
print(igrapgh, edge.attributes = TRUE, vertex.attributes = TRUE)
## IGRAPH 2cf45b8 DN-- 473 81114 --
## + attr: name (v/c), migration_count (e/n)
## + edges from 2cf45b8 (vertex names):
## [1] at&t ->oracle
## [2] colgate-palmolive ->nike
## [3] agilent-technologies->stryker
## [4] ebay ->expedia
## [5] comcast ->republic-services-inc
## [6] aon ->aig
## [7] costco-wholesale ->apple
## [8] facebook ->cisco
## + ... omitted several edges
# Identify every node in the edge list: collect the distinct companies seen
# as a source and as a destination, each under a common `label` column.
a1 <- distinct(df_edges, label = from)
b1 <- distinct(df_edges, label = to)
# A full join on `label` yields the union of both sets, one row per company.
df_edges_nodes_all <- as.data.frame(full_join(a1, b1, by = "label"))
# In-degree: tally the rows for each (to, from) pair, then add the tallies up
# per destination company. aggregate() names the summed column `x`.
in_link_pairs <- df_edges %>%
  group_by(to, from) %>%
  summarise(weight = n()) %>%
  ungroup()
df_edges_in_deg <- with(
  in_link_pairs,
  aggregate(weight, by = list(name = to), FUN = sum)
)

# Out-degree: the same two steps, keyed on the source company instead.
out_link_pairs <- df_edges %>%
  group_by(from, to) %>%
  summarise(weight = n()) %>%
  ungroup()
df_edges_out_deg <- with(
  out_link_pairs,
  aggregate(weight, by = list(name = from), FUN = sum)
)
# Attach the degree counts to the complete node list. The right join keeps
# every node, so a company without a count still appears (with `x` missing).
df_edges_in_deg_data <- as.data.frame(
  right_join(df_edges_in_deg, df_edges_nodes_all, by = c("name" = "label"))
)
df_edges_out_deg_data <- as.data.frame(
  right_join(df_edges_out_deg, df_edges_nodes_all, by = c("name" = "label"))
)
# Top 10 by in-degree. Rows are ranked on the degree column `x`; order()
# places missing degrees last, so they cannot displace ranked companies.
# `decreasing` is spelled TRUE rather than the reassignable shorthand T.
in_deg_rank <- order(df_edges_in_deg_data$x, decreasing = TRUE)
top10_in_deg_firms <-
  as.data.frame(head(df_edges_in_deg_data[in_deg_rank, ], n = 10))
# Enrich with company metadata. Both tables carry a `name` column, so the
# metadata's display name arrives suffixed as `name.y`.
top10_in_deg_firms_list <- top10_in_deg_firms %>%
  inner_join(df_company, by = c("name" = "company_id")) %>%
  select(name.y, industry, city, country, emp_count) %>%
  rename("company name" = "name.y", "employee count" = "emp_count")

# Top 10 by out-degree, built the same way.
out_deg_rank <- order(df_edges_out_deg_data$x, decreasing = TRUE)
top10_out_deg_firms <-
  as.data.frame(head(df_edges_out_deg_data[out_deg_rank, ], n = 10))
top10_out_deg_firms_list <- top10_out_deg_firms %>%
  inner_join(df_company, by = c("name" = "company_id")) %>%
  select(name.y, industry, city, country, emp_count) %>%
  rename("company name" = "name.y", "employee count" = "emp_count")

print("Top 10 firms with the highest in-degree are shown below : ")
## [1] "Top 10 firms with the highest in-degree are shown below : "
print(top10_in_deg_firms_list)
## company name industry
## 1 IBM information technology and services
## 2 Accenture information technology and services
## 3 Hewlett Packard Enterprise information technology and services
## 4 AT&T telecommunications
## 5 Amazon internet
## 6 Bank of America banking
## 7 Wells Fargo financial services
## 8 JPMorgan Chase & Co. financial services
## 9 Microsoft computer software
## 10 Citi financial services
## city country employee count
## 1 Armonk, New York United States 771986
## 2 Dublin 2 Ireland 480235
## 3 Palo Alto United States 506236
## 4 Dallas United States 321692
## 5 Seattle United States 139917
## 6 Charlotte United States 326301
## 7 San Francisco United States 302434
## 8 New York United States 270061
## 9 Redmond United States 302297
## 10 New York United States 323488
# Announce and display the out-degree ranking built earlier
# (company name, industry, city, country, employee count).
print("The top 10 firms with the highest out-degree are shown below : ")
## [1] "The top 10 firms with the highest out-degree are shown below : "
print(top10_out_deg_firms_list)
## company name industry
## 1 IBM information technology and services
## 2 AT&T telecommunications
## 3 Hewlett Packard Enterprise information technology and services
## 4 JPMorgan Chase & Co. financial services
## 5 Bank of America banking
## 6 Accenture information technology and services
## 7 GE electrical/electronic manufacturing
## 8 Wells Fargo financial services
## 9 Citi financial services
## 10 Target retail
## city country employee count
## 1 Armonk, New York United States 771986
## 2 Dallas United States 321692
## 3 Palo Alto United States 506236
## 4 New York United States 270061
## 5 Charlotte United States 326301
## 6 Dublin 2 Ireland 480235
## 7 Boston United States 142190
## 8 San Francisco United States 302434
## 9 New York United States 323488
## 10 Minneapolis United States 233172
# Regression inputs: each company's degree next to its employee count, for
# companies present in the metadata. Columns are renamed while selecting.
df_edges_in_deg_data_reg <- df_edges_in_deg_data %>%
  inner_join(df_company, by = c("name" = "company_id")) %>%
  select("in degree" = x, "employee count" = emp_count)
df_edges_out_deg_data_reg <- df_edges_out_deg_data %>%
  inner_join(df_company, by = c("name" = "company_id")) %>%
  select("out degree" = x, "employee count" = emp_count)

# Print numbers in fixed rather than scientific notation from here on.
options(scipen = 999)

# Simple linear regression of in-degree on employee count.
# NOTE(review): the formula reaches the columns through `$`, so `data =` is
# not what supplies them. Bare back-ticked names would be cleaner, but would
# change the term labels printed by summary(), so the form is left as is.
lr_in_deg <- lm(
  df_edges_in_deg_data_reg$`in degree` ~ df_edges_in_deg_data_reg$`employee count`,
  data = df_edges_in_deg_data_reg
)
summary(lr_in_deg)
##
## Call:
## lm(formula = df_edges_in_deg_data_reg$`in degree` ~ df_edges_in_deg_data_reg$`employee count`,
## data = df_edges_in_deg_data_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -456.46 -42.82 -3.25 47.08 143.81
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 136.62554978 3.40651346 40.11
## df_edges_in_deg_data_reg$`employee count` 0.00095446 0.00004365 21.87
## Pr(>|t|)
## (Intercept) <0.0000000000000002 ***
## df_edges_in_deg_data_reg$`employee count` <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 65.47 on 471 degrees of freedom
## Multiple R-squared: 0.5038, Adjusted R-squared: 0.5027
## F-statistic: 478.2 on 1 and 471 DF, p-value: < 0.00000000000000022
# Keep fixed (non-scientific) notation for the printed coefficients.
options(scipen = 999)
# Simple linear regression of out-degree on employee count; the formula is
# written with `$` references exactly as in the in-degree model.
lr_out_deg <- lm(
  df_edges_out_deg_data_reg$`out degree` ~ df_edges_out_deg_data_reg$`employee count`,
  data = df_edges_out_deg_data_reg
)
summary(lr_out_deg)
##
## Call:
## lm(formula = df_edges_out_deg_data_reg$`out degree` ~ df_edges_out_deg_data_reg$`employee count`,
## data = df_edges_out_deg_data_reg)
##
## Residuals:
## Min 1Q Median 3Q Max
## -507.64 -49.69 -2.16 56.89 134.65
##
## Coefficients:
## Estimate Std. Error t value
## (Intercept) 132.94148993 3.68439167 36.08
## df_edges_out_deg_data_reg$`employee count` 0.00105532 0.00004721 22.36
## Pr(>|t|)
## (Intercept) <0.0000000000000002 ***
## df_edges_out_deg_data_reg$`employee count` <0.0000000000000002 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 70.81 on 471 degrees of freedom
## Multiple R-squared: 0.5148, Adjusted R-squared: 0.5138
## F-statistic: 499.8 on 1 and 471 DF, p-value: < 0.00000000000000022
# Weight each flow by the size of the firm people leave: the migration count
# divided by the source company's employee count.
df_edges_weighted <- df_edges %>%
  inner_join(df_company, by = c("from" = "company_id")) %>%
  select(from, to, migration_count, emp_count) %>%
  mutate(weight = migration_count / emp_count, .after = to)
df_edges_weighted

# Keep only the endpoints and the weight, with weight as the third column.
# Columns are picked by name instead of by position so the result does not
# depend on how the pipeline above happens to order them.
df_edges_weighted <- df_edges_weighted %>%
  select(from, to, weight)

# Create the graph from the new data frame; `weight` becomes an edge attribute.
igrapgh_weighted <- graph_from_data_frame(df_edges_weighted, directed = TRUE)

# Print the graph, naming the attribute flags in full rather than through
# partial argument matching (`e =`, `v =`).
print(igrapgh_weighted, edge.attributes = TRUE, vertex.attributes = TRUE)
## IGRAPH e5911e2 DNW- 473 81114 --
## + attr: name (v/c), weight (e/n)
## + edges from e5911e2 (vertex names):
## [1] at&t ->oracle
## [2] colgate-palmolive ->nike
## [3] agilent-technologies->stryker
## [4] ebay ->expedia
## [5] comcast ->republic-services-inc
## [6] aon ->aig
## [7] costco-wholesale ->apple
## [8] facebook ->cisco
## + ... omitted several edges
# View the weighted graph: its vertices, vertex count, edges and edge count.
V(igrapgh_weighted)
## + 473/473 vertices, named, from e5911e2:
## [1] at&t colgate-palmolive
## [3] agilent-technologies ebay
## [5] comcast aon
## [7] costco-wholesale facebook
## [9] john-deere ross-stores
## [11] american-express target
## [13] cme-group jpmorgan-chase
## [15] united-airlines the-home-depot
## [17] xerox wellsfargo
## [19] boeing jefferies
## + ... omitted several vertices
gorder(igrapgh_weighted)
## [1] 473
E(igrapgh_weighted)
## + 81114/81114 edges from e5911e2 (vertex names):
## [1] at&t ->oracle
## [2] colgate-palmolive ->nike
## [3] agilent-technologies->stryker
## [4] ebay ->expedia
## [5] comcast ->republic-services-inc
## [6] aon ->aig
## [7] costco-wholesale ->apple
## [8] facebook ->cisco
## [9] john-deere ->ge
## [10] ross-stores ->walmart
## + ... omitted several edges
gsize(igrapgh_weighted)
## [1] 81114
# Confirm the graph is directed. is_directed() is the current igraph name;
# the dotted is.directed() alias is deprecated.
is_directed(igrapgh_weighted)
## [1] TRUE
## [1] TRUE
# Data frame holding the ten heaviest edges. `decreasing` is spelled TRUE
# rather than the reassignable shorthand T.
weight_rank <- order(df_edges_weighted$weight, decreasing = TRUE)
top10_df_edges_weighted <-
  as.data.frame(head(df_edges_weighted[weight_rank, ], n = 10))
igrapgh_top10_df_edges_weighted <-
  graph_from_data_frame(top10_df_edges_weighted, directed = TRUE)

# NOTE(review): this stores the weight as the edge `width` attribute, but the
# plot call below passes edge.width = 1 explicitly. Confirm which of the two
# was intended.
E(igrapgh_top10_df_edges_weighted)$width <-
  E(igrapgh_top10_df_edges_weighted)$weight

# Draw the ten edges with a Kamada-Kawai layout, labelling each edge with its
# weight rounded to two decimals. layout_with_kk is the current name for the
# deprecated layout.kamada.kawai alias.
plot(
  igrapgh_top10_df_edges_weighted,
  layout = layout_with_kk,
  edge.arrow.size = .2,
  edge.color = "#CB4335",
  vertex.color = "#56B489",
  vertex.label.color = "#17202A",
  vertex.label.cex = .9,
  edge.label = round(E(igrapgh_top10_df_edges_weighted)$weight, 2),
  edge.width = 1
)
If an edge has a high weight, the two nodes are strongly connected and there is a lot of movement between them on the network graph. There also appear to be edges between companies that are closely related to each other, such as eBay and PayPal, Cisco and Juniper Networks, HP and Hewlett-Packard, and Marriott and Host Hotels & Resorts.
The random surfer model suggests that the chance that a surfer on the web lands on a specific web page is the sum of the probabilities of arriving from all the pages the user visited prior to landing on that page. The model assumes that, since the surfer does not know the quality of the landing page before clicking on it, the choice of link is random. For example, if a web surfer is clicking links at random and the quality of the landing page is irrelevant, then PageRank plays a bigger role: the higher a page's PageRank, the more likely the random surfer is to land on it. The weighted-edges approach, on the other hand, reflects how many other web pages reference that specific page: the more inbound links reference the page, the more likely a user is to click through to it.
# Graph used for the PageRank and community analyses, built from the
# weighted edge list.
df_edges_weighted_igraph <-
  graph_from_data_frame(df_edges_weighted, directed = TRUE)

# Weighted PageRank. The weights are read from the edge sequence with
# E(graph)$weight; `graph$weight` would look up a graph-level attribute
# instead of the per-edge values.
pagerank_weighted <- page_rank(
  df_edges_weighted_igraph,
  weights = E(df_edges_weighted_igraph)$weight
)$vector
# Unweighted PageRank: weights = NA tells page_rank() to ignore the `weight`
# edge attribute.
pagerank_unweighted <- page_rank(df_edges_weighted_igraph, weights = NA)$vector

# One-column data frames; the vertex names are carried as row names.
pagerank_weighted_df <- as.data.frame(pagerank_weighted)
pagerank_unweighted_df <- as.data.frame(pagerank_unweighted)

# Ten highest scores of each kind, sorted from highest to lowest.
pagerank_weighted_df_top10 <- pagerank_weighted_df %>%
  arrange(desc(pagerank_weighted)) %>%
  top_n(10)
## Selecting by pagerank_weighted
pagerank_unweighted_df_top10 <- pagerank_unweighted_df %>%
  arrange(desc(pagerank_unweighted)) %>%
  top_n(10)
## Selecting by pagerank_unweighted
print("Top 10 nodes with the highest weighted PageRank are shown below")
## [1] "Top 10 nodes with the highest weighted PageRank are shown below"
pagerank_weighted_df_top10
print("Top 10 nodes with the highest unweighted PageRank are shown below")
## [1] "Top 10 nodes with the highest unweighted PageRank are shown below"
pagerank_unweighted_df_top10
# Shared look for every chart: one accent colour plus a pared-back theme.
chartcolor <- "#AE4371"

# Overrides layered on theme_bw(): 10-point text, no panel border, no
# gridlines, no axis ticks, and gray axis lines.
minimal_overrides <- theme(
  axis.line = element_line(color = "gray"),
  axis.ticks.x = element_blank(),
  axis.ticks.y = element_blank(),
  panel.border = element_blank(),
  panel.grid.major = element_blank(),
  panel.grid.minor = element_blank(),
  text = element_text(size = 10)
)
mychartattributes <- theme_bw() + minimal_overrides
# Build a histogram of PageRank scores with the bin count printed above
# each bar.
#   scores      data frame holding the scores
#   score_col   name (string) of the column to plot
#   plot_title  chart title
#   x_label     x-axis label
# Returns the ggplot object. Uses the file-level `chartcolor` and
# `mychartattributes` for styling.
pagerank_histogram <- function(scores, score_col, plot_title, x_label) {
  ggplot(scores, aes(x = .data[[score_col]])) +
    geom_histogram(
      binwidth = 0.001,
      fill = chartcolor, color = "white"
    ) +
    # after_stat(count) replaces the deprecated `..count..` notation; the
    # same binwidth keeps the labels aligned with the bars.
    stat_bin(
      binwidth = 0.001,
      aes(y = after_stat(count), label = after_stat(count)),
      geom = "text", vjust = -.5
    ) +
    labs(
      title = plot_title,
      caption = "Source: LinkedIn_company_metadata.csv",
      x = x_label,
      y = "Frequency"
    ) +
    mychartattributes +
    # No fill scale is added: fill is a constant, not a mapped aesthetic, so
    # a fill gradient scale would have nothing to act on.
    scale_y_continuous(breaks = c(0, 50, 100, 150, 200, 250, 300))
}

weighted_hist <- pagerank_histogram(
  pagerank_weighted_df, "pagerank_weighted",
  "Distribution of weighted pagerank", "Weighted PageRank"
)
unweighted_hist <- pagerank_histogram(
  pagerank_unweighted_df, "pagerank_unweighted",
  "Distribution of unweighted pagerank", "Unweighted PageRank"
)
# Show the two distributions side by side.
plot_grid(weighted_hist, unweighted_hist)
In this section, we use the graph structure to detect communities of firms that are similar to each other. The graph with weighted edges is used for this analysis.
# Walktrap community detection on the weighted graph, using four-step random
# walks and the edge `weight` attribute.
walktrap_weights <- E(df_edges_weighted_igraph)$weight
community_cluster <- cluster_walktrap(
  df_edges_weighted_igraph,
  weights = walktrap_weights,
  steps = 4,
  merges = TRUE,
  modularity = TRUE,
  membership = TRUE
)

# Report how many communities were found.
n_communities <- length(community_cluster)
print(paste("Number of communities observed:", n_communities))
## [1] "Number of communities observed: 10"

# Draw the graph with the detected communities, using a Fruchterman-Reingold
# layout and suppressing the vertex labels.
plot(
  community_cluster,
  df_edges_weighted_igraph,
  layout = layout_with_fr,
  vertex.label = "",
  vertex.size = 8,
  vertex.color = "#F8C471",
  edge.color = "#00FF00",
  edge.width = .8,
  edge.arrow.size = .1
)
# Community id of every vertex, as assigned by the walktrap clustering.
community_members <- membership(community_cluster)

# One row per company: its id (the vertex name) and its community number.
community_members_df <- data.frame(
  company_id = as_ids(V(df_edges_weighted_igraph)),
  community_cluster = community_cluster$membership
)

# Attach company metadata. The join key is stated through `by`; the previous
# `company_id = company_id` was not a join specification, just a stray
# argument.
community_members_list <- community_members_df %>%
  inner_join(df_company, by = "company_id") %>%
  select(name, community_cluster, industry, emp_count) %>%
  arrange(community_cluster)

# One table per community, for the per-community charts.
community_members_list_1 <- filter(community_members_list, community_cluster == 1)
community_members_list_2 <- filter(community_members_list, community_cluster == 2)
community_members_list_3 <- filter(community_members_list, community_cluster == 3)
community_members_list_4 <- filter(community_members_list, community_cluster == 4)
community_members_list_5 <- filter(community_members_list, community_cluster == 5)
community_members_list_6 <- filter(community_members_list, community_cluster == 6)
community_members_list_7 <- filter(community_members_list, community_cluster == 7)
community_members_list_8 <- filter(community_members_list, community_cluster == 8)
community_members_list_9 <- filter(community_members_list, community_cluster == 9)
community_members_list_10 <- filter(community_members_list, community_cluster == 10)
# Horizontal bar chart of employee count by industry for one community.
#   members  data frame with `industry` and `emp_count` columns
# Returns the ggplot object. Each firm contributes one stacked segment, so a
# bar's length is the industry's total employee count within the community.
# Uses the file-level `chartcolor` and `mychartattributes` for styling.
community_industry_plot <- function(members) {
  ggplot(members, aes(x = industry, y = emp_count)) +
    # geom_col() is the direct form of geom_bar(stat = "identity").
    geom_col(fill = chartcolor) +
    labs(
      x = "Industry",
      y = "Employee Count"
    ) +
    mychartattributes +
    scale_y_continuous(labels = scales::comma) +
    coord_flip()
}

# One chart per detected community.
plot1 <- community_industry_plot(community_members_list_1)
plot2 <- community_industry_plot(community_members_list_2)
plot3 <- community_industry_plot(community_members_list_3)
plot4 <- community_industry_plot(community_members_list_4)
plot5 <- community_industry_plot(community_members_list_5)
plot6 <- community_industry_plot(community_members_list_6)
plot7 <- community_industry_plot(community_members_list_7)
plot8 <- community_industry_plot(community_members_list_8)
plot9 <- community_industry_plot(community_members_list_9)
plot10 <- community_industry_plot(community_members_list_10)

# Render the charts in community order.
plot1
plot2
plot3
plot4
plot5
plot6
plot7
plot8
plot9
plot10
Firms in the same community are in the same industry. The communities are grouped by:

* Oil and energy
* Healthcare related
* Travel and tourism
* Financial firms
* Tech
A network graph is said to display assortative mixing if nodes that have many connections tend to be connected to other nodes that also have many connections.
# Rebuild the graph with the company metadata supplied as the vertex table,
# so that mixing can be measured against industry.
igraph_homophily <- graph_from_data_frame(
  df_edges,
  vertices = df_company,
  directed = TRUE
)
# NOTE(review): this assignment assumes the vertices follow the row order of
# df_company — confirm, since `vertices = df_company` should already attach
# an `industry` attribute.
V(igraph_homophily)$industry <- df_company$industry

# Assortativity by industry (a categorical vertex type) and by vertex degree.
# assortativity_nominal() / assortativity_degree() are the current names for
# the deprecated dotted aliases.
industry_types <- as.factor(V(igraph_homophily)$industry)
assortativity_coefficient <- assortativity_nominal(
  igraph_homophily,
  types = industry_types,
  directed = TRUE
)
assortativity_deg <- assortativity_degree(igraph_homophily, directed = TRUE)

print(paste("The level of assortative mixing for industry in the network is: ",
            round(assortativity_coefficient, 6)))
## [1] "The level of assortative mixing for industry in the network is: 0.030048"
print(paste("The Assortativity Degree of the network is: ",
            round(assortativity_deg, 6)))
## [1] "The Assortativity Degree of the network is: -0.198701"
The talent flow graph shows a clear display of assortative mixing and homophily within the network. The weights on edges between similar companies and subsidiaries are quite high, and community detection clearly separated the different industries. As another example, people who leave a company tend to go to work for another company within the same industry.
An example of disassortative mixing can be found in network graphs with negative assortativity coefficients. One such example might be a cluster of LinkedIn users who are trying to shift industries.