# Packages this analysis depends on, attached in this order.
packages <- c(
  "igraph", "tidygraph", "ggraph", "visNetwork", "lubridate", "tidyverse"
)

# Attach each package, installing it first when it is not available.
for (p in packages) {
  # require() returns FALSE (instead of raising an error) when the package
  # cannot be loaded, which is what triggers the install below.
  if (!require(p, character.only = TRUE)) {
    install.packages(p)
  }
  # Attaches the package after a fresh install; a no-op if require() already
  # attached it, and an error if the install did not succeed.
  library(p, character.only = TRUE)
}
## Loading required package: igraph
##
## Attaching package: 'igraph'
## The following objects are masked from 'package:stats':
##
## decompose, spectrum
## The following object is masked from 'package:base':
##
## union
## Loading required package: tidygraph
##
## Attaching package: 'tidygraph'
## The following object is masked from 'package:igraph':
##
## groups
## The following object is masked from 'package:stats':
##
## filter
## Loading required package: ggraph
## Loading required package: ggplot2
## Loading required package: visNetwork
## Loading required package: lubridate
##
## Attaching package: 'lubridate'
## The following object is masked from 'package:igraph':
##
## %--%
## The following object is masked from 'package:base':
##
## date
## Loading required package: tidyverse
## -- Attaching packages ----------------------------------------------- tidyverse 1.2.1 --
## v tibble 2.1.3 v purrr 0.3.2
## v tidyr 1.0.0 v dplyr 0.8.3
## v readr 1.3.1 v stringr 1.4.0
## v tibble 2.1.3 v forcats 0.4.0
## -- Conflicts -------------------------------------------------- tidyverse_conflicts() --
## x lubridate::%--%() masks igraph::%--%()
## x lubridate::as.difftime() masks base::as.difftime()
## x dplyr::as_data_frame() masks tibble::as_data_frame(), igraph::as_data_frame()
## x purrr::compose() masks igraph::compose()
## x tidyr::crossing() masks igraph::crossing()
## x lubridate::date() masks base::date()
## x dplyr::filter() masks tidygraph::filter(), stats::filter()
## x dplyr::groups() masks tidygraph::groups(), igraph::groups()
## x lubridate::intersect() masks base::intersect()
## x dplyr::lag() masks stats::lag()
## x lubridate::setdiff() masks base::setdiff()
## x purrr::simplify() masks igraph::simplify()
## x lubridate::union() masks igraph::union(), base::union()
Using the read_csv() function of the readr package, import GAStech_email_node.csv and GAStech_email_edge-v2.csv into the RStudio environment.
# Node table: one row per employee.
GAStech_nodes <- read_csv(file = "data/GAStech_email_node.csv")
## Parsed with column specification:
## cols(
## id = col_double(),
## label = col_character(),
## Department = col_character(),
## Title = col_character()
## )
# Edge table: one row per email sent from `source` to `target`.
GAStech_edges <- read_csv(file = "data/GAStech_email_edge-v2.csv")
## Parsed with column specification:
## cols(
## source = col_double(),
## target = col_double(),
## SentDate = col_character(),
## SentTime = col_time(format = ""),
## Subject = col_character(),
## MainSubject = col_character(),
## sourceLabel = col_character(),
## targetLabel = col_character()
## )
Let’s take a look at what’s contained inside GAStech_edges
# Inspect the column types of the edge table as imported.
GAStech_edges %>%
  glimpse()
## Observations: 9,063
## Variables: 8
## $ source <dbl> 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 26...
## $ target <dbl> 41, 40, 51, 52, 53, 45, 44, 46, 48, 49, 47, 54, 27...
## $ SentDate <chr> "6/1/2014", "6/1/2014", "6/1/2014", "6/1/2014", "6...
## $ SentTime <time> 08:39:00, 08:39:00, 08:58:00, 08:58:00, 08:58:00,...
## $ Subject <chr> "GT-SeismicProcessorPro Bug Report", "GT-SeismicPr...
## $ MainSubject <chr> "Work related", "Work related", "Work related", "W...
## $ sourceLabel <chr> "Sven.Flecha", "Sven.Flecha", "Kanon.Herrero", "Ka...
## $ targetLabel <chr> "Isak.Baza", "Lucas.Alcazar", "Felix.Resumir", "Hi...
As seen from glimpse(GAStech_edges), SentDate has been read in as a “Character” data type instead of a “Date” data type. Therefore, we convert the SentDate field to the “Date” data type and derive a new Weekday field from it.
# Parse the day/month/year strings in SentDate into Date values.
GAStech_edges$SentDate <- dmy(GAStech_edges$SentDate)

# Derive the full (unabbreviated) weekday name of each email from its date.
GAStech_edges$Weekday <- wday(
  GAStech_edges$SentDate,
  label = TRUE,
  abbr = FALSE
)
Here’s another look at what’s contained inside GAStech_edges after the conversion
# Inspect the edge table again now that SentDate and Weekday have been set.
GAStech_edges %>%
  glimpse()
## Observations: 9,063
## Variables: 9
## $ source <dbl> 43, 43, 44, 44, 44, 44, 44, 44, 44, 44, 44, 44, 26...
## $ target <dbl> 41, 40, 51, 52, 53, 45, 44, 46, 48, 49, 47, 54, 27...
## $ SentDate <date> 2014-01-06, 2014-01-06, 2014-01-06, 2014-01-06, 2...
## $ SentTime <time> 08:39:00, 08:39:00, 08:58:00, 08:58:00, 08:58:00,...
## $ Subject <chr> "GT-SeismicProcessorPro Bug Report", "GT-SeismicPr...
## $ MainSubject <chr> "Work related", "Work related", "Work related", "W...
## $ sourceLabel <chr> "Sven.Flecha", "Sven.Flecha", "Kanon.Herrero", "Ka...
## $ targetLabel <chr> "Isak.Baza", "Lucas.Alcazar", "Felix.Resumir", "Hi...
## $ Weekday <ord> Monday, Monday, Monday, Monday, Monday, Monday, Mo...
Wrangling attributes - we will keep only the work-related emails and aggregate the individual emails by sender, receiver and day of the week (as done in class).
# Collapse individual work-related emails into weighted edges: one row per
# (source, target, Weekday) combination, with Weight = number of emails.
GAStech_edges_aggregated <- GAStech_edges %>%
  filter(MainSubject == "Work related") %>%
  group_by(source, target, Weekday) %>%
  summarise(Weight = n()) %>%
  # Drop emails sent to oneself and combinations seen only once.
  filter(source != target, Weight > 1) %>%
  ungroup()

GAStech_edges_aggregated
## # A tibble: 1,456 x 4
## source target Weekday Weight
## <dbl> <dbl> <ord> <int>
## 1 1 2 Monday 4
## 2 1 2 Tuesday 3
## 3 1 2 Wednesday 5
## 4 1 2 Friday 8
## 5 1 3 Monday 4
## 6 1 3 Tuesday 3
## 7 1 3 Wednesday 5
## 8 1 3 Friday 8
## 9 1 4 Monday 4
## 10 1 4 Tuesday 3
## # ... with 1,446 more rows
Creating network objects using tidygraph - create a network graph (as done in class)
# Build a directed tidygraph object from the node table and the aggregated
# edge table.
GAStech_graph <- tbl_graph(
  nodes = GAStech_nodes,
  edges = GAStech_edges_aggregated,
  directed = TRUE
)

GAStech_graph
## # A tbl_graph: 54 nodes and 1456 edges
## #
## # A directed multigraph with 1 component
## #
## # Node Data: 54 x 4 (active)
## id label Department Title
## <dbl> <chr> <chr> <chr>
## 1 1 Mat.Bramar Administrati~ Assistant to CEO
## 2 2 Anda.Ribera Administrati~ Assistant to CFO
## 3 3 Rachel.Pantanal Administrati~ Assistant to CIO
## 4 4 Linda.Lagos Administrati~ Assistant to COO
## 5 5 Ruscella.Mies.Hab~ Administrati~ Assistant to Engineering Group Ma~
## 6 6 Carla.Forluniau Administrati~ Assistant to IT Group Manager
## # ... with 48 more rows
## #
## # Edge Data: 1,456 x 4
## from to Weekday Weight
## <int> <int> <ord> <int>
## 1 1 2 Monday 4
## 2 1 2 Tuesday 3
## 3 1 2 Wednesday 5
## # ... with 1,453 more rows
# Show the edges with the largest Weight first. activate(edges) makes the
# edge table the target of arrange().
GAStech_graph %>%
  activate(edges) %>%
  arrange(desc(Weight))
## # A tbl_graph: 54 nodes and 1456 edges
## #
## # A directed multigraph with 1 component
## #
## # Edge Data: 1,456 x 4 (active)
## from to Weekday Weight
## <int> <int> <ord> <int>
## 1 40 41 Tuesday 23
## 2 40 43 Tuesday 19
## 3 41 43 Tuesday 15
## 4 41 40 Tuesday 14
## 5 42 41 Tuesday 13
## 6 42 40 Tuesday 12
## # ... with 1,450 more rows
## #
## # Node Data: 54 x 4
## id label Department Title
## <dbl> <chr> <chr> <chr>
## 1 1 Mat.Bramar Administration Assistant to CEO
## 2 2 Anda.Ribera Administration Assistant to CFO
## 3 3 Rachel.Pantanal Administration Assistant to CIO
## # ... with 51 more rows
# Baseline plot: store both centrality measures as node columns first, then
# map closeness to node colour and betweenness to node size.
g <- GAStech_graph %>%
  mutate(
    betweenness_centrality = centrality_betweenness(),
    closeness_centrality = centrality_closeness()
  ) %>%
  ggraph(layout = "nicely") +
  geom_edge_link(aes()) +
  geom_node_point(
    aes(colour = closeness_centrality, size = betweenness_centrality)
  )

g + theme_graph()
Answer: Instead of having to mutate the graph so that betweenness_centrality and closeness_centrality exist in the data, as in the graph presented in Section 6.1 of Hands-on Exercise 10, ggraph 2.0 allows one to call centrality_closeness() and centrality_betweenness() directly inside the aesthetic mapping of geom_node_point.
# Same plot as the baseline, but the centrality measures are computed
# directly inside aes() instead of being added to the node data first.
one_a_graph <-
  # The stray trailing comma after `layout = "nicely"` was removed: it passed
  # an empty extra argument to ggraph().
  ggraph(GAStech_graph, layout = "nicely") +
  geom_edge_link(aes()) +
  geom_node_point(
    aes(colour = centrality_closeness(), size = centrality_betweenness())
  )

one_a_graph + theme_graph()
Answer: (1) The color of the nodes and the color of the lines do not contrast well with each other, hence it is hard to see the nodes and lines clearly. This is especially so when the closeness centrality gets smaller (hence, the blue color gets darker).
(2) The layout of the whole graph is complicated and does not offer a clear view of where each line points from and to.
(3) It is hard to see the relationship between people as there is no indication of who the person at each node is and which people are talking to one another.
(4) There was no title in the previous graph visualization.
You will see these 4 changes in part (c) in the sketches made as well as in part (d) where a new graph is shown.
Question 1 Sketch
In this graph, I have applied the 4 changes mentioned in question 1b. First, I changed the line color to a light red color so that it is visible yet not distracting. The color of the nodes remains blue, where the shade of blue depends on centrality_closeness(). Second, I drew the edges with geom_edge_bend so that the graph as a whole looks much neater, with a nice squarish shape. Third, I added the name of the person representing each node so that there’s an indication of who the person at each node is and who is connected to whom. Fourth, I added a title to the graph.
# Redesigned static plot: faint red curved edges, nodes coloured by closeness
# and sized by betweenness centrality, employee names as labels, and a title.
one_d_graph <-
  ggraph(GAStech_graph, layout = "nicely") +
  # Constants are set outside aes(): inside aes() the string "red" would be
  # mapped through a colour scale and add a meaningless legend entry.
  geom_edge_bend(colour = "red", alpha = 0.2) +
  geom_node_point(
    aes(colour = centrality_closeness(), size = centrality_betweenness())
  ) +
  # Labels come from the graph's own `label` node column, so they always
  # match the plotted nodes. The fixed text size is likewise set outside
  # aes() so that it is not mapped through the node size scale.
  geom_node_text(aes(label = label), size = 2) +
  ggtitle("Network Visualization of Gas Tech")

one_d_graph + theme_graph()
Data Preparation for Section 7.4
# Rebuild the aggregated edge list with `from`/`to` id columns for
# visNetwork: one row per sender/receiver pair of work-related emails,
# with weight = number of emails.
GAStech_edges_aggregated <- GAStech_edges %>%
  filter(MainSubject == "Work related") %>%
  # Look up the sender's node id via its label.
  left_join(GAStech_nodes, by = c("sourceLabel" = "label")) %>%
  rename(from = id) %>%
  # Look up the receiver's node id via its label.
  left_join(GAStech_nodes, by = c("targetLabel" = "label")) %>%
  rename(to = id) %>%
  group_by(from, to) %>%
  summarise(weight = n()) %>%
  # Drop emails sent to oneself and pairs seen only once.
  filter(from != to, weight > 1) %>%
  ungroup()
The code chunk below renames the Department field to group
# Rename the Department column to `group` in the node table.
GAStech_nodes <- rename(GAStech_nodes, group = Department)
Code Chunk for Section 7.4
# Interactive network with an igraph Fruchterman-Reingold layout, neighbour
# highlighting, and a drop-down list for selecting a node by id.
visNetwork(nodes = GAStech_nodes, edges = GAStech_edges_aggregated) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visOptions(
    highlightNearest = TRUE,
    nodesIdSelection = TRUE
  )
As per the question requirement, when a name is selected from the drop-down list or when a node on the interactive graph is selected, the node will be highlighted and labelled with the employee name inside the circle. All the nodes linked to the selected circle are labelled as well.
The color of each node represents the department the employee is in.
# Same interactive network, with the nodes drawn using the "circle" shape.
visNetwork(nodes = GAStech_nodes, edges = GAStech_edges_aggregated) %>%
  visIgraphLayout(layout = "layout_with_fr") %>%
  visNodes(
    label = GAStech_nodes$id,
    shape = "circle"
  ) %>%
  visOptions(
    highlightNearest = TRUE,
    nodesIdSelection = TRUE
  )
Answer: (1) The size of each circle was previously based on the length of the employee’s name, which may imply some unintended importance for the nodes that are bigger.
(2) There was no title in the previous graph visualization.
(3) It is hard to see the relationship between people as there is no indication of who is talking to whom. There is no indication of the sender and receiver as they are all connected by plain lines.
(4) There is no legend to show what the colors represent.
You will see these 4 changes in part (c) in the sketches made as well as in part (d) where a new visNetwork is shown.
Question 2 Sketch
Here, I resized the nodes using widthConstraint so that users would not see a difference in node sizes and wonder if there is any hidden meaning implied. Next, I added a title to the graph. After that, I used visEdges to show the direction of communication between employees, and set the highlight color to a bright red so that the communications between the selected employee and other employees stand out. I also added navigation buttons to make the graph easier to explore, and a legend that shows which department each color represents.
# Final interactive network: titled, with navigation buttons, a seeded
# layout, directed edges that turn red when highlighted, uniformly
# constrained circular nodes, and a legend.
visNetwork(
  nodes = GAStech_nodes,
  edges = GAStech_edges_aggregated,
  main = "Network Visualization of Gas Tech"
) %>%
  visInteraction(navigationButtons = TRUE) %>%
  # A fixed seed is passed so the layout can be reproduced between runs.
  visIgraphLayout(layout = "layout_with_fr", randomSeed = 5) %>%
  visEdges(
    arrows = "to",
    color = list(highlight = "red")
  ) %>%
  visNodes(
    label = GAStech_nodes$id,
    shape = "circle",
    widthConstraint = 175,
    labelHighlightBold = TRUE,
    font = list(size = 20)
  ) %>%
  visOptions(
    highlightNearest = list(enabled = TRUE, labelOnly = TRUE, degree = 1),
    nodesIdSelection = TRUE
  ) %>%
  visLegend(zoom = FALSE, width = 0.2)