Below are the files we obtained using MapReduce on Hadoop. A single main mapper and a single reducer were sufficient. We focused on sent and received emails in the period between May 1999 and July 2001. Enron declared bankruptcy in December 2001, and the scandal started in November. We would like to observe the Enron email network up to the point where the internal community of Enron started suffering from the fraudulent practices.
# Fix the RNG seed so that randomized steps further down are reproducible.
set.seed(123)
# Show which input files are available in the data directory.
list.files("data")
FALSE [1] "conns-inbox.txt" "inbox-emails-long.txt" "n-conns-inbox.txt"
FALSE [4] "sent-emails-long.txt"
# Load the tab-delimited sent-email records, keeping every column as text.
# NOTE(review): read.delim() treats the first line as a header by default;
# confirm the input file really has one, otherwise the first record is lost.
dat <- read.delim("data/sent-emails-long.txt", colClasses = 'character')
colnames(dat) <- c('sender', 'date', 'reciever', 'email_id')
# Peek at the first few records.
head(dat, n = 6L)
FALSE sender date
FALSE 1 elizabeth sager sara shackleton 1998-10-30 07:56:00
FALSE 2 alison smythe 1998-10-30 08:02:00
FALSE 3 brent hendry at enron_development@ccmail @ enron 1998-10-30 09:06:00
FALSE 4 janette elbertson 1998-11-13 04:07:00
FALSE 5 tana jones 1998-11-13 08:09:00
FALSE 6 per sekse 1998-11-13 14:57:52
FALSE reciever email_id
FALSE 1 mark - ect legal taylor 39502
FALSE 2 mark - ect legal taylor 39587
FALSE 3 mark - ect legal taylor 39870
FALSE 4 mark taylor 39094
FALSE 5 mark taylor 39182
FALSE 6 mark - ect legal taylor 39183
# Per-column overview (length / class / mode) of the raw records.
summary(dat)
FALSE sender date reciever
FALSE Length:40865 Length:40865 Length:40865
FALSE Class :character Class :character Class :character
FALSE Mode :character Mode :character Mode :character
FALSE email_id
FALSE Length:40865
FALSE Class :character
FALSE Mode :character
# Keep only connections between people who appear on both sides of the data:
# first drop rows whose receiver never shows up as a sender, then drop rows
# whose sender does not show up among the remaining receivers.
# NOTE(review): the two passes run once each, so the second pass can remove
# senders that the first pass relied on -- confirm that this is acceptable.
# NOTE(review): no date-window filter is applied here -- confirm whether the
# period described in the text should be enforced.
keep_reciever <- dat$reciever %in% dat$sender
dat <- dat[keep_reciever, ]
keep_sender <- dat$sender %in% dat$reciever
dat <- dat[keep_sender, ]
# length(union(dat$sender,dat$reciever))
# nrow(dat)
# Year-month key: strip the dashes from the date and keep the first six
# characters (YYYYMM) as a number.
date_digits <- gsub("\\-", "", dat$date)
dat$yearm <- as.numeric(substr(date_digits, 1, 6))
# necessary packages
# install.packages(c("ndtv", "networkD3", "igraph"), dependencies = TRUE)
# install.packages('animation')
library(igraph)
# edges: one row per email (sender, receiver, year-month), sorted by
# year-month and then by sender and receiver
links <- dat[, c("sender", "reciever", "yearm")]
links <- links[order(links$yearm, links$sender, links$reciever), ]
links <- links[links$sender != links$reciever, ] # remove loops
# Convert any factor columns to character. vapply() is used instead of
# sapply() so the result is always a logical vector.
i <- vapply(links, is.factor, logical(1))
links[i] <- lapply(links[i], as.character)
# nodes: every name that occurs as sender or receiver, with `freq` counting
# how many times it occurs across both columns
nodes.dyn <- as.data.frame(table(unlist(links[, 1:2])))
names(nodes.dyn) <- c("name", "freq")
vertices <- base::union(links$sender, links$reciever)
nodes.dyn <- nodes.dyn[nodes.dyn$name %in% vertices, ]
# net: undirected graph with one edge per email; the columns of nodes.dyn
# are passed along as vertex data
net <- graph.data.frame(links[, 1:2], nodes.dyn, directed = FALSE)
# plot network
# The layout is computed once and stored in `l` so later plots can reuse it.
l <- layout.fruchterman.reingold(net)
# Derive the period shown in the title from the data (yearm is YYYYMM)
# instead of hard-coding it, so the title always matches what is plotted.
period <- range(links$yearm, na.rm = TRUE)
period_label <- sprintf("%04d-%02d",
                        as.integer(period %/% 100),
                        as.integer(period %% 100))
plot(net,
     rescale = TRUE,
     layout = l,
     vertex.size = 10,
     main = paste0('Enron network over the period ', period_label[1],
                   ' to ', period_label[2], '\nbased on sent emails'),
     vertex.label.cex = 0.6) # font size
# add colors
# The vertex and edge betweenness are (roughly) defined by the number of
# geodesics (shortest paths) going through a vertex or an edge.
# http://www.inside-r.org/packages/cran/igraph/docs/betweenness
# NOTE: despite its name, `community` stores each vertex's betweenness
# centrality. The name is kept because code further down reads it.
V(net)$community <- igraph::betweenness(net)
# `group` stores the community id assigned by edge-betweenness clustering.
V(net)$group <- edge.betweenness.community(net)$membership
# mark.groups expects vertex ids (or a list of id vectors). A raw membership
# vector would be read as vertex ids, so pass the ids of the largest
# community explicitly, as the caption describes.
group_sizes <- table(V(net)$group)
biggest_id <- as.integer(names(group_sizes)[which.max(group_sizes)])
biggest_group <- which(V(net)$group == biggest_id)
plot(net,
     vertex.color = V(net)$group, # color by community, as the caption states
     vertex.size = log(nodes.dyn$freq) * 3,
     mark.groups = list(biggest_group),
     rescale = TRUE,
     layout = l,
     vertex.label.cex = 0.6, # font size
     xlab = 'Size indicates of connections.\nColor is a grouping by edge betweenness\n
Background indicates the biggest tightly connected group by edgebetweenness '
)
library(networkD3)
# Example data sets shipped with networkD3; not used below but kept in case
# later code refers to them.
data(MisLinks)
data(MisNodes)
# Thin the edge list: keep a row when it holds the first occurrence of its
# sender or the first occurrence of its receiver.
# NOTE(review): this is not the same as keeping unique sender/receiver
# pairs -- confirm which of the two is intended.
linksd3 <- links[!duplicated(links[, 1]) | !duplicated(links[, 2]), ]
nodesd3 <- as.data.frame(table(unlist(linksd3[, 1:2])))
names(nodesd3) <- c("name", "freq")
# Look the group value up by vertex name rather than relying on the node
# table and the graph happening to share the same row order.
nodesd3$group <- V(net)$community[match(as.character(nodesd3$name),
                                        V(net)$name)]
nodesd3 <- nodesd3[order(nodesd3$name), ]
linksd3 <- linksd3[order(linksd3$sender), ]
# forceNetwork() needs Source/Target as zero-based row indices into Nodes,
# not as node names, so derive index columns from the names.
linksd3$source_id <- match(linksd3$sender, nodesd3$name) - 1
linksd3$target_id <- match(linksd3$reciever, nodesd3$name) - 1
forceNetwork(Links = linksd3, Nodes = nodesd3,
             Source = "source_id", Target = "target_id",
             NodeID = "name", # Value is not set
             Group = "group", opacity = 0.8, bounded = TRUE, fontSize = 20)