#ENRON EMAIL CORPUS NETWORK ANALYSIS
#QUICK AND DIRTY - Peter Prevos (prevos.net)
#Enron Email Dataset: https://www.cs.cmu.edu/~./enron/
#Load specialised libraries
library(stringr) #String manipulation
library(igraph, warn.conflicts=F) #Network analysis
#E-mail corpus consists of nested folders per user with e-mails as text files
#Create list of all available e-mails
emails <- list.files("maildir/", full.names = TRUE, recursive = TRUE)
length(emails)
## [1] 517401
#Filter by inbox only
emails <- emails[grepl("/inbox", emails)]
length(emails)
## [1] 44859
#Create list of sender and receiver (inbox owner)
#Line 3 of every file is taken as the sender header, so only the first three
#lines are read; files shorter than that yield NA.
#"to" holds the file path for now; the owner is derived from it further down.
inboxes <- data.frame(
  from = vapply(
    emails,
    function(path) readLines(path, n = 3L, warn = FALSE)[3],
    character(1),
    USE.NAMES = FALSE
  ),
  to = emails,
  stringsAsFactors = FALSE
)
#Keep only enron.com and strip all but username
#Extract the local part of an @enron.com address from a "From:" header line.
#Accepts plain addresses ("From: jane.doe@enron.com") and the display-name
#form ("From: legal <.taylor@enron.com>"). Anything else (other domains,
#malformed or missing headers) gives NA so it can be dropped by the caller.
extract_enron_user <- function(header) {
  str_match(header, "^From:\\s*(?:.*<)?([^<>@\\s]+)@enron\\.com>?\\s*$")[, 2]
}
#Return the path of the first mail file found in a user's sent folders, or NA
#when there is none. sent_items is tried first; any other entry whose name
#contains "sent" serves as a fallback.
find_sent_mail <- function(user_dir) {
  folders <- grep("sent", list.files(user_dir), value = TRUE, fixed = TRUE)
  folders <- c(intersect("sent_items", folders), setdiff(folders, "sent_items"))
  for (folder in folders) {
    entries <- list.files(file.path(user_dir, folder), full.names = TRUE)
    mails <- entries[!dir.exists(entries)] #skip sub-folders
    if (length(mails) > 0) {
      return(mails[1])
    }
  }
  NA_character_
}
inboxes$from <- extract_enron_user(inboxes$from)
inboxes <- inboxes[!is.na(inboxes$from), ]
#Inbox owner is the folder directly below maildir; the pattern tolerates both
#"maildir/user/..." and "maildir//user/..." paths
inboxes$to <- sub("^maildir/+([^/]+)/.*$", "\\1", inboxes$to)
#Create username list
users <- data.frame(user = unique(inboxes$to), stringsAsFactors = FALSE)
#Replace username with e-mail name, read from the sender header (line 3) of
#one of the user's own sent mails
users$mailname <- vapply(
  users$user,
  function(user) {
    mail <- find_sent_mail(file.path("maildir", user))
    if (is.na(mail)) {
      return(NA_character_)
    }
    extract_enron_user(readLines(mail, n = 3L, warn = FALSE)[3])
  },
  character(1),
  USE.NAMES = FALSE
)
#Remove those without sent mails (or without a usable enron.com sender)
users <- users[!is.na(users$mailname), ]
#Inner join: inboxes whose owner has no e-mail name are dropped
inboxes <- merge(inboxes, users, by.x = "to", by.y = "user")
inboxes <- data.frame(
  from = inboxes$from,
  to = inboxes$mailname,
  stringsAsFactors = FALSE
)
#Keep an e-mail only when all of the following hold:
# - its sender also appears as an inbox owner somewhere in the table
# - neither side is the placeholder "no.address"
# - it was not sent to oneself
#which() drops rows where a comparison is NA, as subset() would
inboxes <- inboxes[which(with(
  inboxes,
  from %in% to & from != "no.address" & to != "no.address" & from != to
)), ]
#Define network
#Each (from, to) row of inboxes becomes one undirected edge
g <- graph_from_edgelist(as.matrix(inboxes), directed = FALSE)
#Detect communities with the spinglass algorithm
#(cluster_spinglass is the current igraph name for spinglass.community)
#NOTE(review): no random seed is set, so the communities found may differ
#between runs; igraph documents spinglass as requiring a connected graph
coms <- cluster_spinglass(g)
#Plot network
par(mar = c(0, 0, 2, 0))
plot(
  coms, g,
  vertex.label = NA,
  layout = layout_with_fr, #current name for layout.fruchterman.reingold
  vertex.size = 3,
  main = "Enron e-mail network snapshot"
)

#Analyse network
#Degree of every vertex, highest first
sort(degree(g), decreasing = TRUE)
## mike.grigsby gerald.nemec barry.tycholiz
## 196 167 165
## rod.hayslett jeff.dasovich kimberly.watson
## 160 152 143
## sara.shackleton tracy.geaccone mark.whitt
## 143 138 133
## marie.heard tana.jones kam.keiser
## 130 125 123
## lindy.donoho louise.kitchen d..steffes
## 118 117 113
## errol.mclaughlin jay.reitmeyer m..love
## 95 93 92
## larry.may cara.semperger debra.perlingiere
## 89 81 81
## susan.scott holden.salisbury shelley.corman
## 75 73 70
## stephanie.panus legal <.taylor@ bill.williams
## 65 65 63
## bill.rapp paul.y'barbo michelle.lokay
## 60 60 58
## chris.germany lynn.blair c..giron
## 58 55 55
## teb.lokey mark.mcconnell sally.beck
## 53 53 50
## lavorato john.hodge rick.buy
## 50 49 47
## doug.gilbert-smith elizabeth.sager greg.whalley
## 46 46 45
## theresa.staab m..forney m..presto
## 45 44 43
## juan.hernandez a..martin craig.dean
## 43 42 42
## t..lucci john.griffith scott.neal
## 42 41 41
## eric.saibi richard.shapiro kenneth.lay
## 41 40 39
## susan.bailey e..haedicke don.baughman
## 39 39 38
## jason.wolfe j..kean john.arnold
## 38 36 35
## sean.crandall kevin.hyatt andy.zipper
## 33 32 32
## s..shively frank.ermis matt.smith
## 30 30 29
## monique.sanchez w..white l..mims
## 28 27 27
## geir.solberg danny.mccarty joe.parks
## 26 25 25
## kim.ward stanley.horton diana.scholtes
## 25 24 23
## darrell.schoolcraft dutch.quigley kay.mann
## 22 20 20
## robert.benson eric.bass keith.holst
## 19 18 18
## a..shankman judy.townsend matthew.lenhart
## 17 17 16
## vladi.pimenov joe.stepenovitch l..gay
## 16 16 16
## mike.maggi b..sanders k..allen
## 16 15 14
## f..campbell f..brawner ryan.slinger
## 14 13 13
## mike.swerzbin albert.meyers jim.schwieger
## 13 13 12
## j..sturm brad.mckay d..thomas
## 12 12 11
## lisa.gang james.derrick peter.keavey
## 11 11 11
## w..delainey scott.hendrickson tori.kuykendall
## 9 9 9
## matt.motley charles.weldon michelle.cash
## 8 7 7
## john.zufferli chris.dorland geoff.storey
## 7 7 7
## dana.davis j.kaminski mike.carson
## 6 6 6
## benjamin.rogers martin.cuilla jonathan.mckay
## 6 6 6
## h..lewis phillip.platter jeff.skilling
## 6 6 6
## harry.arora j..farmer jason.williams
## 5 5 5
## tom.donohoe jeff.king andrea.ring
## 4 3 3
## cooper.richey kevin.ruscitti joe.quenet
## 3 3 1
## robert.badeer w..pereira
## 1 1