#ENRON EMAIL CORPUS NETWORK ANALYSIS
#QUICK AND DIRTY - Peter Prevos (prevos.net)

#Enron Email Dataset: https://www.cs.cmu.edu/~./enron/

#Load specialised libraries
library(stringr) #String manipulation
library(igraph, warn.conflicts=F) #Network analysis

#E-mail corpus consists of nested folders per user with e-mails as text files

#Enumerate every file below maildir/, descending into all user folders
emails <- list.files("maildir/", recursive = TRUE, full.names = TRUE)
length(emails)
## [1] 517401
#Keep only the paths that contain "/inbox"
emails <- grep("/inbox", emails, value = TRUE)
length(emails)
## [1] 44859
#Create the edge list of sender and receiver (inbox owner)
#
#Both ends of an edge are reduced to the local part of an @enron.com address
#by the same helper, so the `from` and `to` columns are directly comparable.
#NOTE: headers that are not a bare address are now parsed instead of being
#trimmed by a fixed width, so a node previously shown as "legal <.taylor@"
#is expected to appear as ".taylor" - recorded output may predate this.

#Read the third line of a message file. The script assumes that this is the
#"From:" header; only the first three lines are read. Returns NA when the
#file has fewer than three lines.
third_line <- function(path) {
  readLines(path, n = 3L, warn = FALSE)[3]
}

#Extract the local part (text before "@enron.com") of the first @enron.com
#address on each "From:" header line.
#  header: character vector of raw header lines
#Returns a character vector of the same length; NA where the line is missing,
#is not a "From:" header, or holds no @enron.com address.
enron_username <- function(header) {
  pattern <- "^From:\\s.*?([^\\s<>\"@]+)@enron\\.com\\b"
  username <- rep(NA_character_, length(header))
  hit <- grepl(pattern, header, perl = TRUE)
  username[hit] <- sub(paste0(pattern, ".*$"), "\\1", header[hit],
                       perl = TRUE)
  username
}

from_lines <- vapply(emails, third_line, character(1), USE.NAMES = FALSE)
inboxes <- data.frame(
  from = enron_username(from_lines),
  #Owner = first folder below maildir; "/+" tolerates the doubled separator
  #that list.files() can produce for a directory given with a trailing slash
  to = sub("^maildir/+([^/]+)/.*$", "\\1", emails),
  stringsAsFactors = FALSE
)

#Keep only mails sent from an enron.com address
inboxes <- inboxes[!is.na(inboxes$from), ]

#Replace each inbox owner's folder name with the e-mail name found in the
#first message of their sent_items folder. Owners without any file in
#sent_items, or whose first sent message has no enron.com sender, are dropped.
owners <- unique(inboxes$to)
first_sent <- vapply(owners, function(owner) {
  sent <- list.files(file.path("maildir", owner, "sent_items"),
                     full.names = TRUE, recursive = TRUE)
  if (length(sent) == 0) NA_character_ else third_line(sent[1])
}, character(1), USE.NAMES = FALSE)

users <- data.frame(
  user = owners,
  mailname = enron_username(first_sent),
  stringsAsFactors = FALSE
)
users <- users[!is.na(users$mailname), ]

#Inner join: edges whose owner has no usable e-mail name disappear here
inboxes <- merge(inboxes, users, by.x = "to", by.y = "user")
inboxes <- data.frame(
  from = inboxes$from,
  to = inboxes$mailname,
  stringsAsFactors = FALSE
)

#Restrict the edge list in a single pass. A row is kept when
# - its sender also occurs in the `to` column (e-mails between inbox users),
# - neither end is the placeholder "no.address", and
# - sender and receiver differ (no mail to self).
#which() discards rows whose test evaluates to NA, exactly as subset() does.
inboxes <- inboxes[with(inboxes, which(
  from %in% to &
    from != "no.address" &
    to != "no.address" &
    from != to
)), ]

#Define network
if (nrow(inboxes) == 0) {
  stop("No e-mails left after filtering; cannot build the network",
       call. = FALSE)
}

#Undirected graph with one edge per remaining e-mail (parallel edges kept)
g <- graph_from_edgelist(as.matrix(inboxes), directed = FALSE)

#Spinglass community detection refuses unconnected graphs, so restrict the
#network to its largest connected component when it is fragmented
if (!is_connected(g)) {
  parts <- components(g)
  giant <- which(parts$membership == which.max(parts$csize))
  message("Network is not connected; keeping the largest component (",
          length(giant), " of ", vcount(g), " vertices)")
  g <- induced_subgraph(g, giant)
}

#Spinglass is a randomised optimisation; fix the seed for repeatable results
set.seed(1)
coms <- cluster_spinglass(g)

#Plot network
#Narrow the margins for this plot only; par() returns the previous settings,
#which are restored afterwards (even if plotting fails) so that later plots
#in the same session are unaffected
old_par <- par(mar = c(0, 0, 2, 0))
tryCatch(
  plot(coms, g,
       vertex.label = NA,
       layout = layout_with_fr,
       vertex.size = 3,
       main = "Enron e-mail network snapshot"
       ),
  finally = par(old_par)
)

#Analyse network
#Rank the vertices by degree, highest first
node_degree <- degree(g)
node_degree[order(node_degree, decreasing = TRUE)]
##        mike.grigsby        gerald.nemec      barry.tycholiz 
##                 196                 167                 165 
##        rod.hayslett       jeff.dasovich     kimberly.watson 
##                 160                 152                 143 
##     sara.shackleton      tracy.geaccone          mark.whitt 
##                 143                 138                 133 
##         marie.heard          tana.jones          kam.keiser 
##                 130                 125                 123 
##        lindy.donoho      louise.kitchen          d..steffes 
##                 118                 117                 113 
##    errol.mclaughlin       jay.reitmeyer             m..love 
##                  95                  93                  92 
##           larry.may      cara.semperger   debra.perlingiere 
##                  89                  81                  81 
##         susan.scott    holden.salisbury      shelley.corman 
##                  75                  73                  70 
##     stephanie.panus     legal <.taylor@       bill.williams 
##                  65                  65                  63 
##           bill.rapp        paul.y'barbo      michelle.lokay 
##                  60                  60                  58 
##       chris.germany          lynn.blair            c..giron 
##                  58                  55                  55 
##           teb.lokey      mark.mcconnell          sally.beck 
##                  53                  53                  50 
##            lavorato          john.hodge            rick.buy 
##                  50                  49                  47 
##  doug.gilbert-smith     elizabeth.sager        greg.whalley 
##                  46                  46                  45 
##       theresa.staab           m..forney           m..presto 
##                  45                  44                  43 
##      juan.hernandez           a..martin          craig.dean 
##                  43                  42                  42 
##            t..lucci       john.griffith          scott.neal 
##                  42                  41                  41 
##          eric.saibi     richard.shapiro         kenneth.lay 
##                  41                  40                  39 
##        susan.bailey         e..haedicke        don.baughman 
##                  39                  39                  38 
##         jason.wolfe             j..kean         john.arnold 
##                  38                  36                  35 
##       sean.crandall         kevin.hyatt         andy.zipper 
##                  33                  32                  32 
##          s..shively         frank.ermis          matt.smith 
##                  30                  30                  29 
##     monique.sanchez            w..white             l..mims 
##                  28                  27                  27 
##        geir.solberg       danny.mccarty           joe.parks 
##                  26                  25                  25 
##            kim.ward      stanley.horton      diana.scholtes 
##                  25                  24                  23 
## darrell.schoolcraft       dutch.quigley            kay.mann 
##                  22                  20                  20 
##       robert.benson           eric.bass         keith.holst 
##                  19                  18                  18 
##         a..shankman       judy.townsend     matthew.lenhart 
##                  17                  17                  16 
##       vladi.pimenov    joe.stepenovitch              l..gay 
##                  16                  16                  16 
##          mike.maggi          b..sanders            k..allen 
##                  16                  15                  14 
##         f..campbell          f..brawner        ryan.slinger 
##                  14                  13                  13 
##       mike.swerzbin       albert.meyers       jim.schwieger 
##                  13                  13                  12 
##            j..sturm          brad.mckay           d..thomas 
##                  12                  12                  11 
##           lisa.gang       james.derrick        peter.keavey 
##                  11                  11                  11 
##         w..delainey   scott.hendrickson     tori.kuykendall 
##                   9                   9                   9 
##         matt.motley      charles.weldon       michelle.cash 
##                   8                   7                   7 
##       john.zufferli       chris.dorland        geoff.storey 
##                   7                   7                   7 
##          dana.davis          j.kaminski         mike.carson 
##                   6                   6                   6 
##     benjamin.rogers       martin.cuilla      jonathan.mckay 
##                   6                   6                   6 
##            h..lewis     phillip.platter       jeff.skilling 
##                   6                   6                   6 
##         harry.arora           j..farmer      jason.williams 
##                   5                   5                   5 
##         tom.donohoe           jeff.king         andrea.ring 
##                   4                   3                   3 
##       cooper.richey      kevin.ruscitti          joe.quenet 
##                   3                   3                   1 
##       robert.badeer          w..pereira 
##                   1                   1