The National- The Reynes of Castamere
I wanted to play around with the excellent networkD3 package some more and decided to try and map out the books behind the Game of Thrones TV series. I was interested as a fan of the books, but also because of the sheer number of characters that appear and weave in and out of the story.
This has been done before but only for one book (as far as I can tell) and it seemed simple enough to do it for the rest.
First get the links to each chapter from towerofthehand.com
library(rvest)
library(RCurl)
library(magrittr)
#root of the site being scraped; also used later to build chapter links
base_url <- "http://towerofthehand.com"
#page name shared by every book index
suffix_url <- "/index.html"
#urls for all 5 books: /books/101 ... /books/105
#paste0() is vectorised over the book numbers so no loop is needed, and a
#plain assignment means re-running the script cannot append duplicates to a
#book_urls that is still sitting in the workspace
book_urls <- paste0(base_url, "/books/10", 1:5, suffix_url)
#suffix_url is only needed to build book_urls
rm(suffix_url)
#get the links for every chapter listed on one book's index page
#book_url: address of a book index page
#returns a character vector of chapter urls, each prefixed with the global
#base_url
chapter_func <- function(book_url) {
  chapter_hrefs <- book_url %>%
    read_html() %>%
    html_nodes(".binding .chaptergrid a") %>%
    html_attr("href")
  paste0(base_url, chapter_hrefs)
}
#scrape each book index in turn and flatten the results into one vector
chapter_urls <- book_urls %>%
  lapply(chapter_func) %>%
  unlist()
#later we'll also want the number of chapters/book
#hard-coded here to spare scraping again
books <- data.frame(
  book = 1:5,
  chapters = c(73, 70, 82, 46, 73),
  title = c(
    "A Game of Thrones",
    "A Clash of Kings",
    "A Storm of Swords",
    "A Feast for Crows",
    "A Dance with Dragons"
  )
)
Then get list of all characters that appear in these chapters. There’s a little complication in that some characters are duplicated (e.g. there are three characters named “Walder Frey” because the original is a complete narcissist) so we get the unique links, THEN the names
#get the characters appearing in one chapter
#chapter_url: address of a chapter page
#returns a character vector with one absolute url per link found in the
#"#appearances" list of that page
character_func <- function(chapter_url) {
  character_hrefs <- chapter_url %>%
    read_html() %>%
    html_nodes("#appearances > ol > li > a") %>%
    html_attr("href")
  paste0("https://towerofthehand.com", character_hrefs)
}
#one element per chapter, each holding that chapter's character links
character_urls <- chapter_urls %>% lapply(character_func)
#every appearance of every character flattened into a single vector
all_characters <- character_urls %>% unlist()
#get the name of a character from the link to their page
#character_url: address of a character page
#returns the text of the "#headline > h2" node(s) on that page
names_func <- function(character_url) {
  character_url %>%
    read_html() %>%
    html_nodes("#headline > h2") %>%
    html_text()
}
#scrape each distinct character page once
names <- all_characters %>%
  unique() %>%
  lapply(names_func)
#pair every scraped name with the link it came from
Characters <- data.frame(
  character = unlist(names),
  links = unique(all_characters)
)
#characters sharing a name get a numeric suffix so every name is distinct
Characters[["character"]] <- make.unique(
  as.character(Characters[["character"]])
)
Print the characters using table() to get an idea of how often each appears
#how often each character appears
#table() tallies the appearances per character link; head() prints only the
#first few of those counts
head(table(all_characters))
all_characters
https://towerofthehand.com/reference/k/00002/index.html
2
https://towerofthehand.com/reference/k/00006/index.html
7
https://towerofthehand.com/reference/k/00011/index.html
29
https://towerofthehand.com/reference/k/00012/index.html
1
https://towerofthehand.com/reference/k/00013/index.html
24
https://towerofthehand.com/reference/k/00015/index.html
10
#the total number of unique characters
print(
  paste(
    "There are",
    all_characters %>% unique() %>% length(),
    "named characters in the Song of Ice and Fire series"
  )
)
[1] "There are 1152 named characters in the Song of Ice and Fire series"
Then make an empty data frame to pass these characters’ appearances into
#make an empty data frame of characters = rows and chapters = columns
#rows are labelled with the unique character links, columns with the
#position of each chapter in chapter_urls; every cell starts as NA
row_names <- unique(all_characters)
#seq_along() rather than 1:length(): an empty chapter_urls then gives zero
#column names instead of c(1, 0)
col_names <- seq_along(chapter_urls)
chapter_df <- data.frame(matrix(NA,
                                nrow = length(row_names),
                                ncol = length(col_names)))
rownames(chapter_df) <- row_names
colnames(chapter_df) <- col_names
And run the function where a “1” means that character is in that chapter
#find which chapter each character appears in
#represented as "1" in chapter_df
#column i of chapter_df corresponds to element i of character_urls.
#membership is tested with %in% (an exact comparison of whole links); the
#earlier grep() call treated each link as a regular expression and matched
#it as a substring, so "." matched any character and one link could hit
#another that merely contained it
for (chapter in seq_along(character_urls)) {
  present <- rownames(chapter_df) %in% character_urls[[chapter]]
  if (any(present)) {
    chapter_df[present, chapter] <- 1
  }
}
#every cell that isn't 1 is 0
chapter_df[is.na(chapter_df)] <- 0
For each character, this function then finds how often that character co-occurs with every other character. Given it has to connect each of 1111 characters to the other 1110, it takes a while (a few hours) to run and gives a data frame of each source, target and “value” (how often the characters co-occur).
I’ve included the code I used for error checking and in case you want to do it yourself. However, in running the script I will download the .csv I uploaded to my GitHub
#find how often each character co-occurs with every other character
#for (protag in 1:nrow(chapter_df)){
# name1 <- rownames(chapter_df)[protag]
# for (other in (protag + 1):nrow(chapter_df)){
# name2 <- rownames(chapter_df)[other]
# connection <- length(which(
# chapter_df[protag,] == 1 &
# chapter_df[other,] == 1))
#
# if(connection > 0){
# newrow <- data.frame(source = name1,
# target = name2,
# value = connection)
# ifelse(!exists("Links"),
# Links <- newrow,
# Links <- rbind(Links, newrow))
# }
# }
#}
#load the Links file from my GitHub
#the address is assembled from the account root, the project folder and the
#file name
Github <- "https://raw.githubusercontent.com/RobWHickman/"
Project <- "Databases-and-Files/master/GoT-Network/"
Network_file <- paste0(Github, Project, "GoT-Network.csv")
#keep text columns as character rather than factor
Links <- Network_file %>% read.csv(stringsAsFactors = FALSE)
To produce the network, the Nodes are also needed
#get a df of each character and their frequency
Nodes <- data.frame(table(all_characters))
#add in a size column for the nodes we plot
Nodes$size <- (Nodes$Freq^2) / 8
#change names from urls to character names
Nodes$all_characters <- as.character(Nodes$all_characters)
colnames(Nodes)[1] <- "name"
#one vectorised lookup instead of a row-by-row which() loop
name_index <- match(Nodes$name, Characters$links)
#fail loudly if any link has no entry in Characters
if (anyNA(name_index)) {
  stop("some character links have no entry in Characters", call. = FALSE)
}
Nodes$name <- Characters$character[name_index]
#get "actual" names (i.e. remove the suffix added by make.unique())
Nodes$duplicatenames <- gsub("\\..*", "", Nodes$name)
#surname = last word of the cleaned name; taking it from the raw name would
#leave e.g. "Frey.1", which can never match a House
Nodes$surname <- gsub(".* ", "", Nodes$duplicatenames)
#single-word names have no surname
Nodes$surname[which(Nodes$surname == Nodes$duplicatenames)] <- "no surname"
head(Nodes)
I wanted to use each person’s house to colour them. First get the Houses
#get every House in the series
houses_url <- "http://towerofthehand.com/books/houses.html"
houses_read <- read_html(houses_url)
#House names sit in the links of the 2nd, 4th and 6th table columns
houses_nodes <- houses_read %>%
  html_nodes("td:nth-child(2) a , td:nth-child(4) a , td:nth-child(6) a")
houses_names <- unique(houses_nodes %>% html_text())
#there are two Houses called "Fossoway"
#to avoid annoyance, take it out now and bind it back later
#a logical filter is used because x[-which(...)] returns an EMPTY vector
#when nothing matches, which would silently discard every House
houses_names <- houses_names[houses_names != "Fossoway"]
#get the link to a House's page
#html: the parsed Houses page
#house_name: the exact link text to look for
#returns the absolute url of every <a> whose text equals house_name
links_func <- function(html, house_name) {
  anchor_xpath <- paste0('//a[text()="', house_name, '"]')
  hrefs <- html_attr(html_nodes(html, xpath = anchor_xpath), "href")
  paste0("http://towerofthehand.com", hrefs)
}
#look up the page link for every House name
house_links <- sapply(houses_names, function(x) links_func(houses_read, x))
#make a df of the House names and links
house <- data.frame(house = as.character(houses_names),
                    link = as.character(unlist(house_links)))
#add the Fossoway House back in
Fossoways <- data.frame(house = "Fossoway",
                        link = "http://towerofthehand.com/reference/k/02044/index.html")
house <- rbind(house, Fossoways)
#the first word of the House name is what a character's surname is matched on
house$surname <- gsub(" .*", "", house$house)
#remove Houses that will form duplicates when trying to find a character's House
cadet_branches <- c("Arryn of Gulltown",
                    "Brune of Brownhollow",
                    "Baratheon of Dragonstone",
                    "Baratheon of King's Landing",
                    "Dayne of High Hermitage",
                    "Farwynd of Sealskin Point",
                    "Goodbrother of Shatterstone",
                    "Goodbrother of Orkmont",
                    "Kenning of Kayce",
                    "Lannister of Lannisport",
                    "Royce of Runestone",
                    "Vance of Wayfarer's Rest")
#filter with %in% rather than house[-which(...), ]: negative indexing with an
#empty index would drop every row if none of these Houses were present
house <- house[!(house$house %in% cadet_branches), ]
Each House has a sigil. To get the colours we download this sigil and use RImagePalette to extract a single colour that is representative
library(jpeg)
library(RImagePalette)
#download each House's sigil and get a colour
#house_link: address of a House page
#returns c(colour, region): colour is extracted from the sigil image (or
#"#000000" when the page has no sigil), region is the last word of the
#page's title span
colour_func <- function(house_link) {
  read <- read_html(as.character(house_link))
  sigil_src <- read %>%
    html_nodes("div.linear-rail > div:nth-child(2) > div > a > img") %>%
    html_attr("src")
  #only download a file and extract colour for those Houses that have a
  #sigil posted
  if (length(sigil_src) == 0) {
    #if there's no sigil, use black
    colour <- "#000000"
  } else {
    #download to a temporary file that is removed when the function exits,
    #instead of leaving "arms.jpg" behind in the working directory
    sigil_file <- tempfile(fileext = ".jpg")
    on.exit(unlink(sigil_file), add = TRUE)
    download.file(sigil_src, sigil_file, mode = "wb", quiet = TRUE)
    colour <- image_palette(readJPEG(sigil_file), n = 1)
  }
  region_text <- read %>%
    html_nodes("div:nth-child(2) > div.linear-rail > div.title > span") %>%
    html_text()
  #keep only the last word of the title text
  region <- gsub(".* ", "", region_text)
  c(colour, region)
}
#run colour_func over every House link and stack the results row-wise
housecolours <- do.call(rbind, lapply(house$link, colour_func))
colnames(housecolours) <- c("colours", "region")
house <- cbind(house, housecolours)
#plot a pie chart of all the Houses and their "colour"
#every House gets an equally sized slice
with(
  house,
  pie(rep(1, length(colours)), col = colours,
      main = "House Colours", labels = house)
)

It was about this point I realised passing colours to Nodes in forceNetwork() isn’t so easy. The House colours will come in useful when I plot using igraph later, but for NetworkD3, I also scraped the “location” of each House (which of the 7 Kingdoms it is based in). This is the “region” value returned by colour_func().
Unfortunately, some Houses don’t have a region listed (doing some research it seems that it is unknown) so we’ll leave those as “Westeros.” For some others the function doesn’t quite find what we’re looking for, or the information is incorrect, so some quick gsubs are necessary
#correct the region values the scrape got wrong
house$region <- as.character(house$region)
#scraped values that should read "Islands."
house$region[house$region %in% c("Farwynd.", 'Us."')] <- "Islands."
#scraped values that should read "Crownlands."
house$region[house$region %in% c("Cronwlands.", "house.", "Rebellion.")] <- "Crownlands."
#scraped value that should read "North."
house$region[house$region %in% "Stark."] <- "North."
#this House is fixed by name rather than by its scraped region
house$region[house$house %in% "Woolfield"] <- "North."
table(house$region)
Crownlands. Dorne. Islands. North. Reach. Riverlands. Stormlands.
34 18 26 30 54 27 27
Vale. Westerlands. Westeros.
27 34 9
#plot a pie chart of all the Houses and their "region"
#one row per region with the number of Houses based there
regions <- data.frame(table(house$region))
with(regions, pie(Freq, labels = Var1, main = "Locations of Houses"))

Add this colour/region information to each node using a quick lookup function
#add the colour/region to each character in the Node df
#name: a character's surname
#returns c(colour, region) for the House whose surname equals name, or
#c("#000000", "none") when no House matches
#reads the global house data frame (columns surname, region, colours)
node_func <- function(name) {
  matches <- which(house$surname == name)
  if (length(matches) == 0) {
    return(c("#000000", "none"))
  }
  #use the first matching House so the result always has length 2 and the
  #rbind() of all results stays rectangular
  first_match <- matches[1]
  #as.character() guards against factor columns, which c() would otherwise
  #collapse to their integer codes
  colour <- as.character(house$colours[first_match])
  region <- as.character(house$region[first_match])
  c(colour, region)
}
#look up colour and region for every node's surname and stack the results
colourinfo <- do.call(rbind, Nodes$surname %>% lapply(node_func))
colnames(colourinfo) <- c("colour", "region")
#cbind it to the Nodes (currently disabled)
#Nodes <- cbind(Nodes, colourinfo)
head(Nodes)
Before plotting, the names in the Links df need to be indexed to the rownames for the Nodes. Again, just use a simple lookup loop to do this.
And then plot using forceNetwork() from networkD3. I’m far from an expert at this so there’s probably some more that can be done to make the network clearer/ more aesthetic, but for now I’m not displeased with the results.
If you just want to see the output network, you can find it at my rPubs page
library(networkD3)
#networkD3 uses zero-indexing
#position of each node's name in Characters$character, shifted to start at 0
#match() does the whole lookup at once; the earlier loop grew a vector with
#append() and stored it in a variable called order, shadowing base::order()
Nodes$order <- match(Nodes$name, Characters$character) - 1
rownames(Nodes) <- Nodes$order
#sort the nodes by that zero-based index
Nodes <- Nodes[order(Nodes$order), ]
#shift the link endpoints to zero-based indices as well
Links$source <- as.numeric(Links$source) - 1
Links$target <- as.numeric(Links$target) - 1
#plot it!
#very large file
#forceNetwork(Links = Links, Nodes = Nodes, NodeID = "duplicatenames", Group = "region", Source = "source", Target = "target", Value = "value", zoom = TRUE, Nodesize = "size", fontSize = 30, linkColour = "#d8d8d8", opacity = 0.8, linkDistance = networkD3::JS("function(d) { return 100*d.value; }"), linkWidth = networkD3::JS("function(d) { return d.value/5; }"), charge = -100, legend = TRUE, colourScale = JS("d3.scale.category20()"))
