library(xml2)
library(dplyr)
library(purrr)
library(stringr)
library(tibble)
library(widyr)
library(igraph)
library(ggraph)
doc <- read_xml("C:/Users/prodn/Downloads/War_and_Peace.xml")
# Удаляем namespace, чтобы упростить запросы
doc <- doc |> xml_ns_strip()
# В TEI-разметке список персонажей хранится в listPerson
persons <- doc |>
xml_find_all(".//listPerson/person") |>
map_df(\(node){
# Уникальный идентификатор персонажа
id <- node |> xml_attr("id")
# Имя персонажа собираем из всех частей persName
name <- node |>
xml_find_all(".//persName/*") |>
xml_text() |>
str_squish() |>
str_c(collapse = " ")
tibble(
id = id,
name = name
)
})
nrow(persons)
## [1] 84
head(persons)
## # A tibble: 6 × 2
## id name
## <chr> <chr>
## 1 Mavra_Kuzminishna "Мавра Кузминишна"
## 2 Ilyin "Ильин"
## 3 Dron_Zakharych ""
## 4 Count_Ilya_Rostov "Илья Андреевич Андреич Ростов"
## 5 Lavrushka "Лаврушка"
## 6 Bilibin "Билибин"
mentions <- doc |>
xml_find_all(".//rs[@ref]") |>
map_df(\(node){
tibble(
character = node |>
xml_attr("ref") |>
str_remove("#"),
paragraph = node |>
xml_find_first("ancestor::p") |>
xml_path()
)
})
edges <- mentions |>
pairwise_count(character, paragraph, sort = TRUE) |>
rename(weight = n)
edges <- edges |>
filter(weight >= 40) # иначе слишком много вершин
g <- graph_from_data_frame(edges, directed = FALSE)
g |>
ggraph(layout = "fr") +
geom_edge_link(alpha = 0.2) +
geom_node_point(size = 3) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
theme_void()
## Описание графа
vcount(g)
## [1] 22
ecount(g)
## [1] 76
edge_density(g)
## [1] 0.3290043
components(g)
## $membership
## Sonya_Rostova
## 1
## NatashaRostova
## 1
## Pierre_Bezukhov
## 1
## AndreyBolkonsky
## 1
## Nikolai_Rostov
## 1
## Vasily__Vasska__Denisov
## 1
## Princess_Mariya_Bolkonskaya
## 1
## Prince_Nikolay_Bolkonsky
## 1
## Countess_Natalya_Rostova
## 1
## Count_Ilya_Rostov
## 1
## Tsar_Alexander_I_of_Russia
## 1
## Mikhail_Ilarionovich_Kutuzov
## 1
## HeleneKuragin
## 1
## Napoleon_Bonaparte
## 1
## Vasili_Kuragin
## 1
## Fedor_Ivanovich_Dolokhov
## 1
## Mademoiselle_Bourienne
## 1
## Boris_Drubetskoy
## 1
## Petya_Rostov
## 1
## Princess_Anna_Mikhaylovna_Drubetskaya
## 1
## Platon_Karataev
## 1
## Princess_Elisabeta__Lisa__Karlovna_Bolkonskaya
## 1
##
## $csize
## [1] 22
##
## $no
## [1] 1
mentions <- doc |>
xml_find_all(".//rs[@ref]") |>
purrr::map_df(\(node){
tibble::tibble(
character = node |>
xml_attr("ref") |>
stringr::str_remove("#"),
paragraph = node |>
xml_find_first("ancestor::p") |>
xml_path(),
tie_type = ifelse(
length(xml2::xml_find_all(node, "ancestor::sp")) > 0,
"dialogue",
"narration"
)
)
}) |>
dplyr::filter(!is.na(paragraph))
# считаем совместные упоминания
edges <- mentions |>
widyr::pairwise_count(character, paragraph, sort = TRUE) |>
dplyr::rename(weight = n)
# создаём тип связи на основе веса
edges <- edges |>
dplyr::mutate(
tie_type = dplyr::case_when(
weight >= 150 ~ "family / very strong interaction",
weight >= 70 ~ "close interaction",
weight >= 20 ~ "regular interaction",
TRUE ~ "weak interaction"
)
)
edges |>
head(10)
## # A tibble: 10 × 4
## item1 item2 weight tie_type
## <chr> <chr> <dbl> <chr>
## 1 Sonya_Rostova NatashaRostova 210 family / very strong interaction
## 2 NatashaRostova Sonya_Rostova 210 family / very strong interaction
## 3 NatashaRostova Pierre_Bezukhov 178 family / very strong interaction
## 4 Pierre_Bezukhov NatashaRostova 178 family / very strong interaction
## 5 Pierre_Bezukhov AndreyBolkonsky 158 family / very strong interaction
## 6 AndreyBolkonsky Pierre_Bezukhov 158 family / very strong interaction
## 7 Nikolai_Rostov NatashaRostova 138 close interaction
## 8 NatashaRostova Nikolai_Rostov 138 close interaction
## 9 NatashaRostova AndreyBolkonsky 128 close interaction
## 10 AndreyBolkonsky NatashaRostova 128 close interaction
# считаем степень узла
V(g)$degree <- igraph::degree(g)
node_attributes <- tibble::tibble(
character = V(g)$name,
degree = V(g)$degree
) |>
dplyr::arrange(desc(degree))
node_attributes |>
head(10)
## # A tibble: 10 × 2
## character degree
## <chr> <dbl>
## 1 NatashaRostova 20
## 2 Nikolai_Rostov 20
## 3 Pierre_Bezukhov 18
## 4 AndreyBolkonsky 12
## 5 Princess_Mariya_Bolkonskaya 12
## 6 Sonya_Rostova 8
## 7 Countess_Natalya_Rostova 8
## 8 Count_Ilya_Rostov 8
## 9 Vasily__Vasska__Denisov 6
## 10 Tsar_Alexander_I_of_Russia 6
comm <- cluster_louvain(g)
V(g)$community <- membership(comm)
g |>
ggraph(layout = "fr") +
geom_edge_link(alpha = 0.2) +
geom_node_point(aes(color = factor(community)), size = 4) +
geom_node_text(aes(label = name), repel = TRUE, size = 3) +
theme_void()
modularity(comm)
## [1] 0.3074934
ego_nodes <- ego(g, order = 1, nodes = "Nikolai_Rostov")
sub_g <- induced_subgraph(g, ego_nodes[[1]])
ego_nodes <- ego(g, order = 1, nodes = "Nikolai_Rostov")
sub_g <- induced_subgraph(g, ego_nodes[[1]])
# оставляем только сильные связи
sub_g <- delete_edges(sub_g, E(sub_g)[weight < 40])
# удаляем изолированные вершины
sub_g <- delete_vertices(sub_g, degree(sub_g) == 0)
sub_g |>
ggraph(layout = "fr") +
geom_edge_link(aes(width = weight), alpha = 0.4) +
scale_edge_width(range = c(0.3, 2)) +
geom_node_point(size = 6, color = "steelblue") +
geom_node_text(aes(label = name), repel = TRUE) +
theme_void()