Библиотеки

library(xml2)
library(dplyr)
library(purrr)
library(stringr)
library(tibble)
library(widyr)
library(igraph)
library(ggraph)

Загрузка файла

doc <- read_xml("C:/Users/prodn/Downloads/War_and_Peace.xml")

# Удаляем namespace, чтобы упростить запросы
doc <- doc |> xml_ns_strip()

Извлечение списка персонажей

# В TEI-разметке список персонажей хранится в listPerson
persons <- doc |>
  xml_find_all(".//listPerson/person") |>
  map_df(\(node){
    
    # Уникальный идентификатор персонажа
    id <- node |> xml_attr("id")
    
    # Имя персонажа собираем из всех частей persName
    name <- node |>
      xml_find_all(".//persName/*") |>
      xml_text() |>
      str_squish() |>
      str_c(collapse = " ")
    
    tibble(
      id = id,
      name = name
    )
    
  })
nrow(persons)
## [1] 84
head(persons)
## # A tibble: 6 × 2
##   id                name                           
##   <chr>             <chr>                          
## 1 Mavra_Kuzminishna "Мавра Кузминишна"             
## 2 Ilyin             "Ильин"                        
## 3 Dron_Zakharych    ""                             
## 4 Count_Ilya_Rostov "Илья Андреевич Андреич Ростов"
## 5 Lavrushka         "Лаврушка"                     
## 6 Bilibin           "Билибин"

Извлечение упоминаний

mentions <- doc |>
  xml_find_all(".//rs[@ref]") |>
  map_df(\(node){

    tibble(
      character = node |>
        xml_attr("ref") |>
        str_remove("#"),

      paragraph = node |>
        xml_find_first("ancestor::p") |>
        xml_path()
    )

  })

Создание и визуалиция графа

edges <- mentions |>
  pairwise_count(character, paragraph, sort = TRUE) |>
  rename(weight = n)

edges <- edges |>
  filter(weight >= 40) # иначе слишком много вершин

g <- graph_from_data_frame(edges, directed = FALSE)

g |>
  ggraph(layout = "fr") +
  geom_edge_link(alpha = 0.2) +
  geom_node_point(size = 3) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  theme_void()

## Описание графа

vcount(g)
## [1] 22
ecount(g)
## [1] 76
edge_density(g)
## [1] 0.3290043
components(g)
## $membership
##                                  Sonya_Rostova 
##                                              1 
##                                 NatashaRostova 
##                                              1 
##                                Pierre_Bezukhov 
##                                              1 
##                                AndreyBolkonsky 
##                                              1 
##                                 Nikolai_Rostov 
##                                              1 
##                        Vasily__Vasska__Denisov 
##                                              1 
##                    Princess_Mariya_Bolkonskaya 
##                                              1 
##                       Prince_Nikolay_Bolkonsky 
##                                              1 
##                       Countess_Natalya_Rostova 
##                                              1 
##                              Count_Ilya_Rostov 
##                                              1 
##                     Tsar_Alexander_I_of_Russia 
##                                              1 
##                   Mikhail_Ilarionovich_Kutuzov 
##                                              1 
##                                  HeleneKuragin 
##                                              1 
##                             Napoleon_Bonaparte 
##                                              1 
##                                 Vasili_Kuragin 
##                                              1 
##                       Fedor_Ivanovich_Dolokhov 
##                                              1 
##                         Mademoiselle_Bourienne 
##                                              1 
##                               Boris_Drubetskoy 
##                                              1 
##                                   Petya_Rostov 
##                                              1 
##          Princess_Anna_Mikhaylovna_Drubetskaya 
##                                              1 
##                                Platon_Karataev 
##                                              1 
## Princess_Elisabeta__Lisa__Karlovna_Bolkonskaya 
##                                              1 
## 
## $csize
## [1] 22
## 
## $no
## [1] 1

Тип связи

mentions <- doc |>
  xml_find_all(".//rs[@ref]") |>
  purrr::map_df(\(node){

    tibble::tibble(
      character = node |>
        xml_attr("ref") |>
        stringr::str_remove("#"),

      paragraph = node |>
        xml_find_first("ancestor::p") |>
        xml_path(),

      tie_type = ifelse(
        length(xml2::xml_find_all(node, "ancestor::sp")) > 0,
        "dialogue",
        "narration"
      )
    )

  }) |>
  dplyr::filter(!is.na(paragraph))


# считаем совместные упоминания
edges <- mentions |>
  widyr::pairwise_count(character, paragraph, sort = TRUE) |>
  dplyr::rename(weight = n)


# создаём тип связи на основе веса
edges <- edges |>
  dplyr::mutate(
    tie_type = dplyr::case_when(
      weight >= 150 ~ "family / very strong interaction",
      weight >= 70  ~ "close interaction",
      weight >= 20  ~ "regular interaction",
      TRUE ~ "weak interaction"
    )
  )


edges |>
  head(10)
## # A tibble: 10 × 4
##    item1           item2           weight tie_type                        
##    <chr>           <chr>            <dbl> <chr>                           
##  1 Sonya_Rostova   NatashaRostova     210 family / very strong interaction
##  2 NatashaRostova  Sonya_Rostova      210 family / very strong interaction
##  3 NatashaRostova  Pierre_Bezukhov    178 family / very strong interaction
##  4 Pierre_Bezukhov NatashaRostova     178 family / very strong interaction
##  5 Pierre_Bezukhov AndreyBolkonsky    158 family / very strong interaction
##  6 AndreyBolkonsky Pierre_Bezukhov    158 family / very strong interaction
##  7 Nikolai_Rostov  NatashaRostova     138 close interaction               
##  8 NatashaRostova  Nikolai_Rostov     138 close interaction               
##  9 NatashaRostova  AndreyBolkonsky    128 close interaction               
## 10 AndreyBolkonsky NatashaRostova     128 close interaction

Степень центральности

# считаем степень узла
V(g)$degree <- igraph::degree(g)

node_attributes <- tibble::tibble(
  character = V(g)$name,
  degree = V(g)$degree
) |>
  dplyr::arrange(desc(degree))

node_attributes |>
  head(10)
## # A tibble: 10 × 2
##    character                   degree
##    <chr>                        <dbl>
##  1 NatashaRostova                  20
##  2 Nikolai_Rostov                  20
##  3 Pierre_Bezukhov                 18
##  4 AndreyBolkonsky                 12
##  5 Princess_Mariya_Bolkonskaya     12
##  6 Sonya_Rostova                    8
##  7 Countess_Natalya_Rostova         8
##  8 Count_Ilya_Rostov                8
##  9 Vasily__Vasska__Denisov          6
## 10 Tsar_Alexander_I_of_Russia       6

Анализ сообществ

comm <- cluster_louvain(g)

V(g)$community <- membership(comm)

Визуализация сообществ

g |>
  ggraph(layout = "fr") +
  geom_edge_link(alpha = 0.2) +
  geom_node_point(aes(color = factor(community)), size = 4) +
  geom_node_text(aes(label = name), repel = TRUE, size = 3) +
  theme_void()

Модулярность

modularity(comm)
## [1] 0.3074934

Сообщества умеренно разделены

Подграф

ego_nodes <- ego(g, order = 1, nodes = "Nikolai_Rostov")

sub_g <- induced_subgraph(g, ego_nodes[[1]])

Визуализация подграфа (только сильные связи)

ego_nodes <- ego(g, order = 1, nodes = "Nikolai_Rostov")

sub_g <- induced_subgraph(g, ego_nodes[[1]])

# оставляем только сильные связи
sub_g <- delete_edges(sub_g, E(sub_g)[weight < 40])

# удаляем изолированные вершины
sub_g <- delete_vertices(sub_g, degree(sub_g) == 0)

sub_g |>
  ggraph(layout = "fr") +
  geom_edge_link(aes(width = weight), alpha = 0.4) +
  scale_edge_width(range = c(0.3, 2)) +
  geom_node_point(size = 6, color = "steelblue") +
  geom_node_text(aes(label = name), repel = TRUE) +
  theme_void()