Creating the Dataset

The Graz Database on Reduplication is a collection of languages that use reduplication to convey meaning. The languages are listed in a directory spread across several pages; each language links to a table of data about the language, its features, and the purposes its reduplication serves.

The first step is to scrape the data with R. The code below walks every page of the site's language directory and scrapes each language it finds (82 at the time of writing); a commented-out early break is left in the loop for testing on just the first few languages. The table is constructed as follows:

Code for web scraping and creating the table:

rm(list = ls())
# this cell... genuinely took me forever
# ============== WEB SCRAPING SETUP ================= #
library(tidyverse) # the og
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.2
## ✔ ggplot2   4.0.0     ✔ tibble    3.3.0
## ✔ lubridate 1.9.4     ✔ tidyr     1.3.1
## ✔ purrr     1.1.0     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(rvest) # for webscraping
## Warning: package 'rvest' was built under R version 4.5.2
## 
## Attaching package: 'rvest'
## 
## The following object is masked from 'package:readr':
## 
##     guess_encoding
# ========= USER VARIABLES ( modify me :) ) =========

# one string ONLY per cell
single_keys <- c(
  "Area",
  "General Data",
  "Reduplication Form-Function",
  "Relationship Form-Function",
  "Reduplication System",
  "Comments",
  "Diachrony",
  "Productivity",
  "Repetitive Operations",
  "Stylistic Information",
  "Recursive Operations",
  "Typological Information"
)

# needs a list for every cell
multi_keys <- c(
  "Alternative Names",
  "Family/Group"
)

# by default, multi-valued
dropdown_keys <- c(
  "Functions",
  "Semantics"
)

# ======= SETUP ( don't modify me >:( ) =======
single_keys <- make.names(single_keys, unique = TRUE)
multi_keys <- make.names(multi_keys, unique = TRUE)
dropdown_keys <- make.names(dropdown_keys, unique = TRUE)
table_keys <- union(single_keys, multi_keys)
bullet_point <- "[\u0095\u2022]"

# ============== FUNCTIONS ================= #

# returns: (xml_doc) contents of the page
get_page <- function(url) return(read_html(url))

# returns: (string) name of the language
get_language_name <- function(page){
  lang_name <- page %>% 
    html_element("body") %>% html_text() %>% 
    str_match("Language:\\s*([^\\n\\()]+)") %>% 
    .[, 2] %>% 
    str_squish()
  return(lang_name)
}

# returns: (character vector) normalized whitespace
fix_ws <- function(x) {
  return(x %>% str_replace_all("\r", "") %>%
           str_replace_all("\t", " ") %>%
           str_replace_all(bullet_point, "") %>% str_squish())
}

# returns: (tibble with key/val columns) the biggest table
get_table <- function(page) {
  # --- for the table itself ---
  res <- page %>% 
    html_table(fill = TRUE) %>% 
    # grabs the biggest table by row count
    pluck(which.max(map_int(., nrow))) %>% 
    as_tibble(.name_repair = "minimal") %>% 
    select(1, 2) %>% set_names(c("key", "value")) %>% 
    # often the site lists a key once, and the rows below have blank keys. let's squish them into the above key
    mutate(
      key = fix_ws(key),
      value = fix_ws(value)
    ) %>% 
    fill(key, .direction = "down") %>% 
    group_by(key) %>% 
    summarize(value = list(value[value != ""]), .groups = "drop") %>%
    mutate(key = make.names(key, unique = TRUE)) %>% 
    filter(key %in% table_keys)
  return(res)
}

# returns: (tibble with key/val columns) the specified dropdown boxes
# note: this function *in particular* was written using ChatGPT
get_dropdown <- function(page) {
  rows <- page %>% html_elements("tr")
  rows <- rows[map_lgl(rows, ~ length(html_elements(.x, "select")) > 0)]
  if (length(rows) == 0) return(tibble(key = character(), value = list()))

  out <- map_dfr(rows, function(r) {
    cells <- r %>% html_elements(xpath = ".//th|.//td")
    if (length(cells) == 0) return(NULL)

    key <- cells[[1]] %>% html_text2() %>% fix_ws()
    key <- str_remove(key, ":\\s*$") %>% fix_ws()
    key <- make.names(key, unique = TRUE)

    if (!(key %in% dropdown_keys)) return(NULL)

    opts <- r %>% html_elements("select option") %>% html_text2() %>% fix_ws()
    opts <- opts[opts != ""]
    opts <- opts[!str_to_lower(opts) %in% c("select value", "select", "value")]
    opts <- unique(opts)

    tibble(key = key, value = list(opts))
  })
  out
}


# returns: (string or vector of strings) a single cell
make_cell <- function(key, values) {
  text <- fix_ws(str_c(values, collapse = "\n"))
  if(is.null(text) || text == "") return(NA)
  
  if(key %in% single_keys) return(text)
  
  if(key %in% multi_keys){
    split_chars = "[;,\n]"
    output <- str_split(text, split_chars)[[1]] %>% 
      fix_ws() %>% discard(~ .x == "") %>% unique()
    return(output)
  }
  
  return(NA)
}

# returns: (one row tibble) entire row for a language
scrape_language <- function(url) {
  page <- get_page(url)
  lang_name <- get_language_name(page)

  table <- get_table(page) %>%
    mutate(key = make.names(key, unique = TRUE),
      value = map2(key, value, make_cell))

  dropdown <- get_dropdown(page) %>%
    mutate(key = make.names(key, unique = TRUE))

  table <- table %>% filter(!key %in% dropdown$key) %>% bind_rows(dropdown)

  table %>%
    pivot_wider(names_from = key, values_from = value) %>%
    mutate(language = lang_name, .before = 1) %>%
    mutate(url = url)
}


# returns: (vector of strings) a list of all languages on the site
get_language_urls <- function() {
  # get the first search tree page
  base <- "https://reduplication.uni-graz.at/redup/"
  dir_root <- "tree_root.php?languagePage="
  lang_root <- "tree_lang\\.php\\?idlang=\\d+"

  first_page <- get_page(paste0(base, dir_root, 1))
  
  max_page <- first_page %>% 
    html_elements("a") %>% html_attr("href") %>% 
    str_extract("languagePage=\\d+")  %>% 
    str_remove("languagePage=") %>% 
    as.integer() %>% 
    max(na.rm = TRUE)
  
  # list of all available pages
  dir_urls <- paste0(base, dir_root, seq_len(max_page))
  
  all_hyperlinks <- character()
  
  for (dir_url in dir_urls){
    page_hyperlinks <- get_page(dir_url) %>% 
      html_elements("a") %>% html_attr("href")
    page_hyperlinks <- page_hyperlinks[str_detect(page_hyperlinks, lang_root)]
    all_hyperlinks <- c(all_hyperlinks, page_hyperlinks)
  }
  return(paste0(base, all_hyperlinks))
}

write_csv_xlsx <- function(df, path, list_sep = ";", na = "NA") {
  library(writexl)
  out <- df
  is_list <- vapply(out, is.list, logical(1))

  if (any(is_list)) {
    out[is_list] <- lapply(out[is_list], function(col) {
      vapply(col, function(x) {
        if (is.null(x) || length(x) == 0) na else paste(x, collapse = list_sep)
      }, character(1))
    })
  }

  write.csv(out, paste0(path, ".csv"), row.names = FALSE, na = na)
  writexl::write_xlsx(out, paste0(path, ".xlsx"))

  invisible(out)
}


# =============== SCRAPE THE DATA ================

lang_urls <- get_language_urls()
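
Before looping over every page, it can be worth sanity-checking scrape_language on a single URL. A quick sketch (test_row is just an illustrative name):

# scrape one language page and inspect the resulting one-row tibble
test_row <- scrape_language(lang_urls[[1]])
glimpse(test_row)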

# we'll scrape using a list of tibbles, before we make the big table
# makes it way easier to work with because the data is really messy (like, diff columns per page)
tibble_table <- vector("list", length(lang_urls))
i <- 1

for (i in seq_along(lang_urls)) {
  lang <- lang_urls[[i]]
  tibble_table[[i]] <- scrape_language(lang)
  Sys.sleep(0.1)
  # if (i > 5) break   # uncomment to test on just the first few languages
  if(i%%10 == 0) print(paste0(i, "/", length(lang_urls), " scraped so far"))
}
## [1] "10/82 scraped so far"
## [1] "20/82 scraped so far"
## [1] "30/82 scraped so far"
## [1] "40/82 scraped so far"
## [1] "50/82 scraped so far"
## [1] "60/82 scraped so far"
## [1] "70/82 scraped so far"
## [1] "80/82 scraped so far"
master_table <- bind_rows(tibble_table)
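
With all 82 pages scraped, the write_csv_xlsx helper defined above can write the combined table to disk, collapsing the list columns with a separator. A minimal sketch, assuming the writexl package is installed; the file stem "graz_reduplication" is just an example:

# save the raw scraped table as .csv and .xlsx; list columns are collapsed with ";"
write_csv_xlsx(master_table, "graz_reduplication")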

Tidying the data for analysis:

# things that can be analyzed:
# area -- 6 macro-areas
# family.group -- family + subgroup
# functions -- pluralization, intensification etc
# semantics -- continuity, intensity etc
# relationship.form.function -- 4 level of typology
# word order -- SOV, SVO etc
# reduplication.system -- messy, but I want to analyze this... full redup, no affix redup etc
# productivity -- fully productive vs lexicalized vs unproductive
# typological.information -- polysynthetic, agglutinative, tones, ergative...
# repetitive.operations

master_table <- master_table %>% 
  mutate( Word.Order = str_extract( as.character(Typological.Information), "(?:SOV|SVO|VSO|VOS|OSV|OVS)" ) ) %>% 
  mutate(Word.Order = na_if(Word.Order, ""))

# to-do: split family groups (see the sketch after this chunk)


# functions-centric table
functions_table <- master_table %>% 
  select(language, Functions) %>% 
  unnest_longer(Functions) %>% 
  filter(Functions != "", !is.na(Functions)) %>% 
  distinct(.keep_all = TRUE)
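
One way to start on the family-group to-do above is to unnest Family.Group into a long table, mirroring functions_table; splitting family from subgroup could then follow. A sketch, assuming Family.Group is the list column produced by make_cell (family_table is an illustrative name):

# family-centric table: one row per language / family-group label
family_table <- master_table %>%
  select(language, Family.Group) %>%
  unnest_longer(Family.Group) %>%
  filter(!is.na(Family.Group), Family.Group != "") %>%
  distinct()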

Making Sense of the Data

While this is not a comprehensive list of every language that utilizes reduplication, this sample can nonetheless provide us with some valuable insights.

master_table %>%
  unnest_longer(Functions, values_to = "func") %>%
  mutate(func = str_squish(func)) %>%
  filter(!is.na(func), func != "") %>%
  count(func, sort = TRUE) %>%
  slice_head(n = 15) %>%
  ggplot(aes(x = n, y = forcats::fct_reorder(func, n))) +
  geom_col() +
  labs(x = "Count", y = NULL, title = "Top Functions")

master_table2 <- master_table %>%
  unnest_longer(Semantics, values_to = "sem") %>%
  mutate(
    sem  = str_squish(as.character(sem)),
    Area = str_squish(as.character(Area))
  ) %>%
  filter(!is.na(sem), sem != "", !is.na(Area), Area != "") %>%
  count(Area, sem)

ggplot(master_table2, aes(Area, sem, fill = n)) +
  geom_tile(width = 0.95, height = 0.95) +
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

master_table %>%
  unnest_longer(Functions, values_to = "func") %>%
  mutate(func = str_squish(func)) %>%
  filter(!is.na(func), func != "", !is.na(Word.Order)) %>%
  count(func, Word.Order) %>%
  group_by(func) %>% mutate(total = sum(n)) %>% ungroup() %>%
  ggplot(aes(x = fct_reorder(func, total), y = n, fill = Word.Order)) +
  geom_col(position = "fill") +
  coord_flip()