Download data

# conda install -c enversion subversion
# Copy the GitHub URL of the directory you want to download, then
# change 'tree' to 'trunk' and remove the branch name ('master' or 'main').
svn checkout https://github.com/Sefaria/Sefaria-Export/trunk/txt/Tanakh

Prepare environment

library(tidytext)
library(stringr)
library(magrittr)
library(tibble)
library(tidyr)
library(dplyr)
library(ggplot2)
library(Unicode)
library(purrr)

Load data

# Locate every book's cantillated Hebrew text file and record, per file, the
# full path plus the section ("part") and book it belongs to.
#
# Book and part are read from the directories directly above the file rather
# than from fixed positions of the split absolute path, so the result no
# longer depends on how deep the home directory is.
# Assumes the layout <root>/<part>/<book>/<language>/<file> — TODO confirm
# against the checked-out tree.
taamim_files <- list.files(
  path = "~/Tanakh",
  pattern = "Tanach with Ta'amei Hamikra.txt",
  recursive = TRUE,
  full.names = TRUE
)
# Two levels up from the file: .../<part>/<book>
book_dirs <- dirname(dirname(taamim_files))
# Column names and order (".", lvl5, lvl6) are kept so the renaming step that
# follows works unchanged.
data <- tibble(
  `.` = taamim_files,
  lvl5 = basename(dirname(book_dirs)),
  lvl6 = basename(book_dirs)
)
## Warning: Expected 7 pieces. Additional pieces discarded in 39 rows [1, 2, 3, 4,
## 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16, 17, 18, 19, 20, ...].
# Give the three positional columns meaningful names and put the book first.
names(data) <- c("path", "part", "book")
data <- select(data, book, part, path)
# Row names are intentionally not set on the tibble: doing so is deprecated
# (it raises a warning), and the later steps label matrix rows from
# data$book explicitly instead of reading row names from `data`.
## Warning: Setting row names on a tibble is deprecated.
# data$path <- file.path("Tanakh", data$path)

#' Read a book file and drop its header.
#'
#' @param bookpath Path to a book's text file.
#' @return A tibble with one row per remaining line: `line` (1-based index
#'   after the header is removed) and `psukim` (the line's text). Files with
#'   six or fewer lines yield a zero-row tibble.
read_no_head <- function(bookpath) {
  book <- readLines(bookpath, ok = TRUE, warn = FALSE)
  # The first six lines are skipped as a header. tail(x, -6) returns an empty
  # vector for short files, whereas book[7:length(book)] would count
  # backwards and produce NA entries.
  no_head <- utils::tail(book, -6L)
  # seq_along() is safe for the empty case (1:0 would give c(1, 0)).
  tibble(line = seq_along(no_head), psukim = no_head)
}

# Attach each book's text as a list-column of per-line tibbles.
data$text <- map(data$path, read_no_head)

# Cantillation marks of interest, by code point:
# U+05A1 (pazer), U+0595 (zaqef gadol), U+0598 (zarqa), U+059E (gershayim).
special <- c(
  "\u05a1",
  "\u0595",
  "\u0598",
  "\u059E"
)
#' Count occurrences of the first and fourth marks of `special` in a file.
#'
#' @param bookpath Path to a book's text file.
#' @return Total number of occurrences of either mark over all lines of the
#'   file (header lines included).
sum_taam <- function(bookpath) {
  book <- readLines(bookpath, ok = TRUE, warn = FALSE)
  # Build ONE character-class pattern from the two marks. Passing the
  # length-2 vector straight to str_count() pairs pattern elements with
  # lines (or is rejected for mismatched lengths), so each line would be
  # checked for only one of the two marks.
  wanted <- paste0("[", paste(special[c(1, 4)], collapse = ""), "]")
  sum(str_count(book, wanted))
}

#' Read a book file and strip it down to what is left after removing Latin
#' letters, digits, parentheses and the U+05B0-U+05EA block.
#'
#' @param bookpath Path to a book's text file.
#' @return Character vector, one element per line after the six header lines.
#'   Characters outside the removed sets (e.g. code points below U+05B0 and
#'   spaces) are kept. Files with six or fewer lines yield `character(0)`.
read_taam <- function(bookpath) {
  book <- readLines(bookpath, ok = TRUE, warn = FALSE)
  # Drop the six header lines; tail() stays empty for short files where
  # book[7:length(book)] would count backwards and return NA entries.
  no_head <- utils::tail(book, -6L)
  # NOTE(review): in code-point order the range A-z also spans the six
  # characters between "Z" and "a" ([ \ ] ^ _ `); kept as-is so output does
  # not change — confirm whether that is intended.
  no_chapters <- gsub("[A-z]", "", no_head)
  no_numbers <- gsub("[0-9]", "", no_chapters)
  no_brackets <- gsub("[\\(\\)]", "", no_numbers)
  # Remove everything from U+05B0 through U+05EA (the vowel points and the
  # letters both fall inside this range).
  gsub("[\u05B0-\u05EA]", "", no_brackets)
}

#' Count characters in the range U+05D0-U+05EA in a file.
#'
#' @param bookpath Path to a book's text file.
#' @return Total number of matching characters over all lines of the file.
sum_ot <- function(bookpath) {
  letter_class <- "[\u05D0-\u05EA]"
  file_lines <- readLines(bookpath, ok = TRUE, warn = FALSE)
  per_line <- str_count(file_lines, letter_class)
  sum(per_line)
}

#' Count space characters in a file, used as a proxy for the word count.
#'
#' @param bookpath Path to a book's text file.
#' @return Total number of single-space characters over all lines of the
#'   file (every line is read, including the leading ones).
sum_words <- function(bookpath) {
  file_lines <- readLines(bookpath, ok = TRUE, warn = FALSE)
  spaces_per_line <- str_count(file_lines, " ")
  sum(spaces_per_line)
}


# Per-book totals, one value per file path.
data$special <- unlist(lapply(data$path, sum_taam))   # selected marks
data$ot <- unlist(lapply(data$path, sum_ot))          # characters U+05D0-U+05EA
data$words <- unlist(lapply(data$path, sum_words))    # space characters

# Selected marks relative to the space count of the same book.
data$specialratio <- data$special / data$words

# data %>% select(-path) %>% View()
# names(data)[6] <- "גרשיים+פזר /סך האותיות"

Percent of גרשיים and פזר in each book

# One point per book, books ordered along the x axis by their ratio and
# coloured by part; x labels are slanted so the book names stay readable.
ggplot(
  data,
  aes(x = reorder(book, specialratio), y = specialratio, color = part)
) +
  geom_point() +
  theme_test() +
  theme(axis.text.x = element_text(angle = -45, hjust = 0))

Unicode data

# strtoi("05C3", 16L) # Convert hex number to decimal, for easier ranges handling.
# strtoi("05C3", 16L) # Convert hex number to decimal, for easier ranges handling.

# Build a lookup table for a set of decimal code points: the code point, its
# Unicode label, the character itself, and a category tag.
unicode_table <- function(dec, category) {
  chars <- data.frame(dec = dec)
  chars$names <- u_char_label(as.u_char(chars$dec))
  chars$sign <- intToUtf8(chars$dec, multiple = TRUE)
  chars$category <- category
  chars
}

# U+0591-U+05AE plus U+05C3
teamim <- unicode_table(c(1425:1454, 1475), "teaamim")
# U+05D0-U+05EA
hebrew <- unicode_table(1488:1514, "otiyot")
# U+05B0-U+05BC
nikud <- unicode_table(1456:1468, "nikud")

# `hebrew` ends up holding all three categories stacked together.
hebrew <- bind_rows(hebrew, teamim, nikud)

הפונקציה mod_tokenization היא סוס העבודה של הניתוח בשלב הזה. הפונקציה מחשבת את האחוז היחסי של כל אחד מהטעמים בספר ומחזירה את זה כטבלה. מגבלות ידועות: הפונקציה לא סופרת מילים אשר לא מופיע עליהן טעם (בעיקר במקרים שבהם יש מילים בנסמך). הפונקציה גם מחשבת את מספר המילים הממוצע בפסוק.

#' Relative frequency of each mark in one book.
#'
#' @param df A data frame with a `line` column and a `psukim` text column.
#' @return A one-row data frame (the "ratio" row of the transposed count
#'   table) with one column per mark found, plus a "׃" column whose count is
#'   the number of distinct lines containing at least one marked word. Values
#'   come out of a transposed mixed-type table, so they are stored as text.
mod_tokenization <- function(df) {
  # One row per word token; keep the first character in U+0590-U+05AE found
  # in each word (NA when the word carries none).
  words <- unnest_tokens(df, word, psukim)
  words$taam <- str_extract(words$word, "[\u0590-\u05AE]")

  marked <- filter(words, !is.na(taam))
  psukim_n <- length(unique(pull(marked, line)))

  # Occurrences per mark, with the line count appended as its own entry.
  counts <- count(marked, taam)
  counts <- bind_rows(counts, list(taam = "׃", n = psukim_n))
  counts <- mutate(counts, ratio = n / sum(n, na.rm = TRUE))

  # Transpose so marks become columns: row 1 = taam, row 2 = n, row 3 = ratio.
  wide <- data.frame(t(counts))
  colnames(wide) <- wide[1, ]
  wide[3, ]
}
  
# Compute the per-mark ratio row for every book and keep it as a list-column.
data <- mutate(data, tokenized = map(text, mod_tokenization))
# data$tokenized[[1]]
# tt <- data$text[[1]] %>% unnest_tokens(word, psukim)
# tt$taam <- tt$word %>% str_extract("[\u0590-\u05AE]")
# psukim_n <- tt %>% filter(!is.na(taam)) %>% unique() %>% length()
# tt %<>% filter(!is.na(taam)) 
#  tt$line %>% unique() %>% length()
# count(taam) %>%
#   bind_rows(c(taam = "sof", n = psukim_n))
# psukim_n <- tt$line %>% max()
# Spread each book's ratio row into columns, keep only the columns named
# after a character in `teamim$sign`, and label the rows by book.
per_book_wide <- unnest_auto(data, tokenized)
ratios <- as.matrix(select(per_book_wide, any_of(teamim$sign)))
rownames(ratios) <- data$book
# data %<>% mutate(taam = map(tokenized, get_taam))
# data$text[[1]]
# 
# data %>% unnest_longer(text, values_to = psukim)
# A mark that never occurs in a book has no value for it; treat that as zero.
ratios[is.na(ratios)] <- 0
# Convert the matrix to numeric in place. Unlike apply(ratios, 2, as.numeric),
# this keeps the row/column names and the matrix shape even when there is
# only a single row.
storage.mode(ratios) <- "double"
library(gghighlight)
library(ggrepel)
# Principal components of the per-book mark ratios; scores are labelled by
# book so they can be joined back to the book metadata.
res.pca <- prcomp(ratios)
rownames(res.pca$x) <- data$book

pca_scores <- rownames_to_column(as.data.frame(res.pca$x), "book")
pl <- ggplot(
  left_join(pca_scores, data, by = "book"),
  aes(x = PC1, y = PC2, color = part, label = book)
) +
  # geom_point(size=3) +
  # gghighlight(PC1 > 0.1, label_key = type) +
  geom_text() +
  theme_test() +
  theme(legend.position = "top")
# plotly::ggplotly(pl)
pl

Without EMET books

# Rows kept for this run: all of 1-39 except 33, 36 and 37.
# NOTE(review): presumably those three positions are the EMET books; the
# positions depend on the order in which the files were listed — verify.
keep_rows <- setdiff(1:39, c(33, 36, 37))

en_emet <- ratios[keep_rows, ]
res.pca <- prcomp(en_emet)
rownames(res.pca$x) <- data$book[keep_rows]

pca_scores <- rownames_to_column(as.data.frame(res.pca$x), "book")
pl <- ggplot(
  left_join(pca_scores, data, by = "book"),
  aes(x = PC1, y = PC2, color = part, label = book)
) +
  geom_point(size = 4) +
  theme_test() +
  theme(legend.position = "top")
plotly::ggplotly(pl)

PCoA

library(vegan)
library(ape)
# Principal coordinates analysis on Bray-Curtis dissimilarities of the
# log(x + 1)-transformed ratios.
ratios.log <- log(ratios + 1)
ratios.D <- vegdist(ratios.log, method = "bray")
# rownames(ratios.D) <- data$book
res <- pcoa(ratios.D)
# Label the ordination axes by book before drawing the biplot.
rownames(res$vectors) <- data$book
biplot(res)