library(tidyverse)
# Load the 18th-century data and keep only rows whose `Ok for festschrift?`
# value differs from the literal string "yess" (filter() also drops rows
# where that comparison is NA).
# NOTE(review): "yess" looks like it may be a typo for "yes" - confirm intent.
d18th <- read_csv("18th-data.csv") %>%
  filter(`Ok for festschrift?` != "yess")

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  `Earliest letter` = col_double(),
  Frequency = col_double(),
  `Total DF after` = col_double(),
  `Total DF before` = col_double(),
  word_daterange_start = col_double(),
  sense_daterange_start = col_double(),
  MultiSenders = col_logical(),
  MultiRec = col_logical(),
  Year = col_double(),
  YearUncertain = col_logical(),
  WordCount = col_double(),
  AddressFormula = col_logical(),
  ClosingFormula = col_logical(),
  CorrespondentNotes = col_logical(),
  LetterNumber = col_logical(),
  Copyright = col_logical(),
  Complete = col_logical(),
  Updated = col_datetime(format = ""),
  `non-normalized` = col_double(),
  words = col_double()
  # ... with 16 more columns
)
ℹ Use `spec()` for the full column specifications.
# Load the 18th-century sample information (used as `d2` in g() below).
s18th <- read_csv(file = "18th-sampleinfo.csv")

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  MultiSenders = col_logical(),
  MultiRec = col_logical(),
  Year = col_double(),
  YearUncertain = col_logical(),
  WordCount = col_double(),
  AddressFormula = col_logical(),
  ClosingFormula = col_logical(),
  LetterNumber = col_logical(),
  Copyright = col_logical(),
  Complete = col_logical(),
  Updated = col_datetime(format = ""),
  `non-normalized` = col_double(),
  words = col_double(),
  SenderYBirth = col_double(),
  SenderYDeath = col_double(),
  SenderReligion = col_logical(),
  SenderSentLettcont = col_logical(),
  SenderRecLettcont = col_logical(),
  SenderUpdated = col_datetime(format = ""),
  RecipientYBirth = col_double()
  # ... with 9 more columns
)
ℹ Use `spec()` for the full column specifications.
# Load the 17th-century data and keep only rows whose `Ok for festschrift?`
# value differs from the literal string "yess" (filter() also drops rows
# where that comparison is NA).
# NOTE(review): "yess" looks like it may be a typo for "yes" - confirm intent.
d17th <- read_csv("17th-data.csv") %>%
  filter(`Ok for festschrift?` != "yess")

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  `Earliest letter` = col_double(),
  Frequency = col_double(),
  `Total DF after` = col_double(),
  `Total DF before` = col_double(),
  word_daterange_start = col_double(),
  sense_daterange_start = col_double(),
  MultiSenders = col_logical(),
  MultiRec = col_logical(),
  Year = col_double(),
  YearUncertain = col_logical(),
  WordCount = col_double(),
  AddressFormula = col_logical(),
  ClosingFormula = col_logical(),
  CorrespondentNotes = col_logical(),
  LetterNumber = col_logical(),
  Copyright = col_logical(),
  Complete = col_logical(),
  Updated = col_datetime(format = ""),
  `non-normalized` = col_double(),
  words = col_double()
  # ... with 10 more columns
)
ℹ Use `spec()` for the full column specifications.
# Load the 17th-century sample information (used as `d2` in g() below).
s17th <- read_csv(file = "17th-sampleinfo.csv")

── Column specification ──────────────────────────────────────────────────────────────────────────────────────────────────
cols(
  .default = col_character(),
  MultiSenders = col_logical(),
  MultiRec = col_logical(),
  Year = col_double(),
  YearUncertain = col_logical(),
  WordCount = col_double(),
  AddressFormula = col_logical(),
  ClosingFormula = col_logical(),
  LetterNumber = col_logical(),
  Copyright = col_logical(),
  Complete = col_logical(),
  Updated = col_datetime(format = ""),
  `non-normalized` = col_double(),
  words = col_double(),
  SenderYBirth = col_double(),
  SenderYDeath = col_double(),
  SenderUpdated = col_datetime(format = ""),
  RecipientYBirth = col_double(),
  RecipientYDeath = col_double(),
  RecipientUpdated = col_datetime(format = ""),
  SenderAge = col_double()
  # ... with 3 more columns
)
ℹ Use `spec()` for the full column specifications.
# Per-group counts from `d1`, normalised by per-group totals from `d2`.
#
# Arguments:
#   d1        Data frame with columns Lemma, Sender, Letters and the grouping
#             column. Each row is counted as one token.
#   d2        Data frame with columns Sender, WordCount and the grouping
#             column. Each row is counted as one letter.
#   grouping  Unquoted name of the column to group by (must exist in both).
#
# Returns a tibble with one row per group found in `d2` (right join), sorted
# by descending tokens_per_10k_words. Groups absent from `d1` carry NA in the
# d1-derived columns and in every ratio computed from them.
g <- function(d1, d2, grouping) {
  group_var <- enquo(grouping)
  # The join key must be given as a string, so derive it from the quosure.
  group_name <- quo_name(group_var)

  # Counts taken from d1: rows, distinct lemmas, senders and letters.
  d1_counts <- d1 %>%
    group_by(!!group_var) %>%
    summarize(
      tokens = n(),
      lemmas = n_distinct(Lemma),
      authors = n_distinct(Sender),
      letters = n_distinct(Letters)
    )

  # Totals taken from d2: distinct senders, summed word counts, row count.
  d2_totals <- d2 %>%
    group_by(!!group_var) %>%
    summarize(
      nauthors = n_distinct(Sender),
      nwords = sum(WordCount),
      nletters = n()
    )

  d1_counts %>%
    right_join(d2_totals, by = group_name) %>%
    mutate(
      tokens_per_10k_words = tokens / nwords * 10000,
      letter_proportion = letters / nletters,
      author_proportion = authors / nauthors,
      lemmas_per_10k_words = lemmas / nwords * 10000
    ) %>%
    arrange(desc(tokens_per_10k_words)) %>%
    relocate(
      !!group_var,
      tokens_per_10k_words,
      lemmas_per_10k_words,
      letter_proportion,
      author_proportion,
      lemmas,
      letters,
      authors,
      nwords,
      nletters,
      nauthors
    )
}

Distinct values in each column

17th

# Number of distinct values (NA counted as a value) in every column of d17th.
d17th %>%
  summarize(across(everything(), ~ length(unique(.x))))

18th

# Number of distinct values (NA counted as a value) in every column of d18th.
d18th %>%
  summarize(across(everything(), ~ length(unique(.x))))

Sex

17th

# g() summary of d17th against s17th, grouped by SenderSex.
g(d17th, s17th, SenderSex)

18th

# g() summary of d18th against s18th, grouped by SenderSex.
g(d18th, s18th, SenderSex)

Rank

17th

# g() summary by SenderRank, with SenderRank cut down to its first character
# in both tables before grouping.
g(
  mutate(d17th, SenderRank = str_sub(SenderRank, start = 1, end = 1)),
  mutate(s17th, SenderRank = str_sub(SenderRank, start = 1, end = 1)),
  SenderRank
)

18th

# g() summary by SenderRank, with SenderRank cut down to its first character
# in both tables before grouping.
g(
  mutate(d18th, SenderRank = str_sub(SenderRank, start = 1, end = 1)),
  mutate(s18th, SenderRank = str_sub(SenderRank, start = 1, end = 1)),
  SenderRank
)

Relationship

17th

# g() summary of d17th against s17th, grouped by RelCode.
g(d17th, s17th, RelCode)

18th

# g() summary of d18th against s18th, grouped by RelCode.
g(d18th, s18th, RelCode)

Education

17th

# g() summary by SenderEduCode, with SenderEduCode cut down to its first
# character in both tables before grouping.
g(
  mutate(d17th, SenderEduCode = str_sub(SenderEduCode, start = 1, end = 1)),
  mutate(s17th, SenderEduCode = str_sub(SenderEduCode, start = 1, end = 1)),
  SenderEduCode
)

18th

# g() summary by SenderEduCode, with SenderEduCode cut down to its first
# character in both tables before grouping.
g(
  mutate(d18th, SenderEduCode = str_sub(SenderEduCode, start = 1, end = 1)),
  mutate(s18th, SenderEduCode = str_sub(SenderEduCode, start = 1, end = 1)),
  SenderEduCode
)

Region

17th

# g() summary of d17th against s17th, grouped by SenderRegion.
g(d17th, s17th, SenderRegion)

18th

# g() summary of d18th against s18th, grouped by SenderRegion.
g(d18th, s18th, SenderRegion)

Sender

17th

# g() summary of d17th against s17th, grouped by individual Sender.
g(d17th, s17th, Sender)

18th

# g() summary of d18th against s18th, grouped by individual Sender.
g(d18th, s18th, Sender)
LS0tCnRpdGxlOiAiUiBOb3RlYm9vayIKb3V0cHV0OiBodG1sX25vdGVib29rCi0tLQoKCmBgYHtyfQpsaWJyYXJ5KHRpZHl2ZXJzZSkKYGBgCgpgYGB7cn0KZDE4dGggPC0gcmVhZF9jc3YoIjE4dGgtZGF0YS5jc3YiKSAlPiUgZmlsdGVyKGBPayBmb3IgZmVzdHNjaHJpZnQ/YCE9Inllc3MiKQpzMTh0aCA8LSByZWFkX2NzdigiMTh0aC1zYW1wbGVpbmZvLmNzdiIpCmQxN3RoIDwtIHJlYWRfY3N2KCIxN3RoLWRhdGEuY3N2IikgJT4lIGZpbHRlcihgT2sgZm9yIGZlc3RzY2hyaWZ0P2AhPSJ5ZXNzIikKczE3dGggPC0gcmVhZF9jc3YoIjE3dGgtc2FtcGxlaW5mby5jc3YiKQpgYGAKCmBgYHtyfQpnIDwtIGZ1bmN0aW9uKGQxLGQyLGdyb3VwaW5nKSB7CiAgZ3JvdXBpbmcgPSBlbnF1byhncm91cGluZykKICByZXR1cm4oZDEgJT4lIGdyb3VwX2J5KCEhZ3JvdXBpbmcpICU+JSBzdW1tYXJpemUodG9rZW5zPW4oKSxsZW1tYXM9bl9kaXN0aW5jdChMZW1tYSksYXV0aG9ycz1uX2Rpc3RpbmN0KFNlbmRlciksbGV0dGVycz1uX2Rpc3RpbmN0KExldHRlcnMpKSAlPiUgcmlnaHRfam9pbihkMiAlPiUgZ3JvdXBfYnkoISFncm91cGluZykgJT4lIHN1bW1hcml6ZShuYXV0aG9ycz1uX2Rpc3RpbmN0KFNlbmRlciksbndvcmRzPXN1bShXb3JkQ291bnQpLG5sZXR0ZXJzPW4oKSksYnk9YyhxdW9fbmFtZShncm91cGluZykpKSAlPiUgbXV0YXRlKHRva2Vuc19wZXJfMTBrX3dvcmRzPXRva2Vucy9ud29yZHMqMTAwMDAsbGV0dGVyX3Byb3BvcnRpb249bGV0dGVycy9ubGV0dGVycyxhdXRob3JfcHJvcG9ydGlvbj1hdXRob3JzL25hdXRob3JzLGxlbW1hc19wZXJfMTBrX3dvcmRzPWxlbW1hcy9ud29yZHMqMTAwMDApICU+JSBhcnJhbmdlKGRlc2ModG9rZW5zX3Blcl8xMGtfd29yZHMpKSAlPiUgcmVsb2NhdGUoISFncm91cGluZyx0b2tlbnNfcGVyXzEwa193b3JkcyxsZW1tYXNfcGVyXzEwa193b3JkcyxsZXR0ZXJfcHJvcG9ydGlvbixhdXRob3JfcHJvcG9ydGlvbixsZW1tYXMsbGV0dGVycyxhdXRob3JzLG53b3JkcyxubGV0dGVycyxuYXV0aG9ycykpCn0KYGBgCgojIERpc3RpbmN0IHZhbHVlcyBpbiBlYWNoIGNvbHVtbgoKIyMgMTd0aApgYGB7cn0KZDE3dGggJT4lIHN1bW1hcml6ZV9hbGwofmxlbmd0aCh1bmlxdWUoLikpKQpgYGAKCiMjIDE4dGgKYGBge3J9CmQxOHRoICU+JSBzdW1tYXJpemVfYWxsKH5sZW5ndGgodW5pcXVlKC4pKSkKYGBgCgojIFNleAoKIyMgMTd0aApgYGB7cn0KZyhkMTd0aCxzMTd0aCxTZW5kZXJTZXgpCmBgYAoKIyMgMTh0aApgYGB7cn0KZyhkMTh0aCxzMTh0aCxTZW5kZXJTZXgpCmBgYAojIFJhbmsKCiMjIDE3dGgKYGBge3J9CmcoZDE3dGggJT4lIG11dGF0ZShTZW5kZXJSYW5rPXN0cl9zdWIoU2VuZGVyUmFuaywxLDEpKSxzMTd0aCAlPiUgbXV0YXRlKFNlbmRlclJhbms9c3RyX3N1YihTZW5kZXJSYW5rLDEsMSkpLFNlbmRlclJhbmspCmBgYAoKIyMgMTh0aApgYGB7cn0KZyhkMTh0aCAlPiUgbXV0YXRlKFNlbmRlclJh
bms9c3RyX3N1YihTZW5kZXJSYW5rLDEsMSkpLHMxOHRoICU+JSBtdXRhdGUoU2VuZGVyUmFuaz1zdHJfc3ViKFNlbmRlclJhbmssMSwxKSksU2VuZGVyUmFuaykKYGBgCgojIFJlbGF0aW9uc2hpcAoKIyMgMTd0aApgYGB7cn0KZyhkMTd0aCxzMTd0aCxSZWxDb2RlKQpgYGAKCiMjIDE4dGgKYGBge3J9CmcoZDE4dGgsczE4dGgsUmVsQ29kZSkKYGBgCgoKIyBFZHVjYXRpb24KCiMjIDE3dGgKYGBge3J9CmcoZDE3dGggJT4lIG11dGF0ZShTZW5kZXJFZHVDb2RlPXN0cl9zdWIoU2VuZGVyRWR1Q29kZSwxLDEpKSxzMTd0aCAlPiUgbXV0YXRlKFNlbmRlckVkdUNvZGU9c3RyX3N1YihTZW5kZXJFZHVDb2RlLDEsMSkpLFNlbmRlckVkdUNvZGUpCmBgYAoKIyMgMTh0aApgYGB7cn0KZyhkMTh0aCAlPiUgbXV0YXRlKFNlbmRlckVkdUNvZGU9c3RyX3N1YihTZW5kZXJFZHVDb2RlLDEsMSkpLHMxOHRoICU+JSBtdXRhdGUoU2VuZGVyRWR1Q29kZT1zdHJfc3ViKFNlbmRlckVkdUNvZGUsMSwxKSksU2VuZGVyRWR1Q29kZSkKYGBgCgojIFJlZ2lvbgoKIyMgMTd0aApgYGB7cn0KZyhkMTd0aCxzMTd0aCxTZW5kZXJSZWdpb24pCmBgYAoKIyMgMTh0aApgYGB7cn0KZyhkMTh0aCxzMTh0aCxTZW5kZXJSZWdpb24pCmBgYAoKCiMgU2VuZGVyCgojIyAxN3RoCmBgYHtyfQpnKGQxN3RoLHMxN3RoLFNlbmRlcikKYGBgCgojIyAxOHRoCmBgYHtyfQpnKGQxOHRoLHMxOHRoLFNlbmRlcikKYGBgCg==