In this document, I test the predictions of Zipf’s law of word frequency distribution using TOEFL essays. I group the essays by the test-takers’ first (primary) language and by essay score to see whether, and how, first language and score affect the word frequency distribution.
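
Zipf’s law, in the generalized Zipf–Mandelbrot form fitted below, predicts that a word’s frequency falls off as a power of its frequency rank: $frequency = \rho \cdot (1/(rank + \beta)^\alpha)$, where $\rho$ is a scale parameter, $\beta$ shifts the ranks of the most frequent words, and $\alpha \approx 1$ in the classic formulation.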

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)

library(tidyverse)
library(feather)
library(minpack.lm)
library(latex2exp)
library(knitr)

new_data <- read_feather("essay_word_counts_clean.feather")
metadata <- read.csv("merged_metadata.csv") %>%
  mutate(L1_code = as.character(L1_code),
         essay_id = as.character(essay_id))

set.seed(1234)
theme_set(theme_bw())
metadata <- metadata %>%
  mutate(score_range = ifelse(score <= 3, "Low", "High")) %>% #scores of 3 or lower are "Low"
  na.omit()

score_avg <- metadata %>%
  group_by(L1_code) %>%
  summarize(avg = mean(score)) %>%
  arrange(desc(avg))

#reorder language codes by descending average score, for graphing purposes
metadata2 <- metadata %>%
  mutate(L1_code = factor(L1_code, levels = score_avg$L1_code))

ggplot(metadata2, aes(L1_code, score)) + 
  geom_boxplot() + 
  labs(title = "Score distributions for TOEFL essays, by language", 
       x = "Language code", y = "Score")

metadata_count <- metadata2 %>%
  group_by(L1_code, score_range) %>%
  summarize(count = n())
  
ggplot(metadata_count, aes(L1_code, count)) + 
  geom_bar(aes(fill = score_range), stat = "identity", position = "stack", alpha = 0.7) + 
  labs(title = "Low vs. high scores for TOEFL essays, by language",
       subtitle = "High = higher than 3, Low = 3 or lower",
       x = "Language code", y = "Count",
       fill = "Score\ntype")

lang_list <- unique(new_data$L1_code)

#split each word's count into two independent corpora: one half for estimating
#frequency, the other for estimating frequency rank, so that the two estimates
#do not share sampling noise
binom_split <- function(data) {
  data %>%
    mutate(freq_n = rbinom(n = n(), size = freq, prob = 0.5),
           rank_n = freq - freq_n) %>%
    rename(n = freq)
}
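
#as a quick illustration of the split, a few toy counts (hypothetical words,
#not drawn from the TOEFL data):
toy <- data.frame(word = c("the", "of", "zipf"), freq = c(100, 40, 7))
binom_split(toy)
#in every row, freq_n + rank_n equals the original count (column n), so no
#token is lost; ranks are later computed from one half and frequencies from the other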

#for calculating log frequency
logfreq <- function(lang_code, data) {
  batch <- data %>%
    filter(L1_code == lang_code) %>%
    group_by(word, L1_code) %>%
    summarize(freq = sum(freq_n)) %>%
    arrange(desc(freq)) %>%
    ungroup() %>%
    mutate(logfreq = log(freq + 1))
  batch
}

logfreq_compile <- function(data) {
  #stack the per-language log-frequency tables into one data frame
  map_dfr(lang_list, logfreq, data = data)
}

#for calculating log rank

logrank <- function(lang_code, data) {
  data %>%
    filter(L1_code == lang_code) %>%
    group_by(word, L1_code) %>%
    summarize(rank_freq = sum(rank_n)) %>%
    arrange(desc(rank_freq)) %>%
    ungroup() %>%
    mutate(rank = row_number(), #rank 1 = most frequent word
           logrank = log(rank + 1))
}

logrank_compile <- function(data) {
  #stack the per-language log-rank tables into one data frame
  map_dfr(lang_list, logrank, data = data)
}

Word frequency-rank distributions for all words

#attach score metadata to frequency data

#separate by score type
new_data_score <- left_join(new_data, metadata)
new_data_high <- subset(new_data_score, score_range == "High")
new_data_low <- subset(new_data_score, score_range == "Low")
rm(new_data_score)

#perform the binomial split on the "high" and "low" subsets
word_freq_all_high <- binom_split(new_data_high)
word_freq_all_low <- binom_split(new_data_low)

#calculate log frequency for each subset
log_freq_all_high <- logfreq_compile(word_freq_all_high)
log_freq_all_low <- logfreq_compile(word_freq_all_low)

#calculate log rank for each subset
log_rank_all_high <- logrank_compile(word_freq_all_high)
log_rank_all_low <- logrank_compile(word_freq_all_low)

#combine data
log_data_all_high <- left_join(log_freq_all_high, log_rank_all_high) %>%
  mutate(essay_type = "High-scoring Essays")

log_data_all_low <- left_join(log_freq_all_low, log_rank_all_low) %>%
  mutate(essay_type = "Low-scoring Essays")

log_data_all <- rbind(log_data_all_high, log_data_all_low)

ggplot(log_data_all) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_smooth(aes(logrank, logfreq, color = L1_code)) + 
  labs(title = "All words") +
  facet_wrap(~essay_type) + 
  theme(legend.position = "bottom")

Testing Zipf’s Law
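
I fit the Zipf–Mandelbrot form $frequency = \rho \cdot (1/(rank + \beta)^\alpha)$ directly to the raw frequencies with nlsLM() from minpack.lm (Levenberg–Marquardt nonlinear least squares), estimating the scale $\rho$, the rank offset $\beta$, and the exponent $\alpha$ separately for the high- and low-scoring subsets.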

#estimate parameters p = rho, b = beta, a = alpha

#for high scores only

eqfit_high <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = log_data_all_high)

coefs_high <- coef(eqfit_high)
p_high <- coefs_high[[1]]
b_high <- coefs_high[[2]]
a_high <- coefs_high[[3]]
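
##optional check (a sketch; the printed output depends on the data):
##summary() on the nlsLM fit reports standard errors for rho, beta, and alpha
summary(eqfit_high)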

##create predicted values for log frequency
log_data_all_high2 <- log_data_all_high %>%
  mutate(logfreqpred = log(p_high *(1/(rank + b_high)^a_high)))

##plot predicted log frequency against actual log frequency
ggplot(log_data_all_high2) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) + 
  labs(title = "Log rank vs. log frequency with best fit line, high-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha)$'),
       caption = TeX('$\\rho = 17791.98, \\beta = 2.67, \\alpha = 1.04$'))

#for low scores only
eqfit_low <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = log_data_all_low)

coefs_low <- coef(eqfit_low)
p_low <- coefs_low[[1]]
b_low <- coefs_low[[2]]
a_low <- coefs_low[[3]]

##create predicted values for log frequency
log_data_all_low2 <- log_data_all_low %>%
  mutate(logfreqpred = log(p_low *(1/(rank + b_low)^a_low)))

##plot predicted log frequency against actual log frequency
ggplot(log_data_all_low2) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) + 
  labs(title = "Log rank vs. log frequency with best fit line, low-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha)$'),
       caption = TeX('$\\rho = 13109.11, \\beta = 3.22, \\alpha = 1.04$'))

Next, I plot the residuals against the log frequencies. If Zipf’s law describes the data well, the residuals should scatter evenly around zero with no systematic curvature.

#high scores only

##calculate residuals
log_data_all_high2$logfreqresid <- log_data_all_high2$logfreq - log_data_all_high2$logfreqpred

##plot log frequencies and residuals
ggplot(data = log_data_all_high2, mapping = aes(x = logfreq, y = logfreqresid)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual, high-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple")

#low scores only

##calculate residuals
log_data_all_low2$logfreqresid <- log_data_all_low2$logfreq - log_data_all_low2$logfreqpred

##plot log frequencies and residuals
ggplot(data = log_data_all_low2, mapping = aes(x = logfreq, y = logfreqresid)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual, low-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple")

Next, I repeat the fit for each language code and add loess estimates. The loess curve serves as a flexible nonparametric benchmark: the more closely the Zipf fit tracks it, the better the power-law form describes that language’s data.

lang_list <- unique(new_data$L1_code)

#function to estimate zipf parameters and 
#calculate predicted log frequencies and residuals for each language code

eqfit_lc <- function(lang_code, data) {
  lcdata <- subset(data, L1_code == lang_code)
  #estimate parameters
  mod <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = lcdata)
  coefs <- coef(mod)
  lcdata$p <- coefs[[1]]
  lcdata$b <- coefs[[2]]
  lcdata$a <- coefs[[3]]  
  
  #calculate predicted values
  lcdata <- mutate(lcdata, logfreqpredz = log(p *(1/(rank + b)^a)))
  
  #calculate residuals
  lcdata <- mutate(lcdata, logfreqresidz = logfreq - logfreqpredz)
  
  lcdata
}

#function to calculate loess estimates

loess50 <- function(lang_code, data, sp) {
  lcdata <- subset(data, L1_code == lang_code)
  #span = sp sets the smoothing window; 0.50 fits each point on the nearest half of the data
  loess_mod50 <- loess(logfreq ~ logrank, data = lcdata, span = sp)
  #predict() with no newdata returns fitted values in input row order, which
  #keeps the cbind() below aligned with eqfit_lc()'s rows
  loess_est50 <- predict(loess_mod50)
  as.data.frame(loess_est50) #the column inherits the name "loess_est50"
}

#compile estimates for all language codes, high scores only

zipf_loess_est_high <- map_dfr(lang_list, function(lang) {
  cbind(eqfit_lc(lang, log_data_all_high),
        loess50(lang, log_data_all_high, 0.50))
})

#compile estimates for all language codes, low scores only

zipf_loess_est_low <- map_dfr(lang_list, function(lang) {
  cbind(eqfit_lc(lang, log_data_all_low),
        loess50(lang, log_data_all_low, 0.50))
})

#plot log frequencies versus zipf predictions

##high scores only
ggplot(zipf_loess_est_high) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpredz), color = "red") + 
  geom_line(aes(logrank, loess_est50), color = "cyan") + 
  labs(title = "Log rank vs. log frequency with best fit line, by language code",
       subtitle = "High-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
  facet_wrap(~L1_code)

##low scores only

ggplot(zipf_loess_est_low) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpredz), color = "red") + 
  geom_line(aes(logrank, loess_est50), color = "cyan") + 
  labs(title = "Log rank vs. log frequency with best fit line, by language code",
       subtitle = "Low-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
  facet_wrap(~L1_code)

#plot log frequencies and residuals

##high scores only
ggplot(data = zipf_loess_est_high, mapping = aes(x = logfreq, y = logfreqresidz)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual by language code, high-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple") + 
  facet_wrap(~L1_code)

##low scores only
ggplot(data = zipf_loess_est_low, mapping = aes(x = logfreq, y = logfreqresidz)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual by language code, low-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple") + 
  facet_wrap(~L1_code)

Zipfian distribution parameter estimates

zipf_param_high <- zipf_loess_est_high %>%
  distinct(L1_code, p, b, a) %>%
  mutate(b = round(b, 2),
         a = round(a, 2)) %>%
  arrange(desc(b))

rownames(zipf_param_high) <- NULL

zipf_param_high %>%
  kable(format = "html", caption = "High-scoring essays",
        col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
High-scoring essays

| L1_code | rho      | beta | alpha |
|---------|----------|------|-------|
| KOR     | 15284.74 | 3.71 | 1.07  |
| GER     | 34649.03 | 3.62 | 1.09  |
| FRE     | 25505.80 | 3.47 | 1.09  |
| JPN     | 11994.46 | 3.42 | 1.06  |
| ARA     | 12693.26 | 3.40 | 1.09  |
| ITA     | 18942.78 | 3.40 | 1.11  |
| SPA     | 26469.85 | 3.36 | 1.10  |
| HIN     | 22202.32 | 1.99 | 1.02  |
| TUR     | 14834.17 | 1.92 | 0.99  |
| CHI     | 11799.37 | 1.74 | 0.98  |
| TEL     | 11579.37 | 1.24 | 0.96  |

zipf_param_low <- zipf_loess_est_low %>%
  distinct(L1_code, p, b, a) %>%
  mutate(b = round(b, 2),
         a = round(a, 2)) %>%
  arrange(desc(b))

rownames(zipf_param_low) <- NULL

zipf_param_low %>%
  kable(format = "html", caption = "Low-scoring essays",
        col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
Low-scoring essays

| L1_code | rho       | beta | alpha |
|---------|-----------|------|-------|
| JPN     | 28433.009 | 5.90 | 1.14  |
| KOR     | 24845.454 | 5.69 | 1.12  |
| GER     |  6905.231 | 3.95 | 1.09  |
| FRE     | 13852.982 | 3.86 | 1.10  |
| TUR     | 12361.966 | 3.53 | 1.04  |
| ARA     | 18372.062 | 3.28 | 1.05  |
| ITA     | 16242.756 | 2.85 | 1.07  |
| SPA     | 11690.797 | 2.79 | 1.06  |
| HIN     |  5692.325 | 2.27 | 0.99  |
| CHI     |  9886.426 | 1.74 | 0.94  |
| TEL     |  9071.447 | 1.48 | 0.94  |

Pearson correlations between Zipf and loess estimates by language code

#function to calculate the Pearson correlation (and its 95% confidence
#interval) between the Zipf and loess estimates for one language code

get_corr <- function(lang_code, data) {
  lcdata <- subset(data, L1_code == lang_code)
  ct <- cor.test(lcdata$logfreqpredz, lcdata$loess_est50,
                 method = "pearson", conf.level = 0.95)
  data.frame(L1_code = lang_code,
             cor_val = round(ct$estimate[[1]], 4),
             cor_CI_lower = round(ct$conf.int[1], 4),
             cor_CI_upper = round(ct$conf.int[2], 4))
}
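
#quick single-language check (illustrative; assumes "GER" appears in L1_code)
get_corr("GER", select(zipf_loess_est_high, L1_code, logfreqpredz, loess_est50))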

#high scores

zipf_loess_est_sum_high <- zipf_loess_est_high %>%
  select(L1_code, logfreqpredz, loess_est50)

##compile correlations for all language codes

zipf_loess_cor_high <- map_dfr(lang_list, get_corr, data = zipf_loess_est_sum_high) %>%
  arrange(desc(cor_val))

##re-order factor levels for language code, for graphing purposes
zipf_loess_cor_high$L1_code <- factor(zipf_loess_cor_high$L1_code, 
                                      levels = zipf_loess_cor_high$L1_code)

#low scores

zipf_loess_est_sum_low <- zipf_loess_est_low %>%
  select(L1_code, logfreqpredz, loess_est50)

##compile correlations for all language codes

zipf_loess_cor_low <- map_dfr(lang_list, get_corr, data = zipf_loess_est_sum_low) %>%
  arrange(desc(cor_val))

##re-order factor levels for language code, for graphing purposes
zipf_loess_cor_low$L1_code <- factor(zipf_loess_cor_low$L1_code, 
                                      levels = zipf_loess_cor_low$L1_code)
##correlations table
zipf_loess_cor_high %>%
  mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
  select(-cor_CI_lower, -cor_CI_upper) %>%
  kable(format = "html", caption = "High-scoring essays",
        col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
High-scoring essays

| L1_code | Pearson coefficient | 95% Conf. Interval |
|---------|---------------------|--------------------|
| GER     | 0.9388              | (0.9362, 0.9412)   |
| HIN     | 0.9320              | (0.9294, 0.9344)   |
| TUR     | 0.9307              | (0.9277, 0.9336)   |
| SPA     | 0.9290              | (0.9260, 0.9319)   |
| FRE     | 0.9262              | (0.9231, 0.9292)   |
| CHI     | 0.9244              | (0.9210, 0.9276)   |
| ITA     | 0.9242              | (0.9206, 0.9275)   |
| TEL     | 0.9210              | (0.9177, 0.9242)   |
| JPN     | 0.9203              | (0.9162, 0.9243)   |
| KOR     | 0.9188              | (0.9149, 0.9225)   |
| ARA     | 0.9076              | (0.9032, 0.9119)   |

zipf_loess_cor_low %>%
  mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
  select(-cor_CI_lower, -cor_CI_upper) %>%
  kable(format = "html", caption = "Low-scoring essays",
        col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
Low-scoring essays

| L1_code | Pearson coefficient | 95% Conf. Interval |
|---------|---------------------|--------------------|
| TUR     | 0.9243              | (0.9204, 0.9281)   |
| CHI     | 0.9229              | (0.9191, 0.9265)   |
| JPN     | 0.9213              | (0.9175, 0.9250)   |
| ITA     | 0.9175              | (0.9133, 0.9214)   |
| KOR     | 0.9174              | (0.9134, 0.9211)   |
| TEL     | 0.9114              | (0.9072, 0.9153)   |
| SPA     | 0.9048              | (0.8998, 0.9096)   |
| FRE     | 0.9027              | (0.8976, 0.9075)   |
| ARA     | 0.8982              | (0.8937, 0.9026)   |
| GER     | 0.8897              | (0.8828, 0.8961)   |
| HIN     | 0.8860              | (0.8800, 0.8918)   |

#combine correlations data for single correlations plot

zipf_loess_cor_high <- zipf_loess_cor_high %>%
  mutate(score_type = "high")

zipf_loess_cor_low <- zipf_loess_cor_low %>%
  mutate(score_type = "low")

zipf_loess_cor <- rbind(zipf_loess_cor_high, zipf_loess_cor_low)

#correlations plot

ggplot(zipf_loess_cor) +
  geom_point(aes(L1_code, cor_val)) + 
  geom_segment(aes(x = L1_code, y = cor_CI_lower, xend = L1_code, yend = cor_CI_upper,
                   color = score_type)) +
  labs(title = "Pearson coefficients for Zipf and loess estimates with 95% CIs",
       x = "Language code", y = "Correlation",
       color = "Score\ntype")