knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)

library(tidyverse)
library(feather)
library(minpack.lm)
library(latex2exp)
library(knitr)

new_freq_data <- read_feather("essay_word_counts_clean_freq.feather")
set.seed(1234)
theme_set(theme_bw())

Testing Zipf’s Law

We use Mandelbrot’s generalization of the Zipfian distribution (Mandelbrot 1962):

\(f(r) \propto \frac{1}{(r + \beta)^\alpha}\)

where \(f(r)\) is the word frequency, \(r\) is the freqency rank, \(\alpha \approx 1\), and \(\beta \approx 2.7\).

To fit the word frequency data, we use the equation

\(frequency = \rho\frac{1}{(rank + \beta)^\alpha}\)

where \(\rho\) is the unknown proportionality constant.

#estimate parameters p = rho, b = beta, a = alpha

eqfit <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = new_freq_data)

coefs <- coef(eqfit)
p <- coefs[[1]]
b <- coefs[[2]]
a <- coefs[[3]]

#create predicted values for log frequency
new_freq_data2 <- new_freq_data %>%
  mutate(logfreqpred = log(p *(1/(rank + b)^a)))

#plot predicted log frequency against actual log frequency
ggplot(new_freq_data2) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) + 
  labs(title = "Log rank vs. log frequency with best fit line",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$'),
       caption = TeX('$\\rho = 34052.70, \\beta = 2.90, \\alpha = 1.05$'))

Next, we plot the residuals against log frequencies.

#calculate residuals
new_freq_data2$logfreqresid <- new_freq_data2$logfreq - new_freq_data2$logfreqpred

#plot log frequencies and residuals
ggplot(data = new_freq_data2, mapping = aes(x = logfreq, y = logfreqresid)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual, aggregate data",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple")

Repeat for each language code and add loess estimates.

lang_list <- unique(new_freq_data$L1_code)

#function to estimate zipf parameters and 
#calculate predicted log frequencies and residuals for each language code

eqfit_lc <- function(lang_code) {
  lcdata <- subset(new_freq_data, L1_code == lang_code)
  #estimate parameters
  mod <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = lcdata)
  coefs <- coef(mod)
  lcdata$p <- coefs[[1]]
  lcdata$b <- coefs[[2]]
  lcdata$a <- coefs[[3]]  
  
  #calculate predicted values
  lcdata <- mutate(lcdata, logfreqpredz = log(p *(1/(rank + b)^a)))
  
  #calculate residuals
  lcdata <- mutate(lcdata, logfreqresidz = logfreq - logfreqpredz)
  
  lcdata
}

#function to calculate loess estimates

loess50 <- function(lang_code, sp) {
  data <- subset(new_freq_data, L1_code == lang_code)
  loess_mod50 <- loess(logfreq ~ logrank, data=data, span=sp)
  loess_est50 <- predict(loess_mod50)
  df <- as.data.frame(loess_est50)
}

#compile estimates for all language codes

zipf_loess_est <- data.frame(word = character(), #initalize dataframe
                             L1_code=character(),
                             freq=integer(),
                             logfreq=numeric(),
                             rank=integer(),
                             logrank=numeric(),
                             p=numeric(),
                             b=numeric(),
                             a=numeric(),
                             logfreqpredz=numeric(),
                             logfreqresidz=numeric(),
                             loess_est50=numeric())

for(i in seq_along(lang_list)) {
  lang <- lang_list[i]
  zipf_loess_est <- rbind(zipf_loess_est, cbind(eqfit_lc(lang), loess50(lang, 0.50)))
}
#plot log frequencies versus zipf predictions
ggplot(zipf_loess_est) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpredz), color = "red") + 
  geom_line(aes(logrank, loess_est50), color = "cyan") + 
  labs(title = "Log rank vs. log frequency with best fit line, by language code",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
  facet_wrap(~L1_code)

#plot log frequencies and residuals
ggplot(data = zipf_loess_est, mapping = aes(x = logfreq, y = logfreqresidz)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual, by language code",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple") + 
  facet_wrap(~L1_code)

Zipfian distribution parameter estimates

zipf_param <- zipf_loess_est %>%
  select(L1_code, p, b, a) %>%
  unique() %>%
  transform(b = round(b, 2)) %>%
  transform(a = round(a, 2)) %>%
  arrange(desc(b))

rownames(zipf_param) <- NULL

zipf_param %>%
  kable(format = "html", col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
L1_code rho beta alpha
JPN 41789.82 4.66 1.10
KOR 43003.85 4.64 1.09
GER 45494.88 3.67 1.09
ITA 42597.25 3.51 1.12
FRE 41541.33 3.45 1.09
ARA 33391.43 3.23 1.06
SPA 41180.60 3.11 1.08
TUR 29694.38 2.54 1.01
HIN 31072.39 2.11 1.02
CHI 24056.43 1.79 0.96
TEL 20537.65 1.09 0.93

Pearson correlations between Zipf and loess estimates by language code

zipf_loess_est_sum <- zipf_loess_est %>%
  select(L1_code, logfreqpredz, loess_est50)

#function to calculate correlations and correlation p-value using Pearson test

get_corr <- function(lang_code) {
  data <- subset(zipf_loess_est_sum, L1_code == lang_code)
  data2 <- select(data, -L1_code)
  x <- as.matrix(data2)
  c <- cor.test(x[,1], x[,2], method = "pearson", conf.level = 0.95)
  cor_val <- round(c$estimate[[1]], 4)
  cor_CI <- c$conf.int
  cor_CI_lower <- round(cor_CI[1], 4)
  cor_CI_upper <- round(cor_CI[2], 4)
  row <- as.data.frame(t(c(lang_code, cor_val, cor_CI_lower, cor_CI_upper)))
  names(row) <- c("L1_code", "cor_val", "cor_CI_lower", "cor_CI_upper")
  rownames(row) <- NULL
  row
}

#compile correlations for all language codes

zipf_loess_cor <- data.frame(L1_code=character(), #initalize dataframe
                            cor_val=numeric(),
                            cor_CI_lower=numeric(),
                            cor_CI_upper=numeric())

for(i in seq_along(lang_list)) {
  lang <- lang_list[i]
  zipf_loess_cor <- rbind(zipf_loess_cor, get_corr(lang)) %>%
    transform(cor_val = as.numeric(as.character(cor_val))) %>%
    transform(cor_CI_lower = as.numeric(as.character(cor_CI_lower))) %>%
    transform(cor_CI_upper = as.numeric(as.character(cor_CI_upper))) %>%
    arrange(desc(cor_val))
}

#re-order factor levels for language code, for graphing purposes
zipf_loess_cor$L1_code <- factor(zipf_loess_cor$L1_code, levels = zipf_loess_cor$L1_code)

#correlations table
zipf_loess_cor %>%
  mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
  select(-cor_CI_lower, -cor_CI_upper) %>%
  kable(format = "html", col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
L1_code Pearson coefficient 95% Conf. Interval
GER 0.9410 (0.9387, 0.9432)
TUR 0.9402 (0.9379, 0.9425)
CHI 0.9387 (0.9363, 0.941)
HIN 0.9359 (0.9336, 0.938)
KOR 0.9348 (0.9322, 0.9373)
ITA 0.9347 (0.9321, 0.9373)
SPA 0.9347 (0.9322, 0.9371)
FRE 0.9346 (0.9321, 0.9371)
JPN 0.9338 (0.9311, 0.9365)
TEL 0.9312 (0.9287, 0.9336)
ARA 0.9164 (0.9132, 0.9195)
#correlations plot
ggplot(zipf_loess_cor) +
  labs(title = "Pearson coefficients for Zipf and loess estimates with 95% CIs",
       x = "Language code", y = "Correlation") + 
  geom_point(aes(L1_code, cor_val)) + 
  geom_segment(aes(x = L1_code, y = cor_CI_lower, xend = L1_code, yend = cor_CI_upper), color = "#6495ed")