knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(tidyverse)
library(feather)
library(minpack.lm)
library(latex2exp)
library(knitr)
new_freq_data <- read_feather("essay_word_counts_clean_freq.feather")
set.seed(1234)
theme_set(theme_bw())
We use Mandelbrot’s generalization of the Zipfian distribution (Mandelbrot 1962):
\(f(r) \propto \frac{1}{(r + \beta)^\alpha}\)
where \(f(r)\) is the word frequency, \(r\) is the freqency rank, \(\alpha \approx 1\), and \(\beta \approx 2.7\).
To fit the word frequency data, we use the equation
\(frequency = \rho\frac{1}{(rank + \beta)^\alpha}\)
where \(\rho\) is the unknown proportionality constant.
#estimate parameters p = rho, b = beta, a = alpha
eqfit <- nlsLM(freq ~ p *(1/(rank + b)^a),
start = list(p = 1, b = 2.7, a = 1), #starting values
data = new_freq_data)
coefs <- coef(eqfit)
p <- coefs[[1]]
b <- coefs[[2]]
a <- coefs[[3]]
#create predicted values for log frequency
new_freq_data2 <- new_freq_data %>%
mutate(logfreqpred = log(p *(1/(rank + b)^a)))
#plot predicted log frequency against actual log frequency
ggplot(new_freq_data2) +
geom_point(aes(logrank, logfreq)) +
geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) +
labs(title = "Log rank vs. log frequency with best fit line",
x = "Log rank", y = "Log frequency",
subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$'),
caption = TeX('$\\rho = 34052.70, \\beta = 2.90, \\alpha = 1.05$'))
Next, we plot the residuals against log frequencies.
#calculate residuals
new_freq_data2$logfreqresid <- new_freq_data2$logfreq - new_freq_data2$logfreqpred
#plot log frequencies and residuals
ggplot(data = new_freq_data2, mapping = aes(x = logfreq, y = logfreqresid)) +
geom_point() +
labs(title = "Log Frequency vs. residual, aggregate data",
x = "Log frequency",
y = "Residual") +
geom_hline(yintercept = 0, linetype="longdash", color = "purple")
Repeat for each language code and add loess estimates.
lang_list <- unique(new_freq_data$L1_code)
#function to estimate zipf parameters and
#calculate predicted log frequencies and residuals for each language code
eqfit_lc <- function(lang_code) {
lcdata <- subset(new_freq_data, L1_code == lang_code)
#estimate parameters
mod <- nlsLM(freq ~ p *(1/(rank + b)^a),
start = list(p = 1, b = 2.7, a = 1), #starting values
data = lcdata)
coefs <- coef(mod)
lcdata$p <- coefs[[1]]
lcdata$b <- coefs[[2]]
lcdata$a <- coefs[[3]]
#calculate predicted values
lcdata <- mutate(lcdata, logfreqpredz = log(p *(1/(rank + b)^a)))
#calculate residuals
lcdata <- mutate(lcdata, logfreqresidz = logfreq - logfreqpredz)
lcdata
}
#function to calculate loess estimates
loess50 <- function(lang_code, sp) {
data <- subset(new_freq_data, L1_code == lang_code)
loess_mod50 <- loess(logfreq ~ logrank, data=data, span=sp)
loess_est50 <- predict(loess_mod50)
df <- as.data.frame(loess_est50)
}
#compile estimates for all language codes
zipf_loess_est <- data.frame(word = character(), #initalize dataframe
L1_code=character(),
freq=integer(),
logfreq=numeric(),
rank=integer(),
logrank=numeric(),
p=numeric(),
b=numeric(),
a=numeric(),
logfreqpredz=numeric(),
logfreqresidz=numeric(),
loess_est50=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
zipf_loess_est <- rbind(zipf_loess_est, cbind(eqfit_lc(lang), loess50(lang, 0.50)))
}
#plot log frequencies versus zipf predictions
ggplot(zipf_loess_est) +
geom_point(aes(logrank, logfreq)) +
geom_line(aes(logrank, logfreqpredz), color = "red") +
geom_line(aes(logrank, loess_est50), color = "cyan") +
labs(title = "Log rank vs. log frequency with best fit line, by language code",
x = "Log rank", y = "Log frequency",
subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
facet_wrap(~L1_code)
#plot log frequencies and residuals
ggplot(data = zipf_loess_est, mapping = aes(x = logfreq, y = logfreqresidz)) +
geom_point() +
labs(title = "Log Frequency vs. residual, by language code",
x = "Log frequency",
y = "Residual") +
geom_hline(yintercept = 0, linetype="longdash", color = "purple") +
facet_wrap(~L1_code)
zipf_param <- zipf_loess_est %>%
select(L1_code, p, b, a) %>%
unique() %>%
transform(b = round(b, 2)) %>%
transform(a = round(a, 2)) %>%
arrange(desc(b))
rownames(zipf_param) <- NULL
zipf_param %>%
kable(format = "html", col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
| L1_code | rho | beta | alpha |
|---|---|---|---|
| JPN | 41789.82 | 4.66 | 1.10 |
| KOR | 43003.85 | 4.64 | 1.09 |
| GER | 45494.88 | 3.67 | 1.09 |
| ITA | 42597.25 | 3.51 | 1.12 |
| FRE | 41541.33 | 3.45 | 1.09 |
| ARA | 33391.43 | 3.23 | 1.06 |
| SPA | 41180.60 | 3.11 | 1.08 |
| TUR | 29694.38 | 2.54 | 1.01 |
| HIN | 31072.39 | 2.11 | 1.02 |
| CHI | 24056.43 | 1.79 | 0.96 |
| TEL | 20537.65 | 1.09 | 0.93 |
zipf_loess_est_sum <- zipf_loess_est %>%
select(L1_code, logfreqpredz, loess_est50)
#function to calculate correlations and correlation p-value using Pearson test
get_corr <- function(lang_code) {
data <- subset(zipf_loess_est_sum, L1_code == lang_code)
data2 <- select(data, -L1_code)
x <- as.matrix(data2)
c <- cor.test(x[,1], x[,2], method = "pearson", conf.level = 0.95)
cor_val <- round(c$estimate[[1]], 4)
cor_CI <- c$conf.int
cor_CI_lower <- round(cor_CI[1], 4)
cor_CI_upper <- round(cor_CI[2], 4)
row <- as.data.frame(t(c(lang_code, cor_val, cor_CI_lower, cor_CI_upper)))
names(row) <- c("L1_code", "cor_val", "cor_CI_lower", "cor_CI_upper")
rownames(row) <- NULL
row
}
#compile correlations for all language codes
zipf_loess_cor <- data.frame(L1_code=character(), #initalize dataframe
cor_val=numeric(),
cor_CI_lower=numeric(),
cor_CI_upper=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
zipf_loess_cor <- rbind(zipf_loess_cor, get_corr(lang)) %>%
transform(cor_val = as.numeric(as.character(cor_val))) %>%
transform(cor_CI_lower = as.numeric(as.character(cor_CI_lower))) %>%
transform(cor_CI_upper = as.numeric(as.character(cor_CI_upper))) %>%
arrange(desc(cor_val))
}
#re-order factor levels for language code, for graphing purposes
zipf_loess_cor$L1_code <- factor(zipf_loess_cor$L1_code, levels = zipf_loess_cor$L1_code)
#correlations table
zipf_loess_cor %>%
mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
select(-cor_CI_lower, -cor_CI_upper) %>%
kable(format = "html", col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
| L1_code | Pearson coefficient | 95% Conf. Interval |
|---|---|---|
| GER | 0.9410 | (0.9387, 0.9432) |
| TUR | 0.9402 | (0.9379, 0.9425) |
| CHI | 0.9387 | (0.9363, 0.941) |
| HIN | 0.9359 | (0.9336, 0.938) |
| KOR | 0.9348 | (0.9322, 0.9373) |
| ITA | 0.9347 | (0.9321, 0.9373) |
| SPA | 0.9347 | (0.9322, 0.9371) |
| FRE | 0.9346 | (0.9321, 0.9371) |
| JPN | 0.9338 | (0.9311, 0.9365) |
| TEL | 0.9312 | (0.9287, 0.9336) |
| ARA | 0.9164 | (0.9132, 0.9195) |
#correlations plot
ggplot(zipf_loess_cor) +
labs(title = "Pearson coefficients for Zipf and loess estimates with 95% CIs",
x = "Language code", y = "Correlation") +
geom_point(aes(L1_code, cor_val)) +
geom_segment(aes(x = L1_code, y = cor_CI_lower, xend = L1_code, yend = cor_CI_upper), color = "#6495ed")