In this document, I test the predictions of Zipf’s law of word frequency distribution using TOEFL essays. I examine the essays according to the essay-takers’ primary language and the essay scores to see whether and how primary language and essay score affects the word frequency distribution.
knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)
library(tidyverse)
library(feather)
library(minpack.lm)
library(latex2exp)
library(knitr)
new_data <- read_feather("essay_word_counts_clean.feather")
metadata <- read.csv("merged_metadata.csv") %>%
transform(L1_code = as.character(L1_code)) %>%
transform(essay_id = as.character(essay_id))
set.seed(1234)
theme_set(theme_bw())
metadata <- metadata %>%
mutate(score_range = ifelse(score <= 3, "Low", ifelse(score > 3, "High", NA))) %>%
na.omit()
score_avg <- metadata %>%
group_by(L1_code) %>%
summarize(avg = mean(score)) %>%
arrange(desc(avg))
#reorder country codes, for graphing purposes
metadata2 <- metadata %>%
transform(L1_code = factor(L1_code, levels = score_avg$L1_code))
ggplot(metadata2, aes(L1_code, score)) +
geom_boxplot() +
labs(title = "Score distributions for TOEFL essays, by language",
x = "Language code", y = "Score")

metadata_count <- metadata2 %>%
group_by(L1_code, score_range) %>%
summarize(count = n())
ggplot(metadata_count, aes(L1_code, count)) +
geom_bar(aes(fill = score_range), stat = "identity", position = "stack", alpha = 0.7) +
labs(title = "Low vs. high scores for TOEFL essays, by language",
subtitle = "High = higher than 3, Low = 3 or lower",
x = "Language code", y = "Count",
fill = "Score\ntype")

lang_list <- unique(new_data$L1_code)
#for splitting corpus into two independent corpora for calculating frequency and frequency rank
split <- function(data) {
data$freq_n <- sapply(data$freq, function(x) rbinom(n = 1, size = x, prob = 0.5))
data <- data %>%
mutate(rank_n = freq - freq_n) %>%
rename(n = freq)
data
}
#for calculating log frequency
logfreq <- function(lang_code, data) {
batch <- data %>%
filter(L1_code == lang_code) %>%
group_by(word, L1_code) %>%
summarize(freq = sum(freq_n)) %>%
arrange(desc(freq)) %>%
ungroup() %>%
mutate(logfreq = log(freq + 1))
batch
}
logfreq_compile <- function(data) {
#initialize dataframe
logfreqd <- data.frame(word=character(),
L1_code=character(),
freq=numeric(),
logfreq=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
logfreqd <- rbind(logfreqd, logfreq(lang, data))
}
logfreqd
}
#for calculating log rank
logrank <- function(lang_code, data) {
batch <- data %>%
filter(L1_code == lang_code) %>%
group_by(word, L1_code) %>%
summarize(rank_freq = sum(rank_n)) %>%
arrange(desc(rank_freq)) %>%
ungroup()
batch$rank <- c(1:length(batch$word))
batch <- batch %>%
mutate(logrank = log(rank + 1))
batch
}
logrank_compile <- function(data) {
#initialize dataframe
logranked <- data.frame(word=character(),
L1_code=character(),
rank_freq=numeric(),
rank=numeric(),
logrank=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
logranked <- rbind(logranked, logrank(lang, data))
}
logranked
}
Word frequency-rank distributions for all words
#attach score metadata to frequency data
#separate by score type
new_data_score <- left_join(new_data, metadata)
new_data_high <- subset(new_data_score, score_range == "High")
new_data_low <- subset(new_data_score, score_range == "Low")
rm(new_data_score)
#perform binomial split on "high" and "low" subsets
#split each subset
word_freq_all_high <- split(new_data_high)
word_freq_all_low <- split(new_data_low)
#calculate log frequency for each subset
log_freq_all_high <- logfreq_compile(word_freq_all_high)
log_freq_all_low <- logfreq_compile(word_freq_all_low)
#calculate log rank for each subset
log_rank_all_high <- logrank_compile(word_freq_all_high)
log_rank_all_low <- logrank_compile(word_freq_all_low)
#combine data
log_data_all_high <- left_join(log_freq_all_high, log_rank_all_high) %>%
mutate(essay_type = "High-scoring Essays")
log_data_all_low <- left_join(log_freq_all_low, log_rank_all_low) %>%
mutate(essay_type = "Low-scoring Essays")
log_data_all <- rbind(log_data_all_high, log_data_all_low)
ggplot(log_data_all) +
geom_point(aes(logrank, logfreq)) +
geom_smooth(aes(logrank, logfreq, color = L1_code)) +
labs(title = "All words") +
facet_wrap(~essay_type) +
theme(legend.position = "bottom")

Testing Zipf’s Law
#estimate parameters p = rho, b = beta, a = alpha
#for high scores only
eqfit_high <- nlsLM(freq ~ p *(1/(rank + b)^a),
start = list(p = 1, b = 2.7, a = 1), #starting values
data = log_data_all_high)
coefs_high <- coef(eqfit_high)
p_high <- coefs_high[[1]]
b_high <- coefs_high[[2]]
a_high <- coefs_high[[3]]
##create predicted values for log frequency
log_data_all_high2 <- log_data_all_high %>%
mutate(logfreqpred = log(p_high *(1/(rank + b_high)^a_high)))
##plot predicted log frequency against actual log frequency
ggplot(log_data_all_high2) +
geom_point(aes(logrank, logfreq)) +
geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) +
labs(title = "Log rank vs. log frequency with best fit line, high-scoring essays",
x = "Log rank", y = "Log frequency",
subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$'),
caption = TeX('$\\rho = 17791.98, \\beta = 2.67, \\alpha = 1.04$'))

#for low scores only
eqfit_low <- nlsLM(freq ~ p *(1/(rank + b)^a),
start = list(p = 1, b = 2.7, a = 1), #starting values
data = log_data_all_low)
coefs_low <- coef(eqfit_low)
p_low <- coefs_low[[1]]
b_low <- coefs_low[[2]]
a_low <- coefs_low[[3]]
##create predicted values for log frequency
log_data_all_low2 <- log_data_all_low %>%
mutate(logfreqpred = log(p_low *(1/(rank + b_low)^a_low)))
##plot predicted log frequency against actual log frequency
ggplot(log_data_all_low2) +
geom_point(aes(logrank, logfreq)) +
geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) +
labs(title = "Log rank vs. log frequency with best fit line, low-scoring essays",
x = "Log rank", y = "Log frequency",
subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$'),
caption = TeX('$\\rho = 13109.11, \\beta = 3.22, \\alpha = 1.04$'))

Next, we plot the residuals against log frequencies.
#high scores only
##calculate residuals
log_data_all_high2$logfreqresid <- log_data_all_high2$logfreq - log_data_all_high2$logfreqpred
##plot log frequencies and residuals
ggplot(data = log_data_all_high2, mapping = aes(x = logfreq, y = logfreqresid)) +
geom_point() +
labs(title = "Log Frequency vs. residual, high-scoring essays",
x = "Log frequency",
y = "Residual") +
geom_hline(yintercept = 0, linetype="longdash", color = "purple")

#low scores only
##calculate residuals
log_data_all_low2$logfreqresid <- log_data_all_low2$logfreq - log_data_all_low2$logfreqpred
##plot log frequencies and residuals
ggplot(data = log_data_all_low2, mapping = aes(x = logfreq, y = logfreqresid)) +
geom_point() +
labs(title = "Log Frequency vs. residual, low-scoring essays",
x = "Log frequency",
y = "Residual") +
geom_hline(yintercept = 0, linetype="longdash", color = "purple")

Repeat for each language code and add loess estimates.
lang_list <- unique(new_data$L1_code)
#function to estimate zipf parameters and
#calculate predicted log frequencies and residuals for each language code
eqfit_lc <- function(lang_code, data) {
lcdata <- subset(data, L1_code == lang_code)
#estimate parameters
mod <- nlsLM(freq ~ p *(1/(rank + b)^a),
start = list(p = 1, b = 2.7, a = 1), #starting values
data = lcdata)
coefs <- coef(mod)
lcdata$p <- coefs[[1]]
lcdata$b <- coefs[[2]]
lcdata$a <- coefs[[3]]
#calculate predicted values
lcdata <- mutate(lcdata, logfreqpredz = log(p *(1/(rank + b)^a)))
#calculate residuals
lcdata <- mutate(lcdata, logfreqresidz = logfreq - logfreqpredz)
lcdata
}
#function to calculate loess estimates
loess50 <- function(lang_code, data, sp) {
data <- subset(data, L1_code == lang_code)
loess_mod50 <- loess(logfreq ~ logrank, data=data, span=sp)
loess_est50 <- predict(loess_mod50)
df <- as.data.frame(loess_est50)
}
#compile estimates for all language codes, high scores only
zipf_loess_est_high <- data.frame(word = character(), #initalize dataframe
L1_code=character(),
freq=integer(),
logfreq=numeric(),
rank=integer(),
logrank=numeric(),
p=numeric(),
b=numeric(),
a=numeric(),
logfreqpredz=numeric(),
logfreqresidz=numeric(),
loess_est50=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
zipf_loess_est_high <- rbind(zipf_loess_est_high, cbind(eqfit_lc(lang, log_data_all_high),
loess50(lang, log_data_all_high, 0.50)))
}
#compile estimates for all language codes, low scores only
zipf_loess_est_low <- data.frame(word = character(), #initalize dataframe
L1_code=character(),
freq=integer(),
logfreq=numeric(),
rank=integer(),
logrank=numeric(),
p=numeric(),
b=numeric(),
a=numeric(),
logfreqpredz=numeric(),
logfreqresidz=numeric(),
loess_est50=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
zipf_loess_est_low <- rbind(zipf_loess_est_low, cbind(eqfit_lc(lang, log_data_all_low),
loess50(lang, log_data_all_low, 0.50)))
}
#plot log frequencies versus zipf predictions
##high scores only
ggplot(zipf_loess_est_high) +
geom_point(aes(logrank, logfreq)) +
geom_line(aes(logrank, logfreqpredz), color = "red") +
geom_line(aes(logrank, loess_est50), color = "cyan") +
labs(title = "Log rank vs. log frequency with best fit line, by language code",
subtitle = "High-scoring essays",
x = "Log rank", y = "Log frequency",
subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
facet_wrap(~L1_code)

##low scores only
ggplot(zipf_loess_est_low) +
geom_point(aes(logrank, logfreq)) +
geom_line(aes(logrank, logfreqpredz), color = "red") +
geom_line(aes(logrank, loess_est50), color = "cyan") +
labs(title = "Log rank vs. log frequency with best fit line, by language code",
subtitle = "Low-scoring essays",
x = "Log rank", y = "Log frequency",
subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
facet_wrap(~L1_code)

#plot log frequencies and residuals
##high scores only
ggplot(data = zipf_loess_est_high, mapping = aes(x = logfreq, y = logfreqresidz)) +
geom_point() +
labs(title = "Log Frequency vs. residual by language code, high-scoring essays",
x = "Log frequency",
y = "Residual") +
geom_hline(yintercept = 0, linetype="longdash", color = "purple") +
facet_wrap(~L1_code)

##low scores only
ggplot(data = zipf_loess_est_low, mapping = aes(x = logfreq, y = logfreqresidz)) +
geom_point() +
labs(title = "Log Frequency vs. residual by language code, low-scoring essays",
x = "Log frequency",
y = "Residual") +
geom_hline(yintercept = 0, linetype="longdash", color = "purple") +
facet_wrap(~L1_code)

Zipfian distribution parameter estimates
zipf_param_high <- zipf_loess_est_high %>%
select(L1_code, p, b, a) %>%
unique() %>%
transform(b = round(b, 2)) %>%
transform(a = round(a, 2)) %>%
arrange(desc(b))
rownames(zipf_param_high) <- NULL
zipf_param_high %>%
kable(format = "html", caption = "High-scoring essays",
col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
High-scoring essays
|
L1_code
|
rho
|
beta
|
alpha
|
|
KOR
|
15284.74
|
3.71
|
1.07
|
|
GER
|
34649.03
|
3.62
|
1.09
|
|
FRE
|
25505.80
|
3.47
|
1.09
|
|
JPN
|
11994.46
|
3.42
|
1.06
|
|
ARA
|
12693.26
|
3.40
|
1.09
|
|
ITA
|
18942.78
|
3.40
|
1.11
|
|
SPA
|
26469.85
|
3.36
|
1.10
|
|
HIN
|
22202.32
|
1.99
|
1.02
|
|
TUR
|
14834.17
|
1.92
|
0.99
|
|
CHI
|
11799.37
|
1.74
|
0.98
|
|
TEL
|
11579.37
|
1.24
|
0.96
|
zipf_param_low <- zipf_loess_est_low %>%
select(L1_code, p, b, a) %>%
unique() %>%
transform(b = round(b, 2)) %>%
transform(a = round(a, 2)) %>%
arrange(desc(b))
rownames(zipf_param_low) <- NULL
zipf_param_low %>%
kable(format = "html", caption = "Low-scoring essays",
col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
Low-scoring essays
|
L1_code
|
rho
|
beta
|
alpha
|
|
JPN
|
28433.009
|
5.90
|
1.14
|
|
KOR
|
24845.454
|
5.69
|
1.12
|
|
GER
|
6905.231
|
3.95
|
1.09
|
|
FRE
|
13852.982
|
3.86
|
1.10
|
|
TUR
|
12361.966
|
3.53
|
1.04
|
|
ARA
|
18372.062
|
3.28
|
1.05
|
|
ITA
|
16242.756
|
2.85
|
1.07
|
|
SPA
|
11690.797
|
2.79
|
1.06
|
|
HIN
|
5692.325
|
2.27
|
0.99
|
|
CHI
|
9886.426
|
1.74
|
0.94
|
|
TEL
|
9071.447
|
1.48
|
0.94
|
Pearson correlations between Zipf and loess estimates by language code
#function to calculate correlations and correlation p-value using Pearson test
get_corr <- function(lang_code, data) {
data2 <- subset(data, L1_code == lang_code)
data3 <- select(data2, -L1_code)
x <- as.matrix(data3)
c <- cor.test(x[,1], x[,2], method = "pearson", conf.level = 0.95)
cor_val <- round(c$estimate[[1]], 4)
cor_CI <- c$conf.int
cor_CI_lower <- round(cor_CI[1], 4)
cor_CI_upper <- round(cor_CI[2], 4)
row <- as.data.frame(t(c(lang_code, cor_val, cor_CI_lower, cor_CI_upper)))
names(row) <- c("L1_code", "cor_val", "cor_CI_lower", "cor_CI_upper")
rownames(row) <- NULL
row
}
#high scores
zipf_loess_est_sum_high <- zipf_loess_est_high %>%
select(L1_code, logfreqpredz, loess_est50)
##compile correlations for all language codes
zipf_loess_cor_high <- data.frame(L1_code=character(), #initalize dataframe
cor_val=numeric(),
cor_CI_lower=numeric(),
cor_CI_upper=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
zipf_loess_cor_high <- rbind(zipf_loess_cor_high, get_corr(lang, zipf_loess_est_sum_high)) %>%
transform(cor_val = as.numeric(as.character(cor_val))) %>%
transform(cor_CI_lower = as.numeric(as.character(cor_CI_lower))) %>%
transform(cor_CI_upper = as.numeric(as.character(cor_CI_upper))) %>%
arrange(desc(cor_val))
}
##re-order factor levels for language code, for graphing purposes
zipf_loess_cor_high$L1_code <- factor(zipf_loess_cor_high$L1_code,
levels = zipf_loess_cor_high$L1_code)
#low scores
zipf_loess_est_sum_low <- zipf_loess_est_low %>%
select(L1_code, logfreqpredz, loess_est50)
##compile correlations for all language codes
zipf_loess_cor_low <- data.frame(L1_code=character(), #initalize dataframe
cor_val=numeric(),
cor_CI_lower=numeric(),
cor_CI_upper=numeric())
for(i in seq_along(lang_list)) {
lang <- lang_list[i]
zipf_loess_cor_low <- rbind(zipf_loess_cor_low, get_corr(lang, zipf_loess_est_sum_low)) %>%
transform(cor_val = as.numeric(as.character(cor_val))) %>%
transform(cor_CI_lower = as.numeric(as.character(cor_CI_lower))) %>%
transform(cor_CI_upper = as.numeric(as.character(cor_CI_upper))) %>%
arrange(desc(cor_val))
}
##re-order factor levels for language code, for graphing purposes
zipf_loess_cor_low$L1_code <- factor(zipf_loess_cor_low$L1_code,
levels = zipf_loess_cor_low$L1_code)
##correlations table
zipf_loess_cor_high %>%
mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
select(-cor_CI_lower, -cor_CI_upper) %>%
kable(format = "html", caption = "High-scoring essays",
col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
High-scoring essays
|
L1_code
|
Pearson coefficient
|
95% Conf. Interval
|
|
GER
|
0.9388
|
(0.9362, 0.9412)
|
|
HIN
|
0.9320
|
(0.9294, 0.9344)
|
|
TUR
|
0.9307
|
(0.9277, 0.9336)
|
|
SPA
|
0.9290
|
(0.926, 0.9319)
|
|
FRE
|
0.9262
|
(0.9231, 0.9292)
|
|
CHI
|
0.9244
|
(0.921, 0.9276)
|
|
ITA
|
0.9242
|
(0.9206, 0.9275)
|
|
TEL
|
0.9210
|
(0.9177, 0.9242)
|
|
JPN
|
0.9203
|
(0.9162, 0.9243)
|
|
KOR
|
0.9188
|
(0.9149, 0.9225)
|
|
ARA
|
0.9076
|
(0.9032, 0.9119)
|
zipf_loess_cor_low %>%
mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
select(-cor_CI_lower, -cor_CI_upper) %>%
kable(format = "html", caption = "Low-scoring essays",
col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
Low-scoring essays
|
L1_code
|
Pearson coefficient
|
95% Conf. Interval
|
|
TUR
|
0.9243
|
(0.9204, 0.9281)
|
|
CHI
|
0.9229
|
(0.9191, 0.9265)
|
|
JPN
|
0.9213
|
(0.9175, 0.925)
|
|
ITA
|
0.9175
|
(0.9133, 0.9214)
|
|
KOR
|
0.9174
|
(0.9134, 0.9211)
|
|
TEL
|
0.9114
|
(0.9072, 0.9153)
|
|
SPA
|
0.9048
|
(0.8998, 0.9096)
|
|
FRE
|
0.9027
|
(0.8976, 0.9075)
|
|
ARA
|
0.8982
|
(0.8937, 0.9026)
|
|
GER
|
0.8897
|
(0.8828, 0.8961)
|
|
HIN
|
0.8860
|
(0.88, 0.8918)
|
#combine correlations data for single correlations plot
zipf_loess_cor_high <- zipf_loess_cor_high %>%
mutate(score_type = "high")
zipf_loess_cor_low <- zipf_loess_cor_low %>%
mutate(score_type = "low")
zipf_loess_cor <- rbind(zipf_loess_cor_high, zipf_loess_cor_low)
#correlations plot
ggplot(zipf_loess_cor) +
labs(title = "Pearson coefficients for Zipf and loess estimates with 95% CIs",
x = "Language code", y = "Correlation") +
geom_point(aes(L1_code, cor_val)) +
geom_segment(data = zipf_loess_cor,
aes(x = L1_code, y = cor_CI_lower, xend = L1_code, yend = cor_CI_upper,
color = score_type))
