In this document, I test the predictions of Zipf’s law of word frequency distribution using TOEFL essays. I group the essays by the test-takers’ first (primary) language and by essay score to see whether, and how, first language and score affect the word frequency distribution.
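
Zipf’s law, in the generalized Zipf–Mandelbrot form fitted below, predicts that a word’s frequency falls off as a power of its frequency rank: $frequency = \rho \cdot (1/(rank + \beta)^\alpha)$, where $\rho$ is a scale parameter, $\beta$ shifts the ranks of the most frequent words, and $\alpha \approx 1$ in the classic formulation.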

knitr::opts_chunk$set(echo = TRUE, message = FALSE, warning = FALSE, cache = TRUE)

library(tidyverse)
library(feather)
library(minpack.lm)
library(latex2exp)
library(knitr)

new_data <- read_feather("essay_word_counts_clean.feather")
metadata <- read.csv("merged_metadata.csv") %>%
  mutate(L1_code = as.character(L1_code),
         essay_id = as.character(essay_id))

set.seed(1234)
theme_set(theme_bw())
metadata <- metadata %>%
  mutate(score_range = ifelse(score <= 3, "Low", "High")) %>% #scores of 3 or lower are "Low"
  na.omit()

score_avg <- metadata %>%
  group_by(L1_code) %>%
  summarize(avg = mean(score)) %>%
  arrange(desc(avg))

#reorder language codes by descending average score, for graphing purposes
metadata2 <- metadata %>%
  mutate(L1_code = factor(L1_code, levels = score_avg$L1_code))

ggplot(metadata2, aes(L1_code, score)) + 
  geom_boxplot() + 
  labs(title = "Score distributions for TOEFL essays, by language", 
       x = "Language code", y = "Score")

metadata_count <- metadata2 %>%
  group_by(L1_code, score_range) %>%
  summarize(count = n())
  
ggplot(metadata_count, aes(L1_code, count)) + 
  geom_bar(aes(fill = score_range), stat = "identity", position = "stack", alpha = 0.7) + 
  labs(title = "Low vs. high scores for TOEFL essays, by language",
       subtitle = "High = higher than 3, Low = 3 or lower",
       x = "Language code", y = "Count",
       fill = "Score\ntype")

lang_list <- unique(new_data$L1_code)

#split each word's count into two independent corpora: one half for estimating
#frequency, the other for estimating frequency rank, so that the two estimates
#do not share sampling noise
binom_split <- function(data) {
  data %>%
    mutate(freq_n = rbinom(n = n(), size = freq, prob = 0.5),
           rank_n = freq - freq_n) %>%
    rename(n = freq)
}
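
#as a quick illustration of the split, a few toy counts (hypothetical words,
#not drawn from the TOEFL data):
toy <- data.frame(word = c("the", "of", "zipf"), freq = c(100, 40, 7))
binom_split(toy)
#in every row, freq_n + rank_n equals the original count (column n), so no
#token is lost; ranks are later computed from one half and frequencies from the other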

#for calculating log frequency
logfreq <- function(lang_code, data) {
  batch <- data %>%
    filter(L1_code == lang_code) %>%
    group_by(word, L1_code) %>%
    summarize(freq = sum(freq_n)) %>%
    arrange(desc(freq)) %>%
    ungroup() %>%
    mutate(logfreq = log(freq + 1))
  batch
}

logfreq_compile <- function(data) {
  #stack the per-language log-frequency tables into one data frame
  map_dfr(lang_list, logfreq, data = data)
}

#for calculating log rank

logrank <- function(lang_code, data) {
  data %>%
    filter(L1_code == lang_code) %>%
    group_by(word, L1_code) %>%
    summarize(rank_freq = sum(rank_n)) %>%
    arrange(desc(rank_freq)) %>%
    ungroup() %>%
    mutate(rank = row_number(), #rank 1 = most frequent word
           logrank = log(rank + 1))
}

logrank_compile <- function(data) {
  #stack the per-language log-rank tables into one data frame
  map_dfr(lang_list, logrank, data = data)
}

Word frequency-rank distributions for all words

#attach score metadata to frequency data

#separate by score type
new_data_score <- left_join(new_data, metadata)
new_data_high <- subset(new_data_score, score_range == "High")
new_data_low <- subset(new_data_score, score_range == "Low")
rm(new_data_score)

#perform the binomial split on the "high" and "low" subsets
word_freq_all_high <- binom_split(new_data_high)
word_freq_all_low <- binom_split(new_data_low)

#calculate log frequency for each subset
log_freq_all_high <- logfreq_compile(word_freq_all_high)
log_freq_all_low <- logfreq_compile(word_freq_all_low)

#calculate log rank for each subset
log_rank_all_high <- logrank_compile(word_freq_all_high)
log_rank_all_low <- logrank_compile(word_freq_all_low)

#combine data
log_data_all_high <- left_join(log_freq_all_high, log_rank_all_high) %>%
  mutate(essay_type = "High-scoring Essays")

log_data_all_low <- left_join(log_freq_all_low, log_rank_all_low) %>%
  mutate(essay_type = "Low-scoring Essays")

log_data_all <- rbind(log_data_all_high, log_data_all_low)

ggplot(log_data_all) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_smooth(aes(logrank, logfreq, color = L1_code)) + 
  labs(title = "All words") +
  facet_wrap(~essay_type) + 
  theme(legend.position = "bottom")

Testing Zipf’s Law
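
I fit the Zipf–Mandelbrot form $frequency = \rho \cdot (1/(rank + \beta)^\alpha)$ directly to the raw frequencies with nlsLM() from minpack.lm (Levenberg–Marquardt nonlinear least squares), estimating the scale $\rho$, the rank offset $\beta$, and the exponent $\alpha$ separately for the high- and low-scoring subsets.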

#estimate parameters p = rho, b = beta, a = alpha

#for high scores only

eqfit_high <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = log_data_all_high)

coefs_high <- coef(eqfit_high)
p_high <- coefs_high[[1]]
b_high <- coefs_high[[2]]
a_high <- coefs_high[[3]]
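
##optional check (a sketch; the printed output depends on the data):
##summary() on the nlsLM fit reports standard errors for rho, beta, and alpha
summary(eqfit_high)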

##create predicted values for log frequency
log_data_all_high2 <- log_data_all_high %>%
  mutate(logfreqpred = log(p_high *(1/(rank + b_high)^a_high)))

##plot predicted log frequency against actual log frequency
ggplot(log_data_all_high2) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) + 
  labs(title = "Log rank vs. log frequency with best fit line, high-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha)$'),
       caption = TeX('$\\rho = 17791.98, \\beta = 2.67, \\alpha = 1.04$'))

#for low scores only
eqfit_low <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = log_data_all_low)

coefs_low <- coef(eqfit_low)
p_low <- coefs_low[[1]]
b_low <- coefs_low[[2]]
a_low <- coefs_low[[3]]

##create predicted values for log frequency
log_data_all_low2 <- log_data_all_low %>%
  mutate(logfreqpred = log(p_low *(1/(rank + b_low)^a_low)))

##plot predicted log frequency against actual log frequency
ggplot(log_data_all_low2) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpred), color = "red", size = 1.6) + 
  labs(title = "Log rank vs. log frequency with best fit line, low-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha)$'),
       caption = TeX('$\\rho = 13109.11, \\beta = 3.22, \\alpha = 1.04$'))

Next, I plot the residuals against the log frequencies. If Zipf’s law describes the data well, the residuals should scatter evenly around zero with no systematic curvature.

#high scores only

##calculate residuals
log_data_all_high2$logfreqresid <- log_data_all_high2$logfreq - log_data_all_high2$logfreqpred

##plot log frequencies and residuals
ggplot(data = log_data_all_high2, mapping = aes(x = logfreq, y = logfreqresid)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual, high-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple")

#low scores only

##calculate residuals
log_data_all_low2$logfreqresid <- log_data_all_low2$logfreq - log_data_all_low2$logfreqpred

##plot log frequencies and residuals
ggplot(data = log_data_all_low2, mapping = aes(x = logfreq, y = logfreqresid)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual, low-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple")

Next, I repeat the fit for each language code and add loess estimates. The loess curve serves as a flexible nonparametric benchmark: the more closely the Zipf fit tracks it, the better the power-law form describes that language’s data.

lang_list <- unique(new_data$L1_code)

#function to estimate zipf parameters and 
#calculate predicted log frequencies and residuals for each language code

eqfit_lc <- function(lang_code, data) {
  lcdata <- subset(data, L1_code == lang_code)
  #estimate parameters
  mod <- nlsLM(freq ~ p *(1/(rank + b)^a),
              start = list(p = 1, b = 2.7, a = 1), #starting values
              data = lcdata)
  coefs <- coef(mod)
  lcdata$p <- coefs[[1]]
  lcdata$b <- coefs[[2]]
  lcdata$a <- coefs[[3]]  
  
  #calculate predicted values
  lcdata <- mutate(lcdata, logfreqpredz = log(p *(1/(rank + b)^a)))
  
  #calculate residuals
  lcdata <- mutate(lcdata, logfreqresidz = logfreq - logfreqpredz)
  
  lcdata
}

#function to calculate loess estimates

loess50 <- function(lang_code, data, sp) {
  lcdata <- subset(data, L1_code == lang_code)
  #span = sp sets the smoothing window; 0.50 fits each point on the nearest half of the data
  loess_mod50 <- loess(logfreq ~ logrank, data = lcdata, span = sp)
  #predict() with no newdata returns fitted values in input row order, which
  #keeps the cbind() below aligned with eqfit_lc()'s rows
  loess_est50 <- predict(loess_mod50)
  as.data.frame(loess_est50) #the column inherits the name "loess_est50"
}

#compile estimates for all language codes, high scores only

zipf_loess_est_high <- map_dfr(lang_list, function(lang) {
  cbind(eqfit_lc(lang, log_data_all_high),
        loess50(lang, log_data_all_high, 0.50))
})

#compile estimates for all language codes, low scores only

zipf_loess_est_low <- map_dfr(lang_list, function(lang) {
  cbind(eqfit_lc(lang, log_data_all_low),
        loess50(lang, log_data_all_low, 0.50))
})

#plot log frequencies versus zipf predictions

##high scores only
ggplot(zipf_loess_est_high) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpredz), color = "red") + 
  geom_line(aes(logrank, loess_est50), color = "cyan") + 
  labs(title = "Log rank vs. log frequency with best fit line, by language code",
       subtitle = "High-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
  facet_wrap(~L1_code)

##low scores only

ggplot(zipf_loess_est_low) + 
  geom_point(aes(logrank, logfreq)) + 
  geom_line(aes(logrank, logfreqpredz), color = "red") + 
  geom_line(aes(logrank, loess_est50), color = "cyan") + 
  labs(title = "Log rank vs. log frequency with best fit line, by language code",
       subtitle = "Low-scoring essays",
       x = "Log rank", y = "Log frequency",
       subtitle = TeX('$frequency = \\rho * (1/(rank + \\beta)^\\alpha$')) +
  facet_wrap(~L1_code)

#plot log frequencies and residuals

##high scores only
ggplot(data = zipf_loess_est_high, mapping = aes(x = logfreq, y = logfreqresidz)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual by language code, high-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple") + 
  facet_wrap(~L1_code)

##low scores only
ggplot(data = zipf_loess_est_low, mapping = aes(x = logfreq, y = logfreqresidz)) +
  geom_point() + 
  labs(title = "Log Frequency vs. residual by language code, low-scoring essays",
       x = "Log frequency", 
       y = "Residual") + 
  geom_hline(yintercept = 0, linetype="longdash", color = "purple") + 
  facet_wrap(~L1_code)

Zipfian distribution parameter estimates

zipf_param_high <- zipf_loess_est_high %>%
  distinct(L1_code, p, b, a) %>%
  mutate(b = round(b, 2),
         a = round(a, 2)) %>%
  arrange(desc(b))

rownames(zipf_param_high) <- NULL

zipf_param_high %>%
  kable(format = "html", caption = "High-scoring essays",
        col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
High-scoring essays

| L1_code | rho      | beta | alpha |
|---------|----------|------|-------|
| KOR     | 15284.74 | 3.71 | 1.07  |
| GER     | 34649.03 | 3.62 | 1.09  |
| FRE     | 25505.80 | 3.47 | 1.09  |
| JPN     | 11994.46 | 3.42 | 1.06  |
| ARA     | 12693.26 | 3.40 | 1.09  |
| ITA     | 18942.78 | 3.40 | 1.11  |
| SPA     | 26469.85 | 3.36 | 1.10  |
| HIN     | 22202.32 | 1.99 | 1.02  |
| TUR     | 14834.17 | 1.92 | 0.99  |
| CHI     | 11799.37 | 1.74 | 0.98  |
| TEL     | 11579.37 | 1.24 | 0.96  |

zipf_param_low <- zipf_loess_est_low %>%
  distinct(L1_code, p, b, a) %>%
  mutate(b = round(b, 2),
         a = round(a, 2)) %>%
  arrange(desc(b))

rownames(zipf_param_low) <- NULL

zipf_param_low %>%
  kable(format = "html", caption = "Low-scoring essays",
        col.names = c("L1_code", "rho", "beta", "alpha"), align = 'l', table.attr = "style='width:70%;'")
Low-scoring essays

| L1_code | rho       | beta | alpha |
|---------|-----------|------|-------|
| JPN     | 28433.009 | 5.90 | 1.14  |
| KOR     | 24845.454 | 5.69 | 1.12  |
| GER     |  6905.231 | 3.95 | 1.09  |
| FRE     | 13852.982 | 3.86 | 1.10  |
| TUR     | 12361.966 | 3.53 | 1.04  |
| ARA     | 18372.062 | 3.28 | 1.05  |
| ITA     | 16242.756 | 2.85 | 1.07  |
| SPA     | 11690.797 | 2.79 | 1.06  |
| HIN     |  5692.325 | 2.27 | 0.99  |
| CHI     |  9886.426 | 1.74 | 0.94  |
| TEL     |  9071.447 | 1.48 | 0.94  |

Pearson correlations between Zipf and loess estimates by language code

#function to calculate the Pearson correlation (and its 95% confidence
#interval) between the Zipf and loess estimates for one language code

get_corr <- function(lang_code, data) {
  lcdata <- subset(data, L1_code == lang_code)
  ct <- cor.test(lcdata$logfreqpredz, lcdata$loess_est50,
                 method = "pearson", conf.level = 0.95)
  data.frame(L1_code = lang_code,
             cor_val = round(ct$estimate[[1]], 4),
             cor_CI_lower = round(ct$conf.int[1], 4),
             cor_CI_upper = round(ct$conf.int[2], 4))
}
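
#quick single-language check (illustrative; assumes "GER" appears in L1_code)
get_corr("GER", select(zipf_loess_est_high, L1_code, logfreqpredz, loess_est50))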

#high scores

zipf_loess_est_sum_high <- zipf_loess_est_high %>%
  select(L1_code, logfreqpredz, loess_est50)

##compile correlations for all language codes

zipf_loess_cor_high <- map_dfr(lang_list, get_corr, data = zipf_loess_est_sum_high) %>%
  arrange(desc(cor_val))

##re-order factor levels for language code, for graphing purposes
zipf_loess_cor_high$L1_code <- factor(zipf_loess_cor_high$L1_code, 
                                      levels = zipf_loess_cor_high$L1_code)

#low scores

zipf_loess_est_sum_low <- zipf_loess_est_low %>%
  select(L1_code, logfreqpredz, loess_est50)

##compile correlations for all language codes

zipf_loess_cor_low <- map_dfr(lang_list, get_corr, data = zipf_loess_est_sum_low) %>%
  arrange(desc(cor_val))

##re-order factor levels for language code, for graphing purposes
zipf_loess_cor_low$L1_code <- factor(zipf_loess_cor_low$L1_code, 
                                      levels = zipf_loess_cor_low$L1_code)
##correlations table
zipf_loess_cor_high %>%
  mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
  select(-cor_CI_lower, -cor_CI_upper) %>%
  kable(format = "html", caption = "High-scoring essays",
        col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
High-scoring essays

| L1_code | Pearson coefficient | 95% Conf. Interval |
|---------|---------------------|--------------------|
| GER     | 0.9388              | (0.9362, 0.9412)   |
| HIN     | 0.9320              | (0.9294, 0.9344)   |
| TUR     | 0.9307              | (0.9277, 0.9336)   |
| SPA     | 0.9290              | (0.9260, 0.9319)   |
| FRE     | 0.9262              | (0.9231, 0.9292)   |
| CHI     | 0.9244              | (0.9210, 0.9276)   |
| ITA     | 0.9242              | (0.9206, 0.9275)   |
| TEL     | 0.9210              | (0.9177, 0.9242)   |
| JPN     | 0.9203              | (0.9162, 0.9243)   |
| KOR     | 0.9188              | (0.9149, 0.9225)   |
| ARA     | 0.9076              | (0.9032, 0.9119)   |

zipf_loess_cor_low %>%
  mutate(cor_CI = paste0("(", cor_CI_lower, ", ", cor_CI_upper, ")")) %>%
  select(-cor_CI_lower, -cor_CI_upper) %>%
  kable(format = "html", caption = "Low-scoring essays",
        col.names = c("L1_code", "Pearson coefficient", "95% Conf. Interval"), align = 'l', table.attr = "style='width:70%;'")
Low-scoring essays

| L1_code | Pearson coefficient | 95% Conf. Interval |
|---------|---------------------|--------------------|
| TUR     | 0.9243              | (0.9204, 0.9281)   |
| CHI     | 0.9229              | (0.9191, 0.9265)   |
| JPN     | 0.9213              | (0.9175, 0.9250)   |
| ITA     | 0.9175              | (0.9133, 0.9214)   |
| KOR     | 0.9174              | (0.9134, 0.9211)   |
| TEL     | 0.9114              | (0.9072, 0.9153)   |
| SPA     | 0.9048              | (0.8998, 0.9096)   |
| FRE     | 0.9027              | (0.8976, 0.9075)   |
| ARA     | 0.8982              | (0.8937, 0.9026)   |
| GER     | 0.8897              | (0.8828, 0.8961)   |
| HIN     | 0.8860              | (0.8800, 0.8918)   |

#combine correlations data for single correlations plot

zipf_loess_cor_high <- zipf_loess_cor_high %>%
  mutate(score_type = "high")

zipf_loess_cor_low <- zipf_loess_cor_low %>%
  mutate(score_type = "low")

zipf_loess_cor <- rbind(zipf_loess_cor_high, zipf_loess_cor_low)

#correlations plot

ggplot(zipf_loess_cor) +
  geom_point(aes(L1_code, cor_val)) + 
  geom_segment(aes(x = L1_code, y = cor_CI_lower, xend = L1_code, yend = cor_CI_upper,
                   color = score_type)) +
  labs(title = "Pearson coefficients for Zipf and loess estimates with 95% CIs",
       x = "Language code", y = "Correlation",
       color = "Score\ntype")