Read in doctag indices, docvecs, and metadata
# Load the merged essay metadata from CSV.
metadata <- read_csv("../all_data/merged_metadata.csv")

# Turn every character column into a factor, then force essay_id back to
# character (the second step runs after the factor conversion).
metadata_clean <- mutate(
  mutate_if(metadata, is.character, as.factor),
  essay_id = as.character(essay_id)
)

# Load the per-essay word counts and attach the cleaned metadata to them.
# No `by` is given, so the join uses whichever columns the two tables share.
essay_word_counts <- read_feather("../all_data/essay_word_counts.feather")
wc_clean <- left_join(essay_word_counts, metadata_clean)
# Sort rows by descending count, then number them within each L1_code group
# (row_number() restarts at 1 per group) and compute count / total.
# The result stays grouped by L1_code.
freq_by_rank <- mutate(
  arrange(group_by(word_counts_by_lang, L1_code), desc(count)),
  rank = row_number(),
  `term frequency` = count / total
)

# Keep only ranks strictly between 50 and 2000 for the fit.
rank_subset <- filter(freq_by_rank, rank > 50, rank < 2000)

# Single linear fit of log10(term frequency) on log10(rank), pooled over all
# L1_code groups in the subset.
mod <- lm(log10(`term frequency`) ~ log10(rank), data = rank_subset)
# Rank vs. term frequency on log10 axes, one line per L1_code, restricted to
# ranks strictly between 50 and 2000. The dashed black line is drawn from the
# intercept and slope of `mod`.
ggplot(
  filter(freq_by_rank, rank > 50, rank < 2000),
  aes(x = rank, y = `term frequency`, color = L1_code)
) +
  geom_line(size = 0.8, alpha = 0.8) +
  geom_abline(
    intercept = coef(mod)[[1]],
    slope = coef(mod)[[2]],
    color = "black",
    linetype = 2
  ) +
  scale_x_log10() +
  scale_y_log10() +
  labs(
    title = "Zipf's law",
    x = "log rank",
    y = "log normalized term frequency"
  ) +
  theme_minimal()
# Eyeball the vocabulary: take the distinct values of word_counts_by_lang$word,
# wrap them in a one-column data frame, draw 50 rows at random, and print the
# result as a list. The list element is named after the deparsed
# `unique(word_counts_by_lang$word)` expression, so the printed header depends
# on this exact call. No seed is set on this line, so the sample is presumably
# different on each run.
as.list(sample_n(as.data.frame(unique(word_counts_by_lang$word)), 50))
## $`unique(word_counts_by_lang$word)`
## [1] neigbours health-related groundsand
## [4] treasuring adulthood exhaustion
## [7] expositions physical specilaizing
## [10] otherhand rngkj specificly
## [13] non-philosophical decides scool
## [16] weight-loss giuge advertismente
## [19] substances foot-ball neibourghoods
## [22] whicj speciallities concieved
## [25] threr romancing bundeskanzler
## [28] conditon including traway
## [31] navigate ppirt regardless
## [34] inindia manuplate wideranged
## [37] puzzeld econmic sovles
## [40] evertything fines confidant
## [43] money-consuming blockling tehy
## [46] soalr conseme sin
## [49] sophosticated snakes
## 61382 Levels: _ __ ... zurich
The sampled vocabulary contains a lot of misspellings. Could they be the source of the semantic similarity?
Get binomial-split corpus as in Piantadosi (2014)
Plot residuals
MISC
By gender
By score
By age