Read in doctag indices, docvecs, and metadata
doctag_indices <- read_feather("../all_data/doctag_indices.feather")
docvecs <- read_feather("../all_data/docvecs.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv")
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code) %>%
mutate(prompt_id = fct_recode(prompt_id,
broad.academics = "VC079857",
young.enjoy = "VC139555",
young.help = "VC182427",
advertisements = "VC199494",
fewer.cars = "VC199744",
tour.travel = "VC212639",
students.facts = "VC247638",
success.risks = "VC251653"))
Scores by language
d %>%
group_by(L1_code) %>%
multi_boot_standard(col = "score", na.rm = T) %>%
arrange(mean) %>%
kable()
| L1_code | ci_lower | ci_upper | mean |
|---|---|---|---|
| ARA | 2.812475 | 2.927500 | 2.8695 |
| JPN | 2.908475 | 3.016025 | 2.9615 |
| KOR | 3.063975 | 3.166513 | 3.1165 |
| ITA | 3.154488 | 3.266012 | 3.2090 |
| CHI | 3.213963 | 3.307513 | 3.2590 |
| TEL | 3.321487 | 3.422525 | 3.3720 |
| TUR | 3.406500 | 3.511038 | 3.4590 |
| FRE | 3.502000 | 3.606012 | 3.5535 |
| SPA | 3.514500 | 3.621500 | 3.5670 |
| HIN | 3.831975 | 3.930013 | 3.8815 |
| GER | 3.872975 | 3.957012 | 3.9115 |
Get TSNE coorinates
mats <- d %>%
select(doc_vector) %>%
unnest() %>%
as.matrix()
tsne_out = Rtsne::Rtsne(mats)
tsne_dims <- tsne_out$Y %>%
as.data.frame() %>%
rename(tsne_X = V1,
tsne_Y = V2) %>%
bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
select(everything(), tsne_X, tsne_Y)
get_group_centroid <- function(doc_vecs_df, group_name){
centroid <- doc_vecs_df %>%
select(doc_vector) %>%
unnest(doc_vector) %>%
colMeans()
this_group_name <- doc_vecs_df %>%
select_(group_name) %>%
slice(1) %>%
unlist()
df <- data.frame(t(centroid)) %>%
mutate(x = this_group_name) %>%
select(x, everything())
names(df)[1] = c(as.name(group_name))
df
}
tsne_centroids_L1_score_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(score)) %>%
group_by(L1_code, prompt_id) %>%
mutate(score_bin = as.factor(ntile(score, 2))) %>%
ungroup() %>%
mutate(L1_prompt_score = paste(L1_code, prompt_id, score_bin, sep = "_")) %>%
split(.$L1_prompt_score) %>%
map_df(get_group_centroid, "L1_prompt_score") %>%
separate(L1_prompt_score, c("L1_code", "prompt_id", "score_bin"), sep = "_")
tsne_centroids_L1_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(score)) %>%
mutate(L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_score_language = tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
group_by(L1_code, prompt_id) %>%
filter(!is.na(score)) %>%
mutate(score_bin = as.factor(ntile(score, 2))) %>%
unnest()
ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin, color = score_bin)) +
geom_encircle(alpha = .2) +
facet_grid(L1_code ~ prompt_id, scales = "free") +
#geom_point(data = tsne_centroids_L1_score_prompt, size = .5) +
geom_point(data = mutate(tsne_centroids_L1_prompt, score_bin = NA), size = .8, color = "black") +
theme_bw()
tsne_centroids_score_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(score)) %>%
group_by(prompt_id) %>%
mutate(score_bin = as.factor(ntile(score, 2)),
prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
ungroup() %>%
split(.$prompt_score) %>%
map_df(get_group_centroid, "prompt_score") %>%
separate(prompt_score, c( "prompt_id", "score_bin"), sep = "_")
tsne_centroids_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(score)) %>%
split(.$prompt_id) %>%
map_df(get_group_centroid, "prompt_id")
ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin, color = score_bin)) +
geom_point(size = .2)+
geom_encircle(alpha = .2) +
facet_wrap(~ prompt_id, scales = "free") +
geom_point(data = tsne_centroids_score_prompt, color = "black") +
theme_bw()
ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin, color = score_bin)) +
geom_density2d(alpha=.5) +
facet_grid(score_bin ~ prompt_id, scales = "free") +
geom_point(data = tsne_centroids_score_prompt, color = "black") +
theme_bw()
ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y,
fill = score_bin, color = score_bin)) +
geom_point(data = tsne_centroids_score_prompt) +
theme_bw()
The is the distance from the lanugage-prompt mean as a function of score and language.
tsne_dims_with_centers <- tsne_dims %>%
left_join(tsne_centroids_L1_prompt %>%
rename(tsne_X_prompt_L1 = tsne_X,
tsne_Y_prompt_L1 = tsne_Y), by = c("prompt_id", "L1_code")) %>%
rowwise() %>%
mutate(dist_from_prompt_L1_center = dist(rbind(c(tsne_X_prompt_L1,
tsne_Y_prompt_L1),
c(tsne_X, tsne_Y)),
method = "euclidean")[1])
model = lme4::lmer(dist_from_prompt_L1_center ~ L1_code + score +
(1|prompt_id), tsne_dims_with_centers)
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: dist_from_prompt_L1_center ~ L1_code + score + (1 | prompt_id)
## Data: tsne_dims_with_centers
##
## REML criterion at convergence: 51550.8
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.7563 -0.5577 -0.1373 0.3430 16.1961
##
## Random effects:
## Groups Name Variance Std.Dev.
## prompt_id (Intercept) 0.2375 0.4873
## Residual 6.3183 2.5136
## Number of obs: 11000, groups: prompt_id, 8
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 4.41792 0.20695 21.348
## L1_codeCHI -0.37241 0.11297 -3.296
## L1_codeFRE -0.51936 0.11437 -4.541
## L1_codeGER -0.45782 0.11674 -3.922
## L1_codeHIN -0.76156 0.11651 -6.537
## L1_codeITA -0.69088 0.11346 -6.089
## L1_codeJPN -0.41875 0.11245 -3.724
## L1_codeKOR -0.27082 0.11264 -2.404
## L1_codeSPA -0.37921 0.11440 -3.315
## L1_codeTEL -0.63923 0.11390 -5.612
## L1_codeTUR -0.23637 0.11410 -2.072
## score -0.19477 0.02879 -6.766
##
## Correlation of Fixed Effects:
## (Intr) L1_CHI L1_FRE L1_GER L1_HIN L1_ITA L1_JPN L1_KOR L1_SPA
## L1_codeCHI -0.231
## L1_codeFRE -0.197 0.506
## L1_codeGER -0.158 0.504 0.522
## L1_codeHIN -0.161 0.504 0.521 0.535
## L1_codeITA -0.233 0.502 0.507 0.507 0.505
## L1_codeJPN -0.262 0.500 0.495 0.487 0.488 0.497
## L1_codeKOR -0.246 0.503 0.501 0.496 0.497 0.500 0.500
## L1_codeSPA -0.197 0.507 0.515 0.517 0.516 0.502 0.495 0.501
## L1_codeTEL -0.217 0.503 0.510 0.514 0.516 0.506 0.496 0.500 0.505
## L1_codeTUR -0.207 0.505 0.515 0.518 0.517 0.508 0.496 0.501 0.512
## score -0.399 -0.099 -0.174 -0.260 -0.252 -0.090 -0.023 -0.063 -0.175
## L1_TEL L1_TUR
## L1_codeCHI
## L1_codeFRE
## L1_codeGER
## L1_codeHIN
## L1_codeITA
## L1_codeJPN
## L1_codeKOR
## L1_codeSPA
## L1_codeTEL
## L1_codeTUR 0.510
## score -0.128 -0.152
ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin)) +
geom_point(size = .2, aes(color = score_bin)) +
facet_grid(L1_code ~ prompt_id, scales = "free") +
geom_point(data = mutate(tsne_centroids_L1_prompt, score_bin = NA), size = .8, color = "black") +
theme_bw()
Languages further away have lower scores (contra what I said earlier); there’s still an effect of language, controlling for score.
Older people look different from younger and males look different from females
get_group_centroid_with_CI <- function(doc_vecs_df, group_name){
centroid <- doc_vecs_df %>%
select(doc_vector) %>%
unnest(doc_vector) %>%
gather("coordinate_name", "value") %>%
group_by(coordinate_name) %>%
multi_boot_standard(col = "value") %>%
rename(center = mean) %>%
gather(variable, value, -coordinate_name) %>%
unite(temp, coordinate_name, variable) %>%
spread(temp, value)
this_group_name <- doc_vecs_df %>%
select_(group_name) %>%
slice(1) %>%
unlist()
df <- centroid %>%
mutate(x = this_group_name) %>%
select(x, everything())
names(df)[1] = c(quo_name(group_name))
df
}
tsne_dims_gender <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(gender)) %>%
mutate(gender = as.character(gender)) %>%
split(.$gender) %>%
map_df(get_group_centroid_with_CI, "gender")
ggplot(tsne_dims_gender, aes(x = tsne_X_center, y = tsne_Y_center, fill = gender, color = gender)) +
geom_point() +
geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper)) +
geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
theme_bw()
tsne_dims_age<- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(age)) %>%
mutate(age_bin = as.factor(cut(age, 8))) %>%
split(.$age_bin) %>%
map_df(get_group_centroid_with_CI, "age_bin")
ggplot(tsne_dims_age, aes(x = tsne_X_center, y = tsne_Y_center, fill = age_bin, color = age_bin)) +
geom_point() +
geom_pointrange(aes(ymin = tsne_Y_ci_lower,ymax = tsne_Y_ci_upper)) +
geom_errorbarh(aes(xmin = tsne_X_ci_lower,xmax = tsne_X_ci_upper, height = 0)) +
theme_bw()
Gender by Language
tsne_dims_gender <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(gender)) %>%
mutate(L1_gender = paste(L1_code, gender, sep = "_")) %>%
split(.$L1_gender) %>%
map_df(get_group_centroid_with_CI, "L1_gender") %>%
separate(L1_gender, c("L1_code", "gender"), sep = "_")
ggplot(tsne_dims_gender, aes(x = tsne_X_center, y = tsne_Y_center, fill = gender, color = gender)) +
geom_point() +
geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper)) +
geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
facet_wrap( ~L1_code) +
theme_bw()
Gender by prompt
tsne_dims_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(gender)) %>%
mutate(prompt_gender = paste(prompt_id, gender, sep = "_")) %>%
split(.$prompt_gender) %>%
map_df(get_group_centroid_with_CI, "prompt_gender") %>%
separate(prompt_gender, c("prompt_id", "gender"), sep = "_")
ggplot(tsne_dims_prompt, aes(x = tsne_X_center, y = tsne_Y_center, fill = gender, color = gender)) +
geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper), size = .5) +
geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
facet_wrap(~prompt_id) +
theme_bw()
All the differences in gender are accounted for by language.
Age by Language
tsne_dims_age <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(age)) %>%
group_by(L1_code) %>%
mutate(age_bin = ntile(age, 2)) %>%
ungroup() %>%
mutate(L1_age = paste(L1_code, age_bin, sep = "_")) %>%
split(.$L1_age) %>%
map_df(get_group_centroid_with_CI, "L1_age") %>%
separate(L1_age, c("L1_code", "age_bin"), sep = "_")
ggplot(tsne_dims_age, aes(x = tsne_X_center, y = tsne_Y_center, fill = age_bin, color = age_bin)) +
geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper), size = .2) +
geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
facet_wrap(~L1_code, scales = "free") +
theme_bw()
Age by Prompt
tsne_dims_prompt_age <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
filter(!is.na(age)) %>%
group_by(L1_code) %>%
mutate(age_bin = ntile(age, 2)) %>%
ungroup() %>%
mutate(prompt_age = paste(prompt_id, age_bin, sep = "_")) %>%
split(.$prompt_age) %>%
map_df(get_group_centroid_with_CI, "prompt_age") %>%
separate(prompt_age, c("prompt_id", "age_bin"), sep = "_")
ggplot(tsne_dims_prompt_age, aes(x = tsne_X_center, y = tsne_Y_center, fill = age_bin, color = age_bin)) +
geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper), size = .2) +
geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
facet_wrap(~prompt_id, scales = "free") +
theme_bw()
** patterns are variable depending on tsne