Read in doctag indices, docvecs, and metadata
doctag_indices <- read_feather("../all_data/doctag_indices.feather")
docvecs <- read_feather("../all_data/docvecs.feather") %>%
as.data.frame() %>%
mutate(offset = 0:(n()-1)) %>%
select(offset, everything())
#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv")
Metadata
metadata_clean <- metadata %>%
mutate_if(is.character, as.factor) %>%
mutate(essay_id = as.character(essay_id))
ggplot(metadata_clean, aes(x = age)) +
geom_histogram() +
ggtitle("Age distribution") +
theme_minimal()

ggplot(metadata_clean, aes(x = score)) +
geom_histogram(bins = 10) +
ggtitle("Score distribution") +
theme_minimal()

kable(count(metadata_clean, gender), caption = "gender distribution")
gender distribution
| F |
4573 |
| M |
6015 |
| NA |
1512 |
kable(count(metadata_clean, L1_code), caption = "L1 distribution")
L1 distribution
| ARA |
1100 |
| CHI |
1100 |
| FRE |
1100 |
| GER |
1100 |
| HIN |
1100 |
| ITA |
1100 |
| JPN |
1100 |
| KOR |
1100 |
| SPA |
1100 |
| TEL |
1100 |
| TUR |
1100 |
kable(count(metadata_clean, prompt_id), caption = "Prompt distribution")
Prompt distribution
| VC079857 |
1656 |
| VC139555 |
1562 |
| VC182427 |
1396 |
| VC199494 |
1509 |
| VC199744 |
1648 |
| VC212639 |
960 |
| VC247638 |
1686 |
| VC251653 |
1683 |
Prompts
prompts = data.frame()
prompt_files <- list.files("../all_data/prompts/")
for (i in 1:length(prompt_files)){
fileName <- paste0("../all_data/prompts/", prompt_files[i])
prompt <- readChar(fileName, file.info(fileName)$size)
prompts <- bind_rows(prompts,
data.frame(prompt_id = prompt_files[i],
prompt = prompt))
}
prompts_clean <- prompts %>%
rowwise() %>%
mutate(prompt_id = str_replace_all(prompt_id, ".txt", ""),
prompt = str_trim(str_replace_all(prompt, "\n", " ")),
prompt = str_replace_all(prompt, "Use reasons and examples to support your answer.", ""),
prompt = str_replace_all(prompt, "Use specific reasons and examples to support your answer.", ""),
prompt = str_replace_all(prompt, "Do you agree or disagree with the following statement\\?", ""))
kable(prompts_clean)
| VC079857 |
It is better to have broad knowledge of many academic subjects than to specialize in one specific subject. |
| VC139555 |
Young people enjoy life more than older people do. |
| VC182427 |
Young people nowadays do not give enough time to helping their communities. |
| VC199494 |
Most advertisements make products seem much better than they really are. |
| VC199744 |
In twenty years, there will be fewer cars in use than there are today. |
| VC212639 |
The best way to travel is in a group led by a tour guide. |
| VC247638 |
It is more important for students to understand ideas and concepts than it is for them to learn facts. |
| VC251653 |
Successful people try new things and take risks rather than only doing what they already know how to do well. |
Merge all data sources together
d <- doctag_indices %>%
left_join(metadata_clean) %>%
left_join(docvecs) %>%
nest(-1:-10, .key = "doc_vector") %>%
select(-doc_count, -offset, -test_center_country_code) %>%
mutate(prompt_id = fct_recode(prompt_id,
broad_academics = "VC079857",
young_enjoy = "VC139555",
young_help = "VC182427",
advertisements = "VC199494",
fewer_cars = "VC199744",
tour_travel = "VC212639",
students_facts = "VC247638",
success_risks = "VC251653"))
High-d space
get_group_centroid <- function(doc_vecs_df, group_name){
centroid <- doc_vecs_df %>%
select(doc_vector) %>%
unnest(doc_vector) %>%
colMeans()
this_group_name <- doc_vecs_df %>%
select_(group_name) %>%
slice(1) %>%
unlist()
df <- data.frame(t(centroid)) %>%
mutate(x = this_group_name) %>%
select(x, everything())
names(df)[1] = c(as.name(group_name))
df
}
get_group_diameter <- function(doc_vecs_df, group_name){
these_vecs <- doc_vecs_df %>%
select(doc_vector) %>%
unnest(doc_vector) %>%
as.matrix()
dist_sims <- dist2(these_vecs, method = "euclidean", norm = "none") # get pairwise distances
diag(dist_sims) <- NA # get pairwise distances
max_dist = max(as.data.frame(dist_sims), na.rm = T) # get max distances between points
this_group_name <- doc_vecs_df %>%
select_(group_name) %>%
slice(1) %>%
unlist()
df <- data.frame(x = this_group_name,
max_dist = max_dist)
names(df)[1] = c(as.name(group_name))
df
}
By language (pooling prompts)
Centroid distance
centroids <- d %>%
split(.$L1_code) %>%
map_df(get_group_centroid, "L1_code")
centroid_mat = as.matrix(centroids[,-1])
rownames(centroid_mat) = centroids[,1]
ggdendro::ggdendrogram(hclust(dist(centroid_mat)), size = 2) +
ggtitle("High-Dimensional Pairwise Centroid Distance, pooling across prompts")

#plot(hclust(dist(centroid_mat)))
Diameters
diameters <- d %>%
split(.$L1_code) %>%
map_df(get_group_diameter, "L1_code")
ggplot(diameters, aes(x = reorder(L1_code, max_dist),
y = max_dist,
fill = L1_code)) +
geom_bar(stat = "identity") +
xlab("L1") +
ggtitle("Diameter") +
theme_minimal() +
theme(legend.position = "none")

#plot(ape::as.phylo(hclust(dist(centroid_mat))), type = "unrooted")
By language and prompt (averaging across prompts)
Centroid distance
centroids_prompt <- d %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
prompt_mean_centroids <- centroids_prompt %>%
split(.$L1_code) %>%
map_df(~colMeans(.[,-1:-2])) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("L1_code")
prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]
ggdendro::ggdendrogram(hclust(dist(prompt_mean_centroids_mat))) +
ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

Diameters
diameters_prompt <- d %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_diameter, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
prompt_mean_diameter<- diameters_prompt %>%
group_by(L1_code) %>%
summarize(mean = mean(max_dist))
#multi_boot_standard(column = "max_dist")
ggplot(prompt_mean_diameter, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
#geom_linerange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
ylab("diameter") +
xlab("L1") +
ggtitle("Diameter, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none")

2D Space
Visualize Distributions
mats <- d %>%
select(doc_vector) %>%
unnest() %>%
as.matrix()
tsne_out = Rtsne::Rtsne(mats)
tsne_dims <- tsne_out$Y %>%
as.data.frame() %>%
rename(tsne_X = V1,
tsne_Y = V2) %>%
bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
select(everything(), tsne_X, tsne_Y)
ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
geom_point(size = .2) +
theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = prompt_id, fill = prompt_id)) +
#geom_encircle(alpha = .2) +
#geom_point(size = .2) +
geom_density2d(alpha=.5) +
theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = prompt_id, fill = prompt_id)) +
facet_grid(prompt_id ~L1_code ) +
geom_encircle(alpha = .2) +
theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code, fill = L1_code)) +
facet_wrap(~prompt_id) +
geom_encircle(alpha = .2) +
theme_minimal()

By language (pooling prompts)
Centroid Distances
tsne_centroids <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
split(.$L1_code) %>%
map_df(get_group_centroid, "L1_code")
#ggplot(tsne_centroids, aes(x = tsne_X, y = tsne_Y, fill = L1_code)) +
# geom_label(aes(label = L1_code)) +
# theme_minimal() +
# theme(legend.position = "none")
ggplot(tsne_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 4, color = 'black') +
ggrepel::geom_label_repel(
aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
fontface = 'bold', color = 'black',
box.padding = unit(0.35, "lines"),
point.padding = unit(0.2, "lines"),
segment.color = 'black') +
ggtitle("Centroids in 2D") +
theme_minimal() +
theme(legend.position = "none")

Diameters
tsne_diameters <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
split(.$L1_code) %>%
map_df(get_group_diameter, "L1_code")
ggplot(tsne_diameters, aes(x = reorder(L1_code, max_dist),
y = max_dist,
fill = L1_code)) +
geom_bar(stat = "identity") +
xlab("L1") +
ggtitle("Diameter") +
theme_minimal() +
theme(legend.position = "none")

By language and prompt (averaging across prompts)
Centroid distances
tsne_centroids_by_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_centroid, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
split(.$L1_code) %>%
map_df(~colMeans(.[,-1:-2])) %>%
t() %>%
as.data.frame() %>%
rownames_to_column("L1_code") %>%
rename(tsne_X = V1,
tsne_Y = V2)
ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
geom_point(aes(color = L1_code)) +
facet_wrap( ~prompt_id, scales = "free") +
theme_minimal()

ggplot(tsne_prompt_mean_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 4, color = 'black') +
ggrepel::geom_label_repel(
aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
fontface = 'bold', color = 'black',
box.padding = unit(0.35, "lines"),
point.padding = unit(0.2, "lines"),
segment.color = 'black') +
ggtitle("Centroids in 2D, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none")

Diameters
tsne_diameters_prompt <- tsne_dims %>%
nest(1:2, .key = "doc_vector") %>%
mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
split(.$L1_prompt) %>%
map_df(get_group_diameter, "L1_prompt") %>%
separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")
tsne_prompt_mean_diameter<- tsne_diameters_prompt %>%
group_by(L1_code) %>%
summarize(mean = mean(max_dist))
#multi_boot_standard(column = "max_dist")
ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean),
y = mean,
fill = L1_code)) +
geom_bar(stat = "identity") +
#geom_linerange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
xlab("L1") +
ggtitle("Diameter, averaging across prompts") +
theme_minimal() +
theme(legend.position = "none")

Area overlap in 2D
pooling across prompts
get_2d_overlap <- function(group_1, group_2, df){
df1 <- filter(df, L1_code == group_1)
df2 <- filter(df, L1_code == group_2)
siar_obj = siar::overlap(df1$tsne_X,df1$tsne_Y, df2$tsne_X, df2$tsne_Y)
overlap = siar_obj$overlap
p_overlap = overlap/mean(siar_obj$area1, siar_obj$area2) # overlaps normalized by mean area
data.frame(group_1 = group_1,
group_2 = group_2,
overlap = overlap,
p_overlap = p_overlap)
}
all_lang_combos <- expand.grid(group_1 = as.character(unique(tsne_dims$L1_code)),
group_2 = as.character(unique(tsne_dims$L1_code)))
pairwise_tsne_overlap <- map2_df(all_lang_combos$group_1,
all_lang_combos$group_2,
get_2d_overlap,
tsne_dims)
pairwise_tsne_overlap_dist <- pairwise_tsne_overlap %>%
mutate(dist_p_overlap = 1 - p_overlap) %>%
select(group_1, group_2, dist_p_overlap) %>%
spread(group_2,dist_p_overlap) %>%
remove_rownames() %>%
column_to_rownames("group_1") %>%
as.matrix()
prompt_mean_centroids_mat <- as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]
ggdendro::ggdendrogram(hclust(dist(pairwise_tsne_overlap_dist))) +
ggtitle("Proportion overlap of ellipses, pooling across prompts")

averaging across prompts
get_2d_overlap_prompt <- function(group_1, group_2, prompt, df){
df1 <- filter(df, L1_code == group_1, prompt_id == prompt)
df2 <- filter(df, L1_code == group_2, prompt_id == prompt)
siar_obj = siar::overlap(df1$tsne_X,df1$tsne_Y, df2$tsne_X, df2$tsne_Y)
overlap = siar_obj$overlap
p_overlap = overlap/mean(siar_obj$area1, siar_obj$area2) # overlaps normalized by mean area
data.frame(group_1 = group_1,
group_2 = group_2,
prompt = prompt,
overlap = overlap,
p_overlap = p_overlap)
}
all_prompts <- rep(unique(tsne_dims$prompt_id), each = nrow(all_lang_combos))
all_lang_combos2 <- do.call("rbind", replicate(length(unique(tsne_dims$prompt_id)),
all_lang_combos, simplify = FALSE)) %>%
bind_cols(prompt = all_prompts) %>%
mutate_all(funs(as.character))
pairwise_tsne_overlap_prompt <- pmap_df(all_lang_combos2,
get_2d_overlap_prompt,
tsne_dims)
pairwise_tsne_overlap_prompt_mat <- pairwise_tsne_overlap_prompt %>%
group_by(group_1, group_2) %>%
summarize(p_overlap = mean(p_overlap)) %>%
mutate(dist_p_overlap = 1 - p_overlap) %>%
select(group_1, group_2, dist_p_overlap) %>%
spread(group_2,dist_p_overlap) %>%
remove_rownames() %>%
column_to_rownames("group_1") %>%
as.matrix()
pairwise_tsne_overlap_prompt_mat <- as.matrix(pairwise_tsne_overlap_prompt_mat)
ggdendro::ggdendrogram(hclust(dist(pairwise_tsne_overlap_prompt_mat))) +
ggtitle("Proportion overlap of ellipses, averaging across prompts")

Prop. overlap language matrix
diag(pairwise_tsne_overlap_prompt_mat) <- ""
pairwise_tsne_overlap_prompt_mat[upper.tri(pairwise_tsne_overlap_prompt_mat)] <- ""
kable(pairwise_tsne_overlap_prompt_mat)
| ARA |
|
|
|
|
|
|
|
|
|
|
|
| CHI |
0.247233837870026 |
|
|
|
|
|
|
|
|
|
|
| FRE |
0.17785820751051 |
0.27269483865517 |
|
|
|
|
|
|
|
|
|
| GER |
0.19261301405023 |
0.305131384258742 |
0.184189439645891 |
|
|
|
|
|
|
|
|
| HIN |
0.678398161107181 |
0.688971399134594 |
0.654901764079375 |
0.65904866225239 |
|
|
|
|
|
|
|
| ITA |
0.0984127614333606 |
0.245789748926822 |
0.224353716968205 |
0.416436601355935 |
0.749772502363741 |
|
|
|
|
|
|
| JPN |
0.324830619217136 |
0.293185999947818 |
0.495341862084987 |
0.623535937718317 |
0.771661993550938 |
0.482842510859474 |
|
|
|
|
|
| KOR |
0.331923593561158 |
0.262751623404783 |
0.476213924165527 |
0.606124049174115 |
0.788831165987201 |
0.511814612841325 |
0.238487156149559 |
|
|
|
|
| SPA |
0.100746291739933 |
0.267401745495666 |
0.24146112922371 |
0.369292297402464 |
0.729251179619222 |
0.327300714079131 |
0.395852142654985 |
0.354162721651465 |
|
|
|
| TEL |
0.641150469122112 |
0.660124196884375 |
0.642619787547842 |
0.62856410984248 |
0.282475673128749 |
0.803794470438355 |
0.74160105479356 |
0.67200881345187 |
0.699333507548088 |
|
|
| TUR |
0.241883027472553 |
0.324921790304212 |
0.315397794477124 |
0.385279892055217 |
0.659447705171416 |
0.478343128044118 |
0.405953568396441 |
0.370423528949248 |
0.33022417608558 |
0.671758335611655 |
|
Prompts sorted by overlap
pairwise_tsne_overlap_prompt %>%
rename(prompt_id = prompt) %>%
group_by(prompt_id) %>%
summarize(mean = mean(p_overlap)) %>%
arrange(mean) %>%
left_join(prompts_clean) %>%
kable()
| VC079857 |
0.5167640 |
It is better to have broad knowledge of many academic subjects than to specialize in one specific subject. |
| VC139555 |
0.5376194 |
Young people enjoy life more than older people do. |
| VC182427 |
0.5438269 |
Young people nowadays do not give enough time to helping their communities. |
| VC199744 |
0.5493628 |
In twenty years, there will be fewer cars in use than there are today. |
| VC212639 |
0.5780431 |
The best way to travel is in a group led by a tour guide. |
| VC247638 |
0.6274587 |
It is more important for students to understand ideas and concepts than it is for them to learn facts. |
| VC251653 |
0.6696384 |
Successful people try new things and take risks rather than only doing what they already know how to do well. |
| VC199494 |
0.6931459 |
Most advertisements make products seem much better than they really are. |