Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv")

Metadata

metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 

ggplot(metadata_clean, aes(x = age)) +
  geom_histogram() +
  ggtitle("Age distribution") +
  theme_minimal()

ggplot(metadata_clean, aes(x = score)) +
  geom_histogram(bins = 10) +
  ggtitle("Score distribution") +
  theme_minimal()

kable(count(metadata_clean, gender), caption = "gender distribution")

gender distribution
gender	n
F	4573
M	6015
NA	1512

kable(count(metadata_clean, L1_code), caption = "L1 distribution")

L1 distribution
L1_code	n
ARA	1100
CHI	1100
FRE	1100
GER	1100
HIN	1100
ITA	1100
JPN	1100
KOR	1100
SPA	1100
TEL	1100
TUR	1100

kable(count(metadata_clean, prompt_id), caption = "Prompt distribution")

Prompt distribution
prompt_id	n
VC079857	1656
VC139555	1562
VC182427	1396
VC199494	1509
VC199744	1648
VC212639	960
VC247638	1686
VC251653	1683

Prompts

prompts = data.frame()
prompt_files <- list.files("../all_data/prompts/")
 for (i in 1:length(prompt_files)){
   fileName <- paste0("../all_data/prompts/", prompt_files[i])
   prompt <- readChar(fileName, file.info(fileName)$size)
   prompts <- bind_rows(prompts, 
                        data.frame(prompt_id = prompt_files[i], 
                                   prompt = prompt))
 }
prompts_clean <- prompts %>%
  rowwise() %>%
  mutate(prompt_id = str_replace_all(prompt_id, ".txt", ""),
         prompt = str_trim(str_replace_all(prompt, "\n", " ")),
         prompt = str_replace_all(prompt, "Use reasons and examples to support your answer.", ""),
         prompt = str_replace_all(prompt, "Use specific reasons and examples to support your answer.", ""),
         prompt = str_replace_all(prompt, "Do you agree or disagree with the following statement\\?", ""))
kable(prompts_clean)

prompt_id	prompt
VC079857	It is better to have broad knowledge of many academic subjects than to specialize in one specific subject.
VC139555	Young people enjoy life more than older people do.
VC182427	Young people nowadays do not give enough time to helping their communities.
VC199494	Most advertisements make products seem much better than they really are.
VC199744	In twenty years, there will be fewer cars in use than there are today.
VC212639	The best way to travel is in a group led by a tour guide.
VC247638	It is more important for students to understand ideas and concepts than it is for them to learn facts.
VC251653	Successful people try new things and take risks rather than only doing what they already know how to do well.

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad_academics = "VC079857", 
                                young_enjoy = "VC139555", 
                                young_help = "VC182427",
                                advertisements = "VC199494",
                                fewer_cars = "VC199744",
                                tour_travel = "VC212639",
                                students_facts = "VC247638",
                                success_risks = "VC251653"))

High-d space

get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}
get_group_diameter <- function(doc_vecs_df, group_name){
  
  these_vecs <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    as.matrix()
  
  dist_sims <- dist2(these_vecs, method = "euclidean", norm = "none")  # get pairwise distances
  diag(dist_sims) <- NA # get pairwise distances 
  max_dist = max(as.data.frame(dist_sims), na.rm = T) # get max distances between points
  
  this_group_name <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(x = this_group_name,
             max_dist = max_dist)
  names(df)[1] = c(as.name(group_name))
  df
}

By language (pooling prompts)

Centroid distance

centroids <- d %>%
  split(.$L1_code) %>% 
  map_df(get_group_centroid, "L1_code")

centroid_mat = as.matrix(centroids[,-1])
rownames(centroid_mat) = centroids[,1]

ggdendro::ggdendrogram(hclust(dist(centroid_mat)), size = 2) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, pooling across prompts")

#plot(hclust(dist(centroid_mat)))

Diameters

diameters <- d %>%
  split(.$L1_code) %>% 
  map_df(get_group_diameter, "L1_code") 

ggplot(diameters, aes(x = reorder(L1_code, max_dist), 
                      y = max_dist, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  xlab("L1") +
  ggtitle("Diameter") +
  theme_minimal() +
  theme(legend.position = "none")

#plot(ape::as.phylo(hclust(dist(centroid_mat))), type = "unrooted")

By language and prompt (averaging across prompts)

Centroid distance

centroids_prompt <- d %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_centroids <- centroids_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code")

prompt_mean_centroids_mat = as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

ggdendro::ggdendrogram(hclust(dist(prompt_mean_centroids_mat))) +
  ggtitle("High-Dimensional Pairwise Centroid Distance, averaging across prompts")

Diameters

diameters_prompt <- d %>%
    mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

prompt_mean_diameter<- diameters_prompt %>%
  group_by(L1_code)  %>%
  summarize(mean = mean(max_dist))
  #multi_boot_standard(column = "max_dist")

ggplot(prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  #geom_linerange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
  ylab("diameter") +
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")

2D Space

Visualize Distributions

mats <- d %>%
  select(doc_vector) %>%
  unnest() %>%
  as.matrix()

tsne_out = Rtsne::Rtsne(mats)
tsne_dims <- tsne_out$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2) %>%
  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
  select(everything(), tsne_X, tsne_Y)

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code)) +
  geom_point(size = .2) +
  theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = prompt_id, fill = prompt_id)) +
  #geom_encircle(alpha = .2) +
  #geom_point(size = .2) +
  geom_density2d(alpha=.5) + 
  theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = prompt_id, fill = prompt_id)) +
  facet_grid(prompt_id ~L1_code ) +
  geom_encircle(alpha = .2) +
  theme_minimal()

ggplot(tsne_dims, aes(x = tsne_X, y = tsne_Y, color = L1_code, fill = L1_code)) +
  facet_wrap(~prompt_id) +
  geom_encircle(alpha = .2) +
  theme_minimal()

By language (pooling prompts)

Centroid Distances

tsne_centroids <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$L1_code) %>% 
  map_df(get_group_centroid, "L1_code")

#ggplot(tsne_centroids, aes(x = tsne_X, y = tsne_Y, fill = L1_code)) +
#  geom_label(aes(label = L1_code)) +
#  theme_minimal() +
#  theme(legend.position = "none") 

ggplot(tsne_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 4, color = 'black') +
  ggrepel::geom_label_repel(
    aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
    fontface = 'bold', color = 'black',
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.2, "lines"),
    segment.color = 'black') +
  ggtitle("Centroids in 2D") +
  theme_minimal() +
  theme(legend.position = "none")

Diameters

tsne_diameters <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  split(.$L1_code) %>% 
  map_df(get_group_diameter, "L1_code")  

ggplot(tsne_diameters, aes(x = reorder(L1_code, max_dist), 
                      y = max_dist, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  xlab("L1") +
  ggtitle("Diameter") +
  theme_minimal() +
  theme(legend.position = "none")

By language and prompt (averaging across prompts)

Centroid distances

tsne_centroids_by_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_") 

tsne_prompt_mean_centroids <- tsne_centroids_by_prompt %>%
  split(.$L1_code) %>% 
  map_df(~colMeans(.[,-1:-2])) %>%
  t() %>% 
  as.data.frame() %>%
  rownames_to_column("L1_code") %>%
  rename(tsne_X = V1,
         tsne_Y = V2)

ggplot(tsne_centroids_by_prompt, aes(x = tsne_X, y = tsne_Y)) +
 geom_point(aes(color = L1_code)) +
  facet_wrap( ~prompt_id, scales = "free") +
  theme_minimal()

ggplot(tsne_prompt_mean_centroids) +
geom_point(aes(x = tsne_X, y = tsne_Y), size = 4, color = 'black') +
  ggrepel::geom_label_repel(
    aes(tsne_X, y = tsne_Y, fill = L1_code, label = L1_code),
    fontface = 'bold', color = 'black',
    box.padding = unit(0.35, "lines"),
    point.padding = unit(0.2, "lines"),
    segment.color = 'black') +
  ggtitle("Centroids in 2D, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")

Diameters

tsne_diameters_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  mutate(L1_prompt = paste0(L1_code, "_", prompt_id )) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_diameter, "L1_prompt")  %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

tsne_prompt_mean_diameter<- tsne_diameters_prompt %>%
  group_by(L1_code) %>%
  summarize(mean = mean(max_dist))
  #multi_boot_standard(column = "max_dist")

ggplot(tsne_prompt_mean_diameter, aes(x = reorder(L1_code, mean), 
                      y = mean, 
                      fill = L1_code)) +
  geom_bar(stat = "identity") +
  #geom_linerange(aes(ymin = summary_ci_lower, ymax = summary_ci_upper)) +
  
  xlab("L1") +
  ggtitle("Diameter, averaging across prompts") +
  theme_minimal() +
  theme(legend.position = "none")

Area overlap in 2D

pooling across prompts

get_2d_overlap <- function(group_1, group_2, df){
  df1 <- filter(df, L1_code == group_1)
  df2 <- filter(df, L1_code == group_2)

  siar_obj = siar::overlap(df1$tsne_X,df1$tsne_Y, df2$tsne_X, df2$tsne_Y)
  overlap = siar_obj$overlap
  p_overlap = overlap/mean(siar_obj$area1, siar_obj$area2) # overlaps normalized by mean area
  
  data.frame(group_1  = group_1,
             group_2 = group_2,
             overlap = overlap, 
             p_overlap = p_overlap)
}

all_lang_combos <- expand.grid(group_1 = as.character(unique(tsne_dims$L1_code)), 
                               group_2 = as.character(unique(tsne_dims$L1_code)))

pairwise_tsne_overlap <- map2_df(all_lang_combos$group_1, 
                                 all_lang_combos$group_2,
                                 get_2d_overlap, 
                                 tsne_dims)

pairwise_tsne_overlap_dist <- pairwise_tsne_overlap %>%
  mutate(dist_p_overlap = 1 - p_overlap) %>%
  select(group_1, group_2, dist_p_overlap) %>%
  spread(group_2,dist_p_overlap) %>%
  remove_rownames() %>%
  column_to_rownames("group_1") %>%
  as.matrix()

prompt_mean_centroids_mat <- as.matrix(prompt_mean_centroids[,-1])
rownames(prompt_mean_centroids_mat) = prompt_mean_centroids[,1]

ggdendro::ggdendrogram(hclust(dist(pairwise_tsne_overlap_dist))) +
  ggtitle("Proportion overlap of ellipses, pooling across prompts")

averaging across prompts

get_2d_overlap_prompt <- function(group_1, group_2, prompt, df){
   df1 <- filter(df, L1_code == group_1, prompt_id == prompt)
  df2 <- filter(df, L1_code == group_2, prompt_id == prompt)

  siar_obj = siar::overlap(df1$tsne_X,df1$tsne_Y, df2$tsne_X, df2$tsne_Y)
  overlap = siar_obj$overlap
  p_overlap = overlap/mean(siar_obj$area1, siar_obj$area2) # overlaps normalized by mean area
  
  data.frame(group_1  = group_1,
             group_2 = group_2,
             prompt = prompt,
             overlap = overlap, 
             p_overlap = p_overlap)
}

all_prompts <- rep(unique(tsne_dims$prompt_id), each = nrow(all_lang_combos))

all_lang_combos2 <- do.call("rbind", replicate(length(unique(tsne_dims$prompt_id)), 
                                               all_lang_combos, simplify = FALSE)) %>%
                    bind_cols(prompt = all_prompts) %>%
  mutate_all(funs(as.character))


pairwise_tsne_overlap_prompt <- pmap_df(all_lang_combos2,
                                 get_2d_overlap_prompt, 
                                 tsne_dims)

pairwise_tsne_overlap_prompt_mat <- pairwise_tsne_overlap_prompt %>%
  group_by(group_1, group_2) %>%
  summarize(p_overlap = mean(p_overlap)) %>%
  mutate(dist_p_overlap = 1 - p_overlap) %>%
  select(group_1, group_2, dist_p_overlap) %>%
  spread(group_2,dist_p_overlap) %>%
  remove_rownames() %>%
  column_to_rownames("group_1") %>%
  as.matrix()

pairwise_tsne_overlap_prompt_mat <- as.matrix(pairwise_tsne_overlap_prompt_mat)

ggdendro::ggdendrogram(hclust(dist(pairwise_tsne_overlap_prompt_mat))) +
  ggtitle("Proportion overlap of ellipses, averaging across prompts")

Prop. overlap language matrix

diag(pairwise_tsne_overlap_prompt_mat) <- ""
pairwise_tsne_overlap_prompt_mat[upper.tri(pairwise_tsne_overlap_prompt_mat)] <- ""
kable(pairwise_tsne_overlap_prompt_mat)

	ARA	CHI	FRE	GER	HIN	ITA	JPN	KOR	SPA	TEL
ARA
CHI	0.247233837870026
FRE	0.17785820751051	0.27269483865517
GER	0.19261301405023	0.305131384258742	0.184189439645891
HIN	0.678398161107181	0.688971399134594	0.654901764079375	0.65904866225239
ITA	0.0984127614333606	0.245789748926822	0.224353716968205	0.416436601355935	0.749772502363741
JPN	0.324830619217136	0.293185999947818	0.495341862084987	0.623535937718317	0.771661993550938	0.482842510859474
KOR	0.331923593561158	0.262751623404783	0.476213924165527	0.606124049174115	0.788831165987201	0.511814612841325	0.238487156149559
SPA	0.100746291739933	0.267401745495666	0.24146112922371	0.369292297402464	0.729251179619222	0.327300714079131	0.395852142654985	0.354162721651465
TEL	0.641150469122112	0.660124196884375	0.642619787547842	0.62856410984248	0.282475673128749	0.803794470438355	0.74160105479356	0.67200881345187	0.699333507548088
TUR	0.241883027472553	0.324921790304212	0.315397794477124	0.385279892055217	0.659447705171416	0.478343128044118	0.405953568396441	0.370423528949248	0.33022417608558	0.671758335611655

Prompts sorted by overlap

pairwise_tsne_overlap_prompt %>%
  rename(prompt_id = prompt) %>%
  group_by(prompt_id) %>%
  summarize(mean = mean(p_overlap)) %>%
  arrange(mean) %>%
  left_join(prompts_clean) %>%
  kable()

prompt_id	mean	prompt
VC079857	0.5167640	It is better to have broad knowledge of many academic subjects than to specialize in one specific subject.
VC139555	0.5376194	Young people enjoy life more than older people do.
VC182427	0.5438269	Young people nowadays do not give enough time to helping their communities.
VC199744	0.5493628	In twenty years, there will be fewer cars in use than there are today.
VC212639	0.5780431	The best way to travel is in a group led by a tour guide.
VC247638	0.6274587	It is more important for students to understand ideas and concepts than it is for them to learn facts.
VC251653	0.6696384	Successful people try new things and take risks rather than only doing what they already know how to do well.
VC199494	0.6931459	Most advertisements make products seem much better than they really are.

L2ETS Study 1 analyses

Molly Lewis

2017-09-28

Metadata

High-d space

By language (pooling prompts)

Centroid distance

Diameters

By language and prompt (averaging across prompts)

Centroid distance

Diameters

2D Space

Visualize Distributions

By language (pooling prompts)

Centroid Distances

Diameters

By language and prompt (averaging across prompts)

Centroid distances

Diameters

Area overlap in 2D

pooling across prompts

averaging across prompts

Prop. overlap language matrix

Prompts sorted by overlap