Read in doctag indices, docvecs, and metadata

doctag_indices <- read_feather("../all_data/doctag_indices.feather") 
docvecs <- read_feather("../all_data/docvecs.feather") %>%
  as.data.frame() %>%
  mutate(offset = 0:(n()-1)) %>%
  select(offset, everything())

#write_tsv(docvecs, "../all_data/docvecs.txt")
metadata <- read_csv("../all_data/merged_metadata.csv") 
metadata_clean <- metadata %>%
  mutate_if(is.character, as.factor)  %>%
  mutate(essay_id = as.character(essay_id)) 

Merge all data sources together

d <- doctag_indices %>%
  left_join(metadata_clean) %>%
  left_join(docvecs) %>%
  nest(-1:-10, .key = "doc_vector") %>%
  select(-doc_count, -offset, -test_center_country_code) %>%
  mutate(prompt_id = fct_recode(prompt_id, 
                                broad.academics = "VC079857", 
                                young.enjoy = "VC139555", 
                                young.help = "VC182427",
                                advertisements = "VC199494",
                                fewer.cars = "VC199744",
                                tour.travel = "VC212639",
                                students.facts = "VC247638",
                                success.risks = "VC251653"))

Scores by language

d %>%
  group_by(L1_code) %>%
  multi_boot_standard(col = "score", na.rm = T) %>%
  arrange(mean) %>% 
  kable()
L1_code ci_lower ci_upper mean
ARA 2.812475 2.927500 2.8695
JPN 2.908475 3.016025 2.9615
KOR 3.063975 3.166513 3.1165
ITA 3.154488 3.266012 3.2090
CHI 3.213963 3.307513 3.2590
TEL 3.321487 3.422525 3.3720
TUR 3.406500 3.511038 3.4590
FRE 3.502000 3.606012 3.5535
SPA 3.514500 3.621500 3.5670
HIN 3.831975 3.930013 3.8815
GER 3.872975 3.957012 3.9115

Visualize Distributions as a function of score (2D space)

Get TSNE coorinates

mats <- d %>%
  select(doc_vector) %>%
  unnest() %>%
  as.matrix()

tsne_out = Rtsne::Rtsne(mats)
tsne_dims <- tsne_out$Y %>%
  as.data.frame() %>%
  rename(tsne_X = V1,
         tsne_Y = V2) %>%
  bind_cols(d %>% select(essay_id, score, age, gender, L1_code, prompt_id)) %>%
  select(everything(), tsne_X, tsne_Y)

With centroids, by language and prompt

get_group_centroid <- function(doc_vecs_df, group_name){
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    colMeans() 

  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()

  df <- data.frame(t(centroid)) %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(as.name(group_name))
   df
}

tsne_centroids_L1_score_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  group_by(L1_code, prompt_id) %>%
  mutate(score_bin = as.factor(ntile(score, 2))) %>%
  ungroup() %>%
  mutate(L1_prompt_score = paste(L1_code, prompt_id, score_bin, sep = "_")) %>%
  split(.$L1_prompt_score) %>% 
  map_df(get_group_centroid, "L1_prompt_score") %>%
  separate(L1_prompt_score, c("L1_code", "prompt_id", "score_bin"), sep = "_")

tsne_centroids_L1_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  mutate(L1_prompt = paste(L1_code, prompt_id, sep = "_")) %>%
  split(.$L1_prompt) %>% 
  map_df(get_group_centroid, "L1_prompt") %>%
  separate(L1_prompt, c("L1_code", "prompt_id"), sep = "_")

tsne_prompt_score_language = tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  group_by(L1_code, prompt_id) %>%
  filter(!is.na(score)) %>%
  mutate(score_bin = as.factor(ntile(score, 2))) %>%
  unnest() 

ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin, color = score_bin)) +
  geom_encircle(alpha = .2) +
  facet_grid(L1_code ~ prompt_id, scales = "free") +
  #geom_point(data = tsne_centroids_L1_score_prompt, size = .5) +
  geom_point(data = mutate(tsne_centroids_L1_prompt, score_bin = NA), size = .8, color = "black") +
  theme_bw()

With centroids, by prompt

tsne_centroids_score_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  group_by(prompt_id) %>%
  mutate(score_bin = as.factor(ntile(score, 2)),
         prompt_score = paste(prompt_id, score_bin, sep = "_")) %>%
  ungroup() %>%
  split(.$prompt_score) %>% 
  map_df(get_group_centroid, "prompt_score") %>%
  separate(prompt_score, c( "prompt_id", "score_bin"), sep = "_")

tsne_centroids_prompt <- tsne_dims %>%
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(score)) %>%
  split(.$prompt_id) %>% 
  map_df(get_group_centroid, "prompt_id") 

ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin, color = score_bin)) +
  geom_point(size = .2)+
  geom_encircle(alpha = .2) +
  facet_wrap(~ prompt_id, scales = "free") +
  geom_point(data = tsne_centroids_score_prompt, color = "black") +
  theme_bw()

ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin, color = score_bin)) +
  geom_density2d(alpha=.5) + 
  facet_grid(score_bin ~ prompt_id, scales = "free") +
  geom_point(data = tsne_centroids_score_prompt, color = "black") +
  theme_bw()

ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, 
                                       fill = score_bin, color = score_bin)) +
  geom_point(data = tsne_centroids_score_prompt) +
  theme_bw()

Analyses controlling for score

The is the distance from the lanugage-prompt mean as a function of score and language.

tsne_dims_with_centers <- tsne_dims %>%
  left_join(tsne_centroids_L1_prompt %>% 
              rename(tsne_X_prompt_L1 = tsne_X,
         tsne_Y_prompt_L1 = tsne_Y), by = c("prompt_id", "L1_code")) %>%
  rowwise() %>%
  mutate(dist_from_prompt_L1_center = dist(rbind(c(tsne_X_prompt_L1,
                                                   tsne_Y_prompt_L1),
                                                 c(tsne_X, tsne_Y)), 
                                           method = "euclidean")[1])

model = lme4::lmer(dist_from_prompt_L1_center ~ L1_code + score +
                     (1|prompt_id), tsne_dims_with_centers)
summary(model)
## Linear mixed model fit by REML ['lmerMod']
## Formula: dist_from_prompt_L1_center ~ L1_code + score + (1 | prompt_id)
##    Data: tsne_dims_with_centers
## 
## REML criterion at convergence: 51550.8
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.7563 -0.5577 -0.1373  0.3430 16.1961 
## 
## Random effects:
##  Groups    Name        Variance Std.Dev.
##  prompt_id (Intercept) 0.2375   0.4873  
##  Residual              6.3183   2.5136  
## Number of obs: 11000, groups:  prompt_id, 8
## 
## Fixed effects:
##             Estimate Std. Error t value
## (Intercept)  4.41792    0.20695  21.348
## L1_codeCHI  -0.37241    0.11297  -3.296
## L1_codeFRE  -0.51936    0.11437  -4.541
## L1_codeGER  -0.45782    0.11674  -3.922
## L1_codeHIN  -0.76156    0.11651  -6.537
## L1_codeITA  -0.69088    0.11346  -6.089
## L1_codeJPN  -0.41875    0.11245  -3.724
## L1_codeKOR  -0.27082    0.11264  -2.404
## L1_codeSPA  -0.37921    0.11440  -3.315
## L1_codeTEL  -0.63923    0.11390  -5.612
## L1_codeTUR  -0.23637    0.11410  -2.072
## score       -0.19477    0.02879  -6.766
## 
## Correlation of Fixed Effects:
##            (Intr) L1_CHI L1_FRE L1_GER L1_HIN L1_ITA L1_JPN L1_KOR L1_SPA
## L1_codeCHI -0.231                                                        
## L1_codeFRE -0.197  0.506                                                 
## L1_codeGER -0.158  0.504  0.522                                          
## L1_codeHIN -0.161  0.504  0.521  0.535                                   
## L1_codeITA -0.233  0.502  0.507  0.507  0.505                            
## L1_codeJPN -0.262  0.500  0.495  0.487  0.488  0.497                     
## L1_codeKOR -0.246  0.503  0.501  0.496  0.497  0.500  0.500              
## L1_codeSPA -0.197  0.507  0.515  0.517  0.516  0.502  0.495  0.501       
## L1_codeTEL -0.217  0.503  0.510  0.514  0.516  0.506  0.496  0.500  0.505
## L1_codeTUR -0.207  0.505  0.515  0.518  0.517  0.508  0.496  0.501  0.512
## score      -0.399 -0.099 -0.174 -0.260 -0.252 -0.090 -0.023 -0.063 -0.175
##            L1_TEL L1_TUR
## L1_codeCHI              
## L1_codeFRE              
## L1_codeGER              
## L1_codeHIN              
## L1_codeITA              
## L1_codeJPN              
## L1_codeKOR              
## L1_codeSPA              
## L1_codeTEL              
## L1_codeTUR  0.510       
## score      -0.128 -0.152
ggplot(tsne_prompt_score_language, aes(x = tsne_X, y = tsne_Y, fill = score_bin)) +
  geom_point(size = .2, aes(color = score_bin)) +
  facet_grid(L1_code ~ prompt_id, scales = "free") +
  geom_point(data = mutate(tsne_centroids_L1_prompt, score_bin = NA), size = .8, color = "black") +
  theme_bw()

Languages further away have lower scores (contra what I said earlier); there’s still an effect of language, controlling for score.

Age and gender

Older people look different from younger and males look different from females

get_group_centroid_with_CI <- function(doc_vecs_df, group_name){
  
  centroid <- doc_vecs_df %>%
    select(doc_vector) %>%
    unnest(doc_vector) %>%
    gather("coordinate_name", "value") %>%
    group_by(coordinate_name) %>%
    multi_boot_standard(col = "value") %>%
    rename(center = mean) %>%
    gather(variable, value, -coordinate_name) %>%
    unite(temp, coordinate_name, variable) %>%
    spread(temp, value)

  this_group_name  <- doc_vecs_df %>%
    select_(group_name) %>%
    slice(1) %>%
    unlist()
  
  df <- centroid %>%
    mutate(x = this_group_name) %>%
    select(x, everything())
  
   names(df)[1] = c(quo_name(group_name))
   df
}

tsne_dims_gender <- tsne_dims %>% 
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(gender)) %>%
  mutate(gender = as.character(gender)) %>%
  split(.$gender) %>% 
  map_df(get_group_centroid_with_CI, "gender") 
  
ggplot(tsne_dims_gender, aes(x = tsne_X_center, y = tsne_Y_center, fill = gender, color = gender)) +
  geom_point() +
  geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper)) + 
  geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
  theme_bw()

tsne_dims_age<- tsne_dims %>% 
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(age)) %>%
  mutate(age_bin = as.factor(cut(age, 8))) %>%
  split(.$age_bin) %>% 
  map_df(get_group_centroid_with_CI, "age_bin") 

ggplot(tsne_dims_age, aes(x = tsne_X_center, y = tsne_Y_center, fill = age_bin, color = age_bin)) +
  geom_point() +
  geom_pointrange(aes(ymin = tsne_Y_ci_lower,ymax = tsne_Y_ci_upper)) + 
  geom_errorbarh(aes(xmin = tsne_X_ci_lower,xmax = tsne_X_ci_upper, height = 0)) +
  theme_bw()

Gender by Language

tsne_dims_gender <- tsne_dims %>% 
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(gender)) %>%
  mutate(L1_gender = paste(L1_code, gender, sep = "_")) %>%
  split(.$L1_gender) %>% 
  map_df(get_group_centroid_with_CI, "L1_gender") %>%
  separate(L1_gender, c("L1_code", "gender"), sep = "_")

  
ggplot(tsne_dims_gender, aes(x = tsne_X_center, y = tsne_Y_center, fill = gender, color = gender)) +
  geom_point() +
  geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper)) + 
  geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
  facet_wrap( ~L1_code) +
  theme_bw()

Gender by prompt

tsne_dims_prompt <- tsne_dims %>% 
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(gender)) %>%
  mutate(prompt_gender = paste(prompt_id, gender, sep = "_")) %>%
  split(.$prompt_gender) %>% 
  map_df(get_group_centroid_with_CI, "prompt_gender") %>%
  separate(prompt_gender, c("prompt_id", "gender"), sep = "_")

  
ggplot(tsne_dims_prompt, aes(x = tsne_X_center, y = tsne_Y_center, fill = gender, color = gender)) +
  geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper), size = .5) + 
  geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
  facet_wrap(~prompt_id) +
  theme_bw()

All the differences in gender are accounted for by language.

Age by Language

tsne_dims_age <- tsne_dims %>% 
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(age)) %>%
  group_by(L1_code) %>%
  mutate(age_bin = ntile(age, 2)) %>%
  ungroup() %>%
  mutate(L1_age = paste(L1_code, age_bin, sep = "_")) %>%
  split(.$L1_age) %>% 
  map_df(get_group_centroid_with_CI, "L1_age") %>%
  separate(L1_age, c("L1_code", "age_bin"), sep = "_")

  
ggplot(tsne_dims_age, aes(x = tsne_X_center, y = tsne_Y_center, fill = age_bin, color = age_bin)) +
  geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper), size = .2) + 
  geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
  facet_wrap(~L1_code, scales = "free") +
  theme_bw()

Age by Prompt

tsne_dims_prompt_age <- tsne_dims %>% 
  nest(1:2, .key = "doc_vector") %>%
  filter(!is.na(age)) %>%
  group_by(L1_code) %>%
  mutate(age_bin = ntile(age, 2)) %>%
  ungroup() %>%
  mutate(prompt_age = paste(prompt_id, age_bin, sep = "_")) %>%
  split(.$prompt_age) %>% 
  map_df(get_group_centroid_with_CI, "prompt_age") %>%
  separate(prompt_age, c("prompt_id", "age_bin"), sep = "_")

  
ggplot(tsne_dims_prompt_age, aes(x = tsne_X_center, y = tsne_Y_center, fill = age_bin, color = age_bin)) +
  geom_pointrange(aes(ymin = tsne_Y_ci_lower, ymax = tsne_Y_ci_upper), size = .2) + 
  geom_errorbarh(aes(xmin = tsne_X_ci_lower, xmax = tsne_X_ci_upper, height = 0)) +
  facet_wrap(~prompt_id, scales = "free") +
  theme_bw()

** patterns are variable depending on tsne