scores

# Per-essay metadata; its `score` column is summarised and plotted below.
SCORE_METADATA <- here("data/raw/models/all_model/merged_metadata.csv")
meta_data <- read_csv(file = SCORE_METADATA)

# Mean and standard deviation of essay scores, printed to two decimals.
kable(
  summarize(
    meta_data,
    mean_score = mean(score),
    sd_score = sd(score)
  ),
  digits = 2
)
mean_score sd_score
3.51 0.91

Distinctiveness by score group

All

# Language-name lookup table. It is joined to the distance summaries below on
# `ETS_lang_name` and supplies `lang_name3` for the x-axis labels.
LANG_NAME_PATH <- here(
  "data/processed/lang_names/ets_to_google_langcodes_complete.csv"
)
langs_clean <- read_csv(file = LANG_NAME_PATH)


DISTANCE_INFILE_ALL <- here(
  "analyses/01_distinctiveness/data/sampled_essay_distances_all.csv"
)

# Column names are passed explicitly, so readr treats the file's first line as
# data rather than as a header. Character columns are then turned into factors.
mean_language_distances_all <- mutate_if(
  read_csv(
    DISTANCE_INFILE_ALL,
    col_names = c(
      "language", "score_group", "distance_type",
      "distance", "n", "sample_id"
    )
  ),
  is.character,
  as.factor
)

# Put the two distance types side by side (one row per language, score_group
# and sample_id) and keep only their ratio.
# NOTE(review): the ratio is same_language / diff_language, whereas the plot
# axis label reads "between / within" -- confirm which direction is intended.
dists_wide_all <- mean_language_distances_all %>%
  select(-n) %>%
  spread(key = distance_type, value = distance) %>%
  mutate(same_diff_distance = same_language / diff_language) %>%
  select(-same_language, -diff_language)

# Bootstrapped mean (with confidence interval) of the ratio for each language.
# NOTE(review): no seed is set in this section, so the intervals will vary
# between runs unless one is set earlier in the document.
lang_means_all <- tidyboot_mean(
  group_by(dists_wide_all, language),
  column = same_diff_distance,
  nboot = 1000
)

# Bar plot of distinctiveness per language (all essays), with bootstrap CIs.
# No `fill` aesthetic is mapped here, so the fill scale and legend theme
# settings that were copied from the low/high plot had no effect and have been
# dropped.
lang_means_all %>%
  left_join(langs_clean %>%
              mutate(ETS_lang_name = toupper(ETS_lang_name)),
            by = c("language" = "ETS_lang_name")) %>%
  ggplot(aes(x = lang_name3, y = mean - 1)) +
  geom_bar(stat = "identity", position = "dodge") +
  geom_linerange(aes(ymin = ci_lower - 1, ymax = ci_upper - 1),
                 position = position_dodge(width = 1)) +
  # Values are plotted as (ratio - 1) and relabelled so the axis starts at 1.
  scale_y_continuous(name = "Semantic Distinctiveness \n(between / within cosine distance)",
                     breaks = c(0, .2, .4, .6),
                     labels = c("1.0", "1.2", "1.4", "1.6"),
                     limits = c(0, .6)) +
  xlab("Language") +
  theme_classic(base_size = 20) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1))

Stats

Descriptives

# Average the ratio over sampling runs within each language.
language_means <- summarize(
  group_by(dists_wide_all, language),
  same_diff_distance = mean(same_diff_distance)
)

# Mean and SD of those per-language averages, rounded to two decimals.
language_means %>%
  summarize(mean = mean(same_diff_distance), sd = sd(same_diff_distance)) %>%
  mutate_all(round, 2) %>%
  kable()
mean sd
1.26 0.09

Is the mean distinctiveness across languages different from 1?

# One-sample t-test of the per-language means against MU (two-sided, the
# t.test default); estimate and statistic are rounded for display.
MU <- 1
language_means %>%
  pull(same_diff_distance) %>%
  t.test(mu = MU) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()
estimate statistic p.value parameter conf.low conf.high method alternative
1.26 17.09 0 34 1.225575 1.286447 One Sample t-test two.sided
# One-sample t-test of the per-language means against MU (same call as the
# preceding chunk).
t.test(x = language_means$same_diff_distance, mu = MU) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()
estimate statistic p.value parameter conf.low conf.high method alternative
1.26 17.09 0 34 1.225575 1.286447 One Sample t-test two.sided
Testing for normality: Kolmogorov-Smirnov test.

Null hypothesis: test distribution is normal

# Compare the per-language means against a normal distribution whose mean and
# SD are taken from the same sample.
# NOTE(review): ?ks.test states that the distribution parameters must be
# pre-specified rather than estimated from the data, so this p-value is only
# approximate (conservative).
ks.test(
  language_means$same_diff_distance,
  "pnorm",
  mean = mean(language_means$same_diff_distance),
  sd = sd(language_means$same_diff_distance)
)
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  language_means$same_diff_distance
## D = 0.090512, p-value = 0.9119
## alternative hypothesis: two-sided
Non-parametric alternative to the t-test: Wilcoxon signed-rank test

Null hypothesis: the location (centre) of the per-language distinctiveness values equals 1

# One-sample Wilcoxon signed-rank test of the per-language means against MU.
wilcox.test(x = language_means$same_diff_distance, mu = MU)
## 
##  Wilcoxon signed rank test
## 
## data:  language_means$same_diff_distance
## V = 630, p-value = 5.821e-11
## alternative hypothesis: true location is not equal to 1

Low vs. high

# Input distances split by score group, and the output path for the figure.
DISTANCE_INFILE <- here(
  "analyses/01_distinctiveness/data/sampled_essay_distances_high_low.csv"
)
FIG_PATH <- here(
  "analyses/01_distinctiveness/results/distinctiveness_fig.pdf"
)

Main plot

# Column names are passed explicitly, so readr treats the file's first line as
# data rather than as a header.
mean_language_distances <- read_csv(
  DISTANCE_INFILE,
  col_names = c(
    "language", "score_group", "distance_type",
    "distance", "n", "sample_id"
  )
)

# Put the two distance types side by side (one row per language, score_group
# and sample_id) and keep only their ratio.
# The former `group_by(score_group)` before `spread()` did not change the
# computed values; it only left the result grouped. Every downstream use
# regroups explicitly, so the result is now returned ungrouped.
dists_wide <- mean_language_distances %>%
  select(-n) %>%
  spread(distance_type, distance) %>%
  mutate(same_diff_distance = same_language / diff_language) %>%
  select(-diff_language, -same_language)

# Bootstrapped mean (with confidence interval) of the ratio for every
# language x score-group cell.
lang_means <- tidyboot_mean(
  group_by(dists_wide, language, score_group),
  column = same_diff_distance,
  nboot = 1000
)

# Dodged bar plot of distinctiveness per language, split by score group, with
# bootstrap CIs.
main_plot <- lang_means %>%
  left_join(langs_clean %>%
              mutate(ETS_lang_name = toupper(ETS_lang_name)),
            by = c("language" = "ETS_lang_name")) %>%
  ggplot(aes(x = lang_name3, y = mean - 1, group = score_group, fill = score_group)) +
  geom_bar(stat = "identity", position = "dodge") +
  # Bars dodge by their own width (0.9 by default). Lineranges have no width,
  # so the dodge width must be given explicitly and must match the bars;
  # otherwise the error bars sit off-centre.
  geom_linerange(aes(ymin = ci_lower - 1, ymax = ci_upper - 1),
                 position = position_dodge(width = 0.9)) +
  # Values are plotted as (ratio - 1) and relabelled so the axis starts at 1.
  scale_y_continuous(name = "Semantic Distinctiveness \n(between / within cosine distance)",
                     breaks = c(0, .2, .4, .6),
                     labels = c("1.0", "1.2", "1.4", "1.6"),
                     limits = c(0, .6)) +
  scale_fill_manual(values = c("#fb9a99", "#a6cee3"), name = "Essay Score") +
  xlab("Language") +
  theme_classic(base_size = 20) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = 0.5),
        legend.position = c(.1, 0.8),
        axis.line = element_line(size = 1.2),
        axis.ticks = element_line(size = 1),
        legend.background = element_blank())

Histogram inset

# Small histogram of essay scores (one bin per score unit) used as an inset.
inset_plot <- ggplot(data = meta_data, mapping = aes(x = score)) +
  geom_histogram(binwidth = 1) +
  labs(x = "Essay Score", y = "N Essays") +
  theme_classic(base_size = 10)

# Draw the main plot on a canvas, then the inset at the given position
# (`x`, `y`, `width`, `height` as passed to draw_plot).
plot_with_inset <- ggdraw() +
  draw_plot(main_plot) +
  draw_plot(inset_plot, x = 0.83, y = 0.75, width = 0.14, height = 0.2)

plot_with_inset

# Write the composite figure to FIG_PATH.
# A ggplot object is only drawn when printed. Top-level auto-printing does not
# happen under source() or inside a function, which would leave an empty PDF,
# so the plot is printed explicitly.
pdf(FIG_PATH, width = 14)
print(plot_with_inset)
dev.off()

Stats

Descriptives
# Average the ratio over sampling runs for every language x score-group cell.
# summarize() drops only the last grouping variable, leaving the data grouped
# by language. Ungroup first so fct_rev() sees the whole score_group column at
# once and every row gets the same level order.
language_means_hl <- dists_wide %>%
  group_by(language, score_group) %>%
  summarize(same_diff_distance = mean(same_diff_distance)) %>%
  ungroup() %>%
  mutate(score_group = fct_rev(score_group))

# Mean and SD of the per-language averages within each score group, with
# numeric columns rounded to two decimals.
kable(
  language_means_hl %>%
    group_by(score_group) %>%
    summarize(mean = mean(same_diff_distance), sd = sd(same_diff_distance)) %>%
    mutate_if(is.numeric, round, 2)
)
score_group mean sd
low 1.27 0.08
high 1.21 0.09
Paired t-test: within each language, does distinctiveness differ between low- and high-scoring essays (low minus high)?
# Paired t-test of low vs. high score groups (estimate = low - high).
# The formula interface with `paired =` is rejected by the t.test formula
# method in R >= 4.4.0, and it paired observations by row order. Here the
# data are reshaped to one row per language so the pairing is explicit.
paired_means_hl <- language_means_hl %>%
  ungroup() %>%
  spread(score_group, same_diff_distance)

t.test(paired_means_hl$low, paired_means_hl$high, paired = TRUE) %>%
  tidy() %>%
  mutate_at(vars(estimate, statistic), round, 2) %>%
  kable()
estimate statistic p.value parameter conf.low conf.high method alternative
0.06 5.73 1.9e-06 34 0.0401324 0.0842251 Paired t-test two.sided
Testing for normality: Kolmogorov-Smirnov test.

Null hypothesis: test distribution is normal

# Per-language averages for each score group, as plain numeric vectors.
low_data <- pull(
  filter(language_means_hl, score_group == "low"),
  same_diff_distance
)

high_data <- pull(
  filter(language_means_hl, score_group == "high"),
  same_diff_distance
)

# NOTE(review): the normal's mean and SD are estimated from the same sample;
# ?ks.test states they must be pre-specified, so the p-value is approximate.
ks.test(low_data, "pnorm", mean = mean(low_data), sd = sd(low_data))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  low_data
## D = 0.13227, p-value = 0.5296
## alternative hypothesis: two-sided
# Same normality check for the high-score group (parameters again estimated
# from the sample, so the p-value is approximate).
ks.test(high_data, "pnorm", mean = mean(high_data), sd = sd(high_data))
## 
##  One-sample Kolmogorov-Smirnov test
## 
## data:  high_data
## D = 0.10694, p-value = 0.7791
## alternative hypothesis: two-sided
Non-parametric alternative to the t-test: Wilcoxon signed-rank test

null hypothesis: two samples come from same distribution

# Paired Wilcoxon signed-rank test; pairs are matched by position in the two
# vectors.
wilcox.test(x = low_data, y = high_data, paired = TRUE)
## 
##  Wilcoxon signed rank test
## 
## data:  low_data and high_data
## V = 574, p-value = 3.647e-06
## alternative hypothesis: true location shift is not equal to 0