# Load the data
korean_replication <- read_csv(here("data","korean_replication_prolific_props.csv")) 
## Rows: 2328 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): theword, block, response, eng_response
## dbl (2): totcount, proportion
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
korean_chosun <- read_csv(here("data","jongmin_cleaned.csv"))
## Rows: 2749 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Word, block, response, theword
## dbl (2): totcount, proportion
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
korean_unilemma <- read_csv(here("data","korean_unilemma.csv"))
## Rows: 1543 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): word, block, response, language, uni_lemma
## dbl (3): totcount, count, proportion
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
korean_chosun <- korean_chosun %>% rename(
  uni_lemma = Word
)
# proportion ratings from english speakers on korean unilemma
  korean_unilemma %>%
      filter(block == "category_organization") %>%
      ggplot(aes(x = proportion, color = response, fill = response)) +
      geom_density(alpha = 0.5) +
      labs(
        title = "Density of Proportions",
        x = "Proportion Value",
        y = "Density (words)"
      ) +
      scale_fill_discrete(name = "Response") +
      scale_color_discrete(name = "Response")+
      theme_minimal(base_size = 14) +
      theme(legend.position = "top")

✅ 1. Side-by-side or overlaid density plots

# Example: shape-only comparison

korean_replication <- korean_replication %>%
  mutate(matching_response = case_when(eng_response == "form" ~ "shape",
                            TRUE ~ eng_response))  %>%
  #filter(block == "category_organization", matching_response == "shape") %>%
  mutate(group = "Korean")

korean_replication %>%
  filter(block == "category_organization", eng_response == "form") %>%
  mutate(group = "Korean") %>%
  bind_rows(
    korean_unilemma %>%
      filter(block == "category_organization", response == "shape") %>%
      mutate(group = "English")
  ) %>%
  ggplot(aes(x = proportion, fill = group)) +
  geom_density(alpha = 0.5) +
  labs(title = "Shape-based Ratings: English vs Korean",
       x = "Proportion", y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")

korean_replication <- korean_replication %>%
  mutate(matching_response = case_when(eng_response == "form" ~ "shape",
                            TRUE ~ eng_response))  %>%
  mutate(group = "Korean")

korean_replication %>%
  filter(block == "solidity", eng_response == "solid") %>%
  mutate(group = "Korean") %>%
  bind_rows(
    korean_unilemma %>%
      filter(block == "solidity", response == "solid") %>%
      mutate(group = "English")
  ) %>%
  ggplot(aes(x = proportion, fill = group)) +
  geom_density(alpha = 0.5) +
  labs(title = "Solid ratings: English vs Korean",
       x = "Proportion", y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")

korean_replication <- korean_replication %>%
  mutate(matching_response = case_when(eng_response == "form" ~ "shape",
                            TRUE ~ eng_response))  %>%
  mutate(group = "Korean")

korean_replication %>%
  filter(block == "count_mass", eng_response == "countable") %>%
  mutate(group = "Korean") %>%
  bind_rows(
    korean_unilemma %>%
      filter(block == "count_mass", response == "count noun") %>%
      mutate(group = "English")
  ) %>%
  ggplot(aes(x = proportion, fill = group)) +
  geom_density(alpha = 0.5) +
  labs(title = "count ratings: English vs Korean",
       x = "Proportion", y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")

korean_replication <- korean_replication %>%
  mutate(matching_response = case_when(eng_response == "form" ~ "shape",
                            TRUE ~ eng_response))  %>%
  #filter(block == "category_organization", matching_response == "shape") %>%
  mutate(group = "Korean")

korean_replication %>%
  filter(block == "category_organization", eng_response == "form") %>%
  mutate(group = "Korean-online") %>%
  bind_rows(
    korean_chosun %>%
      filter(block == "category_organization", response == "Form") %>%
      mutate(group = "Korean_chosun")
  ) %>%
  ggplot(aes(x = proportion, fill = group)) +
  geom_density(alpha = 0.5) +
  labs(title = "Shape-based Ratings: Korean-online vs Korean-chosun",
       x = "Proportion", y = "Density") +
  theme_minimal() +
  scale_fill_brewer(palette = "Set1")

✅ 2. Word-level paired scatterplots

# Wide format for shape response
eng_shape <- korean_unilemma %>%
  filter(block == "category_organization", response == "shape") %>%
  mutate(uni_lemma = str_trim(str_to_lower(uni_lemma))) %>%
  select(uni_lemma, prop_eng = proportion)

kor_shape <- korean_chosun %>%
  filter(block == "category_organization", response == "Form") %>%
  mutate(uni_lemma = str_trim(str_to_lower(uni_lemma))) %>%
  select(uni_lemma, prop_kor = proportion)

shape_compare <- full_join(eng_shape, kor_shape, by = "uni_lemma")


ggplot(shape_compare, aes(x = prop_eng, y = prop_kor)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "Word-level Shape Ratings",
       x = "English Speakers (unilemma)", y = "Korean Speakers Chosun (Korean word)") +
  theme_minimal()
## Warning: Removed 164 rows containing missing values or values outside the scale range
## (`geom_point()`).

anti_join(eng_shape, kor_shape, by = "uni_lemma") %>% 
  pull(uni_lemma) %>% 
  head(20)
##  [1] "airplane"         "back (body part)" "blocks"           "boulder"         
##  [5] "bowl"             "broom"            "bubbles"          "bunny"           
##  [9] "cabbage"          "chicken (animal)" "chopstick"        "clock"           
## [13] "comb (object)"    "couch"            "dress (object)"   "dress shoe"      
## [17] "dried seaweed"    "fingernail"       "firetruck"        "fish (animal)"
# ✅ 3. Correlations between the two sets of ratings
cor(shape_compare$prop_eng, shape_compare$prop_kor, use = "complete.obs")
## [1] 0.3379003

✅ Paired statistical tests of distributional differences/divergence between English and Korean ratings

shape_compare_clean <- shape_compare %>%
  filter(!is.na(prop_eng), !is.na(prop_kor))

wilcox.test(shape_compare_clean$prop_eng, shape_compare_clean$prop_kor, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  shape_compare_clean$prop_eng and shape_compare_clean$prop_kor
## V = 1228.5, p-value = 6.415e-16
## alternative hypothesis: true location shift is not equal to 0
ks.test(shape_compare_clean$prop_eng, shape_compare_clean$prop_kor)
## Warning in ks.test.default(shape_compare_clean$prop_eng,
## shape_compare_clean$prop_kor): p-value will be approximate in the presence of
## ties
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  shape_compare_clean$prop_eng and shape_compare_clean$prop_kor
## D = 0.5, p-value = 2.814e-16
## alternative hypothesis: two-sided
# Wide format for shape response
kor_rep_shape <- korean_replication %>%
  filter(block == "category_organization", matching_response == "shape") %>%
  select(theword, prop_eng = proportion)

kor_shape <- korean_chosun %>%
  filter(block == "category_organization", response == "Form") %>%
  select(theword, prop_kor = proportion)

shape_compare_onlykoreans <- full_join(kor_rep_shape, kor_shape, by = "theword")
## Warning in full_join(kor_rep_shape, kor_shape, by = "theword"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 47 of `x` matches multiple rows in `y`.
## ℹ Row 8 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
##   "many-to-many"` to silence this warning.
ggplot(shape_compare_onlykoreans, aes(x = prop_eng, y = prop_kor)) +
  geom_point() +
  geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
  labs(title = "Word-level Shape Ratings",
       x = "korean speakers online", y = "Korean Speakers Chosun") +
  theme_minimal()
## Warning: Removed 101 rows containing missing values or values outside the scale range
## (`geom_point()`).

# ✅ 3. Correlations between the two sets of ratings
cor(shape_compare_onlykoreans$prop_eng, shape_compare_onlykoreans$prop_kor, use = "complete.obs")
## [1] 0.2444943

✅ Paired statistical tests of distributional differences/divergence between the two different Korean groups ratings

shape_compare_onlykoreans_clean <- shape_compare_onlykoreans %>%
  filter(!is.na(prop_eng), !is.na(prop_kor))

wilcox.test(shape_compare_onlykoreans_clean$prop_eng, shape_compare_onlykoreans_clean$prop_kor, paired = TRUE)
## 
##  Wilcoxon signed rank test with continuity correction
## 
## data:  shape_compare_onlykoreans_clean$prop_eng and shape_compare_onlykoreans_clean$prop_kor
## V = 34381, p-value = 0.002931
## alternative hypothesis: true location shift is not equal to 0
ks.test(shape_compare_onlykoreans_clean$prop_eng, shape_compare_onlykoreans_clean$prop_kor)
## Warning in ks.test.default(shape_compare_onlykoreans_clean$prop_eng,
## shape_compare_onlykoreans_clean$prop_kor): p-value will be approximate in the
## presence of ties
## 
##  Asymptotic two-sample Kolmogorov-Smirnov test
## 
## data:  shape_compare_onlykoreans_clean$prop_eng and shape_compare_onlykoreans_clean$prop_kor
## D = 0.33529, p-value < 2.2e-16
## alternative hypothesis: two-sided
get_dominant_category <- function(df, lang_label) {
  df %>%
    filter(block == "category_organization") %>%
  mutate(uni_lemma = str_trim(str_to_lower(uni_lemma))) %>%
    group_by(uni_lemma) %>%
    slice_max(proportion, with_ties = FALSE) %>%
    select(uni_lemma, category = response) %>%
    mutate(language = lang_label)
}

eng_dom <- get_dominant_category(korean_unilemma, "English")
kor_dom <- get_dominant_category(korean_chosun, "Korean") #%>%
 # rename(response = category)  # use eng_response in Korean dataset

merged_dom <- full_join(eng_dom, kor_dom, by = "uni_lemma", suffix = c("_eng", "_kor"))

# Proportion of words with matching dominant categories
mean(merged_dom$category_eng == merged_dom$category_kor, na.rm = TRUE)
## [1] 0
# A helper function to map Korean response strings to English-style categories
map_kor_to_eng <- function(cat) {
  cat <- tolower(cat)
  case_when(
    str_detect(cat, "form") ~ "shape",
    str_detect(cat, "material") ~ "material",
    str_detect(cat, "color") ~ "color",
    TRUE ~ "none of these"
  )
}

# Apply the mapping to the Korean categories
merged_dom <- merged_dom %>%
  mutate(category_kor_mapped = map_kor_to_eng(category_kor)) %>%
  mutate(
    category_eng = tolower(category_eng),
    category_kor_mapped = tolower(category_kor_mapped)
  )

# Did they share at least one conceptual category?
merged_dom %>%
  mutate(
    overlap = mapply(function(a, b) any(str_split(a, ",\\s*")[[1]] %in% str_split(b, ",\\s*")[[1]]),
                     tolower(category_eng),
                     tolower(category_kor))
  ) %>%
  summarise(agreement = mean(overlap, na.rm = TRUE))
## # A tibble: 319 × 2
##    uni_lemma        agreement
##    <chr>                <dbl>
##  1 airplane                 0
##  2 animal                   0
##  3 ankle                    0
##  4 ant                      0
##  5 anus                     0
##  6 apple                    0
##  7 arm                      0
##  8 back                     0
##  9 back (body part)         0
## 10 bag                      0
## # ℹ 309 more rows
table(merged_dom$category_eng, merged_dom$category_kor_mapped)
##                
##                 material none of these shape
##   color                0             1     9
##   material             4            22    71
##   none of these        0            14    41
##   shape                0            24    61