# Load the data
korean_replication <- read_csv(here("data","korean_replication_prolific_props.csv"))
## Rows: 2328 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): theword, block, response, eng_response
## dbl (2): totcount, proportion
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
korean_chosun <- read_csv(here("data","jongmin_cleaned.csv"))
## Rows: 2749 Columns: 6
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (4): Word, block, response, theword
## dbl (2): totcount, proportion
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
korean_unilemma <- read_csv(here("data","korean_unilemma.csv"))
## Rows: 1543 Columns: 8
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (5): word, block, response, language, uni_lemma
## dbl (3): totcount, count, proportion
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
korean_chosun <- korean_chosun %>% rename(
uni_lemma = Word
)
# proportion ratings from english speakers on korean unilemma
korean_unilemma %>%
filter(block == "category_organization") %>%
ggplot(aes(x = proportion, color = response, fill = response)) +
geom_density(alpha = 0.5) +
labs(
title = "Density of Proportions",
x = "Proportion Value",
y = "Density (words)"
) +
scale_fill_discrete(name = "Response") +
scale_color_discrete(name = "Response")+
theme_minimal(base_size = 14) +
theme(legend.position = "top")
✅ 1. Side-by-side or overlaid density plots
# Example: shape-only comparison
korean_replication <- korean_replication %>%
mutate(matching_response = case_when(eng_response == "form" ~ "shape",
TRUE ~ eng_response)) %>%
#filter(block == "category_organization", matching_response == "shape") %>%
mutate(group = "Korean")
korean_replication %>%
filter(block == "category_organization", eng_response == "form") %>%
mutate(group = "Korean") %>%
bind_rows(
korean_unilemma %>%
filter(block == "category_organization", response == "shape") %>%
mutate(group = "English")
) %>%
ggplot(aes(x = proportion, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Shape-based Ratings: English vs Korean",
x = "Proportion", y = "Density") +
theme_minimal() +
scale_fill_brewer(palette = "Set1")
korean_replication <- korean_replication %>%
mutate(matching_response = case_when(eng_response == "form" ~ "shape",
TRUE ~ eng_response)) %>%
mutate(group = "Korean")
korean_replication %>%
filter(block == "solidity", eng_response == "solid") %>%
mutate(group = "Korean") %>%
bind_rows(
korean_unilemma %>%
filter(block == "solidity", response == "solid") %>%
mutate(group = "English")
) %>%
ggplot(aes(x = proportion, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Solid ratings: English vs Korean",
x = "Proportion", y = "Density") +
theme_minimal() +
scale_fill_brewer(palette = "Set1")
korean_replication <- korean_replication %>%
mutate(matching_response = case_when(eng_response == "form" ~ "shape",
TRUE ~ eng_response)) %>%
mutate(group = "Korean")
korean_replication %>%
filter(block == "count_mass", eng_response == "countable") %>%
mutate(group = "Korean") %>%
bind_rows(
korean_unilemma %>%
filter(block == "count_mass", response == "count noun") %>%
mutate(group = "English")
) %>%
ggplot(aes(x = proportion, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "count ratings: English vs Korean",
x = "Proportion", y = "Density") +
theme_minimal() +
scale_fill_brewer(palette = "Set1")
korean_replication <- korean_replication %>%
mutate(matching_response = case_when(eng_response == "form" ~ "shape",
TRUE ~ eng_response)) %>%
#filter(block == "category_organization", matching_response == "shape") %>%
mutate(group = "Korean")
korean_replication %>%
filter(block == "category_organization", eng_response == "form") %>%
mutate(group = "Korean-online") %>%
bind_rows(
korean_chosun %>%
filter(block == "category_organization", response == "Form") %>%
mutate(group = "Korean_chosun")
) %>%
ggplot(aes(x = proportion, fill = group)) +
geom_density(alpha = 0.5) +
labs(title = "Shape-based Ratings: Korean-online vs Korean-chosun",
x = "Proportion", y = "Density") +
theme_minimal() +
scale_fill_brewer(palette = "Set1")
✅ 2. Word-level paired scatterplots
# Wide format for shape response
eng_shape <- korean_unilemma %>%
filter(block == "category_organization", response == "shape") %>%
mutate(uni_lemma = str_trim(str_to_lower(uni_lemma))) %>%
select(uni_lemma, prop_eng = proportion)
kor_shape <- korean_chosun %>%
filter(block == "category_organization", response == "Form") %>%
mutate(uni_lemma = str_trim(str_to_lower(uni_lemma))) %>%
select(uni_lemma, prop_kor = proportion)
shape_compare <- full_join(eng_shape, kor_shape, by = "uni_lemma")
ggplot(shape_compare, aes(x = prop_eng, y = prop_kor)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Word-level Shape Ratings",
x = "English Speakers (unilemma)", y = "Korean Speakers Chosun (Korean word)") +
theme_minimal()
## Warning: Removed 164 rows containing missing values or values outside the scale range
## (`geom_point()`).
anti_join(eng_shape, kor_shape, by = "uni_lemma") %>%
pull(uni_lemma) %>%
head(20)
## [1] "airplane" "back (body part)" "blocks" "boulder"
## [5] "bowl" "broom" "bubbles" "bunny"
## [9] "cabbage" "chicken (animal)" "chopstick" "clock"
## [13] "comb (object)" "couch" "dress (object)" "dress shoe"
## [17] "dried seaweed" "fingernail" "firetruck" "fish (animal)"
# ✅ 3. Correlations between the two sets of ratings
cor(shape_compare$prop_eng, shape_compare$prop_kor, use = "complete.obs")
## [1] 0.3379003
✅ Paired statistical tests of distributional differences/divergence between English and Korean ratings
shape_compare_clean <- shape_compare %>%
filter(!is.na(prop_eng), !is.na(prop_kor))
wilcox.test(shape_compare_clean$prop_eng, shape_compare_clean$prop_kor, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: shape_compare_clean$prop_eng and shape_compare_clean$prop_kor
## V = 1228.5, p-value = 6.415e-16
## alternative hypothesis: true location shift is not equal to 0
ks.test(shape_compare_clean$prop_eng, shape_compare_clean$prop_kor)
## Warning in ks.test.default(shape_compare_clean$prop_eng,
## shape_compare_clean$prop_kor): p-value will be approximate in the presence of
## ties
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: shape_compare_clean$prop_eng and shape_compare_clean$prop_kor
## D = 0.5, p-value = 2.814e-16
## alternative hypothesis: two-sided
# Wide format for shape response
kor_rep_shape <- korean_replication %>%
filter(block == "category_organization", matching_response == "shape") %>%
select(theword, prop_eng = proportion)
kor_shape <- korean_chosun %>%
filter(block == "category_organization", response == "Form") %>%
select(theword, prop_kor = proportion)
shape_compare_onlykoreans <- full_join(kor_rep_shape, kor_shape, by = "theword")
## Warning in full_join(kor_rep_shape, kor_shape, by = "theword"): Detected an unexpected many-to-many relationship between `x` and `y`.
## ℹ Row 47 of `x` matches multiple rows in `y`.
## ℹ Row 8 of `y` matches multiple rows in `x`.
## ℹ If a many-to-many relationship is expected, set `relationship =
## "many-to-many"` to silence this warning.
ggplot(shape_compare_onlykoreans, aes(x = prop_eng, y = prop_kor)) +
geom_point() +
geom_abline(slope = 1, intercept = 0, linetype = "dashed") +
labs(title = "Word-level Shape Ratings",
x = "korean speakers online", y = "Korean Speakers Chosun") +
theme_minimal()
## Warning: Removed 101 rows containing missing values or values outside the scale range
## (`geom_point()`).
# ✅ 3. Correlations between the two sets of ratings
cor(shape_compare_onlykoreans$prop_eng, shape_compare_onlykoreans$prop_kor, use = "complete.obs")
## [1] 0.2444943
✅ Paired statistical tests of distributional differences/divergence between the two different Korean groups ratings
shape_compare_onlykoreans_clean <- shape_compare_onlykoreans %>%
filter(!is.na(prop_eng), !is.na(prop_kor))
wilcox.test(shape_compare_onlykoreans_clean$prop_eng, shape_compare_onlykoreans_clean$prop_kor, paired = TRUE)
##
## Wilcoxon signed rank test with continuity correction
##
## data: shape_compare_onlykoreans_clean$prop_eng and shape_compare_onlykoreans_clean$prop_kor
## V = 34381, p-value = 0.002931
## alternative hypothesis: true location shift is not equal to 0
ks.test(shape_compare_onlykoreans_clean$prop_eng, shape_compare_onlykoreans_clean$prop_kor)
## Warning in ks.test.default(shape_compare_onlykoreans_clean$prop_eng,
## shape_compare_onlykoreans_clean$prop_kor): p-value will be approximate in the
## presence of ties
##
## Asymptotic two-sample Kolmogorov-Smirnov test
##
## data: shape_compare_onlykoreans_clean$prop_eng and shape_compare_onlykoreans_clean$prop_kor
## D = 0.33529, p-value < 2.2e-16
## alternative hypothesis: two-sided
get_dominant_category <- function(df, lang_label) {
df %>%
filter(block == "category_organization") %>%
mutate(uni_lemma = str_trim(str_to_lower(uni_lemma))) %>%
group_by(uni_lemma) %>%
slice_max(proportion, with_ties = FALSE) %>%
select(uni_lemma, category = response) %>%
mutate(language = lang_label)
}
eng_dom <- get_dominant_category(korean_unilemma, "English")
kor_dom <- get_dominant_category(korean_chosun, "Korean") #%>%
# rename(response = category) # use eng_response in Korean dataset
merged_dom <- full_join(eng_dom, kor_dom, by = "uni_lemma", suffix = c("_eng", "_kor"))
# Proportion of words with matching dominant categories
mean(merged_dom$category_eng == merged_dom$category_kor, na.rm = TRUE)
## [1] 0
# A helper function to map Korean response strings to English-style categories
map_kor_to_eng <- function(cat) {
cat <- tolower(cat)
case_when(
str_detect(cat, "form") ~ "shape",
str_detect(cat, "material") ~ "material",
str_detect(cat, "color") ~ "color",
TRUE ~ "none of these"
)
}
# Apply the mapping to the Korean categories
merged_dom <- merged_dom %>%
mutate(category_kor_mapped = map_kor_to_eng(category_kor)) %>%
mutate(
category_eng = tolower(category_eng),
category_kor_mapped = tolower(category_kor_mapped)
)
# Did they share at least one conceptual category?
merged_dom %>%
mutate(
overlap = mapply(function(a, b) any(str_split(a, ",\\s*")[[1]] %in% str_split(b, ",\\s*")[[1]]),
tolower(category_eng),
tolower(category_kor))
) %>%
summarise(agreement = mean(overlap, na.rm = TRUE))
## # A tibble: 319 × 2
## uni_lemma agreement
## <chr> <dbl>
## 1 airplane 0
## 2 animal 0
## 3 ankle 0
## 4 ant 0
## 5 anus 0
## 6 apple 0
## 7 arm 0
## 8 back 0
## 9 back (body part) 0
## 10 bag 0
## # ℹ 309 more rows
table(merged_dom$category_eng, merged_dom$category_kor_mapped)
##
## material none of these shape
## color 0 1 9
## material 4 22 71
## none of these 0 14 41
## shape 0 24 61