packages

all predictors

# Load the per-language AoA predictor table and preview its first rows.
predictors <- read_csv(file = "data/predictors/aoa_predictor_data.csv")
head(predictors, n = 6L)

## # A tibble: 6 × 10
##   language  uni_lemma      lexical_category category   freq concreteness measure
##   <chr>     <chr>          <chr>            <chr>     <dbl>        <dbl> <chr>  
## 1 Cantonese air condition… nouns            furnitu… -1.66         0.816 produc…
## 2 Cantonese airplane       nouns            vehicles  0.465        0.846 produc…
## 3 Cantonese alligator      nouns            animals  -0.407        0.846 produc…
## 4 Cantonese ambulance      nouns            vehicles -1.10         0.695 produc…
## 5 Cantonese animal         nouns            animals   0.138        0.494 produc…
## 6 Cantonese ant            nouns            animals  -0.322        0.746 produc…
## # ℹ 3 more variables: intercept <dbl>, slope <dbl>, aoa <dbl>

all_languages

load_data

# Load the long-format ratings (one row per word x block x response) and
# preview the first rows.
norms <- read_csv(file = "data/predictors/samah_ratings.csv")
head(norms, n = 6L)

## # A tibble: 6 × 8
##   word     block           response totcount count proportion language uni_lemma
##   <chr>    <chr>           <chr>       <dbl> <dbl>      <dbl> <chr>    <chr>    
## 1 accident category_organ… material        7     1      0.143 Turkish  accident 
## 2 accident category_organ… none of…        7     6      0.857 Turkish  accident 
## 3 accident count_mass      count n…        7     1      0.143 Turkish  accident 
## 4 accident count_mass      mass no…        7     4      0.571 Turkish  accident 
## 5 accident count_mass      unclear…        7     2      0.286 Turkish  accident 
## 6 accident solidity        non-sol…        6     3      0.5   Turkish  accident

# Reshape long-format ratings into one row per uni_lemma with one column per
# response option holding its proportion.
#
# The "unclear/unknown" response occurs in both the solidity and count_mass
# blocks, so it is renamed per block before `block` is dropped; otherwise the
# two would collapse into a single column. Response options a lemma never
# received are filled with 0.
#
# NOTE(review): rows are keyed by uni_lemma only. If the same uni_lemma is
# rated in more than one language with different proportions, pivot_wider()
# will warn and return list-columns -- confirm this cannot happen for the
# all-language table.
widen_norms <- function(ratings) {
  ratings %>%
    select(uni_lemma, block, response, proportion) %>%
    distinct() %>%
    mutate(response = case_when(
      block == "solidity" & response == "unclear/unknown" ~ "unclear_solid",
      block == "count_mass" & response == "unclear/unknown" ~ "unclear_countmass",
      TRUE ~ response
    )) %>%
    select(-block) %>%
    pivot_wider(
      names_from = response,
      values_from = proportion,
      values_fill = 0
    )
}

# All ratings, regardless of rating language.
norms_wide <- widen_norms(norms)

# Ratings collected in Korean only.
norms_wide_korean <- norms %>%
  filter(language == "Korean") %>%
  widen_norms()

merge with my data

# Drop the "understands" measure, attach the wide norms by uni_lemma, and
# remove exact duplicate rows.
final_data2 <- distinct(
  left_join(
    filter(predictors, measure != "understands"),
    norms_wide,
    by = "uni_lemma"
  )
)
# Normalise column names to snake_case.
final_data2 <- janitor::clean_names(final_data2)

# 1. Strip any "[,1]" residue from the column names: the complete "[,1]" form
#    first, then a truncated "[,1" form in case any were missed.
colnames(final_data2) <- gsub(
  "\\[,1", "",
  gsub("\\[,1\\]", "", colnames(final_data2))
)

# 2. Columns that are expected to be numeric ('na' is assumed to be numeric).
numeric_cols <- c(
  "freq", "concreteness", "intercept", "slope", "aoa",
  "material", "none_of_these", "count_noun", "mass_noun",
  "unclear_countmass", "non_solid", "unclear_solid",
  "shape", "solid", "color", "na"
)

# 3. Coerce those columns to numeric. all_of() errors if a listed column is
#    absent; run glimpse(final_data2) to inspect the current types.
final_data2 <- mutate(final_data2, across(all_of(numeric_cols), as.numeric))

predictors

# Build the modelling table: keep the outcome (aoa), the predictors, and the
# grouping variables, restricted to complete cases.
#
# drop_na() with no arguments already removes every row with a missing value
# in any selected column, so no further NA checks are needed: log() of a
# strictly positive, non-missing freq cannot introduce new NAs.
#
# NOTE(review): freq contains negative values in the head(predictors) output,
# so it looks like it is already centred/transformed upstream. If so,
# `freq > 0` silently discards every below-average word and log() is applied
# to a standardised score -- confirm this is intended.
predictors_df_few <- final_data2 %>%
  select(
    uni_lemma, aoa, freq, concreteness, solid, shape, count_noun,
    category, language
  ) %>%
  drop_na() %>%
  filter(freq > 0) %>%
  mutate(logfreq = log(freq))

# Z-score the predictors (mean 0, sd 1) so model coefficients are comparable.
# scale() returns a one-column matrix; storing that in a data frame creates
# matrix columns that print as `name[,1]` and carry "scaled:*" attributes.
# as.numeric() keeps the same values as plain numeric vectors.
predictors_df_few <- predictors_df_few %>%
  mutate(
    across(
      c(logfreq, concreteness, solid, shape, count_noun),
      ~ as.numeric(scale(.x))
    )
  )

# Sanity check: count missing values in each scaled predictor.
summarise(
  predictors_df_few,
  across(
    c(logfreq, concreteness, solid, shape, count_noun),
    function(col) sum(is.na(col))
  )
)

## # A tibble: 1 × 5
##   logfreq concreteness solid shape count_noun
##     <int>        <int> <int> <int>      <int>
## 1       0            0     0     0          0

# Spot-check a single lemma across languages.
filter(predictors_df_few, uni_lemma == "apple")

## # A tibble: 12 × 10
##    uni_lemma   aoa   freq concreteness[,1] solid[,1] shape[,1] count_noun[,1]
##    <chr>     <dbl>  <dbl>            <dbl>     <dbl>     <dbl>          <dbl>
##  1 apple      21.6 0.537             0.928     0.803      1.56           1.05
##  2 apple      21.3 0.0903            0.520     0.803      1.56           1.05
##  3 apple      18.8 0.328             0.774     0.803      1.56           1.05
##  4 apple      18.9 0.709             0.541     0.803      1.56           1.05
##  5 apple      21.1 0.280             0.682     0.803      1.56           1.05
##  6 apple      19.2 0.355             0.593     0.803      1.56           1.05
##  7 apple      22.1 0.896             0.605     0.803      1.56           1.05
##  8 apple      21.4 0.942             0.835     0.803      1.56           1.05
##  9 apple      19.7 1.10              0.737     0.803      1.56           1.05
## 10 apple      23.9 0.102             0.624     0.803      1.56           1.05
## 11 apple      22.0 0.265             0.588     0.803      1.56           1.05
## 12 apple      21.9 0.634             0.764     0.803      1.56           1.05
## # ℹ 3 more variables: category <chr>, language <chr>, logfreq <dbl[,1]>

correlations

# Pairwise Pearson correlations between AoA and the predictors, using all
# complete pairs of observations for each pair of variables.
correlations <- cor(
  select(predictors_df_few, aoa, logfreq, concreteness, solid, shape, count_noun),
  use = "pairwise.complete.obs"
)

correlations

##                       aoa      logfreq concreteness       solid        shape
## aoa           1.000000000 -0.061236335  -0.14227607 0.001299702 -0.080568481
## logfreq      -0.061236335  1.000000000  -0.02734521 0.002136942  0.009270508
## concreteness -0.142276072 -0.027345214   1.00000000 0.285222637  0.161447297
## solid         0.001299702  0.002136942   0.28522264 1.000000000  0.383062283
## shape        -0.080568481  0.009270508   0.16144730 0.383062283  1.000000000
## count_noun   -0.019800150 -0.008907352   0.23091646 0.684515948  0.470803287
##                count_noun
## aoa          -0.019800150
## logfreq      -0.008907352
## concreteness  0.230916464
## solid         0.684515948
## shape         0.470803287
## count_noun    1.000000000

# Visualise the precomputed correlation matrix as a labelled heat map.
# ggpairs() expects raw observations; given the 6 x 6 matrix it would treat
# the six rows of coefficients as data points. ggcorr() accepts a correlation
# matrix directly through `cor_matrix`.
# theme_bw() is not added here because it would override ggcorr's own theme.
GGally::ggcorr(
  data = NULL,
  cor_matrix = correlations,
  label = TRUE,
  label_round = 2
) +
  labs(title = "Correlations between AoA and Predictors")

correlation with raw data

# Raw observations (not coefficients) for the pairs plot.
correlation_matrix <- select(
  predictors_df_few,
  aoa, logfreq, concreteness, solid, shape, count_noun
)

# Normalise the column names to snake_case.
correlation_matrix_clean <- janitor::clean_names(correlation_matrix)

# Inspect the column types
str(correlation_matrix_clean)

## tibble [1,458 × 6] (S3: tbl_df/tbl/data.frame)
##  $ aoa         : num [1:1458] 22.4 30.2 21.6 22.7 14.1 ...
##  $ logfreq     : num [1:1458, 1] 0.233 -0.795 0.355 0.287 0.282 ...
##   ..- attr(*, "scaled:center")= num -1.04
##   ..- attr(*, "scaled:scale")= num 1.18
##  $ concreteness: num [1:1458, 1] 0.757 -0.741 0.928 0.5 0.928 ...
##   ..- attr(*, "scaled:center")= num 0.668
##   ..- attr(*, "scaled:scale")= num 0.236
##  $ solid       : num [1:1458, 1] 0.8026 0.00807 0.8026 0.09636 -0.38919 ...
##   ..- attr(*, "scaled:center")= num 0.747
##   ..- attr(*, "scaled:scale")= num 0.315
##  $ shape       : num [1:1458, 1] -0.212 -1.392 1.556 0.355 1.665 ...
##   ..- attr(*, "scaled:center")= num 0.354
##   ..- attr(*, "scaled:scale")= num 0.254
##  $ count_noun  : num [1:1458, 1] 0.68128 -0.05549 1.04967 0.58918 -0.00286 ...
##   ..- attr(*, "scaled:center")= num 0.715
##   ..- attr(*, "scaled:scale")= num 0.271

# Coerce every column to a plain numeric vector, then keep only complete rows
# (the row filter is for plotting only).
correlation_matrix_clean <- drop_na(
  dplyr::mutate(correlation_matrix_clean, across(everything(), as.numeric))
)

# Pairs plot of the raw observations: semi-transparent scatterplots below the
# diagonal, correlation coefficients above it.
GGally::ggpairs(
  data = correlation_matrix_clean,
  upper = list(continuous = GGally::wrap("cor", size = 3)),
  lower = list(continuous = GGally::wrap("points", alpha = 0.2))
) +
  theme_bw() +
  labs(title = "Correlations between AoA and Predictors")

# Banner marking the start of the mixed-effects model output.
print(x = "using lmer")

## [1] "using lmer"

# Mixed model: fixed effects for the scaled predictors, random intercepts for
# language and for semantic category.
summary(
  lmerTest::lmer(
    formula = aoa ~ logfreq + concreteness + solid + shape + count_noun +
      (1 | language) + (1 | category),
    data = predictors_df_few
  )
)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: aoa ~ logfreq + concreteness + solid + shape + count_noun + (1 |  
##     language) + (1 | category)
##    Data: predictors_df_few
## 
## REML criterion at convergence: 7617
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.4560 -0.6475 -0.1343  0.5518  4.0423 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  language (Intercept)  2.7175  1.6485  
##  category (Intercept)  0.5627  0.7501  
##  Residual             10.3529  3.2176  
## Number of obs: 1458, groups:  language, 14; category, 9
## 
## Fixed effects:
##                Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)    23.59946    0.51523   18.83032  45.804  < 2e-16 ***
## logfreq        -0.34592    0.08718 1442.15924  -3.968 7.61e-05 ***
## concreteness   -0.53534    0.09472 1423.18450  -5.652 1.92e-08 ***
## solid           0.30552    0.12297 1424.68832   2.485 0.013087 *  
## shape          -0.37189    0.10297 1421.69975  -3.611 0.000315 ***
## count_noun     -0.01545    0.12840 1426.89906  -0.120 0.904221    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) logfrq cncrtn solid  shape 
## logfreq      0.001                            
## concretenss  0.011 -0.007                     
## solid       -0.005 -0.023 -0.163              
## shape       -0.012  0.013 -0.103 -0.097       
## count_noun   0.000  0.008 -0.051 -0.549 -0.315

# Same fixed effects, with a random intercept for language only.
summary(
  lmerTest::lmer(
    formula = aoa ~ logfreq + concreteness + solid + shape + count_noun +
      (1 | language),
    data = predictors_df_few
  )
)

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: aoa ~ logfreq + concreteness + solid + shape + count_noun + (1 |  
##     language)
##    Data: predictors_df_few
## 
## REML criterion at convergence: 7663
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -3.5164 -0.6724 -0.0952  0.5505  3.6049 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  language (Intercept)  2.689   1.640   
##  Residual             10.817   3.289   
## Number of obs: 1458, groups:  language, 14
## 
## Fixed effects:
##                Estimate Std. Error         df t value Pr(>|t|)    
## (Intercept)    23.49114    0.44722   13.08474  52.527  < 2e-16 ***
## logfreq        -0.38676    0.08821 1444.33195  -4.384 1.25e-05 ***
## concreteness   -0.60567    0.09157 1442.31904  -6.614 5.25e-11 ***
## solid           0.34574    0.12094 1439.70801   2.859 0.004314 ** 
## shape          -0.33741    0.09844 1439.56290  -3.428 0.000626 ***
## count_noun     -0.05997    0.12486 1439.89570  -0.480 0.631077    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) logfrq cncrtn solid  shape 
## logfreq      0.005                            
## concretenss -0.001  0.013                     
## solid        0.001 -0.013 -0.176              
## shape        0.001 -0.015 -0.047 -0.084       
## count_noun  -0.003  0.010 -0.031 -0.602 -0.308

# Banner marking the start of the ordinary least-squares output.
print(x = "using lm")

## [1] "using lm"

# Ordinary linear model with the same fixed effects and no random effects.
summary(
  lm(
    formula = aoa ~ logfreq + concreteness + solid + shape + count_noun,
    data = predictors_df_few
  )
)

## 
## Call:
## lm(formula = aoa ~ logfreq + concreteness + solid + shape + count_noun, 
##     data = predictors_df_few)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -9.7143 -2.4907 -0.3539  2.0348 12.3647 
## 
## Coefficients:
##              Estimate Std. Error t value Pr(>|t|)    
## (Intercept)  23.53399    0.09640 244.134  < 2e-16 ***
## logfreq      -0.24185    0.09649  -2.506  0.01230 *  
## concreteness -0.56978    0.10089  -5.648 1.95e-08 ***
## solid         0.28123    0.13496   2.084  0.03735 *  
## shape        -0.32115    0.10992  -2.922  0.00354 ** 
## count_noun    0.01414    0.13922   0.102  0.91913    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 3.681 on 1452 degrees of freedom
## Multiple R-squared:  0.03261,    Adjusted R-squared:  0.02928 
## F-statistic:  9.79 on 5 and 1452 DF,  p-value: 3.246e-09

🤯 So why does solid go in the opposite direction? It could be due to shared variance (multicollinearity).

solid is positively correlated with concreteness, shape, and count_noun. When all of them are included in the same model, the model partitions their unique contributions to AoA. In that partitioning, the model attributes the shared effect of concreteness, shape, and countability mostly to concreteness and shape (since they are stronger or more directly related to AoA in the data). What is left over, the residual unique effect of solid, may reflect words that are solid but not particularly concrete, shaped, or countable, and these might be learned later (e.g., abstract solids like “air conditioner” or opaque solids like “substance”). This is a known phenomenon in multiple regression: when predictors are collinear, the signs of coefficients can flip or seem unintuitive because shared variance is “assigned” to one variable over another.

korean-chosun

# Korean ratings (long format).
korean_props <- read_csv("data/predictors/jongmin_cleaned.csv")

# Lookup table of Korean words; copy item_definition into `theword` so it can
# serve as the merge key.
unilemma <- mutate(
  read_csv("data/predictors/koreanwords.csv"),
  theword = item_definition
)

# Attach the lookup columns by `theword` (base merge: only matching rows are
# kept), rename the response column, and flatten `proportion` to numeric.
korean_props <- korean_props %>%
  merge(unilemma, by = "theword") %>%
  rename(eng_response = response) %>%
  mutate(proportion = as.numeric(unlist(proportion)))

korean_data_clean

##  [1] "Form/Material"                      "Form/Color/Material"               
##  [3] "unclear_solid"                      "Form"                              
##  [5] "unclear_count"                      "Form/Color"                        
##  [7] "Can be counted"                     "Solid"                             
##  [9] "Material"                           "Not a solid"                       
## [11] "Color/Material"                     "Color"                             
## [13] "Cannot be counted"                  "Not applicable"                    
## [15] "Form/Material/Not applicable"       "Color/Not applicable"              
## [17] "Material/Not applicable"            "Form/Color/Material/Not applicable"
## [19] "Form/Color/Not applicable"

## Number of elements with multiple values: 6

## Positions with multiple values: 23 82 100 103 195 234

## Processing column: solid 
## Processing column: form 
## Processing column: can_be_counted 
## Processing column: cannot_be_counted 
## Processing column: unclear_count 
## Processing column: material 
## Processing column: not_a_solid 
## Processing column: color 
## Processing column: not_applicable 
## Processing column: unclear_solid

## 'data.frame':    246 obs. of  16 variables:
##  $ uni_lemma        : chr  "airplane" "animal" "ankle" "ant" ...
##  $ lexical_category : chr  "nouns" "nouns" "nouns" "nouns" ...
##  $ category         : chr  "vehicles" "animals" "body_parts" "animals" ...
##  $ freq             : num  1 0.402 -1.892 0.187 0.942 ...
##  $ concreteness     : num  0.825 0.48 0.677 0.727 0.865 ...
##  $ aoa              : num  23.3 28.8 33.5 25.2 21.4 ...
##  $ solid            : num  0.933 0.562 0.655 0.621 0.828 ...
##  $ form             : num  0.643 0.586 0.862 0.621 0.448 ...
##  $ can_be_counted   : num  0.933 0.676 0.806 0.703 0.906 ...
##  $ cannot_be_counted: num  0 0.027 0 0.0811 0 ...
##  $ unclear_count    : num  0.0667 0.2973 0.1944 0.2162 0.0938 ...
##  $ material         : num  0.0714 0.0345 0.0345 0.0345 0.1034 ...
##  $ not_a_solid      : num  0 0.0938 0.1034 0.1034 0.069 ...
##  $ color            : num  0 0 0 0.0345 0 ...
##  $ not_applicable   : num  0 0.069 0 0 0 ...
##  $ unclear_solid    : num  0.0667 0.3438 0.2414 0.2759 0.1034 ...

## [1] "using lm"

## 
## Call:
## lm(formula = aoa ~ logreq + concreteness + solid + form + can_be_counted, 
##     data = korean_clean)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -5.3847 -1.9966 -0.2149  1.3484 10.1271 
## 
## Coefficients:
##                Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    24.41298    0.26517  92.065  < 2e-16 ***
## logreq         -0.29298    0.27597  -1.062 0.290861    
## concreteness   -0.50350    0.29421  -1.711 0.089996 .  
## solid           1.52266    0.49428   3.081 0.002644 ** 
## form           -0.09357    0.35789  -0.261 0.794274    
## can_be_counted -1.72391    0.48372  -3.564 0.000553 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 2.74 on 104 degrees of freedom
##   (136 observations deleted due to missingness)
## Multiple R-squared:  0.1852, Adjusted R-squared:  0.1461 
## F-statistic: 4.729 on 5 and 104 DF,  p-value: 0.0006218

## [1] "using lmer"

## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: aoa ~ logreq + concreteness + solid + form + can_be_counted +  
##     (1 | category)
##    Data: korean_clean
## 
## REML criterion at convergence: 528
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -1.8815 -0.6188 -0.1965  0.5337  3.5280 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  category (Intercept) 0.9161   0.9571  
##  Residual             6.9063   2.6280  
## Number of obs: 110, groups:  category, 9
## 
## Fixed effects:
##                Estimate Std. Error       df t value Pr(>|t|)    
## (Intercept)     24.2217     0.4290   5.9825  56.459 2.17e-09 ***
## logreq          -0.2125     0.2744 103.9946  -0.775  0.44035    
## concreteness    -0.5788     0.2946 103.7664  -1.965  0.05215 .  
## solid            1.5858     0.5323  93.2608   2.979  0.00368 ** 
## form             0.1043     0.4517  32.3111   0.231  0.81878    
## can_be_counted  -1.7675     0.5441  74.7787  -3.249  0.00174 ** 
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##             (Intr) logreq cncrtn solid  form  
## logreq      -0.017                            
## concretenss  0.019 -0.275                     
## solid       -0.093  0.078 -0.071              
## form        -0.103 -0.026  0.028  0.002       
## can_be_cntd  0.106 -0.028 -0.043 -0.761 -0.377

graphs

aoa-normscores_all languages

2025-05-06

packages

all_languages

load_data

predictors

korean-chosun

korean replication