all predictors
# Read the AoA predictor table from disk and preview its first rows.
predictors <- read_csv(file = "data/predictors/aoa_predictor_data.csv")
head(predictors, n = 6L)
## # A tibble: 6 × 10
## language uni_lemma lexical_category category freq concreteness measure
## <chr> <chr> <chr> <chr> <dbl> <dbl> <chr>
## 1 Cantonese air condition… nouns furnitu… -1.66 0.816 produc…
## 2 Cantonese airplane nouns vehicles 0.465 0.846 produc…
## 3 Cantonese alligator nouns animals -0.407 0.846 produc…
## 4 Cantonese ambulance nouns vehicles -1.10 0.695 produc…
## 5 Cantonese animal nouns animals 0.138 0.494 produc…
## 6 Cantonese ant nouns animals -0.322 0.746 produc…
## # ℹ 3 more variables: intercept <dbl>, slope <dbl>, aoa <dbl>
# Read the word-property ratings file and preview its first rows.
norms <- read_csv(file = "data/predictors/samah_ratings.csv")
head(norms, n = 6L)
## # A tibble: 6 × 8
## word block response totcount count proportion language uni_lemma
## <chr> <chr> <chr> <dbl> <dbl> <dbl> <chr> <chr>
## 1 accident category_organ… material 7 1 0.143 Turkish accident
## 2 accident category_organ… none of… 7 6 0.857 Turkish accident
## 3 accident count_mass count n… 7 1 0.143 Turkish accident
## 4 accident count_mass mass no… 7 4 0.571 Turkish accident
## 5 accident count_mass unclear… 7 2 0.286 Turkish accident
## 6 accident solidity non-sol… 6 3 0.5 Turkish accident
# Reshape long-format ratings into one row per uni_lemma with one column per
# response option, each holding that response's `proportion`.
#
# `ratings` must contain the columns uni_lemma, block, response and proportion.
# "unclear/unknown" is relabelled separately for the solidity and count_mass
# blocks so the two do not collapse into a single column once `block` is
# dropped. Lemma/response combinations that never occur are filled with 0.
widen_norm_responses <- function(ratings) {
  ratings %>%
    select(uni_lemma, block, response, proportion) %>%
    distinct() %>%
    mutate(response = case_when(
      block == "solidity" & response == "unclear/unknown" ~ "unclear_solid",
      block == "count_mass" & response == "unclear/unknown" ~ "unclear_countmass",
      TRUE ~ response
    )) %>%
    select(-block) %>%
    pivot_wider(
      names_from = response,
      values_from = proportion,
      values_fill = 0
    )
}

# Ratings from every language in `norms`.
# NOTE(review): `language` is dropped before widening, so a uni_lemma rated in
# several languages with different proportions would yield more than one value
# per cell -- confirm that each uni_lemma/response pair is unique across
# languages.
norms_wide <- widen_norm_responses(norms)

# Same reshaping, restricted to the Korean ratings.
norms_wide_korean <- norms %>%
  filter(language == "Korean") %>%
  widen_norm_responses()
merge with my data
# Columns that must be numeric for modelling: the predictor/AoA columns plus
# the rating proportions, spelled the way clean_names() emits them.
numeric_cols <- c(
  "freq", "concreteness", "intercept", "slope", "aoa",
  "material", "none_of_these", "count_noun", "mass_noun",
  "unclear_countmass", "non_solid", "unclear_solid",
  "shape", "solid", "color", "na" # Assuming 'na' is numeric
)

# Attach the widened ratings to every non-"understands" predictor row, then
# normalise column names and coerce the numeric columns.
#
# No manual stripping of "[,1]" from the column names is needed: that suffix
# is only how tibbles print one-column matrix columns, it is never part of a
# name, and clean_names() has already normalised every name at this point.
# all_of() is used on purpose so that a missing expected column is an error
# instead of being skipped silently.
# glimpse(final_data2) # to see current types
final_data2 <- predictors %>%
  filter(measure != "understands") %>%
  left_join(norms_wide, by = "uni_lemma") %>%
  distinct() %>%
  janitor::clean_names() %>%
  mutate(across(all_of(numeric_cols), as.numeric))
# Modelling table: AoA plus frequency, concreteness and three rating-based
# predictors, restricted to complete cases and z-scored.
predictors_df_few <- final_data2 %>%
  select(
    uni_lemma, aoa, freq, concreteness, solid, shape, count_noun,
    category, language
  ) %>%
  # Complete cases on every selected column; this already covers `freq` and
  # the rating columns, so no further NA filtering is needed below.
  drop_na() %>%
  # log() is only defined for positive values, so non-positive `freq` rows
  # are dropped first.
  # NOTE(review): `freq` in the input already takes negative values (it looks
  # centred or log-scaled upstream). If so, this filter discards every
  # below-average-frequency word and logs the variable a second time --
  # confirm that this is intended.
  filter(freq > 0) %>%
  mutate(logfreq = log(freq)) %>%
  # scale() returns an n x 1 matrix; as.numeric() turns it back into a plain
  # vector so the columns are not stored (and printed) as `name[,1]` matrices.
  mutate(
    across(
      c(logfreq, concreteness, solid, shape, count_noun),
      ~ as.numeric(scale(.x))
    )
  )
# Count missing values per scaled predictor (all should be zero).
summarise(
  predictors_df_few,
  across(
    c(logfreq, concreteness, solid, shape, count_noun),
    function(column) sum(is.na(column))
  )
)
## # A tibble: 1 × 5
## logfreq concreteness solid shape count_noun
## <int> <int> <int> <int> <int>
## 1 0 0 0 0 0
# Spot-check: show every row for the lemma "apple".
filter(predictors_df_few, uni_lemma == "apple")
## # A tibble: 12 × 10
## uni_lemma aoa freq concreteness[,1] solid[,1] shape[,1] count_noun[,1]
## <chr> <dbl> <dbl> <dbl> <dbl> <dbl> <dbl>
## 1 apple 21.6 0.537 0.928 0.803 1.56 1.05
## 2 apple 21.3 0.0903 0.520 0.803 1.56 1.05
## 3 apple 18.8 0.328 0.774 0.803 1.56 1.05
## 4 apple 18.9 0.709 0.541 0.803 1.56 1.05
## 5 apple 21.1 0.280 0.682 0.803 1.56 1.05
## 6 apple 19.2 0.355 0.593 0.803 1.56 1.05
## 7 apple 22.1 0.896 0.605 0.803 1.56 1.05
## 8 apple 21.4 0.942 0.835 0.803 1.56 1.05
## 9 apple 19.7 1.10 0.737 0.803 1.56 1.05
## 10 apple 23.9 0.102 0.624 0.803 1.56 1.05
## 11 apple 22.0 0.265 0.588 0.803 1.56 1.05
## 12 apple 21.9 0.634 0.764 0.803 1.56 1.05
## # ℹ 3 more variables: category <chr>, language <chr>, logfreq <dbl[,1]>
correlations
# Pairwise Pearson correlations between AoA and the five predictors.
correlations <- cor(
  select(predictors_df_few, aoa, logfreq, concreteness, solid, shape, count_noun),
  use = "pairwise.complete.obs"
)
correlations
## aoa logfreq concreteness solid shape
## aoa 1.000000000 -0.061236335 -0.14227607 0.001299702 -0.080568481
## logfreq -0.061236335 1.000000000 -0.02734521 0.002136942 0.009270508
## concreteness -0.142276072 -0.027345214 1.00000000 0.285222637 0.161447297
## solid 0.001299702 0.002136942 0.28522264 1.000000000 0.383062283
## shape -0.080568481 0.009270508 0.16144730 0.383062283 1.000000000
## count_noun -0.019800150 -0.008907352 0.23091646 0.684515948 0.470803287
## count_noun
## aoa -0.019800150
## logfreq -0.008907352
## concreteness 0.230916464
## solid 0.684515948
## shape 0.470803287
## count_noun 1.000000000
# Scatterplot matrix of AoA and the scaled predictors.
# ggpairs() expects the observations themselves: handing it the 6 x 6
# `correlations` matrix would treat each row of coefficients as a data point,
# so the underlying rows are plotted instead.
predictors_df_few %>%
  select(aoa, logfreq, concreteness, solid, shape, count_noun) %>%
  # Scaled columns may be stored as one-column matrices; flatten for plotting.
  mutate(across(everything(), as.numeric)) %>%
  GGally::ggpairs(
    lower = list(continuous = wrap("points", alpha = 0.2)),
    upper = list(continuous = wrap("cor", size = 3))
  ) +
  theme_bw() +
  labs(title = "Correlations between AoA and Predictors")
correlation with raw data
# Keep only AoA and the five predictors from the modelling table.
correlation_matrix <- select(
  predictors_df_few,
  aoa, logfreq, concreteness, solid, shape, count_noun
)
# Normalise the column names.
correlation_matrix_clean <- janitor::clean_names(correlation_matrix)
# Inspect the column types.
str(correlation_matrix_clean)
## tibble [1,458 × 6] (S3: tbl_df/tbl/data.frame)
## $ aoa : num [1:1458] 22.4 30.2 21.6 22.7 14.1 ...
## $ logfreq : num [1:1458, 1] 0.233 -0.795 0.355 0.287 0.282 ...
## ..- attr(*, "scaled:center")= num -1.04
## ..- attr(*, "scaled:scale")= num 1.18
## $ concreteness: num [1:1458, 1] 0.757 -0.741 0.928 0.5 0.928 ...
## ..- attr(*, "scaled:center")= num 0.668
## ..- attr(*, "scaled:scale")= num 0.236
## $ solid : num [1:1458, 1] 0.8026 0.00807 0.8026 0.09636 -0.38919 ...
## ..- attr(*, "scaled:center")= num 0.747
## ..- attr(*, "scaled:scale")= num 0.315
## $ shape : num [1:1458, 1] -0.212 -1.392 1.556 0.355 1.665 ...
## ..- attr(*, "scaled:center")= num 0.354
## ..- attr(*, "scaled:scale")= num 0.254
## $ count_noun : num [1:1458, 1] 0.68128 -0.05549 1.04967 0.58918 -0.00286 ...
## ..- attr(*, "scaled:center")= num 0.715
## ..- attr(*, "scaled:scale")= num 0.271
# Coerce every column to a plain numeric vector, then keep complete rows only
# (the row filter matters for plotting only).
correlation_matrix_clean <- correlation_matrix_clean %>%
  dplyr::mutate(across(everything(), as.numeric)) %>%
  drop_na()

# Scatterplot matrix: raw points below the diagonal, correlations above it.
GGally::ggpairs(
  data = correlation_matrix_clean,
  upper = list(continuous = wrap("cor", size = 3)),
  lower = list(continuous = wrap("points", alpha = 0.2))
) +
  labs(title = "Correlations between AoA and Predictors") +
  theme_bw()
print(x = "using lmer")
## [1] "using lmer"
# Mixed model of AoA on the five scaled predictors, with random intercepts
# for language and for category.
summary(
  lmerTest::lmer(
    formula = aoa ~ logfreq + concreteness + solid + shape + count_noun +
      (1 | language) + (1 | category),
    data = predictors_df_few
  )
)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: aoa ~ logfreq + concreteness + solid + shape + count_noun + (1 |
## language) + (1 | category)
## Data: predictors_df_few
##
## REML criterion at convergence: 7617
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.4560 -0.6475 -0.1343 0.5518 4.0423
##
## Random effects:
## Groups Name Variance Std.Dev.
## language (Intercept) 2.7175 1.6485
## category (Intercept) 0.5627 0.7501
## Residual 10.3529 3.2176
## Number of obs: 1458, groups: language, 14; category, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 23.59946 0.51523 18.83032 45.804 < 2e-16 ***
## logfreq -0.34592 0.08718 1442.15924 -3.968 7.61e-05 ***
## concreteness -0.53534 0.09472 1423.18450 -5.652 1.92e-08 ***
## solid 0.30552 0.12297 1424.68832 2.485 0.013087 *
## shape -0.37189 0.10297 1421.69975 -3.611 0.000315 ***
## count_noun -0.01545 0.12840 1426.89906 -0.120 0.904221
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) logfrq cncrtn solid shape
## logfreq 0.001
## concretenss 0.011 -0.007
## solid -0.005 -0.023 -0.163
## shape -0.012 0.013 -0.103 -0.097
## count_noun 0.000 0.008 -0.051 -0.549 -0.315
# Same fixed effects, with a random intercept for language only.
summary(
  lmerTest::lmer(
    formula = aoa ~ logfreq + concreteness + solid + shape + count_noun +
      (1 | language),
    data = predictors_df_few
  )
)
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: aoa ~ logfreq + concreteness + solid + shape + count_noun + (1 |
## language)
## Data: predictors_df_few
##
## REML criterion at convergence: 7663
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -3.5164 -0.6724 -0.0952 0.5505 3.6049
##
## Random effects:
## Groups Name Variance Std.Dev.
## language (Intercept) 2.689 1.640
## Residual 10.817 3.289
## Number of obs: 1458, groups: language, 14
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 23.49114 0.44722 13.08474 52.527 < 2e-16 ***
## logfreq -0.38676 0.08821 1444.33195 -4.384 1.25e-05 ***
## concreteness -0.60567 0.09157 1442.31904 -6.614 5.25e-11 ***
## solid 0.34574 0.12094 1439.70801 2.859 0.004314 **
## shape -0.33741 0.09844 1439.56290 -3.428 0.000626 ***
## count_noun -0.05997 0.12486 1439.89570 -0.480 0.631077
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) logfrq cncrtn solid shape
## logfreq 0.005
## concretenss -0.001 0.013
## solid 0.001 -0.013 -0.176
## shape 0.001 -0.015 -0.047 -0.084
## count_noun -0.003 0.010 -0.031 -0.602 -0.308
print(x = "using lm")
## [1] "using lm"
# Ordinary least squares with the same fixed effects and no random effects.
summary(
  lm(
    formula = aoa ~ logfreq + concreteness + solid + shape + count_noun,
    data = predictors_df_few
  )
)
##
## Call:
## lm(formula = aoa ~ logfreq + concreteness + solid + shape + count_noun,
## data = predictors_df_few)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.7143 -2.4907 -0.3539 2.0348 12.3647
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 23.53399 0.09640 244.134 < 2e-16 ***
## logfreq -0.24185 0.09649 -2.506 0.01230 *
## concreteness -0.56978 0.10089 -5.648 1.95e-08 ***
## solid 0.28123 0.13496 2.084 0.03735 *
## shape -0.32115 0.10992 -2.922 0.00354 **
## count_noun 0.01414 0.13922 0.102 0.91913
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 3.681 on 1452 degrees of freedom
## Multiple R-squared: 0.03261, Adjusted R-squared: 0.02928
## F-statistic: 9.79 on 5 and 1452 DF, p-value: 3.246e-09
🤯 So why does `solid` go in the opposite direction? One possible explanation is shared variance among the predictors (multicollinearity).
`solid` is positively correlated with concreteness, shape, and count_noun (r ≈ 0.29, 0.38, and 0.68, respectively, in the correlation table above). When all of them are included in the same model, the model works to partition their unique contributions to AoA. In that partitioning, the model attributes the shared effect of concreteness, shape, and countability mostly to concreteness and shape (since they are stronger or more directly related to AoA in the data). What is left over, the residual unique effect of `solid`, may reflect words that are solid but not particularly concrete, shaped, or countable, and these might be learned later (e.g., abstract solids like “air conditioner” or opaque solids like “substance”). This is a known phenomenon in multiple regression: when predictors are collinear, the signs of coefficients can flip or seem unintuitive because the shared variance is “assigned” to one variable rather than another.
# Korean property ratings, plus the lookup table for the Korean word list.
korean_props <- read_csv(file = "data/predictors/jongmin_cleaned.csv")

# Duplicate `item_definition` as `theword` so it can serve as the merge key.
unilemma <- read_csv(file = "data/predictors/koreanwords.csv") %>%
  mutate(theword = item_definition)

# Merge the ratings with the lookup table on `theword`, rename the response
# column, and flatten `proportion` into a plain numeric vector.
korean_props <- korean_props %>%
  merge(unilemma, by = "theword") %>%
  rename(eng_response = response) %>%
  mutate(proportion = as.numeric(unlist(proportion)))
korean_data_clean
## [1] "Form/Material" "Form/Color/Material"
## [3] "unclear_solid" "Form"
## [5] "unclear_count" "Form/Color"
## [7] "Can be counted" "Solid"
## [9] "Material" "Not a solid"
## [11] "Color/Material" "Color"
## [13] "Cannot be counted" "Not applicable"
## [15] "Form/Material/Not applicable" "Color/Not applicable"
## [17] "Material/Not applicable" "Form/Color/Material/Not applicable"
## [19] "Form/Color/Not applicable"
## Number of elements with multiple values: 6
## Positions with multiple values: 23 82 100 103 195 234
## Processing column: solid
## Processing column: form
## Processing column: can_be_counted
## Processing column: cannot_be_counted
## Processing column: unclear_count
## Processing column: material
## Processing column: not_a_solid
## Processing column: color
## Processing column: not_applicable
## Processing column: unclear_solid
## 'data.frame': 246 obs. of 16 variables:
## $ uni_lemma : chr "airplane" "animal" "ankle" "ant" ...
## $ lexical_category : chr "nouns" "nouns" "nouns" "nouns" ...
## $ category : chr "vehicles" "animals" "body_parts" "animals" ...
## $ freq : num 1 0.402 -1.892 0.187 0.942 ...
## $ concreteness : num 0.825 0.48 0.677 0.727 0.865 ...
## $ aoa : num 23.3 28.8 33.5 25.2 21.4 ...
## $ solid : num 0.933 0.562 0.655 0.621 0.828 ...
## $ form : num 0.643 0.586 0.862 0.621 0.448 ...
## $ can_be_counted : num 0.933 0.676 0.806 0.703 0.906 ...
## $ cannot_be_counted: num 0 0.027 0 0.0811 0 ...
## $ unclear_count : num 0.0667 0.2973 0.1944 0.2162 0.0938 ...
## $ material : num 0.0714 0.0345 0.0345 0.0345 0.1034 ...
## $ not_a_solid : num 0 0.0938 0.1034 0.1034 0.069 ...
## $ color : num 0 0 0 0.0345 0 ...
## $ not_applicable : num 0 0.069 0 0 0 ...
## $ unclear_solid : num 0.0667 0.3438 0.2414 0.2759 0.1034 ...
## [1] "using lm"
##
## Call:
## lm(formula = aoa ~ logreq + concreteness + solid + form + can_be_counted,
## data = korean_clean)
##
## Residuals:
## Min 1Q Median 3Q Max
## -5.3847 -1.9966 -0.2149 1.3484 10.1271
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 24.41298 0.26517 92.065 < 2e-16 ***
## logreq -0.29298 0.27597 -1.062 0.290861
## concreteness -0.50350 0.29421 -1.711 0.089996 .
## solid 1.52266 0.49428 3.081 0.002644 **
## form -0.09357 0.35789 -0.261 0.794274
## can_be_counted -1.72391 0.48372 -3.564 0.000553 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.74 on 104 degrees of freedom
## (136 observations deleted due to missingness)
## Multiple R-squared: 0.1852, Adjusted R-squared: 0.1461
## F-statistic: 4.729 on 5 and 104 DF, p-value: 0.0006218
## [1] "using lmer"
## Linear mixed model fit by REML. t-tests use Satterthwaite's method [
## lmerModLmerTest]
## Formula: aoa ~ logreq + concreteness + solid + form + can_be_counted +
## (1 | category)
## Data: korean_clean
##
## REML criterion at convergence: 528
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -1.8815 -0.6188 -0.1965 0.5337 3.5280
##
## Random effects:
## Groups Name Variance Std.Dev.
## category (Intercept) 0.9161 0.9571
## Residual 6.9063 2.6280
## Number of obs: 110, groups: category, 9
##
## Fixed effects:
## Estimate Std. Error df t value Pr(>|t|)
## (Intercept) 24.2217 0.4290 5.9825 56.459 2.17e-09 ***
## logreq -0.2125 0.2744 103.9946 -0.775 0.44035
## concreteness -0.5788 0.2946 103.7664 -1.965 0.05215 .
## solid 1.5858 0.5323 93.2608 2.979 0.00368 **
## form 0.1043 0.4517 32.3111 0.231 0.81878
## can_be_counted -1.7675 0.5441 74.7787 -3.249 0.00174 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) logreq cncrtn solid form
## logreq -0.017
## concretenss 0.019 -0.275
## solid -0.093 0.078 -0.071
## form -0.103 -0.026 0.028 0.002
## can_be_cntd 0.106 -0.028 -0.043 -0.761 -0.377
graphs