TO DO:
- Re-run script 2 to get means for multiple words.
- Check the lon/lat values: where did they come from?
# Pairwise Swadesh correlations, read from the CSV named in INFILE.
INFILE <- "data/language_pairwise_swadesh_correlations_by_item.csv"
corr_df <- read_csv(file = INFILE)

# Language-name lookup table; only the two name columns and the
# coordinates are kept.
lang_names <- select(
  read_csv("/Users/mollylewis/Documents/research/Projects/1_in_progress/L2ETS/studies/study2/data/processed/lang_names/ets_to_google_langcodes_complete.csv"),
  ETS_lang_name, google_lang_name, lon, lat
)
# Square matrix of distances between every pair of language coordinates,
# with columns labelled by the google language name.
pairwise_physical_distances <- lang_names[, c("lon", "lat")] %>%
  as.matrix() %>%
  geosphere::distm()
colnames(pairwise_physical_distances) <- lang_names$google_lang_name

# Long format: one row per ordered (lang1, lang2) pair, self-pairs dropped.
# NOTE(review): data.frame() runs make.names() on column names by default,
# so any code that is not a syntactic R name (or is duplicated) would end up
# spelled differently in lang2 than in lang1 -- confirm the codes are
# unaffected.
pairwise_physical_dists <- data.frame(pairwise_physical_distances) %>%
  mutate(lang1 = colnames(pairwise_physical_distances)) %>%
  select(lang1, everything()) %>%
  gather(key = "lang2", value = "dist", -lang1) %>%
  filter(lang1 != lang2)

# Quick look at the distribution of pairwise distances.
hist(pairwise_physical_dists$dist)

# Attach the physical distance to each correlation row. No `by` is given,
# so the join uses whatever columns the two tables share.
corr_geo <- corr_df %>%
  left_join(pairwise_physical_dists)
# Correlation vs. distance, one panel per item, with a linear fit in each.
ggplot(corr_geo, aes(x = r, y = dist)) +
  geom_point(size = 0.1) +
  geom_smooth(aes(color = item), method = "lm", se = FALSE) +
  facet_wrap(~item) +
  labs(
    x = "swadesh pairwise distance correlation",
    y = "distance (meters)"
  ) +
  theme_classic()

# All points in a single panel, overlaid with one linear fit per item.
ggplot(corr_geo, aes(x = r, y = dist)) +
  geom_point(size = 0.1) +
  geom_smooth(aes(color = item), method = "lm", se = FALSE, size = 0.5) +
  xlim(0, 1) +
  scale_color_discrete(name = "Swadesh word") +
  labs(
    x = "Word Pairwise Correlation (Pearson's r)",
    y = "Distance (meters)"
  ) +
  theme_classic(base_size = 15) +
  # Must follow theme_classic() so the legend placement is not reset.
  theme(legend.position = c(0.8, 0.7))

# Same axes as the combined plot, but showing only the per-item linear fits.
ggplot(corr_geo, aes(x = r, y = dist)) +
  geom_smooth(aes(color = item), method = "lm", se = FALSE, size = 0.5) +
  xlim(0, 1) +
  scale_color_discrete(name = "Swadesh word") +
  labs(
    x = "Word Pairwise Correlation (Pearson's r)",
    y = "Distance (meters)"
  ) +
  theme_classic(base_size = 15) +
  # Must follow theme_classic() so the legend placement is not reset.
  theme(legend.position = c(0.8, 0.7))

# Per-item correlation test between r and dist, tabulated by estimate
# in ascending order.
corr_geo %>%
  group_by(item) %>%
  do(tidy(cor.test(.$r, .$dist))) %>%
  arrange(estimate) %>%
  select(item, estimate, p.value) %>%
  kable()
| sun |
-0.3083445 |
0.0000000 |
| mountain |
-0.2966179 |
0.0000000 |
| day |
-0.2912059 |
0.0000000 |
| year |
-0.2869088 |
0.0000000 |
| moon |
-0.2864168 |
0.0000000 |
| water |
-0.2820849 |
0.0000000 |
| river |
-0.2795073 |
0.0000000 |
| sea |
-0.2753706 |
0.0000000 |
| earth |
-0.2727574 |
0.0000000 |
| salt |
-0.2706258 |
0.0000000 |
| fire |
-0.2384460 |
0.0000000 |
| dust |
-0.2376438 |
0.0000000 |
| sand |
-0.2364880 |
0.0000001 |
| smoke |
-0.2264250 |
0.0000001 |
| sky |
-0.2252467 |
0.0000001 |
| stone |
-0.2157788 |
0.0000002 |
| cloud |
-0.2099182 |
0.0000005 |
| night |
-0.2023687 |
0.0000109 |
| wind |
-0.1966189 |
0.0000053 |
| ash |
-0.1916804 |
0.0000092 |
| lake |
-0.1765190 |
0.0000453 |
| star |
-0.1327586 |
0.0016249 |
Excluding outliers
# Read every feather file in the directory and stack them row-wise.
all_dists <- fs::dir_ls("/Volumes/wilbur_the_great/pairwise_swadesh_words/") %>%
  map_df(read_feather)

# Number of distinct word1 values per language, listing only languages
# with fewer than 22, fewest first.
all_dists %>%
  group_by(lang) %>%
  distinct(word1) %>%
  count(lang) %>%
  filter(n < 22) %>%
  arrange(n) %>%
  kable()
| ig |
13 |
| yo |
14 |
| th |
18 |
| de |
20 |
| nl |
20 |
| kn |
21 |
| ml |
21 |
| ne |
21 |
| te |
21 |
Four languages are outliers in their mean pairwise correlation (“ja”, “ig”, “zh”, “yo”; see the plot below). Let’s exclude them.
#' Bootstrapped mean pairwise correlation for one language
#'
#' Keeps the rows of `df` in which `current_lang` appears as either member
#' of the language pair and summarises column `r` with
#' `multi_boot_standard()`.
#'
#' @param current_lang Language code matched against `lang1` and `lang2`.
#' @param df Data frame with columns `lang1`, `lang2` and `r`. Defaults to
#'   the global `corr_df`, which the function previously always read
#'   regardless of this argument.
#' @return The `multi_boot_standard()` summary with a `lang` column holding
#'   `current_lang` added and the helper grouping column removed.
get_mean_corr <- function(current_lang, df = corr_df) {
  df %>%
    filter(lang1 == current_lang | lang2 == current_lang) %>%
    # A constant dummy column puts every remaining row into one group
    # before the bootstrap summary; it is dropped again at the end.
    mutate(temp = "temp") %>%
    group_by(temp) %>%
    multi_boot_standard(col = "r") %>%
    ungroup() %>%
    mutate(lang = current_lang) %>%
    select(-temp)
}
# One summary row per distinct google language name.
mean_corr <- unique(lang_names$google_lang_name) %>%
  map_df(get_mean_corr)

# Languages ordered by their mean, with ci_lower / ci_upper as the range.
mean_corr %>%
  ggplot(aes(x = fct_reorder(lang, mean), y = mean)) +
  geom_pointrange(aes(ymin = ci_lower, ymax = ci_upper)) +
  theme_classic()

# Drop every pair in which either language is one of the four excluded codes.
corr_df_filtered <- corr_df %>%
  filter(
    !(lang1 %in% c("ja", "ig", "zh", "yo") |
        lang2 %in% c("ja", "ig", "zh", "yo"))
  )

# Attach physical distances to the filtered pairs (join keys are implicit).
corr_geo_filtered <- corr_df_filtered %>%
  left_join(pairwise_physical_dists)
# Filtered data: correlation vs. distance, one panel per item, linear fits.
ggplot(corr_geo_filtered, aes(x = r, y = dist)) +
  geom_point(size = 0.1) +
  geom_smooth(aes(color = item), method = "lm", se = FALSE) +
  facet_wrap(~item) +
  labs(
    x = "swadesh pairwise distance correlation",
    y = "distance (meters)"
  ) +
  theme_classic()

# Filtered data: per-item linear fits only, in a single panel.
ggplot(corr_geo_filtered, aes(x = r, y = dist)) +
  geom_smooth(aes(color = item), method = "lm", se = FALSE) +
  labs(
    x = "swadesh pairwise distance correlation",
    y = "distance (meters)"
  ) +
  theme_classic()

# Per-item correlation test between r and dist on the filtered data,
# tabulated by estimate in ascending order.
corr_geo_filtered %>%
  group_by(item) %>%
  do(tidy(cor.test(.$r, .$dist))) %>%
  arrange(estimate) %>%
  select(item, estimate, p.value) %>%
  kable()
| sand |
-0.2040444 |
0.0000180 |
| night |
-0.1790781 |
0.0002872 |
| day |
-0.1706691 |
0.0002177 |
| smoke |
-0.1704664 |
0.0002216 |
| mountain |
-0.1651925 |
0.0005417 |
| sun |
-0.1638499 |
0.0006019 |
| moon |
-0.1534174 |
0.0013293 |
| dust |
-0.1335875 |
0.0052596 |
| year |
-0.1298341 |
0.0050462 |
| fire |
-0.1201527 |
0.0121463 |
| earth |
-0.0996430 |
0.0316937 |
| sea |
-0.0994672 |
0.0319973 |
| wind |
-0.0930592 |
0.0448917 |
| ash |
-0.0785465 |
0.0906802 |
| water |
-0.0737212 |
0.1247240 |
| cloud |
-0.0564941 |
0.2240141 |
| lake |
-0.0417242 |
0.3853391 |
| sky |
-0.0170535 |
0.7137856 |
| stone |
-0.0000216 |
0.9996299 |
| river |
0.0073467 |
0.8785656 |
| salt |
0.0693106 |
0.1489742 |
| star |
0.0818810 |
0.0777520 |