Language codes (from WALS) – WALS, ISO, ascii-name. These are used to merge across datasets.
codes = read.csv("../data/language_codes.csv") %>%
select(language, WALS, ISO) %>%
mutate(ISO = unlist(lapply(strsplit(as.character(ISO),
","),function(x) x[1])))
# in cases where there are two ISO codes, take the first
Lupyan & Dale (2010): Demographic variables and syntactic complexity
wichmann_ISOS = read.csv("../data/ISO_mappings_from_wichmann.csv",
stringsAsFactors=F)[-1] %>%
unique()
ld = read.table("../data/lupyan_2010.txt", fill = T,
header = T, sep = "\t", na.strings = "*") %>%
left_join(codes, c("walsCode" = "WALS")) %>%
mutate(lang = tolower(lang)) %>%
left_join(wichmann_ISOS, by = "lang") %>% # use Wichmann ISOs to fill in some missing codes
mutate(ISO = as.factor(ifelse(is.na(ISO.x), ISO.y, ISO.x))) %>%
select(-ISO.y, -ISO.x) %>%
filter(!is.na(ISO)) %>%
filter(ISO != "")
# rename columns
ld = rename(ld, pop_log = logpop2,
mean.temp = aveTemp,
n.neighbors = numNeighbors,
sum.precip = sumPrecip,
sd.temp = sdTemp,
sd.precip = sdPrecip,
lang.family = langFamily,
lang.genus = langGenus,
native.country = nativeCountry,
native.country.area = nativeCountryArea,
growing.season = growingSeason)
# get means across languages
ld.demo = ld %>%
group_by(ISO) %>%
summarise_each(funs(mean(., na.rm = TRUE)),
c(8:9, 16:121)) %>% # columns referenced by position in the Lupyan & Dale file
select(1:3, 93:95, 100:101, 103:105, 108)
# add in language family data
fams = ld %>%
group_by(ISO) %>%
filter(row_number() == 1) %>%
select(ISO, lang.family, lang.genus, native.country,
native.country.area, language)
ld.demo = left_join(fams, ld.demo, by = "ISO") %>% ungroup()
WALS features of complexity
# get variables to include
qual = ld %>%
select(18:103, 124) %>% # WALS feature columns plus ISO, by position
mutate_each(funs(as.factor)) %>%
group_by(ISO) %>%
summarise_each(funs(most.frequent.level)) %>%
mutate_each(funs(as.factor))
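most.frequent.level is a helper defined in our setup chunk (not shown here); a minimal sketch, assuming it returns the modal level of a factor:
# assumed definition -- the real helper lives in the setup chunk
most.frequent.level = function(x) {
names(which.max(table(x)))
}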
ld_feature_names = read.csv("../data/lupyan_2010_all_feature_mappings.csv")
qualVarNames = intersect(names(qual), ld_feature_names$WALS.feature.name)
qual = select(qual,which(is.element(names(qual), c("ISO", qualVarNames))))
# remap factor levels to complexity values (0,1)
# note: two variables reported in the paper (NICPOC [59] and CYSIND [39]) are missing in the data
for (i in 1:length(qualVarNames)){
thisVarLabs = ld_feature_names[ld_feature_names$WALS.feature.name == qualVarNames[i],]
old = thisVarLabs$ld.level.label
if (any(is.na(old))) {print("NA!!!")} # old is a vector, so check with any()
new = thisVarLabs$ld.complexity
col_i = which(colnames(qual) == qualVarNames[i]) # exact match, to avoid partial grep hits
qual[,col_i] = mapvalues(as.matrix(qual[,col_i]),
from = as.character(old),
to = as.character(new), warn_missing = TRUE)
}
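For reference, plyr::mapvalues recodes value-for-value, e.g. (toy values, not from the data):
mapvalues(c("No case", "Case", "No case"), from = c("No case", "Case"), to = c("0", "1"))
# returns "0" "1" "0"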
ld.complexity = qual %>%
mutate_each(funs(as.numeric), -ISO) %>%
gather(variable, complexity.level, 2:28) %>%
group_by(ISO) %>%
summarise(morphological.complexity = sum(complexity.level))
ld.demo.qual = left_join(ld.demo, ld.complexity)
Bentz, et al. (2015): L2 learners and lexical diversity. Note that we only use languages from the UDHR corpus; the other corpora contain only smaller languages.
bentz = read.csv("../data/bentz_2015.csv") %>%
gather(temp, LDT, starts_with("LDT")) %>%
unite(temp1, measure, temp, sep = ".") %>%
spread(temp1, LDT) %>%
filter(text == "UDHR") %>%
select(iso_639_3, RatioL2, a.LDTscaled,
H.LDTscaled, TTR.LDTscaled, Stock,
Region, L1_speakers, L2_speakers, TTR.LDT) %>%
rename(ISO = iso_639_3,
n.L1.speakers = L1_speakers,
n.L2.speakers = L2_speakers,
ratio.L2.L1 = RatioL2,
stock = Stock,
region = Region,
scaled.LDT.TTR = TTR.LDTscaled,
scaled.LDT.ZM = a.LDTscaled,
scaled.LDT.H = H.LDTscaled)
# speaker counts taken directly from the Ethnologue website by ML for missing languages that were high-frequency across the datasets
ethno_sup = read.csv("../data/supplementary_enthnologue.csv")
bentz = full_join(bentz, ethno_sup) %>%
mutate(ISO = as.factor(ISO))
Atkinson (2011)
atkinson = read.csv("../data/atkinson_2011.csv") %>%
select(normalized.vowel.diversity,
normalized.consonant.diversity,
normalized.tone.diversity,
normalized.phoneme.diversity, ISO,
distance.from.origin) %>%
mutate(ISO = as.factor(unlist(lapply(strsplit(as.character(ISO),
" "),function(x) x[1])))) %>%
distinct(ISO)
Lewis & Frank (under review): Complexity Bias
# complexity bias
cb = read.csv("../data/lewis_2015.csv") %>%
rename(complexity.bias = corr,
p.complexity.bias = p.corr,
mono.complexity.bias = mono.cor,
open.complexity.bias = open.cor) %>%
left_join(codes, by="language") %>%
select(-X.1, -X, -lower.ci, -upper.ci, -checked,
-mean.length) %>%
distinct(ISO) %>%
filter(language != "english") %>% # english is an outlier in this dataset because norms were colelcted in english
select(-language, -WALS)
Futrell, Mahowald, & Gibson (2015): Dependency length
dl = read.csv("../data/futrell_2015.csv") %>%
left_join(codes, by="language") %>%
select(-language, -WALS, -fixed.random.baseline.slope,
-observed.slope, -m_rand) %>%
rename(mean.dependency.length = m_obs) %>%
filter(!is.na(ISO))
Pellegrino, Coupé, & Marsico (2015): Information density
uid = read.csv("../data/pellegrino_2015.csv") %>%
left_join(codes, by="language") %>%
select(-language, -WALS)
Moran, McCloy, & Wright (2012): Phonemic inventory
phoneme = read.csv("../data/moran_2012.csv") %>%
select(ISO, pho, con, vow, son,
obs, mon, qua, ton) %>%
rename(n.phonemes = pho,
n.consonants = con,
n.vowels = vow,
n.sonorants = son,
n.obstruents = obs,
n.monophthongs = mon,
n.qual.monophthongs = qua,
n.tones = ton)
Wichmann, et al. (2013): Mean word length
# note: is this IPA? - fix this!
# 308 low-frequency languages have no ISO code, leaving 4421 (total: 4729)
ml.raw = read.csv("../data/wichmann_2013.csv") %>%
select(1:109) %>%
gather(word, translation, I:name) %>%
# mean character count per translation, averaged over comma-separated synonyms
mutate(nchar = unlist(
lapply(
lapply(
strsplit(
gsub("[[:space:]]", "", translation),
","),
nchar), mean))) %>%
filter(translation != "")
# subset to only those words in the Swadesh list (n = 40)
swadesh.words = ml.raw$word[ml.raw$ISO == "gwj"] # gwj appears to have entries for exactly the 40-word subset
ml.raw = ml.raw %>%
filter(is.element(word, swadesh.words))
ml = ml.raw %>%
group_by(ISO) %>%
summarize(mean.length = mean(nchar, na.rm = T)) %>%
filter(ISO != "")
#more_ISOs = ml.raw %>%
# select(ISO, names) %>%
# mutate(names = tolower(names))
Luniewska, et al. (2015): Age of acquisition (AoA)
aoa.raw = read.csv("../data/luniewska_2015.csv",
header = T, fileEncoding = "latin1")
aoa = aoa.raw %>%
gather(language_aoa, aoa, grep("aoa",
names(aoa.raw))) %>%
mutate(language_aoa = tolower(unlist(lapply(strsplit(
as.character(language_aoa),"_"), function(x) x[1])))) %>%
select(-3:-27) %>%
rename(language = language_aoa) %>%
left_join(codes, by="language") %>%
mutate(language = as.factor(language)) %>%
group_by(ISO) %>%
summarize(mean.aoa = mean(aoa, na.rm = T)) %>%
filter(!is.na(ISO))
Merge data frames together by ISO
d = full_join(ld.demo.qual, cb, by = "ISO") %>%
full_join(bentz, by = "ISO") %>%
full_join(phoneme, by = "ISO") %>%
full_join(ml, by = "ISO") %>%
full_join(dl, by = "ISO") %>%
full_join(uid, by = "ISO") %>%
full_join(aoa, by = "ISO") %>%
full_join(atkinson, by = "ISO") %>%
mutate(ISO = as.factor(ISO)) %>%
filter(!is.na(native.country)) # this is important to have for the mixed effects models
Load data
# write.csv(d, "../data/langLearnVar_data.csv")
d = read.csv("../data/langLearnVar_data.csv")[,-1]
Remove outliers and check for normality
d.clean = d %>%
mutate_each(funs(removeOutlier(.,N_EXCLUDE_SDS)), c(-ISO, -lang.family,
-native.country, -native.country.area,
-lang.genus, -language, -stock, -region))
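removeOutlier and N_EXCLUDE_SDS come from our setup chunk; a minimal sketch, assuming values more than N_EXCLUDE_SDS standard deviations from the mean are set to NA:
# assumed definitions -- the real versions live in the setup chunk
N_EXCLUDE_SDS = 3 # hypothetical cutoff
removeOutlier = function(x, n.sds) {
ifelse(abs(x - mean(x, na.rm = TRUE)) > n.sds * sd(x, na.rm = TRUE), NA, x)
}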
d.clean %>%
select(-ISO, -lang.family, -native.country, -native.country.area,
-lang.genus, -language, -stock, -region) %>%
gather(var, value) %>%
ggplot(aes(sample = value)) +
stat_qq(na.rm = T, size = .2) +
facet_wrap( ~ var, scales = "free") +
theme_bw()
The following variables look right-skewed.
NN_vars = c("area", "perimeter", "n.neighbors", "sd.precip",
"ratio.L2.L1", "n.L1.speakers", "n.L2.speakers",
"n.phonemes", "n.consonants","n.vowels", "n.sonorants",
"n.obstruents", "n.monophthongs", "n.qual.monophthongs",
"n.tones")
Take the log of these variables, and check for normality again.
d.clean = d.clean %>%
mutate_each(funs(log = log(.), dummy = sum(.)),
one_of(NN_vars)) %>%
# the extra dummy function forces mutate_each to suffix the new columns (e.g., area_log)
select(-contains("dummy")) %>%
select(-one_of(NN_vars))
d.clean %>%
select(-ISO, -lang.family, -native.country, -native.country.area,
-lang.genus, -language, -stock, -region) %>%
gather(var, value) %>%
ggplot(aes(sample = value)) +
stat_qq(na.rm = T, size = .2) +
facet_wrap( ~ var, scales = "free") +
theme_bw()
Check for normality of variables using the Shapiro-Wilk normality test
is.na(d.clean) <- sapply(d.clean, is.infinite)
normality.test = d.clean %>%
summarise_each(funs(unlist(tidy(shapiro.test(.))[2])),
c(-ISO, -lang.family, -native.country,
-native.country.area,
-lang.genus, -language, -stock, -region)) %>%
gather(variable, shapiro.p.value) %>%
mutate(normal = ifelse(shapiro.p.value > .05, TRUE, FALSE)) %>%
summary()
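For reference, tidy(shapiro.test(x)) returns a one-row data frame whose second column is the p-value, e.g.:
tidy(shapiro.test(rnorm(100)))[2] # extracts the p.value column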
By this test, only 11 variables are normal, but this is likely the best we can do.
Look at histograms of all variables.
demo_vars = c("lat", "lon", "perimeter_log", "n.neighbors_log",
"mean.temp", "sum.precip",
"sd.temp", "growing.season", "pop_log", "area_log",
"n.neighbors_log", "sd.precip_log", "ratio.L2.L1_log",
"n.L1.speakers_log",
"n.L2.speakers_log", "distance.from.origin")
lang_vars = c(setdiff(names(d.clean), demo_vars))[-c(1:6, 22,23)] # drop ID/grouping columns by position
d.clean %>%
gather(variable, num, lat:length(d.clean)) %>%
filter(variable != "stock", variable != "region") %>%
mutate(var_type = ifelse(is.element(variable, demo_vars),
"demo", "lang"),
num = as.numeric(as.character(num))) %>%
arrange(var_type) %>%
mutate(variable = factor(variable, variable)) %>%
ggplot(aes(x = num, fill = var_type)) +
geom_histogram(position = "identity") +
facet_wrap( ~ variable, scales = "free") +
theme_bw()
Number of languages with data, for each variable
d.clean %>%
gather(variable, num, lat:length(d.clean)) %>%
filter(variable != "stock", variable != "region") %>%
group_by(variable) %>%
summarize(counts = length(which(is.na(num) == F))) %>%
mutate(var_type = ifelse(is.element(variable, demo_vars),
"demo", "lang")) %>%
mutate(counts_trunc = ifelse(counts > 150, 150, counts)) %>%
arrange(var_type) %>%
mutate(variable = factor(variable, variable)) %>%
ggplot(aes(y=counts_trunc, x = variable, fill = var_type)) +
geom_bar(stat = "identity") +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1),
legend.position = "none") +
ylab("Number of languages (truncated at 150)") +
ggtitle("Number languages per measure")
Look at the number of languages shared between each pair of variables
arealControl_vars = c("stock", "region", "lang.family", "lang.genus",
"native.country", "native.country.area")
d.clean %>%
gather(lang_measure, lang_value,
which(is.element(names(.), c(lang_vars, arealControl_vars)))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars))) %>%
group_by(lang_measure, pop_measure) %>%
mutate(both_not_na = !is.na(lang_value) & !is.na(pop_value)) %>%
summarise(n_languages = length(which(both_not_na == TRUE))) %>%
ggplot(aes(pop_measure, lang_measure, group = lang_measure)) +
geom_tile(aes(fill = n_languages)) +
geom_text(aes(fill = n_languages, label = n_languages)) +
scale_fill_gradient(low = "white", high = "red") +
xlab("population variables") +
ylab("language variables") +
ggtitle("Number languages between variables") +
theme(axis.text.x = element_text(angle = 90, hjust = 1))
# write.csv(d.clean,"../data/langLearnVar_data_clean.csv")
# d.clean = read.csv("../data/langLearnVar_data_clean.csv")[-1]
# these are the variables we care about for the paper
demo_vars_crit = c("n.neighbors_log",
"mean.temp", "sum.precip", "sd.temp",
"pop_log", "area_log",
"sd.precip_log", "ratio.L2.L1_log")
lang_vars_crit = c("p.complexity.bias","TTR.LDT",
"mean.dependency.length", "mean.length",
"n.consonants_log", "n.vowels_log",
"morphological.complexity")
d.scatter.fam = d.clean %>%
select(pop_log, ratio.L2.L1_log,
n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip,
sd.precip_log, morphological.complexity,TTR.LDT, p.complexity.bias,
mean.length, n.consonants_log, n.vowels_log,
lang.family, native.country) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit)))
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
!is.na(lang_value))
# reorder factor levels for facet plots below (affects order in other dfs)
d.scatter.fam$lang_measure <- factor(d.scatter.fam$lang_measure,
levels = c("morphological.complexity",
"p.complexity.bias", "TTR.LDT",
"mean.length", "n.consonants_log",
"n.vowels_log"))
d.scatter.fam$pop_measure <- factor(d.scatter.fam$pop_measure,
levels = c('pop_log', 'ratio.L2.L1_log',
'n.neighbors_log', 'area_log',
'mean.temp', 'sd.temp',
'sum.precip', 'sd.precip_log' ))
# get model fits
d.model.fits = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
(1|native.country), data=.)))
line.fits = d.model.fits %>%
filter(term == "pop_value" | term == "(Intercept)") %>%
select(-std.error, -statistic, -group) %>%
spread(term, estimate) %>%
rename(s = pop_value,
i = `(Intercept)`)
sigs = d.model.fits %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # dummy values so geom_rect has the x/y aesthetics each facet expects
ungroup
Paper plot #1
facet_pop_names <- c(
pop_log="Population \n(log)",
ratio.L2.L1_log="L2:L1 \n(log)",
n.neighbors_log="N. Ling. \nNeighbors (log)",
area_log="Area \n(log)",
mean.temp="Temperature \n Mean",
sd.temp="Temperature \nSD",
sum.precip="Precipitation \nSum",
sd.precip_log="Precipitation \nSD (log)"
)
facet_lang_names <- c(
morphological.complexity="Morphosyntactic \nComplexity",
TTR.LDT="Lexical \nDiversity",
p.complexity.bias="Complexity \nBias",
mean.length="Word \nLength",
n.consonants_log="N. Consonants \n(log)",
n.vowels_log="N. Vowels \n(log)"
)
#pdf("../papers/cogsci2016/figs/plot1.pdf", width = 18, height = 14)
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
facet_grid(lang_measure ~ pop_measure,
scales = "free",
labeller = labeller(pop_measure = facet_pop_names,
lang_measure = facet_lang_names)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = 1.1, color = "grey48") +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i),
size = 2, color = "black") +
scale_fill_manual(values = c( "mediumblue", "grey99", "red1")) +
xlab("\nEnvironmental variables") +
ylab("Language variables\n") +
theme_bw() +
theme(legend.position = "none",
axis.title = element_text(size = 21),
strip.text= element_text(size = 17))
#dev.off()
ld.demo.qual %>%
mutate(morphological.complexity.factor =
as.factor(morphological.complexity)) %>%
group_by(morphological.complexity.factor) %>%
multi_boot(column="pop_log",
summary_groups = c("morphological.complexity.factor"),
statistics_functions = c("mean", "ci_lower","ci_upper")) %>%
mutate(morphological.complexity =
as.numeric(as.character(morphological.complexity.factor))) %>%
ggplot() +
geom_pointrange(aes(x= morphological.complexity.factor,
y = mean, ymin = ci_lower, ymax = ci_upper)) +
geom_smooth(method = "lm",aes(morphological.complexity-5, mean)) +
# there's a weird offset because complexity is a factor?
ylab("Log population") +
xlab("Morphological complexity") +
ggtitle("Lupyan and Dale (2010) Fig. 3 reproduction") +
theme_bw()
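multi_boot comes from the langcog package; conceptually it returns a bootstrapped mean with a 95% percentile CI per group. A base-R sketch of the per-group statistic, under that assumption (boot_mean_ci is a hypothetical stand-in):
# hypothetical stand-in for the statistics multi_boot computes within each group
boot_mean_ci = function(x, n_boot = 1000) {
boot_means = replicate(n_boot, mean(sample(x, replace = TRUE), na.rm = TRUE))
c(mean = mean(x, na.rm = TRUE),
ci_lower = quantile(boot_means, .025, names = FALSE),
ci_upper = quantile(boot_means, .975, names = FALSE))
}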
Prep by checking for correlations
# get demographic variables only (excluding L2 because the intersection contains only 40 languages)
d.demo = d.clean %>%
select(lat, mean.temp, sd.temp, sum.precip, sd.precip_log,
area_log, pop_log, n.neighbors_log) %>%
mutate(lat = abs(lat))
is.na(d.demo) <- sapply(d.demo, is.infinite)
demo.corr = cor(d.demo, use = "pairwise.complete")
abs(demo.corr) > CORR_THRESHOLD
## lat mean.temp sd.temp sum.precip sd.precip_log area_log
## lat TRUE TRUE FALSE FALSE TRUE FALSE
## mean.temp TRUE TRUE FALSE FALSE FALSE FALSE
## sd.temp FALSE FALSE TRUE FALSE FALSE FALSE
## sum.precip FALSE FALSE FALSE TRUE FALSE FALSE
## sd.precip_log TRUE FALSE FALSE FALSE TRUE FALSE
## area_log FALSE FALSE FALSE FALSE FALSE TRUE
## pop_log FALSE FALSE FALSE FALSE FALSE FALSE
## n.neighbors_log FALSE FALSE FALSE FALSE FALSE FALSE
## pop_log n.neighbors_log
## lat FALSE FALSE
## mean.temp FALSE FALSE
## sd.temp FALSE FALSE
## sum.precip FALSE FALSE
## sd.precip_log FALSE FALSE
## area_log FALSE FALSE
## pop_log TRUE FALSE
## n.neighbors_log FALSE TRUE
# exclude lat (correlated with mean.temp, sd.precip_log) [threshold: > .8]
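CORR_THRESHOLD is set in our setup chunk; given the bracketed threshold above, presumably:
CORR_THRESHOLD = .8 # assumed value, per the .8 threshold noted in the comment above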
PCA
d.demo = select(d.demo, -lat)
# do pca
pca.demo = prcomp(na.omit(d.demo), scale = TRUE)
# look at variance explained
fviz_eig(pca.demo, addlabels = TRUE,
linecolor ="red", geom = "line") +
theme_bw()
barplot(pca.demo$sdev/pca.demo$sdev[1])
pca.demo = prcomp(na.omit(d.demo), scale = TRUE, tol = .6)
# plot loadings
pcs.demo <- cbind(names(d.demo),
gather(as.data.frame(pca.demo$rotation),
pc, value))
names(pcs.demo)[1] = "variable"
pcs.demo$variable = factor(pcs.demo$variable,
levels = unique(pcs.demo$variable)) # order variables
pcs.demo$magnitude = ifelse(abs(pcs.demo$value)>.2, TRUE, FALSE)
ggplot(pcs.demo) +
geom_bar(aes(x = variable, y = value,
fill = magnitude), stat="identity") +
facet_wrap(~pc) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(name = "Magnitude",
values = c("grey","red1")) +
ggtitle("PCA loadings")
# merge PCAs with full dataset for model fits
good_is = which(complete.cases(d.demo)) # rows with no missing values, i.e. the rows prcomp kept via na.omit
d.pca.demo = d.clean[good_is,] %>% cbind(pca.demo$x)
# rename PCs
names(d.pca.demo)[names(d.pca.demo) == "PC1"] = "hot"
names(d.pca.demo)[names(d.pca.demo) == "PC2"] = "small"
Maps of principal components for demographic variables by language
ggplot(d.pca.demo) +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = hot), size=3) +
scale_colour_gradient(high="green", low = "white") +
ggtitle("Hot (PC1)") +
mapTheme
ggplot(d.pca.demo) +
borders("world", colour="gray50", fill="gray50") +
geom_point(aes(x = lon, y=lat,
color = small), size=3) +
scale_colour_gradient(high="blue", low = "white") +
ggtitle("Small (PC2)") +
mapTheme
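mapTheme is a ggplot theme defined in our setup chunk (not shown); a minimal sketch of an equivalent theme:
# assumed definition -- the real theme lives in the setup chunk
mapTheme = theme_bw() +
theme(axis.title = element_blank(),
axis.text = element_blank(),
axis.ticks = element_blank(),
panel.grid = element_blank())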
Model fits using demographic PCA with all language variables
demo_vars_crit2 = c("hot", "small")
lang_vars_crit2 = c("morphological.complexity",
"p.complexity.bias","TTR.LDT",
"mean.length", "mean.aoa",
"n.consonants_log", "n.vowels_log")
d.scatter.fam = d.pca.demo %>%
select(n.consonants_log, n.vowels_log,
mean.length, mean.aoa, p.complexity.bias, TTR.LDT,
morphological.complexity,
lang.family, native.country, small, hot) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit2))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit2)))
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
!is.na(lang_value))
# get model fits
d.model.fits = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lmer(lang_value ~ pop_value +
(pop_value|lang.family) +
(1|native.country),
data=.)))
line.fits = d.model.fits %>%
filter(term == "pop_value" | term == "(Intercept)") %>%
select(-std.error, -statistic, -group) %>%
spread(term, estimate) %>%
rename(s = pop_value,
i = `(Intercept)`)
sigs = d.model.fits %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # dummy values for geom_rect, as above
ungroup
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i),
size = 1, color = "green") +
facet_grid(lang_measure~pop_measure, scales = "free") +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
Prep by looking at correlations
d.lang = d.clean %>%
select(morphological.complexity, scaled.LDT.H,
mean.length, n.consonants_log, n.vowels_log)
is.na(d.lang) <- sapply(d.lang, is.infinite)
lang.corr = cor(d.lang, use = "pairwise.complete")
abs(lang.corr) > CORR_THRESHOLD
## morphological.complexity scaled.LDT.H mean.length
## morphological.complexity TRUE FALSE FALSE
## scaled.LDT.H FALSE TRUE FALSE
## mean.length FALSE FALSE TRUE
## n.consonants_log FALSE FALSE FALSE
## n.vowels_log FALSE FALSE FALSE
## n.consonants_log n.vowels_log
## morphological.complexity FALSE FALSE
## scaled.LDT.H FALSE FALSE
## mean.length FALSE FALSE
## n.consonants_log TRUE FALSE
## n.vowels_log FALSE TRUE
# include all language variables
PCA
# do pca
pca.lang = prcomp(na.omit(d.lang), scale = TRUE)
barplot(pca.lang$sdev/pca.lang$sdev[1])
# look at variance explained
fviz_eig(pca.lang, addlabels = TRUE,
linecolor ="red", geom = "line") +
theme_bw()
pca.lang = prcomp(na.omit(d.lang), scale = TRUE, tol = .7)
# plot loadings
pcs.lang <- cbind(names(d.lang),
gather(as.data.frame(pca.lang$rotation),
pc, value))
names(pcs.lang)[1] = "variable"
pcs.lang$variable = factor(pcs.lang$variable, levels = unique(pcs.lang$variable)) # order variables
pcs.lang$magnitude = ifelse(abs(pcs.lang$value)>.2, TRUE, FALSE)
ggplot(pcs.lang) +
geom_bar(aes(x = variable, y = value,
fill = magnitude), stat = "identity") +
facet_wrap(~pc) +
theme_bw() +
theme(axis.text.x = element_text(angle = 90, hjust = 1)) +
scale_fill_manual(name = "Magnitude",
values = c("grey","red1")) +
ggtitle("PCA loadings")
# merge PCAs with full dataset for model fits
good_is = which(complete.cases(d.lang)) # rows with no missing values, i.e. the rows prcomp kept via na.omit
d.pca.lang = d.clean[good_is,] %>% cbind(pca.lang$x)
names(d.pca.lang)[names(d.pca.lang) == "PC1"] = "PC1_L"
names(d.pca.lang)[names(d.pca.lang) == "PC2"] = "PC2_L"
Model fits using language PCA with all demographic variables
lang_vars_crit2 = c("PC1_L", "PC2_L")
demo_vars_crit2 = c("n.neighbors_log",
"mean.temp", "sum.precip", "sd.temp",
"pop_log", "area_log",
"sd.precip_log", "ratio.L2.L1_log")
d.scatter.fam = d.pca.lang %>%
select(ratio.L2.L1_log, pop_log,
n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip, sd.precip_log,
PC1_L, PC2_L, lang.family, native.country) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit2))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit2)))
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
!is.na(lang_value))
# get model fits
d.model.fits = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lmer(lang_value ~ pop_value +
(1|native.country), data=.)))
line.fits = d.model.fits %>%
filter(term == "pop_value" | term == "(Intercept)") %>%
select(-std.error, -statistic, -group) %>%
spread(term, estimate) %>%
rename(s = pop_value,
i = `(Intercept)`)
sigs = d.model.fits %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # dummy values for geom_rect, as above
ungroup
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i), size = 1, color = "green") +
facet_grid(lang_measure~pop_measure, scales = "free") +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
Model fits
all.pca = left_join(d.pca.demo,
d.pca.lang[,c("PC1_L", "PC2_L", "ISO")],
by = "ISO")
demo_vars_crit2 = c("hot", "small")
lang_vars_crit2 = c("PC1_L", "PC2_L")
d.scatter.fam = all.pca %>%
select(PC1_L, PC2_L, hot, small,
lang.family, native.country, ISO, language) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit2))) %>%
group_by(lang_measure) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit2)))
is.na(d.scatter.fam) <- sapply(d.scatter.fam, is.infinite)
d.scatter.fam <- filter(d.scatter.fam, !is.na(pop_value),
!is.na(lang_value))
# get model fits
d.model.fits = d.scatter.fam %>%
group_by(lang_measure, pop_measure) %>%
do(tidy(lmer(lang_value ~ pop_value + (pop_value|lang.family) +
(1|native.country), data=.)))
line.fits = d.model.fits %>%
filter(term == "pop_value" | term == "(Intercept)") %>%
select(-std.error, -statistic, -group) %>%
spread(term, estimate) %>%
rename(s = pop_value,
i = `(Intercept)`)
sigs = d.model.fits %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, lang_value = .1) %>% # dummy values for geom_rect, as above
ungroup
ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point(size = .3) +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i), size = 1, color = "green") +
facet_grid(lang_measure~pop_measure, scales = "free") +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
xlab("Demographic variables") +
ylab("Language variables") +
theme(legend.position = "none")
Paper plot #2
dcopy = d.scatter.fam
vplayout <- function(x, y)
viewport(layout.pos.row = x, layout.pos.col = y)
#Extract Legend
g_legend<-function(a.gplot){
tmp <- ggplot_gtable(ggplot_build(a.gplot))
leg <- which(sapply(tmp$grobs, function(x) x$name) == "guide-box")
legend <- tmp$grobs[[leg]]
return(legend)}
facet_env_names <- c(PC1="PC1: Hot and rainy", PC2="PC2: Small")
facet_lang_names <- c(PC1="PC1: Complex", PC2="PC2: Short words")
pc.demo = ggplot(pcs.demo) +
geom_bar(aes(x = variable, y = value, fill = variable), stat="identity") +
facet_wrap(~pc, labeller = labeller(pc = facet_env_names)) +
ylab("Component loading")+
theme_classic() +
theme(axis.text.x = element_blank(),
axis.title.x = element_blank(),
strip.text= element_text(size = 21),
axis.title = element_text(size = 21),
strip.background = element_rect(fill = "grey", colour = "white")) +
ylim(-.6, .7) +
scale_x_discrete(breaks=NULL) +
scale_fill_brewer(palette="YlOrRd") +
theme(legend.position = "none")
pc.lang = ggplot(pcs.lang) +
geom_bar(aes(x = variable, y = value,
fill = variable), stat="identity") +
theme_classic() +
theme(axis.text.y = element_blank(),
axis.title.y = element_blank(),
strip.text= element_text(size = 21),
axis.title = element_text(size = 21),
strip.background = element_rect(fill = "grey", colour = "white")) +
coord_flip() +
facet_grid(pc ~ ., scales="free_x",
labeller = labeller(pc = facet_lang_names)) +
ylim(-.6, .7) +
scale_x_discrete(breaks=NULL) +
ylab("Component loading") +
theme(legend.position = "none") +
scale_fill_brewer(palette="BuGn")
## add missing names:
extra_names = data.frame(ISO = c("tgl", "tam", "sna", "wol"),
language = c("tagalog", "tamil", "shona", "wolof"))
d.scatter.fam = left_join(d.scatter.fam, extra_names, by="ISO") %>%
mutate(language = as.factor(ifelse(is.na(language.x),
as.character(language.y),
as.character(language.x))))
p1 <- ggplot(d.scatter.fam, aes(x = pop_value, y = lang_value)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point() +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i),
size = 2) +
facet_grid(lang_measure~pop_measure, scales = "free") +
geom_text_repel(aes(label = language), size = 3) +
#geom_text(aes(label = language), size = 3) +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
theme(strip.background = element_blank(),
strip.text = element_blank(),
axis.title = element_text(size = 21)) +
ylab("Rotated language values")+
xlab("Rotated environmental values")+
theme(legend.position = "none")
SIZE= 1.6
FONT_SIZE = 20
pc.demo.legend = ggplot(pcs.demo) +
geom_bar(aes(x = variable, y = value, fill = variable), stat = "identity") +
scale_fill_brewer(guide = guide_legend(title = NULL, reverse = T,
keywidth = SIZE, keyheight = SIZE),
palette="YlOrRd") +
theme(legend.text=element_text(size=FONT_SIZE))
legend.demo <- g_legend(pc.demo.legend)
pc.lang.legend = ggplot(pcs.lang) +
geom_bar(aes(x = variable, y = value, fill = variable), stat = "identity") +
#theme(legend.title=element_blank()) +
scale_fill_brewer(guide = guide_legend(title = NULL, reverse = T,
keywidth = SIZE, keyheight = SIZE),
palette="BuGn") +
theme(legend.text=element_text(size=FONT_SIZE))
legend.lang <- g_legend(pc.lang.legend)
#pdf("../papers/cogsci2016/figs/plot2.pdf", width = 18, height = 12)
#quartz(width = 9, height = 6)
legend.demo$vp$x <- unit(.75, 'npc')
legend.demo$vp$y <- unit(.84, 'npc')
legend.lang$vp$x <- unit(.9, 'npc')
legend.lang$vp$y <- unit(.8, 'npc')
grid.newpage()
pushViewport(viewport(layout = grid.layout(6, 6)))
print(pc.demo, vp=vplayout(1:2,1:4))
print(pc.lang, vp=vplayout(3:6,5:6))
print(p1, vp=vplayout(3:6,1:4))
grid.draw(legend.demo)
grid.draw(legend.lang)
#dev.off()
Grand means across languages
# read in word bank data
aoa_data.wb = read.csv("../data/frank_underreview.csv") %>%
filter(aoa > 5 & aoa < 31)
# get common words across languages in wb
all_words = levels(aoa_data.wb$uni_lemma)
for (i in 1:length(levels(aoa_data.wb$language))){
this_language = aoa_data.wb[aoa_data.wb$language == levels(aoa_data.wb$language)[i],]
all_words = intersect(all_words, unique(this_language$uni_lemma))
}
aoa.means.all = aoa_data.wb %>%
mutate(language = tolower(language)) %>%
filter(is.element(uni_lemma, all_words)) %>%
group_by(language) %>%
summarise(mean.aoa.wb = mean(aoa)) %>%
left_join(select(d.clean, mean.aoa, language)) %>%
rename(mean.aoa.lu = mean.aoa)
ggplot(aoa.means.all, aes(x = mean.aoa.lu, y = mean.aoa.wb)) +
geom_text(aes(label = language)) +
geom_smooth(method = "lm") +
ggtitle("Grand means across langauges (wb common words only; n = 117)") +
theme_bw()
cor.test(aoa.means.all$mean.aoa.wb,
aoa.means.all$mean.aoa.lu,
use = "pairwise.complete")
##
## Pearson's product-moment correlation
##
## data: aoa.means.all$mean.aoa.wb and aoa.means.all$mean.aoa.lu
## t = 1.716, df = 5, p-value = 0.1468
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.2663875 0.9337631
## sample estimates:
## cor
## 0.6087989
Now let's look at the same words for both datasets [n = 56]
all.wb.words = unlist(lapply(strsplit(
as.character(all_words)," "), function(x) x[1]))
aoa.raw$word = unlist(lapply(strsplit(
as.character(aoa.raw$word)," "), function(x) x[1]))
aoa.lu.common.words.only = aoa.raw %>%
#filter lu words based on the ones that are common in wb
filter(is.element(word,all.wb.words)) %>%
gather(language_aoa, aoa, grep("aoa",
names(aoa.raw))) %>%
mutate(language_aoa = tolower(unlist(lapply(strsplit(
as.character(language_aoa),"_"), function(x) x[1])))) %>%
select(-3:-27) %>%
rename(language = language_aoa) %>%
group_by(language) %>%
summarize(mean.aoa = mean(aoa, na.rm = T))
aoa.means.all.common = aoa_data.wb %>%
mutate(language = tolower(language)) %>%
filter(is.element(uni_lemma, all_words)) %>%
group_by(language) %>%
summarise(mean.aoa.wb = mean(aoa)) %>%
left_join(select(aoa.lu.common.words.only, mean.aoa, language)) %>%
rename(mean.aoa.lu = mean.aoa)
ggplot(aoa.means.all.common, aes(x = mean.aoa.lu, y = mean.aoa.wb)) +
geom_text(aes(label = language)) +
geom_smooth(method = "lm") +
ggtitle("Grand means across langauges (common words only in both datasets; n = 56)") +
theme_bw()
cor.test(aoa.means.all.common$mean.aoa.wb,
aoa.means.all.common$mean.aoa.lu,
use = "pairwise.complete")
##
## Pearson's product-moment correlation
##
## data: aoa.means.all.common$mean.aoa.wb and aoa.means.all.common$mean.aoa.lu
## t = 1.3055, df = 5, p-value = 0.2486
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.4011876 0.9112595
## sample estimates:
## cor
## 0.5041974
At the word level, by language
aoa.lu.common.words.only = aoa.raw %>%
filter(is.element(word,all.wb.words)) %>% #filter lu words based on the ones that are common in wb
gather(language_aoa, aoa, grep("aoa",
names(aoa.raw))) %>%
mutate(language_aoa = tolower(unlist(lapply(strsplit(
as.character(language_aoa),"_"), function(x) x[1])))) %>%
select(-3:-27) %>%
rename(language = language_aoa) %>%
select(word, language, aoa) %>%
group_by(word, language) %>%
summarise(aoa = mean(aoa, na.rm = T)) %>%
mutate(dataset = "lu")
aoa_data.wb$uni_lemma = unlist(lapply(strsplit(
as.character(aoa_data.wb$uni_lemma)," "), function(x) x[1]))
common_words_across_all_langs_and_datasets = unique(aoa.lu.common.words.only$word)
aoa.wb.common.words.only = aoa_data.wb %>%
mutate(language = tolower(language)) %>%
group_by(language, uni_lemma) %>%
summarise(aoa = mean(aoa, na.rm = T)) %>%
filter(is.element(uni_lemma, common_words_across_all_langs_and_datasets)) %>%
rename(word = uni_lemma) %>%
select(language, word, aoa) %>%
mutate(dataset = "wb")
common_languages = intersect(tolower(as.character(unique(aoa_data.wb$language))), #wb
unique(aoa.lu.common.words.only$language)) #lu
all.aoa.by.word = rbind(aoa.wb.common.words.only,
aoa.lu.common.words.only) %>%
filter(is.element(language, common_languages)) %>%
group_by(language, dataset) %>%
spread(dataset, aoa)
ggplot(all.aoa.by.word, aes(x = lu, y = wb,
group = language, color = language)) +
geom_point() +
geom_smooth(method = "lm") +
facet_grid(.~language) +
ggtitle("By word means for each language (common words only (n = 56))") +
theme_bw() +
theme(legend.position="none")
summary(lm(wb ~ lu + language, data = all.aoa.by.word))
##
## Call:
## lm(formula = wb ~ lu + language, data = all.aoa.by.word)
##
## Residuals:
## Min 1Q Median 3Q Max
## -9.5159 -1.2652 0.0568 1.2700 8.2104
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 13.8991 0.5365 25.907 < 2e-16 ***
## lu 0.9356 0.1598 5.857 1.02e-08 ***
## languagehebrew 1.6598 0.4241 3.914 0.000107 ***
## languageitalian 1.9100 0.4213 4.533 7.76e-06 ***
## languagerussian 1.0638 0.4239 2.510 0.012495 *
## languagespanish 1.7677 0.4266 4.144 4.20e-05 ***
## languageswedish 0.2570 0.4221 0.609 0.542956
## languageturkish 2.1323 0.4213 5.061 6.48e-07 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 2.229 on 384 degrees of freedom
## Multiple R-squared: 0.1958, Adjusted R-squared: 0.1811
## F-statistic: 13.36 on 7 and 384 DF, p-value: 1.898e-15
summary(lmer(wb ~ lu + (lu|language), all.aoa.by.word))
## Linear mixed model fit by REML ['lmerMod']
## Formula: wb ~ lu + (lu | language)
## Data: all.aoa.by.word
##
## REML criterion at convergence: 1754.3
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.2310 -0.6115 0.0283 0.5812 3.6718
##
## Random effects:
## Groups Name Variance Std.Dev. Corr
## language (Intercept) 2.1938 1.4812
## lu 0.0627 0.2504 -1.00
## Residual 4.9459 2.2239
## Number of obs: 392, groups: language, 7
##
## Fixed effects:
## Estimate Std. Error t value
## (Intercept) 14.9982 0.7172 20.911
## lu 0.9985 0.1831 5.453
##
## Correlation of Fixed Effects:
## (Intr)
## lu -0.921
By common words only
all.means.by.word = all.aoa.by.word %>%
group_by(word) %>%
summarise(mean.lu = mean(lu, na.rm = T),
mean.wb = mean(wb, na.rm = T))
ggplot(all.means.by.word,
aes(x = mean.lu, y = mean.wb)) +
geom_text(aes(label = word)) +
geom_smooth(method = "lm") +
ggtitle("By word means (common words only (n = 56)") +
theme_bw()
cor.test(all.means.by.word$mean.lu, all.means.by.word$mean.wb)
##
## Pearson's product-moment correlation
##
## data: all.means.by.word$mean.lu and all.means.by.word$mean.wb
## t = 2.8311, df = 54, p-value = 0.006504
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## 0.1066833 0.5686561
## sample estimates:
## cor
## 0.3595009
Across the common words in the 7 shared languages, WB and Lu AoA estimates are moderately correlated (r = .36).
d.scatter.fam = d.clean %>%
select(pop_log, ratio.L2.L1_log,
n.neighbors_log, area_log, mean.temp, sd.temp, sum.precip,
sd.precip_log, language, mean.aoa, lang.family, native.country) %>%
gather(pop_measure, pop_value,
which(is.element(names(.), demo_vars_crit)))
d.model.fits = d.scatter.fam %>%
group_by(pop_measure) %>%
do(tidy(lmer(mean.aoa ~ pop_value + (pop_value|lang.family) +
(1|native.country), data=.)))
line.fits = d.model.fits %>%
filter(term == "pop_value" | term == "(Intercept)") %>%
select(-std.error, -statistic, -group) %>%
spread(term, estimate) %>%
rename(s = pop_value,
i = `(Intercept)`)
sigs = d.model.fits %>%
filter(term == "pop_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(pop_value = .1, mean.aoa = .1) %>% # dummy values for geom_rect, as above
ungroup
ggplot(d.scatter.fam, aes(x = pop_value, y = mean.aoa)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point() +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i),
size = 2) +
facet_grid(.~pop_measure, scales = "free",
labeller = labeller(pop_measure = facet_pop_names)) +
geom_text_repel(aes(label = language), size = 3) +
scale_fill_manual(values = c( "mediumblue", "grey99","red1")) +
theme_bw() +
theme(legend.position = "none")
Temperature is positively correlated with AoA, and precipitation is negatively correlated: children learn words later in hotter places with little variability in precipitation.
lang_vars_crit = c("p.complexity.bias","TTR.LDT",
"mean.dependency.length", "mean.length",
"n.consonants_log", "n.vowels_log",
"morphological.complexity")
d.scatter.fam = d.clean %>%
filter(language != "xhosa") %>% # an outlier
select(p.complexity.bias,TTR.LDT,
mean.dependency.length, mean.length,
n.consonants_log, n.vowels_log,
morphological.complexity, mean.aoa,
lang.family, native.country, language) %>%
gather(lang_measure, lang_value,
which(is.element(names(.), lang_vars_crit)))
# get model fits
d.model.fits = d.scatter.fam %>%
group_by(lang_measure) %>%
do(tidy(lmer(mean.aoa ~ lang_value + (lang_value|lang.family) +
(1|native.country), data=.)))
line.fits = d.model.fits %>%
filter(term == "lang_value" | term == "(Intercept)") %>%
select(-std.error, -statistic, -group) %>%
spread(term, estimate) %>%
rename(s = lang_value,
i = `(Intercept)`)
sigs = d.model.fits %>%
filter(term == "lang_value") %>%
mutate(sig.col = ifelse(statistic > 1.96, "pos",
ifelse(statistic < -1.96, "neg",
"none"))) %>%
mutate(lang_value = .1, mean.aoa = .1) %>% # dummy values for geom_rect, as above
ungroup
ggplot(d.scatter.fam, aes(x = lang_value, y = mean.aoa)) +
geom_rect(data = sigs, aes(fill = sig.col),
xmin = -Inf, xmax = Inf,
ymin = -Inf, ymax = Inf, alpha = 0.2) +
geom_point() +
geom_abline(line.fits, mapping = aes(slope = s, intercept = i),
size = 2) +
facet_grid(.~lang_measure, scales = "free") +
geom_text_repel(aes(label = language), size = 3) +
scale_fill_manual(values = c( "grey99", "red1","mediumblue")) +
theme_bw() +
theme(legend.position = "none")
## TTR significant in correlation
cor.test(d.clean$TTR.LDT, d.clean$mean.aoa)
##
## Pearson's product-moment correlation
##
## data: d.clean$TTR.LDT and d.clean$mean.aoa
## t = -2.4237, df = 15, p-value = 0.02848
## alternative hypothesis: true correlation is not equal to 0
## 95 percent confidence interval:
## -0.8056945 -0.0668897
## sample estimates:
## cor
## -0.5304795
Predicted relationship between AoA and number of consonants (TTR shows a reliable correlation, but is not significant in the lmer)
d.clean = ld %>%
select(ISO, DRYSOV) %>%
left_join(d.clean, by="ISO")
old = 1:7
new = c("SOV", "SVO", "VSO", "VOS", "OVS", "OSV", "none")
d.clean$DRYSOV = mapvalues(d.clean$DRYSOV,
from = as.character(old),
to = as.character(new))
d.clean %>%
distinct(ISO) %>%
filter(!is.na(mean.aoa))%>%
group_by(DRYSOV) %>%
multi_boot_standard("mean.aoa", na.rm = T) %>%
ggplot(aes(x = DRYSOV, y = mean, fill = DRYSOV)) +
geom_bar(stat = "identity") +
geom_errorbar(aes(ymin = ci_lower, ymax = ci_upper),
size=.3,
width=.2,
position=position_dodge(.9)) +
theme_bw()
Too small a sample to see differences across categories.
d.clean = ld %>%
select(ISO, BICFUS) %>%
rename(affix.type = BICFUS) %>%
left_join(d.clean, by="ISO") %>%
mutate(affix.type = as.factor(affix.type))
summary(d.clean$affix.type)
## 1 2 3 4 5 6 7 NA's
## 24666 720 18 25 5 167 310 20870
old = 1:7
new = c("concatenative", "isolating", "tonal",
"isolating", "concatenative", "concatenative", "both")
d.clean$affix.type = mapvalues(d.clean$affix.type,
from = as.character(old),
to = as.character(new))
d.clean %>%
filter(!is.na(mean.aoa), !is.na(affix.type)) %>%
group_by(ISO) %>%
summarise(n = n(),
affix.type = affix.type[1])
## Source: local data frame [9 x 3]
##
## ISO n affix.type
## (chr) (int) (fctr)
## 1 deu 1681 concatenative
## 2 ell 1225 concatenative
## 3 eng 11236 concatenative
## 4 fin 49 concatenative
## 5 heb 64 concatenative
## 6 hun 100 concatenative
## 7 rus 961 concatenative
## 8 spa 1849 concatenative
## 9 tur 1156 concatenative
All of the languages we have data for are concatenative (n = 9 in the AoA ∩ WALS intersection), so we can't do this analysis.
d.clean = ld %>%
select(ISO, DRYPRE) %>%
rename(preVsuf = DRYPRE) %>%
left_join(d.clean, by="ISO") %>%
mutate(preVsuf = as.factor(preVsuf))
summary(d.clean$preVsuf)
## 1 2 3 4 5 6 NA's
## 24068 1849792 34765 1165 2305 607 139489
old = 1:6
new = c("none", "suffix", "suffix",
"prefix", "prefix", "prefix")
d.clean$preVsuf = mapvalues(d.clean$preVsuf,
from = as.character(old),
to = as.character(new))
d.clean %>%
filter(!is.na(mean.aoa), !is.na(preVsuf)) %>%
group_by(ISO) %>%
summarise(n = n(),
affix.type = preVsuf[1]) %>%
select(-n) %>%
summary()
## ISO affix.type
## Length:19 none : 0
## Class :character suffix:17
## Mode :character prefix: 2
All but two are suffixing. Can’t do this analysis either.