Data source: wordbank[http://wordbank.stanford.edu].
Takeaway: Polite words are acquired early. They are produced later than “mommy”, but they are produced either earlier or similarly early to “milk”.
cdi_trajectory <- read.csv(here::here("childes/item_trajectory_table.csv")) %>%
select(-language, -measure, -form) %>%
gather(-age, key=word, value=prop) %>%
separate(word, c("word", "word2")) %>%
select(-word2)
p_cdi <- ggplot(cdi_trajectory, aes(x=age, y=prop, col=word, label=word)) +
geom_point() +
geom_smooth(se=F) +
theme_few() +
scale_color_ptol() +
geom_text_repel(data=subset(cdi_trajectory, age == 16),
aes(x=age, y=prop, col=word, label=word)) +
ylab("proportion of children producing") +
xlab("age (months)") +
guides(col=F)
ggsave(here::here("childes/production_trajectory.png"), width = 6, height = 4)
## `geom_smooth()` using method = 'loess' and formula 'y ~ x'
Data source: childes-db[http://childes-db.stanford.edu/]
preprocess.
#### Here's how to load childes data and create a csv of the entire corpora for NA English:
# english_na_tokens <- get_tokens(collection=c("Eng-NA"),token="*") %>%
# group_by(utterance_id) %>%
# mutate(utterance = paste0(gloss, collapse = " "))
#
# write.csv(english_na_tokens, file=here("childes/english_na_tokens_w_utterance.csv"), row.names = FALSE)
clean data.
# d <- read_csv(here::here("childes/english_na_tokens_w_utterance.csv"))
#
# # remove the unintelligible tokens
# d %<>% filter(gloss!="xxx")
#
# # remove NAs in speaker_role, target_child_age, or gloss
# d %<>% drop_na(speaker_role, target_child_age, gloss)
#
# #Convert and store children's age in years using the lubridate package
# d$target_child_age_years <-
# d$target_child_age %>%
# duration("days") %>%
# as.numeric("years")
#
# #Take out data for the age range below 1 and above 6 years, this is because there is not much data in that range
# d %<>% filter(target_child_age < 72, target_child_age > 12) %>%
# mutate(target_child_age_years = floor(target_child_age/12)) %>%
# mutate(gloss_before = shift(gloss),
# gloss_after = shift(gloss, type="lead"))
#
# # Prepare the speech_act categories for this study based on the utterance_types in childes-db
# # Categories: declarative, impertaive, question, and other
# d$speech_act <-
# recode(d$utterance_type,
# `broken for coding`="other",
# `imperative_emphatic` = "imperative",
# interruption = "other",
# `interruption question` = "question",
# `missing CA terminator` = "other",
# `no break TCU continuation` = "other",
# `question exclamation` = "question",
# `quotation next line` = "other",
# `quotation precedes` = "other",
# `self interruption` = "other",
# `self interruption question` = "question",
# `trail off` = "other",
# `trail off question` = "question"
# )
#
# write.csv(d, file=here::here("childes/english_na_tokens_w_utterance_cleaned.csv"), row.names = FALSE)
load cleaned data.
d <- read_csv(here::here("childes/english_na_tokens_w_utterance_cleaned.csv"))
## Parsed with column specification:
## cols(
## .default = col_character(),
## id = col_integer(),
## speaker_id = col_integer(),
## utterance_id = col_integer(),
## token_order = col_integer(),
## corpus_id = col_integer(),
## transcript_id = col_integer(),
## target_child_id = col_integer(),
## target_child_age = col_double(),
## collection_id = col_integer(),
## num_morphemes = col_integer(),
## target_child_age_years = col_integer()
## )
## See spec(...) for full column specifications.
head(d)
## # A tibble: 6 x 31
## id gloss stem part_of_speech speaker_id utterance_id token_order
## <int> <chr> <chr> <chr> <int> <int> <int>
## 1 3141904 yeah yeah co 2454 776196 1
## 2 3141932 what what pro:int 2455 776202 1
## 3 3141944 was be cop 2455 776202 2
## 4 3141951 the the det:art 2455 776202 3
## 5 3141960 book book n 2455 776202 4
## 6 3141965 about about adv 2455 776202 5
## # ... with 24 more variables: corpus_id <int>, transcript_id <int>,
## # speaker_code <chr>, speaker_name <chr>, speaker_role <chr>,
## # target_child_id <int>, target_child_age <dbl>,
## # target_child_name <chr>, target_child_sex <chr>, utterance_type <chr>,
## # collection_id <int>, collection_name <chr>, english <chr>,
## # prefix <chr>, suffix <chr>, num_morphemes <int>, language <chr>,
## # corpus_name <chr>, clitic <chr>, target_child_age_years <int>,
## # gloss_before <chr>, speech_act <chr>, gloss_after <chr>,
## # utterance <chr>
Normalized utterance frequency: polite utterances divided by total number of utterances.
# Group the tokens (words) by speaker role and the age of the child, count how many words for each speaker at each age
rawOverall_indiv <-
d %>%
filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>%
mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
TRUE ~ "parent")) %>%
distinct(utterance, .keep_all=T) %>%
group_by(speaker_role, floor(target_child_age)) %>%
summarize(total_freq = n())
utt_polite <- d %>%
filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>%
mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
TRUE ~ "parent")) %>%
distinct(utterance, .keep_all=T) %>%
filter(grepl("hi", utterance) | grepl("bye", utterance) | grepl("please", utterance) | grepl("sorry", utterance) | grepl("thank", utterance)
| grepl("can you", utterance) | grepl("could you", utterance)
) %>%
mutate(gloss = case_when(
grepl("please", utterance) & grepl("can you", utterance) ~ "can you please",
grepl("please", utterance) ~ "please",
grepl("sorry", utterance) ~ "sorry",
grepl("thank", utterance) ~ "thank",
grepl("can you", utterance) ~ "can you",
grepl("could you", utterance) ~ "can you",
grepl("^hi ", utterance) ~ "hi",
grepl(" hi$", utterance) ~ "hi",
grepl(" hi ", utterance) ~ "hi",
grepl("bye", utterance) ~ "bye",
grepl("milk", utterance) ~ "milk",
grepl("mommy", utterance) ~ "mommy",
TRUE ~ "NA"
))
# count how many polite utterance for each speaker at each age
rawPlease_indiv <-
d %>%
filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>%
mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
TRUE ~ "parent")) %>%
distinct(utterance, .keep_all=T) %>%
filter(grepl("hi", utterance) | grepl("bye", utterance) | grepl("please", utterance) | grepl("sorry", utterance) | grepl("thank", utterance)
| grepl("can you", utterance) | grepl("could you", utterance)
) %>%
mutate(gloss = case_when(
grepl("please", utterance) & grepl("can you", utterance) ~ "can you please",
grepl("please", utterance) ~ "please",
grepl("sorry", utterance) ~ "sorry",
grepl("thank", utterance) ~ "thank",
grepl("can you", utterance) ~ "can you",
grepl("could you", utterance) ~ "can you",
grepl("^hi ", utterance) ~ "hi",
grepl(" hi$", utterance) ~ "hi",
grepl(" hi ", utterance) ~ "hi",
grepl("bye", utterance) ~ "bye",
grepl("milk", utterance) ~ "milk",
grepl("mommy", utterance) ~ "mommy",
TRUE ~ "NA"
)) %>%
group_by(speaker_role, floor(target_child_age), gloss) %>%
summarize(freq = n())
# Normalize the polite-word count by dividing the connective frequency by the total word frequency for each speaker at each age
rawNormalized_indiv <-
full_join(rawPlease_indiv, rawOverall_indiv, by=c("speaker_role", "floor(target_child_age)")) %>%
as.data.frame(.) %>%
filter(!is.na(gloss)) %>%
mutate(relFreq = freq / total_freq, relFreq_ppt = relFreq * 1000) %>%
mutate(age = `floor(target_child_age)`) %>%
mutate(gloss = factor(gloss))
Takeaway: number of utterances containing politeness markers: children produce politeness markers early on (“hi”, “bye”, “please”, “can you”) and parents produce them a lot too. The polite words are produced more often than other baby words like “mommy” and “milk”.
p <- ggplot(rawNormalized_indiv %>%
filter(gloss != "NA") %>%
filter(gloss != "can you please") %>%
mutate(gloss = fct_relevel(factor(gloss), "hi", "bye", "please", "can you", "thank", "sorry", "mommy", "milk", "can you please")),
aes(x=age, y=relFreq_ppt, color=speaker_role)) +
geom_point(alpha = 0.3) +
geom_smooth(method="loess",size=0.5, span=1) +
# facet_grid(gloss~., scales="free") +
facet_grid(.~gloss) +
theme_few() +
scale_color_ptol() +
ylim(0, 12) +
ylab("relative frequency \n(per thousand utterances)") +
xlab("child age (months)") +
theme(legend.position="bottom")
print(p)
ggsave(here::here("childes/marker_utt_frequency_by_age.png"), width = 12, height = 3)
utt_polite %>% select(gloss, speaker_role, utterance) %>% filter(gloss == "sorry")
## # A tibble: 881 x 3
## gloss speaker_role utterance
## <chr> <chr> <chr>
## 1 sorry parent and mommy thinks that I should feel sorry for her w…
## 2 sorry parent sorry
## 3 sorry parent sorry honey
## 4 sorry parent I'm sorry
## 5 sorry parent I'm sorry pal
## 6 sorry parent oh I'm sorry
## 7 sorry child it I I'm sorry that I I dropped him and hurt him
## 8 sorry parent sorry I took the wrong here
## 9 sorry parent I'm sorry you can't have the pen
## 10 sorry parent oh alright sorry
## # ... with 871 more rows
Where are the politeness markers located in sentences?
# what do utterances look like?
utterances <-
d %>%
select(speaker_role, target_child_age, utterance, gloss) %>%
filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>%
mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
TRUE ~ "parent")) %>%
mutate(target_child_age = round(target_child_age)) %>%
distinct(utterance, .keep_all = TRUE) %>%
mutate(gloss = case_when(grepl("please", utterance) & grepl("can you", utterance) ~ "can you please",
grepl("can you", utterance) ~ "can you",
grepl("could you", utterance) ~ "can you",
grepl("please", utterance) ~ "please",
grepl("sorry", utterance) ~ "sorry",
grepl("thank", utterance) ~ "thank",
grepl("^hi ", utterance) ~ "hi",
grepl(" hi$", utterance) ~ "hi",
grepl(" hi ", utterance) ~ "hi",
grepl("bye", utterance) ~ "bye",
grepl("milk", utterance) ~ "milk",
grepl("mommy", utterance) ~ "mommy",
TRUE ~ ""
))
# marker position: start, middle, or end?
utterances1 <- utterances %>%
mutate(gloss = fct_relevel(gloss, "hi", "bye", "thank", "sorry", "please", "can you", "can you please", "mommy", "milk")) %>%
mutate(marker_position = case_when(
gloss == "" ~ "",
utterance == "hi" ~ "whole",
utterance == "bye" ~ "whole",
utterance == "thank you" ~ "whole",
utterance == "please" ~ "whole",
utterance == "sorry" ~ "whole",
utterance == "thank you" ~ "whole",
utterance == "can you" ~ "whole",
utterance == "could you" ~ "whole",
grepl("^sorry", utterance) ~ "start",
grepl("sorry$", utterance) ~ "end",
grepl("^can you", utterance) ~ "start",
grepl("can you$", utterance) ~ "end",
grepl("^could you", utterance) ~ "start",
grepl("could you$", utterance) ~ "end",
grepl("^thank you", utterance) ~ "start",
grepl("thank you$", utterance) ~ "end",
grepl("^thank_you", utterance) ~ "start",
grepl("thank_you$", utterance) ~ "end",
grepl("^thank", utterance) ~ "start",
grepl("thanks$", utterance) ~ "end",
grepl("^please", utterance) ~ "start",
grepl("please$", utterance) ~ "end",
grepl("^hi", utterance) ~ "start",
grepl("hi$", utterance) ~ "end",
grepl("^bye", utterance) ~ "start",
grepl("bye$", utterance) ~ "end",
grepl("^milk", utterance) ~ "start",
grepl("milk$", utterance) ~ "end",
grepl("^mommy", utterance) ~ "start",
grepl("mommy$", utterance) ~ "end",
TRUE ~ "middle"))
Takeaway: children and parents produce politeness markers in similar positions (e.g., “hi” mostly at the start and “please” mostly at the end of sentences).
utterances1 %>%
filter(marker_position != "") %>%
group_by(gloss, speaker_role) %>%
summarise(total_n = n()) %>%
left_join(utterances1 %>%
filter(marker_position != "") %>%
group_by(gloss, marker_position, speaker_role) %>%
summarise(n_marker = n())
) %>%
as.data.frame(.) %>%
mutate(prop_position = n_marker/total_n) %>%
mutate(marker_position = fct_relevel(marker_position, "start", "middle", "end")) %>%
ggplot(., aes(x=speaker_role, y=prop_position, fill=marker_position)) +
geom_col() +
facet_grid(.~gloss) +
scale_fill_ptol() +
theme_few() +
ylab("proportion of politeness marker position")
## Joining, by = c("gloss", "speaker_role")
ggsave(here::here("childes/marker_position.png"), width = 10, height = 5)
How long are utterances containing politeness markers?
utterances2 <- utterances1 %>%
mutate(n_words = stri_count(utterance,regex="\\S+")) %>%
filter(gloss != "")
getmode <- function(v) {
uniqv <- unique(v)
uniqv[which.max(tabulate(match(v, uniqv)))]
}
utterances2_mean <- utterances2 %>%
group_by(speaker_role, gloss) %>%
summarise(mean = mean(n_words))
utterances2_mode <- utterances2 %>%
group_by(speaker_role, gloss) %>%
summarise(mean = getmode(n_words))
p_mlu <- utterances2 %>%
filter(n_words <15) %>%
mutate(gloss = fct_relevel(gloss, "hi", "bye", "thank", "sorry", "please", "can you", "can you please", "mommy", "milk")) %>%
ggplot(., aes(x=n_words, y=..density.., color=speaker_role)) +
geom_freqpoly(binwidth = 1) +
facet_grid(gloss~.) +
scale_x_continuous(breaks=seq(0,15,1)) +
# xlim(0, 20) +
theme_few() +
geom_vline(data=utterances2_mode, aes(xintercept=mean, color=speaker_role),
linetype="dashed", size=1, alpha=.5) +
scale_color_ptol() +
xlab("number of words in the utterance")
print(p_mlu)
ggsave(here::here("childes/mlu_with_mode2.png"), width = 8, height = 10)