Corpus analyses

Data source: childes-db[http://childes-db.stanford.edu/]

preprocess.

#### Here's how to load childes data and create a csv of the entire corpora for NA English:
# english_na_tokens <- get_tokens(collection=c("Eng-NA"),token="*") %>%
#   group_by(utterance_id) %>%
#   mutate(utterance = paste0(gloss, collapse = " ")) 
# 
# write.csv(english_na_tokens, file=here("childes/english_na_tokens_w_utterance.csv"), row.names = FALSE)

clean data.

# d <- read_csv(here::here("childes/english_na_tokens_w_utterance.csv"))
# 
# # remove the unintelligible tokens
# d %<>% filter(gloss!="xxx")
# 
# # remove NAs in speaker_role, target_child_age, or gloss
# d %<>% drop_na(speaker_role, target_child_age, gloss)
# 
# #Convert and store children's age in years using the lubridate package
# d$target_child_age_years <- 
#   d$target_child_age %>% 
#   duration("days") %>% 
#   as.numeric("years")
# 
# #Take out data for the age range below 1 and above 6 years, this is because there is not much data in that range
# d %<>% filter(target_child_age < 72, target_child_age > 12) %>%
#   mutate(target_child_age_years = floor(target_child_age/12)) %>%
#   mutate(gloss_before = shift(gloss),
#          gloss_after = shift(gloss, type="lead"))
# 
# # Prepare the speech_act categories for this study based on the utterance_types in childes-db
# # Categories: declarative, impertaive, question, and other
# d$speech_act <-
#   recode(d$utterance_type, 
#          `broken for coding`="other",
#           `imperative_emphatic` = "imperative",
#          interruption = "other",
#          `interruption question` = "question",
#          `missing CA terminator` = "other",
#          `no break TCU continuation` = "other",
#          `question exclamation` = "question",
#          `quotation next line` = "other",
#          `quotation precedes` = "other",
#          `self interruption` = "other",
#          `self interruption question` = "question",
#          `trail off` = "other",
#          `trail off question` = "question"
#          )
# 
# write.csv(d, file=here::here("childes/english_na_tokens_w_utterance_cleaned.csv"), row.names = FALSE)

load cleaned data.

d <- read_csv(here::here("childes/english_na_tokens_w_utterance_cleaned.csv"))

## Parsed with column specification:
## cols(
##   .default = col_character(),
##   id = col_integer(),
##   speaker_id = col_integer(),
##   utterance_id = col_integer(),
##   token_order = col_integer(),
##   corpus_id = col_integer(),
##   transcript_id = col_integer(),
##   target_child_id = col_integer(),
##   target_child_age = col_double(),
##   collection_id = col_integer(),
##   num_morphemes = col_integer(),
##   target_child_age_years = col_integer()
## )

## See spec(...) for full column specifications.

head(d)

## # A tibble: 6 x 31
##        id gloss stem  part_of_speech speaker_id utterance_id token_order
##     <int> <chr> <chr> <chr>               <int>        <int>       <int>
## 1 3141904 yeah  yeah  co                   2454       776196           1
## 2 3141932 what  what  pro:int              2455       776202           1
## 3 3141944 was   be    cop                  2455       776202           2
## 4 3141951 the   the   det:art              2455       776202           3
## 5 3141960 book  book  n                    2455       776202           4
## 6 3141965 about about adv                  2455       776202           5
## # ... with 24 more variables: corpus_id <int>, transcript_id <int>,
## #   speaker_code <chr>, speaker_name <chr>, speaker_role <chr>,
## #   target_child_id <int>, target_child_age <dbl>,
## #   target_child_name <chr>, target_child_sex <chr>, utterance_type <chr>,
## #   collection_id <int>, collection_name <chr>, english <chr>,
## #   prefix <chr>, suffix <chr>, num_morphemes <int>, language <chr>,
## #   corpus_name <chr>, clitic <chr>, target_child_age_years <int>,
## #   gloss_before <chr>, speech_act <chr>, gloss_after <chr>,
## #   utterance <chr>

production frequency

Normalized utterance frequency: polite utterances divided by total number of utterances.

# Group the tokens (words) by speaker role and the age of the child, count how many words for each speaker at each age
rawOverall_indiv <-
  d %>%
  filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>% 
  mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
                                  TRUE ~ "parent")) %>%
  distinct(utterance, .keep_all=T) %>%
  group_by(speaker_role, floor(target_child_age)) %>%
  summarize(total_freq = n())

utt_polite <-   d %>%
  filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>% 
  mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
                                  TRUE ~ "parent")) %>%
  distinct(utterance, .keep_all=T) %>%
  filter(grepl("hi", utterance) | grepl("bye", utterance) | grepl("please", utterance) | grepl("sorry", utterance)  | grepl("thank", utterance) 
         | grepl("can you", utterance) | grepl("could you", utterance)
         ) %>%
  mutate(gloss = case_when(
    grepl("please", utterance) & grepl("can you", utterance) ~ "can you please",
                           grepl("please", utterance) ~ "please",
                           grepl("sorry", utterance) ~ "sorry",
                           grepl("thank", utterance) ~ "thank",
                           grepl("can you", utterance) ~ "can you",
                           grepl("could you", utterance) ~ "can you",
                           grepl("^hi ", utterance) ~ "hi",
                           grepl(" hi$", utterance) ~ "hi",
                           grepl(" hi ", utterance) ~ "hi",
                           grepl("bye", utterance) ~ "bye",
                           grepl("milk", utterance) ~ "milk",
                           grepl("mommy", utterance) ~ "mommy",
    
                           TRUE ~ "NA"
                           ))


# count how many polite utterance for each speaker at each age
rawPlease_indiv <-
  d %>%
  filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>% 
  mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
                                  TRUE ~ "parent")) %>%
  distinct(utterance, .keep_all=T) %>%
  filter(grepl("hi", utterance) | grepl("bye", utterance) | grepl("please", utterance) | grepl("sorry", utterance)  | grepl("thank", utterance) 
         | grepl("can you", utterance) | grepl("could you", utterance)
         ) %>%
  mutate(gloss = case_when(
    grepl("please", utterance) & grepl("can you", utterance) ~ "can you please",
                           grepl("please", utterance) ~ "please",
                           grepl("sorry", utterance) ~ "sorry",
                           grepl("thank", utterance) ~ "thank",
                           grepl("can you", utterance) ~ "can you",
                           grepl("could you", utterance) ~ "can you",
                           grepl("^hi ", utterance) ~ "hi",
                           grepl(" hi$", utterance) ~ "hi",
                           grepl(" hi ", utterance) ~ "hi",
                           grepl("bye", utterance) ~ "bye",
                           grepl("milk", utterance) ~ "milk",
                           grepl("mommy", utterance) ~ "mommy",
    
                           TRUE ~ "NA"
                           )) %>%
  group_by(speaker_role, floor(target_child_age), gloss) %>%
  summarize(freq = n())

# Normalize the polite-word count by dividing the connective frequency by the total word frequency for each speaker at each age
rawNormalized_indiv <-
  full_join(rawPlease_indiv, rawOverall_indiv, by=c("speaker_role", "floor(target_child_age)")) %>%
  as.data.frame(.) %>%
  filter(!is.na(gloss)) %>%
  mutate(relFreq = freq / total_freq, relFreq_ppt = relFreq * 1000) %>%
  mutate(age = `floor(target_child_age)`) %>%
  mutate(gloss = factor(gloss))

Takeaway: number of utterances containing politeness markers: children produce politeness markers early on (“hi”, “bye”, “please”, “can you”) and parents produce them a lot too. The polite words are produced more often than other baby words like “mommy” and “milk”.

p <- ggplot(rawNormalized_indiv %>% 
              filter(gloss != "NA") %>%
              filter(gloss != "can you please") %>%
  mutate(gloss = fct_relevel(factor(gloss), "hi", "bye", "please", "can you", "thank", "sorry", "mommy", "milk", "can you please")), 
       aes(x=age, y=relFreq_ppt, color=speaker_role)) +
  geom_point(alpha = 0.3) +
  geom_smooth(method="loess",size=0.5, span=1) +
  # facet_grid(gloss~., scales="free") +
  facet_grid(.~gloss) +
  theme_few() +
  scale_color_ptol() +
  ylim(0, 12) +
  ylab("relative frequency \n(per thousand utterances)") +
  xlab("child age (months)") +
  theme(legend.position="bottom")
print(p)

ggsave(here::here("childes/marker_utt_frequency_by_age.png"), width = 12, height = 3)

  utt_polite %>% select(gloss, speaker_role, utterance) %>% filter(gloss == "sorry")

## # A tibble: 881 x 3
##    gloss speaker_role utterance                                           
##    <chr> <chr>        <chr>                                               
##  1 sorry parent       and mommy thinks that I should feel sorry for her w…
##  2 sorry parent       sorry                                               
##  3 sorry parent       sorry honey                                         
##  4 sorry parent       I'm sorry                                           
##  5 sorry parent       I'm sorry pal                                       
##  6 sorry parent       oh I'm sorry                                        
##  7 sorry child        it I I'm sorry that I I dropped him and hurt him    
##  8 sorry parent       sorry I took the wrong here                         
##  9 sorry parent       I'm sorry you can't have the pen                    
## 10 sorry parent       oh alright sorry                                    
## # ... with 871 more rows

Politeness marker position

Where are the politeness markers located in sentences?

# what do utterances look like?
utterances <-
  d %>%
  select(speaker_role, target_child_age, utterance, gloss) %>%
  filter(speaker_role == "Mother" | speaker_role == "Father" | speaker_role == "Target_Child", !is.na(gloss)) %>% 
  mutate(speaker_role = case_when(speaker_role == "Target_Child" ~ "child",
                                  TRUE ~ "parent")) %>%
  mutate(target_child_age = round(target_child_age)) %>%
  distinct(utterance, .keep_all = TRUE) %>%
  mutate(gloss = case_when(grepl("please", utterance) & grepl("can you", utterance) ~ "can you please",
                           grepl("can you", utterance) ~ "can you",
                           grepl("could you", utterance) ~ "can you",
                           grepl("please", utterance) ~ "please",
                           grepl("sorry", utterance) ~ "sorry",
                           grepl("thank", utterance) ~ "thank",
                           grepl("^hi ", utterance) ~ "hi",
                           grepl(" hi$", utterance) ~ "hi",
                           grepl(" hi ", utterance) ~ "hi",
                           grepl("bye", utterance) ~ "bye",
                           grepl("milk", utterance) ~ "milk",
                           grepl("mommy", utterance) ~ "mommy",
                           TRUE ~ ""
                           )) 

# marker position: start, middle, or end?
utterances1 <- utterances %>%
  mutate(gloss = fct_relevel(gloss, "hi", "bye", "thank", "sorry", "please", "can you", "can you please", "mommy", "milk")) %>%
  mutate(marker_position = case_when(
    gloss == "" ~ "",
    utterance == "hi" ~ "whole",
    utterance == "bye" ~ "whole",
    utterance == "thank you" ~ "whole",
    utterance == "please" ~ "whole",
    utterance == "sorry" ~ "whole",
    utterance == "thank you" ~ "whole",
    utterance == "can you" ~ "whole",
    utterance == "could you" ~ "whole",
    grepl("^sorry", utterance) ~ "start",
    grepl("sorry$", utterance) ~ "end",
    grepl("^can you", utterance) ~ "start",
    grepl("can you$", utterance) ~ "end",
    grepl("^could you", utterance) ~ "start",
    grepl("could you$", utterance) ~ "end",
    grepl("^thank you", utterance) ~ "start",
    grepl("thank you$", utterance) ~ "end",
    grepl("^thank_you", utterance) ~ "start",
    grepl("thank_you$", utterance) ~ "end",
    grepl("^thank", utterance) ~ "start",
    grepl("thanks$", utterance) ~ "end",
    grepl("^please", utterance) ~ "start",
    grepl("please$", utterance) ~ "end",
    grepl("^hi", utterance) ~ "start",
    grepl("hi$", utterance) ~ "end",
    grepl("^bye", utterance) ~ "start",
    grepl("bye$", utterance) ~ "end",
    grepl("^milk", utterance) ~ "start",
    grepl("milk$", utterance) ~ "end",
    grepl("^mommy", utterance) ~ "start",
    grepl("mommy$", utterance) ~ "end",
    TRUE ~ "middle"))

Takeaway: children and parents produce politeness markers in similar positions (e.g., “hi” mostly at the start and “please” mostly at the end of sentences).

utterances1 %>%
  filter(marker_position != "") %>%
  group_by(gloss, speaker_role) %>%
  summarise(total_n = n()) %>%
  left_join(utterances1 %>%
              filter(marker_position != "") %>%
              group_by(gloss, marker_position, speaker_role) %>%
              summarise(n_marker = n())
) %>%
  as.data.frame(.) %>%
  mutate(prop_position = n_marker/total_n) %>%
  mutate(marker_position = fct_relevel(marker_position, "start", "middle", "end")) %>%
  ggplot(., aes(x=speaker_role, y=prop_position, fill=marker_position)) +
  geom_col() +
  facet_grid(.~gloss) +
  scale_fill_ptol() +
  theme_few() +
  ylab("proportion of politeness marker position")

## Joining, by = c("gloss", "speaker_role")

ggsave(here::here("childes/marker_position.png"), width = 10, height = 5)

Utterance length

How long are utterances containing politeness markers?

utterances2 <- utterances1 %>%
  mutate(n_words = stri_count(utterance,regex="\\S+")) %>%
  filter(gloss != "")

getmode <- function(v) {
   uniqv <- unique(v)
   uniqv[which.max(tabulate(match(v, uniqv)))]
}

utterances2_mean <- utterances2 %>% 
  group_by(speaker_role, gloss) %>%
  summarise(mean = mean(n_words))

utterances2_mode <- utterances2 %>% 
  group_by(speaker_role, gloss) %>%
  summarise(mean = getmode(n_words))

p_mlu <- utterances2 %>%
  filter(n_words <15) %>%
  mutate(gloss = fct_relevel(gloss, "hi", "bye", "thank", "sorry", "please", "can you", "can you please", "mommy", "milk")) %>%
  ggplot(., aes(x=n_words, y=..density.., color=speaker_role)) +
  geom_freqpoly(binwidth = 1) +
  facet_grid(gloss~.) +
  scale_x_continuous(breaks=seq(0,15,1)) +
  # xlim(0, 20) +
  theme_few() +
  geom_vline(data=utterances2_mode, aes(xintercept=mean, color=speaker_role),
            linetype="dashed", size=1, alpha=.5) +
  scale_color_ptol() +
  xlab("number of words in the utterance")
print(p_mlu)

ggsave(here::here("childes/mlu_with_mode2.png"), width = 8, height = 10)

polcon_childes

Erica Yoon

10/25/2018

Data from CDI

Production trajectory

Corpus analyses

production frequency

Politeness marker position

Utterance length