Infant Effect Sizes Word Properties

#packages
library(tidyverse)
library(cowplot)
library(knitr)
library(htmltools)
library(glue)
# Make cowplot's theme the default for every ggplot2 figure drawn below.
theme_set(theme_cowplot())

### function for adding html audio
#' Build an HTML `<audio>` player tag for a sound file.
#'
#' @param file Path or URL of the audio file; used verbatim as the `src`
#'   attribute of the nested `<source>` element.
#' @param type Audio format: one of "wav", "mp3", or "ogg". Defaults to "wav".
#' @return An htmltools tag object: `<audio controls>` wrapping one `<source>`.
html_tag_audio <- function(file, type = "wav") {
  # Pass the choices explicitly: with a length-one default, a bare
  # match.arg(type) would only ever accept "wav" and reject everything else.
  type <- match.arg(type, c("wav", "mp3", "ogg"))
  # Map the format to its MIME type (the MIME type for mp3 is audio/mpeg).
  mime_type <- switch(type,
    wav = "audio/wav",
    mp3 = "audio/mpeg",
    ogg = "audio/ogg"
  )
  htmltools::tags$audio(
    controls = NA, # NA renders as a bare, valueless `controls` attribute
    htmltools::tags$source(
      src = file,
      type = mime_type
    )
  )
}

# Read the pitch measurements first; they are merged onto the word table below.
pitch <- read.csv("pitch_lists.csv")
# pitch$word <- pitch$item

# Read the phonotactic-probability table, attach the pitch columns
# (left_join() is given no `by`, so it matches on the columns the two tables
# share), and shorten the two long probability column names.
d <- read.csv("WordProperties - phonotactic probability.csv") %>%
  left_join(pitch) %>%
  rename(
    seg_pp = adult_corpus_avg_segment_phonotactic_probability,
    bi_pp = adult_corpus_avg_biphone_phonotactic.probability
  )


# Define the word lists and record each word's list membership in `d`.
# Earlier candidate assignment, kept for reference:
# list1=c("toma","manu","fiffin","kita","modi","regli", "tosip","coodle")
# list2=c("blicket","sarel","boskot","koba","tever", "chatten","jefa",
#         "fuppy")
list1 <- c(
  "toma", "manu", "fiffin", "modi",
  "kita", "chatten", "gazzer", "jefa"
)
list2 <- c(
  "blicket", "sarel", "koba", "tever",
  "boskot", "regli", "doopy", "pizer"
)

# Membership masks; list1 is checked first, so it wins if a word were in both.
in_list1 <- d$word %in% list1
in_list2 <- d$word %in% list2
d$new_list <- ifelse(in_list1, "list1", ifelse(in_list2, "list2", "not_used"))

# `used` is 1 for words assigned to a list and 0.5 for the remaining words.
d$used <- ifelse(d$new_list == "not_used", 0.5, 1)

# Per-list descriptive statistics for segment (seg) and biphone (bi)
# phonotactic probability: one row per value of new_list.
sum_d_phon <- summarise(
  group_by(d, new_list),
  mean_seg = mean(seg_pp),
  median_seg = median(seg_pp),
  sd_seg = sd(seg_pp),
  mean_bi = mean(bi_pp),
  median_bi = median(bi_pp),
  sd_bi = sd(bi_pp),
  max_seg = max(seg_pp),
  min_seg = min(seg_pp),
  max_bi = max(bi_pp),
  min_bi = min(bi_pp)
)

# Per-list descriptive statistics (mean, median, SD) for the three pitch
# measures f0_mean, f0_min and f0_max: one row per value of new_list.
sum_d_pitch <- summarise(
  group_by(d, new_list),
  mean_f0_mean = mean(f0_mean),
  median_f0_mean = median(f0_mean),
  sd_f0_mean = sd(f0_mean),
  mean_f0_min = mean(f0_min),
  median_f0_min = median(f0_min),
  sd_f0_min = sd(f0_min),
  mean_f0_max = mean(f0_max),
  median_f0_max = median(f0_max),
  sd_f0_max = sd(f0_max)
)

The Words

Below are the lists of words I am proposing for List 1 and for List 2 in the pilot study. There are 8 words in each list. Note that you can also play the audio files directly in the browser.

# Build one HTML audio player per word (paste0() is vectorised over the word
# lists), then show each list's words next to their players.
audio_list_1 <- paste0('<audio controls><source src="audio/', list1, '.wav" type="audio/wav"/></audio>')
audio_list_2 <- paste0('<audio controls><source src="audio/', list2, '.wav" type="audio/wav"/></audio>')
list_d <- data.frame(
  list_1 = list1,
  audio_list_1 = audio_list_1,
  list_2 = list2,
  audio_list_2 = audio_list_2
)

kable(list_d)

list_1	audio_list_1	list_2	audio_list_2
toma		blicket
manu		sarel
fiffin		koba
modi		tever
kita		boskot
chatten		regli
gazzer		doopy
jefa		pizer

Suggested test items (PICK TWO PAIRS)

Here is the specific set of items that I was thinking of pulling out as test items. The idea is to select two items from each list that will be the test items (novel or familiar, depending on which list the child heard). It’s not super easy, because it’s even more difficult to match the paired items when there are just 2 novel and 2 familiar test items (while avoiding similar sounds, etc.). It doesn’t need to be perfect, since we will counterbalance across lists (and hence across which items are novel/familiar), but it would be good if no test item stands out too prominently (at least that’s how I’ve been thinking about it).

I think manu vs. sarel are a good pair. Then it gets a bit tricky. I like toma vs. blicket, but unfortunately blicket does have a noticeably higher peak than the other items. Other possibilities that sound pretty ok to me are kita vs. boskot and fiffin vs. koba. I think the [d] in doopy is pretty prominent, otherwise I would like gazzer vs. doopy best. What do y’all think?

# Candidate test items, paired by position across the two lists.
test_list_1 <- c("manu", "toma", "fiffin", "kita", "gazzer")
test_list_2 <- c("sarel", "blicket", "koba", "boskot", "doopy")

# Build one HTML audio player per candidate item (paste0() is vectorised),
# then show the candidate pairs next to their players.
audio_list_1 <- paste0('<audio controls><source src="audio/', test_list_1, '.wav" type="audio/wav"/></audio>')
audio_list_2 <- paste0('<audio controls><source src="audio/', test_list_2, '.wav" type="audio/wav"/></audio>')
list_d <- data.frame(
  test_list_1 = test_list_1,
  audio_list_1 = audio_list_1,
  test_list_2 = test_list_2,
  audio_list_2 = audio_list_2
)

kable(list_d)

test_list_1	audio_list_1	test_list_2	audio_list_2
manu		sarel
toma		blicket
fiffin		koba
kita		boskot
gazzer		doopy

Summarize Properties of Words By List

Phonotactic probability

The table below compares the two lists in terms of phonotactic probability (seg==segment (roughly speaking, phone) probability; bi == biphone probability).

Below, I’m showing the mean, SD, median, maximum, and minimum values (both segment and biphone phonotactic probability) for the items I currently have assigned to list 1 and list 2.

The only notable difference is that one word in list 1 (manu) has a somewhat higher biphone probability (the maximum value among the set). I still stuck with this word since I wanted to err on the side of including phonotactically probable words. I did not select “kaki” for the list (see the plot below for why it would otherwise be another possible option) since it ended up being quite close to a real word (~“cocky”), which I only realized after recording. If the higher maximum value is a concern, though, it would be easy to adjust this.

# Show the phonotactic summary for the two lists only, rounded to 5 digits.
sum_d_phon %>%
  filter(new_list != "not_used") %>%
  kable(digits = 5)

new_list	mean_seg	median_seg	sd_seg	mean_bi	median_bi	sd_bi	max_seg	min_seg	max_bi	min_bi
list1	0.05255	0.05135	0.00926	0.00365	0.00314	0.00208	0.0676	0.0411	0.0083	0.00173
list2	0.05284	0.05300	0.00979	0.00342	0.00334	0.00167	0.0660	0.0386	0.0065	0.00127

T-tests on average segment and biphone phonotactic probability

We can do a quick t-test to show that the lists don’t appear to differ on segment or biphone phonotactic probability, on average.

Segment phonotactic probability

# Two-sample t-test (equal variances assumed) comparing the average segment
# phonotactic probability of the list 1 words against the list 2 words.
t.test(
  filter(d, new_list == "list1")$seg_pp,
  filter(d, new_list == "list2")$seg_pp,
  var.equal = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)

## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$seg_pp and filter(d, new_list == "list2")$seg_pp
## t = -0.060342, df = 14, p-value = 0.9527
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.010506308  0.009931308
## sample estimates:
## mean of x mean of y 
## 0.0525500 0.0528375

Biphone phonotactic probability

# Two-sample t-test (equal variances assumed) comparing the average biphone
# phonotactic probability of the list 1 words against the list 2 words.
t.test(
  filter(d, new_list == "list1")$bi_pp,
  filter(d, new_list == "list2")$bi_pp,
  var.equal = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)

## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$bi_pp and filter(d, new_list == "list2")$bi_pp
## t = 0.24478, df = 14, p-value = 0.8102
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -0.001795016  0.002257516
## sample estimates:
##  mean of x  mean of y 
## 0.00365125 0.00342000

Plotting the segment and biphone phonotactic probability

Since we are matching the words on two dimensions (segment and biphone phonotactic probability), this makes selection a little more complex and a little more difficult to visualize. Here’s how the selected words are distributed in the 2-D space of segment and biphone phonotactic probability.

# Scatter the words in segment-by-biphone probability space. Words assigned to
# a list get a point plus a boxed label, coloured by list; the unused words are
# overlaid semi-transparently as points with plain-text labels.
ggplot(
  filter(d, new_list != "not_used"),
  aes(seg_pp, bi_pp, label = word, color = new_list)
) +
  geom_point() +
  geom_label() +
  geom_point(data = filter(d, new_list == "not_used"), alpha = 0.5) +
  geom_text(data = filter(d, new_list == "not_used"), alpha = 0.5) +
  xlab("average segment probability") +
  # Axis label previously read "phonotactive"; corrected to "phonotactic".
  ylab("average biphone phonotactic probability") +
  theme(legend.position = c(0.2, 0.8))

Pitch

# Show the pitch summary for the two lists only, rounded to 5 digits.
sum_d_pitch %>%
  filter(new_list != "not_used") %>%
  kable(digits = 5)

new_list	mean_f0_mean	median_f0_mean	sd_f0_mean	mean_f0_min	median_f0_min	sd_f0_min	mean_f0_max	median_f0_max	sd_f0_max
list1	202.6463	205.255	10.24293	131.0775	129.775	4.12751	345.2575	345.645	33.32560
list2	204.0200	199.580	19.58810	133.1037	133.765	3.72697	341.4100	331.220	49.95472

Mean F0

# Two-sample t-test (equal variances assumed) comparing f0_mean between the
# list 1 words and the list 2 words.
t.test(
  filter(d, new_list == "list1")$f0_mean,
  filter(d, new_list == "list2")$f0_mean,
  var.equal = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)

## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$f0_mean and filter(d, new_list == "list2")$f0_mean
## t = -0.17578, df = 14, p-value = 0.863
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -18.13556  15.38806
## sample estimates:
## mean of x mean of y 
##  202.6463  204.0200

Min F0

# Two-sample t-test (equal variances assumed) comparing f0_min between the
# list 1 words and the list 2 words.
t.test(
  filter(d, new_list == "list1")$f0_min,
  filter(d, new_list == "list2")$f0_min,
  var.equal = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)

## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$f0_min and filter(d, new_list == "list2")$f0_min
## t = -1.0306, df = 14, p-value = 0.3202
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -6.243271  2.190771
## sample estimates:
## mean of x mean of y 
##  131.0775  133.1037

Max F0

# Two-sample t-test (equal variances assumed) comparing f0_max between the
# list 1 words and the list 2 words.
t.test(
  filter(d, new_list == "list1")$f0_max,
  filter(d, new_list == "list2")$f0_max,
  var.equal = TRUE # spelled out: `T` is an ordinary variable and can be reassigned
)

## 
##  Two Sample t-test
## 
## data:  filter(d, new_list == "list1")$f0_max and filter(d, new_list == "list2")$f0_max
## t = 0.18122, df = 14, p-value = 0.8588
## alternative hypothesis: true difference in means is not equal to 0
## 95 percent confidence interval:
##  -41.68864  49.38364
## sample estimates:
## mean of x mean of y 
##  345.2575  341.4100