#Getting Started
For those of you new to R, the first thing you generally need to do
is load packages that you will use. The tidyverse package
has most of the tools we’ll need today.
If you have never used the package before, you’ll need to install it
manually. Click the “Packages” tab in the bottom right window, then
click the “Install” button, and type “tidyverse” and click
“Install”.
library(tidyverse)
Now we’ll read in data that Anu has graciously shared with us.
# Load the trial-level data; empty cells, "NA" and "na" are read as missing.
data <- read_csv(
  file = "Pune_SchoolData.csv",
  na = c("", "NA", "na")
)
#Preparing Data
This dataset is currently in long format, meaning there are separate
rows for each trial of Anu’s experiment. It also means that participant
background data, which does not vary across trials, is repeated.
# List every participant ID that occurs in the trial-level data.
unique(pull(data, participant_id))
[1] "A01" "A02" "A03" "A04" "A05" "A06" "A07" "A08" "B01" "B02" "B03" "B04" "B05" "B06" "B07" "B08" "B09" "B10" "B11" "B12" "B13" "C01"
[23] "C02" "C03" "C04" "C05" "C06" "C07" "D01" "D02" "D03" "D04" "D05" "D06" "C08" "C09" "C10" "C11" "C12" "C13" "C14" "C15" "C16" "C17"
900+ rows, but only a few dozen participants.
# Count the distinct participant IDs.
n_distinct(data$participant_id)
[1] 44
44 participants, to be precise!
We want to do a little work on that background data first, so we’ll
make a separate dataset for that.
# One row per participant: keep the background columns (participant_id through
# marathi_home) and collapse the rows that repeat across trials.
bg <- distinct(select(data, participant_id:marathi_home))
Now we have a data frame called bg with 44 rows, one for
each participant.
We want to compute a (fairly) precise age for each participant based
on their birthday (month and year, it looks like) and the date of data
collection. Months are a very meaningful unit of time in child language
acquisition, so we can’t just round to the nearest year.
We’ll use some tidyverse functions that are part of the
lubridate package.
# Parse both date columns (month/day/year strings) into Date objects.
bg$collection_date <- as.Date(bg$collection_date, format = "%m/%d/%Y")
bg$birthdate <- as.Date(bg$birthdate, format = "%m/%d/%Y")

# Age in months at data collection. A lubridate interval measures calendar
# months between the two dates, so leap years and unequal month lengths are
# respected (dividing a day count by 365 and multiplying by 12 ignores both).
# The `lubridate::` prefix works whether or not the package is attached.
bg$age_cont <- lubridate::time_length(
  lubridate::interval(bg$birthdate, bg$collection_date),
  unit = "months"
)
Let’s do a little bit of summarizing and plotting to see what our
participants’ ages look like.
# Overall summary of age in months, rounded to two decimals.
bg %>%
  summarise(
    across(
      age_cont,
      list(mean = mean, sd = sd, median = median, min = min, max = max),
      .names = "{.fn}"
    )
  ) %>%
  round(2)
Now for a histogram…
# Histogram of age in one-month bins; dashed lines mark 48, 60 and 72 months.
bg %>%
  ggplot(aes(x = age_cont)) +
  geom_vline(xintercept = c(48, 60, 72), linetype = 2) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = c(48, 60, 72)) +
  # Keep the bars sitting on the x-axis but let the upper limit follow the
  # data: a fixed cap of 5.5 makes ggplot2 discard any bin taller than 5.
  scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
  labs(x = "Age (mos.)") +
  theme_bw()

Anu’s data also has an age category, which corresponds to Junior and
Senior kindergarten. Let’s compare the ages of the two groups:
# Group size and age summary (in months) for each age category.
bg %>%
  group_by(age_cat) %>%
  summarise(
    n = n(),
    across(
      age_cont,
      list(mean = mean, sd = sd, median = median, min = min, max = max),
      .names = "{.fn}"
    )
  )
As expected, the senior kindergarteners are about a year older, on
average, than the junior kindergarteners.
Visually:
# Age by age category: boxplot, every child as a blue jittered point, and the
# group mean as a large red point.
bg %>%
  ggplot(aes(x = age_cat, y = age_cont)) +
  # The jitter layer already shows every child, so hide the boxplot's own
  # outlier markers; otherwise each outlier is drawn twice.
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(height = 0, width = .1, color = "blue") +
  stat_summary(fun = "mean", color = "red", geom = "point", size = 4) +
  theme_bw()

#Proportions of verb interpretations
Anu’s data involves children having to interpret verbs as being
causative or conjunctive in Marathi (I think…). These are all
intransitive verbs, and can have markers attached (or in proximity? - I
know nothing about Marathi!).
We can use the table(), xtabs(), and
count() or tally() functions to get a sense of
the design and stimuli.
# Cross-tabulate the design: verb against interpretation, token type,
# verb type and marker.
table(data[["verb"]], data[["interp"]])
xtabs(~ verb + token_type, data = data)
count(group_by(data, verb, verb_type))
tally(group_by(data, verb, marker))
Six verbs (cry, getwet, laugh, rise, scare, sleep) are the targets
and it seems like Anu is interested in how case markers affect verb
interpretations (causative or conjunctive) for children at different
ages.
# Count responses per age category x marker x interpretation, spread the
# interpretations into columns, and convert the counts to percentages.
data %>%
  filter(!is.na(interp)) %>%
  count(age_cat, marker, interp) %>%
  # values_fill = 0: a cell with no responses becomes 0 instead of NA, which
  # would otherwise turn `total` and both percentages into NA.
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  )
And now for a plot of causatives…
# Percentage of causative interpretations per marker, split by age category.
data %>%
  filter(!is.na(interp)) %>%
  count(age_cat, marker, interp) %>%
  # values_fill = 0 keeps empty cells from becoming NA percentages.
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  ggplot(aes(x = marker, y = cause_perc, fill = age_cat)) +
  geom_col(position = "dodge") +
  theme_bw()

And now for a plot of conjunctives…
# Percentage of conjunctive interpretations per marker, split by age category.
data %>%
  filter(!is.na(interp)) %>%
  count(age_cat, marker, interp) %>%
  # values_fill = 0 keeps empty cells from becoming NA percentages.
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  ggplot(aes(x = marker, y = conj_perc, fill = age_cat)) +
  geom_col(position = "dodge") +
  theme_bw()
Now we’ll do something similar, but at the participant level so we
can look in greater detail at trends across ages.
And now some plotting….
# Per-participant percentages of each interpretation by marker, plotted
# against age in months with a linear trend per facet.
data %>%
  filter(!is.na(interp)) %>%
  count(participant_id, marker, interp) %>%
  # values_fill = 0: an interpretation a child never chose counts as 0
  # (replaces the manual ifelse(is.na(...)) patching).
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  # Attach each participant's age from the background table.
  left_join(select(bg, participant_id, age_cont), by = "participant_id") %>%
  select(participant_id, age_cont, marker, cause_perc, conj_perc) %>%
  pivot_longer(
    cause_perc:conj_perc,
    names_to = "interp",
    values_to = "percent"
  ) %>%
  ggplot(aes(x = age_cont, color = marker, y = percent)) +
  geom_point(alpha = .4) +
  # Explicit formula: same fit, without the "using formula" message.
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_bw() +
  facet_wrap(~ interp + marker, nrow = 2)

---
title: "QRDG Data Analysis Workshop Nov. 2022"
output: html_notebook
---

# Getting Started

For those of you new to R, the first thing you generally need to do is load packages that you will use. The `tidyverse` package has most of the tools we'll need today.

If you have never used the package before, you'll need to install it manually. Click the "Packages" tab in the bottom right window, then click the "Install" button, and type "tidyverse" and click "Install".

```{r echo=T, results='hide'}
library(tidyverse)
```

Now we'll read in data that Anu has graciously shared with us.

```{r echo=T, results='hide'}
# Load the trial-level data; empty cells, "NA" and "na" are read as missing.
data <- read_csv(
  file = "Pune_SchoolData.csv",
  na = c("", "NA", "na")
)
```

# Preparing Data

This dataset is currently in long format, meaning there are separate rows for each trial of Anu's experiment. It also means that participant background data, which does not vary across trials, is repeated. 

```{r}
# List every participant ID that occurs in the trial-level data.
unique(pull(data, participant_id))
```
900+ rows, but only a few dozen participants.

```{r}
# Count the distinct participant IDs.
n_distinct(data$participant_id)
```
44 participants, to be precise!

We want to do a little work on that background data first, so we'll make a separate dataset for that.

```{r}
# One row per participant: keep the background columns (participant_id through
# marathi_home) and collapse the rows that repeat across trials.
bg <- distinct(select(data, participant_id:marathi_home))
```

Now we have a data frame called `bg` with 44 rows, one for each participant.

We want to compute a (fairly) precise age for each participant based on their birthday (month and year, it looks like) and the date of data collection. Months are a very meaningful unit of time in child language acquisition, so we can't just round to the nearest year.

We'll use some `tidyverse` functions that are part of the `lubridate` package.

```{r}
# Parse both date columns (month/day/year strings) into Date objects.
bg$collection_date <- as.Date(bg$collection_date, format = "%m/%d/%Y")
bg$birthdate <- as.Date(bg$birthdate, format = "%m/%d/%Y")

# Age in months at data collection. A lubridate interval measures calendar
# months between the two dates, so leap years and unequal month lengths are
# respected (dividing a day count by 365 and multiplying by 12 ignores both).
# The `lubridate::` prefix works whether or not the package is attached.
bg$age_cont <- lubridate::time_length(
  lubridate::interval(bg$birthdate, bg$collection_date),
  unit = "months"
)
```

Let's do a little bit of summarizing and plotting to see what our participants' ages look like.

```{r}
# Overall summary of age in months, rounded to two decimals.
bg %>%
  summarise(
    across(
      age_cont,
      list(mean = mean, sd = sd, median = median, min = min, max = max),
      .names = "{.fn}"
    )
  ) %>%
  round(2)
```

Now for a histogram...

```{r}
# Histogram of age in one-month bins; dashed lines mark 48, 60 and 72 months.
bg %>%
  ggplot(aes(x = age_cont)) +
  geom_vline(xintercept = c(48, 60, 72), linetype = 2) +
  geom_histogram(binwidth = 1) +
  scale_x_continuous(breaks = c(48, 60, 72)) +
  # Keep the bars sitting on the x-axis but let the upper limit follow the
  # data: a fixed cap of 5.5 makes ggplot2 discard any bin taller than 5.
  scale_y_continuous(limits = c(0, NA), expand = expansion(mult = c(0, 0.05))) +
  labs(x = "Age (mos.)") +
  theme_bw()
```


Anu's data also has an age category, which corresponds to Junior and Senior kindergarten. Let's compare the ages of the two groups:

```{r}
# Group size and age summary (in months) for each age category.
bg %>%
  group_by(age_cat) %>%
  summarise(
    n = n(),
    across(
      age_cont,
      list(mean = mean, sd = sd, median = median, min = min, max = max),
      .names = "{.fn}"
    )
  )
```

As expected, the senior kindergarteners are about a year older, on average, than the junior kindergarteners.

Visually:

```{r}
# Age by age category: boxplot, every child as a blue jittered point, and the
# group mean as a large red point.
bg %>%
  ggplot(aes(x = age_cat, y = age_cont)) +
  # The jitter layer already shows every child, so hide the boxplot's own
  # outlier markers; otherwise each outlier is drawn twice.
  geom_boxplot(outlier.shape = NA) +
  geom_jitter(height = 0, width = .1, color = "blue") +
  stat_summary(fun = "mean", color = "red", geom = "point", size = 4) +
  theme_bw()
```


# Proportions of verb interpretations

Anu's data involves children having to interpret verbs as being causative or conjunctive in Marathi (I think...). These are all intransitive verbs, and can have markers attached (or in proximity? - I know nothing about Marathi!).

We can use the `table()`, `xtabs()`, and `count()` or `tally()` functions to get a sense of the design and stimuli.

```{r echo=T, results='hide'}
# Cross-tabulate the design: verb against interpretation, token type,
# verb type and marker.
table(data[["verb"]], data[["interp"]])
xtabs(~ verb + token_type, data = data)
count(group_by(data, verb, verb_type))
tally(group_by(data, verb, marker))
```

Six verbs (cry, getwet, laugh, rise, scare, sleep) are the targets and it seems like Anu is interested in how case markers affect verb interpretations (causative or conjunctive) for children at different ages.

```{r}
# Count responses per age category x marker x interpretation, spread the
# interpretations into columns, and convert the counts to percentages.
data %>%
  filter(!is.na(interp)) %>%
  count(age_cat, marker, interp) %>%
  # values_fill = 0: a cell with no responses becomes 0 instead of NA, which
  # would otherwise turn `total` and both percentages into NA.
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  )
```

And now for a plot of causatives...

```{r}
# Percentage of causative interpretations per marker, split by age category.
data %>%
  filter(!is.na(interp)) %>%
  count(age_cat, marker, interp) %>%
  # values_fill = 0 keeps empty cells from becoming NA percentages.
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  ggplot(aes(x = marker, y = cause_perc, fill = age_cat)) +
  geom_col(position = "dodge") +
  theme_bw()
```

And now for a plot of conjunctives...

```{r}
# Percentage of conjunctive interpretations per marker, split by age category.
data %>%
  filter(!is.na(interp)) %>%
  count(age_cat, marker, interp) %>%
  # values_fill = 0 keeps empty cells from becoming NA percentages.
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  ggplot(aes(x = marker, y = conj_perc, fill = age_cat)) +
  geom_col(position = "dodge") +
  theme_bw()
```


Now we'll do something similar, but at the participant level so we can look in greater detail at trends across ages.

```{r}
# Per-participant counts and percentages of each interpretation by marker,
# with the participant's age in months attached.
data %>%
  filter(!is.na(interp)) %>%
  count(participant_id, marker, interp) %>%
  # values_fill = 0: an interpretation a child never chose counts as 0
  # (replaces the manual ifelse(is.na(...)) patching).
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  # Attach each participant's age from the background table.
  left_join(select(bg, participant_id, age_cont), by = "participant_id")
```

And now some plotting....

```{r}
# Per-participant percentages of each interpretation by marker, plotted
# against age in months with a linear trend per facet.
data %>%
  filter(!is.na(interp)) %>%
  count(participant_id, marker, interp) %>%
  # values_fill = 0: an interpretation a child never chose counts as 0
  # (replaces the manual ifelse(is.na(...)) patching).
  pivot_wider(names_from = interp, values_from = n, values_fill = 0) %>%
  mutate(
    total = causative + conjunctive,
    cause_perc = (causative / total) * 100,
    conj_perc = (conjunctive / total) * 100
  ) %>%
  # Attach each participant's age from the background table.
  left_join(select(bg, participant_id, age_cont), by = "participant_id") %>%
  select(participant_id, age_cont, marker, cause_perc, conj_perc) %>%
  pivot_longer(
    cause_perc:conj_perc,
    names_to = "interp",
    values_to = "percent"
  ) %>%
  ggplot(aes(x = age_cont, color = marker, y = percent)) +
  geom_point(alpha = .4) +
  # Explicit formula: same fit, without the "using formula" message.
  geom_smooth(method = "lm", formula = y ~ x) +
  theme_bw() +
  facet_wrap(~ interp + marker, nrow = 2)
```

