MBAH K-pilot data analysis

ANALYSIS OF EYE-GAZE DATA FOR RAs

Last edited by Raji Madhavan 21 Nov 2025
# Start from an empty workspace: drop every object currently visible here
# so nothing left over from an earlier run is picked up by this analysis.
rm(list = ls())

#load libraries
library(dplyr)
## 
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Read both pilot data sets directly from their shared links.
# Eye-tracking data:
data <- read.csv(
  file = "https://dl.dropboxusercontent.com/scl/fi/79zafpfgkek377l5zncvy/Look_data_Kr_Pilot.csv?rlkey=0073y7fykoqhngl6gohym73o7&st=5vdarob4&dl=0"
)
# Demographic data:
demo <- read.csv(
  file = "https://dl.dropboxusercontent.com/scl/fi/rdb42kcx6iwdf1lz36uze/Demo_data_Kr_Pilot.csv?rlkey=bu3j72komc1zyiommn2n2gcjm&st=csll3qry&dl=0"
)

# make sure that children know BOTH words in the picture
# we do this by checking whether parents marked the word as 'known' in the questionnaire we gave them
# For each trial, fetch the questionnaire response (demo$resp) for the left
# and for the right picture. Rows are linked through a "<fileid>_<word>" key;
# match() returns the first matching questionnaire row, or NA when none exists.
data$know.left <- demo$resp[match(
  paste(data$fileid, data$left.item, sep = "_"),
  paste(demo$fileid, demo$word, sep = "_")
)]
data$know.right <- demo$resp[match(
  paste(data$fileid, data$right.item, sep = "_"),
  paste(demo$fileid, demo$word, sep = "_")
)]

# because these are 1 and 0 (1 for knowing, 0 for not knowing), we can simply say we will only keep the 
# trials where the child knew both of the words that were shown to them on the screen
# keep is 1 when both pictured words have response 1, 0 when at least one is
# not 1, and NA when the comparison cannot be decided (missing response).
data$keep <- as.numeric(data$know.left == 1 & data$know.right == 1)

# Retain only the trials flagged with keep == 1; rows where keep is 0 or NA
# are dropped.
data <- data[which(data$keep == 1), ]

# we also remove any trials where the eyetracker did not record the eye gaze
# Keep only the rows whose gaze.record value equals "TRUE"; rows where it is
# anything else (or missing) are dropped.
data <- data[which(data$gaze.record == "TRUE"), ]

# We clean up the data to keep only relevant columns now
# Reduce the table to the columns used from here on, in this order.
data <- data[c(
  "fileid", "trial",
  "left.item", "right.item",
  "tar.side", "dis.side",
  "bin", "target",
  "aoi.tar", "aoi.dis",
  "outer.block", "inner.block",
  "vis.stim", "aud.stim",
  "rt"
)]

# We will define time windows of analysis (that is, where in the trial we expect to see an effect)
# Here, we have two time windows, 'pre' and 'post' naming time window. 
# This is to see how much children look toward the target word before they hear the name of the word
# and compare it to how long children look toward the target word AFTER they hear the name.
# Label each row by the analysis window its bin value falls into:
#   "pre"  for bin in [0, 2000]
#   "post" for bin in [3567, 6200]
# Any other bin (or a missing one) gets NA.
data$window <- case_when(
  data$bin >= 0 & data$bin <= 2000 ~ "pre",
  data$bin >= 3567 & data$bin <= 6200 ~ "post",
  TRUE ~ NA_character_
)

# Drop the rows that fall outside both windows.
data <- data[!is.na(data$window), ]

# now, we calculate the proportion of time the children spent looking at the target word during 
# these two time windows

# to do so, we first calculate the TOTAL number of time the child looked to the target word and
# total number of time the child looked to the distractor word, during these two time windows
# For every combination of participant file (fileid), target word and time
# window, add up the target-AOI and distractor-AOI values.
# .groups = "drop" returns an ungrouped table, so later steps never operate
# per group by accident and summarise() does not print its regrouping message.
d.ptl <- data %>%
  group_by(fileid, target, window) %>%
  summarise(
    sum.tar = sum(aoi.tar),
    sum.dis = sum(aoi.dis),
    .groups = "drop"
  )
# Proportion of target looking (PTL) = t / (t + d): the summed target looking
# divided by the summed looking at target and distractor together.
# This yields one PTL per row of d.ptl, i.e. one for the window before the
# word is named and one for the window after it is named.
d.ptl$ptl <- with(d.ptl, sum.tar / (sum.tar + sum.dis))

# Turn window into a factor whose first level is "pre" and second is "post".
# R would otherwise order the levels alphabetically ("post" before "pre");
# setting the order explicitly makes "pre" come first in tables and plots.
# You do not need to understand this step, but do run it.
d.ptl$window <- factor(
  d.ptl$window,
  levels = c("pre", "post")
)
##############################################################################################
##############################################################################################

##########Analysis for RAs ##############

####  Descriptive and inferential statistics
#### Demographic details ####

# 1. how many participants do we have?
# Number of distinct participant ids in the demographic table.
n_distinct(demo$uuid)
## [1] 4
#we have 4 participants

# 2. how many participants are male and how many female?
# One row per participant/sex combination, then tally the rows per sex.
count(distinct(demo, uuid, sex), sex)
##   sex n
## 1  Fe 1
## 2  Ma 3
#we have 1 female and 3 male participants

# 3. can you count the number of monolinguals and bilinguals in the data?
#tip - use the same R code as above, and change the 'sex' column to lang.stat
# One row per participant/language-status combination, then tally per status.
count(distinct(demo, uuid, lang.stat), lang.stat)
##     lang.stat n
## 1   Bilingual 1
## 2 Monolingual 3
# 4. what is the average age of the participants (in months)?
# Take one age.months value per distinct participant row ...
age <- pull(distinct(demo, uuid, age.months), age.months)
# ... and average them.
mean(age)
## [1] 17.25
# 5. what is the average age of participants in days?
#tip: use the same formula as above, but change age.months to age.days

# __________________________ (your answer here)
# Same as for months: one age.days value per distinct participant row,
# then the mean of those values.
age <- pull(distinct(demo, uuid, age.days), age.days)
mean(age)
## [1] 530
# 6. do all participants know all the words?
# Cross-tabulate the response value (rows) against the word (columns).
table(demo[["resp"]], demo[["word"]])
##    
##     ball banana book candy car ear eye fork hand strawberry tongue toy
##   0    1      0    0     2   0   1   0    0    0          0      2   0
##   1    3      4    4     2   4   3   4    4    4          4      2   4
# the top row tells you if there are children who don't know some words. because the data is split 
# as 0 (don't know) and 1 (know), we can look at the first row to see if there are (and if, how many) children
# don't know certain words

# a. how many children know the word, 'ball'?
# -> three children know the word, and one child does not know the word

# b. do all children know the word, 'banana'?
# -> 4 children know the word

# c. how many children know the word, ' tongue'?
# -> 2 children don't know and 2 children know
#### Analysis of eye-tracking data ####

### let's start with descriptive statistics about the proportion of target looking

# 1. look at the mean of proportion of target looking in the pre-naming and post-naming phase

# 1a. mean ptl for pre-naming phase
# Average PTL over the rows belonging to the "pre" window.
with(d.ptl, mean(ptl[window == "pre"]))
## [1] 0.4960342
#0.4960342

#this number is very long - can you round up the digits using an rcode?
#first its easier to allocate the ptl as a value
# Store the mean pre-naming PTL in a variable ...
pre_ptl <- with(d.ptl, mean(ptl[window == "pre"]))

# ... and print it rounded to two decimal places.
round(pre_ptl, digits = 2)
## [1] 0.5
## 1b. DO IT YOURSELF: calculate the mean proportion of target looking 
#for the post-naming phase (and round to 2 decimal points!)


# Mean PTL over the "post" window rows, printed to two decimal places.
post_ptl <- with(d.ptl, mean(ptl[window == "post"]))
round(post_ptl, digits = 2)
## [1] 0.57
# 2. Inferential statistical test: t-test

# Research question: do children successfully recognise the object they are hearing the name of?
 
# to check this, we will have to look at whether there is a significant difference between 
#their proportion of looking before the naming and after the naming

# 2a. we can run a paired sampled t-test to achieve this

# Paired t-test comparing post-naming PTL with pre-naming PTL.
# The pre and post values are joined on (fileid, target) so that each pair
# really belongs to the same participant and word. Pairing two independently
# subsetted vectors by position would fail with unequal lengths, or silently
# misalign pairs, as soon as any cell lacks one of the two windows; merge()
# simply leaves such incomplete cells out.
# NOTE(review): the paired units are participant x word cells, not
# participants, so they are not independent observations - consider averaging
# per child (or a mixed model) before reporting this test.
with(
  merge(
    as.data.frame(d.ptl[d.ptl$window == "pre", c("fileid", "target", "ptl")]),
    as.data.frame(d.ptl[d.ptl$window == "post", c("fileid", "target", "ptl")]),
    by = c("fileid", "target"),
    suffixes = c(".pre", ".post")
  ),
  t.test(ptl.post, ptl.pre, paired = TRUE)
)
## 
##  Paired t-test
## 
## data:  d.ptl$ptl[d.ptl$window == "post"] and d.ptl$ptl[d.ptl$window == "pre"]
## t = 2.1546, df = 35, p-value = 0.03816
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
##  0.004516592 0.151839178
## sample estimates:
## mean difference 
##      0.07817789
# what is the t-value?

#2.1546

#what is the p-value?

#0.03816

# what is the interpretation of the p-value?

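#The p-value (0.038) is below the 0.05 significance threshold (not 0.5), so the difference between pre- and post-naming looking is statistically significant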


# 2b. Do it yourself - conduct a one-sample t-test to check if the proportion of looking in the
#post-naming phase is significantly higher than 0.5 (Question: why 0.5?)
# TIP - use mu= 0.5 and alternative = 'greater' in your R code


# 3a. Plots
# Build a plot to show the difference between the proportion of looking in the pre-naming time window 
# and the proportion of looking in the post-naming time window (we examined in the paired sample t-test)
#install.packages("ggplot2") #run this line if you don't have ggplot2 (the package name must be in quotes)
library(ggplot2)
## Warning: 패키지 'ggplot2'는 R 버전 4.4.3에서 작성되었습니다
# Box plot of PTL by time window, drawn separately for each participant file.
ggplot(data = d.ptl, mapping = aes(x = window, y = ptl)) +
  # one box per time window, coloured by window
  geom_boxplot(mapping = aes(fill = window), width = 0.3, alpha = 0.7) +
  # draw every individual data point, scattered a little sideways so that
  # the points can be told apart
  geom_jitter(width = 0.05) +
  # horizontal reference line at 0.5
  geom_hline(yintercept = 0.5) +
  # one panel per fileid
  facet_wrap(~ fileid) +
  # appearance: white background without grid, larger text, no legend,
  # extra space above the x-axis title, small panel labels
  theme_classic() +
  theme(
    text = element_text(size = 18),
    legend.title = element_blank(),
    axis.title.x = element_text(margin = margin(t = 20)),
    legend.position = "none",
    strip.text = element_text(size = 6)
  ) +
  # title and axis labels
  labs(
    title = "Children's Proportion of target looking across pre- and post-naming windows",
    x = "Time window",
    y = "Proportion of Target Looking (PTL)"
  )

# 3b. Do it yourself: Plots with a horizontal line that represents the chance proportion (0.5)
# edit the above plot to get a horizontal line across the boxplots at yaxis=0.5
# hint - use the command geom_hline to add the horizontal line


# 3c. Do it yourself: Plots for each individual participant
# edit the above plot to get a separate plot for each participant, using the facet_wrap command
  


#############################################################################################