# Last edited by Raji Madhavan 21 Nov 2025
# Clean slate: restart the R session before running this script (in RStudio:
# Session > Restart R) instead of calling rm(list = ls()).
# rm(list = ls()) only deletes objects from the global environment; packages that
# were loaded and options that were changed earlier stay in place, so it does not
# give a truly fresh start. The objects used in the code below (data, demo,
# d.ptl, ...) are all created by this script itself.
#load libraries
library(dplyr)
##
## 다음의 패키지를 부착합니다: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Read the two pilot data sets directly from Dropbox (an internet connection is needed)
# eye-tracking data
data <- read.csv(
  file = "https://dl.dropboxusercontent.com/scl/fi/79zafpfgkek377l5zncvy/Look_data_Kr_Pilot.csv?rlkey=0073y7fykoqhngl6gohym73o7&st=5vdarob4&dl=0"
)
# demographic data (this table also holds the parents' word-knowledge answers used below)
demo <- read.csv(
  file = "https://dl.dropboxusercontent.com/scl/fi/rdb42kcx6iwdf1lz36uze/Demo_data_Kr_Pilot.csv?rlkey=bu3j72komc1zyiommn2n2gcjm&st=csll3qry&dl=0"
)
# Keep only trials in which the child knows BOTH pictured words.
# Word knowledge comes from the parent questionnaire: demo$resp is 1 when the
# parent marked the word as known and 0 when not.
# Build a "<fileid>_<word>" key for every questionnaire row and for the left and
# right picture of every eye-tracking row, so each picture can be looked up.
demo_key <- paste(demo$fileid, demo$word, sep = "_")
left_key <- paste(data$fileid, data$left.item, sep = "_")
right_key <- paste(data$fileid, data$right.item, sep = "_")
# match() returns the questionnaire row for each key (NA when there is none)
data$know.left <- demo$resp[match(left_key, demo_key)]
data$know.right <- demo$resp[match(right_key, demo_key)]
# keep = 1 only when both words are known; 0 when at least one is marked unknown
data$keep <- ifelse(data$know.left == 1 & data$know.right == 1, 1, 0)
# drop the trials where one or both words are unknown
# (rows where keep is NA, i.e. no usable questionnaire answer, are dropped as well)
data <- data[data$keep %in% 1, , drop = FALSE]
# In a single step:
#  - drop the rows where the eye tracker did not record the eye gaze
#  - keep only the columns that are relevant for the analysis
data <- subset(
  data,
  subset = gaze.record == "TRUE",
  select = c(
    fileid, trial, left.item, right.item, tar.side, dis.side, bin,
    target, aoi.tar, aoi.dis, outer.block, inner.block, vis.stim, aud.stim, rt
  )
)
# Time windows of analysis: the parts of the trial where we expect to see an effect.
# There are two windows, named relative to when the child hears the word:
#   'pre'  = before the child hears the name (bin 0 to 2000)
#   'post' = after the child hears the name (bin 3567 to 6200)
# Comparing target looking in 'post' against 'pre' shows whether hearing the
# name makes children look more toward the target.
in_pre <- data$bin >= 0 & data$bin <= 2000
in_post <- data$bin >= 3567 & data$bin <= 6200
data$window <- ifelse(in_pre, "pre", ifelse(in_post, "post", NA))
# everything outside the two windows has window = NA and is removed
data <- data[!is.na(data$window), , drop = FALSE]
# Proportion of target looking (PTL) in the two time windows.
# Step 1: for every child (fileid), target word and time window, add up
#   sum.tar = total looking to the target picture (sum of aoi.tar)
#   sum.dis = total looking to the distractor picture (sum of aoi.dis)
# .groups = "drop" returns an ordinary, ungrouped table, so later steps are not
# affected by leftover grouping (and summarise() prints no grouping message).
d.ptl <- data %>%
  group_by(fileid, target, window) %>%
  summarise(
    sum.tar = sum(aoi.tar),
    sum.dis = sum(aoi.dis),
    .groups = "drop"
  )
# Step 2: PTL = target looking divided by looking to target + distractor,
# i.e. t / (t + d). This gives one PTL value before the child hears the name
# ('pre') and one after ('post') for each child and target word.
# NOTE: if a child looked at neither picture in a window, this is 0 / 0 = NaN.
d.ptl$ptl <- d.ptl$sum.tar / (d.ptl$sum.tar + d.ptl$sum.dis)
# Step 3 (for readability only): make 'pre' the first level of the window factor.
# R orders factor levels alphabetically by default, which would put 'post' first.
# You don't have to understand this line, but run it anyway.
d.ptl$window <- factor(d.ptl$window, levels = c("pre", "post"))
##############################################################################################
##############################################################################################
##########Analysis for RAs ##############
#### Descriptive and inferential statistics
#### Demographic details ####
# 1. how many participants do we have?
# (count the distinct participant ids)
demo$uuid %>% unique() %>% length()
## [1] 4
#we have 4 participants
# 2. how many participants are male and how many female?
# (reduce to one row per participant first, then count the rows per sex)
count(distinct(demo, uuid, sex), sex)
## sex n
## 1 Fe 1
## 2 Ma 3
#we have 1 female and 3 male participants
# 3. can you count the number of monolinguals and bilinguals in the data?
#tip - use the same R code as above, and change the 'sex' column to lang.stat
count(distinct(demo, uuid, lang.stat), lang.stat)
## lang.stat n
## 1 Bilingual 1
## 2 Monolingual 3
# 4. what is the average age of the participants (in months)?
# first get one age value per participant
age <- distinct(demo, uuid, age.months)$age.months
# then take the mean
mean(age)
## [1] 17.25
# 5. what is the average age of participants in days?
#tip: use the same formula as above, but change age.months to age.days
# __________________________ (your answer here)
age <- distinct(demo, uuid, age.days)$age.days
mean(age)
## [1] 530
# 6. do all participants know all the words?
# (rows = parent response, 0 or 1; columns = words)
table(demo$resp, demo$word)
##
## ball banana book candy car ear eye fork hand strawberry tongue toy
## 0 1 0 0 2 0 1 0 0 0 0 2 0
## 1 3 4 4 2 4 3 4 4 4 4 2 4
# How to read this table: the responses are coded 0 (doesn't know) and 1 (knows).
# The top row (0) shows, for each word, how many children do NOT know it;
# the bottom row (1) shows how many children DO know it.
# a. how many children know the word, 'ball'?
# -> three children know the word, and one child does not know the word
# b. do all children know the word, 'banana'?
# -> 4 children know the word
# c. how many children know the word, ' tongue'?
# -> 2 children don't know and 2 children know
#### Analysis of eye-tracking data ####
### descriptive statistics for the proportion of target looking (PTL)
# 1. mean PTL in the pre-naming and in the post-naming phase
# 1a. mean ptl for pre-naming phase
# store the mean as a value first (easier to reuse), then print it
pre_ptl <- with(d.ptl, mean(ptl[window == "pre"]))
pre_ptl
## [1] 0.4960342
#0.4960342
#this number is very long - can you round the digits using R code?
#round the stored value
round(pre_ptl, digits = 2) # digits = 2 means 2 decimal places
## [1] 0.5
## 1b. DO IT YOURSELF: calculate the mean proportion of target looking
#for the post-naming phase (and round to 2 decimal places!)
post_ptl <- with(d.ptl, mean(ptl[window == "post"]))
round(post_ptl, digits = 2)
## [1] 0.57
# 2. Inferential statistical test: t-test
# Research question: do children successfully recognise the object they are hearing the name of?
# To check this, we test whether there is a significant difference between their
# proportion of target looking before the naming and after the naming.
# 2a. we can run a paired-samples t-test to achieve this
# A paired test matches the 1st 'post' value with the 1st 'pre' value, the 2nd
# with the 2nd, and so on. That is only correct if both windows contain the same
# child x target combinations in the same order, so we check this first and stop
# with a clear message instead of silently comparing the wrong pairs.
post_rows <- d.ptl[d.ptl$window == "post", ]
pre_rows <- d.ptl[d.ptl$window == "pre", ]
if (!identical(
  paste(post_rows$fileid, post_rows$target, sep = "_"),
  paste(pre_rows$fileid, pre_rows$target, sep = "_")
)) {
  stop(
    "d.ptl: 'pre' and 'post' rows do not contain the same fileid/target ",
    "combinations in the same order, so they cannot be paired by position.",
    call. = FALSE
  )
}
t.test(d.ptl$ptl[d.ptl$window == "post"], d.ptl$ptl[d.ptl$window == "pre"], paired = TRUE)
##
## Paired t-test
##
## data: d.ptl$ptl[d.ptl$window == "post"] and d.ptl$ptl[d.ptl$window == "pre"]
## t = 2.1546, df = 35, p-value = 0.03816
## alternative hypothesis: true mean difference is not equal to 0
## 95 percent confidence interval:
## 0.004516592 0.151839178
## sample estimates:
## mean difference
## 0.07817789
# what is the t-value?
#2.1546
#what is the p-value?
#0.03816
# what is the interpretation of the p-value?
#p = 0.038 is below the conventional threshold of 0.05 (not 0.5), so the
#difference between post- and pre-naming target looking is statistically significant
# 2b. Do it yourself - conduct a one-sample t-test to check if the proportion of looking in the
#post-naming phase is significantly higher than 0.5 (Question: why 0.5?)
# TIP - use mu= 0.5 and alternative = 'greater' in your R code
# 3a. Plots
# Build a plot to show the difference between the proportion of looking in the pre-naming time window
# and the proportion of looking in the post-naming time window (we examined in the paired sample t-test)
#install.packages("ggplot2") #run this line if you don't have ggplot2
library(ggplot2)
## Warning: 패키지 'ggplot2'는 R 버전 4.4.3에서 작성되었습니다
# Boxplot of PTL in the pre- and post-naming windows, one panel per participant.
ggplot(d.ptl, aes(x = window, y = ptl)) + # x axis = time window, y axis = PTL
  # one box per time window, coloured by window;
  # outlier.shape = NA hides the boxplot's own outlier dots, because every data
  # point is already drawn by geom_jitter() below (otherwise outliers show twice)
  geom_boxplot(aes(fill = window), alpha = 0.7, width = 0.3, outlier.shape = NA) +
  # the individual data points, scattered a little sideways so that each point
  # can be seen; height = 0 means no vertical scatter, so every point stays at
  # its true PTL value
  geom_jitter(width = 0.05, height = 0) +
  # horizontal line at the chance proportion (0.5)
  geom_hline(yintercept = 0.5) +
  # a separate panel for each participant (fileid)
  facet_wrap(~ fileid) +
  # from here on, define the theme of the plot (white background, no grids, etc.)
  theme_classic() +
  theme(
    text = element_text(size = 18),
    legend.title = element_blank(),
    axis.title.x = element_text(margin = margin(t = 20)),
    legend.position = "none",
    strip.text = element_text(size = 6)
  ) +
  # add title and x and y axis labels
  labs(y = "Proportion of Target Looking (PTL)", x = "Time window") +
  ggtitle("Children's Proportion of target looking across pre- and post-naming windows")

# 3b. Do it yourself: Plots with a horizontal line that represents the chance proportion (0.5)
# edit the above plot to get a horizontal line across the boxplots at yaxis=0.5
# hint - use the command geom_hline to add the horizontal line
# 3c. Do it yourself: Plots for each individual participant
# edit the above plot to get a separate plot for each participant, using the facet_wrap command
#############################################################################################