This is an R Markdown document covering my first look at the data from Pejovic, Nielsen, & Kovic.
The data for these experiments can be found in the GitHub repository here: www.github.com/SOMETHING
In the two studies presented here, participants completed a categorisation task in which they had to learn the category label for two sets of images: rounded visual stimuli and jagged visual stimuli.
In Experiment 1, the classic trisyllabic labels Takete and Maluma were used, while in Experiment 2 the labels were shortened to single syllables: Mal vs. Tik.
Both experiments used a 2x2 design: participants learned a category structure that was either congruent with sound symbolism (jagged = Takete/Tik, curvy = Maluma/Mal) or incongruent. As a second (and novel) manipulation, the order of presentation varied between participants: those in the label-first condition heard the label before seeing each image, while those in the image-first condition heard the label only after seeing each image.
Experiment 1: 33 undergraduate students between 19 and 24 years old (1 excluded)
Experiment 2: 29 undergraduate students between 19 and 24 years old (2 excluded)
First, we need to read in and sanitize the data for analysis:
mainPath <- 'D:/Google Drive/Experiments/Collaborations/Pejovic et al/' ## Sets the path of the local directory
setwd(file.path(mainPath, "data")) ## sets the data folder as the working directory
Exp1 <- read.csv("Exp1.csv") ## Reads in Experiment 1 Data
Exp2 <- read.csv("Exp2.csv") ## Reads in Experiment 2 Data
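One caveat worth flagging: since R 4.0, read.csv() no longer converts character columns to factors by default. The recoding below handles the important columns explicitly, but if you are on a newer version of R and want the old behaviour you can ask for it:

Exp1 <- read.csv("Exp1.csv", stringsAsFactors = TRUE) ## reproduces the pre-4.0 default of reading strings in as factors
Exp2 <- read.csv("Exp2.csv", stringsAsFactors = TRUE)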
So that's the data read in. Now let's take a peek at what it looks like:
head(Exp1)
## Participant Experiment Training Balance Block TrialName TrialNum
## 1 1 1 1 1 blok1 e31 4
## 2 1 1 1 1 blok1 e42 11
## 3 1 1 1 1 blok1 e33 6
## 4 1 1 1 1 blok1 e41 10
## 5 1 1 1 1 blok1 e45 14
## 6 1 1 1 1 blok1 e43 12
## EventNum Resp RespCorr RT Stim Rek.Err Filter.
## 1 5 L C 8328 3 1 1
## 2 4 L C 3656 4 1 1
## 3 5 R E 1828 3 2 0
## 4 4 L C 2718 4 1 1
## 5 4 L C 3110 4 1 1
## 6 4 L C 2797 4 1 1
head(Exp2)
## Pnum Participant Experiment Training Balance Block BlockNum TrialName
## 1 1 PS110006 1 1 1 blok1 2 e42
## 2 1 PS110006 1 1 1 blok1 2 e41
## 3 1 PS110006 1 1 1 blok1 2 e44
## 4 1 PS110006 1 1 1 blok1 2 e46
## 5 1 PS110006 1 1 1 blok1 2 e45
## 6 1 PS110006 1 1 1 blok1 2 e31
## TrialNum EventNum Resp RespCorr RT Rek.Err Filter.
## 1 11 4 L C 5844 1 1
## 2 10 4 L C 2953 1 1
## 3 13 4 L C 907 1 1
## 4 15 4 L C 3453 1 1
## 5 14 4 R E 1985 2 0
## 6 4 5 R E 3906 2 0
The first thing we see is that the data formatting is a little strange, so let's clean that up.
First, we will replace the values of the "Block" column with block numbers:
To do this, we're going to use the mapvalues function, which lives in the plyr package, so first we load that in:
## install.packages("plyr") ## delete the hashes and run this line if you don't have plyr installed
library(plyr)
## mapvalues is a handy function: give it a list of existing values and their replacements and it swaps them over; there are *tons* of other ways to do this in R though
Exp1$Block <- mapvalues(Exp1$Block,
from= c("blok1", "blok2", "blok3"),
to= c(1,2,3))
Exp1$Block <- as.factor(Exp1$Block) ## Make sure Block is a factor, not numeric
## Then do the same thing for Experiment 2 (but in one block of code, rather than breaking it up)
Exp2$Block <- as.factor(mapvalues(Exp2$Block,
from= c("blok1", "blok2", "blok3"),
to= c(1,2,3)))
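As the comment above says, there are plenty of other ways to do this; for instance, a base-R equivalent that needs no extra package is factor() with explicit levels and labels:

## Same recode in base R: levels are matched to labels position by position
Exp1$Block <- factor(Exp1$Block, levels = c("blok1", "blok2", "blok3"), labels = c(1, 2, 3))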
Next, we will recode the RespCorr column so that correct = 1 and incorrect = 0: this is both the standard coding and makes the data a little easier to aggregate over.
## We do this exactly as above with the mapvalues function, making sure the now-numeric correctness values stay coded as a factor
Exp1$RespCorr <- as.factor(mapvalues(Exp1$RespCorr,
from = c('C', 'E'),
to = c(1,0)))
Exp2$RespCorr <- as.factor(mapvalues(Exp2$RespCorr,
from = c('C', 'E'),
to = c(1,0)))
There are a few other columns that we want to make sure are factors rather than numeric.
Setting Columns as Factors:
Exp1$Experiment <- as.factor(Exp1$Experiment)
Exp1$Training <- as.factor(Exp1$Training)
Exp1$Balance <- as.factor(Exp1$Balance)
Exp1$Participant <- as.factor(Exp1$Participant)
Exp2$Experiment <- as.factor(Exp2$Experiment)
Exp2$Training <- as.factor(Exp2$Training)
Exp2$Balance <- as.factor(Exp2$Balance)
Exp2$Participant <- as.factor(Exp2$Participant)
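With this many columns the repetition invites typos; an equivalent, more compact version (same result, just looping over the column names) would be:

factorCols <- c("Experiment", "Training", "Balance", "Participant") ## the columns we want as factors
Exp1[factorCols] <- lapply(Exp1[factorCols], as.factor)
Exp2[factorCols] <- lapply(Exp2[factorCols], as.factor)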
One other common step is to transform response times, because they are almost never normally distributed, so let's quickly take a look at the distribution of response times:
## we can do this with a basic histogram, which looks like this
hist(Exp1$RT)
## But density plots are generally a bit nicer:
Exp1D <- density(Exp1$RT)
plot(Exp1D, main="Kernel Density of Response Times")
polygon(Exp1D, col= 'red', border= 'blue')
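While we're at it, a Q-Q plot (an extra check, not strictly necessary) gives another view of the same distribution; normal data should fall along the reference line, and right skew shows up as points bending upwards at the top end:

qqnorm(Exp1$RT) ## plots sample quantiles against theoretical normal quantiles
qqline(Exp1$RT) ## adds the reference line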
None of these plots look horrible in terms of how normal the distribution is, but we can test that formally, so let's do so:
shapiro.test(Exp1$RT)
##
## Shapiro-Wilk normality test
##
## data: Exp1$RT
## W = 0.6194, p-value < 2.2e-16
# The nortest package gives us a bunch of other tests of normality
## install.packages('nortest')
library(nortest)
ad.test(Exp1$RT) #Anderson-Darling normality test
##
## Anderson-Darling normality test
##
## data: Exp1$RT
## A = 90.458, p-value < 2.2e-16
cvm.test(Exp1$RT) # Cramer-von Mises normality test
## Warning in cvm.test(Exp1$RT): p-value is smaller than 7.37e-10, cannot be
## computed more accurately
##
## Cramer-von Mises normality test
##
## data: Exp1$RT
## W = 16.332, p-value = 7.37e-10
pearson.test(Exp1$RT) # Pearson chi-square normality test
##
## Pearson chi-square normality test
##
## data: Exp1$RT
## P = 1065.5, p-value < 2.2e-16
In sum, all of these normality tests tell us that the data are very likely not normally distributed, and so will indeed need to be transformed. However, log-transforming RT values is generally done on the aggregated data, so let's aggregate first.
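As a quick sketch of why the log transform helps (assuming all RTs are positive, which they should be), we can compare the skewness of raw and logged RTs with a hand-rolled skewness function:

skew <- function(x) mean((x - mean(x))^3) / sd(x)^3 ## positive values mean a long right tail
skew(Exp1$RT) ## raw RTs: strongly right-skewed
skew(log(Exp1$RT)) ## logged RTs: skew pulled back towards 0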
**Aggregating Data:**
## To aggregate the data, we're going to use the summaryBy function, which is found in the doBy library
library(doBy)
Exp1Agg <- summaryBy(RespCorr + RT ~ Participant, data = Exp1, FUN = mean) ## NB the argument name is FUN; a misspelling like "Fun" is silently ignored and summaryBy falls back to the default, mean only
head(Exp1Agg)
## Participant RespCorr.mean RT.mean
## 1 1 1.027778 1184.0833
## 2 2 1.166667 614.0833
## 3 3 1.055556 1200.5833
## 4 4 1.055556 1098.8333
## 5 5 1.111111 963.9444
## 6 6 1.166667 597.7778
Here we can see our first error pop up: means of RespCorr above 1, which are impossible values. This happens because there are still "NR" (no response) values in that column, so let's get rid of those.
Exp1F <- subset(Exp1, RespCorr == 1 | RespCorr == 0) # Keep only rows where RespCorr is 0 or 1, dropping the "NR" trials
Exp1F$RespCorr <- factor(Exp1F$RespCorr) # refactor to remove "NR" as a possible factor level
Exp1F$RespCorr <- as.numeric(as.character(Exp1F$RespCorr)) # make it numeric (you have to go via character, because as.numeric() on a factor returns the level codes)
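The detour through as.character() matters because as.numeric() on a factor returns the internal level codes, not the labels; a minimal demonstration:

f <- factor(c("1", "0", "1"))
as.numeric(f) ## gives 2 1 2, the level codes, not what we want
as.numeric(as.character(f)) ## gives 1 0 1, the actual values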
Exp1Agg <- summaryBy(RespCorr + RT ~ Participant + Experiment + Training + Balance + Block, data= Exp1F, FUN = c(mean, sd))
head(Exp1Agg)
## Participant Experiment Training Balance Block RespCorr.mean RT.mean
## 1 1 1 1 1 1 0.9166667 2865.9167
## 2 1 1 1 1 2 1.0000000 389.3333
## 3 1 1 1 1 3 1.0000000 297.0000
## 4 2 1 2 1 1 0.8333333 1147.0833
## 5 2 1 2 1 2 1.0000000 336.6364
## 6 2 1 2 1 3 1.0000000 421.7273
## RespCorr.sd RT.sd
## 1 0.2886751 1919.0203
## 2 0.0000000 221.0736
## 3 0.0000000 152.8392
## 4 0.3892495 1257.6413
## 5 0.0000000 334.5783
## 6 0.0000000 431.5563
Now we can take another look at the distribution of RTs, this time aggregated by participant:
Exp1AggD <- density(Exp1Agg$RT.mean)
plot(Exp1AggD, main="Kernel Density of Response Times")
polygon(Exp1AggD, col= 'red', border= 'blue')
library(nortest)
shapiro.test(Exp1Agg$RT.mean)
##
## Shapiro-Wilk normality test
##
## data: Exp1Agg$RT.mean
## W = 0.92782, p-value = 4.09e-05
ad.test(Exp1Agg$RT.mean) #Anderson-Darling normality test
##
## Anderson-Darling normality test
##
## data: Exp1Agg$RT.mean
## A = 1.8842, p-value = 7.636e-05
cvm.test(Exp1Agg$RT.mean) # Cramer-von Mises normality test
##
## Cramer-von Mises normality test
##
## data: Exp1Agg$RT.mean
## W = 0.30058, p-value = 0.0003152
pearson.test(Exp1Agg$RT.mean) # Pearson chi-square normality test
##
## Pearson chi-square normality test
##
## data: Exp1Agg$RT.mean
## P = 20.889, p-value = 0.02188
These still aren't normally distributed, so let's **log-transform the RTs** and see what we get:
Exp1Agg$LogRT <- log(Exp1Agg$RT.mean)
Exp1AggD <- density(Exp1Agg$LogRT)
plot(Exp1AggD, main="Kernel Density of Response Times")
polygon(Exp1AggD, col= 'red', border= 'blue')
library(nortest)
shapiro.test(Exp1Agg$LogRT)
##
## Shapiro-Wilk normality test
##
## data: Exp1Agg$LogRT
## W = 0.98304, p-value = 0.2334
ad.test(Exp1Agg$LogRT) #Anderson-Darling normality test
##
## Anderson-Darling normality test
##
## data: Exp1Agg$LogRT
## A = 0.39729, p-value = 0.3618
cvm.test(Exp1Agg$LogRT) # Cramer-von Mises normality test
##
## Cramer-von Mises normality test
##
## data: Exp1Agg$LogRT
## W = 0.059857, p-value = 0.3765
pearson.test(Exp1Agg$LogRT) # Pearson chi-square normality test
##
## Pearson chi-square normality test
##
## data: Exp1Agg$LogRT
## P = 11.172, p-value = 0.3443
## Quickly, here is the distribution of RTs for each block:
library(sm)
## Package 'sm', version 2.2-5.4: type help(sm) for summary information
attach(Exp1Agg)
sm.density.compare(LogRT, Block)
legend("topright", levels(Exp1Agg$Block), fill = 2+(0:nlevels(Exp1Agg$Block)))
We can see that after log-transforming, the RT values come out reasonably normal on all tests, so that's good.
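A side note on attach(): it puts the data frame's columns on the search path, which is why we will see masking warnings when Exp2Agg gets attached below. A safer pattern (functionally equivalent here) skips attach() entirely:

## Same plot without attach(), so nothing lingers on the search path
sm.density.compare(Exp1Agg$LogRT, Exp1Agg$Block)
legend("topright", levels(Exp1Agg$Block), fill = 2 + (0:(nlevels(Exp1Agg$Block) - 1)))
## if you do use attach(), remember to detach(Exp1Agg) when you're done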
Now let's quickly apply all the same transforms to the data from Experiment 2.
## Quick density plot of untransformed RTs
Exp2D <- density(Exp2$RT)
plot(Exp2D, main="Kernel Density of Response Times")
polygon(Exp2D, col= 'red', border= 'blue')
## And one test of normality
shapiro.test(Exp2$RT)
##
## Shapiro-Wilk normality test
##
## data: Exp2$RT
## W = 0.59456, p-value < 2.2e-16
## Summarize the data
library(doBy)
Exp2F <- subset(Exp2, RespCorr == 1| RespCorr == 0)
Exp2F$RespCorr <- factor(Exp2F$RespCorr) # refactors to remove "NR" as a possible factor level
Exp2F$RespCorr <- as.numeric(as.character(Exp2F$RespCorr))
Exp2Agg <- summaryBy(RespCorr + RT ~ Participant + Experiment + Training + Balance + Block, data= Exp2F, FUN = c(mean, sd))
## Plot and test distribution of RTs by participant (untransformed)
Exp2AggD <- density(Exp2Agg$RT.mean)
plot(Exp2AggD, main="Kernel Density of Response Times")
polygon(Exp2AggD, col= 'red', border= 'blue')
shapiro.test(Exp2Agg$RT.mean)
##
## Shapiro-Wilk normality test
##
## data: Exp2Agg$RT.mean
## W = 0.68667, p-value = 2.48e-12
## And same for log transformed
Exp2Agg$LogRT <- log(Exp2Agg$RT.mean)
Exp2AggD <- density(Exp2Agg$LogRT)
plot(Exp2AggD, main="Kernel Density of Response Times")
polygon(Exp2AggD, col= 'red', border= 'blue')
shapiro.test(Exp2Agg$LogRT)
##
## Shapiro-Wilk normality test
##
## data: Exp2Agg$LogRT
## W = 0.97378, p-value = 0.07409
## And distributions by block
library(sm)
attach(Exp2Agg)
## The following objects are masked from Exp1Agg:
##
## Balance, Block, Experiment, LogRT, Participant, RespCorr.mean,
## RespCorr.sd, RT.mean, RT.sd, Training
sm.density.compare(LogRT, Block)
legend("topright", levels(Exp2Agg$Block), fill = 2+(0:nlevels(Exp2Agg$Block)))
So now, finally, we can do some stats:
library(lme4)
## Loading required package: Matrix
library(tidyr)
##
## Attaching package: 'tidyr'
## The following object is masked from 'package:Matrix':
##
## expand
## Create combined condition columns for later interaction analyses
Exp1Agg <- unite(Exp1Agg, ET, Experiment, Training, sep = "-", remove = FALSE)
Exp1Agg <- unite(Exp1Agg, EB, Experiment, Balance, sep = "-", remove = FALSE)
Exp1Agg <- unite(Exp1Agg, TB, Training, Balance, sep = "-", remove = FALSE)
## unite() returns character columns, so convert the new ones to factors
Exp1Agg$ET <- as.factor(Exp1Agg$ET)
Exp1Agg$EB <- as.factor(Exp1Agg$EB)
Exp1Agg$TB <- as.factor(Exp1Agg$TB)
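With the combined factors in place, here is a sketch of the kind of mixed-effects model this is building towards; the exact specification below (random intercepts for participants, Training and Balance as the 2x2 factors) is my assumption, not taken from the original analysis:

m1 <- lmer(LogRT ~ Training * Balance + Block + (1 | Participant), data = Exp1Agg) ## hypothetical model specification
summary(m1)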