# import packages to be used
library(dplyr) # excellent for data cleaning and manipulation. Uses five key verbs
library(beepr) # this can be used to alert you when analyses are complete
library(tidyr) # necessary for cleaning data
library(ez) # can calculate generalised-eta-squared for anovas
library(ggplot2) # for creating graphs and figures
library(car) # contains some data analysis functions
library(psych) # contains useful functions for quickly describing and exploring data
library(Publish) # useful exploratory analysis package
## Importing the data
# first I want to set the working directory to a folder containing my raw data
setwd("~/Documents/PhD Resources/R/Experiment One R Analysis/Experiment One R Analysis/Experiment One Raw Data")
# list all the files ending with '.csv' in the data folder
df_name <- list.files(pattern = "\\.csv$")
# quickly check whether all these files have been listed
df_name
# make a list containing the raw data of all the csvs
df_list <- lapply(df_name,
read.csv
)
# bind all the csvs to make one big csv containing the data of all participants
df <- data.table::rbindlist(df_list,
use.names = TRUE,
idcol = TRUE,
fill = TRUE)
## Cleaning the Data
# making all blank cells into NA so they fall under na.rm arguments
# na.rm arguments are arguments in functions that tell teh function to ignore any "NA"s
df[df==""] <- NA
# also changing None to NA, since some blank cells come up as None
df[df=="None"] <- NA
# Data Cleaning
# set the working directory to the output file so that any output from my data cleaning or
# analysis will end up there, rather than filling up and confusing the raw data folder
setwd("~/Documents/PhD Resources/R/Experiment One R Analysis/Experiment One R Analysis/R Output")
# look at the column names so I know what I variables I'm working with
names(df)
# get all the columns I want by deleting those I don't need
df_trim <- df %>%
select(-wholeFrames.thisRepN,
-wholeFrames.thisTrialN,
-responseTrain.keys,
-responseTrain.corr,
-responseTrain.rt,
-affectRating.rt,
-arousalRating.rt,
-senseMakingRating.rt,
-frameRate,
-session,
-RPS.SONA.ID.Code,
-i.e..23.,
-X
)
# look at the columns remaining (to get them in the console to work with more easily)
names(df_trim)
# now I will group by block and count the number of attempts on each block of the experiment
# this will create a new dataframe for the number of attempts
# I will then join this dataframe to the original dataframe
sum_correct <- df_trim %>%
group_by(participant, block) %>%
summarise(block_attempts_minus_one = max(ifNotOver80.thisRepN, na.rm = TRUE)) %>%
mutate(block_attempts_this_block = block_attempts_minus_one + 1) %>%
select(-block_attempts_minus_one) %>%
ungroup()
# joining both dataframes
# if you want, you can first make an excel sheet to see what it looks like pre-join
# write.csv(df_trim, "Pre_Join_Excel.csv") # unhash the code on this line if needed
# join the two dfs
df_join <- full_join(df_trim, sum_correct, by = c("participant", "block"))
# make an excel sheet to see what it looks like post-join
# write.csv(df_join, "Post_Join_Excel.csv") # unhas the code on this line if needed
# now I will make individual dataframes for each rating at the end of each block of the experiment
# these dataframes will then be merged to the main body of data.
# this is much easier to write, understand and debug/edit than doing all
# data cleaning within the original dataframe itself
# creating a dataframe for the affect rating of each participant at the end of each block
block_affect_rating_df <- df_join %>%
group_by(participant, block) %>%
summarise(block_affect_rating =
sum(as.numeric(as.character(affectRating.response)), na.rm = TRUE)) %>%
ungroup()
# creating a dataframe for the arousal rating of each participant at the end of each block
block_arousal_rating_df <- df_join %>%
group_by(participant, block) %>%
summarise(block_arousal_rating =
sum(as.numeric(as.character(arousalRating.response)), na.rm = TRUE)) %>%
ungroup()
# creating a dataframe for the sense-making rating of each participant at the end of each block
block_sense_making_rating_df <- df_join %>%
group_by(participant, block) %>%
summarise(block_sense_making_rating =
sum(as.numeric(as.character(senseMakingRating.response)), na.rm = TRUE)) %>%
ungroup()
# now to join all these dfs to the main dataframe (mother_df) in one go
mother_df <- df_join %>%
full_join(block_affect_rating_df, by = c("participant", "block")) %>%
full_join(block_arousal_rating_df, by = c("participant", "block")) %>%
full_join(block_sense_making_rating_df, by = c("participant", "block"))
# delete the now redundant rows that are by-products of way the output from the Python...
# experiment was formatted.
# to do this, I find a column that contains NAs on certain rows and will delete them
# First, count the NAs in this column. False is what should remain (cells with values in)
# TRUE is what should be deleted (cells with NAs)
table(is.na(mother_df$trials.thisRepN))
# then delete all rows with NAs
mother_df_2 <- subset(mother_df, !is.na(trials.thisRepN))
# this has deleted a vast number of now redundant rows.
# turn the block order number (from Python, 0:5) to normal numbers 1-6
mother_df_4 <- mother_df_2 %>%
mutate(block_order_number = wholeFrames.thisN + 1) %>%
select(-wholeFrames.thisN)
# create a block_code variable so I know which of the six blocks I am dealing with
# the code is: (AmbA = 1, CohA = 4). There are three 'amb's and three 'coh's, 1-6
mother_df_5 <- mother_df_4 %>%
mutate(block_code = wholeFrames.thisIndex + 1) %>%
select(-wholeFrames.thisIndex)
# look at the columns we have left so that we know what we want to delete
names(mother_df_5)
# delete redundant columns
mother_df_6 <- select(mother_df_5,
-ifNotOver80.thisRepN,
-ifNotOver80.thisTrialN,
-ifNotOver80.thisN,
-ifNotOver80.thisIndex,
-trainingLoops.thisRepN,
-trainingLoops.thisTrialN,
-trainingLoops.thisN,
-trainingLoops.thisIndex,
-affectRating.response,
-arousalRating.response,
-senseMakingRating.response,
ifNotOver80.thisTrial
)
# turn gender from M/F into 0/1 for analysis
mother_df_7 <- mother_df_6 %>% mutate(Gender_0F_1M =
ifelse(Gender..e.g..M.or.F. == "M", 1, 0)) %>%
select(-Gender..e.g..M.or.F.)
# rename and reorder all variables in one fell swoop using dplyr::select
mother_df_8 <- mother_df_7 %>%
select(participant,
age = Age..Number,
Gender_0F_1M,
expName,
block,
block_code,
block_order_number,
block_attempts_this_block,
stimR,
picRel,
stimL,
stimL,
corrAns,
realRel,
trials.thisRepN,
trials.thisTrialN,
trials.thisN,
trials.thisIndex,
test_response_keys = responseTest.keys,
test_response_rt = responseTest.rt,
block_affect_rating,
block_arousal_rating,
block_sense_making_rating
)
# making a column for correct = 1 and incorrect = 0 for each trial
# notice that in the df, corrAns has "none" for amb blocks.
# This is because there is no correct answer, so...
# accordingly, this will become a 0 in the new column
mother_df_999 <- mother_df_8 %>% mutate(Correct1_Incorrect0 =
ifelse(as.character(corrAns) == as.character(test_response_keys), 1, 0))
# the silly numbered dataframe label is used to indicate that it is an intermediate df not to be used
# arrange the df by participant
mother_df_9 <- mother_df_999 %>% arrange(participant)
# data cleaning is largely complete, and this is a 'good place to save'
# further cleaning involves a few manipulations I might do to refine the dataset -
# and also prepare numerous dataframes for different kinds of analysis.
# Additionally, I will do a few checks before doing any further manipulations
# Usually I would run a bunch of checks as I go along to look at how the cleaning is going.
# So, write the current clean, tidy data file to a csv
write.csv(mother_df_9, "Cleaned Experiment One Data.csv")
# create a quick dataframe to glance at the correctness of participants on blocks
correct_output_df <- mother_df_9 %>% group_by(participant, block) %>%
summarise(correct_sum = sum(Correct1_Incorrect0))
# getting the mean across each block for participant to then graph
mean_corr_df <- correct_output_df %>% group_by(participant) %>%
summarise(mean_corr = mean(correct_sum, na.rm = TRUE))
# make a histogram of the correctness
hist(mean_corr_df$mean_corr, breaks = 25)
# convert mean_corr_df to percents
percent_corr_df <- mean_corr_df %>% group_by(participant) %>%
mutate(percent_corr = 100*(mean_corr/48))
# make a histogram
hist(percent_corr_df$percent_corr, breaks = 25)
# adding new variable to mother_df_9 which is 'what button the participant pressed'
mother_df_10 <- mother_df_9 %>%
mutate(button_press_Z1_M0 =
ifelse(test_response_keys == "z", 1, 0))
# adding new variable to mother_df_10 which is 'what stimulus the participant chose'
## NOTE this only works for ambiguous blocks. Coherent blocks all revert to 0
mother_df_11 <- mother_df_10 %>%
mutate(stim_chosen_0_1 =
ifelse(stimR %in% c("CDO", "LDF", "KYW") & test_response_keys == "m", 1, 0))
# adding new variable to mother_df_11 which is whether the participant chose A>C or A<C
# CAUTION this is only for amb blocks, which is why coh blocks all come up 0
mother_df_12 <- mother_df_11 %>%
mutate(compound_AbiggerC1_AsmallerC0 =
ifelse(
# stim A plus BIGGER plus CHOSEN
# stim A
stimR %in% c("CDO", "LDF", "KYW") &
# symbols indicating 'bigger'
picRel %in% c("####", "????", "\"") &
# if that response was chosen
test_response_keys == "m" |
# stim B plus SMALLER plus CHOSEN
stimR %in% c("ZKR", "RSQ", "YNM") &
picRel %in% c("****", "%%%%", "[[[[") &
test_response_keys == "m" |
# stim A plus SMALLER plus NOT CHOSEN
stimR %in% c("CDO", "LDF", "KYW") &
picRel %in% c("****", "%%%%", "[[[[") &
test_response_keys == "z" |
# stim B plus BIGGER plus NOT CHOSEN
stimR %in% c("ZKR", "RSQ", "YNM") &
picRel %in% c("####", "????", "\"") &
test_response_keys == "z"
# if any of the above true, then 1, else 0
, 1, 0))
# just checking the final mutation worked correctly
checking <- mother_df_12 %>% filter(block_code == 1:3) %>%
select(block_code, stimR, picRel, test_response_keys, compound_AbiggerC1_AsmallerC0)
## Here I make a few other dataframes in preparation for other checks, graphs and analyses
# df_2 will have the means for each block (more variables than flattened_df)
# use mother_df_12 if you are after raw individual trial information
# test_response_rt is not in here because you should not average it twice
# test_response_correct also should not be averaged twice so it was removed
# flatten (average or sum) the main dataframe by participant and block
flattened_df <- mother_df_12 %>% group_by(participant, block) %>%
summarise(age = mean(age),
gender = mean(Gender_0F_1M),
block_code = mean(block_code),
block_order_number = mean(block_order_number),
block_attempts_this_block = mean(block_attempts_this_block),
sum_test_correct = sum(Correct1_Incorrect0),
block_affect_rating = mean(block_affect_rating),
block_arousal_rating = mean(block_arousal_rating),
block_sense_making_rating = mean(block_sense_making_rating),
sum_button_pressed = sum(button_press_Z1_M0),
sum_stim_picked = sum(stim_chosen_0_1),
sum_compound_responding = sum(compound_AbiggerC1_AsmallerC0)
)
# make a new variable for block type
df_1 <- flattened_df %>% mutate(block_type_0A_1C = ifelse(block_code == 1:3, 0, 1))
# df_2 is primary df for next few steps
df_2 <- df_1
# note - df_2 is flattened_df but with more variables.
# df_2 contains all participant data, and usually you would remove some outliers etc
# use the later df_limits for all stats where outliers should be removed
## this is where I will remove participants that reached the 10 loop attempts limit
# first make a df of all 10 loop attempters
limit_reached_df <- df_2 %>% filter(block_attempts_this_block == 10)
# these are all the 10 loop attempters
unique(limit_reached_df$participant)
# using a hist you can see that most people reached the limit of attempts on block 1
hist(limit_reached_df$block_order_number)
# remove all participants that went to 10 loop limit attempts
df_limits_removed <- df_2 %>%
subset(subset = !(participant %in% limit_reached_df$participant))
# count how many participants are left
# I know there are 53 participants left, but this could be recoded to be more flexible
# I have simply copied and pasted what I wrote in the console on the fly
n_distinct(df_limits_removed$participant)
gender_count <- df_limits_removed %>% group_by(participant) %>%
summarise(gender_ss = mean(gender),
age_ss = mean(age))
nrow(gender_count)
# count how many males
males <- sum(gender_count$gender_ss)
# count how many females
females <- 53 - males
# get descriptive statistics
mean_age <- mean(gender_count$age_ss)
sd_age <- sd(gender_count$age_ss)
range(gender_count$age_ss)
mean_age
sd_age
# create dataframe for average of all outcomes by using group_by
# correct_percent deleted from here because it was being averaged twice
average_outcomes_df <- df_limits_removed %>% group_by(participant, block_type_0A_1C) %>%
summarise(age = mean(age),
gender_0F_1M = max(gender),
average_block_attempts_this_block = mean(block_attempts_this_block),
average_sum_correct = mean(sum_test_correct),
average_sum_button = mean(sum_button_pressed),
average_sum_stim_chosen = mean(sum_stim_picked),
average_block_affect_rating = mean(block_affect_rating),
average_block_arousal_rating = mean(block_arousal_rating),
average_block_sense_making_rating = mean(block_sense_making_rating),
average_sum_compound = mean(sum_compound_responding)
)
### Data Cleaning and Manipulating Complete
# write the very clean, tidy data file to a csv
write.csv(df_2, "Cleaned Average Data for Experiment One (INSERT DATE).csv")
# be informed that the analysis is complete by the computer speaking this message
system("say -v Daniel Hey sam your data has been cleaned and tidied")
# run a sound to make you feel happy inside regarding your coding abilities
beep(3)
# data cleaning is done, and what I would do next is to start ..
# checking the df for anomalies both by viewing the excel sheet,
# and using commands such as the following
str(df_2) # to view the structure (variable type etc)
summary(df_2) # to get a summary of results (descriptive stats of all variables)
### End of Code