We started with a really messy dataset with issues below:
1. raw data has been compromised by manually adding entries at the bottom of each file
2. after binding the .csv files together, every row had 6 NA columns; this was due to inconsistent columns across the .csv files before binding
3. column entries were highly inconsistent in terms of casing or style (e.g., Abb vs. abb vs. #_Abb ; numeric vs. factor), thus many unintended levels were created
4. relevant variable values are spread across 8 different columns instead of 2, so we need to align these values into common columns
5. a variable name was changed halfway through data collection, so 2 columns were created for the same variable, each filled where the other is empty
We will conquer these issues one by one and return a clean dataframe for further analyses.
library(data.table)
require(plyr)
library(knitr)
library(ggplot2)
library(dplyr)
library(Hmisc)
# Compile all csv files found in the working directory into one data frame.
# NOTE(review): the raw files carry inadvertent manual entries in their last
# 3 rows (irrelevant to the current analyses). The code that dropped those rows
# is commented out below, so they are currently kept -- confirm this is
# intended.
# `pattern` is a regular expression, not a glob: match names ending in ".csv".
csv <- list.files(full.names = TRUE, pattern = "\\.csv$")
if (length(csv) == 0) {
  stop("no .csv files found in the working directory", call. = FALSE)
}
# data <- rbind.fill(lapply(csv, read_data <- function(z) {
# dat <- fread(z)
# return(dat[1:(nrow(dat) - 3), ])
# }))
# rbind.fill() fills columns that are missing from a file with NA
data <- as_tibble(rbind.fill(lapply(csv, fread)))
# Keep only the identifier, demographic, block-counter and response columns
df <- data[, c(
  "Participant", "selectCondition", "stimfile",
  "Gender", "Age", "Discipline", "Field of Study",
  "Letters_RB_Trials.thisRepN", "Tapping_II_Trials.thisRepN",
  "Letters_II_Trials.thisRepN", "Tapping_RB_Trials.thisRepN",
  "Image_Response_B1.rt", "Image_Response_B1.corr",
  "Image_Response_B2.rt", "Image_Response_B2.corr",
  "Image_Response_B3.rt", "Image_Response_B3.corr",
  "Image_Response_B4.rt", "Image_Response_B4.corr"
)]
# Upper-case every character column so entries that differ only in letter case
# collapse to one value; non-character columns pass through untouched.
# Rebuilding with data.frame() also makes the column names syntactic
# ("Field of Study" becomes "Field.of.Study"), hence the rename afterwards.
df <- data.frame(lapply(df, function(column) {
  if (!is.character(column)) {
    return(column)
  }
  toupper(column)
}))
names(df)[names(df) == "Field.of.Study"] <- "Field"
# Columns treated as categorical; each one is converted to a factor
factor_cols <- c(
  "Participant", "selectCondition", "stimfile", "Gender",
  "Letters_RB_Trials.thisRepN", "Tapping_II_Trials.thisRepN",
  "Letters_II_Trials.thisRepN", "Tapping_RB_Trials.thisRepN",
  "Image_Response_B1.corr", "Image_Response_B2.corr",
  "Image_Response_B3.corr", "Image_Response_B4.corr"
)
df[factor_cols] <- lapply(df[factor_cols], factor)
# Relabel the "MAN" gender level as "MALE"
levels(df$Gender)[levels(df$Gender) == "MAN"] <- "MALE"
# A name of unknown origin appears in the Participant column; recode it as P100
levels(df$Participant)[levels(df$Participant) == "OLIVIA KEENAN"] <- "P100"
# describe(df)
# Align the per-condition columns into common columns. Each row takes its
# accuracy (corr), reaction time (rt) and block counter from the column that
# belongs to its selectCondition ("1" to "4"); rows whose condition is missing
# or outside 1-4 get NA.
#
# ifelse() drops attributes, so handing it a factor returns the internal
# integer level codes (e.g. 1/2 for levels "0"/"1") instead of the recorded
# values. Factors are therefore mapped back to their labels before selection.

# Return the recorded values of a column; factors are converted via their
# labels, anything else is returned unchanged.
factor_values <- function(v) {
  if (is.factor(v)) type.convert(as.character(v), as.is = TRUE) else v
}

# Pick, for every row, the value from the k-th candidate column where
# `condition` equals k. Candidates are supplied in condition order (1, 2, ...).
pick_by_condition <- function(condition, ...) {
  candidates <- lapply(list(...), factor_values)
  out <- rep(NA, length(condition))
  for (k in seq_along(candidates)) {
    out <- ifelse(condition == as.character(k), candidates[[k]], out)
  }
  out
}

df$corr <- pick_by_condition(
  df$selectCondition,
  df$Image_Response_B1.corr,
  df$Image_Response_B2.corr,
  df$Image_Response_B3.corr,
  df$Image_Response_B4.corr
)
df$rt <- pick_by_condition(
  df$selectCondition,
  df$Image_Response_B1.rt,
  df$Image_Response_B2.rt,
  df$Image_Response_B3.rt,
  df$Image_Response_B4.rt
)
# Condition order for the block counters: 1 = Letters_RB, 2 = Letters_II,
# 3 = Tapping_RB, 4 = Tapping_II
df$block <- pick_by_condition(
  df$selectCondition,
  df$Letters_RB_Trials.thisRepN,
  df$Letters_II_Trials.thisRepN,
  df$Tapping_RB_Trials.thisRepN,
  df$Tapping_II_Trials.thisRepN
)
# Work on a copy of df. stringsAsFactors = FALSE only stops character columns
# from being turned into factors; columns that already are factors stay so.
df1 <- data.frame(df, stringsAsFactors = FALSE)
# coalesce() is fed character vectors, so convert both source columns first
df1[["Discipline"]] <- as.character(df1[["Discipline"]])
df1[["Field"]] <- as.character(df1[["Field"]])
# program takes Discipline where it is present and falls back to Field
df1[["program"]] <- coalesce(df1[["Discipline"]], df1[["Field"]])
# Final tidy table: identifiers, demographics and the aligned measures
df2 <- df1[, c(
  "Participant", "selectCondition", "stimfile",
  "Gender", "Age", "program",
  "corr", "rt", "block"
)]
kable(head(df2))
| Participant | selectCondition | stimfile | Gender | Age | program | corr | rt | block |
|---|---|---|---|---|---|---|---|---|
| P1 | 1 | BSD19.JPG | FEMALE | 18 | MOS | 1 | 107.430366 | 1 |
| P1 | 1 | ASD33.JPG | FEMALE | 18 | MOS | 1 | 8.281572 | 1 |
| P1 | 1 | BSD15.JPG | FEMALE | 18 | MOS | 2 | 17.777941 | 1 |
| P1 | 1 | BSD3.JPG | FEMALE | 18 | MOS | 1 | 2.517104 | 1 |
| P1 | 1 | ASD35.JPG | FEMALE | 18 | MOS | 1 | 1.716698 | 1 |
| P1 | 1 | ASD2.JPG | FEMALE | 18 | MOS | 2 | 11.414126 | 1 |