## Alan Nielsen
## June 25, 2017
## Stimlist Creation for Nielsen, Ota, & Dingemanse
## This script sets up trials for the sampled within-subjects design

require(reshape2)
require(plyr)
require(magrittr)
require(lme4)
require(dplyr)
require(tidyr)
require(broom)
require(lattice)
require(doBy)
require(stringr)
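
# Note: plyr and dplyr both define count(). Because dplyr is attached after plyr here, the count(AllTrials, Word)
# calls further down resolve to dplyr::count(), which is what this script relies on (it reads the 'n' column).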

##------------------------------------------------------------------
## Pre-processing:
##------------------------------------------------------------------

RI <- read.csv("F:/Experiments/Collaborations/MitsMark/data/RedupIdeo.csv")           # Read in the Reduplicative ideophones 
NRI <- read.csv("F:/Experiments/Collaborations/MitsMark/data/NonRedupIdeo.csv")       # Read in the Non-Reduplicative ideophones
NAdj <- read.csv("F:/Experiments/Collaborations/MitsMark/data/NormAdjs.csv")          # Read in normal adjectives
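# Each of these csvs is expected to contain (at least) a 'Japanese' column holding the word form and an
# 'Engl' column holding its English meaning; createsample() below renames these to 'Word' and 'TrueMeaning'.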


################################################################################################
## Define all of the functions

#################################################################################################

# This function does the sampling: it is called once per seed and returns the sampled word list

createsample <- function(seedval) {              
  
  set.seed(seedval)   # Setting the seed yokes participants to each other: all participants generated under the
                      # same seed share the same sampled word list (e.g. 25 seeds x 20 participants per seed in
                      # the writestimuli(25, 20) call at the bottom of this script)
  
  RI <- RI[sample(nrow(RI)),]                         # Scramble the order of the Reduplicated Ideophones
  RISplit <- split(RI, rep(1:4, each= 6))             # Split the shuffled dataframe into consecutive groups of six rows (labelled '1'-'4')
  RIWords <- RISplit$'1'                              # use the first six RI as plain RI
  SRIWords <- RISplit$'2'                             # use the next six RI as SRI (cut in half below)
  SRIWords %<>%  mutate(Japanese = str_sub(Japanese, start=nchar(as.character(Japanese))/2 + 1))  # keep only the second half of each reduplicated form
  
  NRI <- NRI[sample(nrow(NRI)),]                       # Scramble the order of the Non-Reduplicated Ideophones
  NRISplit <- split(NRI, rep(1:4, each= 6))            # Split the shuffled dataframe into consecutive groups of six rows (labelled '1'-'4')
  NRIWords <- NRISplit$'1'                             # use the first six NRI as plain NRI
  RNRIWords <- NRISplit$'2'                            # use the next six NRI as RNRI (reduplicated below)
  RNRIWords$Word2 <- RNRIWords$Japanese                # copy the form...
  RNRIWords <- unite(RNRIWords, Japanese, Japanese, Word2, sep = "", remove = TRUE)   # ...and paste the two copies together to reduplicate it
  
  NAdj <- NAdj[sample(nrow(NAdj)),]                    # Scramble the order of the Normal Adjectives
  NAdjSplit <- split(NAdj, rep(1:4, each= 6))          # Split the shuffled dataframe into consecutive groups of six rows (labelled '1'-'4')
  NAdjWords <- NAdjSplit$'1'                           # use the first six adjectives as plain NAdj
  RNAdjWords <- NAdjSplit$'2'                          # use the next six adjectives as RNAdj (reduplicated below)
  RNAdjWords$Word2 <- RNAdjWords$Japanese              # copy the form...
  RNAdjWords <- unite(RNAdjWords, Japanese, Japanese, Word2, sep = "", remove = TRUE)  # ...and paste the two copies together to reduplicate it
  
  RIWords$WordType <- "RI"      # tag each subset with its word type
  SRIWords$WordType <- "SRI"
  NRIWords$WordType <- "NRI"
  RNRIWords$WordType <- "RNRI"
  NAdjWords$WordType <- "NAdj"
  RNAdjWords$WordType <- "RNAdj"
  
  sampleX <- rbind(RIWords, SRIWords, NRIWords, RNRIWords, NAdjWords, RNAdjWords)
  names(sampleX)[names(sampleX) == 'Japanese'] <- 'Word'                    # rename 'Japanese' to 'Word'
  names(sampleX)[names(sampleX) == 'Engl'] <- 'TrueMeaning'                 # rename 'Engl' to 'TrueMeaning'
  sampleX <- subset(sampleX, select = c(Word, TrueMeaning, WordType))       # keep only the columns we need

  return(sampleX)
}
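
# For example (illustrative seed value), createsample(42) returns a data frame with one row per sampled word
# and the columns Word, TrueMeaning, and WordType (one of RI, SRI, NRI, RNRI, NAdj, RNAdj).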


##################################################################

createtraining <- function(sampleY) {
  
  ####################################################################
  ## Set up Training trials
  rm(.Random.seed, envir=globalenv())           # remove the saved RNG state so the next sample() call re-seeds itself randomly: training order is not yoked across participants
  
  Training_FullWithin <- sampleY                                                          # Copy the sampled word list
  TrainingFullWithin1 <- Training_FullWithin[sample(nrow(Training_FullWithin)),]          # Scramble the list once
  TrainingFullWithin2 <- Training_FullWithin[sample(nrow(Training_FullWithin)),]          # Scramble the list a second time
  Training <- rbind(TrainingFullWithin1, TrainingFullWithin2)                             # Stack the two scrambled lists: in training every word is seen twice, once per block
  
  return(Training)
  
}


createtesting <- function(sampleY) {
  
  ## Set up Testing trials  
  
  rm(.Random.seed, envir=globalenv())           # remove the saved RNG state so the next sample() call re-seeds itself randomly: testing order is not yoked across participants
  
  TestingCorrect <- sampleY                                                             # Target trials: each word is paired with its true meaning
  TestingCorrect$TrialType <- "T"
  TestingCorrect$TargetMeaning <- TestingCorrect$TrueMeaning
  TestingCorrect$LegalTrial <- "Y"                                                      # all target trials are legal by definition
  
  
  TestingIncorrect <- transform(sampleY, TargetMeaning = sample(TrueMeaning))           # Distractor trials: meanings are shuffled across the words
  TestingIncorrect$TrialType <- "D"
  TestingIncorrect$LegalTrial <- ifelse(TestingIncorrect$TargetMeaning == TestingIncorrect$TrueMeaning, "N", "Y")      # a distractor trial is illegal if the shuffle happened to re-pair a word with its own true meaning
  
  Testing <- rbind(TestingCorrect, TestingIncorrect)
  Testing <- Testing[sample(nrow(Testing)),]
  
  TestCase <- length(unique(Testing$LegalTrial))  # number of distinct values in LegalTrial: 1 if every trial is legal ("Y" only),
                                                  # 2 if any illegal trials ("N") slipped in
  
  if(TestCase == 1) {
    return(Testing)                       # if all trials are legal, return the testing list
  } else {
    return(createtesting(sampleY))        # otherwise re-draw the distractor pairings by calling the function again
  }
  
  
}
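
# For example (illustrative seed value), createtesting(createsample(42)) returns two rows per sampled word in
# scrambled order: one target trial (TrialType "T", TargetMeaning equal to TrueMeaning) and one distractor
# trial (TrialType "D", TargetMeaning taken from a different word).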



############################################################################################################################
## Here is the function that actually writes out all of the stimulus csvs.
# Don't worry about the warnings that get kicked up - they come from the split() calls in createsample and are expected.

writestimuli <- function(seeds, perseed) {
  # Loop over seeds: each seed defines one sampled word list, shared by 'perseed' yoked participants
  for (i in 1:seeds) {
    seednum <- i
    
    seedval <- sample(1:10000, 1)      # draw a random seed value for this sample
    
    sampleY <- createsample(seedval)
    
    for (j in 1:perseed) {
      
      # Each yoked participant gets their own training and testing order. Files are written into the data
      # folder with a 'stimlists' filename prefix, e.g. stimlists1-1-Training.csv for seed 1, participant 1.
      training <- createtraining(sampleY)
      training <- cbind(Id=paste(seednum, j, sep="-"), training)
      write.table(training, paste("F:/Experiments/Collaborations/MitsMark/data/stimlists",seednum, "-", j, "-Training", ".csv", sep = ''),
                  sep = "\t", row.names = FALSE)
      
      testing <- createtesting(sampleY)
      testing <- cbind(Id=paste(seednum, j, sep="-"), testing)
      write.table(testing, paste("F:/Experiments/Collaborations/MitsMark/data/stimlists",seednum, "-", j, "-Testing", ".csv", sep = ''),
                  sep = "\t", row.names = FALSE)
    }
  }
}



#################################################################################################################################
# Now we write a function that calls writestimuli. We give this function a target standard deviation for how
# similar the counts of the sampled words have to be, and it keeps calling writestimuli until it settles on a
# set of lists that is sampled evenly enough.


#### I want to feed this function a targetSD and a maximum number of tries to reach that targetSD (so it
#### doesn't just hang the computer forever). I tried to implement this a few different ways but couldn't get
#### it to work properly; a corrected sketch follows the commented-out version.

# SelectBalanced <- function(TargetSD, MaxTries, TryNum) { 
#   
#   writestimuli(25,20)
#   
#   files  <- list.files(pattern = '\\-1-Testing.csv')          # Read one testing file from each seed into a dataframe
#   tables <- lapply(files, read.csv, header = TRUE, sep = '\t')
#   AllTrials <- do.call(rbind , tables)
# 
#   AllTrials$Word <- as.factor(AllTrials$Word)
# 
#   WordCounts <- count(AllTrials, Word)
#   
#   SD = sd(WordCounts$n)
#   
#   if(TryNum < MaxTries){
#     if(SD == 0){
#       print("Complete")
#     } else {
#         print(SD)
#         TryNum <- TryNum + 1
#         return(TryNum)
#         SelectBalanced(0, 10, TryNum)
#     }
#   } else {
#       print("Failed to Reach TargetSD")
#   }
#   
# 
# }
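
## Below is a minimal working sketch of what SelectBalanced appears to be aiming for. The main thing stopping
## the commented-out version above is that return(TryNum) exits the function before the recursive call is ever
## reached, so no retry happens. The sketch uses a simple for loop bounded by MaxTries instead of recursion.
## The function name, the datapath default, and returning the word counts are my additions, not part of the
## original script.

SelectBalancedSketch <- function(TargetSD, MaxTries,
                                 datapath = "F:/Experiments/Collaborations/MitsMark/data") {
  for (TryNum in 1:MaxTries) {
    writestimuli(25, 20)                                       # regenerate the full set of stimulus lists
    
    # Read participant 1's testing file from each seed and count how often each word was sampled
    files  <- list.files(path = datapath, pattern = '-1-Testing\\.csv$', full.names = TRUE)
    tables <- lapply(files, read.csv, header = TRUE, sep = '\t')
    AllTrials <- do.call(rbind, tables)
    WordCounts <- count(AllTrials, Word)
    
    SD <- sd(WordCounts$n)
    print(paste("Try", TryNum, "- SD of word counts:", round(SD, 3)))
    
    if (SD <= TargetSD) {                                      # counts are even enough: stop and keep this set of lists
      print("Complete")
      return(invisible(WordCounts))
    }
  }
  print("Failed to Reach TargetSD")                            # ran out of tries without reaching the target
}

# e.g. SelectBalancedSketch(TargetSD = 0, MaxTries = 10)       # (not run; parameters are illustrative)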


######## Here is a version without the max-tries logic: it simply keeps regenerating the lists until every
######## word has been sampled at least MinNum times, so in principle it can run for a long time.

SelectBalanced2 <- function(MinNum, seeds, perseed) { 
  
  writestimuli(seeds,perseed)
  
  # Read participant 1's testing file from each seed into one dataframe (all participants under a seed share
  # the same sample, so one file per seed is enough). Point list.files at the data folder that writestimuli
  # writes to; the original relied on the working directory already being set there.
  files  <- list.files(path = "F:/Experiments/Collaborations/MitsMark/data",
                       pattern = '-1-Testing\\.csv$', full.names = TRUE)
  tables <- lapply(files, read.csv, header = TRUE, sep = '\t')
  AllTrials <- do.call(rbind, tables)
  
  AllTrials$Word <- as.factor(AllTrials$Word)
  
  WordCounts <- count(AllTrials, Word)    # dplyr::count - how many times each word appears in the files read above
  MaxN <- max(WordCounts$n)
  MinN <- min(WordCounts$n)
  SD <- sd(WordCounts$n)
  Diff <- MaxN - MinN
  
  if(MinN >= MinNum){
      print(paste(MaxN, MinN, Diff, SD))
      print("Complete")                           # every word appears at least MinNum times: keep this set of lists
    } else {
      print(paste(MaxN, MinN, Diff, SD))
      SelectBalanced2(MinNum, seeds, perseed)     # not balanced enough: regenerate everything (overwriting the files) and check again
    }
  
}
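
# Example call (not run; parameters are illustrative): keep regenerating the full set of lists until every
# word appears at least 15 times across the participant-1 testing files.
# SelectBalanced2(MinNum = 15, seeds = 25, perseed = 20)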


######################
# Generate the full set of stimulus lists: 25 seeds (sampled word lists), 20 yoked participants per seed


writestimuli(25, 20)