library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.0 ──
## ✓ ggplot2 3.3.3     ✓ purrr   0.3.4
## ✓ tibble  3.1.0     ✓ dplyr   1.0.5
## ✓ tidyr   1.1.1     ✓ stringr 1.4.0
## ✓ readr   1.3.1     ✓ forcats 0.4.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(here)
## here() starts at /Users/caoanjie/Desktop/projects/CCRR_kids

set up path

# Raw-data directories for each culture, resolved against the project root by
# here(). The trailing "/" is kept because file names are appended to these
# strings with str_c() further down.
US_PATH <- here("data/1_raw_data/US/")
CN_PATH <- here("data/1_raw_data/CN/")

# Location of the combined CSV that write_csv() produces after the US and CN
# data are bound together.
MERGED_DATA_PATH <- here("data/2_merge_data/merged_data.csv")

read in files

First, we count how many rows are in each data file.

# Build the full path of every raw CSV in each culture's directory.
# `pattern` is a regular expression, not a shell glob: "\\.csv$" keeps only
# names that end in ".csv", whereas the glob-style "*.csv" was unanchored and
# would also have picked up names such as "data.csv.bak".
# US_PATH / CN_PATH are already absolute (built with here()) and end in "/",
# so they are passed to dir() directly and the file names are appended as-is.
us_files <- str_c(US_PATH, dir(US_PATH, pattern = "\\.csv$"))
cn_files <- str_c(CN_PATH, dir(CN_PATH, pattern = "\\.csv$"))


# Row tally for the US raw files: each file is read once and reduced to a
# single row holding its row count (`n`) and the path it came from
# (`file_name`); map_df() stacks those rows into one table.
us_data_RAW <- map_df(us_files, function(csv_path) {
  row_tally <- count(read_csv(csv_path))
  mutate(row_tally, file_name = csv_path)
})


# Row tally for the CN raw files: each file is read once and reduced to a
# single row holding its row count (`n`) and the path it came from
# (`file_name`); map_df() stacks those rows into one table.
cn_data_RAW <- map_df(cn_files, function(csv_path) {
  row_tally <- count(read_csv(csv_path))
  mutate(row_tally, file_name = csv_path)
})

# TBA: check if we are just trying it out
# Row-count cutoff: only files with more than MIN_ROW rows are read in below
# (presumably to skip aborted or trial sessions -- TODO confirm).

MIN_ROW <- 50

# Read in full every US file whose row count exceeds MIN_ROW. The `unique`
# column is coerced to character in each file before the rows are stacked
# (presumably so files with differently inferred types still bind -- confirm),
# and every row is then labelled with culture "US".
us_data <- us_data_RAW %>%
  filter(n > MIN_ROW) %>%
  pull(file_name) %>%
  map_df(function(csv_path) {
    mutate(read_csv(csv_path), unique = as.character(unique))
  }) %>%
  mutate(culture = "US")


# Read in full every CN file whose row count exceeds MIN_ROW. The `unique`
# column is coerced to character in each file before the rows are stacked
# (presumably so files with differently inferred types still bind -- confirm),
# and every row is then labelled with culture "CN".
cn_data <- cn_data_RAW %>%
  filter(n > MIN_ROW) %>%
  pull(file_name) %>%
  map_df(function(csv_path) {
    mutate(read_csv(csv_path), unique = as.character(unique))
  }) %>%
  mutate(culture = "CN")

# Number of distinct participants in a data frame.
#   df: a data frame with a `subject` column.
# Returns the one-column table (`n`) that count() produces after the rows
# have been reduced to the distinct values of `subject`.
fun.count_s <- function(df) {
  unique_subjects <- distinct(df, subject)
  count(unique_subjects)
}

# Print the number of distinct subjects in the CN data; the matching US call
# is currently commented out.
#fun.count_s(us_data)
fun.count_s(cn_data)
## # A tibble: 1 x 1
##       n
##   <int>
## 1    11

merge data together

# Stack the US rows and the CN rows into a single table.
merged_data <- us_data %>% bind_rows(cn_data)

# Save the combined table to MERGED_DATA_PATH.
# NOTE: the file currently contains voice info, which is identifiable.
write_csv(merged_data, MERGED_DATA_PATH)

If no raw data are available, start by reading the merged data.

#merged_data %>% 
#  filter(culture == "CN")

NOT DOING ANY EXCLUSION RIGHT NOW

source(here("preprocessing/extract/extract_demog.R"))
## 
## Attaching package: 'jsonlite'
## The following object is masked from 'package:purrr':
## 
##     flatten
## buggy, does not work, issue with the jsonstring!
#extract_demog(merged_data, "US")
#extract_demog(merged_data, "CN")

free description (audio)

This is very slow! It needs to be optimized or modularized into a separate Rmd.

# Audio rows only: CN participants with a non-missing `audio_data` value,
# reduced to the three columns needed to write the recordings out.
ad_only <- merged_data %>%
  filter(culture == "CN", !is.na(audio_data)) %>%
  select(culture, subject, audio_data)
library(base64enc)


# Decode one base64-encoded recording and write the bytes to
# "<path_name>.wav".
#
#   audio_code: base64 string holding the recording (decoded with
#               base64decode() from base64enc). Assumes the decoded bytes are
#               already WAV-encoded -- TODO confirm.
#   path_name:  output path WITHOUT extension; ".wav" is appended here.
#
# Called for its side effect; returns NULL invisibly.
write_to_audio <- function(audio_code, path_name){
  audio <- base64decode(audio_code)
  wav_con <- file(paste0(path_name, ".wav"), "wb")
  # Register the close right after opening so the connection is released even
  # if writeBin() errors; previously a failed write left it open.
  on.exit(close(wav_con), add = TRUE)
  writeBin(audio, wav_con)
  invisible(NULL)
}

# Write every recording in `df` to its own .wav file in the project root.
#
#   df: data frame with `subject` and `audio_data` (base64 string) columns,
#       one row per recording.
#
# Each file is named "<audio_id><subject>.wav", where audio_id is the
# recording's running number within its subject (1, 2, ... in row order).
# Called for its side effect; returns NULL invisibly.
convert_all_audio_file <- function(df){

  # Number the recordings within each subject, then drop the grouping so the
  # column extraction below works on an ordinary tibble.
  df_id <- df %>%
    group_by(subject) %>%
    mutate(audio_id = row_number()) %>%
    ungroup()

  # seq_len() gives an empty sequence for a 0-row input; 1:nrow(df) would
  # iterate over c(1, 0) and fail on the first subscript.
  for (i in seq_len(nrow(df_id))){
    path_name <- here(paste0(df_id$audio_id[[i]], df_id$subject[[i]]))
    audio_code <- df_id$audio_data[[i]]
    write_to_audio(audio_code, path_name)
  }

  invisible(NULL)
}

convert_all_audio_file(ad_only)
# Load the RMTS task script (expected to define get_RMTS_main -- confirm in
# preprocessing/task/RMTS.R), then print what it extracts from the merged
# data.
source(here("preprocessing/task/RMTS.R"))

get_RMTS_main(merged_data)
## # A tibble: 48 x 7
##    subject         culture task_name task_info trial_info resp_type     resp
##    <chr>           <chr>   <chr>     <chr>     <chr>      <chr>        <dbl>
##  1 SS1623330510838 US      RMTS      RMTS      RMTS       choice_match     1
##  2 SS1623330510838 US      RMTS      RMTS      RMTS       choice_match     1
##  3 SS1623330510838 US      RMTS      RMTS      RMTS       choice_match     1
##  4 SS1623330510838 US      RMTS      RMTS      RMTS       choice_match     1
##  5 SS1625633849073 CN      RMTS      RMTS      RMTS       choice_match     0
##  6 SS1625633849073 CN      RMTS      RMTS      RMTS       choice_match     0
##  7 SS1625633849073 CN      RMTS      RMTS      RMTS       choice_match     0
##  8 SS1625633849073 CN      RMTS      RMTS      RMTS       choice_match     0
##  9 SS1626246567529 CN      RMTS      RMTS      RMTS       choice_match     0
## 10 SS1626246567529 CN      RMTS      RMTS      RMTS       choice_match     0
## # … with 38 more rows

pen choice

# Load the conformity-preference task script (expected to define get_CP_main
# -- confirm in preprocessing/task/conformity_preference.R), then print what
# it extracts from the merged data.
source(here("preprocessing/task/conformity_preference.R"))

get_CP_main(merged_data)
## # A tibble: 15 x 7
##    subject         culture task_name task_info trial_info resp_type  resp
##    <chr>           <chr>   <chr>     <chr>     <chr>      <chr>     <dbl>
##  1 SS1619115057503 US      CP        CP        CP         choice        0
##  2 SS1620060991140 US      CP        CP        CP         choice        1
##  3 SS1622239825527 US      CP        CP        CP         choice        1
##  4 SS1623330510838 US      CP        CP        CP         choice        0
##  5 SS1625633849073 CN      CP        CP        CP         choice        1
##  6 SS1626246567529 CN      CP        CP        CP         choice        1
##  7 SS1626248030076 CN      CP        CP        CP         choice        0
##  8 SS1626270942346 CN      CP        CP        CP         choice        1
##  9 SS1626318133574 CN      CP        CP        CP         choice        1
## 10 SS1626318241504 CN      CP        CP        CP         choice        0
## 11 SS1626205419081 CN      CP        CP        CP         choice        0
## 12 SS1626350796915 CN      CP        CP        CP         choice        1
## 13 SS1626436894129 CN      CP        CP        CP         choice        1
## 14 SS1626472640251 CN      CP        CP        CP         choice        1
## 15 SS1626483615684 CN      CP        CP        CP         choice        0