# sample code
# version 1
  # 4/7/2021
# data: CC data
# authors
  # heather perkins

###############################################################################
# notes -------------------------------------------------------------------
# useful code from office hours
# ctrl-shift-c - create commented out line (like this one)
# ctrl-shift-r - create section header
# ctrl-alt-b - run all code before this point

# code to create new csv file
# write.csv(data, file="cleaned_data1.csv", row.names = F)

# code to create new ids
# data_pre$new_id <- 1:nrow(data_pre)
# data_post$new_id <- 91:170

# create a new column for each dataframe before merging
# data_pre$timepoint <- "pre"

###############################################################################
# 1. basic starting tasks -------------------------------------------------

# 1.1 load libraries ------------------------------------------------------
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(tidyr)


# 1.2 import data -------------------------------------------------------------
# read the pre and post data files; the literal string "NA", empty cells, and
# single-space cells are all read in as missing values
data_pre <- read.csv(
  file = "Data/final_pre.csv",
  header = TRUE,
  na.strings = c("NA", "", " ")
)
data_post <- read.csv(
  file = "Data/final_post.csv",
  header = TRUE,
  na.strings = c("NA", "", " ")
)

###############################################################################

###############################################################################
# 2. create factor scores -------------------------------------------------

# item columns that make up each scale, selected by name so the scores do not
# depend on the order of the columns in the csv files
salg_items <- paste0("salg", 1:8)
vbs_items <- paste0("vbs", 1:15)

names(data_pre)
##  [1] "salg1"        "salg2"        "salg3"        "salg4"        "salg5"       
##  [6] "salg6"        "salg7"        "salg8"        "vbs1"         "vbs2"        
## [11] "vbs3"         "vbs4"         "vbs5"         "vbs6"         "vbs7"        
## [16] "vbs8"         "vbs9"         "vbs10"        "vbs11"        "vbs12"       
## [21] "vbs13"        "vbs14"        "vbs15"        "write_gender" "id"
# scale score = mean of the scale's items; with na.rm = FALSE a row that is
# missing any item gets NA for that scale
data_pre$salg <- rowMeans(data_pre[salg_items], na.rm = FALSE)
data_pre$vbs <- rowMeans(data_pre[vbs_items], na.rm = FALSE)

names(data_post)
##  [1] "salg1"        "salg2"        "salg3"        "salg4"        "salg5"       
##  [6] "salg6"        "salg7"        "salg8"        "vbs1"         "vbs2"        
## [11] "vbs3"         "vbs4"         "vbs5"         "vbs6"         "vbs7"        
## [16] "vbs8"         "vbs9"         "vbs10"        "vbs11"        "vbs12"       
## [21] "vbs13"        "vbs14"        "vbs15"        "write_gender" "id"
data_post$salg <- rowMeans(data_post[salg_items], na.rm = FALSE)
data_post$vbs <- rowMeans(data_post[vbs_items], na.rm = FALSE)

# keep only the id, the written gender response, and the two scale scores
data_pre2 <- subset(data_pre, select = c(id, write_gender, salg, vbs))
data_post2 <- subset(data_post, select = c(id, write_gender, salg, vbs))

###############################################################################

###############################################################################
# 3. merging pre and post -------------------------------------------------

# 3.1 append column names -------------------------------------------------
names(data_pre2)
## [1] "id"           "write_gender" "salg"         "vbs"
# tag every column except the merge key ("id") with its timepoint so the pre
# and post versions of each variable stay distinct after merging
pre_cols <- names(data_pre2) != "id"
pre_names <- paste0(names(data_pre2)[pre_cols], "_pre")
names(data_pre2)[pre_cols] <- pre_names

post_cols <- names(data_post2) != "id"
post_names <- paste0(names(data_post2)[post_cols], "_post")
names(data_post2)[post_cols] <- post_names

# 3.2 merge dataframes ----------------------------------------------------
# only ids that appear in both the pre and the post data (matched pairs)
data_final_matched <- merge(data_pre2, data_post2, by = "id")
# every id from either data set; columns from the missing timepoint are NA
data_final_all <- merge(data_pre2, data_post2, by = "id", all = TRUE)

###############################################################################

###############################################################################
# 4. create non-repeating variable ----------------------------------------

# pick one gender response per row: the post-test answer when it is present,
# otherwise the pre-test answer (stays NA when both are missing)
use_pre <- is.na(data_final_all$write_gender_post)
gender <- data_final_all$write_gender_post
gender[use_pre] <- data_final_all$write_gender_pre[use_pre]

# normalise the written answers (lower-case, then strip leading/trailing
# whitespace) and list the distinct spellings in order of first appearance
gender2 <- tolower(gender)
gender3 <- trimws(gender2)
gender_write <- unique(gender3)

# recode each distinct spelling by its position in gender_write; position 3
# and any position past 7 are left as NA
# NOTE(review): these positions depend on the order in which spellings first
# appear in the data -- re-check print(gender_write) whenever the data change
gender_codes <- c("F", "M", NA, "NB", "F", "F", "F")
# incomparables = NA keeps missing responses unmatched, so they stay NA
gender_position <- match(gender3, gender_write, incomparables = NA)
data_final_all$gender_rc <- gender_codes[gender_position]

# inspect the merged data, then drop the two written gender columns and keep
# everything else (including the recoded gender_rc)
head(data_final_all)
##   id write_gender_pre salg_pre  vbs_pre write_gender_post salg_post vbs_post
## 1  1           Female    2.750 3.866667            Female     2.625 3.800000
## 2  2           Female    2.375 3.533333           female      2.250 3.133333
## 3  3          Female     2.250 3.866667            Female     3.625 3.933333
## 4  4           female    4.250 4.066667            Female     4.250 3.933333
## 5  5          Female     3.750 3.600000           female      3.500 3.933333
## 6  6           Female    4.250 3.266667            Female     5.000 4.333333
##   gender_rc
## 1         F
## 2         F
## 3         F
## 4         F
## 5         F
## 6         F
raw_gender_cols <- c("write_gender_pre", "write_gender_post")
keep_cols <- !(names(data_final_all) %in% raw_gender_cols)
data_final_all2 <- data_final_all[, keep_cols, drop = FALSE]

###############################################################################

###############################################################################
# 5. switching between long/wide formats ----------------------------------

# 5.1 switching from wide to long -----------------------------------------
head(data_final_all2)
##   id salg_pre  vbs_pre salg_post vbs_post gender_rc
## 1  1    2.750 3.866667     2.625 3.800000         F
## 2  2    2.375 3.533333     2.250 3.133333         F
## 3  3    2.250 3.866667     3.625 3.933333         F
## 4  4    4.250 4.066667     4.250 3.933333         F
## 5  5    3.750 3.600000     3.500 3.933333         F
## 6  6    4.250 3.266667     5.000 4.333333         F
# stack the score columns from salg_pre through vbs_post into two columns:
# "variable" holds the old column name and "value" holds the score.
# factor_key = TRUE stores "variable" as a factor whose levels keep the
# original column order.
# NOTE(review): gather() is superseded by pivot_longer() in current tidyr; it
# is kept here so the row order and printed output below stay as shown
data_final_all_long <- gather(
  data_final_all2,
  key = "variable",
  value = "value",
  salg_pre:vbs_post,
  factor_key = TRUE
)
head(data_final_all_long)
##   id gender_rc variable value
## 1  1         F salg_pre 2.750
## 2  2         F salg_pre 2.375
## 3  3         F salg_pre 2.250
## 4  4         F salg_pre 4.250
## 5  5         F salg_pre 3.750
## 6  6         F salg_pre 4.250
# 5.2 switching from long to wide -----------------------------------------
# spread the "variable"/"value" pairs back out so that each distinct entry of
# "variable" becomes its own column again
data_final_all_wide <- data_final_all_long %>%
  spread(key = variable, value = value)
head(data_final_all_wide)
##   id gender_rc salg_pre  vbs_pre salg_post vbs_post
## 1  1         F    2.750 3.866667     2.625 3.800000
## 2  2         F    2.375 3.533333     2.250 3.133333
## 3  3         F    2.250 3.866667     3.625 3.933333
## 4  4         F    4.250 4.066667     4.250 3.933333
## 5  5         F    3.750 3.600000     3.500 3.933333
## 6  6         F    4.250 3.266667     5.000 4.333333