Data analysis of basic parenting values/attitudes survey.

1 Data preprocessing

Preliminaries.

## [1] "dplyr"   "langcog" "tidyr"   "ggplot2" "lme4"
## 
## Attaching package: 'langcog'
## 
## The following object is masked from 'package:base':
## 
##     scale
## 
## 
## Attaching package: 'dplyr'
## 
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## 
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
## 
## 
## Attaching package: 'ggplot2'
## 
## The following object is masked from 'package:psych':
## 
##     %+%
## 
## Loading required package: MASS
## 
## Attaching package: 'MASS'
## 
## The following object is masked from 'package:dplyr':
## 
##     select
## 
## Loading required package: boot
## 
## Attaching package: 'boot'
## 
## The following object is masked from 'package:psych':
## 
##     logit
## 
## Loading required package: lattice
## 
## Attaching package: 'lattice'
## 
## The following object is masked from 'package:boot':
## 
##     melanoma
## 
## 
## Attaching package: 'nFactors'
## 
## The following object is masked from 'package:lattice':
## 
##     parallel
## 
## Loading required package: Matrix
## 
## Attaching package: 'Matrix'
## 
## The following object is masked from 'package:tidyr':
## 
##     expand

Read in files and consolidate to the same directory.

files <- dir("../production-results/uptake_e2/")
d.raw <- data.frame()

for (f in files) {
  jf <- paste("../production-results/uptake_e2/",f,sep="")
  jd <- fromJSON(paste(readLines(jf), collapse=""))
  
  # clean up different tasks
  trial_type <- jd$answer$data$trial_type
  
  #parenting questionnaire 
  sent <- jd$answers$data$sentence[trial_type != "uptake"]
  rating <- as.numeric(jd$answers$data$rating[trial_type != "uptake"]) 
  
  #uptake experiment
  control_recall_1 = as.character(jd$answers$data$answer[1])
  control_recall_2 = as.character(jd$answers$data$answer[2])
  control_recall_3 = as.character(jd$answers$data$answer[3])
  control_recall_4 = as.character(jd$answers$data$answer[4])
  control_recall_5 = as.character(jd$answers$data$answer[5])
  target_generalize_1 = as.character(jd$answers$data$answer[6])
  target_generalize_2 = as.character(jd$answers$data$answer[7])
  target_generalize_3 = as.character(jd$answers$data$answer[8])
  target_generalize_4 = as.character(jd$answers$data$answer[9])
  target_generalize_5 = as.character(jd$answers$data$answer[10])
  target_recall_1 = as.character(jd$answers$data$answer[11])
  target_recall_2 = as.character(jd$answers$data$answer[12])
  target_recall_3 = as.character(jd$answers$data$answer[13])
  target_recall_4 = as.character(jd$answers$data$answer[14])
  target_recall_5 = as.character(jd$answers$data$answer[15])
  reading_time_target = jd$answers$data$target_rt[1]
  reading_time_control = jd$answers$data$control_rt[1]
  time_questionnaire = jd$answers$data$questionnaire_rt[1]
  
  
  #demographics
  race <- as.character(jd$answers$data$race[1])
  
  id <- data.frame(workerid = jd$WorkerId, 
                   sent = sent,
                   rating = rating,
                   enjoy_target = jd$answers$data$enjoy_target,
                   enjoy_control = jd$answers$data$enjoy_control,
                   reading_ease_target = jd$answers$data$reading_ease_target,
                   reading_ease_control = jd$answers$data$reading_ease_control,
                   target_recall_1 = target_recall_1,
                   target_recall_2 = target_recall_2,
                   target_recall_3 = target_recall_3,
                   target_recall_4 = target_recall_4,
                   target_recall_5 = target_recall_5,
                   target_generalize_1 = target_generalize_1,
                   target_generalize_2 = target_generalize_2,
                   target_generalize_3 = target_generalize_3,
                   target_generalize_4 = target_generalize_4,
                   target_generalize_5 = target_generalize_5,
                   control_recall_1 = control_recall_1,
                   control_recall_2 = control_recall_2,
                   control_recall_3 = control_recall_3,
                   control_recall_4 = control_recall_4,
                   control_recall_5 = control_recall_5,
                   children = jd$answers$data$children,
                   language = jd$answers$data$homelang,
                   ses = jd$answers$data$ladder,
                   gender = jd$answers$data$gender,
                   age = jd$answers$data$age,
                   education = jd$answers$data$education,
                   ethnicity = jd$answers$data$ethnicity,
                   childAgeYoung = jd$answers$data$childAgeYoung,
                   childAgeOld = jd$answers$data$childAgeOld,
                   race = race,
                   reading_time_target = reading_time_target,
                   reading_time_control = reading_time_control,
                   time_questionnaire = time_questionnaire)
  d.raw <- bind_rows(d.raw, id)
}

Map on question short forms so that we can use these instead.

labels <- read.csv("sent_forms_e7.csv")
labels$sent <- as.character(labels$sent)

answers <- read.csv("uptake_key.csv")
answers$tar_gen <-as.character(answers$tar_gen)
answers$tar_rec <-as.character(answers$tar_rec)
answers$con_rec <-as.character(answers$con_rec)

Clean up labels.

d.raw$sent <- as.character(d.raw$sent)
d.raw$sent <- str_replace_all(d.raw$sent, "'", "")
d.raw$sent <- str_replace_all(d.raw$sent, "’", "")
d.raw$sent <- str_replace_all(d.raw$sent, "“", "")
d.raw$sent <- str_replace_all(d.raw$sent, "”", "")
d.raw$sent <- str_replace_all(d.raw$sent, "‘", "")
d.raw$sent <- str_replace_all(d.raw$sent, "â", "")

Merge. Recode uptake answers by accuracy.

d <- left_join(d.raw, labels)
d$rating[d$reverse_code == 1] <- 6 - d$rating[d$reverse_code == 1]

d$target_generalize_1[d$target_generalize_1 != answers$tar_gen[1]] <- 0
d$target_generalize_1[d$target_generalize_1 == answers$tar_gen[1]] <- 1
d$target_generalize_2[d$target_generalize_2 != answers$tar_gen[2]] <- 0
d$target_generalize_2[d$target_generalize_2 == answers$tar_gen[2]] <- 1
d$target_generalize_3[d$target_generalize_3 != answers$tar_gen[3]] <- 0
d$target_generalize_3[d$target_generalize_3 == answers$tar_gen[3]] <- 1
d$target_generalize_4[d$target_generalize_4 != answers$tar_gen[4]] <- 0
d$target_generalize_4[d$target_generalize_4 == answers$tar_gen[4]] <- 1
d$target_generalize_5[d$target_generalize_5 != answers$tar_gen[5]] <- 0
d$target_generalize_5[d$target_generalize_5 == answers$tar_gen[5]] <- 1

d$target_recall_1[d$target_recall_1 != answers$tar_rec[1]] <- 0
d$target_recall_1[d$target_recall_1 == answers$tar_rec[1]] <- 1
d$target_recall_2[d$target_recall_2 != answers$tar_rec[2]] <- 0
d$target_recall_2[d$target_recall_2 == answers$tar_rec[2]] <- 1
d$target_recall_3[d$target_recall_3 != answers$tar_rec[3]] <- 0
d$target_recall_3[d$target_recall_3 == answers$tar_rec[3]] <- 1
d$target_recall_4[d$target_recall_4 != answers$tar_rec[4]] <- 0
d$target_recall_4[d$target_recall_4 == answers$tar_rec[4]] <- 1
d$target_recall_5[d$target_recall_5 != answers$tar_rec[5]] <- 0
d$target_recall_5[d$target_recall_5 == answers$tar_rec[5]] <- 1

d$control_recall_1[d$control_recall_1 != answers$con_rec[1]] <- 0
d$control_recall_1[d$control_recall_1 == answers$con_rec[1]] <- 1
d$control_recall_2[d$control_recall_2 != answers$con_rec[2]] <- 0
d$control_recall_2[d$control_recall_2 == answers$con_rec[2]] <- 1
d$control_recall_3[d$control_recall_3 != answers$con_rec[3]] <- 0
d$control_recall_3[d$control_recall_3 == answers$con_rec[3]] <- 1
d$control_recall_4[d$control_recall_4 != answers$con_rec[4]] <- 0
d$control_recall_4[d$control_recall_4 == answers$con_rec[4]] <- 1
d$control_recall_5[d$control_recall_5 != answers$con_rec[5]] <- 0
d$control_recall_5[d$control_recall_5 == answers$con_rec[5]] <- 1

Plot demographic info.

subinfo <- d %>%
  group_by(workerid) %>%
  select(-short_sent, -category, -instrument, -reverse_code) %>%
  distinct 

questions <- subinfo %>%
  select(workerid, starts_with("target"), starts_with("control")) %>%
  gather(question, correct, starts_with("target"), starts_with("control")) %>%
  separate(question, c("passage","trial_type","q_num"), sep = "_") %>%
  group_by(workerid, passage, trial_type) %>%
  summarise(correct = mean(as.numeric(correct))) %>%
  unite(trialtype, passage, trial_type) %>%
  spread(trialtype, correct)

subinfo <- subinfo %>%
  select(-starts_with("target"), -starts_with("control")) %>%
  left_join(questions) %>%
  select(-sent, -rating)
subinfo$education <- factor(subinfo$education, 
                            levels = c("highSchool","someCollege","4year","someGrad","Grad"))

subinfo$gender <- str_replace_all(subinfo$gender, 
                              "female|FEMALE|F$|f$|Femal$|Females|Females","Female")
subinfo$gender <- str_replace_all(subinfo$gender, 
                                  "^male|^Male|^MALE|^M$|^m$|^Maleq|Make", "Male")
subinfo$gender <- str_replace_all(subinfo$gender, 
                                  "29|24|25|28|32|33|45", "")
subinfo$gender <- str_replace_all(subinfo$gender, 
                                  "males", "male")
subinfo$gender <- str_replace_all(subinfo$gender, " ", "")

subinfo$language <- str_replace_all(subinfo$language, "english|eNGLISH|Engliah|ENGLISH|^eng$|Enlgish", "English")
subinfo$language <- str_replace_all(subinfo$language," ", "")
subinfo$language <- str_replace_all(subinfo$language,"arabic", "Arabic")
subinfo$language <- str_replace_all(subinfo$language,"chinese", "Chinese")
subinfo$language <- str_replace_all(subinfo$language,"german", "German")
subinfo$language <- str_replace_all(subinfo$language,"tagalog", "Tagalog")

subinfo$youngestChildAge <- factor(subinfo$childAgeYoung, levels = c("","0to6mo","7to12mo","1y","2y","3y","4y","5y","6y","7y","8y","9y","10y","olderthan10"))

subinfo$oldestChildAge <- factor(subinfo$childAgeOld, levels = c("","0to6mo","7to12mo","1y","2y","3y","4y","5y","6y","7y","8y","9y","10y","olderthan10"))

subinfo$reading_ease_target <- factor(subinfo$reading_ease_target, levels = c("Very Difficult", "Somewhat Difficult", "Somewhat Easy", "Very Easy"))

subinfo$reading_ease_control <- factor(subinfo$reading_ease_control, levels = c("Very Difficult", "Somewhat Difficult", "Somewhat Easy", "Very Easy"))

subinfo$enjoy_target <- factor(subinfo$enjoy_target, levels = c("Very Unenjoyable", "Somewhat Unenjoyable", "Somewhat Enjoyable", "Very Enjoyable"))

subinfo$enjoy_control <- factor(subinfo$enjoy_control, levels = c("Very Unenjoyable", "Somewhat Unenjoyable", "Somewhat Enjoyable", "Very Enjoyable"))
qplot(ses, data=subinfo)

qplot(children, data=subinfo)

qplot(gender, data=subinfo)

qplot(education, data=subinfo)

qplot(age, data=subinfo)

qplot(language, data=subinfo)

qplot(ethnicity, data=subinfo)

qplot(race, data=subinfo)

qplot(youngestChildAge, data=subinfo)

qplot(oldestChildAge, data=subinfo)

qplot(reading_ease_target, data=subinfo)

qplot(reading_ease_control, data=subinfo)

qplot(enjoy_target, data=subinfo)

qplot(enjoy_control, data=subinfo)

2 Basic analyses

Now look at mean ratings across sentences.

rating_count <- table(d$rating) 
rating_count
## 
##    0    1    2    3    4    5    6 
##   77  132  212  524  731 1017 1807
prop.table(rating_count) 
## 
##          0          1          2          3          4          5 
## 0.01711111 0.02933333 0.04711111 0.11644444 0.16244444 0.22600000 
##          6 
## 0.40155556
ms <- d %>%
  group_by(category, instrument, short_sent, reverse_code) %>%
  multi_boot_standard(col = "rating") %>%
  arrange(instrument, category, desc(mean)) 

ms$short_sent_ord <- factor(ms$short_sent, 
                             levels = ms$short_sent)

Plot attitude.

qplot(short_sent_ord, mean, col = category,
      ymin = ci_lower, ymax = ci_upper, pch = factor(reverse_code),
      geom = "pointrange",
      data = filter(ms, instrument == "attitudes")) +
  theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
  xlab("") + 
  ylab("Mean Rating") + 
  ylim(c(0,6)) + 
  scale_colour_solarized()

Plot mean subscale scores.

mc <- d %>%
  group_by(category) %>%
  multi_boot_standard(col = "rating") %>%
  arrange(category, desc(mean)) 

ggplot(mc, aes(x = category, y = mean)) + 
  geom_bar(stat="identity") + 
  geom_linerange(aes(ymin = ci_lower, ymax = ci_upper), 
             position = position_dodge(width = .9))

mcl <- d %>%
  group_by(category, workerid) %>%
  multi_boot_standard(col = "rating") %>%
  arrange(category, desc(mean))%>%
  spread(category, mean) 

3 Scale reliability

3.1 Whole Scale

wide.attitudes <- d %>% 
  filter(instrument == "attitudes") %>%
  select(workerid, short_sent, rating) %>% 
  spread(short_sent, rating)
alpha.mat <- as.matrix(select(wide.attitudes, -workerid))
summary(alpha(x = alpha.mat))
## 
## Reliability analysis   
##  raw_alpha std.alpha G6(smc) average_r S/N  ase mean  sd
##       0.83      0.85    0.89      0.24 5.8 0.02  4.7 0.7

3.2 Rules and Respect Subscale

wide.rules_respect <- d %>% 
  filter(category == "rules_respect") %>%
  select(workerid, short_sent, rating) %>% 
  spread(short_sent, rating)
alpha.rr <- as.matrix(select(wide.rules_respect, -workerid))
summary(alpha(x = alpha.rr))
## 
## Reliability analysis   
##  raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd
##        0.7       0.7     0.7      0.28 2.4 0.044  4.2 0.92

3.3 Affection and Attachment Subscale

wide.affection <- d %>% 
  filter(category == "affection") %>%
  select(workerid, short_sent, rating) %>% 
  spread(short_sent, rating)
alpha.af <- as.matrix(select(wide.affection, -workerid))
summary(alpha(x = alpha.af))
## 
## Reliability analysis   
##  raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd
##       0.79      0.82    0.83      0.43 4.5 0.035  4.8 0.98

3.4 Active Learning Subscale

wide.active_learning <- d %>% 
  filter(category == "active_learning") %>%
  select(workerid, short_sent, rating) %>% 
  spread(short_sent, rating)
alpha.al <- as.matrix(select(wide.active_learning, -workerid))
summary(alpha(x = alpha.al))
## 
## Reliability analysis   
##  raw_alpha std.alpha G6(smc) average_r S/N   ase mean   sd
##       0.77      0.78    0.76      0.37 3.5 0.038    5 0.85

4 Information Uptake Analyses

Create a data frame that has subscale scores.

Standardize ratings within subscale.

ds <- d
ds$srating <- ave(ds$rating, ds$category, FUN=scale)

ss <- ds %>%
  group_by(workerid, category) %>%
  summarize(srating = mean(srating)) %>%
  spread(category, srating) %>% 
  left_join(subinfo) %>%
  gather(trial_type, correct, control_recall, target_recall, target_generalize)

ss.long <- ss %>%
  gather(subscale, srating, active_learning, affection, rules_respect)

4.1 Plots

ggplot(ss.long, aes(x = srating, y = correct, col = subscale)) +
  geom_jitter() + 
  geom_smooth(method="lm", se=FALSE) + 
  facet_wrap(~trial_type)

Now with some exclusions, exploratory.

exclude <- subinfo$workerid[subinfo$reading_time_target < 30 | subinfo$reading_time_control < 30]
length(exclude)
## [1] 189
qplot(subinfo$reading_time_target, binwidth = 15) + 
  geom_vline(xintercept =15, lty =2, col="red")

qplot(subinfo$reading_time_control, binwidth = 15) + 
  geom_vline(xintercept =15, lty =2, col="red")

Replot with exclusions.

ggplot(filter(ss.long, !workerid %in% exclude), 
       aes(x = srating, y = correct, col = subscale)) +
  geom_jitter() + 
  geom_smooth(method="lm", se=FALSE) + 
  facet_wrap(~trial_type)

4.2 Stats

subscales <- ds %>%
  group_by(workerid, category) %>%
  summarize(srating = mean(srating)) %>%
  spread(category, srating) 

ratings <- d %>% 
  select(workerid, starts_with("target"), starts_with("control")) %>%
  gather(question, correct, starts_with("target"), starts_with("control")) %>%
  separate(question, c("passage","trial_type","q_num"), sep = "_") %>%
  mutate(correct = as.numeric(correct), 
         q_num = as.numeric(q_num) + 
           as.numeric(factor(passage)) * 10 + 
           as.numeric(factor(trial_type)) * 100)

d.reg <- left_join(ratings, subscales) %>%
  unite(question_type, passage, trial_type)
summary(glmer(correct ~ question_type * rules_respect + 
                question_type * active_learning + 
                (1|workerid) + 
                (1|q_num), 
              data = filter(d.reg, !workerid %in% exclude), 
              family = "binomial"))
## Generalized linear mixed model fit by maximum likelihood (Laplace
##   Approximation) [glmerMod]
##  Family: binomial  ( logit )
## Formula: 
## correct ~ question_type * rules_respect + question_type * active_learning +  
##     (1 | workerid) + (1 | q_num)
##    Data: filter(d.reg, !workerid %in% exclude)
## 
##      AIC      BIC   logLik deviance df.resid 
##  10761.3  10846.2  -5369.7  10739.3    16459 
## 
## Scaled residuals: 
##     Min      1Q  Median      3Q     Max 
## -4.7910  0.0222  0.1708  0.3595  7.2830 
## 
## Random effects:
##  Groups   Name        Variance Std.Dev.
##  workerid (Intercept) 5.3236   2.307   
##  q_num    (Intercept) 0.7868   0.887   
## Number of obs: 16470, groups:  workerid, 61; q_num, 15
## 
## Fixed effects:
##                                                Estimate Std. Error z value
## (Intercept)                                     1.72087    0.49767   3.458
## question_typetarget_generalize                  1.30465    0.56188   2.322
## question_typetarget_recall                      1.00594    0.56221   1.789
## rules_respect                                   0.23239    0.46030   0.505
## active_learning                                 0.16559    0.45845   0.361
## question_typetarget_generalize:rules_respect    0.03638    0.10680   0.341
## question_typetarget_recall:rules_respect        0.45099    0.09478   4.758
## question_typetarget_generalize:active_learning  1.34951    0.09501  14.204
## question_typetarget_recall:active_learning      1.00373    0.08890  11.290
##                                                Pr(>|z|)    
## (Intercept)                                    0.000545 ***
## question_typetarget_generalize                 0.020237 *  
## question_typetarget_recall                     0.073574 .  
## rules_respect                                  0.613659    
## active_learning                                0.717955    
## question_typetarget_generalize:rules_respect   0.733359    
## question_typetarget_recall:rules_respect       1.95e-06 ***
## question_typetarget_generalize:active_learning  < 2e-16 ***
## question_typetarget_recall:active_learning      < 2e-16 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Correlation of Fixed Effects:
##                       (Intr) qstn_typtrgt_g qstn_typtrgt_r rls_rs actv_l
## qstn_typtrgt_g        -0.560                                            
## qstn_typtrgt_r        -0.561  0.496                                     
## rules_rspct            0.018 -0.001         -0.001                      
## activ_lrnng            0.012 -0.001          0.000         -0.139       
## qstn_typtrgt_gnrlz:r_  0.000  0.000          0.001         -0.065  0.010
## qstn_typtrgt_rcll:r_   0.001  0.003          0.011         -0.076  0.012
## qstn_typtrgt_gnrlz:c_  0.000  0.023          0.005          0.011 -0.068
## qstn_typtrgt_rcll:c_   0.000  0.006          0.018          0.013 -0.077
##                       qstn_typtrgt_gnrlz:r_ qstn_typtrgt_rcll:r_
## qstn_typtrgt_g                                                  
## qstn_typtrgt_r                                                  
## rules_rspct                                                     
## activ_lrnng                                                     
## qstn_typtrgt_gnrlz:r_                                           
## qstn_typtrgt_rcll:r_   0.371                                    
## qstn_typtrgt_gnrlz:c_ -0.196                -0.059              
## qstn_typtrgt_rcll:c_  -0.057                -0.132              
##                       qstn_typtrgt_gnrlz:c_
## qstn_typtrgt_g                             
## qstn_typtrgt_r                             
## rules_rspct                                
## activ_lrnng                                
## qstn_typtrgt_gnrlz:r_                      
## qstn_typtrgt_rcll:r_                       
## qstn_typtrgt_gnrlz:c_                      
## qstn_typtrgt_rcll:c_   0.402