Data analysis of basic parenting values/attitudes survey.
Preliminaries.
## [1] "dplyr" "langcog" "tidyr" "ggplot2" "lme4"
##
## Attaching package: 'langcog'
##
## The following object is masked from 'package:base':
##
## scale
##
##
## Attaching package: 'dplyr'
##
## The following objects are masked from 'package:stats':
##
## filter, lag
##
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
##
##
## Attaching package: 'ggplot2'
##
## The following object is masked from 'package:psych':
##
## %+%
##
## Loading required package: MASS
##
## Attaching package: 'MASS'
##
## The following object is masked from 'package:dplyr':
##
## select
##
## Loading required package: boot
##
## Attaching package: 'boot'
##
## The following object is masked from 'package:psych':
##
## logit
##
## Loading required package: lattice
##
## Attaching package: 'lattice'
##
## The following object is masked from 'package:boot':
##
## melanoma
##
##
## Attaching package: 'nFactors'
##
## The following object is masked from 'package:lattice':
##
## parallel
##
## Loading required package: Matrix
##
## Attaching package: 'Matrix'
##
## The following object is masked from 'package:tidyr':
##
## expand
Read in files and consolidate to the same directory.
files <- dir("../production-results/uptake_e2/")
d.raw <- data.frame()
for (f in files) {
jf <- paste("../production-results/uptake_e2/",f,sep="")
jd <- fromJSON(paste(readLines(jf), collapse=""))
# clean up different tasks
trial_type <- jd$answer$data$trial_type
#parenting questionnaire
sent <- jd$answers$data$sentence[trial_type != "uptake"]
rating <- as.numeric(jd$answers$data$rating[trial_type != "uptake"])
#uptake experiment
control_recall_1 = as.character(jd$answers$data$answer[1])
control_recall_2 = as.character(jd$answers$data$answer[2])
control_recall_3 = as.character(jd$answers$data$answer[3])
control_recall_4 = as.character(jd$answers$data$answer[4])
control_recall_5 = as.character(jd$answers$data$answer[5])
target_generalize_1 = as.character(jd$answers$data$answer[6])
target_generalize_2 = as.character(jd$answers$data$answer[7])
target_generalize_3 = as.character(jd$answers$data$answer[8])
target_generalize_4 = as.character(jd$answers$data$answer[9])
target_generalize_5 = as.character(jd$answers$data$answer[10])
target_recall_1 = as.character(jd$answers$data$answer[11])
target_recall_2 = as.character(jd$answers$data$answer[12])
target_recall_3 = as.character(jd$answers$data$answer[13])
target_recall_4 = as.character(jd$answers$data$answer[14])
target_recall_5 = as.character(jd$answers$data$answer[15])
reading_time_target = jd$answers$data$target_rt[1]
reading_time_control = jd$answers$data$control_rt[1]
time_questionnaire = jd$answers$data$questionnaire_rt[1]
#demographics
race <- as.character(jd$answers$data$race[1])
id <- data.frame(workerid = jd$WorkerId,
sent = sent,
rating = rating,
enjoy_target = jd$answers$data$enjoy_target,
enjoy_control = jd$answers$data$enjoy_control,
reading_ease_target = jd$answers$data$reading_ease_target,
reading_ease_control = jd$answers$data$reading_ease_control,
target_recall_1 = target_recall_1,
target_recall_2 = target_recall_2,
target_recall_3 = target_recall_3,
target_recall_4 = target_recall_4,
target_recall_5 = target_recall_5,
target_generalize_1 = target_generalize_1,
target_generalize_2 = target_generalize_2,
target_generalize_3 = target_generalize_3,
target_generalize_4 = target_generalize_4,
target_generalize_5 = target_generalize_5,
control_recall_1 = control_recall_1,
control_recall_2 = control_recall_2,
control_recall_3 = control_recall_3,
control_recall_4 = control_recall_4,
control_recall_5 = control_recall_5,
children = jd$answers$data$children,
language = jd$answers$data$homelang,
ses = jd$answers$data$ladder,
gender = jd$answers$data$gender,
age = jd$answers$data$age,
education = jd$answers$data$education,
ethnicity = jd$answers$data$ethnicity,
childAgeYoung = jd$answers$data$childAgeYoung,
childAgeOld = jd$answers$data$childAgeOld,
race = race,
reading_time_target = reading_time_target,
reading_time_control = reading_time_control,
time_questionnaire = time_questionnaire)
d.raw <- bind_rows(d.raw, id)
}
Map on question short forms so that we can use these instead.
labels <- read.csv("sent_forms_e7.csv")
labels$sent <- as.character(labels$sent)
answers <- read.csv("uptake_key.csv")
answers$tar_gen <-as.character(answers$tar_gen)
answers$tar_rec <-as.character(answers$tar_rec)
answers$con_rec <-as.character(answers$con_rec)
Clean up labels.
d.raw$sent <- as.character(d.raw$sent)
d.raw$sent <- str_replace_all(d.raw$sent, "'", "")
d.raw$sent <- str_replace_all(d.raw$sent, "’", "")
d.raw$sent <- str_replace_all(d.raw$sent, "“", "")
d.raw$sent <- str_replace_all(d.raw$sent, "”", "")
d.raw$sent <- str_replace_all(d.raw$sent, "‘", "")
d.raw$sent <- str_replace_all(d.raw$sent, "â", "")
Merge. Recode uptake answers by accuracy.
d <- left_join(d.raw, labels)
d$rating[d$reverse_code == 1] <- 6 - d$rating[d$reverse_code == 1]
d$target_generalize_1[d$target_generalize_1 != answers$tar_gen[1]] <- 0
d$target_generalize_1[d$target_generalize_1 == answers$tar_gen[1]] <- 1
d$target_generalize_2[d$target_generalize_2 != answers$tar_gen[2]] <- 0
d$target_generalize_2[d$target_generalize_2 == answers$tar_gen[2]] <- 1
d$target_generalize_3[d$target_generalize_3 != answers$tar_gen[3]] <- 0
d$target_generalize_3[d$target_generalize_3 == answers$tar_gen[3]] <- 1
d$target_generalize_4[d$target_generalize_4 != answers$tar_gen[4]] <- 0
d$target_generalize_4[d$target_generalize_4 == answers$tar_gen[4]] <- 1
d$target_generalize_5[d$target_generalize_5 != answers$tar_gen[5]] <- 0
d$target_generalize_5[d$target_generalize_5 == answers$tar_gen[5]] <- 1
d$target_recall_1[d$target_recall_1 != answers$tar_rec[1]] <- 0
d$target_recall_1[d$target_recall_1 == answers$tar_rec[1]] <- 1
d$target_recall_2[d$target_recall_2 != answers$tar_rec[2]] <- 0
d$target_recall_2[d$target_recall_2 == answers$tar_rec[2]] <- 1
d$target_recall_3[d$target_recall_3 != answers$tar_rec[3]] <- 0
d$target_recall_3[d$target_recall_3 == answers$tar_rec[3]] <- 1
d$target_recall_4[d$target_recall_4 != answers$tar_rec[4]] <- 0
d$target_recall_4[d$target_recall_4 == answers$tar_rec[4]] <- 1
d$target_recall_5[d$target_recall_5 != answers$tar_rec[5]] <- 0
d$target_recall_5[d$target_recall_5 == answers$tar_rec[5]] <- 1
d$control_recall_1[d$control_recall_1 != answers$con_rec[1]] <- 0
d$control_recall_1[d$control_recall_1 == answers$con_rec[1]] <- 1
d$control_recall_2[d$control_recall_2 != answers$con_rec[2]] <- 0
d$control_recall_2[d$control_recall_2 == answers$con_rec[2]] <- 1
d$control_recall_3[d$control_recall_3 != answers$con_rec[3]] <- 0
d$control_recall_3[d$control_recall_3 == answers$con_rec[3]] <- 1
d$control_recall_4[d$control_recall_4 != answers$con_rec[4]] <- 0
d$control_recall_4[d$control_recall_4 == answers$con_rec[4]] <- 1
d$control_recall_5[d$control_recall_5 != answers$con_rec[5]] <- 0
d$control_recall_5[d$control_recall_5 == answers$con_rec[5]] <- 1
Plot demographic info.
subinfo <- d %>%
group_by(workerid) %>%
select(-short_sent, -category, -instrument, -reverse_code) %>%
distinct
questions <- subinfo %>%
select(workerid, starts_with("target"), starts_with("control")) %>%
gather(question, correct, starts_with("target"), starts_with("control")) %>%
separate(question, c("passage","trial_type","q_num"), sep = "_") %>%
group_by(workerid, passage, trial_type) %>%
summarise(correct = mean(as.numeric(correct))) %>%
unite(trialtype, passage, trial_type) %>%
spread(trialtype, correct)
subinfo <- subinfo %>%
select(-starts_with("target"), -starts_with("control")) %>%
left_join(questions) %>%
select(-sent, -rating)
subinfo$education <- factor(subinfo$education,
levels = c("highSchool","someCollege","4year","someGrad","Grad"))
subinfo$gender <- str_replace_all(subinfo$gender,
"female|FEMALE|F$|f$|Femal$|Females|Females","Female")
subinfo$gender <- str_replace_all(subinfo$gender,
"^male|^Male|^MALE|^M$|^m$|^Maleq|Make", "Male")
subinfo$gender <- str_replace_all(subinfo$gender,
"29|24|25|28|32|33|45", "")
subinfo$gender <- str_replace_all(subinfo$gender,
"males", "male")
subinfo$gender <- str_replace_all(subinfo$gender, " ", "")
subinfo$language <- str_replace_all(subinfo$language, "english|eNGLISH|Engliah|ENGLISH|^eng$|Enlgish", "English")
subinfo$language <- str_replace_all(subinfo$language," ", "")
subinfo$language <- str_replace_all(subinfo$language,"arabic", "Arabic")
subinfo$language <- str_replace_all(subinfo$language,"chinese", "Chinese")
subinfo$language <- str_replace_all(subinfo$language,"german", "German")
subinfo$language <- str_replace_all(subinfo$language,"tagalog", "Tagalog")
subinfo$youngestChildAge <- factor(subinfo$childAgeYoung, levels = c("","0to6mo","7to12mo","1y","2y","3y","4y","5y","6y","7y","8y","9y","10y","olderthan10"))
subinfo$oldestChildAge <- factor(subinfo$childAgeOld, levels = c("","0to6mo","7to12mo","1y","2y","3y","4y","5y","6y","7y","8y","9y","10y","olderthan10"))
subinfo$reading_ease_target <- factor(subinfo$reading_ease_target, levels = c("Very Difficult", "Somewhat Difficult", "Somewhat Easy", "Very Easy"))
subinfo$reading_ease_control <- factor(subinfo$reading_ease_control, levels = c("Very Difficult", "Somewhat Difficult", "Somewhat Easy", "Very Easy"))
subinfo$enjoy_target <- factor(subinfo$enjoy_target, levels = c("Very Unenjoyable", "Somewhat Unenjoyable", "Somewhat Enjoyable", "Very Enjoyable"))
subinfo$enjoy_control <- factor(subinfo$enjoy_control, levels = c("Very Unenjoyable", "Somewhat Unenjoyable", "Somewhat Enjoyable", "Very Enjoyable"))
qplot(ses, data=subinfo)
qplot(children, data=subinfo)
qplot(gender, data=subinfo)
qplot(education, data=subinfo)
qplot(age, data=subinfo)
qplot(language, data=subinfo)
qplot(ethnicity, data=subinfo)
qplot(race, data=subinfo)
qplot(youngestChildAge, data=subinfo)
qplot(oldestChildAge, data=subinfo)
qplot(reading_ease_target, data=subinfo)
qplot(reading_ease_control, data=subinfo)
qplot(enjoy_target, data=subinfo)
qplot(enjoy_control, data=subinfo)
Now look at mean ratings across sentences.
rating_count <- table(d$rating)
rating_count
##
## 0 1 2 3 4 5 6
## 77 132 212 524 731 1017 1807
prop.table(rating_count)
##
## 0 1 2 3 4 5
## 0.01711111 0.02933333 0.04711111 0.11644444 0.16244444 0.22600000
## 6
## 0.40155556
ms <- d %>%
group_by(category, instrument, short_sent, reverse_code) %>%
multi_boot_standard(col = "rating") %>%
arrange(instrument, category, desc(mean))
ms$short_sent_ord <- factor(ms$short_sent,
levels = ms$short_sent)
Plot attitude.
qplot(short_sent_ord, mean, col = category,
ymin = ci_lower, ymax = ci_upper, pch = factor(reverse_code),
geom = "pointrange",
data = filter(ms, instrument == "attitudes")) +
theme(axis.text.x = element_text(angle = 90, hjust = 1, vjust = .5)) +
xlab("") +
ylab("Mean Rating") +
ylim(c(0,6)) +
scale_colour_solarized()
Plot mean subscale scores.
mc <- d %>%
group_by(category) %>%
multi_boot_standard(col = "rating") %>%
arrange(category, desc(mean))
ggplot(mc, aes(x = category, y = mean)) +
geom_bar(stat="identity") +
geom_linerange(aes(ymin = ci_lower, ymax = ci_upper),
position = position_dodge(width = .9))
mcl <- d %>%
group_by(category, workerid) %>%
multi_boot_standard(col = "rating") %>%
arrange(category, desc(mean))%>%
spread(category, mean)
wide.attitudes <- d %>%
filter(instrument == "attitudes") %>%
select(workerid, short_sent, rating) %>%
spread(short_sent, rating)
alpha.mat <- as.matrix(select(wide.attitudes, -workerid))
summary(alpha(x = alpha.mat))
##
## Reliability analysis
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd
## 0.83 0.85 0.89 0.24 5.8 0.02 4.7 0.7
wide.rules_respect <- d %>%
filter(category == "rules_respect") %>%
select(workerid, short_sent, rating) %>%
spread(short_sent, rating)
alpha.rr <- as.matrix(select(wide.rules_respect, -workerid))
summary(alpha(x = alpha.rr))
##
## Reliability analysis
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd
## 0.7 0.7 0.7 0.28 2.4 0.044 4.2 0.92
wide.affection <- d %>%
filter(category == "affection") %>%
select(workerid, short_sent, rating) %>%
spread(short_sent, rating)
alpha.af <- as.matrix(select(wide.affection, -workerid))
summary(alpha(x = alpha.af))
##
## Reliability analysis
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd
## 0.79 0.82 0.83 0.43 4.5 0.035 4.8 0.98
wide.active_learning <- d %>%
filter(category == "active_learning") %>%
select(workerid, short_sent, rating) %>%
spread(short_sent, rating)
alpha.al <- as.matrix(select(wide.active_learning, -workerid))
summary(alpha(x = alpha.al))
##
## Reliability analysis
## raw_alpha std.alpha G6(smc) average_r S/N ase mean sd
## 0.77 0.78 0.76 0.37 3.5 0.038 5 0.85
Create a data frame that has subscale scores.
Standardize ratings within subscale.
ds <- d
ds$srating <- ave(ds$rating, ds$category, FUN=scale)
ss <- ds %>%
group_by(workerid, category) %>%
summarize(srating = mean(srating)) %>%
spread(category, srating) %>%
left_join(subinfo) %>%
gather(trial_type, correct, control_recall, target_recall, target_generalize)
ss.long <- ss %>%
gather(subscale, srating, active_learning, affection, rules_respect)
ggplot(ss.long, aes(x = srating, y = correct, col = subscale)) +
geom_jitter() +
geom_smooth(method="lm", se=FALSE) +
facet_wrap(~trial_type)
Now with some exclusions, exploratory.
exclude <- subinfo$workerid[subinfo$reading_time_target < 30 | subinfo$reading_time_control < 30]
length(exclude)
## [1] 189
qplot(subinfo$reading_time_target, binwidth = 15) +
geom_vline(xintercept =15, lty =2, col="red")
qplot(subinfo$reading_time_control, binwidth = 15) +
geom_vline(xintercept =15, lty =2, col="red")
Replot with exclusions.
ggplot(filter(ss.long, !workerid %in% exclude),
aes(x = srating, y = correct, col = subscale)) +
geom_jitter() +
geom_smooth(method="lm", se=FALSE) +
facet_wrap(~trial_type)
subscales <- ds %>%
group_by(workerid, category) %>%
summarize(srating = mean(srating)) %>%
spread(category, srating)
ratings <- d %>%
select(workerid, starts_with("target"), starts_with("control")) %>%
gather(question, correct, starts_with("target"), starts_with("control")) %>%
separate(question, c("passage","trial_type","q_num"), sep = "_") %>%
mutate(correct = as.numeric(correct),
q_num = as.numeric(q_num) +
as.numeric(factor(passage)) * 10 +
as.numeric(factor(trial_type)) * 100)
d.reg <- left_join(ratings, subscales) %>%
unite(question_type, passage, trial_type)
summary(glmer(correct ~ question_type * rules_respect +
question_type * active_learning +
(1|workerid) +
(1|q_num),
data = filter(d.reg, !workerid %in% exclude),
family = "binomial"))
## Generalized linear mixed model fit by maximum likelihood (Laplace
## Approximation) [glmerMod]
## Family: binomial ( logit )
## Formula:
## correct ~ question_type * rules_respect + question_type * active_learning +
## (1 | workerid) + (1 | q_num)
## Data: filter(d.reg, !workerid %in% exclude)
##
## AIC BIC logLik deviance df.resid
## 10761.3 10846.2 -5369.7 10739.3 16459
##
## Scaled residuals:
## Min 1Q Median 3Q Max
## -4.7910 0.0222 0.1708 0.3595 7.2830
##
## Random effects:
## Groups Name Variance Std.Dev.
## workerid (Intercept) 5.3236 2.307
## q_num (Intercept) 0.7868 0.887
## Number of obs: 16470, groups: workerid, 61; q_num, 15
##
## Fixed effects:
## Estimate Std. Error z value
## (Intercept) 1.72087 0.49767 3.458
## question_typetarget_generalize 1.30465 0.56188 2.322
## question_typetarget_recall 1.00594 0.56221 1.789
## rules_respect 0.23239 0.46030 0.505
## active_learning 0.16559 0.45845 0.361
## question_typetarget_generalize:rules_respect 0.03638 0.10680 0.341
## question_typetarget_recall:rules_respect 0.45099 0.09478 4.758
## question_typetarget_generalize:active_learning 1.34951 0.09501 14.204
## question_typetarget_recall:active_learning 1.00373 0.08890 11.290
## Pr(>|z|)
## (Intercept) 0.000545 ***
## question_typetarget_generalize 0.020237 *
## question_typetarget_recall 0.073574 .
## rules_respect 0.613659
## active_learning 0.717955
## question_typetarget_generalize:rules_respect 0.733359
## question_typetarget_recall:rules_respect 1.95e-06 ***
## question_typetarget_generalize:active_learning < 2e-16 ***
## question_typetarget_recall:active_learning < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Correlation of Fixed Effects:
## (Intr) qstn_typtrgt_g qstn_typtrgt_r rls_rs actv_l
## qstn_typtrgt_g -0.560
## qstn_typtrgt_r -0.561 0.496
## rules_rspct 0.018 -0.001 -0.001
## activ_lrnng 0.012 -0.001 0.000 -0.139
## qstn_typtrgt_gnrlz:r_ 0.000 0.000 0.001 -0.065 0.010
## qstn_typtrgt_rcll:r_ 0.001 0.003 0.011 -0.076 0.012
## qstn_typtrgt_gnrlz:c_ 0.000 0.023 0.005 0.011 -0.068
## qstn_typtrgt_rcll:c_ 0.000 0.006 0.018 0.013 -0.077
## qstn_typtrgt_gnrlz:r_ qstn_typtrgt_rcll:r_
## qstn_typtrgt_g
## qstn_typtrgt_r
## rules_rspct
## activ_lrnng
## qstn_typtrgt_gnrlz:r_
## qstn_typtrgt_rcll:r_ 0.371
## qstn_typtrgt_gnrlz:c_ -0.196 -0.059
## qstn_typtrgt_rcll:c_ -0.057 -0.132
## qstn_typtrgt_gnrlz:c_
## qstn_typtrgt_g
## qstn_typtrgt_r
## rules_rspct
## activ_lrnng
## qstn_typtrgt_gnrlz:r_
## qstn_typtrgt_rcll:r_
## qstn_typtrgt_gnrlz:c_
## qstn_typtrgt_rcll:c_ 0.402