Data reading and variable preparation
library(ggplot2)
library(plyr)
library(lme4)

d <- read.csv("~/Desktop/jess/zenith all data complete and correct estimation 070714.csv")
d$condition <- factor(d$condition, levels=c("control","abacus"))
d$standardized <- rowMeans(d[,c("wiat","woodcock")], na.rm=TRUE)
d$standardized.scale <- scale(d$standardized)
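The model blocks below also use z-scored versions of the individual predictors (ans.scale, mental.rot.scale, and so on). That preparation code doesn't appear in this document; here's a minimal sketch of what it presumably looks like, assuming each column is simply passed through scale():
scale.vars <- c("ans", "deviance", "linr2", "mental.rot",
                "verbalwm", "spatialwm", "ravens")
for (v in scale.vars) {
  d[[paste0(v, ".scale")]] <- scale(d[[v]])  # z-score each predictor
}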
First, without the intervention split:
qplot(ans, standardized, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Standardized Test Composite") +
xlab("Weber Fraction")
And with the intervention split:
qplot(ans, standardized, facets=~year,
col=condition,
data=d) +
geom_smooth(method="lm") +
ylab("Standardized Test Composite") +
xlab("Weber Fraction")
Now by math grades.
qplot(ans, math, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Math Grades") +
xlab("Weber Fraction")
Just to check: should we log-transform Weber fractions for subsequent analyses? No, is the answer - the picture (and the correlations below) look essentially the same.
qplot(log(ans), standardized, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Standardized Test Composite") +
xlab("Log Weber Fraction")
Now do the same for deviance:
qplot(deviance, standardized, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Standardized Test Composite") +
xlab("Weber Fraction")
qplot(deviance, math, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Math Grades") +
xlab("Weber Fraction")
And for linear r^2:
qplot(linr2, standardized, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Standardized Test Composite") +
xlab("Weber Fraction")
qplot(linr2, math, facets=~year,
data=d) +
geom_smooth(method="lm") +
ylab("Math Grades") +
xlab("Weber Fraction")
Here are all the correlations:
ddply(d, .(year),
summarise,
ans.std.cor = cor.test(ans,standardized)$estimate,
log.ans.std.cor = cor.test(log(ans),standardized)$estimate,
ans.math.cor = cor.test(ans,math)$estimate,
log.ans.math.cor = cor.test(log(ans),math)$estimate,
dev.math.cor = try(cor.test(deviance,math)$estimate),
dev.std.cor = try(cor.test(deviance,standardized)$estimate),
lin.math.cor = try(cor.test(linr2,math)$estimate),
lin.std.cor = try(cor.test(linr2,standardized)$estimate))
## year ans.std.cor log.ans.std.cor ans.math.cor log.ans.math.cor
## 1 0 -0.2858 -0.29423 -0.19259 -0.18333
## 2 1 -0.1228 -0.05647 -0.07435 -0.03596
## 3 2 -0.1048 -0.08561 -0.07457 -0.04209
## 4 3 -0.2270 -0.21945 -0.22367 -0.19436
## dev.math.cor
## 1 Error in cor.test.default(deviance, math) : \n not enough finite observations\n
## 2 -0.215115516790599
## 3 -0.158048150149025
## 4 -0.189339242687907
## dev.std.cor
## 1 Error in cor.test.default(deviance, standardized) : \n not enough finite observations\n
## 2 -0.0732470386843735
## 3 -0.144610365153124
## 4 -0.228032492039368
## lin.math.cor
## 1 Error in cor.test.default(linr2, math) : not enough finite observations\n
## 2 0.169061790845966
## 3 0.231332886193189
## 4 0.344236511959198
## lin.std.cor
## 1 Error in cor.test.default(linr2, standardized) : \n not enough finite observations\n
## 2 0.102622810069744
## 3 0.188464437815951
## 4 0.27700492242386
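The year-0 errors just reflect missing estimation-task data in that year: too few finite deviance and linr2 values for cor.test. A quick count confirms this (this check isn't in the original output, so the exact numbers are whatever the data give):
ddply(d, .(year),
      summarise,
      n.deviance = sum(is.finite(deviance)),
      n.linr2 = sum(is.finite(linr2)))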
And their p-values:
ddply(d, .(year),
summarise,
ans.std.p = cor.test(ans,standardized)$p.value,
log.ans.std.p = cor.test(log(ans),standardized)$p.value,
ans.math.p = cor.test(ans,math)$p.value,
log.ans.math.p = cor.test(log(ans),math)$p.value,
dev.math.p = try(cor.test(deviance,math)$p.value),
dev.std.p = try(cor.test(deviance,standardized)$p.value),
lin.math.p = try(cor.test(linr2,math)$p.value),
lin.std.p = try(cor.test(linr2,standardized)$p.value))
## year ans.std.p log.ans.std.p ans.math.p log.ans.math.p
## 1 0 0.0002277 0.0001445 0.015335 0.02113
## 2 1 0.0997119 0.4502481 0.319885 0.63084
## 3 2 0.1567078 0.2478888 0.313095 0.56946
## 4 3 0.0018849 0.0026885 0.002338 0.00838
## dev.math.p
## 1 Error in cor.test.default(deviance, math) : \n not enough finite observations\n
## 2 0.00336251650003418
## 3 0.0307431743271009
## 4 0.0100487626904546
## dev.std.p
## 1 Error in cor.test.default(deviance, standardized) : \n not enough finite observations\n
## 2 0.32308986402252
## 3 0.0489197083110919
## 4 0.00174609755361111
## lin.math.p
## 1 Error in cor.test.default(linr2, math) : not enough finite observations\n
## 2 0.0217803428290335
## 3 0.00144483472032642
## 4 1.71261724246641e-06
## lin.std.p
## 1 Error in cor.test.default(linr2, standardized) : \n not enough finite observations\n
## 2 0.165679838871076
## 3 0.00999182308988322
## 4 0.000129426268423494
Next, we need a function to do the model-based predictor analyses, plus a helper to relabel and reorder the resulting coefficients. (Neither function renders in this document; a sketch of both follows.)
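Since the originals aren't shown, this is a reconstruction from how the functions are called and from the mapvalues-style warnings in the output below - the names and details are assumptions, not the original code:
# Sketch: fit the lm formula separately for each year and collect
# coefficients with standard errors, tagged with a model name.
model.pred <- function(data, model.formula, model.name,
                       years = c(0, 1, 2, 3)) {
  ddply(subset(data, year %in% years), .(year),
        function(x) {
          cs <- summary(lm(model.formula, data = x))$coefficients
          data.frame(preds = rownames(cs),
                     betas = cs[, "Estimate"],
                     se = cs[, "Std. Error"],
                     model = model.name,
                     stringsAsFactors = FALSE)
        })
}

# Sketch: map coefficient names to readable labels and fix the factor
# order for plotting, putting the predictor(s) of interest first. The
# "from values were not present" warnings below are what plyr::mapvalues
# emits, which is why it appears here.
relevel.coefs <- function(coefs, dv = "ans.scale") {
  from <- c("(Intercept)", "ans.scale", "deviance.scale", "linr2.scale",
            "mental.rot.scale", "verbalwm.scale", "spatialwm.scale",
            "ravens.scale", "age", "conditionabacus")
  to <- c("Intercept", "ANS", "Deviance", "Linear R2",
          "Mental Rotation", "Verbal WM", "Spatial WM",
          "Ravens", "Age", "Abacus")
  coefs$preds <- mapvalues(coefs$preds, from = from, to = to)
  dv.labels <- to[from %in% dv]
  coefs$preds <- factor(coefs$preds,
                        levels = c(dv.labels, setdiff(to, dv.labels)))
  coefs
}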
First with ANS:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(standardized.scale ~ ans.scale),
model.name="vanilla")),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~ ans.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale + ravens.scale +
age),
model.name="complex")),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~ ans.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale + ravens.scale +
age + condition),
model.name="complex + cond")))
coefs <- relevel.coefs(coefs)
## The following `from` values were not present in `x`: linr2.scale, deviance.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Next with deviance:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(standardized.scale ~
deviance.scale),
model.name="vanilla", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age),
model.name="complex", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age + condition),
model.name="complex + cond", years=c(1,2,3))))
coefs <- relevel.coefs(coefs,dv="deviance.scale")
## The following `from` values were not present in `x`: ans.scale, linr2.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
And linear r2:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(standardized.scale ~
linr2.scale),
model.name="vanilla", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~
linr2.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age),
model.name="complex", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~
linr2.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age + condition),
model.name="complex + cond", years=c(1,2,3))))
coefs <- relevel.coefs(coefs,dv="linr2.scale")
## The following `from` values were not present in `x`: ans.scale, deviance.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Now combine deviance + ANS:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(standardized.scale ~
ans.scale + deviance.scale),
model.name="vanilla", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~
ans.scale +
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age),
model.name="complex", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(standardized.scale ~
ans.scale +
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age + condition),
model.name="complex + cond", years=c(1,2,3))))
coefs <- relevel.coefs(coefs,dv=c("ans.scale","deviance.scale"))
## The following `from` values were not present in `x`: linr2.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Now the same set of analyses, with math grades as the DV. First with ANS:
d$math.scale <- scale(d$math)
coefs <- rbind(model.pred(data=d,
model.formula = formula(math.scale ~ ans.scale),
model.name="vanilla"),
model.pred(data=d,
model.formula = formula(math.scale ~ ans.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale + ravens.scale +
age),
model.name="complex"),
model.pred(data=d,
model.formula = formula(math.scale ~ ans.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale + ravens.scale +
age + condition),
model.name="complex + cond"))
coefs <- relevel.coefs(coefs)
## The following `from` values were not present in `x`: linr2.scale, deviance.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Next with deviance:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(math.scale ~
deviance.scale),
model.name="vanilla", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(math.scale ~
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age),
model.name="complex", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(math.scale ~
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age + condition),
model.name="complex + cond", years=c(1,2,3))))
coefs <- relevel.coefs(coefs,dv="deviance.scale")
## The following `from` values were not present in `x`: ans.scale, linr2.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
And linear r2:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(math.scale ~
linr2.scale),
model.name="vanilla", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(math.scale ~
linr2.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age),
model.name="complex", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(math.scale ~
linr2.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age + condition),
model.name="complex + cond", years=c(1,2,3))))
coefs <- relevel.coefs(coefs,dv="linr2.scale")
## The following `from` values were not present in `x`: ans.scale, deviance.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Now combine deviance + ANS:
coefs <- rbind(try(model.pred(data=d,
model.formula = formula(math.scale ~
ans.scale + deviance.scale),
model.name="vanilla", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(math.scale ~
ans.scale +
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age),
model.name="complex", years=c(1,2,3))),
try(model.pred(data=d,
model.formula = formula(math.scale ~
ans.scale +
deviance.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale +
ravens.scale +
age + condition),
model.name="complex + cond", years=c(1,2,3))))
coefs <- relevel.coefs(coefs,dv=c("ans.scale","deviance.scale"))
## The following `from` values were not present in `x`: linr2.scale
qplot(preds,betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=betas-se,
ymax=betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Math composite via PCA
Here we take a principal components analysis of all the math measures and use PC1, the component that explains the largest share of variance across them. This is just one way of making a composite of all the math DVs and exploring its relation to ANS.
Note: the PCs come out negative, so I flipped them for interpretability.
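The PCA computation itself doesn't render in this document either. A minimal sketch of how pc1 could have been derived - the exact set of math columns is an assumption on my part:
math.measures <- c("wiat", "woodcock", "math", "arith")  # assumed column set
cc <- complete.cases(d[, math.measures])
pc <- prcomp(d[cc, math.measures], scale. = TRUE)  # PCA on complete cases
d$pc1 <- NA
d$pc1[cc] <- pc$x[, 1]  # scores on the first principal component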
coefs <- rbind(model.pred(data=d,
model.formula = formula(pc1 ~ ans.scale),
model.name="vanilla"),
model.pred(data=d,
model.formula = formula(pc1 ~ ans.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale + ravens.scale +
age),
model.name="complex"),
model.pred(data=d,
model.formula = formula(pc1 ~ ans.scale +
mental.rot.scale +
verbalwm.scale +
spatialwm.scale + ravens.scale +
age + condition),
model.name="complex + cond"))
coefs <- relevel.coefs(coefs)
## The following `from` values were not present in `x`: linr2.scale, deviance.scale
qplot(preds,-betas,facets=model~year,
fill=preds,
geom="bar",
stat="identity",
position="dodge",
data=subset(coefs, preds!="Intercept")) +
geom_linerange(aes(ymin=-betas-se,
ymax=-betas+se)) +
geom_hline(yintercept=0, lty=2) +
xlab("Predictor") +
ylab("Standardized Beta Weight") +
scale_fill_discrete(name="Predictor") +
theme(axis.text.x = element_text(angle = 90, vjust=.5, hjust = 1))
Growth curve analyses
Let's start by plotting some growth curves for the standardized math measures.
qplot(year, standardized, facets=~subnum,
geom="line",
data=subset(d, subnum %in% unique(subnum)[1:30])) +
ylim(c(0,1)) +
ylab("Standardized Math Tests (WIAT & WJ)") +
xlab("Year")
Now add some models. Check whether linear or quadratic is more useful.
preds <- c("standardized","year","subnum")
d.cc <- d[complete.cases(d[,preds]),preds]
mod1 <- lmer(standardized ~ year + (year | subnum),
data=d.cc)
mod2 <- lmer(standardized ~ year + I(year^2) + (year + I(year^2) | subnum),
data=d.cc)
anova(mod1,mod2)
## refitting model(s) with ML (instead of REML)
## Data: d.cc
## Models:
## mod1: standardized ~ year + (year | subnum)
## mod2: standardized ~ year + I(year^2) + (year + I(year^2) | subnum)
## Df AIC BIC logLik deviance Chisq Chi Df Pr(>Chisq)
## mod1 6 -1931 -1903 972 -1943
## mod2 10 -1928 -1882 974 -1948 5.11 4 0.28
No gain for this DV (standardized tests) in adding a quadratic term. Let’s drop it then.
Here's a basic model showing individual variability, with the same model's predictions plotted for all participants. It's a basic linear model and it doesn't do that badly.
The approach we're going to follow is to look for gains from adding predictors to this model, to better capture individual variation. (Note that if we used individualized growth terms in a mixed model, we would already fit pretty much all the variance and there would be little left to predict, so we use regular linear models here.)
preds <- c("standardized","year","subnum")
d.cc <- d[complete.cases(d[,preds]),preds]
mod <- lm(standardized ~ year,
data=d.cc)
d.cc$mod <- predict(mod)
qplot(year, standardized, facets=~subnum,
geom="point",
data=subset(d.cc, subnum %in% unique(subnum)[1:30])) +
geom_line(aes(x=year, y=mod),lty=2) +
ylim(c(0,1)) +
ylab("Standardized Math Tests (WIAT & WJ)") +
xlab("Year")
Let’s try adding concurrent ANS as a predictor. As you can see, it does very little here if we have a single coefficient.
(Note that there are some missing values for ANS from year 0, so that’s why curves are truncated).
preds <- c("standardized","year","subnum","ans")
d.cc <- d[complete.cases(d[,preds]),preds]
mod <- lm(standardized ~ year,
data=d.cc)
mod2 <- lm(standardized ~ year + ans,
data=d.cc)
d.cc$mod <- predict(mod)
d.cc$mod2 <- predict(mod2)
anova(mod,mod2)
## Analysis of Variance Table
##
## Model 1: standardized ~ year
## Model 2: standardized ~ year + ans
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 710 5.20
## 2 709 5.08 1 0.119 16.6 5.2e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qplot(year, standardized, facets=~subnum,
geom="point",
data=subset(d.cc, subnum %in% unique(subnum)[1:30])) +
geom_line(aes(x=year, y=mod),lty=2) +
geom_line(aes(x=year, y=mod2),lty=1,col="red") +
ylim(c(0,1)) +
ylab("Standardized Math Tests (WIAT & WJ)") +
xlab("Year")
Now fit multiple ANS coefficients, one for each year (the formula term factor(year) * ans - ans keeps year-specific ANS slopes while dropping the overall ANS main effect), and you start to see some teensy-tiny shape differences between curves. (I added abacus condition here as well.)
The dashed line is without ANS, the solid is with.
preds <- c("standardized","year","subnum","condition","ans")
d.cc <- d[complete.cases(d[,preds]),preds]
mod <- lm(standardized ~ factor(year) * condition,
data=d.cc)
mod2 <- lm(standardized ~ factor(year) * condition +
factor(year) * ans - ans,
data=d.cc)
d.cc$mod <- predict(mod)
d.cc$mod2 <- predict(mod2)
anova(mod,mod2)
## Analysis of Variance Table
##
## Model 1: standardized ~ factor(year) * condition
## Model 2: standardized ~ factor(year) * condition + factor(year) * ans -
## ans
## Res.Df RSS Df Sum of Sq F Pr(>F)
## 1 704 5.14
## 2 700 4.95 4 0.186 6.57 3.4e-05 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
qplot(year, standardized, facets=~subnum,
geom="point",col=condition,
data=subset(d.cc, subnum %in% unique(subnum)[1:30])) +
geom_line(aes(x=year, y=mod),lty=2) +
geom_line(aes(x=year, y=mod2),lty=1) +
ylim(c(0,1)) +
ylab("Standardized Math Tests (WIAT & WJ)") +
xlab("Year")
Just for kicks, let's add everything into the mix and see how we do. We do substantially better at making customized predictions. In terms of significant effects, we get reliable contributions from year, condition, gender, arithmetic, mental rotation, Raven's, and verbal WM - and nothing for ANS. Note that these predictions get noticeably worse when you leave out arithmetic scores; domain knowledge is a big predictor. (A quick check of this is sketched after the plot below.)
preds <- c("standardized","year","subnum","condition","ans","mental.rot","spatialwm","verbalwm","ravens","female","arith")
d.cc <- d[complete.cases(d[,preds]),preds]
mod <- lm(standardized ~ factor(year) * condition,
data=d.cc)
mod2 <- lm(standardized ~ factor(year) + condition +
female + arith + mental.rot +
spatialwm + ravens + verbalwm + ans,
data=d.cc)
d.cc$mod <- predict(mod)
d.cc$mod2 <- predict(mod2)
summary(mod2)
##
## Call:
## lm(formula = standardized ~ factor(year) + condition + female +
## arith + mental.rot + spatialwm + ravens + verbalwm + ans,
## data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.19768 -0.04100 -0.00023 0.03995 0.17359
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.07963 0.01554 5.12 3.9e-07 ***
## factor(year)1 0.08415 0.00825 10.20 < 2e-16 ***
## factor(year)2 0.13228 0.00927 14.27 < 2e-16 ***
## factor(year)3 0.18293 0.01080 16.94 < 2e-16 ***
## conditionabacus -0.01214 0.00493 -2.46 0.0140 *
## female 0.01601 0.00494 3.24 0.0013 **
## arith 0.48181 0.02309 20.87 < 2e-16 ***
## mental.rot 0.04396 0.01519 2.89 0.0039 **
## spatialwm -0.00341 0.00269 -1.27 0.2055
## ravens 0.06208 0.01292 4.80 1.9e-06 ***
## verbalwm 0.00832 0.00289 2.88 0.0041 **
## ans -0.03049 0.02396 -1.27 0.2036
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0606 on 657 degrees of freedom
## Multiple R-squared: 0.842, Adjusted R-squared: 0.84
## F-statistic: 319 on 11 and 657 DF, p-value: <2e-16
qplot(year, standardized, facets=~subnum,
geom="point",col=condition,
data=subset(d.cc, subnum %in% unique(subnum)[1:30])) +
geom_line(aes(x=year, y=mod),lty=2) +
geom_line(aes(x=year, y=mod2),lty=1) +
ylim(c(0,1)) +
ylab("Standardized Math Tests (WIAT & WJ)") +
xlab("Year")
Let's do the same exercise with math grades. The picture is largely consistent here: ANS isn't doing much.
preds <- c("math","year","subnum","condition","ans","mental.rot","spatialwm","verbalwm","ravens","female","arith")
d.cc <- d[complete.cases(d[,preds]),preds]
mod <- lm(math ~ factor(year) * condition,
data=d.cc)
mod2 <- lm(math ~ factor(year) + condition +
female + arith + mental.rot +
spatialwm + ravens + verbalwm + ans,
data=d.cc)
d.cc$mod <- predict(mod)
d.cc$mod2 <- predict(mod2)
summary(mod2)
##
## Call:
## lm(formula = math ~ factor(year) + condition + female + arith +
## mental.rot + spatialwm + ravens + verbalwm + ans, data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -36.87 -6.12 0.56 6.58 31.23
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 67.6001 2.6090 25.91 < 2e-16 ***
## factor(year)1 -7.9468 1.3880 -5.73 1.6e-08 ***
## factor(year)2 -30.1712 1.5616 -19.32 < 2e-16 ***
## factor(year)3 -36.6142 1.8359 -19.94 < 2e-16 ***
## conditionabacus -1.0539 0.8274 -1.27 0.20320
## female 2.4252 0.8305 2.92 0.00362 **
## arith 66.8832 3.9151 17.08 < 2e-16 ***
## mental.rot 8.6028 2.5604 3.36 0.00082 ***
## spatialwm -0.8787 0.4526 -1.94 0.05264 .
## ravens 10.6577 2.1725 4.91 1.2e-06 ***
## verbalwm 1.5653 0.4847 3.23 0.00130 **
## ans -0.0947 4.0342 -0.02 0.98128
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 10.2 on 653 degrees of freedom
## Multiple R-squared: 0.561, Adjusted R-squared: 0.553
## F-statistic: 75.8 on 11 and 653 DF, p-value: <2e-16
qplot(year, math, facets=~subnum,
geom="point",col=condition,
data=subset(d.cc, subnum %in% unique(subnum)[1:30])) +
geom_line(aes(x=year, y=mod),lty=2) +
geom_line(aes(x=year, y=mod2),lty=1) +
ylab("Math Grades") +
xlab("Year")
Lagged predictor analyses
A final analysis: back to standardized math scores, this time lagging the predictors. We use the previous year's measures to predict the current year - e.g., trying to use year 0's ANS to see if it predicts growth over standard predictions in year 1.
Note that I added gender to the baseline predictor set, so we will see only lag differences.
Define a function to do the lagging. (Note that this shadows stats::lag; it is written here with direct column indexing rather than eval(parse(...)).)
lag <- function(x, p) {
  lag.col <- paste0(p, ".lag")
  for (y in x$year) {
    if (y != 0 & ((y - 1) %in% x$year)) {
      # previous year present: copy predictor p forward one year
      x[[lag.col]][x$year == y] <- x[[p]][x$year == y - 1]
    } else {
      # year 0 (nothing to lag) or previous year missing: NA, so these
      # rows drop out in the complete.cases step below
      x[[lag.col]][x$year == y] <- NA
    }
  }
  return(x)
}
Now do the actual predictions. We are missing too many observations to really do this right without some serious imputation - we lose around a third of our data, as you can see from the plot.
preds <- c("standardized","year","subnum","condition","ans","mental.rot",
"spatialwm","verbalwm","ravens","female","arith")
pred_small <- c("ans","mental.rot","spatialwm","verbalwm","ravens","arith")
d.cc <- d[complete.cases(d[,preds]),preds]
d.cc <- ddply(d.cc, .(subnum),
function(x) {
for (p in pred_small) {
x <- lag(x,p)
}
return(x)
})
d.cc <- d.cc[complete.cases(d.cc),]
mod <- lm(standardized ~ factor(year) * condition + female,
data=d.cc)
mod2 <- lm(standardized ~ factor(year) + condition + female +
arith.lag + mental.rot.lag +
spatialwm.lag + ravens.lag + verbalwm.lag + ans.lag,
data=d.cc)
d.cc$mod <- predict(mod)
d.cc$mod2 <- predict(mod2)
summary(mod2)
##
## Call:
## lm(formula = standardized ~ factor(year) + condition + female +
## arith.lag + mental.rot.lag + spatialwm.lag + ravens.lag +
## verbalwm.lag + ans.lag, data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.27637 -0.04992 0.00049 0.04852 0.21297
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.188358 0.021191 8.89 < 2e-16 ***
## factor(year)2 0.077741 0.010651 7.30 1.3e-12 ***
## factor(year)3 0.124521 0.012608 9.88 < 2e-16 ***
## conditionabacus -0.008156 0.007210 -1.13 0.25854
## female 0.020591 0.007238 2.85 0.00464 **
## arith.lag 0.474753 0.039944 11.89 < 2e-16 ***
## mental.rot.lag 0.080157 0.021777 3.68 0.00026 ***
## spatialwm.lag -0.000197 0.003847 -0.05 0.95916
## ravens.lag 0.031072 0.018269 1.70 0.08965 .
## verbalwm.lag 0.007704 0.004242 1.82 0.07000 .
## ans.lag -0.002685 0.032437 -0.08 0.93408
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0746 on 466 degrees of freedom
## Multiple R-squared: 0.659, Adjusted R-squared: 0.651
## F-statistic: 89.9 on 10 and 466 DF, p-value: <2e-16
qplot(year, standardized, facets=~subnum,
geom="point",col=condition,
data=subset(d.cc, subnum %in% unique(subnum)[1:30])) +
geom_line(aes(x=year, y=mod),lty=2) +
geom_line(aes(x=year, y=mod2),lty=1) +
ylim(c(0,1)) +
ylab("Standardized Math Tests (WIAT & WJ)") +
xlab("Year")
## geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
## geom_path: Each group consist of only one observation. Do you need to adjust the group aesthetic?
Because of the missing-data problem, redo this with just ANS, to see if anything comes out differently.
preds <- c("standardized","year","subnum","condition","ans","female")
pred_small <- c("ans")
d.cc <- d[complete.cases(d[,preds]),preds]
d.cc <- ddply(d.cc, .(subnum),
function(x) {
for (p in pred_small) {
x <- lag(x,p)
}
return(x)
})
d.cc <- d.cc[complete.cases(d.cc),]
mod <- lm(standardized ~ factor(year) + condition + female +
ans +
ans.lag,
data=d.cc)
summary(mod)
##
## Call:
## lm(formula = standardized ~ factor(year) + condition + female +
## ans + ans.lag, data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3715 -0.0581 0.0010 0.0661 0.2140
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.35256 0.01757 20.07 < 2e-16 ***
## factor(year)2 0.09109 0.01201 7.58 1.6e-13 ***
## factor(year)3 0.20131 0.01267 15.89 < 2e-16 ***
## conditionabacus 0.01652 0.00792 2.09 0.0374 *
## female 0.02200 0.00820 2.68 0.0075 **
## ans -0.13748 0.06082 -2.26 0.0242 *
## ans.lag -0.06174 0.03750 -1.65 0.1003
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0892 on 504 degrees of freedom
## Multiple R-squared: 0.511, Adjusted R-squared: 0.506
## F-statistic: 87.9 on 6 and 504 DF, p-value: <2e-16
A last attempt at doing this assumes independent year effects (e.g., in case ANS is changing so much that the effect size is not consistent from year to year). Result: these analyses knock out all the predictors - there are too many parameters for the data.
mod <- lm(standardized ~ factor(year) + condition + female +
factor(year) * ans.lag,
data=d.cc)
summary(mod)
##
## Call:
## lm(formula = standardized ~ factor(year) + condition + female +
## factor(year) * ans.lag, data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3659 -0.0569 0.0056 0.0655 0.2130
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32009 0.01936 16.53 < 2e-16 ***
## factor(year)2 0.10925 0.02320 4.71 3.2e-06 ***
## factor(year)3 0.25095 0.02549 9.84 < 2e-16 ***
## conditionabacus 0.01760 0.00792 2.22 0.0266 *
## female 0.02211 0.00822 2.69 0.0074 **
## ans.lag -0.03876 0.04581 -0.85 0.3979
## factor(year)2:ans.lag -0.05914 0.08436 -0.70 0.4836
## factor(year)3:ans.lag -0.27358 0.12163 -2.25 0.0249 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0893 on 503 degrees of freedom
## Multiple R-squared: 0.511, Adjusted R-squared: 0.505
## F-statistic: 75.2 on 7 and 503 DF, p-value: <2e-16
mod <- lm(standardized ~ factor(year) + condition + female +
factor(year) * ans,
data=d.cc)
summary(mod)
##
## Call:
## lm(formula = standardized ~ factor(year) + condition + female +
## factor(year) * ans, data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3761 -0.0552 0.0028 0.0645 0.2148
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.31895 0.01790 17.82 < 2e-16 ***
## factor(year)2 0.12081 0.02466 4.90 1.3e-06 ***
## factor(year)3 0.24163 0.02331 10.37 < 2e-16 ***
## conditionabacus 0.01717 0.00793 2.17 0.0308 *
## female 0.02282 0.00822 2.78 0.0057 **
## ans -0.07713 0.09050 -0.85 0.3945
## factor(year)2:ans -0.11261 0.14470 -0.78 0.4368
## factor(year)3:ans -0.17985 0.13947 -1.29 0.1978
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0894 on 503 degrees of freedom
## Multiple R-squared: 0.51, Adjusted R-squared: 0.504
## F-statistic: 74.9 on 7 and 503 DF, p-value: <2e-16
mod <- lm(standardized ~ factor(year) + condition + female +
factor(year) * ans.lag +
factor(year) * ans,
data=d.cc)
summary(mod)
##
## Call:
## lm(formula = standardized ~ factor(year) + condition + female +
## factor(year) * ans.lag + factor(year) * ans, data = d.cc)
##
## Residuals:
## Min 1Q Median 3Q Max
## -0.3726 -0.0576 0.0024 0.0660 0.2111
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 0.32786 0.02266 14.47 < 2e-16 ***
## factor(year)2 0.12273 0.02978 4.12 4.4e-05 ***
## factor(year)3 0.25343 0.02911 8.71 < 2e-16 ***
## conditionabacus 0.01696 0.00792 2.14 0.033 *
## female 0.02321 0.00823 2.82 0.005 **
## ans.lag -0.03103 0.04709 -0.66 0.510
## ans -0.06274 0.09295 -0.67 0.500
## factor(year)2:ans.lag -0.04879 0.08591 -0.57 0.570
## factor(year)3:ans.lag -0.20737 0.13482 -1.54 0.125
## factor(year)2:ans -0.10480 0.14738 -0.71 0.477
## factor(year)3:ans -0.09193 0.15095 -0.61 0.543
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 0.0892 on 500 degrees of freedom
## Multiple R-squared: 0.516, Adjusted R-squared: 0.506
## F-statistic: 53.2 on 10 and 500 DF, p-value: <2e-16