In-class exercise 6: Edit this R Markdown template in RStudio to reproduce the following histogram of the correlation coefficients between the written and course variables, computed separately for each school, from the data set Gcsemv{mlmRev}. The two vertical lines indicate the average of the school-level correlations and the correlation computed over all individuals ignoring the school label. Which is which?
# One-time setup (uncomment if the package is not installed yet):
# install.packages("mlmRev")
library(mlmRev)
# Bring the Gcsemv data set from mlmRev into the workspace
data(Gcsemv, package = "mlmRev")
# Open the help page that documents the data set
help("Gcsemv")
# Show the first six rows
head(Gcsemv, n = 6L)
school student gender written course
1 20920 16 M 23 NA
2 20920 25 F NA 71.2
3 20920 27 F 39 76.8
4 20920 31 F 36 87.9
5 20920 42 M 16 44.4
6 20920 62 F 36 NA
# Correlation over all students, ignoring school; students missing either
# score are dropped pairwise
cor(Gcsemv$written, Gcsemv$course, use = "pairwise.complete.obs")
[1] 0.47417
# Compute the mean course and written scores within each school, ignoring
# missing values. TRUE is spelled out because T is an ordinary variable that
# can be reassigned.
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm = TRUE))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm = TRUE))
# Correlation between the school-level means (one point per school)
cor(course_schavg, written_schavg)
[1] 0.39568
library(tidyverse)
# Attach to every row the correlation between course and written scores
# within that row's school (missing scores are dropped pairwise).
dta <- Gcsemv %>%
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use = "pairwise.complete.obs"))
# Keep a single correlation per school, in order of first appearance.
dtar <- dta %>%
  ungroup() %>%
  distinct(school, r_sch) %>%
  select(r_sch)
# Reference values for the two vertical lines, computed from the data rather
# than typed in by hand:
#   r_sch_avg - average of the school-level correlations
#   r_pooled  - correlation over all students, ignoring the school label
r_sch_avg <- mean(dtar$r_sch, na.rm = TRUE)
r_pooled <- with(Gcsemv, cor(course, written, use = "pairwise.complete.obs"))
# Histogram of the school-level correlations; the solid "peru" line marks
# r_sch_avg and the dotted red line marks r_pooled.
ggplot(data = dtar, aes(x = r_sch)) +
  geom_histogram(bins = 30, fill = "skyblue") +
  geom_vline(xintercept = c(r_sch_avg, r_pooled),
             colour = c("peru", "red"),
             linetype = c(1, 3)) +
  labs(x = "Estimated correlation coefficients",
       y = "Counts") +
  theme_bw()
We draw a scatter diagram of the course scores against the written scores for all students and add the regression line.
Next we superimpose on the scatter plot the school mean written scores and school mean course scores (in cyan) and add the regression line based on the school means.
# Scatter plot of the individual students' scores: course score (y) against
# written score (x). Students missing either score are omitted.
with(Gcsemv, plot(course ~ written,
                  bty = "n",
                  cex = 0.5,
                  xlab = "Written score",
                  ylab = "Course score"))
grid()
# Regression line fitted to the individual-level data
with(Gcsemv, abline(lm(course ~ written)))
# Overlay the school means (colour 5, cyan in the default palette) ...
points(written_schavg, course_schavg, pch = 16, col = 5)
# ... and the regression line fitted to the school means
abline(lm(course_schavg ~ written_schavg), col = 5)
##The end