In-class exercise 6: Edit this rmarkdown template in RStudio to reproduce the following histogram for correlation coefficients between written and course variables by school from the data set Gcsemv{mlmRev}. The two vertical lines indicate averaged correlations over schools and correlation computed over individuals ignoring school label. Which is which?

1 Data management

# One-time setup, if the package is missing: install.packages("mlmRev")
library(mlmRev)
# Bring the Gcsemv data set from mlmRev into the workspace
data("Gcsemv", package = "mlmRev")
# Open the help page describing the data set
help("Gcsemv")
# Print the first six rows
head(Gcsemv, n = 6L)
  school student gender written course
1  20920      16      M      23     NA
2  20920      25      F      NA   71.2
3  20920      27      F      39   76.8
4  20920      31      F      36   87.9
5  20920      42      M      16   44.4
6  20920      62      F      36     NA

2 Summary statistics

# Correlation over all students, ignoring school; pairs with a missing score are dropped
cor(Gcsemv$written, Gcsemv$course, use = "pairwise.complete.obs")
[1] 0.47417
# Mean course and written score per school, skipping missing scores.
# TRUE is spelled out because T is an ordinary variable that can be reassigned.
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm = TRUE))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm = TRUE))
# Correlation between the school means (one observation per school)
cor(course_schavg, written_schavg)
[1] 0.39568

3 Visualization

library(tidyverse)

# Correlation between course and written scores within each school,
# one row per school. summarise() collapses each group directly, so no
# de-duplication step is needed and the result is not left grouped.
dtar <- Gcsemv %>%
  group_by(school) %>%
  summarise(r_sch = cor(course, written, use = "pairwise.complete.obs"),
            .groups = "drop")

# Reference values are computed from the data instead of being typed in by
# hand (the hand-typed 0.47414 did not match the 0.47417 printed earlier).
# r_sch_means: correlation between the school mean scores
#              (course_schavg and written_schavg come from the summary section).
# r_pooled:    correlation over all students, ignoring school.
# NOTE(review): the exercise text speaks of the *average of the per-school
# correlations*, which would be mean(dtar$r_sch, na.rm = TRUE); the original
# plotted the correlation of school means, which is kept here - confirm intent.
r_sch_means <- cor(course_schavg, written_schavg)
r_pooled <- with(Gcsemv, cor(written, course, use = "pairwise.complete.obs"))

ggplot(data = dtar, aes(x = r_sch)) +
  # bins = 30 is the default; stating it silences the stat_bin() message
  geom_histogram(bins = 30, fill = "skyblue") +
  # solid peru line: school-means correlation; dotted red line: pooled
  geom_vline(xintercept = c(r_sch_means, r_pooled),
             colour = c("peru", "red"),
             linetype = c(1, 3)) +
  labs(x = "Estimated correlation coefficients",
       y = "Counts") +
  theme_bw()

We draw a scatter diagram of the written scores against the course scores and add the regression line.

Next we superimpose on the scatter plot the mean school written scores and mean school course scores (in color cyan) and add the regression line based on the mean school scores.

# Student-level scatter plot: written score (y) against course score (x),
# as described in the text and stated by the axis labels. The original
# plotted the school means here, so the overlay below merely repainted
# the same points, and the axes contradicted their labels.
# NOTE(review): the labels call the scores "Standardized"/"Normalized", but
# head(Gcsemv) shows raw-looking values - confirm the wording.
plot(written ~ course, data = Gcsemv,
     bty = "n",
     cex = 0.5,
     xlab = "Standardized course score",
     ylab = "Normalized written score")
grid()
# Regression line fitted to individual students (rows with NA are dropped by lm)
abline(lm(written ~ course, data = Gcsemv))
# Overlay the school means in cyan (col = 5) on the same axes ...
points(course_schavg, written_schavg, pch = 16, col = 5)
# ... together with the regression line fitted to the school means
abline(lm(written_schavg ~ course_schavg), col = 5)

## The end