Exercise 1:
Draw a scatter plot of the written and course scores from the data set Gcsemv{mlmRev}, and superimpose on it a second scatter plot of the mean written and mean course scores by school, as in the language and math example. You can start by editing either the gcsemv R Markdown file or the exam R Markdown file in RStudio.
# data management and graphics package
library(mlmRev)
## Loading required package: lme4
## Loading required package: Matrix
# Bring the Gcsemv data set shipped with mlmRev into the workspace
data("Gcsemv", package = "mlmRev")
# Open the help page that documents the data set
help("Gcsemv")
## starting httpd help server ... done
# Preview the first six rows of the data set
head(Gcsemv, n = 6L)
## school student gender written course
## 1 20920 16 M 23 NA
## 2 20920 25 F NA 71.2
## 3 20920 27 F 39 76.8
## 4 20920 31 F 36 87.9
## 5 20920 42 M 16 44.4
## 6 20920 62 F 36 NA
# Student-level correlation between written and course scores, computed from
# the students who have both scores recorded
with(Gcsemv, cor(written, course, use = "pairwise.complete.obs"))
## [1] 0.4741707
# compute the means by school (missing scores are dropped within each school)
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm = TRUE))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm = TRUE))
# School-level correlation: correlation between the two sets of school means
cor(course_schavg, written_schavg)
## [1] 0.3956849
# compute averages by school
#dta_a <- Gcsemv %>%
# group_by(school) %>%
# summarize(ave_course = mean(course, na.rm=TRUE),
# ave_written = mean(written, na.rm=TRUE))
#cor(ave_course, ave_written)
library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --
## √ ggplot2 3.3.2 √ purrr 0.3.4
## √ tibble 3.0.4 √ dplyr 1.0.2
## √ tidyr 1.1.2 √ stringr 1.4.0
## √ readr 1.3.1 √ forcats 0.5.0
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
## x tidyr::pack() masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()
# Attach to every student row the correlation between course and written
# scores within that student's school (pairwise-complete observations), then
# drop the grouping so later operations on dta are not silently done per school.
dta <- Gcsemv %>%
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use = "pairwise.complete.obs")) %>%
  ungroup()
# Keep one within-school correlation per school (taken from each school's
# first row)
dtar <- dta[!duplicated(dta$school), "r_sch"]
# superimpose two plots
# Collect the school means in their own data frame so the second layer maps
# real columns, rather than pairing an unrelated data frame with global
# vectors that merely happen to have a matching length.
sch_means <- data.frame(
  school = names(course_schavg),
  course = as.vector(course_schavg),
  written = as.vector(written_schavg)
)
# Student-level points in a light colour, school means drawn on top in a
# darker one
ggplot(data = Gcsemv, aes(x = course, y = written)) +
  geom_point(color = "lightskyblue1") +
  geom_point(data = sch_means, aes(x = course, y = written),
             color = "steelblue") +
  labs(x = "course score", y = "written score") +
  theme_bw()
## Warning: Removed 382 rows containing missing values (geom_point).
# Reference template from the language and arithmetic example (not run here:
# Arith, Lang and dta_a do not exist in the code shown above).
# ggplot(data=dta, aes(x=Arith, y=Lang)) +
#   geom_point(color="skyblue") +
#   stat_smooth(method="lm", formula=y ~ x, se=F, col="skyblue") +
#   geom_point(data=dta_a, aes(ave_arith, ave_lang), color="steelblue") +
#   stat_smooth(data=dta_a, aes(ave_arith, ave_lang), method="lm",
#               formula= y ~ x, se=F, color="steelblue") +
#   labs(x="Arithmetic score", y="Language score") +
#   theme_bw()