Exercises 1:

Draw a scatter plot of the written and course scores from the data set Gcsemv (package mlmRev), and superimpose on it a second scatter plot of the mean written and mean course scores by school, as in the language and math example. You can start by editing either the gcsemv or the exam R Markdown file in RStudio.

1 Data management

# data management and graphics package
library(mlmRev)

## Loading required package: lme4

## Loading required package: Matrix

# Bring the Gcsemv data frame from mlmRev into the workspace
data("Gcsemv", package = "mlmRev")

# Open the documentation page for the data set
help("Gcsemv")

## starting httpd help server ... done

# Show the first six rows of the data frame
head(Gcsemv, n = 6L)

##   school student gender written course
## 1  20920      16      M      23     NA
## 2  20920      25      F      NA   71.2
## 3  20920      27      F      39   76.8
## 4  20920      31      F      36   87.9
## 5  20920      42      M      16   44.4
## 6  20920      62      F      36     NA

2 Summary statistics

# Student-level correlation between written and course scores, computed
# from the students for whom both scores are present.
with(Gcsemv, cor(written, course, use = "pairwise.complete.obs"))

## [1] 0.4741707

# compute the means by school, skipping missing scores within each school
course_schavg <- with(Gcsemv, tapply(course, school, mean, na.rm = TRUE))
written_schavg <- with(Gcsemv, tapply(written, school, mean, na.rm = TRUE))

# School-level correlation between the two sets of means. A school whose
# scores are all missing for one component gets a NaN mean; such schools are
# dropped here instead of turning the whole correlation into NA.
cor(course_schavg, written_schavg, use = "pairwise.complete.obs")

## [1] 0.3956849

# compute averages by school
#dta_a <- Gcsemv %>%
       # group_by(school) %>%
       # summarize(ave_course = mean(course, na.rm=TRUE),
              # ave_written = mean(written, na.rm=TRUE))

#cor(ave_course, ave_written)

3 Visualization

library(tidyverse)

## -- Attaching packages --------------------------------------- tidyverse 1.3.0 --

## √ ggplot2 3.3.2     √ purrr   0.3.4
## √ tibble  3.0.4     √ dplyr   1.0.2
## √ tidyr   1.1.2     √ stringr 1.4.0
## √ readr   1.3.1     √ forcats 0.5.0

## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x tidyr::expand() masks Matrix::expand()
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
## x tidyr::pack()   masks Matrix::pack()
## x tidyr::unpack() masks Matrix::unpack()

# Attach to every student row the correlation between course and written
# scores within that student's school, then drop the grouping so later
# operations on dta are not silently carried out per school.
dta <- Gcsemv %>%
  group_by(school) %>%
  mutate(r_sch = cor(course, written, use = "pairwise.complete.obs")) %>%
  ungroup()

# Keep one within-school correlation per school (first row of each school)
dtar <- dta[!duplicated(dta$school), "r_sch"]

4 superimpose two plots

# superimpose two plots
# Gather the school means into their own data frame so the second layer maps
# real columns, rather than free-standing vectors that only happen to have as
# many elements as the layer's data has rows. Both vectors come from tapply()
# over the same school factor, so their elements are in the same school order.
sch_means <- data.frame(course = as.vector(course_schavg),
                        written = as.vector(written_schavg))

# Student scores in light blue; school means on top in darker blue. The
# second layer inherits the x = course, y = written mapping from ggplot().
ggplot(data = Gcsemv, aes(x = course, y = written)) +
  geom_point(color = "lightskyblue1") +
  geom_point(data = sch_means, color = "steelblue") +
  labs(x = "course score", y = "written score") +
  theme_bw()

## Warning: Removed 382 rows containing missing values (geom_point).

5 superimpose two plots

# Same superimposed plot as above, with a least-squares line added for each
# level. The earlier version of this line used typographic quotes (a syntax
# error) and the variable names of the language/arithmetic example; it is
# rewritten here for the Gcsemv course and written scores.

# compute averages by school
dta_a <- Gcsemv %>%
  group_by(school) %>%
  summarize(ave_course = mean(course, na.rm = TRUE),
            ave_written = mean(written, na.rm = TRUE),
            .groups = "drop")

# Students and their fitted line in light blue; school means and their
# fitted line in darker blue.
ggplot(data = Gcsemv, aes(x = course, y = written)) +
  geom_point(color = "skyblue") +
  stat_smooth(method = "lm", formula = y ~ x, se = FALSE,
              color = "skyblue") +
  geom_point(data = dta_a, aes(x = ave_course, y = ave_written),
             color = "steelblue") +
  stat_smooth(data = dta_a, aes(x = ave_course, y = ave_written),
              method = "lm", formula = y ~ x, se = FALSE,
              color = "steelblue") +
  labs(x = "course score", y = "written score") +
  theme_bw()

W2 Exercise 1: Individual correlation vs. grouped correlation

Ching-Fang Wu

Thu Jan 14 13:30:17 2021

1 Data management

2 Summary statistics

3 Visualization

4 superimpose two plots

5 superimpose two plots

6 THE END