Scatter diagram of average math scores against average reading scores for California schools

#read the data
library(Ecdat)

## Loading required package: Ecfun

## 
## Attaching package: 'Ecfun'

## The following object is masked from 'package:base':
## 
##     sign

## 
## Attaching package: 'Ecdat'

## The following object is masked from 'package:datasets':
## 
##     Orange

library(knitr)
library(dplyr)

## 
## Attaching package: 'dplyr'

## The following objects are masked from 'package:stats':
## 
##     filter, lag

## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union

# Load the Caschool data set shipped with the Ecdat package into the
# workspace, then render its first six rows as a table.
data("Caschool", package = "Ecdat")
Caschool %>%
  head() %>%
  knitr::kable()

distcod	county	district	grspan	enrltot	teachers	calwpct	mealpct	computer	testscr	compstu	expnstu	str	avginc	elpct	readscr	mathscr
75119	Alameda	Sunol Glen Unified	KK-08	195	10.90	0.5102	2.0408	67	690.80	0.3435898	6384.911	17.88991	22.690001	0.000000	691.6	690.0
61499	Butte	Manzanita Elementary	KK-08	240	11.15	15.4167	47.9167	101	661.20	0.4208333	5099.381	21.52466	9.824000	4.583334	660.5	661.9
61549	Butte	Thermalito Union Elementary	KK-08	1550	82.90	55.0323	76.3226	169	643.60	0.1090323	5501.955	18.69723	8.978000	30.000002	636.3	650.9
61457	Butte	Golden Feather Union Elementary	KK-08	243	14.00	36.4754	77.0492	85	647.70	0.3497942	7101.831	17.35714	8.978000	0.000000	651.9	643.5
61523	Butte	Palermo Union Elementary	KK-08	1335	71.50	33.1086	78.4270	171	640.85	0.1280899	5235.988	18.67133	9.080333	13.857677	641.8	639.9
62042	Fresno	Burrel Union Elementary	KK-08	137	6.40	12.3188	86.9565	25	605.55	0.1824818	5580.147	21.40625	10.415000	12.408759	605.7	605.4

# Convert the Caschool data frame to a tibble and inspect its structure.
# The str() output shows 420 observations of 17 variables, and `county`
# is a factor with 45 levels, i.e. the data cover 45 counties.
dta <- Ecdat::Caschool %>%
  as_tibble()
str(dta)

## Classes 'tbl_df', 'tbl' and 'data.frame':    420 obs. of  17 variables:
##  $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers: num  10.9 11.1 82.9 14 71.5 ...
##  $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
##  $ testscr : num  691 661 644 648 641 ...
##  $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
##  $ expnstu : num  6385 5099 5502 7102 5236 ...
##  $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
##  $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ elpct   : num  0 4.58 30 0 13.86 ...
##  $ readscr : num  692 660 636 652 642 ...
##  $ mathscr : num  690 662 651 644 640 ...

# Draw one school per county, reproducibly.
# Fix the RNG seed first so the same schools are drawn on every run.
set.seed(1234)
# Group the data by county and sample a single row from each group.
# Note that the result, dta1, is still grouped by county.
dta1 <- dta %>%
  dplyr::group_by(county) %>%
  sample_n(size = 1)
# The data contain 45 counties and dta1 has 45 observations of 17 variables,
# so exactly one school was sampled from each county.
str(dta1)

## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame':  45 obs. of  17 variables:
##  $ distcod : int  75119 61549 61572 61713 61978 62539 62596 62976 63123 63255 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 367 217 178 300 395 181 269 103 37 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 2 ...
##  $ enrltot : int  195 1550 777 3469 2987 314 129 594 6272 1510 ...
##  $ teachers: num  10.9 82.9 36.8 172.3 154.2 ...
##  $ calwpct : num  0.51 55.03 12.99 0 2.04 ...
##  $ mealpct : num  2.041 76.323 39.849 0.173 11.182 ...
##  $ computer: int  67 169 148 496 290 8 10 75 1338 141 ...
##  $ testscr : num  691 644 657 695 673 ...
##  $ compstu : num  0.3436 0.109 0.1905 0.143 0.0971 ...
##  $ expnstu : num  6385 5502 5483 5231 4825 ...
##  $ str     : num  17.9 18.7 21.1 20.1 19.4 ...
##  $ avginc  : num  22.69 8.98 13.24 34.3 18.73 ...
##  $ elpct   : num  0 30 1.158 0.894 0.502 ...
##  $ readscr : num  692 636 663 698 676 ...
##  $ mathscr : num  690 651 650 692 670 ...
##  - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame':   45 obs. of  2 variables:
##   ..$ county: Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
##   ..$ .rows :List of 45
##   .. ..$ : int 1
##   .. ..$ : int 2
##   .. ..$ : int 3
##   .. ..$ : int 4
##   .. ..$ : int 5
##   .. ..$ : int 6
##   .. ..$ : int 7
##   .. ..$ : int 8
##   .. ..$ : int 9
##   .. ..$ : int 10
##   .. ..$ : int 11
##   .. ..$ : int 12
##   .. ..$ : int 13
##   .. ..$ : int 14
##   .. ..$ : int 15
##   .. ..$ : int 16
##   .. ..$ : int 17
##   .. ..$ : int 18
##   .. ..$ : int 19
##   .. ..$ : int 20
##   .. ..$ : int 21
##   .. ..$ : int 22
##   .. ..$ : int 23
##   .. ..$ : int 24
##   .. ..$ : int 25
##   .. ..$ : int 26
##   .. ..$ : int 27
##   .. ..$ : int 28
##   .. ..$ : int 29
##   .. ..$ : int 30
##   .. ..$ : int 31
##   .. ..$ : int 32
##   .. ..$ : int 33
##   .. ..$ : int 34
##   .. ..$ : int 35
##   .. ..$ : int 36
##   .. ..$ : int 37
##   .. ..$ : int 38
##   .. ..$ : int 39
##   .. ..$ : int 40
##   .. ..$ : int 41
##   .. ..$ : int 42
##   .. ..$ : int 43
##   .. ..$ : int 44
##   .. ..$ : int 45
##   ..- attr(*, ".drop")= logi TRUE

library(lattice)
# Scatter diagram of average math score (vertical axis) against average
# reading score (horizontal axis) for the schools sampled into dta1.
# A lattice formula reads y ~ x, so mathscr must be on the left-hand side
# to plot math *against* reading; the axis labels follow the same order.
# type: "p" draws the points, "g" a reference grid, "r" a regression line
# (here the regression of math score on reading score).
# No `groups` argument is used, so no legend (auto.key) is requested.
lattice::xyplot(
  mathscr ~ readscr,
  data = dta1,
  type = c("p", "g", "r"),
  xlab = "average reading score",
  ylab = "average math score"
)