#read the data
library(Ecdat)
## Loading required package: Ecfun
## 
## Attaching package: 'Ecfun'
## The following object is masked from 'package:base':
## 
##     sign
## 
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
## 
##     Orange
library(knitr)
library(dplyr)
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
# Load the Caschool data set from the Ecdat package into the workspace,
# then render its first rows as a table.
data("Caschool", package = "Ecdat")
Caschool %>%
  head() %>%
  knitr::kable()
## distcod county district grspan enrltot teachers calwpct mealpct computer testscr compstu expnstu str avginc elpct readscr mathscr
## 75119 Alameda Sunol Glen Unified KK-08 195 10.90 0.5102 2.0408 67 690.80 0.3435898 6384.911 17.88991 22.690001 0.000000 691.6 690.0
## 61499 Butte Manzanita Elementary KK-08 240 11.15 15.4167 47.9167 101 661.20 0.4208333 5099.381 21.52466 9.824000 4.583334 660.5 661.9
## 61549 Butte Thermalito Union Elementary KK-08 1550 82.90 55.0323 76.3226 169 643.60 0.1090323 5501.955 18.69723 8.978000 30.000002 636.3 650.9
## 61457 Butte Golden Feather Union Elementary KK-08 243 14.00 36.4754 77.0492 85 647.70 0.3497942 7101.831 17.35714 8.978000 0.000000 651.9 643.5
## 61523 Butte Palermo Union Elementary KK-08 1335 71.50 33.1086 78.4270 171 640.85 0.1280899 5235.988 18.67133 9.080333 13.857677 641.8 639.9
## 62042 Fresno Burrel Union Elementary KK-08 137 6.40 12.3188 86.9565 25 605.55 0.1824818 5580.147 21.40625 10.415000 12.408759 605.7 605.4
# Convert the Ecdat::Caschool data frame to a tibble and inspect its structure.
# What the str() output shows:
#   1. 420 observations of 17 variables.
#   2. `county` is a factor with 45 levels, i.e. 45 counties.
dta <- Ecdat::Caschool %>%
  dplyr::as_tibble()
utils::str(dta)
## Classes 'tbl_df', 'tbl' and 'data.frame':    420 obs. of  17 variables:
##  $ distcod : int  75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
##  $ enrltot : int  195 240 1550 243 1335 137 195 888 379 2247 ...
##  $ teachers: num  10.9 11.1 82.9 14 71.5 ...
##  $ calwpct : num  0.51 15.42 55.03 36.48 33.11 ...
##  $ mealpct : num  2.04 47.92 76.32 77.05 78.43 ...
##  $ computer: int  67 101 169 85 171 25 28 66 35 0 ...
##  $ testscr : num  691 661 644 648 641 ...
##  $ compstu : num  0.344 0.421 0.109 0.35 0.128 ...
##  $ expnstu : num  6385 5099 5502 7102 5236 ...
##  $ str     : num  17.9 21.5 18.7 17.4 18.7 ...
##  $ avginc  : num  22.69 9.82 8.98 8.98 9.08 ...
##  $ elpct   : num  0 4.58 30 0 13.86 ...
##  $ readscr : num  692 660 636 652 642 ...
##  $ mathscr : num  690 662 651 644 640 ...
# Draw one row at random from every county and store the result in dta1.
# 1. Fix the RNG seed so the sample is reproducible.
# 2. Group the data by county and sample one row per group.
# 3. Drop the grouping again: without ungroup() dta1 would stay a grouped_df
#    and every later dplyr verb would silently run per (one-row) county group.
set.seed(1234)
dta1 <- dta %>%
  dplyr::group_by(county) %>%
  dplyr::sample_n(1) %>%
  dplyr::ungroup()
# dta1 has 45 obs. of 17 variables. There are 45 counties and 45 rows, so
# exactly one school was sampled within each county.
str(dta1)
## Classes 'tbl_df', 'tbl' and 'data.frame':  45 obs. of  17 variables:
##  $ distcod : int  75119 61549 61572 61713 61978 62539 62596 62976 63123 63255 ...
##  $ county  : Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
##  $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 367 217 178 300 395 181 269 103 37 ...
##  $ grspan  : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 2 ...
##  $ enrltot : int  195 1550 777 3469 2987 314 129 594 6272 1510 ...
##  $ teachers: num  10.9 82.9 36.8 172.3 154.2 ...
##  $ calwpct : num  0.51 55.03 12.99 0 2.04 ...
##  $ mealpct : num  2.041 76.323 39.849 0.173 11.182 ...
##  $ computer: int  67 169 148 496 290 8 10 75 1338 141 ...
##  $ testscr : num  691 644 657 695 673 ...
##  $ compstu : num  0.3436 0.109 0.1905 0.143 0.0971 ...
##  $ expnstu : num  6385 5502 5483 5231 4825 ...
##  $ str     : num  17.9 18.7 21.1 20.1 19.4 ...
##  $ avginc  : num  22.69 8.98 13.24 34.3 18.73 ...
##  $ elpct   : num  0 30 1.158 0.894 0.502 ...
##  $ readscr : num  692 636 663 698 676 ...
##  $ mathscr : num  690 651 650 692 670 ...
library(lattice)
# Scatter plot of the sampled rows: average reading score (y axis) against
# average math score (x axis). `type` requests points ("p"), a background
# grid ("g") and a fitted regression line ("r").
# NOTE(review): auto.key appears to have no visible effect here because no
# `groups` argument is supplied -- confirm before removing it.
lattice::xyplot(
  readscr ~ mathscr,
  data = dta1,
  type = c("p", "g", "r"),
  xlab = "average math score",
  ylab = "average reading score",
  auto.key = list(columns = 2)
)