# Load the required packages and read the data
library(Ecdat)
## Loading required package: Ecfun
##
## Attaching package: 'Ecfun'
## The following object is masked from 'package:base':
##
## sign
##
## Attaching package: 'Ecdat'
## The following object is masked from 'package:datasets':
##
## Orange
library(knitr)
library(dplyr)
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
# Load the Caschool data set from the Ecdat package into the workspace,
# then render its first rows (head() default) as a formatted table.
data("Caschool", package = "Ecdat")
Caschool %>%
  head() %>%
  knitr::kable()
## | distcod | county | district | grspan | enrltot | teachers | calwpct | mealpct | computer | testscr | compstu | expnstu | str | avginc | elpct | readscr | mathscr |
## |---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|---|
## | 75119 | Alameda | Sunol Glen Unified | KK-08 | 195 | 10.90 | 0.5102 | 2.0408 | 67 | 690.80 | 0.3435898 | 6384.911 | 17.88991 | 22.690001 | 0.000000 | 691.6 | 690.0 |
## | 61499 | Butte | Manzanita Elementary | KK-08 | 240 | 11.15 | 15.4167 | 47.9167 | 101 | 661.20 | 0.4208333 | 5099.381 | 21.52466 | 9.824000 | 4.583334 | 660.5 | 661.9 |
## | 61549 | Butte | Thermalito Union Elementary | KK-08 | 1550 | 82.90 | 55.0323 | 76.3226 | 169 | 643.60 | 0.1090323 | 5501.955 | 18.69723 | 8.978000 | 30.000002 | 636.3 | 650.9 |
## | 61457 | Butte | Golden Feather Union Elementary | KK-08 | 243 | 14.00 | 36.4754 | 77.0492 | 85 | 647.70 | 0.3497942 | 7101.831 | 17.35714 | 8.978000 | 0.000000 | 651.9 | 643.5 |
## | 61523 | Butte | Palermo Union Elementary | KK-08 | 1335 | 71.50 | 33.1086 | 78.4270 | 171 | 640.85 | 0.1280899 | 5235.988 | 18.67133 | 9.080333 | 13.857677 | 641.8 | 639.9 |
## | 62042 | Fresno | Burrel Union Elementary | KK-08 | 137 | 6.40 | 12.3188 | 86.9565 | 25 | 605.55 | 0.1824818 | 5580.147 | 21.40625 | 10.415000 | 12.408759 | 605.7 | 605.4 |
# Convert the Caschool data frame to a tibble and inspect its structure.
# The str() output shows 420 obs. of 17 variables, and `county` is a
# factor with 45 levels, i.e. there are 45 counties.
dta <- Ecdat::Caschool %>%
  as_tibble()
str(dta)
## Classes 'tbl_df', 'tbl' and 'data.frame': 420 obs. of 17 variables:
## $ distcod : int 75119 61499 61549 61457 61523 62042 68536 63834 62331 67306 ...
## $ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 2 2 2 6 29 11 6 25 ...
## $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 214 367 132 270 53 152 383 263 94 ...
## $ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 1 ...
## $ enrltot : int 195 240 1550 243 1335 137 195 888 379 2247 ...
## $ teachers: num 10.9 11.1 82.9 14 71.5 ...
## $ calwpct : num 0.51 15.42 55.03 36.48 33.11 ...
## $ mealpct : num 2.04 47.92 76.32 77.05 78.43 ...
## $ computer: int 67 101 169 85 171 25 28 66 35 0 ...
## $ testscr : num 691 661 644 648 641 ...
## $ compstu : num 0.344 0.421 0.109 0.35 0.128 ...
## $ expnstu : num 6385 5099 5502 7102 5236 ...
## $ str : num 17.9 21.5 18.7 17.4 18.7 ...
## $ avginc : num 22.69 9.82 8.98 8.98 9.08 ...
## $ elpct : num 0 4.58 30 0 13.86 ...
## $ readscr : num 692 660 636 652 642 ...
## $ mathscr : num 690 662 651 644 640 ...
# Fix the random seed so the draw below is reproducible, then group the
# data by county and draw one row at random from each county; the result
# is stored as dta1.
# NOTE: dta1 is still grouped by county (no ungroup() is applied here).
set.seed(1234)
dta1 <- sample_n(dplyr::group_by(dta, county), size = 1)
# The sample has 45 obs. of 17 variables. There are 45 counties, so having
# 45 rows confirms that exactly one school was sampled within each county.
str(dta1)
## Classes 'grouped_df', 'tbl_df', 'tbl' and 'data.frame': 45 obs. of 17 variables:
## $ distcod : int 75119 61549 61572 61713 61978 62539 62596 62976 63123 63255 ...
## $ county : Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
## $ district: Factor w/ 409 levels "Ackerman Elementary",..: 362 367 217 178 300 395 181 269 103 37 ...
## $ grspan : Factor w/ 2 levels "KK-06","KK-08": 2 2 2 2 2 2 2 2 2 2 ...
## $ enrltot : int 195 1550 777 3469 2987 314 129 594 6272 1510 ...
## $ teachers: num 10.9 82.9 36.8 172.3 154.2 ...
## $ calwpct : num 0.51 55.03 12.99 0 2.04 ...
## $ mealpct : num 2.041 76.323 39.849 0.173 11.182 ...
## $ computer: int 67 169 148 496 290 8 10 75 1338 141 ...
## $ testscr : num 691 644 657 695 673 ...
## $ compstu : num 0.3436 0.109 0.1905 0.143 0.0971 ...
## $ expnstu : num 6385 5502 5483 5231 4825 ...
## $ str : num 17.9 18.7 21.1 20.1 19.4 ...
## $ avginc : num 22.69 8.98 13.24 34.3 18.73 ...
## $ elpct : num 0 30 1.158 0.894 0.502 ...
## $ readscr : num 692 636 663 698 676 ...
## $ mathscr : num 690 651 650 692 670 ...
## - attr(*, "groups")=Classes 'tbl_df', 'tbl' and 'data.frame': 45 obs. of 2 variables:
## ..$ county: Factor w/ 45 levels "Alameda","Butte",..: 1 2 3 4 5 6 7 8 9 10 ...
## ..$ .rows :List of 45
## .. ..$ : int 1
## .. ..$ : int 2
## .. ..$ : int 3
## .. ..$ : int 4
## .. ..$ : int 5
## .. ..$ : int 6
## .. ..$ : int 7
## .. ..$ : int 8
## .. ..$ : int 9
## .. ..$ : int 10
## .. ..$ : int 11
## .. ..$ : int 12
## .. ..$ : int 13
## .. ..$ : int 14
## .. ..$ : int 15
## .. ..$ : int 16
## .. ..$ : int 17
## .. ..$ : int 18
## .. ..$ : int 19
## .. ..$ : int 20
## .. ..$ : int 21
## .. ..$ : int 22
## .. ..$ : int 23
## .. ..$ : int 24
## .. ..$ : int 25
## .. ..$ : int 26
## .. ..$ : int 27
## .. ..$ : int 28
## .. ..$ : int 29
## .. ..$ : int 30
## .. ..$ : int 31
## .. ..$ : int 32
## .. ..$ : int 33
## .. ..$ : int 34
## .. ..$ : int 35
## .. ..$ : int 36
## .. ..$ : int 37
## .. ..$ : int 38
## .. ..$ : int 39
## .. ..$ : int 40
## .. ..$ : int 41
## .. ..$ : int 42
## .. ..$ : int 43
## .. ..$ : int 44
## .. ..$ : int 45
## ..- attr(*, ".drop")= logi TRUE
library(lattice)
# Scatter plot of average reading score (y) against average math score (x)
# for the sampled rows, with points ("p"), a background grid ("g") and a
# fitted regression line ("r").
# NOTE(review): no `groups` argument is supplied, so `auto.key` presumably
# draws no legend here -- confirm before removing it.
lattice::xyplot(
  readscr ~ mathscr,
  data = dta1,
  type = c("p", "g", "r"),
  xlab = "average math score",
  ylab = "average reading score",
  auto.key = list(columns = 2)
)
