Language of Data


> library(openintro)
> library(tidyverse)
> data(hsb2)
> glimpse(hsb2)
Rows: 200
Columns: 11
$ id      <int> 70, 121, 86, 141, 172, 113, 50, 11, 84, 48, 75, 60, 95, 104...
$ gender  <chr> "male", "female", "male", "male", "male", "male", "male", "...
$ race    <chr> "white", "white", "white", "white", "white", "white", "afri...
$ ses     <fct> low, middle, high, high, middle, middle, middle, middle, mi...
$ schtyp  <fct> public, public, public, public, public, public, public, pub...
$ prog    <fct> general, vocational, general, vocational, academic, academi...
$ read    <int> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54, 45,...
$ write   <int> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63, 57,...
$ math    <int> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57, 50,...
$ science <int> 47, 63, 58, 53, 53, 63, 53, 39, 58, 50, 53, 63, 61, 55, 31,...
$ socst   <int> 57, 61, 31, 56, 61, 61, 61, 36, 51, 51, 61, 61, 71, 46, 56,...
> data(email50)
> glimpse(email50)
Rows: 50
Columns: 21
$ spam         <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...
$ to_multiple  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
$ from         <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ cc           <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ sent_email   <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
$ time         <dttm> 2012-01-04 08:19:16, 2012-02-16 15:10:06, 2012-01-04 ...
$ image        <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ attach       <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, ...
$ dollar       <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, 0, 0,...
$ winner       <fct> no, no, no, no, no, no, no, no, no, no, no, no, yes, n...
$ inherit      <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ viagra       <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ password     <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, ...
$ num_char     <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809, 5.2...
$ line_breaks  <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167, 198...
$ format       <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...
$ re_subj      <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
$ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ urgent_subj  <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, 10, 0...
$ number       <fct> small, big, none, small, small, small, small, small, s...
> table(hsb2$schtyp)

 public private 
    168      32 
> hsb2_public <- hsb2 %>% filter(schtyp == "public")
> #private factor is still listed
> table(hsb2_public$schtyp)

 public private 
    168       0 
> #drop unused levels
> hsb2_public$schtyp <- droplevels(hsb2_public$schtyp)
> table(hsb2_public$schtyp)

public 
   168 
> # Subset of emails with big numbers: email50_big
> email50_big <- email50 %>%
+   filter(email50$number=="big")
> # Glimpse the subset
> glimpse(email50_big)
Rows: 7
Columns: 21
$ spam         <dbl> 0, 0, 1, 0, 0, 0, 0
$ to_multiple  <dbl> 0, 0, 0, 0, 0, 0, 0
$ from         <dbl> 1, 1, 1, 1, 1, 1, 1
$ cc           <int> 0, 0, 0, 0, 0, 0, 0
$ sent_email   <dbl> 0, 0, 0, 0, 0, 1, 0
$ time         <dttm> 2012-02-16 15:10:06, 2012-02-04 18:26:09, 2012-01-24 ...
$ image        <dbl> 0, 0, 0, 0, 0, 0, 0
$ attach       <dbl> 0, 0, 0, 0, 0, 0, 0
$ dollar       <dbl> 0, 0, 3, 2, 0, 0, 0
$ winner       <fct> no, no, yes, no, no, no, no
$ inherit      <dbl> 0, 0, 0, 0, 0, 0, 0
$ viagra       <dbl> 0, 0, 0, 0, 0, 0, 0
$ password     <dbl> 0, 2, 0, 0, 0, 0, 8
$ num_char     <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
$ line_breaks  <int> 183, 198, 712, 692, 140, 512, 225
$ format       <dbl> 1, 1, 1, 1, 1, 1, 1
$ re_subj      <dbl> 0, 0, 0, 0, 0, 0, 0
$ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
$ urgent_subj  <dbl> 0, 0, 0, 0, 0, 0, 0
$ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
$ number       <fct> big, big, big, big, big, big, big
> # Subset of emails with big numbers: email50_big
> email50_big <- email50 %>%
+   filter(number == "big")
> 
> # Table of the number variable
> table(email50_big$number)

 none small   big 
    0     0     7 
> # Drop levels
> email50_big$number_dropped <- droplevels(email50_big$number)
> 
> # Table of the number_dropped variable
> table(email50_big$number_dropped)

big 
  7 
> mean(hsb2$read)
[1] 52.23
> avg_read <- mean(hsb2$read)
> (avg_read <- mean(hsb2$read))
[1] 52.23
> # create new variable: read_cat
> hsb2 <- hsb2 %>% mutate(read_cat = ifelse(
+   read < avg_read,"below average",
+   "at or above average"
+ ))
> 
> # Calculate median number of characters: med_num_char
> med_num_char <- median(email50$num_char)
> 
> # Create num_char_cat variable in email50
> email50_fortified <- email50 %>%
+   mutate(num_char_cat = ifelse(num_char < med_num_char, 
+                                "below median", 
+                                "at or above median"))
> 
> # Count emails in each category
> email50_fortified %>%
+   count(num_char_cat)
# A tibble: 2 x 2
  num_char_cat           n
  <chr>              <int>
1 at or above median    25
2 below median          25
> # Create number_yn column in email50
> email50_fortified <- email50 %>%
+   mutate(
+     number_yn = case_when(
+       # if number is "none", make number_yn "no"
+       number == "none" ~ "no", 
+       # if number is not "none", make number_yn "yes"
+       number != "none" ~ "yes"  
+     )
+   )
> # Visualize the distribution of number_yn
> ggplot(email50_fortified, aes(x = number_yn)) +
+   geom_bar()

> #scatterplot of math vs science scores
> ggplot(data = hsb2, aes(x = science, y = math)) + geom_point()

> ggplot(data = hsb2, aes(x = science, y = math, color = prog)) +
+   geom_point()

> # Scatterplot of exclaim_mess vs. num_char
> ggplot(email50, aes(x = num_char, y = exclaim_mess,
+       color = factor(spam))) + geom_point()

Study types and Cautionary Tales


> library(gapminder)
> 
> # Glimpse data
> glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country   <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afgha...
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
$ year      <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 199...
$ lifeExp   <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 4...
$ pop       <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372,...
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.113...
> glimpse(UCBAdmissions)
 'table' num [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
 - attr(*, "dimnames")=List of 3
  ..$ Admit : chr [1:2] "Admitted" "Rejected"
  ..$ Gender: chr [1:2] "Male" "Female"
  ..$ Dept  : chr [1:6] "A" "B" "C" "D" ...
> ucb_admit <- read.csv("C:\\Users\\pbj20\\Documents\\R Documents\\ucba.csv")
> 
> head(ucb_admit)
     Admit Gender Dept
1 Admitted   Male    A
2 Admitted   Male    A
3 Admitted   Male    A
4 Admitted   Male    A
5 Admitted   Male    A
6 Admitted   Male    A
> # Count number of male and female applicants admitted
> ucb_admit %>%
+   count(Gender, Admit)
  Gender    Admit    n
1 Female Admitted  557
2 Female Rejected 1278
3   Male Admitted 1198
4   Male Rejected 1493
> ucb_admission_counts <- ucb_admit %>%
+   count(Gender, Admit)
> 
> ucb_admission_counts %>%
+   # Group by gender
+   group_by(Gender) %>%
+   # Create new variable
+   mutate(prop = n / sum(n)) %>%
+   # Filter for admitted
+   filter(Admit == "Admitted")
# A tibble: 2 x 4
# Groups:   Gender [2]
  Gender Admit        n  prop
  <chr>  <chr>    <int> <dbl>
1 Female Admitted   557 0.304
2 Male   Admitted  1198 0.445
> ucb_admission_counts <- ucb_admit %>%
+   # Counts by department, then gender, then admission status
+   count(Dept, Gender, Admit)
> 
> # See the result
> ucb_admission_counts
   Dept Gender    Admit   n
1     A Female Admitted  89
2     A Female Rejected  19
3     A   Male Admitted 512
4     A   Male Rejected 313
5     B Female Admitted  17
6     B Female Rejected   8
7     B   Male Admitted 353
8     B   Male Rejected 207
9     C Female Admitted 202
10    C Female Rejected 391
11    C   Male Admitted 120
12    C   Male Rejected 205
13    D Female Admitted 131
14    D Female Rejected 244
15    D   Male Admitted 138
16    D   Male Rejected 279
17    E Female Admitted  94
18    E Female Rejected 299
19    E   Male Admitted  53
20    E   Male Rejected 138
21    F Female Admitted  24
22    F Female Rejected 317
23    F   Male Admitted  22
24    F   Male Rejected 351
> ucb_admission_counts  %>%
+   # Group by department, then gender
+   group_by(Dept, Gender) %>%
+   # Create new variable
+   mutate(prop = n/sum(n)) %>%
+   # Filter for male and admitted
+   filter(Gender == "Male", Admit == "Admitted")
# A tibble: 6 x 5
# Groups:   Dept, Gender [6]
  Dept  Gender Admit        n   prop
  <chr> <chr>  <chr>    <int>  <dbl>
1 A     Male   Admitted   512 0.621 
2 B     Male   Admitted   353 0.630 
3 C     Male   Admitted   120 0.369 
4 D     Male   Admitted   138 0.331 
5 E     Male   Admitted    53 0.277 
6 F     Male   Admitted    22 0.0590
> ucb_admission_counts  %>%
+   # Group by department, then gender
+   group_by(Dept, Gender) %>%
+   # Create new variable
+   mutate(prop = n/sum(n))%>%
+   # Filter foradmitted
+   filter(Admit == "Admitted")
# A tibble: 12 x 5
# Groups:   Dept, Gender [12]
   Dept  Gender Admit        n   prop
   <chr> <chr>  <chr>    <int>  <dbl>
 1 A     Female Admitted    89 0.824 
 2 A     Male   Admitted   512 0.621 
 3 B     Female Admitted    17 0.68  
 4 B     Male   Admitted   353 0.630 
 5 C     Female Admitted   202 0.341 
 6 C     Male   Admitted   120 0.369 
 7 D     Female Admitted   131 0.349 
 8 D     Male   Admitted   138 0.331 
 9 E     Female Admitted    94 0.239 
10 E     Male   Admitted    53 0.277 
11 F     Female Admitted    24 0.0704
12 F     Male   Admitted    22 0.0590
> head(county)
# A tibble: 6 x 15
  name  state pop2000 pop2010 pop2017 pop_change poverty homeownership
  <chr> <fct>   <dbl>   <dbl>   <int>      <dbl>   <dbl>         <dbl>
1 Auta~ Alab~   43671   54571   55504       1.48    13.7          77.5
2 Bald~ Alab~  140415  182265  212628       9.19    11.8          76.7
3 Barb~ Alab~   29038   27457   25270      -6.22    27.2          68  
4 Bibb~ Alab~   20826   22915   22668       0.73    15.2          82.9
5 Blou~ Alab~   51024   57322   58013       0.68    15.6          82  
6 Bull~ Alab~   11714   10914   10309      -2.28    28.5          76.9
# ... with 7 more variables: multi_unit <dbl>, unemployment_rate <dbl>,
#   metro <fct>, median_edu <fct>, per_capita_income <dbl>,
#   median_hh_income <int>, smoking_ban <fct>
> #Remove DC
> county_noDC <- county %>% 
+   filter(state != "District of Columbia") %>% 
+   droplevels()
> head(county_noDC)
# A tibble: 6 x 15
  name  state pop2000 pop2010 pop2017 pop_change poverty homeownership
  <chr> <fct>   <dbl>   <dbl>   <int>      <dbl>   <dbl>         <dbl>
1 Auta~ Alab~   43671   54571   55504       1.48    13.7          77.5
2 Bald~ Alab~  140415  182265  212628       9.19    11.8          76.7
3 Barb~ Alab~   29038   27457   25270      -6.22    27.2          68  
4 Bibb~ Alab~   20826   22915   22668       0.73    15.2          82.9
5 Blou~ Alab~   51024   57322   58013       0.68    15.6          82  
6 Bull~ Alab~   11714   10914   10309      -2.28    28.5          76.9
# ... with 7 more variables: multi_unit <dbl>, unemployment_rate <dbl>,
#   metro <fct>, median_edu <fct>, per_capita_income <dbl>,
#   median_hh_income <int>, smoking_ban <fct>
> #simple random sample of 150 counties
> county_srs <- county_noDC %>% sample_n(size=150)
> glimpse(county_srs)
Rows: 150
Columns: 15
$ name              <chr> "Citrus County", "Chouteau County", "Stark County...
$ state             <fct> Florida, Montana, North Dakota, New York, Texas, ...
$ pop2000           <dbl> 118085, 5970, 22636, 51134, 26031, 46611, 10909, ...
$ pop2010           <dbl> 141236, 5813, 24199, 51599, 35096, 45949, 10658, ...
$ pop2017           <int> 145647, 5765, 30209, 51116, 41441, 45778, 11472, ...
$ pop_change        <dbl> 4.88, -1.33, 6.64, -0.25, 10.95, -0.08, 2.33, 3.4...
$ poverty           <dbl> 17.4, 20.3, 8.1, 19.4, 12.9, 9.0, 11.6, 7.3, 17.2...
$ homeownership     <dbl> 84.6, 66.3, 72.7, 72.6, 86.2, 77.9, 74.0, 81.5, 8...
$ multi_unit        <dbl> 5.4, 5.1, 21.9, 16.5, 4.5, 12.0, 10.3, 10.4, 1.7,...
$ unemployment_rate <dbl> 5.78, 3.28, 2.67, 6.07, 6.51, 3.72, 4.06, 3.61, 4...
$ metro             <fct> yes, no, no, no, yes, no, no, yes, no, yes, yes, ...
$ median_edu        <fct> hs_diploma, some_college, some_college, hs_diplom...
$ per_capita_income <dbl> 24463.06, 21777.60, 38659.76, 26415.97, 30891.81,...
$ median_hh_income  <int> 40574, 39577, 77328, 50733, 74368, 59516, 63333, ...
$ smoking_ban       <fct> partial, partial, NA, partial, none, NA, partial,...
> #State distribution of SRS counties
> county_srs %>% 
+   group_by(state) %>% 
+   count()
# A tibble: 41 x 2
# Groups:   state [41]
   state          n
   <fct>      <int>
 1 Alabama        1
 2 Arizona        1
 3 Arkansas       3
 4 California     2
 5 Florida        3
 6 Georgia       10
 7 Idaho          2
 8 Illinois       4
 9 Indiana        5
10 Iowa           5
# ... with 31 more rows
> # Stratified sample of 150 counties, each state is a stratum
> county_str <- county_noDC %>% 
+   group_by(state) %>% 
+   sample_n(size=3)
> glimpse(county_str)
Rows: 150
Columns: 15
Groups: state [50]
$ name              <chr> "Jackson County", "Coosa County", "Jefferson Coun...
$ state             <fct> Alabama, Alabama, Alabama, Alaska, Alaska, Alaska...
$ pop2000           <dbl> 53926, 12202, 662047, 10195, 2392, 7385, 843746, ...
$ pop2010           <dbl> 53227, 11539, 658466, 9636, 2508, 9430, 980263, 1...
$ pop2017           <int> 51909, 10754, 659197, 9278, 2526, 9782, 1022769, ...
$ pop_change        <dbl> -1.92, -4.43, 0.07, -5.06, -1.29, 0.06, 2.82, 3.1...
$ poverty           <dbl> 19.0, 14.4, 17.6, 7.4, 8.4, 10.2, 18.3, 21.0, 15....
$ homeownership     <dbl> 76.6, 83.7, 66.8, 71.8, 74.5, 48.3, 64.6, 61.2, 6...
$ multi_unit        <dbl> 5.8, 1.9, 24.0, 17.6, 13.2, 24.9, 22.9, 18.9, 25....
$ unemployment_rate <dbl> 4.77, 4.62, 4.24, 7.96, 9.09, 7.44, 4.48, 5.55, 4...
$ metro             <fct> no, no, yes, no, no, no, yes, yes, yes, no, yes, ...
$ median_edu        <fct> hs_diploma, hs_diploma, some_college, some_colleg...
$ per_capita_income <dbl> 20061.42, 19147.01, 29259.76, 30746.27, 37141.41,...
$ median_hh_income  <int> 39281, 34792, 49321, 86019, 70640, 77266, 48676, ...
$ smoking_ban       <fct> NA, none, none, none, none, none, NA, none, parti...

Sampling Strategies and Experimental Design


> us_regions <- read.csv("C:\\Users\\pbj20\\Documents\\R Documents\\regions.csv")
> 
> # Simple random sample: states_srs
> states_srs <- us_regions %>%
+   sample_n(8)
> 
> # Count states by region
> states_srs %>%
+   count(region)
     region n
1   Midwest 3
2 Northeast 2
3     South 2
4      West 1
> # Stratified sample
> states_str <- us_regions %>%
+   group_by(region) %>%
+   sample_n(2)
> 
> # Count states by region
> states_str %>%
+   count()
# A tibble: 4 x 2
# Groups:   region [4]
  region        n
  <chr>     <int>
1 Midwest       2
2 Northeast     2
3 South         2
4 West          2

Case Study


> head(evals)
# A tibble: 6 x 23
  course_id prof_id score rank  ethnicity gender language   age cls_perc_eval
      <int>   <int> <dbl> <fct> <fct>     <fct>  <fct>    <int>         <dbl>
1         1       1   4.7 tenu~ minority  female english     36          55.8
2         2       1   4.1 tenu~ minority  female english     36          68.8
3         3       1   3.9 tenu~ minority  female english     36          60.8
4         4       1   4.8 tenu~ minority  female english     36          62.6
5         5       2   4.6 tenu~ not mino~ male   english     59          85  
6         6       2   4.3 tenu~ not mino~ male   english     59          87.5
# ... with 14 more variables: cls_did_eval <int>, cls_students <int>,
#   cls_level <fct>, cls_profs <fct>, cls_credits <fct>, bty_f1lower <int>,
#   bty_f1upper <int>, bty_f2upper <int>, bty_m1lower <int>, bty_m1upper <int>,
#   bty_m2upper <int>, bty_avg <dbl>, pic_outfit <fct>, pic_color <fct>
> # Inspect variable types
> glimpse(evals)
Rows: 463
Columns: 23
$ course_id     <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
$ prof_id       <int> 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,...
$ score         <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8...
$ rank          <fct> tenure track, tenure track, tenure track, tenure trac...
$ ethnicity     <fct> minority, minority, minority, minority, not minority,...
$ gender        <fct> female, female, female, female, male, male, male, mal...
$ language      <fct> english, english, english, english, english, english,...
$ age           <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 4...
$ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000, 87....
$ cls_did_eval  <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, ...
$ cls_students  <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 2...
$ cls_level     <fct> upper, upper, upper, upper, upper, upper, upper, uppe...
$ cls_profs     <fct> single, single, single, single, multiple, multiple, m...
$ cls_credits   <fct> multi credit, multi credit, multi credit, multi credi...
$ bty_f1lower   <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 7,...
$ bty_f1upper   <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 9,...
$ bty_f2upper   <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 9,...
$ bty_m1lower   <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 7,...
$ bty_m1upper   <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6,...
$ bty_m2upper   <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 6,...
$ bty_avg       <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.33...
$ pic_outfit    <fct> not formal, not formal, not formal, not formal, not f...
$ pic_color     <fct> color, color, color, color, color, color, color, colo...
> # Remove non-factor variables from the vector below
> cat_vars <- c("rank", "ethnicity", "gender", "language",
+               "cls_level", "cls_profs", "cls_credits",
+                "pic_outfit", "pic_color")
> 
> # Recode cls_students as cls_type
> evals_fortified <- evals %>%
+   mutate(
+     cls_type = case_when(
+       cls_students <=18 ~ "small",
+       cls_students <=59 ~ "midsize",
+       cls_students >59  ~ "large"
+     )
+   )
> # Scatterplot of score vs. bty_avg
> ggplot(evals, aes(x=bty_avg, y=score)) +
+   geom_point()

> # Scatterplot of score vs. bty_avg colored by cls_type
> ggplot(evals_fortified, aes(x=bty_avg, y=score, color=cls_type)) +
+   geom_point()