Language of Data
> library(openintro)
> library(tidyverse)
> data(hsb2)
> glimpse(hsb2)
Rows: 200
Columns: 11
$ id <int> 70, 121, 86, 141, 172, 113, 50, 11, 84, 48, 75, 60, 95, 104...
$ gender <chr> "male", "female", "male", "male", "male", "male", "male", "...
$ race <chr> "white", "white", "white", "white", "white", "white", "afri...
$ ses <fct> low, middle, high, high, middle, middle, middle, middle, mi...
$ schtyp <fct> public, public, public, public, public, public, public, pub...
$ prog <fct> general, vocational, general, vocational, academic, academi...
$ read <int> 57, 68, 44, 63, 47, 44, 50, 34, 63, 57, 60, 57, 73, 54, 45,...
$ write <int> 52, 59, 33, 44, 52, 52, 59, 46, 57, 55, 46, 65, 60, 63, 57,...
$ math <int> 41, 53, 54, 47, 57, 51, 42, 45, 54, 52, 51, 51, 71, 57, 50,...
$ science <int> 47, 63, 58, 53, 53, 63, 53, 39, 58, 50, 53, 63, 61, 55, 31,...
$ socst <int> 57, 61, 31, 56, 61, 61, 61, 36, 51, 51, 61, 61, 71, 46, 56,...
> data(email50)
> glimpse(email50)
Rows: 50
Columns: 21
$ spam <dbl> 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 1, ...
$ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, ...
$ from <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, ...
$ cc <int> 0, 0, 4, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ sent_email <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
$ time <dttm> 2012-01-04 08:19:16, 2012-02-16 15:10:06, 2012-01-04 ...
$ image <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ attach <dbl> 0, 0, 2, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 1, ...
$ dollar <dbl> 0, 0, 0, 0, 9, 0, 0, 0, 0, 23, 4, 0, 3, 2, 0, 0, 0, 0,...
$ winner <fct> no, no, no, no, no, no, no, no, no, no, no, no, yes, n...
$ inherit <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ viagra <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ password <dbl> 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 2, 0, 0, 0, 0, 0, 0, ...
$ num_char <dbl> 21.705, 7.011, 0.631, 2.454, 41.623, 0.057, 0.809, 5.2...
$ line_breaks <int> 551, 183, 28, 61, 1088, 5, 17, 88, 242, 578, 1167, 198...
$ format <dbl> 1, 1, 0, 0, 1, 0, 0, 1, 1, 1, 1, 1, 1, 0, 1, 1, 1, 0, ...
$ re_subj <dbl> 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, 0, 0, 0, 0, 0, 1, 1, 0, ...
$ exclaim_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 1, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, ...
$ exclaim_mess <dbl> 8, 1, 2, 1, 43, 0, 0, 2, 22, 3, 13, 1, 2, 2, 21, 10, 0...
$ number <fct> small, big, none, small, small, small, small, small, s...
public private
168 32
> hsb2_public <- hsb2 %>% filter(schtyp == "public")
> #private factor is still listed
> table(hsb2_public$schtyp)
public private
168 0
> #drop unused levels
> hsb2_public$schtyp <- droplevels(hsb2_public$schtyp)
> table(hsb2_public$schtyp)
public
168
> # Subset of emails with big numbers: email50_big
> email50_big <- email50 %>%
+ filter(email50$number=="big")
> # Glimpse the subset
> glimpse(email50_big)
Rows: 7
Columns: 21
$ spam <dbl> 0, 0, 1, 0, 0, 0, 0
$ to_multiple <dbl> 0, 0, 0, 0, 0, 0, 0
$ from <dbl> 1, 1, 1, 1, 1, 1, 1
$ cc <int> 0, 0, 0, 0, 0, 0, 0
$ sent_email <dbl> 0, 0, 0, 0, 0, 1, 0
$ time <dttm> 2012-02-16 15:10:06, 2012-02-04 18:26:09, 2012-01-24 ...
$ image <dbl> 0, 0, 0, 0, 0, 0, 0
$ attach <dbl> 0, 0, 0, 0, 0, 0, 0
$ dollar <dbl> 0, 0, 3, 2, 0, 0, 0
$ winner <fct> no, no, yes, no, no, no, no
$ inherit <dbl> 0, 0, 0, 0, 0, 0, 0
$ viagra <dbl> 0, 0, 0, 0, 0, 0, 0
$ password <dbl> 0, 2, 0, 0, 0, 0, 8
$ num_char <dbl> 7.011, 10.368, 42.793, 26.520, 6.563, 11.223, 10.613
$ line_breaks <int> 183, 198, 712, 692, 140, 512, 225
$ format <dbl> 1, 1, 1, 1, 1, 1, 1
$ re_subj <dbl> 0, 0, 0, 0, 0, 0, 0
$ exclaim_subj <dbl> 0, 0, 0, 1, 0, 0, 0
$ urgent_subj <dbl> 0, 0, 0, 0, 0, 0, 0
$ exclaim_mess <dbl> 1, 1, 2, 7, 2, 9, 9
$ number <fct> big, big, big, big, big, big, big
> # Subset of emails with big numbers: email50_big
> email50_big <- email50 %>%
+ filter(number == "big")
>
> # Table of the number variable
> table(email50_big$number)
none small big
0 0 7
> # Drop levels
> email50_big$number_dropped <- droplevels(email50_big$number)
>
> # Table of the number_dropped variable
> table(email50_big$number_dropped)
big
7
[1] 52.23
> avg_read <- mean(hsb2$read)
> (avg_read <- mean(hsb2$read))
[1] 52.23
> # create new variable: read_cat
> hsb2 <- hsb2 %>% mutate(read_cat = ifelse(
+ read < avg_read,"below average",
+ "at or above average"
+ ))
>
> # Calculate median number of characters: med_num_char
> med_num_char <- median(email50$num_char)
>
> # Create num_char_cat variable in email50
> email50_fortified <- email50 %>%
+ mutate(num_char_cat = ifelse(num_char < med_num_char,
+ "below median",
+ "at or above median"))
>
> # Count emails in each category
> email50_fortified %>%
+ count(num_char_cat)
# A tibble: 2 x 2
num_char_cat n
<chr> <int>
1 at or above median 25
2 below median 25
> # Create number_yn column in email50
> email50_fortified <- email50 %>%
+ mutate(
+ number_yn = case_when(
+ # if number is "none", make number_yn "no"
+ number == "none" ~ "no",
+ # if number is not "none", make number_yn "yes"
+ number != "none" ~ "yes"
+ )
+ )
> # Visualize the distribution of number_yn
> ggplot(email50_fortified, aes(x = number_yn)) +
+ geom_bar()

> #scatterplot of math vs science scores
> ggplot(data = hsb2, aes(x = science, y = math)) + geom_point()

> ggplot(data = hsb2, aes(x = science, y = math, color = prog)) +
+ geom_point()

> # Scatterplot of exclaim_mess vs. num_char
> ggplot(email50, aes(x = num_char, y = exclaim_mess,
+ color = factor(spam))) + geom_point()

Study types and Cautionary Tales
> library(gapminder)
>
> # Glimpse data
> glimpse(gapminder)
Rows: 1,704
Columns: 6
$ country <fct> Afghanistan, Afghanistan, Afghanistan, Afghanistan, Afgha...
$ continent <fct> Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asia, Asi...
$ year <int> 1952, 1957, 1962, 1967, 1972, 1977, 1982, 1987, 1992, 199...
$ lifeExp <dbl> 28.801, 30.332, 31.997, 34.020, 36.088, 38.438, 39.854, 4...
$ pop <int> 8425333, 9240934, 10267083, 11537966, 13079460, 14880372,...
$ gdpPercap <dbl> 779.4453, 820.8530, 853.1007, 836.1971, 739.9811, 786.113...
'table' num [1:2, 1:2, 1:6] 512 313 89 19 353 207 17 8 120 205 ...
- attr(*, "dimnames")=List of 3
..$ Admit : chr [1:2] "Admitted" "Rejected"
..$ Gender: chr [1:2] "Male" "Female"
..$ Dept : chr [1:6] "A" "B" "C" "D" ...
> ucb_admit <- read.csv("C:\\Users\\pbj20\\Documents\\R Documents\\ucba.csv")
>
> head(ucb_admit)
Admit Gender Dept
1 Admitted Male A
2 Admitted Male A
3 Admitted Male A
4 Admitted Male A
5 Admitted Male A
6 Admitted Male A
> # Count number of male and female applicants admitted
> ucb_admit %>%
+ count(Gender, Admit)
Gender Admit n
1 Female Admitted 557
2 Female Rejected 1278
3 Male Admitted 1198
4 Male Rejected 1493
> ucb_admission_counts <- ucb_admit %>%
+ count(Gender, Admit)
>
> ucb_admission_counts %>%
+ # Group by gender
+ group_by(Gender) %>%
+ # Create new variable
+ mutate(prop = n / sum(n)) %>%
+ # Filter for admitted
+ filter(Admit == "Admitted")
# A tibble: 2 x 4
# Groups: Gender [2]
Gender Admit n prop
<chr> <chr> <int> <dbl>
1 Female Admitted 557 0.304
2 Male Admitted 1198 0.445
> ucb_admission_counts <- ucb_admit %>%
+ # Counts by department, then gender, then admission status
+ count(Dept, Gender, Admit)
>
> # See the result
> ucb_admission_counts
Dept Gender Admit n
1 A Female Admitted 89
2 A Female Rejected 19
3 A Male Admitted 512
4 A Male Rejected 313
5 B Female Admitted 17
6 B Female Rejected 8
7 B Male Admitted 353
8 B Male Rejected 207
9 C Female Admitted 202
10 C Female Rejected 391
11 C Male Admitted 120
12 C Male Rejected 205
13 D Female Admitted 131
14 D Female Rejected 244
15 D Male Admitted 138
16 D Male Rejected 279
17 E Female Admitted 94
18 E Female Rejected 299
19 E Male Admitted 53
20 E Male Rejected 138
21 F Female Admitted 24
22 F Female Rejected 317
23 F Male Admitted 22
24 F Male Rejected 351
> ucb_admission_counts %>%
+ # Group by department, then gender
+ group_by(Dept, Gender) %>%
+ # Create new variable
+ mutate(prop = n/sum(n)) %>%
+ # Filter for male and admitted
+ filter(Gender == "Male", Admit == "Admitted")
# A tibble: 6 x 5
# Groups: Dept, Gender [6]
Dept Gender Admit n prop
<chr> <chr> <chr> <int> <dbl>
1 A Male Admitted 512 0.621
2 B Male Admitted 353 0.630
3 C Male Admitted 120 0.369
4 D Male Admitted 138 0.331
5 E Male Admitted 53 0.277
6 F Male Admitted 22 0.0590
> ucb_admission_counts %>%
+ # Group by department, then gender
+ group_by(Dept, Gender) %>%
+ # Create new variable
+ mutate(prop = n/sum(n))%>%
+ # Filter foradmitted
+ filter(Admit == "Admitted")
# A tibble: 12 x 5
# Groups: Dept, Gender [12]
Dept Gender Admit n prop
<chr> <chr> <chr> <int> <dbl>
1 A Female Admitted 89 0.824
2 A Male Admitted 512 0.621
3 B Female Admitted 17 0.68
4 B Male Admitted 353 0.630
5 C Female Admitted 202 0.341
6 C Male Admitted 120 0.369
7 D Female Admitted 131 0.349
8 D Male Admitted 138 0.331
9 E Female Admitted 94 0.239
10 E Male Admitted 53 0.277
11 F Female Admitted 24 0.0704
12 F Male Admitted 22 0.0590
# A tibble: 6 x 15
name state pop2000 pop2010 pop2017 pop_change poverty homeownership
<chr> <fct> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 Auta~ Alab~ 43671 54571 55504 1.48 13.7 77.5
2 Bald~ Alab~ 140415 182265 212628 9.19 11.8 76.7
3 Barb~ Alab~ 29038 27457 25270 -6.22 27.2 68
4 Bibb~ Alab~ 20826 22915 22668 0.73 15.2 82.9
5 Blou~ Alab~ 51024 57322 58013 0.68 15.6 82
6 Bull~ Alab~ 11714 10914 10309 -2.28 28.5 76.9
# ... with 7 more variables: multi_unit <dbl>, unemployment_rate <dbl>,
# metro <fct>, median_edu <fct>, per_capita_income <dbl>,
# median_hh_income <int>, smoking_ban <fct>
> #Remove DC
> county_noDC <- county %>%
+ filter(state != "District of Columbia") %>%
+ droplevels()
# A tibble: 6 x 15
name state pop2000 pop2010 pop2017 pop_change poverty homeownership
<chr> <fct> <dbl> <dbl> <int> <dbl> <dbl> <dbl>
1 Auta~ Alab~ 43671 54571 55504 1.48 13.7 77.5
2 Bald~ Alab~ 140415 182265 212628 9.19 11.8 76.7
3 Barb~ Alab~ 29038 27457 25270 -6.22 27.2 68
4 Bibb~ Alab~ 20826 22915 22668 0.73 15.2 82.9
5 Blou~ Alab~ 51024 57322 58013 0.68 15.6 82
6 Bull~ Alab~ 11714 10914 10309 -2.28 28.5 76.9
# ... with 7 more variables: multi_unit <dbl>, unemployment_rate <dbl>,
# metro <fct>, median_edu <fct>, per_capita_income <dbl>,
# median_hh_income <int>, smoking_ban <fct>
> #simple random sample of 150 counties
> county_srs <- county_noDC %>% sample_n(size=150)
Rows: 150
Columns: 15
$ name <chr> "Citrus County", "Chouteau County", "Stark County...
$ state <fct> Florida, Montana, North Dakota, New York, Texas, ...
$ pop2000 <dbl> 118085, 5970, 22636, 51134, 26031, 46611, 10909, ...
$ pop2010 <dbl> 141236, 5813, 24199, 51599, 35096, 45949, 10658, ...
$ pop2017 <int> 145647, 5765, 30209, 51116, 41441, 45778, 11472, ...
$ pop_change <dbl> 4.88, -1.33, 6.64, -0.25, 10.95, -0.08, 2.33, 3.4...
$ poverty <dbl> 17.4, 20.3, 8.1, 19.4, 12.9, 9.0, 11.6, 7.3, 17.2...
$ homeownership <dbl> 84.6, 66.3, 72.7, 72.6, 86.2, 77.9, 74.0, 81.5, 8...
$ multi_unit <dbl> 5.4, 5.1, 21.9, 16.5, 4.5, 12.0, 10.3, 10.4, 1.7,...
$ unemployment_rate <dbl> 5.78, 3.28, 2.67, 6.07, 6.51, 3.72, 4.06, 3.61, 4...
$ metro <fct> yes, no, no, no, yes, no, no, yes, no, yes, yes, ...
$ median_edu <fct> hs_diploma, some_college, some_college, hs_diplom...
$ per_capita_income <dbl> 24463.06, 21777.60, 38659.76, 26415.97, 30891.81,...
$ median_hh_income <int> 40574, 39577, 77328, 50733, 74368, 59516, 63333, ...
$ smoking_ban <fct> partial, partial, NA, partial, none, NA, partial,...
> #State distribution of SRS counties
> county_srs %>%
+ group_by(state) %>%
+ count()
# A tibble: 41 x 2
# Groups: state [41]
state n
<fct> <int>
1 Alabama 1
2 Arizona 1
3 Arkansas 3
4 California 2
5 Florida 3
6 Georgia 10
7 Idaho 2
8 Illinois 4
9 Indiana 5
10 Iowa 5
# ... with 31 more rows
> # Stratified sample of 150 counties, each state is a stratum
> county_str <- county_noDC %>%
+ group_by(state) %>%
+ sample_n(size=3)
Rows: 150
Columns: 15
Groups: state [50]
$ name <chr> "Jackson County", "Coosa County", "Jefferson Coun...
$ state <fct> Alabama, Alabama, Alabama, Alaska, Alaska, Alaska...
$ pop2000 <dbl> 53926, 12202, 662047, 10195, 2392, 7385, 843746, ...
$ pop2010 <dbl> 53227, 11539, 658466, 9636, 2508, 9430, 980263, 1...
$ pop2017 <int> 51909, 10754, 659197, 9278, 2526, 9782, 1022769, ...
$ pop_change <dbl> -1.92, -4.43, 0.07, -5.06, -1.29, 0.06, 2.82, 3.1...
$ poverty <dbl> 19.0, 14.4, 17.6, 7.4, 8.4, 10.2, 18.3, 21.0, 15....
$ homeownership <dbl> 76.6, 83.7, 66.8, 71.8, 74.5, 48.3, 64.6, 61.2, 6...
$ multi_unit <dbl> 5.8, 1.9, 24.0, 17.6, 13.2, 24.9, 22.9, 18.9, 25....
$ unemployment_rate <dbl> 4.77, 4.62, 4.24, 7.96, 9.09, 7.44, 4.48, 5.55, 4...
$ metro <fct> no, no, yes, no, no, no, yes, yes, yes, no, yes, ...
$ median_edu <fct> hs_diploma, hs_diploma, some_college, some_colleg...
$ per_capita_income <dbl> 20061.42, 19147.01, 29259.76, 30746.27, 37141.41,...
$ median_hh_income <int> 39281, 34792, 49321, 86019, 70640, 77266, 48676, ...
$ smoking_ban <fct> NA, none, none, none, none, none, NA, none, parti...
Case Study
# A tibble: 6 x 23
course_id prof_id score rank ethnicity gender language age cls_perc_eval
<int> <int> <dbl> <fct> <fct> <fct> <fct> <int> <dbl>
1 1 1 4.7 tenu~ minority female english 36 55.8
2 2 1 4.1 tenu~ minority female english 36 68.8
3 3 1 3.9 tenu~ minority female english 36 60.8
4 4 1 4.8 tenu~ minority female english 36 62.6
5 5 2 4.6 tenu~ not mino~ male english 59 85
6 6 2 4.3 tenu~ not mino~ male english 59 87.5
# ... with 14 more variables: cls_did_eval <int>, cls_students <int>,
# cls_level <fct>, cls_profs <fct>, cls_credits <fct>, bty_f1lower <int>,
# bty_f1upper <int>, bty_f2upper <int>, bty_m1lower <int>, bty_m1upper <int>,
# bty_m2upper <int>, bty_avg <dbl>, pic_outfit <fct>, pic_color <fct>
> # Inspect variable types
> glimpse(evals)
Rows: 463
Columns: 23
$ course_id <int> 1, 2, 3, 4, 5, 6, 7, 8, 9, 10, 11, 12, 13, 14, 15, 16...
$ prof_id <int> 1, 1, 1, 1, 2, 2, 2, 3, 3, 4, 4, 4, 4, 4, 4, 4, 4, 5,...
$ score <dbl> 4.7, 4.1, 3.9, 4.8, 4.6, 4.3, 2.8, 4.1, 3.4, 4.5, 3.8...
$ rank <fct> tenure track, tenure track, tenure track, tenure trac...
$ ethnicity <fct> minority, minority, minority, minority, not minority,...
$ gender <fct> female, female, female, female, male, male, male, mal...
$ language <fct> english, english, english, english, english, english,...
$ age <int> 36, 36, 36, 36, 59, 59, 59, 51, 51, 40, 40, 40, 40, 4...
$ cls_perc_eval <dbl> 55.81395, 68.80000, 60.80000, 62.60163, 85.00000, 87....
$ cls_did_eval <int> 24, 86, 76, 77, 17, 35, 39, 55, 111, 40, 24, 24, 17, ...
$ cls_students <int> 43, 125, 125, 123, 20, 40, 44, 55, 195, 46, 27, 25, 2...
$ cls_level <fct> upper, upper, upper, upper, upper, upper, upper, uppe...
$ cls_profs <fct> single, single, single, single, multiple, multiple, m...
$ cls_credits <fct> multi credit, multi credit, multi credit, multi credi...
$ bty_f1lower <int> 5, 5, 5, 5, 4, 4, 4, 5, 5, 2, 2, 2, 2, 2, 2, 2, 2, 7,...
$ bty_f1upper <int> 7, 7, 7, 7, 4, 4, 4, 2, 2, 5, 5, 5, 5, 5, 5, 5, 5, 9,...
$ bty_f2upper <int> 6, 6, 6, 6, 2, 2, 2, 5, 5, 4, 4, 4, 4, 4, 4, 4, 4, 9,...
$ bty_m1lower <int> 2, 2, 2, 2, 2, 2, 2, 2, 2, 3, 3, 3, 3, 3, 3, 3, 3, 7,...
$ bty_m1upper <int> 4, 4, 4, 4, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 3, 6,...
$ bty_m2upper <int> 6, 6, 6, 6, 3, 3, 3, 3, 3, 2, 2, 2, 2, 2, 2, 2, 2, 6,...
$ bty_avg <dbl> 5.000, 5.000, 5.000, 5.000, 3.000, 3.000, 3.000, 3.33...
$ pic_outfit <fct> not formal, not formal, not formal, not formal, not f...
$ pic_color <fct> color, color, color, color, color, color, color, colo...
> # Remove non-factor variables from the vector below
> cat_vars <- c("rank", "ethnicity", "gender", "language",
+ "cls_level", "cls_profs", "cls_credits",
+ "pic_outfit", "pic_color")
>
> # Recode cls_students as cls_type
> evals_fortified <- evals %>%
+ mutate(
+ cls_type = case_when(
+ cls_students <=18 ~ "small",
+ cls_students <=59 ~ "midsize",
+ cls_students >59 ~ "large"
+ )
+ )
> # Scatterplot of score vs. bty_avg
> ggplot(evals, aes(x=bty_avg, y=score)) +
+ geom_point()

> # Scatterplot of score vs. bty_avg colored by cls_type
> ggplot(evals_fortified, aes(x=bty_avg, y=score, color=cls_type)) +
+ geom_point()
