library(tidyverse)

## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──

## ✓ ggplot2 3.3.5     ✓ purrr   0.3.4
## ✓ tibble  3.1.6     ✓ dplyr   1.0.7
## ✓ tidyr   1.2.0     ✓ stringr 1.4.0
## ✓ readr   2.1.2     ✓ forcats 0.5.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()

# Toy datasets to use

pollution <- tribble(
       ~city,   ~size, ~amount, 
  "New York", "large",      23,
  "New York", "small",      14,
    "London", "large",      22,
    "London", "small",      16,
   "Beijing", "large",      121,
   "Beijing", "small",      56
)

#library(babynames)

Import Data

Your Turn 1

Set the working directory to 03-Transform. Then import the babynames.csv data set, which is in the 03-Transform folder, as babynames. Then copy the import code into the code chunk below. Does it run?

library(babynames)
library(readr)
write_csv(babynames, path = "babynames.csv")

## Warning: The `path` argument of `write_csv()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.

babynames

## # A tibble: 1,924,665 × 5
##     year sex   name          n   prop
##    <dbl> <chr> <chr>     <int>  <dbl>
##  1  1880 F     Mary       7065 0.0724
##  2  1880 F     Anna       2604 0.0267
##  3  1880 F     Emma       2003 0.0205
##  4  1880 F     Elizabeth  1939 0.0199
##  5  1880 F     Minnie     1746 0.0179
##  6  1880 F     Margaret   1578 0.0162
##  7  1880 F     Ida        1472 0.0151
##  8  1880 F     Alice      1414 0.0145
##  9  1880 F     Bertha     1320 0.0135
## 10  1880 F     Sarah      1288 0.0132
## # … with 1,924,655 more rows

dplyr

Your Turn 2

Alter the code to select just the n column:

select(babynames, n)

## # A tibble: 1,924,665 × 1
##        n
##    <int>
##  1  7065
##  2  2604
##  3  2003
##  4  1939
##  5  1746
##  6  1578
##  7  1472
##  8  1414
##  9  1320
## 10  1288
## # … with 1,924,655 more rows

Quiz

Which of these is NOT a way to select the name and n columns together?

select(babynames, -c(year, sex, prop))

## # A tibble: 1,924,665 × 2
##    name          n
##    <chr>     <int>
##  1 Mary       7065
##  2 Anna       2604
##  3 Emma       2003
##  4 Elizabeth  1939
##  5 Minnie     1746
##  6 Margaret   1578
##  7 Ida        1472
##  8 Alice      1414
##  9 Bertha     1320
## 10 Sarah      1288
## # … with 1,924,655 more rows

select(babynames, name:n)

## # A tibble: 1,924,665 × 2
##    name          n
##    <chr>     <int>
##  1 Mary       7065
##  2 Anna       2604
##  3 Emma       2003
##  4 Elizabeth  1939
##  5 Minnie     1746
##  6 Margaret   1578
##  7 Ida        1472
##  8 Alice      1414
##  9 Bertha     1320
## 10 Sarah      1288
## # … with 1,924,655 more rows

select(babynames, starts_with("n"))

## # A tibble: 1,924,665 × 2
##    name          n
##    <chr>     <int>
##  1 Mary       7065
##  2 Anna       2604
##  3 Emma       2003
##  4 Elizabeth  1939
##  5 Minnie     1746
##  6 Margaret   1578
##  7 Ida        1472
##  8 Alice      1414
##  9 Bertha     1320
## 10 Sarah      1288
## # … with 1,924,655 more rows

select(babynames, ends_with("n")) #This

## # A tibble: 1,924,665 × 1
##        n
##    <int>
##  1  7065
##  2  2604
##  3  2003
##  4  1939
##  5  1746
##  6  1578
##  7  1472
##  8  1414
##  9  1320
## 10  1288
## # … with 1,924,655 more rows

Your Turn 3

Use filter, babynames, and the logical operators to find:

All of the names where prop is greater than or equal to 0.08
All of the children named “Sea”

filter(babynames, prop >= 0.08)

## # A tibble: 3 × 5
##    year sex   name        n   prop
##   <dbl> <chr> <chr>   <int>  <dbl>
## 1  1880 M     John     9655 0.0815
## 2  1880 M     William  9532 0.0805
## 3  1881 M     John     8769 0.0810

filter(babynames, name == "Sea")

## # A tibble: 4 × 5
##    year sex   name      n       prop
##   <dbl> <chr> <chr> <int>      <dbl>
## 1  1982 F     Sea       5 0.00000276
## 2  1985 M     Sea       6 0.00000312
## 3  1986 M     Sea       5 0.0000026 
## 4  1998 F     Sea       5 0.00000258

Your Turn 4

Use Boolean operators to return only the rows that contain:

Boys named Sue
Names that were used by exactly 5 or 6 children in 1880
Names that are one of Acura, Lexus, or Yugo

filter(babynames, name == "Sue", sex == "M")

## # A tibble: 52 × 5
##     year sex   name      n       prop
##    <dbl> <chr> <chr> <int>      <dbl>
##  1  1917 M     Sue       7 0.0000073 
##  2  1927 M     Sue       5 0.0000043 
##  3  1928 M     Sue       5 0.00000438
##  4  1930 M     Sue       5 0.00000443
##  5  1931 M     Sue       6 0.00000561
##  6  1932 M     Sue       7 0.00000652
##  7  1933 M     Sue       7 0.00000686
##  8  1934 M     Sue      14 0.0000132 
##  9  1935 M     Sue      13 0.0000122 
## 10  1936 M     Sue       9 0.00000846
## # … with 42 more rows

filter(babynames, (n == 5 | n == 6) & year == 1880)

## # A tibble: 455 × 5
##     year sex   name        n      prop
##    <dbl> <chr> <chr>   <int>     <dbl>
##  1  1880 F     Abby        6 0.0000615
##  2  1880 F     Aileen      6 0.0000615
##  3  1880 F     Alba        6 0.0000615
##  4  1880 F     Alda        6 0.0000615
##  5  1880 F     Alla        6 0.0000615
##  6  1880 F     Alverta     6 0.0000615
##  7  1880 F     Ara         6 0.0000615
##  8  1880 F     Ardelia     6 0.0000615
##  9  1880 F     Ardella     6 0.0000615
## 10  1880 F     Arrie       6 0.0000615
## # … with 445 more rows

filter(babynames, name == "Acura" | name =="Lexus" |name =="Yugo")

## # A tibble: 57 × 5
##     year sex   name      n       prop
##    <dbl> <chr> <chr> <int>      <dbl>
##  1  1990 F     Lexus    36 0.0000175 
##  2  1990 M     Lexus    12 0.00000558
##  3  1991 F     Lexus   102 0.0000502 
##  4  1991 M     Lexus    16 0.00000755
##  5  1992 F     Lexus   193 0.0000963 
##  6  1992 M     Lexus    25 0.0000119 
##  7  1993 F     Lexus   285 0.000145  
##  8  1993 M     Lexus    30 0.0000145 
##  9  1994 F     Lexus   381 0.000195  
## 10  1994 F     Acura     6 0.00000308
## # … with 47 more rows

Help Me

What is the smallest value of n? What is the largest?

arrange(babynames, n, prop) #5

## # A tibble: 1,924,665 × 5
##     year sex   name            n       prop
##    <dbl> <chr> <chr>       <int>      <dbl>
##  1  2007 M     Aaban           5 0.00000226
##  2  2007 M     Aareon          5 0.00000226
##  3  2007 M     Aaris           5 0.00000226
##  4  2007 M     Abd             5 0.00000226
##  5  2007 M     Abdulazeez      5 0.00000226
##  6  2007 M     Abdulhadi       5 0.00000226
##  7  2007 M     Abdulhamid      5 0.00000226
##  8  2007 M     Abdulkadir      5 0.00000226
##  9  2007 M     Abdulraheem     5 0.00000226
## 10  2007 M     Abdulrahim      5 0.00000226
## # … with 1,924,655 more rows

arrange(babynames, desc(n)) #99686

## # A tibble: 1,924,665 × 5
##     year sex   name        n   prop
##    <dbl> <chr> <chr>   <int>  <dbl>
##  1  1947 F     Linda   99686 0.0548
##  2  1948 F     Linda   96209 0.0552
##  3  1947 M     James   94756 0.0510
##  4  1957 M     Michael 92695 0.0424
##  5  1947 M     Robert  91642 0.0493
##  6  1949 F     Linda   91016 0.0518
##  7  1956 M     Michael 90620 0.0423
##  8  1958 M     Michael 90520 0.0420
##  9  1948 M     James   88588 0.0497
## 10  1954 M     Michael 88514 0.0428
## # … with 1,924,655 more rows

Your Turn 5

Use %>% to write a sequence of functions that:

Filters babynames to just the girls that were born in 2017, then…
Selects the name and n columns, then…
Arranges the results so that the most popular names are near the top.

babynames %>%

filter(year == 2017, sex == "F") %>%

select(name, n) %>%

arrange(desc(n))

## # A tibble: 18,309 × 2
##    name          n
##    <chr>     <int>
##  1 Emma      19738
##  2 Olivia    18632
##  3 Ava       15902
##  4 Isabella  15100
##  5 Sophia    14831
##  6 Mia       13437
##  7 Charlotte 12893
##  8 Amelia    11800
##  9 Evelyn    10675
## 10 Abigail   10551
## # … with 18,299 more rows

Your Turn 6 - Exam

Trim babynames to just the rows that contain your name and your sex
Trim the result to just the columns that will appear in your graph (not strictly necessary, but useful practice)
Plot the results as a line graph with year on the x axis and prop on the y axis

babynames %>%

filter(name == "Garrett", sex == "M") %>%

select(year, prop) %>%

ggplot() +

geom_line(mapping = aes(year, prop))

Your Turn 7

Copmplete the code below to extract the rows where name == "Khaleesi". Then use summarise() and sum() and min() to find:

The total number of children named Khaleesi
The first year Khaleesi appeared in the data

(Hint: Be sure to remove each _ before running the code)

babynames %>%

filter(name == "Khaleesi") %>%

summarise(total = sum(n), first = min(year))

## # A tibble: 1 × 2
##   total first
##   <int> <dbl>
## 1  1964  2011

Your Turn 8

Use group_by(), summarise(), and arrange() to display the ten most popular names. Compute popularity as the total number of children of a single gender given a name.

(Hint: Be sure to remove each _ before running the code)

babynames %>%

group_by(name, sex) %>%

summarise(total = sum(n)) %>%

arrange(desc(total))

## `summarise()` has grouped output by 'name'. You can override using the `.groups` argument.

## # A tibble: 107,973 × 3
## # Groups:   name [97,310]
##    name    sex     total
##    <chr>   <chr>   <int>
##  1 James   M     5150472
##  2 John    M     5115466
##  3 Robert  M     4814815
##  4 Michael M     4350824
##  5 Mary    F     4123200
##  6 William M     4102604
##  7 David   M     3611329
##  8 Joseph  M     2603445
##  9 Richard M     2563082
## 10 Charles M     2386048
## # … with 107,963 more rows

Your Turn 9

Use group_by() to calculate and then plot the total number of children born each year over time.

babynames %>%

group_by(year) %>%

summarise(n_children = sum(n)) %>%

ggplot() +

geom_line(mapping = aes(x = year, y = n_children))

Your Turn 10

Use mutate() and min_rank()to rank each row in babynames from largest n to lowest n.

(Hint: Be sure to remove each _ before running the code)

babynames %>%

mutate(rank = min_rank(desc(prop)))

## # A tibble: 1,924,665 × 6
##     year sex   name          n   prop  rank
##    <dbl> <chr> <chr>     <int>  <dbl> <int>
##  1  1880 F     Mary       7065 0.0724    14
##  2  1880 F     Anna       2604 0.0267   709
##  3  1880 F     Emma       2003 0.0205  1131
##  4  1880 F     Elizabeth  1939 0.0199  1192
##  5  1880 F     Minnie     1746 0.0179  1427
##  6  1880 F     Margaret   1578 0.0162  1683
##  7  1880 F     Ida        1472 0.0151  1897
##  8  1880 F     Alice      1414 0.0145  2039
##  9  1880 F     Bertha     1320 0.0135  2279
## 10  1880 F     Sarah      1288 0.0132  2387
## # … with 1,924,655 more rows

Your Turn 11

Group babynames by year and then re-rank the data. Filter the results to just rows where rank == 1.

babynames %>%

group_by(year) %>%

mutate(rank = min_rank(desc(prop))) %>%

filter(rank == 1)

## # A tibble: 138 × 6
## # Groups:   year [138]
##     year sex   name      n   prop  rank
##    <dbl> <chr> <chr> <int>  <dbl> <int>
##  1  1880 M     John   9655 0.0815     1
##  2  1881 M     John   8769 0.0810     1
##  3  1882 M     John   9557 0.0783     1
##  4  1883 M     John   8894 0.0791     1
##  5  1884 M     John   9388 0.0765     1
##  6  1885 M     John   8756 0.0755     1
##  7  1886 M     John   9026 0.0758     1
##  8  1887 M     John   8110 0.0742     1
##  9  1888 M     John   9247 0.0712     1
## 10  1889 M     John   8548 0.0718     1
## # … with 128 more rows

Take aways

Extract variables with select()
Extract cases with filter()
Arrange cases, with arrange()
Make tables of summaries with summarise()
Make new variables, with mutate()
Do groupwise operations with group_by()
Connect operations with %>%

Transform Data

Elanur Ural

2/13/2022

Import Data

Your Turn 1

dplyr

Your Turn 2

Quiz

Your Turn 3

Your Turn 4

Help Me

Your Turn 5

Your Turn 6 - Exam

Your Turn 7

Your Turn 8

Your Turn 9

Your Turn 10

Your Turn 11

Take aways