library(tidyverse)
## ── Attaching packages ─────────────────────────────────────── tidyverse 1.3.1 ──
## ✓ ggplot2 3.3.5 ✓ purrr 0.3.4
## ✓ tibble 3.1.6 ✓ dplyr 1.0.7
## ✓ tidyr 1.2.0 ✓ stringr 1.4.0
## ✓ readr 2.1.2 ✓ forcats 0.5.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
# Toy datasets to use
pollution <- tribble(
~city, ~size, ~amount,
"New York", "large", 23,
"New York", "small", 14,
"London", "large", 22,
"London", "small", 16,
"Beijing", "large", 121,
"Beijing", "small", 56
)
#library(babynames)
Set the working directory to 03-Transform
. Then import the babynames.csv
data set, which is in the 03-Transform
folder, as babynames
. Then copy the import code into the code chunk below. Does it run?
library(babynames)
library(readr)
write_csv(babynames, path = "babynames.csv")
## Warning: The `path` argument of `write_csv()` is deprecated as of readr 1.4.0.
## Please use the `file` argument instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was generated.
babynames
## # A tibble: 1,924,665 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1880 F Mary 7065 0.0724
## 2 1880 F Anna 2604 0.0267
## 3 1880 F Emma 2003 0.0205
## 4 1880 F Elizabeth 1939 0.0199
## 5 1880 F Minnie 1746 0.0179
## 6 1880 F Margaret 1578 0.0162
## 7 1880 F Ida 1472 0.0151
## 8 1880 F Alice 1414 0.0145
## 9 1880 F Bertha 1320 0.0135
## 10 1880 F Sarah 1288 0.0132
## # … with 1,924,655 more rows
Alter the code to select just the n
column:
select(babynames, n)
## # A tibble: 1,924,665 × 1
## n
## <int>
## 1 7065
## 2 2604
## 3 2003
## 4 1939
## 5 1746
## 6 1578
## 7 1472
## 8 1414
## 9 1320
## 10 1288
## # … with 1,924,655 more rows
Which of these is NOT a way to select the name
and n
columns together?
select(babynames, -c(year, sex, prop))
## # A tibble: 1,924,665 × 2
## name n
## <chr> <int>
## 1 Mary 7065
## 2 Anna 2604
## 3 Emma 2003
## 4 Elizabeth 1939
## 5 Minnie 1746
## 6 Margaret 1578
## 7 Ida 1472
## 8 Alice 1414
## 9 Bertha 1320
## 10 Sarah 1288
## # … with 1,924,655 more rows
select(babynames, name:n)
## # A tibble: 1,924,665 × 2
## name n
## <chr> <int>
## 1 Mary 7065
## 2 Anna 2604
## 3 Emma 2003
## 4 Elizabeth 1939
## 5 Minnie 1746
## 6 Margaret 1578
## 7 Ida 1472
## 8 Alice 1414
## 9 Bertha 1320
## 10 Sarah 1288
## # … with 1,924,655 more rows
select(babynames, starts_with("n"))
## # A tibble: 1,924,665 × 2
## name n
## <chr> <int>
## 1 Mary 7065
## 2 Anna 2604
## 3 Emma 2003
## 4 Elizabeth 1939
## 5 Minnie 1746
## 6 Margaret 1578
## 7 Ida 1472
## 8 Alice 1414
## 9 Bertha 1320
## 10 Sarah 1288
## # … with 1,924,655 more rows
select(babynames, ends_with("n")) #This
## # A tibble: 1,924,665 × 1
## n
## <int>
## 1 7065
## 2 2604
## 3 2003
## 4 1939
## 5 1746
## 6 1578
## 7 1472
## 8 1414
## 9 1320
## 10 1288
## # … with 1,924,655 more rows
Use filter
, babynames
, and the logical operators to find:
filter(babynames, prop >= 0.08)
## # A tibble: 3 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1880 M John 9655 0.0815
## 2 1880 M William 9532 0.0805
## 3 1881 M John 8769 0.0810
filter(babynames, name == "Sea")
## # A tibble: 4 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1982 F Sea 5 0.00000276
## 2 1985 M Sea 6 0.00000312
## 3 1986 M Sea 5 0.0000026
## 4 1998 F Sea 5 0.00000258
Use Boolean operators to return only the rows that contain:
filter(babynames, name == "Sue", sex == "M")
## # A tibble: 52 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1917 M Sue 7 0.0000073
## 2 1927 M Sue 5 0.0000043
## 3 1928 M Sue 5 0.00000438
## 4 1930 M Sue 5 0.00000443
## 5 1931 M Sue 6 0.00000561
## 6 1932 M Sue 7 0.00000652
## 7 1933 M Sue 7 0.00000686
## 8 1934 M Sue 14 0.0000132
## 9 1935 M Sue 13 0.0000122
## 10 1936 M Sue 9 0.00000846
## # … with 42 more rows
filter(babynames, (n == 5 | n == 6) & year == 1880)
## # A tibble: 455 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1880 F Abby 6 0.0000615
## 2 1880 F Aileen 6 0.0000615
## 3 1880 F Alba 6 0.0000615
## 4 1880 F Alda 6 0.0000615
## 5 1880 F Alla 6 0.0000615
## 6 1880 F Alverta 6 0.0000615
## 7 1880 F Ara 6 0.0000615
## 8 1880 F Ardelia 6 0.0000615
## 9 1880 F Ardella 6 0.0000615
## 10 1880 F Arrie 6 0.0000615
## # … with 445 more rows
filter(babynames, name == "Acura" | name =="Lexus" |name =="Yugo")
## # A tibble: 57 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1990 F Lexus 36 0.0000175
## 2 1990 M Lexus 12 0.00000558
## 3 1991 F Lexus 102 0.0000502
## 4 1991 M Lexus 16 0.00000755
## 5 1992 F Lexus 193 0.0000963
## 6 1992 M Lexus 25 0.0000119
## 7 1993 F Lexus 285 0.000145
## 8 1993 M Lexus 30 0.0000145
## 9 1994 F Lexus 381 0.000195
## 10 1994 F Acura 6 0.00000308
## # … with 47 more rows
What is the smallest value of n? What is the largest?
arrange(babynames, n, prop) #5
## # A tibble: 1,924,665 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 2007 M Aaban 5 0.00000226
## 2 2007 M Aareon 5 0.00000226
## 3 2007 M Aaris 5 0.00000226
## 4 2007 M Abd 5 0.00000226
## 5 2007 M Abdulazeez 5 0.00000226
## 6 2007 M Abdulhadi 5 0.00000226
## 7 2007 M Abdulhamid 5 0.00000226
## 8 2007 M Abdulkadir 5 0.00000226
## 9 2007 M Abdulraheem 5 0.00000226
## 10 2007 M Abdulrahim 5 0.00000226
## # … with 1,924,655 more rows
arrange(babynames, desc(n)) #99686
## # A tibble: 1,924,665 × 5
## year sex name n prop
## <dbl> <chr> <chr> <int> <dbl>
## 1 1947 F Linda 99686 0.0548
## 2 1948 F Linda 96209 0.0552
## 3 1947 M James 94756 0.0510
## 4 1957 M Michael 92695 0.0424
## 5 1947 M Robert 91642 0.0493
## 6 1949 F Linda 91016 0.0518
## 7 1956 M Michael 90620 0.0423
## 8 1958 M Michael 90520 0.0420
## 9 1948 M James 88588 0.0497
## 10 1954 M Michael 88514 0.0428
## # … with 1,924,655 more rows
Use %>%
to write a sequence of functions that:
name
and n
columns, then…babynames %>%
filter(year == 2017, sex == "F") %>%
select(name, n) %>%
arrange(desc(n))
## # A tibble: 18,309 × 2
## name n
## <chr> <int>
## 1 Emma 19738
## 2 Olivia 18632
## 3 Ava 15902
## 4 Isabella 15100
## 5 Sophia 14831
## 6 Mia 13437
## 7 Charlotte 12893
## 8 Amelia 11800
## 9 Evelyn 10675
## 10 Abigail 10551
## # … with 18,299 more rows
babynames
to just the rows that contain your name
and your sex
year
on the x axis and prop
on the y axisbabynames %>%
filter(name == "Garrett", sex == "M") %>%
select(year, prop) %>%
ggplot() +
geom_line(mapping = aes(year, prop))
Copmplete the code below to extract the rows where name == "Khaleesi"
. Then use summarise()
and sum()
and min()
to find:
(Hint: Be sure to remove each _
before running the code)
babynames %>%
filter(name == "Khaleesi") %>%
summarise(total = sum(n), first = min(year))
## # A tibble: 1 × 2
## total first
## <int> <dbl>
## 1 1964 2011
Use group_by()
, summarise()
, and arrange()
to display the ten most popular names. Compute popularity as the total number of children of a single gender given a name.
(Hint: Be sure to remove each _
before running the code)
babynames %>%
group_by(name, sex) %>%
summarise(total = sum(n)) %>%
arrange(desc(total))
## `summarise()` has grouped output by 'name'. You can override using the `.groups` argument.
## # A tibble: 107,973 × 3
## # Groups: name [97,310]
## name sex total
## <chr> <chr> <int>
## 1 James M 5150472
## 2 John M 5115466
## 3 Robert M 4814815
## 4 Michael M 4350824
## 5 Mary F 4123200
## 6 William M 4102604
## 7 David M 3611329
## 8 Joseph M 2603445
## 9 Richard M 2563082
## 10 Charles M 2386048
## # … with 107,963 more rows
Use group_by()
to calculate and then plot the total number of children born each year over time.
babynames %>%
group_by(year) %>%
summarise(n_children = sum(n)) %>%
ggplot() +
geom_line(mapping = aes(x = year, y = n_children))
Use mutate()
and min_rank()
to rank each row in babynames from largest n to lowest n.
(Hint: Be sure to remove each _
before running the code)
babynames %>%
mutate(rank = min_rank(desc(prop)))
## # A tibble: 1,924,665 × 6
## year sex name n prop rank
## <dbl> <chr> <chr> <int> <dbl> <int>
## 1 1880 F Mary 7065 0.0724 14
## 2 1880 F Anna 2604 0.0267 709
## 3 1880 F Emma 2003 0.0205 1131
## 4 1880 F Elizabeth 1939 0.0199 1192
## 5 1880 F Minnie 1746 0.0179 1427
## 6 1880 F Margaret 1578 0.0162 1683
## 7 1880 F Ida 1472 0.0151 1897
## 8 1880 F Alice 1414 0.0145 2039
## 9 1880 F Bertha 1320 0.0135 2279
## 10 1880 F Sarah 1288 0.0132 2387
## # … with 1,924,655 more rows
Group babynames by year and then re-rank the data. Filter the results to just rows where rank == 1.
babynames %>%
group_by(year) %>%
mutate(rank = min_rank(desc(prop))) %>%
filter(rank == 1)
## # A tibble: 138 × 6
## # Groups: year [138]
## year sex name n prop rank
## <dbl> <chr> <chr> <int> <dbl> <int>
## 1 1880 M John 9655 0.0815 1
## 2 1881 M John 8769 0.0810 1
## 3 1882 M John 9557 0.0783 1
## 4 1883 M John 8894 0.0791 1
## 5 1884 M John 9388 0.0765 1
## 6 1885 M John 8756 0.0755 1
## 7 1886 M John 9026 0.0758 1
## 8 1887 M John 8110 0.0742 1
## 9 1888 M John 9247 0.0712 1
## 10 1889 M John 8548 0.0718 1
## # … with 128 more rows
Extract variables with select()
Extract cases with filter()
Arrange cases, with arrange()
Make tables of summaries with summarise()
Make new variables, with mutate()
Do groupwise operations with group_by()
Connect operations with %>%