library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(babynames)
14.3.4
paste() concatenates its arguments with a space in between, while paste0() concatenates them with no space in between. The stringr equivalent of paste0() is str_c(), whose separator is the empty string (sep = "", which is also its default), so there is no gap between the pieces.
14.5.3
# Extract the middle letter of every name. Names with an even number of
# characters have no single middle letter, so they get NA instead.
babynames |>
  mutate(
    name_length = str_length(name),
    # Integer division gives the 1-based middle position directly.
    middle_letter = str_sub(
      name,
      name_length %/% 2L + 1L,
      name_length %/% 2L + 1L
    ),
    middle_letter = replace(middle_letter, name_length %% 2 == 0, NA)
  )
## # A tibble: 1,924,665 × 7
## year sex name n prop name_length middle_letter
## <dbl> <chr> <chr> <int> <dbl> <int> <chr>
## 1 1880 F Mary 7065 0.0724 4 <NA>
## 2 1880 F Anna 2604 0.0267 4 <NA>
## 3 1880 F Emma 2003 0.0205 4 <NA>
## 4 1880 F Elizabeth 1939 0.0199 9 a
## 5 1880 F Minnie 1746 0.0179 6 <NA>
## 6 1880 F Margaret 1578 0.0162 8 <NA>
## 7 1880 F Ida 1472 0.0151 3 d
## 8 1880 F Alice 1414 0.0145 5 i
## 9 1880 F Bertha 1320 0.0135 6 <NA>
## 10 1880 F Sarah 1288 0.0132 5 r
## # ℹ 1,924,655 more rows
# Average name length per year, taken over the rows of babynames
# (each row counts once; the mean is not weighted by n).
babynames_length_by_year <- babynames |>
  group_by(year) |>
  summarize(mean_length = mean(str_length(name)))
Over time, the average name length ends up getting longer.
# Most common first letter per year, measured by the total number of babies
# (sum of n) whose name starts with that letter.
first_letter_popularity <- babynames |>
  mutate(first_letter = str_sub(name, 1, 1)) |>
  group_by(year, first_letter) |>
  summarize(total = sum(n), .groups = "drop") |>
  # slice_max() finds the per-year maximum itself, so no prior arrange() is
  # needed. The result stays grouped by year.
  group_by(year) |>
  slice_max(total, n = 1)
# Most common last letter per year, measured by the total number of babies
# (sum of n) whose name ends with that letter.
last_letter_popularity <- babynames |>
  # Negative positions in str_sub() count from the end of the string.
  mutate(last_letter = str_sub(name, -1, -1)) |>
  group_by(year, last_letter) |>
  summarize(total = sum(n), .groups = "drop") |>
  # slice_max() finds the per-year maximum itself, so no prior arrange() is
  # needed. The result stays grouped by year.
  group_by(year) |>
  slice_max(total, n = 1)
print(last_letter_popularity)
## # A tibble: 138 × 3
## # Groups: year [138]
## year last_letter total
## <dbl> <chr> <int>
## 1 1880 e 46873
## 2 1881 e 46486
## 3 1882 e 55012
## 4 1883 e 55104
## 5 1884 e 62725
## 6 1885 e 63664
## 7 1886 e 68358
## 8 1887 e 67564
## 9 1888 e 82490
## 10 1889 e 81049
## # ℹ 128 more rows
# Show the top first letter (by total babies) for each year.
print(first_letter_popularity)
## # A tibble: 138 × 3
## # Groups: year [138]
## year first_letter total
## <dbl> <chr> <int>
## 1 1880 J 26072
## 2 1881 J 24125
## 3 1882 J 26910
## 4 1883 M 26589
## 5 1884 M 30269
## 6 1885 M 30548
## 7 1886 M 32621
## 8 1887 M 32488
## 9 1888 M 39299
## 10 1889 M 38995
## # ℹ 128 more rows
19.2.4
library(nycflights13)
library(tibble)
# Lookup table of holidays, keyed by calendar month and day.
# month/day are written as doubles, so after the join the flights' month and
# day columns are doubles as well.
holidays <- tibble(
  month = c(12, 12, 1, 7, 11),
  day = c(24, 25, 1, 4, 28),
  holiday_name = c(
    "Christmas Eve", "Christmas Day", "New Year's Day",
    "Independence Day", "Thanksgiving"
  )
)

# Attach the holiday name to every flight (NA when the date is not a listed
# holiday), then put holiday flights first, in calendar order.
# One arrange() is enough: a preliminary sort by holiday_name would be
# superseded by this ordering, since each date maps to a single holiday.
holiday_flights <- flights |>
  left_join(holidays, by = c("month", "day")) |>
  arrange(is.na(holiday_name), month, day)
print(holiday_flights)
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <dbl> <dbl> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, holiday_name <chr>
19.3.4
# Copy of flights with a sequential row identifier as the first column.
flights2 <- mutate(
  flights,
  id = row_number(),
  .before = 1
)
flights2
## # A tibble: 336,776 × 20
## id year month day dep_time sched_dep_time dep_delay arr_time
## <int> <int> <int> <int> <int> <int> <dbl> <int>
## 1 1 2013 1 1 517 515 2 830
## 2 2 2013 1 1 533 529 4 850
## 3 3 2013 1 1 542 540 2 923
## 4 4 2013 1 1 544 545 -1 1004
## 5 5 2013 1 1 554 600 -6 812
## 6 6 2013 1 1 554 558 -4 740
## 7 7 2013 1 1 555 600 -5 913
## 8 8 2013 1 1 557 600 -3 709
## 9 9 2013 1 1 557 600 -3 838
## 10 10 2013 1 1 558 600 -2 753
## # ℹ 336,766 more rows
## # ℹ 12 more variables: sched_arr_time <int>, arr_delay <dbl>, carrier <chr>,
## # flight <int>, tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>,
## # distance <dbl>, hour <dbl>, minute <dbl>, time_hour <dttm>
# The ten destinations with the most flights, busiest first.
top_dest <- count(flights2, dest, sort = TRUE) |>
  slice_head(n = 10)
top_dest
## # A tibble: 10 × 2
## dest n
## <chr> <int>
## 1 ORD 17283
## 2 ATL 17215
## 3 LAX 16174
## 4 BOS 15508
## 5 MCO 14082
## 6 CLT 14064
## 7 SFO 13331
## 8 FLL 12055
## 9 MIA 11728
## 10 DCA 9705
# Keep only the flights whose destination is one of the top destinations.
# A semi-join filters flights by matching dest without adding any columns.
flights_to_top_dest <- semi_join(flights, top_dest, by = "dest")
flights_to_top_dest
## # A tibble: 141,145 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 554 558 -4 740 728
## 4 2013 1 1 555 600 -5 913 854
## 5 2013 1 1 557 600 -3 838 846
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 559 0 702 706
## 10 2013 1 1 600 600 0 851 858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>