Libraries needed:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(babynames)
library(nycflights13)
paste() and paste0() differ in how they separate the elements they join: paste() inserts a separator between them (a single space by default, which you can change with the sep argument), while paste0() joins them with no separator at all.
# paste() joins its arguments with a single space unless sep says otherwise.
paste("My", "name", "is", "Dalia")
## [1] "My name is Dalia"
# paste0() joins the same pieces with nothing in between.
paste0("My", "name", "is", "Dalia")
## [1] "MynameisDalia"
To recreate the equivalent of paste() with str_c(), you need to specify the separator explicitly, because str_c() uses no separator by default. To recreate paste0() instead, str_c() works as is, or equivalently str_c(..., sep = "").
# paste() separates the pieces with a single space by default ...
paste("My", "name", "is", "Dalia")
## [1] "My name is Dalia"
# ... so str_c() has to be given sep = " " to build the same string.
str_c("My", "name", "is", "Dalia", sep = " ")
## [1] "My name is Dalia"
str_length() gives the length of each baby name. With str_sub(), we can then extract the middle letter. Names with an even number of letters have no single middle letter, so you can either set those to NA (as done below) or leave them and take the letter just after the midpoint as the middle of the string.
# Add the length of each name and its middle letter.
# For a name with an odd number of letters the middle position is
# (length + 1) / 2. Integer division keeps that position a whole number, so
# str_sub() is never handed a fractional index such as 5.5.
# Names with an even number of letters have no single middle letter and
# get NA instead.
babynames |>
  mutate(
    name_length = str_length(name),
    middle_char = if_else(
      name_length %% 2L == 1L,
      str_sub(
        name,
        (name_length + 1L) %/% 2L,
        (name_length + 1L) %/% 2L
      ),
      NA_character_
    )
  )
## # A tibble: 1,924,665 × 7
## year sex name n prop name_length middle_char
## <dbl> <chr> <chr> <int> <dbl> <int> <chr>
## 1 1880 F Mary 7065 0.0724 4 <NA>
## 2 1880 F Anna 2604 0.0267 4 <NA>
## 3 1880 F Emma 2003 0.0205 4 <NA>
## 4 1880 F Elizabeth 1939 0.0199 9 a
## 5 1880 F Minnie 1746 0.0179 6 <NA>
## 6 1880 F Margaret 1578 0.0162 8 <NA>
## 7 1880 F Ida 1472 0.0151 3 d
## 8 1880 F Alice 1414 0.0145 5 i
## 9 1880 F Bertha 1320 0.0135 6 <NA>
## 10 1880 F Sarah 1288 0.0132 5 r
## # ℹ 1,924,655 more rows
By grouping by year and finding the mean length of the names, you can see how name lengths have increased over time.
# Average name length for each year. Every row of babynames contributes one
# name to the average, regardless of how many babies (n) received that name.
babynames_length_by_year <- babynames |>
  group_by(year) |>
  summarize(mean_name_length = mean(str_length(name)))

babynames_length_by_year
## # A tibble: 138 × 2
## year mean_name_length
## <dbl> <dbl>
## 1 1880 5.7
## 2 1881 5.67
## 3 1882 5.72
## 4 1883 5.70
## 5 1884 5.72
## 6 1885 5.71
## 7 1886 5.71
## 8 1887 5.68
## 9 1888 5.70
## 10 1889 5.69
## # ℹ 128 more rows
To find the popularity of the first letters, count the names that start with each letter using summarize(). You can also group by year together with the first (or last) letter to see how the trends change over time.
# Number of babynames rows for every (year, first letter) pair.
# The first letter is derived directly inside group_by(); the summarized
# result stays grouped by year.
babynames_count_letter_by_year <- babynames |>
  group_by(year, first_letter = str_sub(name, end = 1)) |>
  summarize(count = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Largest first-letter count within each year, tagged with max = "T" so the
# matching (year, count) rows can be identified after a join.
babynames_max_letter_by_year <- babynames |>
  group_by(year, first_letter = str_sub(name, end = 1)) |>
  summarize(letter_total = n()) |>
  # The result above is still grouped by year, so this is a per-year maximum.
  summarize(count = max(letter_total)) |>
  mutate(max = "T")
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Attach the max = "T" flag to the letter(s) whose count equals the yearly
# maximum; every other row gets NA in max.
babynames_count_letter_by_year <- babynames_count_letter_by_year |>
  left_join(babynames_max_letter_by_year, by = join_by(year, count))

# Keep only the flagged rows. A year with tied letters keeps several rows.
first_letter_name_popular <- filter(
  babynames_count_letter_by_year,
  max == "T"
)

first_letter_name_popular
## # A tibble: 141 × 4
## # Groups: year [138]
## year first_letter count max
## <dbl> <chr> <int> <chr>
## 1 1880 A 190 T
## 2 1881 A 188 T
## 3 1882 A 199 T
## 4 1883 A 206 T
## 5 1884 A 215 T
## 6 1885 L 215 T
## 7 1886 L 229 T
## 8 1887 A 230 T
## 9 1888 L 246 T
## 10 1889 A 243 T
## # ℹ 131 more rows
Now to view the popularity of the last letters, you would do the following.
# Number of babynames rows for every (year, last letter) pair.
# A negative start counts from the end of the string, so start = -1 picks
# the final character. The summarized result stays grouped by year.
babynames_count_last_letter_by_year <- babynames |>
  group_by(year, last_letter = str_sub(name, start = -1)) |>
  summarize(last_letter_count = n())
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Largest last-letter count within each year, tagged with max = "T" so the
# matching rows can be identified after a join.
babynames_max_last_letter_by_year <- babynames |>
  group_by(year, last_letter = str_sub(name, start = -1)) |>
  summarize(letter_total = n()) |>
  # The result above is still grouped by year, so this is a per-year maximum.
  summarize(max_last_letter_count = max(letter_total)) |>
  mutate(max = "T")
## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.
# Attach the max = "T" flag to the letter(s) whose count equals the yearly
# maximum; every other row gets NA in max.
babynames_count_last_letter_by_year <- babynames_count_last_letter_by_year |>
  left_join(
    babynames_max_last_letter_by_year,
    by = join_by(year, last_letter_count == max_last_letter_count)
  )

# Keep only the flagged rows. A year with tied letters keeps several rows.
last_letter_name_popular <- filter(
  babynames_count_last_letter_by_year,
  max == "T"
)

last_letter_name_popular
## # A tibble: 139 × 4
## # Groups: year [138]
## year last_letter last_letter_count max
## <dbl> <chr> <int> <chr>
## 1 1880 e 490 T
## 2 1881 e 484 T
## 3 1882 e 538 T
## 4 1883 e 542 T
## 5 1884 e 584 T
## 6 1885 e 614 T
## 7 1886 e 633 T
## 8 1887 e 633 T
## 9 1888 e 711 T
## 10 1889 e 721 T
## # ℹ 129 more rows
One way to represent this information is to create a table of special dates whose primary key is the date (year, month, and day). Joining it to flights adds a column describing whether or not each flight occurs on one of the designated special days and, if so, which one.
# Lookup table of 2013 holidays keyed by calendar date (year, month, day).
# Thanksgiving 2013 fell on Thursday, November 28 (the fourth Thursday of
# November), not on the 29th.
special_days <- tibble(
  year = c(2013, 2013, 2013, 2013),
  month = c(1, 7, 11, 12),
  day = c(1, 4, 28, 25),
  holiday = c(
    "New Years Day",
    "Independence Day",
    "Thanksgiving Day",
    "Christmas Day"
  )
)

# Left join so that every flight is kept; flights on any other date get NA in
# holiday. Each flight can match at most one special day, and the
# relationship check turns an accidental duplicate date in special_days into
# an error instead of silently duplicating flights.
flights_special_days <- flights |>
  left_join(
    special_days,
    by = join_by(year, month, day),
    relationship = "many-to-one"
  )

flights_special_days
## # A tibble: 336,776 × 20
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <dbl> <dbl> <dbl> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 517 515 2 830 819
## 2 2013 1 1 533 529 4 850 830
## 3 2013 1 1 542 540 2 923 850
## 4 2013 1 1 544 545 -1 1004 1022
## 5 2013 1 1 554 600 -6 812 837
## 6 2013 1 1 554 558 -4 740 728
## 7 2013 1 1 555 600 -5 913 854
## 8 2013 1 1 557 600 -3 709 723
## 9 2013 1 1 557 600 -3 838 846
## 10 2013 1 1 558 600 -2 753 745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>, holiday <chr>
The first part of the code below identifies the top 10 most popular destinations. We then have to filter the data to get all of the flights to those destinations, which can be accomplished with a filtering join (semi_join()).
# The ten destinations with the most flights, busiest first.
top_dest <- flights |>
  count(dest) |>
  arrange(desc(n)) |>
  slice_head(n = 10)

# Filtering join: keep the flights whose dest appears in top_dest, without
# adding any of top_dest's columns.
flights_top_dest <- semi_join(flights, top_dest, by = "dest")

flights_top_dest
## # A tibble: 141,145 × 19
## year month day dep_time sched_dep_time dep_delay arr_time sched_arr_time
## <int> <int> <int> <int> <int> <dbl> <int> <int>
## 1 2013 1 1 542 540 2 923 850
## 2 2013 1 1 554 600 -6 812 837
## 3 2013 1 1 554 558 -4 740 728
## 4 2013 1 1 555 600 -5 913 854
## 5 2013 1 1 557 600 -3 838 846
## 6 2013 1 1 558 600 -2 753 745
## 7 2013 1 1 558 600 -2 924 917
## 8 2013 1 1 558 600 -2 923 937
## 9 2013 1 1 559 559 0 702 706
## 10 2013 1 1 600 600 0 851 858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## # tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## # hour <dbl>, minute <dbl>, time_hour <dttm>