Workshop 7: Creating strings and using join commands

Libraries needed:

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

library(babynames)
library(nycflights13)

14.3.4

Question 2: What’s the difference between paste() and paste0()? How can you recreate the equivalent of paste() with str_c()?

paste() and paste0() differ in the separator they place between the elements: paste() has a sep argument that defaults to a single space (and can be changed to anything you specify), while paste0() always joins the elements with no separator in between.

# paste() joins the pieces with its separator; " " is the default value of sep.
paste("My", "name", "is", "Dalia", sep = " ")

## [1] "My name is Dalia"

# paste0() glues the pieces together with no separator at all.
paste0("My", "name", "is", "Dalia")

## [1] "MynameisDalia"

To recreate the equivalent of paste() with str_c(), you need to specify the separator in the str_c() call, because str_c() uses no separator by default. To recreate paste0() instead, str_c() by itself would work, which is the same as str_c(..., sep = "").

# Base R reference result: the pieces joined by single spaces.
paste("My", "name", "is", "Dalia", sep = " ")

## [1] "My name is Dalia"

# str_c() defaults to an empty separator, so sep = " " is needed to match
# paste().
str_c("My", "name", "is", "Dalia", sep = " ")

## [1] "My name is Dalia"

14.5.3

Question 2: Use str_length() and str_sub() to extract the middle letter from each baby name. What will you do if the string has an even number of characters?

str_length() gives the length of each baby name, and str_sub() can then extract the middle letter. Names with an even number of characters have no single middle letter, so you can either replace the result for those names with NA (as done below), or leave it and take the letter just after the midpoint of the string.

# Extract the middle letter of every name. Names with an even number of
# characters have no single middle letter and get NA instead.
babynames |>
  mutate(
    name_length = str_length(name),
    # For odd n, (n + 1) %/% 2 is the position of the centre character.
    middle_char = if_else(
      name_length %% 2L == 0L,
      NA_character_,
      str_sub(name, (name_length + 1L) %/% 2L, (name_length + 1L) %/% 2L)
    )
  )

## # A tibble: 1,924,665 × 7
##     year sex   name          n   prop name_length middle_char
##    <dbl> <chr> <chr>     <int>  <dbl>       <int> <chr>      
##  1  1880 F     Mary       7065 0.0724           4 <NA>       
##  2  1880 F     Anna       2604 0.0267           4 <NA>       
##  3  1880 F     Emma       2003 0.0205           4 <NA>       
##  4  1880 F     Elizabeth  1939 0.0199           9 a          
##  5  1880 F     Minnie     1746 0.0179           6 <NA>       
##  6  1880 F     Margaret   1578 0.0162           8 <NA>       
##  7  1880 F     Ida        1472 0.0151           3 d          
##  8  1880 F     Alice      1414 0.0145           5 i          
##  9  1880 F     Bertha     1320 0.0135           6 <NA>       
## 10  1880 F     Sarah      1288 0.0132           5 r          
## # ℹ 1,924,655 more rows

Question 3. Are there any major trends in the length of babynames over time? What about the popularity of first and last letters?

By grouping by year and finding the mean length of the names, you can see how name lengths have increased over time.

# Mean number of characters per name for each year. Every babynames row
# counts once; the mean is not weighted by the number of babies (n).
babynames_length_by_year <- babynames |>
  group_by(year) |>
  summarise(mean_name_length = mean(str_length(name)))

babynames_length_by_year

## # A tibble: 138 × 2
##     year mean_name_length
##    <dbl>            <dbl>
##  1  1880             5.7 
##  2  1881             5.67
##  3  1882             5.72
##  4  1883             5.70
##  5  1884             5.72
##  6  1885             5.71
##  7  1886             5.71
##  8  1887             5.68
##  9  1888             5.70
## 10  1889             5.69
## # ℹ 128 more rows

To find the popularity of the first letters, count the rows for each first letter with the summarize function. You can also group by year and by the first or last letter to see trends over time as well.

# Number of babynames rows per (year, first letter). The result stays grouped
# by year because summarise() only drops the last grouping level.
babynames_count_letter_by_year <- babynames |>
  group_by(year, first_letter = str_sub(name, 1, 1)) |>
  summarise(count = n())

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Largest first-letter count within each year, tagged with max = "T" so the
# winning rows can be picked out after joining back on (year, count).
babynames_max_letter_by_year <- babynames |>
  group_by(year, first_letter = str_sub(name, 1, 1)) |>
  summarise(first_letter_count = n()) |>
  # Still grouped by year here, so this collapses to one row per year.
  summarise(count = max(first_letter_count)) |>
  mutate(max = "T")

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Attach the max flag: only rows whose count equals that year's maximum find a
# match; every other row gets NA in the max column.
babynames_count_letter_by_year <- babynames_count_letter_by_year |>
  left_join(babynames_max_letter_by_year, by = join_by(year, count))

# Keep only the rows flagged as a yearly maximum (tied letters all remain),
# then display the result.
first_letter_name_popular <- filter(
  babynames_count_letter_by_year,
  max == "T"
)

first_letter_name_popular

## # A tibble: 141 × 4
## # Groups:   year [138]
##     year first_letter count max  
##    <dbl> <chr>        <int> <chr>
##  1  1880 A              190 T    
##  2  1881 A              188 T    
##  3  1882 A              199 T    
##  4  1883 A              206 T    
##  5  1884 A              215 T    
##  6  1885 L              215 T    
##  7  1886 L              229 T    
##  8  1887 A              230 T    
##  9  1888 L              246 T    
## 10  1889 A              243 T    
## # ℹ 131 more rows

Now to view the popularity of the last letters, you would do the following.

# Number of babynames rows per (year, last letter); negative positions in
# str_sub() count from the end of the string.
babynames_count_last_letter_by_year <- babynames |>
  group_by(year, last_letter = str_sub(name, -1, -1)) |>
  summarise(last_letter_count = n())

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Largest last-letter count within each year, tagged with max = "T" so the
# winning rows can be picked out after joining back on the count.
babynames_max_last_letter_by_year <- babynames |>
  group_by(year, last_letter = str_sub(name, -1, -1)) |>
  summarise(last_letter_count = n()) |>
  # Still grouped by year here, so this collapses to one row per year.
  summarise(max_last_letter_count = max(last_letter_count)) |>
  mutate(max = "T")

## `summarise()` has grouped output by 'year'. You can override using the
## `.groups` argument.

# Attach the max flag: only rows whose count equals that year's maximum find a
# match; every other row gets NA in the max column.
babynames_count_last_letter_by_year <- babynames_count_last_letter_by_year |>
  left_join(
    babynames_max_last_letter_by_year,
    by = join_by(year, last_letter_count == max_last_letter_count)
  )

# Keep only the rows flagged as a yearly maximum (tied letters all remain),
# then display the result.
last_letter_name_popular <- filter(
  babynames_count_last_letter_by_year,
  max == "T"
)

last_letter_name_popular

## # A tibble: 139 × 4
## # Groups:   year [138]
##     year last_letter last_letter_count max  
##    <dbl> <chr>                   <int> <chr>
##  1  1880 e                         490 T    
##  2  1881 e                         484 T    
##  3  1882 e                         538 T    
##  4  1883 e                         542 T    
##  5  1884 e                         584 T    
##  6  1885 e                         614 T    
##  7  1886 e                         633 T    
##  8  1887 e                         633 T    
##  9  1888 e                         711 T    
## 10  1889 e                         721 T    
## # ℹ 129 more rows

19.2.4

Question 4: We know that some days of the year are special and fewer people than usual fly on them (e.g., Christmas eve and Christmas day). How might you represent that data as a data frame? What would be the primary key? How would it connect to the existing data frames?

Creating a table of special dates is one way to represent the information, with the date (year, month, and day) as the primary key. Joining it to flights on those three columns adds a column describing whether or not the flight occurs on one of the designated special days, and if so, which one.

# Lookup table of special travel days in 2013. The primary key is the
# (year, month, day) combination, which is also the foreign key used to
# connect it to flights.
special_days <- tibble(
  year = c(2013, 2013, 2013, 2013),
  month = c(1, 7, 11, 12),
  # Thanksgiving 2013 fell on Thursday, November 28 (not the 29th).
  day = c(1, 4, 28, 25),
  holiday = c(
    "New Years Day",
    "Independence Day",
    "Thanksgiving Day",
    "Christmas Day"
  )
)

# Label every flight with its holiday (NA on ordinary days).
# relationship = "many-to-one" makes the join error if special_days ever
# contains the same date twice, instead of silently duplicating flights.
flights_special_days <- flights |>
  left_join(
    special_days,
    by = join_by(year, month, day),
    relationship = "many-to-one"
  )

flights_special_days

## # A tibble: 336,776 × 20
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <dbl> <dbl> <dbl>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      517            515         2      830            819
##  2  2013     1     1      533            529         4      850            830
##  3  2013     1     1      542            540         2      923            850
##  4  2013     1     1      544            545        -1     1004           1022
##  5  2013     1     1      554            600        -6      812            837
##  6  2013     1     1      554            558        -4      740            728
##  7  2013     1     1      555            600        -5      913            854
##  8  2013     1     1      557            600        -3      709            723
##  9  2013     1     1      557            600        -3      838            846
## 10  2013     1     1      558            600        -2      753            745
## # ℹ 336,766 more rows
## # ℹ 12 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>, holiday <chr>

19.3.4

Question 2: Imagine you’ve found the top 10 most popular destinations using this code: How can you find all flights to those destinations? top_dest <- flights2 |> count(dest, sort = TRUE) |> head(10)

The code above identifies the top 10 most popular destinations. We then have to filter the data to get all of the flights to those destinations, which can be accomplished with a filtering join.

# The ten destinations with the most flights, busiest first.
top_dest <- flights |>
  count(dest, sort = TRUE) |>
  slice_head(n = 10)

# Filtering join: keep the flights whose dest appears in top_dest, without
# adding any columns from top_dest.
flights_top_dest <- semi_join(flights, top_dest, by = join_by(dest))

flights_top_dest

## # A tibble: 141,145 × 19
##     year month   day dep_time sched_dep_time dep_delay arr_time sched_arr_time
##    <int> <int> <int>    <int>          <int>     <dbl>    <int>          <int>
##  1  2013     1     1      542            540         2      923            850
##  2  2013     1     1      554            600        -6      812            837
##  3  2013     1     1      554            558        -4      740            728
##  4  2013     1     1      555            600        -5      913            854
##  5  2013     1     1      557            600        -3      838            846
##  6  2013     1     1      558            600        -2      753            745
##  7  2013     1     1      558            600        -2      924            917
##  8  2013     1     1      558            600        -2      923            937
##  9  2013     1     1      559            559         0      702            706
## 10  2013     1     1      600            600         0      851            858
## # ℹ 141,135 more rows
## # ℹ 11 more variables: arr_delay <dbl>, carrier <chr>, flight <int>,
## #   tailnum <chr>, origin <chr>, dest <chr>, air_time <dbl>, distance <dbl>,
## #   hour <dbl>, minute <dbl>, time_hour <dttm>

Workshop 7

Dalia Rodriguez

2024-10-22