Import your data

# Load the TidyTuesday 2021-09-07 race results straight from GitHub.
# NOTE(review): missing values appear to be stored as the literal text "\N",
# which would explain why columns such as position and milliseconds are parsed
# as character instead of numeric -- confirm before converting them.
results <- readr::read_csv(
    "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-07/results.csv"
)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 25220 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): position, positionText, time, milliseconds, fastestLap, rank, fast...
## dbl (10): resultId, raceId, driverId, constructorId, number, grid, positionO...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Summarise every column (type, missing count, distribution) to check the import
skimr::skim(results)
Data summary
Name results
Number of rows 25220
Number of columns 18
_______________________
Column type frequency:
character 8
numeric 10
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
position 0 1 1 2 0 34 0
positionText 0 1 1 2 0 39 0
time 0 1 2 11 0 6488 0
milliseconds 0 1 2 8 0 6687 0
fastestLap 0 1 1 2 0 80 0
rank 0 1 1 2 0 26 0
fastestLapTime 0 1 2 8 0 6266 0
fastestLapSpeed 0 1 2 7 0 6395 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
resultId 0 1 12611.23 7281.58 1 6305.75 12610.5 18915.25 25225 ▇▇▇▇▇
raceId 0 1 517.95 290.34 1 287.00 503.0 762.00 1064 ▆▇▇▆▆
driverId 0 1 250.84 258.25 1 56.00 158.0 347.00 854 ▇▃▂▁▂
constructorId 0 1 47.48 58.39 1 6.00 25.0 57.00 214 ▇▂▁▁▁
number 6 1 17.59 14.80 0 7.00 15.0 23.00 208 ▇▁▁▁▁
grid 0 1 11.21 7.27 0 5.00 11.0 17.00 34 ▇▇▇▃▁
positionOrder 0 1 12.93 7.74 1 6.00 12.0 19.00 39 ▇▇▆▂▁
points 0 1 1.80 4.03 0 0.00 0.0 2.00 50 ▇▁▁▁▁
laps 0 1 45.79 30.04 0 21.00 52.0 66.00 200 ▅▇▁▁▁
statusId 0 1 17.72 26.10 1 1.00 11.0 14.00 139 ▇▁▁▁▁

Chapter 15

Create a factor

# Tally the rows per finishing position to see which levels the column holds
count(results, position)
## # A tibble: 34 × 2
##    position     n
##    <chr>    <int>
##  1 "\\N"    10762
##  2 "1"       1051
##  3 "10"       901
##  4 "11"       824
##  5 "12"       723
##  6 "13"       636
##  7 "14"       528
##  8 "15"       455
##  9 "16"       368
## 10 "17"       276
## # … with 24 more rows
# Show only the rows whose position is not the "\N" placeholder.
# The filtered tibble is printed, not stored, so later chunks repeat this filter.
filter(results, position != "\\N")
## # A tibble: 14,458 × 18
##    resultId raceId driverId constr…¹ number  grid posit…² posit…³ posit…⁴ points
##       <dbl>  <dbl>    <dbl>    <dbl>  <dbl> <dbl> <chr>   <chr>     <dbl>  <dbl>
##  1        1     18        1        1     22     1 1       1             1     10
##  2        2     18        2        2      3     5 2       2             2      8
##  3        3     18        3        3      7     7 3       3             3      6
##  4        4     18        4        4      5    11 4       4             4      5
##  5        5     18        5        1     23     3 5       5             5      4
##  6        6     18        6        3      8    13 6       6             6      3
##  7        7     18        7        5     14    17 7       7             7      2
##  8        8     18        8        6      1    15 8       8             8      1
##  9       23     19        8        6      1     2 1       1             1     10
## 10       24     19        9        2      4     4 2       2             2      8
## # … with 14,448 more rows, 8 more variables: laps <dbl>, time <chr>,
## #   milliseconds <chr>, fastestLap <chr>, rank <chr>, fastestLapTime <chr>,
## #   fastestLapSpeed <chr>, statusId <dbl>, and abbreviated variable names
## #   ¹​constructorId, ²​position, ³​positionText, ⁴​positionOrder
# Labels of the first five finishing positions, kept as strings because the
# position column itself is character
top_5_position <- as.character(1:5)

Modify factor order

Make two bar charts here: one before ordering and another after.

# Transform data: calculate the average number of laps for each finishing position
# (the "\N" placeholder is not filtered out here, so it forms its own group)
position_and_laps <- results %>%
    group_by(position) %>%
    summarise(avg_laps = mean(laps, na.rm = TRUE))

# Print the summary table
position_and_laps
## # A tibble: 34 × 2
##    position avg_laps
##    <chr>       <dbl>
##  1 "\\N"        22.0
##  2 "1"          65.0
##  3 "10"         62.2
##  4 "11"         62.6
##  5 "12"         62.1
##  6 "13"         61.3
##  7 "14"         61.0
##  8 "15"         61.7
##  9 "16"         61.6
## 10 "17"         61.7
## # … with 24 more rows
# Plot: mean laps per position, with positions left in their default order
ggplot(position_and_laps, aes(x = avg_laps, y = position)) +
    geom_point()

# Plot again, this time with the positions sorted by their mean lap count
position_and_laps %>%
    mutate(position = fct_reorder(position, avg_laps)) %>%
    ggplot(aes(x = avg_laps, y = position)) +
    geom_point() +
    # Labeling: drop the y-axis title and describe the x axis
    labs(x = "Mean Laps per Position", y = NULL)

Modify factor levels

Show examples of three functions:

  • fct_recode
# fct_recode: relabel positions 1-5 as P1-P5; every other level keeps its label
results %>%
    filter(position != "\\N") %>%
    mutate(
        top_5_position_re = fct_recode(
            position,
            P1 = "1", P2 = "2", P3 = "3", P4 = "4", P5 = "5"
        )
    ) %>%
    select(position, top_5_position_re) %>%
    # Show a random handful of rows to compare old and new labels
    sample_n(5)
## # A tibble: 5 × 2
##   position top_5_position_re
##   <chr>    <fct>            
## 1 13       13               
## 2 3        P3               
## 3 1        P1               
## 4 11       11               
## 5 2        P2
  • fct_collapse
# fct_collapse: merge positions 1-2 into "top_2" and positions 3-5 into "last_3";
# levels not listed stay as they are
results %>%
    filter(position != "\\N") %>%
    mutate(
        top_5_position_col = fct_collapse(
            position,
            top_2 = c("1", "2"),
            last_3 = c("3", "4", "5")
        )
    ) %>%
    select(position, top_5_position_col) %>%
    # Show a random handful of rows to compare old and new labels
    sample_n(10)
## # A tibble: 10 × 2
##    position top_5_position_col
##    <chr>    <fct>             
##  1 8        8                 
##  2 3        last_3            
##  3 12       12                
##  4 10       10                
##  5 10       10                
##  6 8        8                 
##  7 11       11                
##  8 7        7                 
##  9 5        last_3            
## 10 7        7
  • fct_lump
# fct_lump family: keep the 3 most frequent positions and lump the rest into
# "Other". fct_lump_n() is the explicit form of fct_lump(n = ...), which forcats
# keeps only for historical reasons.
# Lumping goes by how often a level occurs, not by its numeric rank, which is
# why position "1" can end up in "Other" in the sampled output.
results %>%
    filter(position != "\\N") %>%
    mutate(top_5_position_lump = fct_lump_n(position, n = 3)) %>%
    select(position, top_5_position_lump) %>%
    # Show a random handful of rows to compare old and new labels
    sample_n(5)
## # A tibble: 5 × 2
##   position top_5_position_lump
##   <chr>    <fct>              
## 1 2        2                  
## 2 16       Other              
## 3 10       Other              
## 4 9        Other              
## 5 1        Other

Chapter 16

No need to do anything here.