# Read the TidyTuesday 2021-09-07 `results` data set directly from GitHub.
# NOTE(review): several columns that look numeric (e.g. position, milliseconds)
# are parsed as character; the literal \N value counted in `position` further
# down looks like the cause -- confirm before converting those columns.
results <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-07/results.csv"
)
## Warning: One or more parsing issues, see `problems()` for details
## Rows: 25220 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (8): position, positionText, time, milliseconds, fastestLap, rank, fast...
## dbl (10): resultId, raceId, driverId, constructorId, number, grid, positionO...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Per-column overview of `results`: column types, missing values, and summary
# statistics for the character and numeric columns.
skimr::skim(results)
| Name | results |
| Number of rows | 25220 |
| Number of columns | 18 |
| _______________________ | |
| Column type frequency: | |
| character | 8 |
| numeric | 10 |
| ________________________ | |
| Group variables | None |
Variable type: character
| skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
|---|---|---|---|---|---|---|---|
| position | 0 | 1 | 1 | 2 | 0 | 34 | 0 |
| positionText | 0 | 1 | 1 | 2 | 0 | 39 | 0 |
| time | 0 | 1 | 2 | 11 | 0 | 6488 | 0 |
| milliseconds | 0 | 1 | 2 | 8 | 0 | 6687 | 0 |
| fastestLap | 0 | 1 | 1 | 2 | 0 | 80 | 0 |
| rank | 0 | 1 | 1 | 2 | 0 | 26 | 0 |
| fastestLapTime | 0 | 1 | 2 | 8 | 0 | 6266 | 0 |
| fastestLapSpeed | 0 | 1 | 2 | 7 | 0 | 6395 | 0 |
Variable type: numeric
| skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 | hist |
|---|---|---|---|---|---|---|---|---|---|---|
| resultId | 0 | 1 | 12611.23 | 7281.58 | 1 | 6305.75 | 12610.5 | 18915.25 | 25225 | ▇▇▇▇▇ |
| raceId | 0 | 1 | 517.95 | 290.34 | 1 | 287.00 | 503.0 | 762.00 | 1064 | ▆▇▇▆▆ |
| driverId | 0 | 1 | 250.84 | 258.25 | 1 | 56.00 | 158.0 | 347.00 | 854 | ▇▃▂▁▂ |
| constructorId | 0 | 1 | 47.48 | 58.39 | 1 | 6.00 | 25.0 | 57.00 | 214 | ▇▂▁▁▁ |
| number | 6 | 1 | 17.59 | 14.80 | 0 | 7.00 | 15.0 | 23.00 | 208 | ▇▁▁▁▁ |
| grid | 0 | 1 | 11.21 | 7.27 | 0 | 5.00 | 11.0 | 17.00 | 34 | ▇▇▇▃▁ |
| positionOrder | 0 | 1 | 12.93 | 7.74 | 1 | 6.00 | 12.0 | 19.00 | 39 | ▇▇▆▂▁ |
| points | 0 | 1 | 1.80 | 4.03 | 0 | 0.00 | 0.0 | 2.00 | 50 | ▇▁▁▁▁ |
| laps | 0 | 1 | 45.79 | 30.04 | 0 | 21.00 | 52.0 | 66.00 | 200 | ▅▇▁▁▁ |
| statusId | 0 | 1 | 17.72 | 26.10 | 1 | 1.00 | 11.0 | 14.00 | 139 | ▇▁▁▁▁ |
# Tally how many rows fall under each distinct value of `position`.
count(results, position)
## # A tibble: 34 × 2
## position n
## <chr> <int>
## 1 "\\N" 10762
## 2 "1" 1051
## 3 "10" 901
## 4 "11" 824
## 5 "12" 723
## 6 "13" 636
## 7 "14" 528
## 8 "15" 455
## 9 "16" 368
## 10 "17" 276
## # … with 24 more rows
# Keep only the rows whose `position` is not the literal \N placeholder.
filter(results, position != "\\N")
## # A tibble: 14,458 × 18
## resultId raceId driverId constr…¹ number grid posit…² posit…³ posit…⁴ points
## <dbl> <dbl> <dbl> <dbl> <dbl> <dbl> <chr> <chr> <dbl> <dbl>
## 1 1 18 1 1 22 1 1 1 1 10
## 2 2 18 2 2 3 5 2 2 2 8
## 3 3 18 3 3 7 7 3 3 3 6
## 4 4 18 4 4 5 11 4 4 4 5
## 5 5 18 5 1 23 3 5 5 5 4
## 6 6 18 6 3 8 13 6 6 6 3
## 7 7 18 7 5 14 17 7 7 7 2
## 8 8 18 8 6 1 15 8 8 8 1
## 9 23 19 8 6 1 2 1 1 1 10
## 10 24 19 9 2 4 4 2 2 2 8
## # … with 14,448 more rows, 8 more variables: laps <dbl>, time <chr>,
## # milliseconds <chr>, fastestLap <chr>, rank <chr>, fastestLapTime <chr>,
## # fastestLapSpeed <chr>, statusId <dbl>, and abbreviated variable names
## # ¹constructorId, ²position, ³positionText, ⁴positionOrder
# Labels of the first five positions, as character strings ("1" through "5").
top_5_position <- as.character(1:5)
Make two bar charts here: one before ordering and another after.
# Transform data: calculate the average number of laps for each `position`
position_and_laps <- results %>%
  group_by(position) %>%
  summarise(avg_laps = mean(laps, na.rm = TRUE))

# Print the summary table
position_and_laps
## # A tibble: 34 × 2
## position avg_laps
## <chr> <dbl>
## 1 "\\N" 22.0
## 2 "1" 65.0
## 3 "10" 62.2
## 4 "11" 62.6
## 5 "12" 62.1
## 6 "13" 61.3
## 7 "14" 61.0
## 8 "15" 61.7
## 9 "16" 61.6
## 10 "17" 61.7
## # … with 24 more rows
# Plot: average laps against position, with positions left in their
# default (unsorted) order
ggplot(position_and_laps, aes(x = avg_laps, y = position)) +
  geom_point()
# Same plot, but with the positions sorted by their average number of laps
ggplot(
  position_and_laps,
  aes(x = avg_laps, y = fct_reorder(position, avg_laps))
) +
  geom_point() +
  # Labeling: drop the y-axis title and describe the x-axis
  labs(x = "Mean Laps per Position", y = NULL)
Show examples of three functions: `fct_recode()`, `fct_collapse()`, and `fct_lump()`.
# fct_recode(): rename individual levels -- positions "1" to "5" become
# "P1" to "P5"; every other level keeps its original label.
results %>%
  filter(position != "\\N") %>%
  mutate(
    top_5_position_re = fct_recode(
      position,
      P1 = "1",
      P2 = "2",
      P3 = "3",
      P4 = "4",
      P5 = "5"
    )
  ) %>%
  select(position, top_5_position_re) %>%
  # Show a random handful of rows to compare old and new labels
  sample_n(5)
## # A tibble: 5 × 2
## position top_5_position_re
## <chr> <fct>
## 1 13 13
## 2 3 P3
## 3 1 P1
## 4 11 11
## 5 2 P2
# fct_collapse(): merge several levels into one -- "1" and "2" become
# "top_2", "3" to "5" become "last_3"; other levels are left unchanged.
results %>%
  filter(position != "\\N") %>%
  mutate(
    top_5_position_col = fct_collapse(
      position,
      top_2 = c("1", "2"),
      last_3 = c("3", "4", "5")
    )
  ) %>%
  select(position, top_5_position_col) %>%
  # Show a random handful of rows to compare old and new labels
  sample_n(10)
## # A tibble: 10 × 2
## position top_5_position_col
## <chr> <fct>
## 1 8 8
## 2 3 last_3
## 3 12 12
## 4 10 10
## 5 10 10
## 6 8 8
## 7 11 11
## 8 7 7
## 9 5 last_3
## 10 7 7
# fct_lump(): keep the 3 most frequent levels of `position` and lump all
# remaining levels together as "Other".
results %>%
  filter(position != "\\N") %>%
  mutate(
    top_5_position_lump = fct_lump(position, n = 3)
  ) %>%
  select(position, top_5_position_lump) %>%
  # Show a random handful of rows to compare old and new labels
  sample_n(5)
## # A tibble: 5 × 2
## position top_5_position_lump
## <chr> <fct>
## 1 2 2
## 2 16 Other
## 3 10 Other
## 4 9 Other
## 5 1 Other
No need to do anything here.