Module 11: Apply it to your data 10

Import your data

# Read the results table directly from the TidyTuesday GitHub repository
# (2021-09-07 data set).
results <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2021/2021-09-07/results.csv"
)

## Warning: One or more parsing issues, see `problems()` for details

## Rows: 25220 Columns: 18
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (8): position, positionText, time, milliseconds, fastestLap, rank, fast...
## dbl (10): resultId, raceId, driverId, constructorId, number, grid, positionO...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

# Print a per-column summary of results (column types, completeness, and
# basic distribution statistics).
skimr::skim(results)

Data summary
Name	results
Number of rows	25220
Number of columns	18
_______________________
Column type frequency:
character	8
numeric	10
________________________
Group variables	None

Variable type: character

skim_variable	complete_rate	min	max	n_unique
position	1	1	2	34
positionText	1	1	2	39
time	1	2	11	6488
milliseconds	1	2	8	6687
fastestLap	1	1	2	80
rank	1	1	2	26
fastestLapTime	1	2	8	6266
fastestLapSpeed	1	2	7	6395

Variable type: numeric

skim_variable	n_missing	complete_rate	mean	sd	p0	p25	p50	p75	p100	hist
resultId	0	1	12611.23	7281.58	1	6305.75	12610.5	18915.25	25225	▇▇▇▇▇
raceId	0	1	517.95	290.34	1	287.00	503.0	762.00	1064	▆▇▇▆▆
driverId	0	1	250.84	258.25	1	56.00	158.0	347.00	854	▇▃▂▁▂
constructorId	0	1	47.48	58.39	1	6.00	25.0	57.00	214	▇▂▁▁▁
number	6	1	17.59	14.80	0	7.00	15.0	23.00	208	▇▁▁▁▁
grid	0	1	11.21	7.27	0	5.00	11.0	17.00	34	▇▇▇▃▁
positionOrder	0	1	12.93	7.74	1	6.00	12.0	19.00	39	▇▇▆▂▁
points	0	1	1.80	4.03	0	0.00	0.0	2.00	50	▇▁▁▁▁
laps	0	1	45.79	30.04	0	21.00	52.0	66.00	200	▅▇▁▁▁
statusId	0	1	17.72	26.10	1	1.00	11.0	14.00	139	▇▁▁▁▁

Chapter 15

Create a factor

# Tally how many rows carry each distinct value of position.
count(results, position)

## # A tibble: 34 × 2
##    position     n
##    <chr>    <int>
##  1 "\\N"    10762
##  2 "1"       1051
##  3 "10"       901
##  4 "11"       824
##  5 "12"       723
##  6 "13"       636
##  7 "14"       528
##  8 "15"       455
##  9 "16"       368
## 10 "17"       276
## # … with 24 more rows

# Show only the rows whose position is not recorded as "\\N".
filter(results, position != "\\N")

## # A tibble: 14,458 × 18
##    resultId raceId driverId constr…¹ number  grid posit…² posit…³ posit…⁴ points
##       <dbl>  <dbl>    <dbl>    <dbl>  <dbl> <dbl> <chr>   <chr>     <dbl>  <dbl>
##  1        1     18        1        1     22     1 1       1             1     10
##  2        2     18        2        2      3     5 2       2             2      8
##  3        3     18        3        3      7     7 3       3             3      6
##  4        4     18        4        4      5    11 4       4             4      5
##  5        5     18        5        1     23     3 5       5             5      4
##  6        6     18        6        3      8    13 6       6             6      3
##  7        7     18        7        5     14    17 7       7             7      2
##  8        8     18        8        6      1    15 8       8             8      1
##  9       23     19        8        6      1     2 1       1             1     10
## 10       24     19        9        2      4     4 2       2             2      8
## # … with 14,448 more rows, 8 more variables: laps <dbl>, time <chr>,
## #   milliseconds <chr>, fastestLap <chr>, rank <chr>, fastestLapTime <chr>,
## #   fastestLapSpeed <chr>, statusId <dbl>, and abbreviated variable names
## #   ¹constructorId, ²position, ³positionText, ⁴positionOrder

# Position labels "1" through "5", stored as character strings.
top_5_position <- as.character(1:5)

Modify factor order

Make two charts here: one before reordering the factor and another after.

# Transform data: calculate the average number of laps for each value of
# position (rows recorded as "\\N" form their own group).
position_and_laps <- summarise(
  group_by(results, position),
  avg_laps = mean(laps, na.rm = TRUE)
)
position_and_laps

## # A tibble: 34 × 2
##    position avg_laps
##    <chr>       <dbl>
##  1 "\\N"        22.0
##  2 "1"          65.0
##  3 "10"         62.2
##  4 "11"         62.6
##  5 "12"         62.1
##  6 "13"         61.3
##  7 "14"         61.0
##  8 "15"         61.7
##  9 "16"         61.6
## 10 "17"         61.7
## # … with 24 more rows

# Plot: average laps against position, before any reordering of position.
ggplot(data = position_and_laps, mapping = aes(x = avg_laps, y = position)) +
  geom_point()

# Plot again, this time with position reordered by its average laps.
ggplot(
  data = position_and_laps,
  mapping = aes(x = avg_laps, y = fct_reorder(position, avg_laps))
) +
  geom_point() +
  # Labeling
  labs(x = "Mean Laps per Position", y = NULL)

Modify factor levels

Show examples of three functions:

fct_recode

# fct_recode: relabel positions "1".."5" as "P1".."P5"; all other values keep
# their original label. Five random rows are shown.
results %>%
  filter(position != "\\N") %>%
  select(position) %>%
  mutate(
    top_5_position_re = fct_recode(
      position,
      P1 = "1",
      P2 = "2",
      P3 = "3",
      P4 = "4",
      P5 = "5"
    )
  ) %>%
  sample_n(5)

## # A tibble: 5 × 2
##   position top_5_position_re
##   <chr>    <fct>            
## 1 13       13               
## 2 3        P3               
## 3 1        P1               
## 4 11       11               
## 5 2        P2

fct_collapse

# fct_collapse: merge positions "1" and "2" into top_2 and positions "3", "4"
# and "5" into last_3; all other values are left untouched. Ten random rows
# are shown.
results %>%
  filter(position != "\\N") %>%
  select(position) %>%
  mutate(
    top_5_position_col = fct_collapse(
      position,
      top_2 = c("1", "2"),
      last_3 = c("3", "4", "5")
    )
  ) %>%
  sample_n(10)

## # A tibble: 10 × 2
##    position top_5_position_col
##    <chr>    <fct>             
##  1 8        8                 
##  2 3        last_3            
##  3 12       12                
##  4 10       10                
##  5 10       10                
##  6 8        8                 
##  7 11       11                
##  8 7        7                 
##  9 5        last_3            
## 10 7        7

fct_lump

# fct_lump: keep the 3 most frequent position values and pool every other
# value into "Other". Five random rows are shown.
results %>%
  filter(position != "\\N") %>%
  select(position) %>%
  mutate(top_5_position_lump = fct_lump_n(position, n = 3)) %>%
  sample_n(5)

## # A tibble: 5 × 2
##   position top_5_position_lump
##   <chr>    <fct>              
## 1 2        2                  
## 2 16       Other              
## 3 10       Other              
## 4 9        Other              
## 5 1        Other

Chapter 16

No need to do anything here.

Module 11: Apply it to your data 10

Amanda Simpson

Import your data

Chapter 15

Create a factor

Modify factor order

Modify factor levels

Chapter 16