Import your data

# Read the winners table (tdf_winners.csv) into a tibble.
# The path is relative, so it is resolved against the working directory
# in effect when this document is run.
Mydata <- read_csv(file = "../00_data/tdf_winners.csv")
## Rows: 106 Columns: 19
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (7): winner_name, winner_team, full_name, nickname, birth_town, birth_c...
## dbl  (9): edition, distance, time_overall, time_margin, stage_wins, stages_l...
## date (3): start_date, born, died
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Display the imported tibble (first rows plus column types).
print(Mydata)
## # A tibble: 106 × 19
##    edition start_date winner_name  winner_team distance time_overall time_margin
##      <dbl> <date>     <chr>        <chr>          <dbl>        <dbl>       <dbl>
##  1       1 1903-07-01 Maurice Gar… La Françai…     2428         94.6        2.99
##  2       2 1904-07-02 Henri Cornet Conte           2428         96.1        2.27
##  3       3 1905-07-09 Louis Trous… Peugeot–Wo…     2994         NA         NA   
##  4       4 1906-07-04 René Pottier Peugeot–Wo…     4637         NA         NA   
##  5       5 1907-07-08 Lucien Peti… Peugeot–Wo…     4488         NA         NA   
##  6       6 1908-07-13 Lucien Peti… Peugeot–Wo…     4497         NA         NA   
##  7       7 1909-07-05 François Fa… Alcyon–Dun…     4498         NA         NA   
##  8       8 1910-07-01 Octave Lapi… Alcyon–Dun…     4734         NA         NA   
##  9       9 1911-07-02 Gustave Gar… Alcyon–Dun…     5343         NA         NA   
## 10      10 1912-06-30 Odile Defra… Alcyon–Dun…     5289         NA         NA   
## # ℹ 96 more rows
## # ℹ 12 more variables: stage_wins <dbl>, stages_led <dbl>, height <dbl>,
## #   weight <dbl>, age <dbl>, born <date>, died <date>, full_name <chr>,
## #   nickname <chr>, birth_town <chr>, birth_country <chr>, nationality <chr>

Chapter 15

Create a factor

# Tally rows per birth country. birth_country is still a character column
# here, so the tally comes back in alphabetical order.
count(Mydata, birth_country)
## # A tibble: 15 × 2
##    birth_country     n
##    <chr>         <int>
##  1 Australia         1
##  2 Belgium          19
##  3 Columbia          1
##  4 Denmark           1
##  5 France           36
##  6 Germany           1
##  7 Ireland           1
##  8 Italy            11
##  9 Kenya             4
## 10 Luxembourg        4
## 11 Netherlands       2
## 12 Spain            12
## 13 Switzerland       2
## 14 USA              10
## 15 Wales             1
# Countries written out by hand from most to fewest wins; this order becomes
# the level order of the factor created below.
birth_country_levels <- c(
  "France", "Belgium", "Spain", "Italy", "USA",
  "Luxembourg", "Kenya", "Switzerland", "Netherlands", "Ireland",
  "Denmark", "Germany", "Australia", "Wales", "Columbia"
)

# Convert birth_country from character to a factor with those explicit levels.
Mydata_rev <- mutate(
  Mydata,
  birth_country = factor(birth_country, levels = birth_country_levels)
)

# Tally again: rows now follow the level order instead of the alphabet.
count(Mydata_rev, birth_country)
## # A tibble: 15 × 2
##    birth_country     n
##    <fct>         <int>
##  1 France           36
##  2 Belgium          19
##  3 Spain            12
##  4 Italy            11
##  5 USA              10
##  6 Luxembourg        4
##  7 Kenya             4
##  8 Switzerland       2
##  9 Netherlands       2
## 10 Ireland           1
## 11 Denmark           1
## 12 Germany           1
## 13 Australia         1
## 14 Wales             1
## 15 Columbia          1

Modify factor order

# Bar chart of wins per birth country built from the character column, so the
# bars appear in alphabetical order along the x axis.
ggplot(count(Mydata, birth_country), aes(x = birth_country, y = n)) +
  # geom_col() draws bar heights straight from y (same as stat = "identity").
  geom_col() +
  labs(title = "Wins per country", x = "Birth Country", y = "Wins") +
  # Tilt the country names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

# Same bar chart, but built from Mydata_rev, where birth_country is a factor:
# the bars follow the factor's level order instead of the alphabet.
ggplot(count(Mydata_rev, birth_country), aes(x = birth_country, y = n)) +
  # geom_col() draws bar heights straight from y (same as stat = "identity").
  geom_col() +
  labs(title = "Wins per country", x = "Birth Country", y = "Wins") +
  # Tilt the country names so they do not overlap.
  theme(axis.text.x = element_text(angle = 45, hjust = 1))

Modify factor levels

Show examples of three functions:

  • fct_recode
# Abbreviate every birth country to a three-letter code with fct_recode()
# (new label on the left, existing value on the right).
# Kenya and Wales are recoded too, so the result does not mix codes with
# full names. "USA" is already a three-letter code and needs no entry.
Mydata %>%
  mutate(
    birth_country = fct_recode(
      birth_country,
      "AUS" = "Australia",
      "BEL" = "Belgium",
      "COL" = "Columbia", # spelled the way it appears in the data
      "DEN" = "Denmark",
      "FRA" = "France",
      "GER" = "Germany",
      "IRL" = "Ireland",
      "ITA" = "Italy",
      "KEN" = "Kenya",
      "LUX" = "Luxembourg",
      "NED" = "Netherlands",
      "ESP" = "Spain",
      "SUI" = "Switzerland",
      "WAL" = "Wales"
    )
  ) %>%
  count(birth_country)
## # A tibble: 15 × 2
##    birth_country     n
##    <fct>         <int>
##  1 AUS               1
##  2 BEL              19
##  3 COL               1
##  4 DEN               1
##  5 FRA              36
##  6 GER               1
##  7 IRL               1
##  8 ITA              11
##  9 KEN               4
## 10 LUX               4
## 11 NED               2
## 12 ESP              12
## 13 SUI               2
## 14 USA              10
## 15 WAL               1
  • fct_collapse
# Collapse the birth countries into continents with fct_collapse().
# "Columbia" is the data's spelling of Colombia, which is in South America,
# so it gets its own group instead of being counted under Europe.
Mydata %>%
  mutate(
    birth_country = fct_collapse(
      birth_country,
      Europe = c(
        "France", "Belgium", "Italy", "Luxembourg", "Netherlands",
        "Switzerland", "Denmark", "Germany", "Ireland", "Spain", "Wales"
      ),
      North_America = "USA",
      South_America = "Columbia",
      Africa = "Kenya",
      Oceania = "Australia"
    )
  ) %>%
  count(birth_country)
## # A tibble: 5 × 2
##   birth_country     n
##   <fct>         <int>
## 1 Oceania           1
## 2 Europe           90
## 3 South_America     1
## 4 Africa            4
## 5 North_America    10
  • fct_lump
# Keep the five birth countries with the most wins and lump every other
# country into a single "Other" level.
# Without n or prop, fct_lump() left all 15 levels of this column untouched,
# so an explicit n is needed for the example to show any lumping.
Mydata %>%
  mutate(birth_country = fct_lump(birth_country, n = 5)) %>%
  count(birth_country)
## # A tibble: 6 × 2
##   birth_country     n
##   <fct>         <int>
## 1 Belgium          19
## 2 France           36
## 3 Italy            11
## 4 Spain            12
## 5 USA              10
## 6 Other            18

Chapter 16

No need to do anything here.