Import your data

# Load the Excel workbook that holds the NHL data
data <- read_excel(path = "../00_data/NHLDATA.xlsx")
# Show every distinct value of birth_country
distinct(data, birth_country)
## # A tibble: 46 × 1
##    birth_country
##    <chr>        
##  1 CAN          
##  2 SWE          
##  3 CZE          
##  4 USA          
##  5 LVA          
##  6 RUS          
##  7 NGA          
##  8 FIN          
##  9 SVK          
## 10 DEU          
## # ℹ 36 more rows

Chapter 15

Create a factor

# Turn the character column 'birth_country' into a factor
data <- mutate(data, birth_country = as_factor(birth_country))

# Inspect the levels the factor ended up with
data %>%
  pull(birth_country) %>%
  levels()
##  [1] "CAN" "SWE" "CZE" "USA" "LVA" "RUS" "NGA" "FIN" "SVK" "DEU" "GBR" "UKR"
## [13] "KAZ" "POL" "PRY" "CHE" "FRA" "VEN" "NLD" "SRB" "HTI" "LTU" "BLR" "AUT"
## [25] "IRL" "LBN" "DNK" "ITA" "NOR" "SVN" "JAM" "KOR" "BRN" "ZAF" "BEL" "BHS"
## [37] "EST" "BRA" "IDN" "TWN" "JPN" "UZB" "AUS" "HRV" "BGR" "TZA"

Modify factor order

Make two bar charts here: one before reordering the factor and another after.

# Bar chart using the factor's current level order
data %>%
  ggplot(aes(x = birth_country)) +
  geom_bar(fill = "steelblue") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Birth Country (Unordered)", x = "Country", y = "Count")

# Re-level the factor so the most frequent countries come first
data <- mutate(data, birth_country = fct_infreq(birth_country))

# Same chart again, now drawn with the frequency-ordered levels
data %>%
  ggplot(aes(x = birth_country)) +
  geom_bar(fill = "darkgreen") +
  theme(axis.text.x = element_text(angle = 45, hjust = 1)) +
  labs(title = "Birth Country (Ordered by Frequency)", x = "Country", y = "Count")

Modify factor levels

Show examples of three functions:

  • fct_recode
# fct_recode: rename individual levels by hand (new name = old level).
# The result goes into a new column so 'birth_country' keeps its
# three-letter codes for the examples that follow.
data <- data %>%
  mutate(birth_country_recoded = fct_recode(birth_country,
                                            "Canada" = "CAN",
                                            "United States" = "USA"))

# Counts per level after recoding (level order is unchanged)
table(data$birth_country_recoded)
## 
##        Canada United States           SWE           RUS           CZE 
##          5468          1403           403           287           258 
##           FIN           SVK           GBR           DEU           CHE 
##           257            93            52            42            42 
##           LVA           DNK           UKR           FRA           BLR 
##            26            18            17            14            11 
##           AUT           KAZ           NOR           POL           ITA 
##            11            10             9             8             4 
##           NLD           LTU           IRL           NGA           VEN 
##             3             3             3             2             2 
##           SRB           SVN           KOR           BEL           BRA 
##             2             2             2             2             2 
##           JPN           HRV           PRY           HTI           LBN 
##             2             2             1             1             1 
##           JAM           BRN           ZAF           BHS           EST 
##             1             1             1             1             1 
##           IDN           TWN           UZB           AUS           BGR 
##             1             1             1             1             1 
##           TZA 
##             1
  • fct_collapse
# fct_collapse: merge several existing levels into one broader group.
# The old levels must be spelled exactly as they are stored in the factor
# (three-letter codes such as "CAN"); names that do not match any level
# are not collapsed.
data <- data %>%
  mutate(birth_country_grouped = fct_collapse(birth_country,
                                              "North America" = c("CAN", "USA"),
                                              "Europe" = c("SWE", "FIN", "RUS", "CZE", "DEU")))

# Counts per level after collapsing
table(data$birth_country_grouped)
## 
## North America        Europe           SVK           GBR           CHE 
##          6871          1247            93            52            42 
##           LVA           DNK           UKR           FRA           BLR 
##            26            18            17            14            11 
##           AUT           KAZ           NOR           POL           ITA 
##            11            10             9             8             4 
##           NLD           LTU           IRL           NGA           VEN 
##             3             3             3             2             2 
##           SRB           SVN           KOR           BEL           BRA 
##             2             2             2             2             2 
##           JPN           HRV           PRY           HTI           LBN 
##             2             2             1             1             1 
##           JAM           BRN           ZAF           BHS           EST 
##             1             1             1             1             1 
##           IDN           TWN           UZB           AUS           BGR 
##             1             1             1             1             1 
##           TZA 
##             1
  • fct_lump
# fct_lump: keep the 5 most frequent countries and pool the rest as "Other"
data <- mutate(data, birth_country_lumped = fct_lump(birth_country, n = 5))

# Tabulate the lumped factor to see how many rows landed in each level
table(data[["birth_country_lumped"]])
## 
##   CAN   USA   SWE   RUS   CZE Other 
##  5468  1403   403   287   258   655

Chapter 16

No need to do anything here.