# Load the NHL player workbook (path is relative to the working directory)
data <- read_excel(path = "../00_data/NHLDATA.xlsx")
# List every distinct value found in the birth_country column
distinct(data, birth_country)
## # A tibble: 46 × 1
## birth_country
## <chr>
## 1 CAN
## 2 SWE
## 3 CZE
## 4 USA
## 5 LVA
## 6 RUS
## 7 NGA
## 8 FIN
## 9 SVK
## 10 DEU
## # ℹ 36 more rows
# Turn 'birth_country' into a factor; as_factor() keeps the levels in
# order of first appearance rather than sorting them alphabetically
data <- mutate(data, birth_country = as_factor(birth_country))
# Inspect the resulting level order
levels(data[["birth_country"]])
## [1] "CAN" "SWE" "CZE" "USA" "LVA" "RUS" "NGA" "FIN" "SVK" "DEU" "GBR" "UKR"
## [13] "KAZ" "POL" "PRY" "CHE" "FRA" "VEN" "NLD" "SRB" "HTI" "LTU" "BLR" "AUT"
## [25] "IRL" "LBN" "DNK" "ITA" "NOR" "SVN" "JAM" "KOR" "BRN" "ZAF" "BEL" "BHS"
## [37] "EST" "BRA" "IDN" "TWN" "JPN" "UZB" "AUS" "HRV" "BGR" "TZA"
Make two bar charts here: one before ordering the factor levels and another after.
# Bar chart with the levels in their current (first-appearance) order
ggplot(data = data, mapping = aes(x = birth_country)) +
  geom_bar(fill = "steelblue") +
  labs(
    title = "Birth Country (Unordered)",
    x = "Country",
    y = "Count"
  ) +
  # Tilt the country codes so they do not overlap on the x axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
# Reorder the levels so the most frequent country comes first
data <- mutate(data, birth_country = fct_infreq(birth_country))
# Same bar chart as before, now drawn from the frequency-ordered factor
ggplot(data = data, mapping = aes(x = birth_country)) +
  geom_bar(fill = "darkgreen") +
  labs(
    title = "Birth Country (Ordered by Frequency)",
    x = "Country",
    y = "Count"
  ) +
  # Tilt the country codes so they do not overlap on the x axis
  theme(axis.text.x = element_text(angle = 45, hjust = 1))
Show examples of three functions:
# fct_collapse: merge individual countries into broader regions.
# 'birth_country' stores ISO alpha-3 codes (see the levels printed above), so
# the groups must be spelled with those codes. Full names such as "Canada" or
# "Sweden" match no level: they are skipped with an "Unknown levels" warning,
# which previously left only "USA" renamed to "North America".
# Only the countries listed here are merged; every other level is kept as-is.
data <- data %>%
  mutate(
    birth_country_grouped = fct_collapse(
      birth_country,
      "North America" = c("CAN", "USA"),
      "Europe" = c("SWE", "FIN", "RUS", "CZE", "DEU")
    )
  )
# Counts per country before collapsing
table(data$birth_country)
##
## CAN USA SWE RUS CZE FIN SVK GBR DEU CHE LVA DNK UKR FRA BLR AUT
## 5468 1403 403 287 258 257 93 52 42 42 26 18 17 14 11 11
## KAZ NOR POL ITA NLD LTU IRL NGA VEN SRB SVN KOR BEL BRA JPN HRV
## 10 9 8 4 3 3 3 2 2 2 2 2 2 2 2 2
## PRY HTI LBN JAM BRN ZAF BHS EST IDN TWN UZB AUS BGR TZA
## 1 1 1 1 1 1 1 1 1 1 1 1 1 1
# Counts after collapsing. Expected values, derived from the table above:
# North America = CAN + USA, Europe = SWE + RUS + CZE + FIN + DEU.
table(data$birth_country_grouped)
##
## North America Europe SVK GBR CHE
## 6871 1247 93 52 42
## LVA DNK UKR FRA BLR
## 26 18 17 14 11
## AUT KAZ NOR POL ITA
## 11 10 9 8 4
## NLD LTU IRL NGA VEN
## 3 3 3 2 2
## SRB SVN KOR BEL BRA
## 2 2 2 2 2
## JPN HRV PRY HTI LBN
## 2 2 1 1 1
## JAM BRN ZAF BHS EST
## 1 1 1 1 1
## IDN TWN UZB AUS BGR
## 1 1 1 1 1
## TZA
## 1
# 3. fct_lump: keep the five most common countries and pool all remaining
# levels into a single "Other" level
data <- mutate(data, birth_country_lumped = fct_lump(birth_country, n = 5))
# Tabulate the lumped factor to show the resulting counts
table(data[["birth_country_lumped"]])
##
## CAN USA SWE RUS CZE Other
## 5468 1403 403 287 258 655
No need to do anything here.