# Read the Excel workbook into `data`, then display the resulting tibble.
data <- read_excel(path = "../01_module4/data/MyData.xlsx")
data
## # A tibble: 2,657 × 10
## REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019 Tanzan… Kokoa … 0.76 3- B,S… rich c… 3.25
## 2 2458 5150 U.S.A. 2019 Domini… Zorzal… 0.76 3- B,S… cocoa,… 3.5
## 3 2454 5150 U.S.A. 2019 Madaga… Bejofo… 0.76 3- B,S… cocoa,… 3.75
## 4 2542 5150 U.S.A. 2021 Fiji Matasa… 0.68 3- B,S… chewy,… 3
## 5 2546 5150 U.S.A. 2021 Venezu… Sur de… 0.72 3- B,S… fatty,… 3
## 6 2546 5150 U.S.A. 2021 Uganda Semuli… 0.8 3- B,S… mildly… 3.25
## 7 2542 5150 U.S.A. 2021 India Anamal… 0.68 3- B,S… milk b… 3.5
## 8 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sandy,… 2.75
## 9 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sl. dr… 3
## 10 797 A. Morin France 2012 Bolivia Bolivia 0.7 4- B,S… vegeta… 3.5
## # … with 2,647 more rows, and abbreviated variable names ¹Company.Manufacturer,
## # ²Company.Location, ³Review.Date, ⁴Country.of.Bean.Origin,
## # ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent, ⁷Ingredients,
## # ⁸Most.Memorable.Characteristics
Unordered factor levels
# Mean Rating per Company.Location (missing ratings are ignored).
# The result has one row per location, with the mean in `avg_rating`.
rating_by_location <-
  data %>%
  group_by(Company.Location) %>%
  summarise(avg_rating = mean(Rating, na.rm = TRUE))

# Display the summary table
rating_by_location
## # A tibble: 69 × 2
## Company.Location avg_rating
## <chr> <dbl>
## 1 Argentina 3.31
## 2 Australia 3.36
## 3 Austria 3.26
## 4 Belgium 3.07
## 5 Bolivia 3.25
## 6 Brazil 3.27
## 7 Canada 3.31
## 8 Chile 3.75
## 9 Colombia 3.21
## 10 Costa Rica 3.14
## # … with 59 more rows
# Scatter plot of mean rating (x) against company location (y).
# Company.Location is a plain character column here, so the locations are
# shown in their default order rather than sorted by rating.
ggplot(
  data = rating_by_location,
  mapping = aes(x = avg_rating, y = Company.Location)
) +
  geom_point()
Ordered factor levels
# Scatter plot of mean rating per company location, with the locations on
# the y axis reordered by their mean rating via fct_reorder().
ggplot(
  data = rating_by_location,
  mapping = aes(
    x = avg_rating,
    y = fct_reorder(.f = Company.Location, .x = avg_rating)
  )
) +
  geom_point() +
  # Axis titles: describe the x axis, drop the y axis title
  labs(x = "Mean Rating By Company Location", y = NULL)
Moving a single level to the front
# Scatter plot of mean rating per company location. Locations are first
# reordered by mean rating, then "U.S.A." is moved to the front of the
# factor levels with fct_relevel().
ggplot(
  data = rating_by_location,
  mapping = aes(
    x = avg_rating,
    y = fct_relevel(
      fct_reorder(.f = Company.Location, .x = avg_rating),
      "U.S.A."
    )
  )
) +
  geom_point() +
  # Axis titles: describe the x axis, drop the y axis title
  labs(x = "Mean Rating By Company Location", y = NULL)
Show examples of three functions: fct_recode (rename levels), fct_collapse (combine several levels into one), and fct_lump (lump infrequent levels together).
# List the unique values of Country.of.Bean.Origin
distinct(data, Country.of.Bean.Origin)
## # A tibble: 64 × 1
## Country.of.Bean.Origin
## <chr>
## 1 Tanzania
## 2 Dominican Republic
## 3 Madagascar
## 4 Fiji
## 5 Venezuela
## 6 Uganda
## 7 India
## 8 Bolivia
## 9 Peru
## 10 Panama
## # … with 54 more rows
# fct_recode(): rename a single level.
# "Peru" is relabelled as "South America" in a new column; the original
# column is kept alongside it so the two can be compared.
data %>%
  mutate(
    Country.of.Bean.Origin_rev = fct_recode(
      Country.of.Bean.Origin,
      "South America" = "Peru"
    )
  ) %>%
  select(Country.of.Bean.Origin, Country.of.Bean.Origin_rev) %>%
  # Keep only the rows affected by the recode
  filter(Country.of.Bean.Origin == "Peru")
## # A tibble: 256 × 2
## Country.of.Bean.Origin Country.of.Bean.Origin_rev
## <chr> <fct>
## 1 Peru South America
## 2 Peru South America
## 3 Peru South America
## 4 Peru South America
## 5 Peru South America
## 6 Peru South America
## 7 Peru South America
## 8 Peru South America
## 9 Peru South America
## 10 Peru South America
## # … with 246 more rows
# fct_collapse(): merge several levels into one.
# "Peru" and "Venezuela" are combined into a single "West Coast" level in a
# new column; the original column is kept alongside it for comparison.
data %>%
  mutate(
    Country.of.Bean.Origin_col = fct_collapse(
      Country.of.Bean.Origin,
      "West Coast" = c("Peru", "Venezuela")
    )
  ) %>%
  select(Country.of.Bean.Origin, Country.of.Bean.Origin_col) %>%
  # NOTE(review): this only drops the Brazil rows; it does not isolate the
  # collapsed levels -- confirm this is the intended filter.
  filter(Country.of.Bean.Origin != "Brazil")
## # A tibble: 2,575 × 2
## Country.of.Bean.Origin Country.of.Bean.Origin_col
## <chr> <fct>
## 1 Tanzania Tanzania
## 2 Dominican Republic Dominican Republic
## 3 Madagascar Madagascar
## 4 Fiji Fiji
## 5 Venezuela West Coast
## 6 Uganda Uganda
## 7 India India
## 8 Venezuela West Coast
## 9 Venezuela West Coast
## 10 Bolivia Bolivia
## # … with 2,565 more rows
# Lump small levels into other levels.
# Start by counting rows per bean origin to see how large each level is.
count(data, Country.of.Bean.Origin)
## # A tibble: 64 × 2
## Country.of.Bean.Origin n
## <chr> <int>
## 1 Australia 3
## 2 Bali 1
## 3 Belize 80
## 4 Blend 157
## 5 Bolivia 83
## 6 Brazil 82
## 7 Burma 1
## 8 Cameroon 3
## 9 China 1
## 10 Colombia 82
## # … with 54 more rows
# fct_lump(): apply lumping with its default settings, then list the
# distinct values of the lumped column.
# NOTE(review): the printed result still has 64 rows, the same number that
# distinct() returns for the raw column, so the defaults appear to lump
# little or nothing here. Consider passing `n` or `prop` -- confirm intent.
data %>%
  mutate(Country.of.Bean.Origin_lump = fct_lump(Country.of.Bean.Origin)) %>%
  distinct(Country.of.Bean.Origin_lump)
## # A tibble: 64 × 1
## Country.of.Bean.Origin_lump
## <fct>
## 1 Tanzania
## 2 Dominican Republic
## 3 Madagascar
## 4 Fiji
## 5 Venezuela
## 6 Uganda
## 7 India
## 8 Bolivia
## 9 Peru
## 10 Panama
## # … with 54 more rows