library(dplyr)
## Warning: package 'dplyr' was built under R version 4.4.2
##
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
##
## filter, lag
## The following objects are masked from 'package:base':
##
## intersect, setdiff, setequal, union
library(stringr)
library(tidyr)
library(ggplot2)
## Warning: package 'ggplot2' was built under R version 4.4.2
library(forcats)
# Download the raw cheese data set (CSV) from the project repository.
url <- "https://raw.githubusercontent.com/j-song-npc/607-Project-2-/refs/heads/main/cheeses.csv"
cheeses_df <- read.csv(url)

# View() opens an interactive data viewer, so only call it when a person is
# driving the session; when the script is run non-interactively (Rscript,
# knitting) there may be no viewer available.
if (interactive()) {
  View(cheeses_df)
}
# Largest number of comma-separated countries listed for any single cheese.
# A missing value is treated as an empty string, i.e. one (empty) entry.
max_country <- cheeses_df %>%
  mutate(country = replace_na(country, "")) %>%
  mutate(n_country = str_count(country, ",") + 1) %>%
  summarize(max(n_country)) %>%
  pull()

# Split `country` into country1, country2, ...; rows with fewer countries are
# padded with NA on the right. The column count comes from the data instead of
# a hard-coded 8, so a row with more countries is never truncated.
cheeses <- cheeses_df %>%
  separate(
    country,
    into = paste0("country", seq_len(max_country)),
    sep = ",",
    fill = "right"
  )
# Largest number of comma-separated milk sources listed for any single cheese.
# Missing values are replaced with "" first so that max() cannot return NA.
max_milk <- cheeses %>%
  mutate(milk = replace_na(milk, "")) %>%
  mutate(milk = str_count(milk, ",") + 1) %>%
  summarize(max(milk)) %>%
  pull()

# Split `milk` into milk1, milk2, ... using the computed width (previously the
# width was hard-coded to 4 and `max_milk` was never used).
cheeses <- cheeses %>%
  separate(
    milk,
    into = paste0("milk", seq_len(max_milk)),
    sep = ",",
    fill = "right"
  )
# Largest number of comma-separated textures listed for any single cheese.
# Missing values are replaced with "" first (as is done for type, flavor and
# aroma); otherwise a single NA would make max() return NA and the column
# sequence below would fail.
max_texture <- cheeses_df %>%
  mutate(texture = replace_na(texture, "")) %>%
  mutate(texture = str_count(texture, ",") + 1) %>%
  summarize(max(texture)) %>%
  pull()

# Split `texture` into texture1, texture2, ...; shorter rows are padded with NA.
cheeses <- cheeses %>%
  separate(
    texture,
    into = paste0("texture", seq_len(max_texture)),
    sep = ",",
    fill = "right"
  )
# Widest `type` entry: number of commas plus one, with a missing value counted
# as a single empty entry.
max_type <- max(
  str_count(replace_na(pull(cheeses_df, type), ""), ",") + 1
)

# Spread `type` over type1, type2, ...; rows with fewer entries get NA on the
# right.
cheeses <- separate(
  cheeses,
  type,
  into = paste0("type", seq_len(max_type)),
  sep = ",",
  fill = "right"
)
# Widest `flavor` entry: number of commas plus one, with a missing value
# counted as a single empty entry.
max_flavor <- max(
  str_count(replace_na(pull(cheeses_df, flavor), ""), ",") + 1
)

# Spread `flavor` over flavor1, flavor2, ...; rows with fewer entries get NA on
# the right.
cheeses <- separate(
  cheeses,
  flavor,
  into = paste0("flavor", seq_len(max_flavor)),
  sep = ",",
  fill = "right"
)
# Widest `aroma` entry: number of commas plus one, with a missing value counted
# as a single empty entry.
max_aroma <- max(
  str_count(replace_na(pull(cheeses_df, aroma), ""), ",") + 1
)

# Spread `aroma` over aroma1, aroma2, ...; rows with fewer entries get NA on
# the right.
cheeses <- separate(
  cheeses,
  aroma,
  into = paste0("aroma", seq_len(max_aroma)),
  sep = ",",
  fill = "right"
)
# Widest `alt_spellings` entry: number of commas plus one, with a missing value
# counted as a single empty entry.
max_alt <- max(
  str_count(replace_na(pull(cheeses_df, alt_spellings), ""), ",") + 1
)

# Spread `alt_spellings` over alt_spellings1, alt_spellings2, ...; rows with
# fewer entries get NA on the right.
cheeses <- separate(
  cheeses,
  alt_spellings,
  into = paste0("alt_spellings", seq_len(max_alt)),
  sep = ",",
  fill = "right"
)
# Tidy every character column in a single pass: remove any remaining commas,
# trim surrounding whitespace, and turn strings that end up empty into NA.
cheeses <- mutate(
  cheeses,
  across(
    where(is.character),
    \(col) na_if(str_trim(str_replace_all(col, ",", "")), "")
  )
)
For my analysis, I wanted to compare flavor profiles by country. Because there are too many countries to show at once, I will focus on the ten countries whose cheeses have the greatest variety of flavors.
# Reshape to long form: first one row per non-missing flavor slot, then one row
# per non-missing country slot (starts_with() ignores case by default, so
# "Country" matches the country1, country2, ... columns). Finally, fold
# regional names into a single label per country.
cheeses_long <- pivot_longer(
  cheeses,
  cols = starts_with("flavor"),
  names_to = "flavor_type",
  values_to = "flavor",
  values_drop_na = TRUE
) %>%
  pivot_longer(
    cols = starts_with("Country"),
    names_to = "Country_num",
    values_to = "Country",
    values_drop_na = TRUE
  ) %>%
  mutate(
    country_combined = case_when(
      Country %in% c(
        "England", "Scotland", "Wales", "Great Britain", "United Kingdom"
      ) ~ "United Kingdom",
      Country %in% c("Holland", "Netherlands") ~ "Netherlands",
      TRUE ~ Country
    )
  )
# Count the distinct flavors recorded for each (combined) country and keep the
# ten countries with the highest counts.
# slice_max() replaces the superseded top_n(); like top_n() it keeps ties, so
# more than ten rows can be returned when countries share the 10th-place count.
cheese_flavor <- cheeses_long %>%
  group_by(country_combined) %>%
  summarize(flavor_count = n_distinct(flavor), .groups = "drop") %>%
  slice_max(flavor_count, n = 10) %>%
  arrange(desc(flavor_count))
# Horizontal bar chart of distinct flavor counts, one bar per country, with
# bars ordered by count.
ggplot(
  cheese_flavor,
  aes(
    x = reorder(country_combined, flavor_count),
    y = flavor_count,
    fill = country_combined
  )
) +
  geom_col() +
  coord_flip() +
  labs(
    title = "Top 10 Countries with the Most Unique Cheese Flavors",
    x = "Country",
    y = "Number of Unique Flavors"
  ) +
  theme_minimal()
From the data, we can see that cheeses from the United Kingdom have the largest number of unique flavors.