data <- read_excel("../01_module4/data/MyData.xlsx")
data
## # A tibble: 2,657 × 10
## REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019 Tanzan… Kokoa … 0.76 3- B,S… rich c… 3.25
## 2 2458 5150 U.S.A. 2019 Domini… Zorzal… 0.76 3- B,S… cocoa,… 3.5
## 3 2454 5150 U.S.A. 2019 Madaga… Bejofo… 0.76 3- B,S… cocoa,… 3.75
## 4 2542 5150 U.S.A. 2021 Fiji Matasa… 0.68 3- B,S… chewy,… 3
## 5 2546 5150 U.S.A. 2021 Venezu… Sur de… 0.72 3- B,S… fatty,… 3
## 6 2546 5150 U.S.A. 2021 Uganda Semuli… 0.8 3- B,S… mildly… 3.25
## 7 2542 5150 U.S.A. 2021 India Anamal… 0.68 3- B,S… milk b… 3.5
## 8 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sandy,… 2.75
## 9 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sl. dr… 3
## 10 797 A. Morin France 2012 Bolivia Bolivia 0.7 4- B,S… vegeta… 3.5
## # … with 2,647 more rows, and abbreviated variable names ¹Company.Manufacturer,
## # ²Company.Location, ³Review.Date, ⁴Country.of.Bean.Origin,
## # ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent, ⁷Ingredients,
## # ⁸Most.Memorable.Characteristics
data %>% glimpse()
## Rows: 2,657
## Columns: 10
## $ REF <dbl> 2454, 2458, 2454, 2542, 2546, 2546, 2…
## $ Company.Manufacturer <chr> "5150", "5150", "5150", "5150", "5150…
## $ Company.Location <chr> "U.S.A.", "U.S.A.", "U.S.A.", "U.S.A.…
## $ Review.Date <dbl> 2019, 2019, 2019, 2021, 2021, 2021, 2…
## $ Country.of.Bean.Origin <chr> "Tanzania", "Dominican Republic", "Ma…
## $ Specific.Bean.Origin.or.Bar.Name <chr> "Kokoa Kamili, batch 1 ", "Zorzal, ba…
## $ Cocoa.Percent <dbl> 0.76, 0.76, 0.76, 0.68, 0.72, 0.80, 0…
## $ Ingredients <chr> "3- B,S,C", "3- B,S,C", "3- B,S,C", "…
## $ Most.Memorable.Characteristics <chr> "rich cocoa, fatty, bready", "cocoa, …
## $ Rating <dbl> 3.25, 3.50, 3.75, 3.00, 3.00, 3.25, 3…
data_small <- data %>% select(where(is.character)) %>% head(n = 10)
data_small %>%
summarise(sum(str_detect(Company.Location, "U.S.A.$")))
## # A tibble: 1 × 1
## `sum(str_detect(Company.Location, "U.S.A.$"))`
## <int>
## 1 7
str_detect(data_small$Company.Location, "U.S.A.$")
## [1] TRUE TRUE TRUE TRUE TRUE TRUE TRUE FALSE FALSE FALSE
sum(str_detect(data_small$Company.Location, "U.S.A.$"))
## [1] 7
data_small %>%
mutate(is_U.S.A. = str_detect(Company.Location, "U.S.A."))
## # A tibble: 10 × 7
## Company.Manufacturer Company.Location Count…¹ Speci…² Ingre…³ Most.…⁴ is_U.…⁵
## <chr> <chr> <chr> <chr> <chr> <chr> <lgl>
## 1 5150 U.S.A. Tanzan… Kokoa … 3- B,S… rich c… TRUE
## 2 5150 U.S.A. Domini… Zorzal… 3- B,S… cocoa,… TRUE
## 3 5150 U.S.A. Madaga… Bejofo… 3- B,S… cocoa,… TRUE
## 4 5150 U.S.A. Fiji Matasa… 3- B,S… chewy,… TRUE
## 5 5150 U.S.A. Venezu… Sur de… 3- B,S… fatty,… TRUE
## 6 5150 U.S.A. Uganda Semuli… 3- B,S… mildly… TRUE
## 7 5150 U.S.A. India Anamal… 3- B,S… milk b… TRUE
## 8 20N | 20S France Venezu… Chuao,… 2- B,S sandy,… FALSE
## 9 20N | 20S France Venezu… Chuao,… 2- B,S sl. dr… FALSE
## 10 A. Morin France Bolivia Bolivia 4- B,S… vegeta… FALSE
## # … with abbreviated variable names ¹Country.of.Bean.Origin,
## # ²Specific.Bean.Origin.or.Bar.Name, ³Ingredients,
## # ⁴Most.Memorable.Characteristics, ⁵is_U.S.A.
data_small %>%
mutate(is_U.S.A. = str_extract(Company.Location, "U.S.A."))
## # A tibble: 10 × 7
## Company.Manufacturer Company.Location Count…¹ Speci…² Ingre…³ Most.…⁴ is_U.…⁵
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 5150 U.S.A. Tanzan… Kokoa … 3- B,S… rich c… U.S.A.
## 2 5150 U.S.A. Domini… Zorzal… 3- B,S… cocoa,… U.S.A.
## 3 5150 U.S.A. Madaga… Bejofo… 3- B,S… cocoa,… U.S.A.
## 4 5150 U.S.A. Fiji Matasa… 3- B,S… chewy,… U.S.A.
## 5 5150 U.S.A. Venezu… Sur de… 3- B,S… fatty,… U.S.A.
## 6 5150 U.S.A. Uganda Semuli… 3- B,S… mildly… U.S.A.
## 7 5150 U.S.A. India Anamal… 3- B,S… milk b… U.S.A.
## 8 20N | 20S France Venezu… Chuao,… 2- B,S sandy,… <NA>
## 9 20N | 20S France Venezu… Chuao,… 2- B,S sl. dr… <NA>
## 10 A. Morin France Bolivia Bolivia 4- B,S… vegeta… <NA>
## # … with abbreviated variable names ¹Country.of.Bean.Origin,
## # ²Specific.Bean.Origin.or.Bar.Name, ³Ingredients,
## # ⁴Most.Memorable.Characteristics, ⁵is_U.S.A.
data_small %>%
mutate(is_U.S.A. = str_replace(Company.Location, "U.S.A.", "USA"))
## # A tibble: 10 × 7
## Company.Manufacturer Company.Location Count…¹ Speci…² Ingre…³ Most.…⁴ is_U.…⁵
## <chr> <chr> <chr> <chr> <chr> <chr> <chr>
## 1 5150 U.S.A. Tanzan… Kokoa … 3- B,S… rich c… USA
## 2 5150 U.S.A. Domini… Zorzal… 3- B,S… cocoa,… USA
## 3 5150 U.S.A. Madaga… Bejofo… 3- B,S… cocoa,… USA
## 4 5150 U.S.A. Fiji Matasa… 3- B,S… chewy,… USA
## 5 5150 U.S.A. Venezu… Sur de… 3- B,S… fatty,… USA
## 6 5150 U.S.A. Uganda Semuli… 3- B,S… mildly… USA
## 7 5150 U.S.A. India Anamal… 3- B,S… milk b… USA
## 8 20N | 20S France Venezu… Chuao,… 2- B,S sandy,… France
## 9 20N | 20S France Venezu… Chuao,… 2- B,S sl. dr… France
## 10 A. Morin France Bolivia Bolivia 4- B,S… vegeta… France
## # … with abbreviated variable names ¹Country.of.Bean.Origin,
## # ²Specific.Bean.Origin.or.Bar.Name, ³Ingredients,
## # ⁴Most.Memorable.Characteristics, ⁵is_U.S.A.
I started by detecting company location and found to be true 7 companies located in the USA. I then extracted the companies located in the USA and produced them in a separate column called is_U.S.A. Lastly I replaced the company location of U.S.A. to just USA is the new column of is_U.S.A.