Context

Chocolate is one of the most popular candies in the world. This dataset contains expert ratings of over 1,700 individual chocolate bars, along with information on their regional origin, percentage of cocoa, the variety of chocolate bean used and where the beans were grown.

Flavors of Cacao Rating System:

5= Elite (Transcending beyond the ordinary limits)

4= Premium (Superior flavor development, character and style)

3= Satisfactory(3.0) to praiseworthy(3.75) (well made with special qualities)

2= Disappointing (Passable but contains at least one significant flaw)

1= Unpleasant (mostly unpalatable)

Data source

These ratings were compiled by Brady Brelinski, Founding Member of the Manhattan Chocolate Society.

The dataset is publicly available on Kaggle.

Findings

Process

Installing and loading packages

library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5     v purrr   0.3.4
## v tibble  3.1.5     v dplyr   1.0.7
## v tidyr   1.1.4     v stringr 1.4.0
## v readr   2.0.2     v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag()    masks stats::lag()
library(janitor)
## Warning: package 'janitor' was built under R version 4.1.2
## 
## Attaching package: 'janitor'
## The following objects are masked from 'package:stats':
## 
##     chisq.test, fisher.test
library(ggplot2)
library(knitr)
## Warning: package 'knitr' was built under R version 4.1.2

Import Data

# Load the raw Flavors of Cacao ratings from the CSV file.
flavors_df <- readr::read_csv(file = "flavors_of_cacao.csv")
## Rows: 1795 Columns: 9
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (6): Company 
## (Maker-if known), Specific Bean Origin
## or Bar Name, Cocoa
## ...
## dbl (3): REF, Review
## Date, Rating
## 
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.

Data summary

# List the raw column names exactly as read from the CSV header.
names(flavors_df)
## [1] "Company \n(Maker-if known)"        "Specific Bean Origin\nor Bar Name"
## [3] "REF"                               "Review\nDate"                     
## [5] "Cocoa\nPercent"                    "Company\nLocation"                
## [7] "Rating"                            "Bean\nType"                       
## [9] "Broad Bean\nOrigin"
# Compact overview of each column's type and first few values.
flavors_df %>%
  glimpse()
## Rows: 1,795
## Columns: 9
## $ `Company \n(Maker-if known)`        <chr> "A. Morin", "A. Morin", "A. Morin"~
## $ `Specific Bean Origin\nor Bar Name` <chr> "Agua Grande", "Kpime", "Atsane", ~
## $ REF                                 <dbl> 1876, 1676, 1676, 1680, 1704, 1315~
## $ `Review\nDate`                      <dbl> 2016, 2015, 2015, 2015, 2015, 2014~
## $ `Cocoa\nPercent`                    <chr> "63%", "70%", "70%", "70%", "70%",~
## $ `Company\nLocation`                 <chr> "France", "France", "France", "Fra~
## $ Rating                              <dbl> 3.75, 2.75, 3.00, 3.50, 3.50, 2.75~
## $ `Bean\nType`                        <chr> " ", " ", " ", " ", " ", "Criollo"~
## $ `Broad Bean\nOrigin`                <chr> "Sao Tome", "Togo", "Togo", "Togo"~

Data Cleaning

The column names contain spaces and embedded newline characters (for example "Company \n(Maker-if known)"), so janitor's clean_names() function is used to convert them into consistent snake_case names.

# Normalise the raw headers into snake_case identifiers with janitor.
cleaned_df <- flavors_df %>%
  clean_names()
cleaned_df %>%
  glimpse()
## Rows: 1,795
## Columns: 9
## $ company_maker_if_known           <chr> "A. Morin", "A. Morin", "A. Morin", "~
## $ specific_bean_origin_or_bar_name <chr> "Agua Grande", "Kpime", "Atsane", "Ak~
## $ ref                              <dbl> 1876, 1676, 1676, 1680, 1704, 1315, 1~
## $ review_date                      <dbl> 2016, 2015, 2015, 2015, 2015, 2014, 2~
## $ cocoa_percent                    <chr> "63%", "70%", "70%", "70%", "70%", "7~
## $ company_location                 <chr> "France", "France", "France", "France~
## $ rating                           <dbl> 3.75, 2.75, 3.00, 3.50, 3.50, 2.75, 3~
## $ bean_type                        <chr> " ", " ", " ", " ", " ", "Criollo", "~
## $ broad_bean_origin                <chr> "Sao Tome", "Togo", "Togo", "Togo", "~
# Confirm the cleaned column names.
names(cleaned_df)
## [1] "company_maker_if_known"           "specific_bean_origin_or_bar_name"
## [3] "ref"                              "review_date"                     
## [5] "cocoa_percent"                    "company_location"                
## [7] "rating"                           "bean_type"                       
## [9] "broad_bean_origin"
# Shorten the maker column's name to `Company`; every other column is kept.
renamed_cleaned_df <- rename(cleaned_df, Company = company_maker_if_known)
renamed_cleaned_df %>%
  glimpse()
## Rows: 1,795
## Columns: 9
## $ Company                          <chr> "A. Morin", "A. Morin", "A. Morin", "~
## $ specific_bean_origin_or_bar_name <chr> "Agua Grande", "Kpime", "Atsane", "Ak~
## $ ref                              <dbl> 1876, 1676, 1676, 1680, 1704, 1315, 1~
## $ review_date                      <dbl> 2016, 2015, 2015, 2015, 2015, 2014, 2~
## $ cocoa_percent                    <chr> "63%", "70%", "70%", "70%", "70%", "7~
## $ company_location                 <chr> "France", "France", "France", "France~
## $ rating                           <dbl> 3.75, 2.75, 3.00, 3.50, 3.50, 2.75, 3~
## $ bean_type                        <chr> " ", " ", " ", " ", " ", "Criollo", "~
## $ broad_bean_origin                <chr> "Sao Tome", "Togo", "Togo", "Togo", "~
# Keep only the four columns used by the company-level analysis.
trimmed_RC_df <- select(
  renamed_cleaned_df,
  Company,
  cocoa_percent,
  rating,
  company_location
)
head(trimmed_RC_df)
## # A tibble: 6 x 4
##   Company  cocoa_percent rating company_location
##   <chr>    <chr>          <dbl> <chr>           
## 1 A. Morin 63%             3.75 France          
## 2 A. Morin 70%             2.75 France          
## 3 A. Morin 70%             3    France          
## 4 A. Morin 70%             3.5  France          
## 5 A. Morin 70%             3.5  France          
## 6 A. Morin 70%             2.75 France
# Count missing values in each column the analysis relies on.
sum(is.na(trimmed_RC_df[["cocoa_percent"]]))
## [1] 0
sum(is.na(trimmed_RC_df[["rating"]]))
## [1] 0
sum(is.na(trimmed_RC_df[["company_location"]]))
## [1] 0
# Standard deviation and maximum of the ratings across all reviewed bars.
summarise(
  trimmed_RC_df,
  sd = sd(rating),
  max = max(rating)
)
## # A tibble: 1 x 2
##      sd   max
##   <dbl> <dbl>
## 1 0.478     5
# Select the "best" bars: at least 80% cocoa and a rating of 3.75 or higher.
#
# `cocoa_percent` is stored as text such as "63%", so comparing it directly
# with 80 would be a string comparison (under which "100%" sorts below "80").
# parse_number() extracts the numeric value so the threshold is applied
# numerically; the column itself is left as text.
best_trimmed_RC_df <- trimmed_RC_df %>%
  filter(
    parse_number(cocoa_percent) >= 80,
    rating >= 3.75
  )
head(best_trimmed_RC_df)
## # A tibble: 6 x 4
##   Company             cocoa_percent rating company_location
##   <chr>               <chr>          <dbl> <chr>           
## 1 Chocolate Makers    80%             3.75 Amsterdam       
## 2 Chocolate Tree, The 80%             3.75 Scotland        
## 3 Ethereal            80%             3.75 U.S.A.          
## 4 Potomac             82%             3.75 U.S.A.          
## 5 Pralus              80%             4    France          
## 6 Rogue               80%             3.75 U.S.A.

Share

The companies that produce the highest-rated chocolate

# Figure 1: number of best-rated bars per company, stacked by rating.
#
# `rating` is numeric; wrapping it in factor() makes each rating value its own
# fill group, so every bar is split and coloured by rating level.
ggplot(data = best_trimmed_RC_df) +
  geom_bar(mapping = aes(x = Company, fill = factor(rating))) +
  theme(axis.text.x = element_text(angle = 10)) +
  labs(
    title = "Companies produce the highest rating of chocolate",
    x = "Company",
    y = "Count",
    fill = "Rating"
  )

Figure 1: Companies that produce the highest-rated chocolate

Figure 1 shows that Pralus and Soma produce the highest-rated chocolate.

Comparison by Companies

# Figure 2: highest cocoa percentage each company uses among its best bars.
#
# `cocoa_percent` is text ("80%"), so it is parsed to a number before being
# used as the bar height. The data are reduced to one value per company so
# geom_col() draws a single bar per company rather than stacking several rows.
# A single panel is used because the company is already shown on the axis.
best_trimmed_RC_df %>%
  group_by(Company) %>%
  summarise(max_cocoa_percent = max(parse_number(cocoa_percent))) %>%
  ggplot() +
  geom_col(
    mapping = aes(
      x = reorder(Company, max_cocoa_percent),
      y = max_cocoa_percent
    )
  ) +
  coord_flip() +
  labs(title = "Comparison by Companies", x = "Companies", y = "Cocoa Percent")

Figure 2 : Comparison in Cocoa Percent by Companies

Figure 2 shows the cocoa percentages that the companies use in their best-rated chocolate. Videri and Soma use higher cocoa percentages than the other companies, at 90% and 88%, respectively.

What cocoa percent and rating determine the best chocolate

# Figure 3: rating against cocoa percentage for the best bars.
#
# `cocoa_percent` is text ("80%"); parse_number() turns it into a number so
# the x axis is continuous and points are spaced by their actual value.
ggplot(data = best_trimmed_RC_df) +
  geom_point(mapping = aes(x = parse_number(cocoa_percent), y = rating)) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(title = "Best Chocolates", x = "Cocoa Percent", y = "Rating")

Figure 3 : What cocoa percent and rating determine the best chocolate

Figure 3 shows that the highest-rated bars in this subset have a cocoa percentage of either 80% or 88%.

The countries that produce the highest-rated chocolate

# Figure 4: number of best-rated bars per company location, stacked by rating.
#
# A numeric rating does not split the counted bars into groups, so it is
# converted to a factor and mapped to fill: each bar is then divided into one
# coloured segment per rating level.
ggplot(data = best_trimmed_RC_df) +
  geom_bar(mapping = aes(x = company_location, fill = factor(rating))) +
  theme(axis.text.x = element_text(angle = 90)) +
  labs(
    title = "Countries produce the highest rating of chocolate",
    x = "Countries",
    y = "Count",
    fill = "Rating"
  )

Figure 4: Countries that produce the highest-rated chocolate

Figure 4 shows that France and Canada produce the highest-rated chocolate.

Which countries are the largest producer of chocolate

# Tally reviewed bars per company location, most frequent location first.
arranged_trimmed_RC_df <- count(trimmed_RC_df, company_location, sort = TRUE)
head(arranged_trimmed_RC_df)
## # A tibble: 6 x 2
##   company_location     n
##   <chr>            <int>
## 1 U.S.A.             764
## 2 France             156
## 3 Canada             125
## 4 U.K.                96
## 5 Italy               63
## 6 Ecuador             54
# Keep only the locations with more than 60 reviewed bars.
countries_most <- filter(arranged_trimmed_RC_df, n > 60)
head(countries_most)
## # A tibble: 5 x 2
##   company_location     n
##   <chr>            <int>
## 1 U.S.A.             764
## 2 France             156
## 3 Canada             125
## 4 U.K.                96
## 5 Italy               63
# Figure 5: number of reviewed bars per company location, for the locations
# kept in `countries_most`, ordered from fewest to most bars.
ggplot(data = countries_most) +
  geom_col(mapping = aes(x = reorder(company_location, n), y = n)) +
  labs(
    title = "Which countries are the largest producer of chocolate",
    x = "Countries",
    y = "Count"
  )

Figure 5 : Which countries are the largest producer of chocolate

Figure 5 shows that companies located in the U.S.A. account for the most rated bars in this dataset (764), followed by France (156) and Canada (125).

The Best Bean Grow Origin

# Keep the bean origin, bean type and rating for the bean-level analysis.
bean_df <- select(renamed_cleaned_df, broad_bean_origin, bean_type, rating)
head(bean_df)
## # A tibble: 6 x 3
##   broad_bean_origin bean_type rating
##   <chr>             <chr>      <dbl>
## 1 Sao Tome                      3.75
## 2 Togo                          2.75
## 3 Togo                          3   
## 4 Togo                          3.5 
## 5 Peru                          3.5 
## 6 Venezuela         Criollo     2.75
# Keep only the bars rated above 3.8.
best_bean_df <- filter(bean_df, rating > 3.8)
head(best_bean_df)
## # A tibble: 6 x 3
##   broad_bean_origin bean_type           rating
##   <chr>             <chr>                <dbl>
## 1 Peru                                       4
## 2 Venezuela         Trinitario               4
## 3 Peru                                       4
## 4 Papua New Guinea                           4
## 5 Ecuador                                    4
## 6 Venezuela         Criollo (Porcelana)      4
# Figure 6: count of top-rated bars per bean origin, coloured by bean type,
# drawn as horizontal bars.
ggplot(data = best_bean_df) +
  geom_bar(mapping = aes(x = broad_bean_origin, fill = bean_type)) +
  coord_flip() +
  labs(title = "The Best Bean Grow Origin") +
  theme(axis.text.x = element_text(angle = 90, vjust = 0.7))

Figure 6: Origins of the beans used in the best-rated bars

Figure 6 shows that the best-rated bars (rating above 3.8) most often use beans grown in Venezuela, Peru and Madagascar.

The relationship between Rating and Cocoa Percent

# Keep just the two variables whose relationship is examined.
relation_cocoa_df <- select(renamed_cleaned_df, rating, cocoa_percent)
head(relation_cocoa_df)
## # A tibble: 6 x 2
##   rating cocoa_percent
##    <dbl> <chr>        
## 1   3.75 63%          
## 2   2.75 70%          
## 3   3    70%          
## 4   3.5  70%          
## 5   3.5  70%          
## 6   2.75 70%
# Order the bars by cocoa content, highest first.
#
# `cocoa_percent` is text ("70%"), so sorting the column directly would order
# it alphabetically ("100%" before "42%"). Sorting on the parsed number gives
# a true numeric order; the column itself is left as text.
arranged_data_df <- relation_cocoa_df %>%
  arrange(desc(parse_number(cocoa_percent)))
head(arranged_data_df)
## # A tibble: 6 x 2
##   rating cocoa_percent
##    <dbl> <chr>        
## 1   1.75 100%         
## 2   1.5  100%         
## 3   1.5  100%         
## 4   3.5  100%         
## 5   1    100%         
## 6   1.5  100%
# Figure 7: cocoa percentage against rating for every reviewed bar.
#
# `cocoa_percent` is text ("70%"); parse_number() converts it to a number so
# the y axis is continuous rather than one category per distinct string.
ggplot(data = arranged_data_df) +
  geom_point(mapping = aes(x = rating, y = parse_number(cocoa_percent))) +
  labs(
    title = "The relationship between rating and cocoa percent",
    x = "Rating",
    y = "Cocoa Percent"
  )

Figure 7 : The relationship between rating and cocoa percent

Figure 7 shows no clear relationship between rating and cocoa percent.