library(tidyverse)
## -- Attaching packages --------------------------------------- tidyverse 1.3.1 --
## v ggplot2 3.3.5 v purrr 0.3.4
## v tibble 3.1.4 v dplyr 1.0.7
## v tidyr 1.1.3 v stringr 1.4.0
## v readr 2.0.1 v forcats 0.5.1
## -- Conflicts ------------------------------------------ tidyverse_conflicts() --
## x dplyr::filter() masks stats::filter()
## x dplyr::lag() masks stats::lag()
library(tinytex)
library(ggplot2)
library(dplyr)
library(hrbrthemes)
## NOTE: Either Arial Narrow or Roboto Condensed fonts are required to use these themes.
## Please use hrbrthemes::import_roboto_condensed() to install Roboto Condensed and
## if Arial Narrow is not on your system, please see https://bit.ly/arialnarrow
library(viridis)
## Loading required package: viridisLite
library(RColorBrewer)
library(tidyr)
library(treemap)
# Load the chocolate-bar ratings data.
# The file is read via a path relative to the working directory instead of
# calling setwd() with a machine-specific absolute path, so the script runs
# unchanged on any machine as long as it is run/knit from the folder that
# contains chocolate.csv.
chocolate <- read_csv("chocolate.csv")
## New names:
## * `` -> ...1
## Rows: 2224 Columns: 21
## -- Column specification --------------------------------------------------------
## Delimiter: ","
## chr (15): company, company_location, country_of_bean_origin, specific_bean_o...
## dbl (6): ...1, ref, review_date, cocoa_percent, rating, counts_of_ingredients
##
## i Use `spec()` to retrieve the full column specification for this data.
## i Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Print the tibble to preview its first rows and column types.
chocolate
## # A tibble: 2,224 x 21
## ...1 ref company company_location review_date country_of_bean_origin
## <dbl> <dbl> <chr> <chr> <dbl> <chr>
## 1 0 2454 5150 U.S.A 2019 Madagascar
## 2 1 2458 5150 U.S.A 2019 Dominican republic
## 3 2 2454 5150 U.S.A 2019 Tanzania
## 4 3 797 A. Morin France 2012 Peru
## 5 4 797 A. Morin France 2012 Bolivia
## 6 5 1015 A. Morin France 2013 Venezuela
## 7 6 1019 A. Morin France 2013 Peru
## 8 7 1011 A. Morin France 2013 Ecuador
## 9 8 1019 A. Morin France 2013 Peru
## 10 9 1011 A. Morin France 2013 Brazil
## # ... with 2,214 more rows, and 15 more variables:
## # specific_bean_origin_or_bar_name <chr>, cocoa_percent <dbl>, rating <dbl>,
## # counts_of_ingredients <dbl>, beans <chr>, cocoa_butter <chr>,
## # vanilla <chr>, lecithin <chr>, salt <chr>, sugar <chr>,
## # sweetener_without_sugar <chr>, first_taste <chr>, second_taste <chr>,
## # third_taste <chr>, fourth_taste <chr>
# Keep only the eight columns used further down, then group the rows by
# company and rating. Column names are written as bare names throughout.
chocolate2 <- chocolate %>%
  select(
    company,
    company_location,
    country_of_bean_origin,
    review_date,
    cocoa_percent,
    rating,
    counts_of_ingredients,
    cocoa_butter
  ) %>%
  group_by(company, rating)
#head(chocolate2)
chocolate2
## # A tibble: 2,224 x 8
## # Groups: company, rating [1,275]
## company company_location country_of_bean_o~ review_date cocoa_percent rating
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 5150 U.S.A Madagascar 2019 76 3.75
## 2 5150 U.S.A Dominican republic 2019 76 3.5
## 3 5150 U.S.A Tanzania 2019 76 3.25
## 4 A. Morin France Peru 2012 63 3.75
## 5 A. Morin France Bolivia 2012 70 3.5
## 6 A. Morin France Venezuela 2013 70 4
## 7 A. Morin France Peru 2013 63 4
## 8 A. Morin France Ecuador 2013 70 3.75
## 9 A. Morin France Peru 2013 70 3.5
## 10 A. Morin France Brazil 2013 70 3.25
## # ... with 2,214 more rows, and 2 more variables: counts_of_ingredients <dbl>,
## # cocoa_butter <chr>
Because there are many chocolate companies around the world, I will focus on companies located in just a few countries.
#chocolate2 <- chocolate2 %>%
#filter( company =='Amano'|company =='Amedei'|company =='Tribe'| company #=='Valrhona' |company =='Szanto Tibor'|company =='Zotter')
#chocolate2
# Restrict the data to companies located in six selected countries.
# `%in%` replaces the long chain of `==` / `|` comparisons; the same rows are
# kept (rows with a missing location are dropped in both forms).
chocolate2 <- chocolate2 %>%
  filter(
    company_location %in%
      c("U.S.A", "Italy", "Canada", "France", "Switzerland", "Ecuador")
  )
chocolate2
## # A tibble: 1,433 x 8
## # Groups: company, rating [771]
## company company_location country_of_bean_o~ review_date cocoa_percent rating
## <chr> <chr> <chr> <dbl> <dbl> <dbl>
## 1 5150 U.S.A Madagascar 2019 76 3.75
## 2 5150 U.S.A Dominican republic 2019 76 3.5
## 3 5150 U.S.A Tanzania 2019 76 3.25
## 4 A. Morin France Peru 2012 63 3.75
## 5 A. Morin France Bolivia 2012 70 3.5
## 6 A. Morin France Venezuela 2013 70 4
## 7 A. Morin France Peru 2013 63 4
## 8 A. Morin France Ecuador 2013 70 3.75
## 9 A. Morin France Peru 2013 70 3.5
## 10 A. Morin France Brazil 2013 70 3.25
## # ... with 1,423 more rows, and 2 more variables: counts_of_ingredients <dbl>,
## # cocoa_butter <chr>
# Inspect the structure of the filtered data. Because chocolate2 is a grouped
# tibble, str() also prints its "groups" attribute, including the row indices
# of every group.
str(chocolate2)
## grouped_df [1,433 x 8] (S3: grouped_df/tbl_df/tbl/data.frame)
## $ company : chr [1:1433] "5150" "5150" "5150" "A. Morin" ...
## $ company_location : chr [1:1433] "U.S.A" "U.S.A" "U.S.A" "France" ...
## $ country_of_bean_origin: chr [1:1433] "Madagascar" "Dominican republic" "Tanzania" "Peru" ...
## $ review_date : num [1:1433] 2019 2019 2019 2012 2012 ...
## $ cocoa_percent : num [1:1433] 76 76 76 63 70 70 63 70 70 70 ...
## $ rating : num [1:1433] 3.75 3.5 3.25 3.75 3.5 4 4 3.75 3.5 3.25 ...
## $ counts_of_ingredients : num [1:1433] 3 3 3 4 4 4 3 4 4 4 ...
## $ cocoa_butter : chr [1:1433] "have_cocoa_butter" "have_cocoa_butter" "have_cocoa_butter" "have_cocoa_butter" ...
## - attr(*, "groups")= tibble [771 x 3] (S3: tbl_df/tbl/data.frame)
## ..$ company: chr [1:771] "5150" "5150" "5150" "A. Morin" ...
## ..$ rating : num [1:771] 3.25 3.5 3.75 2.75 3 3.25 3.5 3.75 4 3.25 ...
## ..$ .rows : list<int> [1:771]
## .. ..$ : int 3
## .. ..$ : int 2
## .. ..$ : int 1
## .. ..$ : int [1:4] 15 16 21 25
## .. ..$ : int [1:3] 13 14 24
## .. ..$ : int [1:3] 10 11 12
## .. ..$ : int [1:7] 5 9 19 20 22 23 28
## .. ..$ : int [1:5] 4 8 18 26 27
## .. ..$ : int [1:3] 6 7 17
## .. ..$ : int 32
## .. ..$ : int 31
## .. ..$ : int [1:2] 29 30
## .. ..$ : int 34
## .. ..$ : int 33
## .. ..$ : int [1:2] 35 36
## .. ..$ : int 37
## .. ..$ : int [1:2] 38 39
## .. ..$ : int [1:2] 41 42
## .. ..$ : int 40
## .. ..$ : int 43
## .. ..$ : int [1:3] 48 52 53
## .. ..$ : int [1:2] 50 51
## .. ..$ : int [1:4] 46 47 49 54
## .. ..$ : int [1:2] 44 45
## .. ..$ : int 58
## .. ..$ : int [1:3] 57 59 62
## .. ..$ : int 56
## .. ..$ : int [1:2] 55 61
## .. ..$ : int [1:2] 60 63
## .. ..$ : int 69
## .. ..$ : int [1:4] 65 66 67 68
## .. ..$ : int 64
## .. ..$ : int 82
## .. ..$ : int [1:5] 75 76 79 80 81
## .. ..$ : int [1:4] 72 73 74 78
## .. ..$ : int [1:3] 70 71 77
## .. ..$ : int 83
## .. ..$ : int 84
## .. ..$ : int 88
## .. ..$ : int 87
## .. ..$ : int [1:2] 85 86
## .. ..$ : int 97
## .. ..$ : int [1:2] 118 120
## .. ..$ : int [1:6] 96 107 108 109 110 119
## .. ..$ : int [1:12] 92 93 94 95 103 104 105 106 113 114 ...
## .. ..$ : int [1:5] 90 91 100 101 102
## .. ..$ : int [1:6] 89 98 99 111 112 117
## .. ..$ : int 122
## .. ..$ : int 121
## .. ..$ : int [1:4] 123 124 125 126
## .. ..$ : int 129
## .. ..$ : int 128
## .. ..$ : int 127
## .. ..$ : int 132
## .. ..$ : int [1:4] 131 134 135 136
## .. ..$ : int 130
## .. ..$ : int 133
## .. ..$ : int 139
## .. ..$ : int 138
## .. ..$ : int 137
## .. ..$ : int [1:4] 140 141 142 143
## .. ..$ : int 146
## .. ..$ : int [1:2] 144 145
## .. ..$ : int 147
## .. ..$ : int [1:2] 154 155
## .. ..$ : int [1:2] 150 151
## .. ..$ : int [1:3] 148 149 153
## .. ..$ : int 152
## .. ..$ : int [1:3] 157 158 159
## .. ..$ : int 156
## .. ..$ : int 172
## .. ..$ : int [1:5] 162 163 167 170 171
## .. ..$ : int [1:2] 166 169
## .. ..$ : int [1:4] 161 164 165 173
## .. ..$ : int [1:2] 160 168
## .. ..$ : int 177
## .. ..$ : int [1:2] 175 176
## .. ..$ : int 174
## .. ..$ : int [1:2] 179 180
## .. ..$ : int 178
## .. ..$ : int 186
## .. ..$ : int 185
## .. ..$ : int [1:3] 182 183 184
## .. ..$ : int 181
## .. ..$ : int 187
## .. ..$ : int 194
## .. ..$ : int 199
## .. ..$ : int [1:4] 193 196 197 198
## .. ..$ : int [1:3] 207 209 212
## .. ..$ : int [1:6] 203 204 205 206 208 213
## .. ..$ : int [1:5] 190 191 192 202 211
## .. ..$ : int [1:8] 188 189 195 200 201 210 214 215
## .. ..$ : int 216
## .. ..$ : int 217
## .. ..$ : int 219
## .. ..$ : int 218
## .. ..$ : int 223
## .. ..$ : int [1:2] 222 230
## .. ..$ : int [1:3] 220 221 228
## .. .. [list output truncated]
## .. ..@ ptype: int(0)
## ..- attr(*, ".drop")= logi TRUE
# I explored this, but it is not necessary for this visualization.
#data_new1 <- chocolate2[order(chocolate2$rating, decreasing = TRUE), ]
#data_new2 <- chocolate2 %>% # Top N highest values by group
#arrange(desc(rating)) %>%
#group_by(company) %>%
#slice(1:1)
# Use the filtered data unchanged as the input for the plot.
data_new2 <- chocolate2
It is time to create our scatterplot.
# Bubble scatterplot: review year on the x-axis, cocoa percentage on the
# y-axis, point size mapped to rating and point colour to company location.
ggplot(
  data = data_new2,
  mapping = aes(
    x = review_date,
    y = cocoa_percent,
    size = rating,
    colour = company_location
  )
) +
  geom_point(alpha = 0.7) +
  # Point sizes span 0.1 to 8 across the range of ratings.
  scale_size(range = c(0.1, 8)) +
  labs(
    title = "Chocolate Bar Quality",
    x = "Year of review",
    y = "Cocoa percentage"
  )
Chocolate is one of the most popular sweets in the world. Yet, not all chocolate bars are created alike. This dataset includes expert ratings of over 1,700 individual chocolate bars, including data on their regional origin, percentage of cocoa, the variety of chocolate beans used, and where the beans were grown.
Rating Scale
4.0 - 5.0 = Outstanding
3.5 - 3.9 = Highly Recommended
3.0 - 3.49 = Recommended
2.0 - 2.9 = Disappointing
1.0 - 1.9 = Unpleasant
This dataset is called Chocolate. I found it at https://www.kaggle.com/soroushghaderi/chocolate-bar-2020. According to Kaggle, this dataset focuses only on plain dark chocolate, in order to appreciate the cacao flavors when made into chocolate.
My visualization shows the cocoa percentage of each chocolate bar against the year it was reviewed, with the size of each point indicating its rating. Each color represents one of the company locations that I picked to display.
My finding is that America seems to have been the leader in making chocolate in the past few years. This visualization also shows that most bars have a cocoa percentage of around 65%-80%. I am surprised because we are looking at dark chocolate. I would have thought that it should be closer to pure chocolate, with a cocoa percentage closer to about 90-100%.
I tried to build a treemap. I could get it to work by grouping by country, but the legend was supposed to show ratings ranging from 1.5 to 5 only.
# Treemap
# Tiles are nested by company location, then by company. Tile area comes from
# cocoa_percent (vSize) and tile colour from rating (vColor).
# In a "value" treemap the vColor variable is aggregated with sum() by
# default, so the colour legend would show summed ratings per tile rather
# than values on the rating scale. Aggregating with the mean keeps the legend
# within the range of the ratings themselves.
treemap(chocolate2,
        index = c("company_location", "company"),
        vSize = "cocoa_percent",
        vColor = "rating",
        type = "value",
        fun.aggregate = "mean",
        palette = "Purples")