Import Data
# Load the workbook into a tibble; the path is relative to the working
# directory the document is rendered from.
data <- read_excel(path = "../01_module4/data/MyData.xlsx")

# Show the first rows and the column types.
data
## # A tibble: 2,657 × 10
## REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019 Tanzan… Kokoa … 0.76 3- B,S… rich c… 3.25
## 2 2458 5150 U.S.A. 2019 Domini… Zorzal… 0.76 3- B,S… cocoa,… 3.5
## 3 2454 5150 U.S.A. 2019 Madaga… Bejofo… 0.76 3- B,S… cocoa,… 3.75
## 4 2542 5150 U.S.A. 2021 Fiji Matasa… 0.68 3- B,S… chewy,… 3
## 5 2546 5150 U.S.A. 2021 Venezu… Sur de… 0.72 3- B,S… fatty,… 3
## 6 2546 5150 U.S.A. 2021 Uganda Semuli… 0.8 3- B,S… mildly… 3.25
## 7 2542 5150 U.S.A. 2021 India Anamal… 0.68 3- B,S… milk b… 3.5
## 8 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sandy,… 2.75
## 9 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sl. dr… 3
## 10 797 A. Morin France 2012 Bolivia Bolivia 0.7 4- B,S… vegeta… 3.5
## # … with 2,647 more rows, and abbreviated variable names ¹Company.Manufacturer,
## # ²Company.Location, ³Review.Date, ⁴Country.of.Bean.Origin,
## # ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent, ⁷Ingredients,
## # ⁸Most.Memorable.Characteristics
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of reviews recorded for each review year.
ggplot(data = data, mapping = aes(x = Review.Date)) +
  geom_bar()

Typical values
# Histogram of review years, restricted to reviews made after 2017.
ggplot(
  data = filter(data, Review.Date > 2017),
  mapping = aes(x = Review.Date)
) +
  geom_histogram(binwidth = 0.5)

Unusual values
# Histogram of review years over the whole data set, using the default
# binning chosen by stat_bin().
ggplot(data = data, mapping = aes(x = Review.Date)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same histogram, zoomed to counts between 0 and 300. coord_cartesian()
# only changes the visible range; no observations are dropped.
ggplot(data = data, mapping = aes(x = Review.Date)) +
  geom_histogram() +
  coord_cartesian(ylim = c(0, 300))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values
# Keep the review year only when it lies in 2006-2012 (inclusive); every
# other year becomes NA instead of the row being filtered out.
data %>%
  mutate(
    y_year = ifelse(
      Review.Date >= 2006 & Review.Date <= 2012,
      Review.Date,
      NA
    )
  ) %>%
  # Scatter plot; rows whose y_year is NA are dropped by geom_point()
  # with a warning.
  ggplot(mapping = aes(x = Rating, y = y_year)) +
  geom_point()
## Warning: Removed 1840 rows containing missing values (`geom_point()`).

Covariation
A categorical and continuous variable
# Distribution of Rating for each Ingredients category; rows with a
# missing Ingredients value are excluded first.
ggplot(
  data = filter(data, !is.na(Ingredients)),
  mapping = aes(x = Rating, y = Ingredients)
) +
  geom_boxplot()

Two categorical variables
# Count the observations for every (Cocoa.Percent, Rating) combination and
# draw them as tiles whose fill encodes the count `n`.
ggplot(
  data = count(data, Cocoa.Percent, Rating),
  mapping = aes(x = Cocoa.Percent, y = Rating, fill = n)
) +
  geom_tile()

Two continuous variables
# geom_hex() relies on the hexbin package for the hexagonal binning.
library(hexbin)

# Hexagonal 2-d bin counts of Rating against review year.
ggplot(data = data, mapping = aes(x = Rating, y = Review.Date)) +
  geom_hex()

# Boxplots of review year for ratings above 2, with Rating cut into bins
# of width 0.1 so that each bin gets its own box.
ggplot(
  data = filter(data, Rating > 2),
  mapping = aes(x = Rating, y = Review.Date)
) +
  geom_boxplot(mapping = aes(group = cut_width(Rating, width = 0.1)))

Patterns and models
# Linear model of log(Review.Date) on log(Rating).
mod <- lm(formula = log(Review.Date) ~ log(Rating), data = data)

# Append the model residuals as `resid` and back-transform them from the
# log scale. The resulting tibble is only printed, not stored.
mutate(modelr::add_residuals(data, mod), resid = exp(resid))
## # A tibble: 2,657 × 11
## REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
## <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl> <chr> <chr> <dbl>
## 1 2454 5150 U.S.A. 2019 Tanzan… Kokoa … 0.76 3- B,S… rich c… 3.25
## 2 2458 5150 U.S.A. 2019 Domini… Zorzal… 0.76 3- B,S… cocoa,… 3.5
## 3 2454 5150 U.S.A. 2019 Madaga… Bejofo… 0.76 3- B,S… cocoa,… 3.75
## 4 2542 5150 U.S.A. 2021 Fiji Matasa… 0.68 3- B,S… chewy,… 3
## 5 2546 5150 U.S.A. 2021 Venezu… Sur de… 0.72 3- B,S… fatty,… 3
## 6 2546 5150 U.S.A. 2021 Uganda Semuli… 0.8 3- B,S… mildly… 3.25
## 7 2542 5150 U.S.A. 2021 India Anamal… 0.68 3- B,S… milk b… 3.5
## 8 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sandy,… 2.75
## 9 2808 20N | 2… France 2022 Venezu… Chuao,… 0.78 2- B,S sl. dr… 3
## 10 797 A. Morin France 2012 Bolivia Bolivia 0.7 4- B,S… vegeta… 3.5
## # … with 2,647 more rows, 1 more variable: resid <dbl>, and abbreviated
## # variable names ¹Company.Manufacturer, ²Company.Location, ³Review.Date,
## # ⁴Country.of.Bean.Origin, ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent,
## # ⁷Ingredients, ⁸Most.Memorable.Characteristics
# Scatter plot of review year against rating.
ggplot(data = data, mapping = aes(x = Rating, y = Review.Date)) +
  geom_point()

# Boxplot of Rating for each review year. Review.Date is converted to a
# factor so that every year gets its own box on the x-axis.
data %>%
  ggplot(aes(x = as.factor(Review.Date), y = Rating)) +
  geom_boxplot() +
  # Rotate only the x-axis (year) tick labels, which are the crowded ones;
  # `axis.text` would also rotate the y-axis rating labels. vjust/hjust
  # keep the rotated labels aligned with their tick marks.
  theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))
