Module 7: Apply it to your data 6

Import Data

# Read the Excel workbook into `data`, then print it to preview the columns.
data <- read_excel(path = "../01_module4/data/MyData.xlsx")
data

## # A tibble: 2,657 × 10
##      REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
##    <dbl> <chr>    <chr>     <dbl> <chr>   <chr>     <dbl> <chr>   <chr>    <dbl>
##  1  2454 5150     U.S.A.     2019 Tanzan… Kokoa …    0.76 3- B,S… rich c…   3.25
##  2  2458 5150     U.S.A.     2019 Domini… Zorzal…    0.76 3- B,S… cocoa,…   3.5 
##  3  2454 5150     U.S.A.     2019 Madaga… Bejofo…    0.76 3- B,S… cocoa,…   3.75
##  4  2542 5150     U.S.A.     2021 Fiji    Matasa…    0.68 3- B,S… chewy,…   3   
##  5  2546 5150     U.S.A.     2021 Venezu… Sur de…    0.72 3- B,S… fatty,…   3   
##  6  2546 5150     U.S.A.     2021 Uganda  Semuli…    0.8  3- B,S… mildly…   3.25
##  7  2542 5150     U.S.A.     2021 India   Anamal…    0.68 3- B,S… milk b…   3.5 
##  8  2808 20N | 2… France     2022 Venezu… Chuao,…    0.78 2- B,S  sandy,…   2.75
##  9  2808 20N | 2… France     2022 Venezu… Chuao,…    0.78 2- B,S  sl. dr…   3   
## 10   797 A. Morin France     2012 Bolivia Bolivia    0.7  4- B,S… vegeta…   3.5 
## # … with 2,647 more rows, and abbreviated variable names ¹Company.Manufacturer,
## #   ²Company.Location, ³Review.Date, ⁴Country.of.Bean.Origin,
## #   ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent, ⁷Ingredients,
## #   ⁸Most.Memorable.Characteristics

Introduction

Questions

Variation

Visualizing distributions

# Bar chart: one bar per Review.Date value, bar height = number of rows.
ggplot(data = data, mapping = aes(x = Review.Date)) +
    geom_bar()

Typical values

# Histogram of review years for reviews made after 2017.
data %>%

    # Keep only rows whose review year is later than 2017
    filter(Review.Date > 2017) %>%

    # Review.Date appears to hold whole years (see the printed tibble), so use
    # one-year bins; a narrower binwidth such as 0.5 leaves every second bin
    # empty and makes the distribution look gappy.
    ggplot(aes(x = Review.Date)) +
    geom_histogram(binwidth = 1)

Unusual values

# Histogram of review years over the whole data set.
# Review.Date appears to hold whole years (see the printed tibble), so the bin
# width is set to one year explicitly. Without it ggplot2 falls back to 30
# bins, prints a "Pick better value with `binwidth`" message, and splits the
# years into unevenly filled bins.
data %>%
    ggplot(aes(x = Review.Date)) +
    geom_histogram(binwidth = 1)

# Same histogram of review years, zoomed in on the y-axis so that years with
# few reviews are visible. coord_cartesian() only changes the visible range;
# it does not drop any rows before binning.
# The bin width is set to one year explicitly (Review.Date appears to hold
# whole years), which also avoids ggplot2's default-of-30-bins message.
data %>%
    ggplot(aes(x = Review.Date)) +
    geom_histogram(binwidth = 1) +
    coord_cartesian(ylim = c(0, 300))

Missing Values

# Replace review years outside 2006-2012 with NA (rather than dropping the
# rows), then plot Rating against the recoded year. geom_point() warns about
# the rows it cannot draw because y_year is NA.
data %>%
    mutate(
        y_year = ifelse(
            Review.Date >= 2006 & Review.Date <= 2012,
            Review.Date,
            NA
        )
    ) %>%
    ggplot(aes(x = Rating, y = y_year)) +
    geom_point()

## Warning: Removed 1840 rows containing missing values (`geom_point()`).

Covariation

A categorical and continuous variable

# Box plots of Rating for each Ingredients value; rows with a missing
# Ingredients entry are removed before plotting.
ggplot(
    data = filter(data, !is.na(Ingredients)),
    mapping = aes(x = Rating, y = Ingredients)
) +
    geom_boxplot()

Two categorical variables

# Count the rows for every (Cocoa.Percent, Rating) combination and draw the
# counts as a heat map: one tile per combination, filled by its count `n`.
ggplot(
    data = count(data, Cocoa.Percent, Rating),
    mapping = aes(x = Cocoa.Percent, y = Rating, fill = n)
) +
    geom_tile()

Two continuous variables

# Hexagonal binning of Rating against Review.Date; fill shows how many rows
# fall into each hexagon. geom_hex() relies on the hexbin package.
library(hexbin)
ggplot(data = data, mapping = aes(x = Rating, y = Review.Date)) +
    geom_hex()

# Treat Rating as binned: keep rows with Rating above 2, cut Rating into
# 0.1-wide intervals, and draw one box plot of Review.Date per interval.
ggplot(
    data = filter(data, Rating > 2),
    mapping = aes(x = Rating, y = Review.Date)
) +
    geom_boxplot(mapping = aes(group = cut_width(Rating, width = 0.1)))

Patterns and models

# Fit a linear model of log(Review.Date) on log(Rating).
mod <- lm(formula = log(Review.Date) ~ log(Rating), data = data)

# Append the model residuals as column `resid` and exponentiate them to undo
# the log transform of the response. The result is printed, not stored.
mutate(modelr::add_residuals(data, mod), resid = exp(resid))

## # A tibble: 2,657 × 11
##      REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
##    <dbl> <chr>    <chr>     <dbl> <chr>   <chr>     <dbl> <chr>   <chr>    <dbl>
##  1  2454 5150     U.S.A.     2019 Tanzan… Kokoa …    0.76 3- B,S… rich c…   3.25
##  2  2458 5150     U.S.A.     2019 Domini… Zorzal…    0.76 3- B,S… cocoa,…   3.5 
##  3  2454 5150     U.S.A.     2019 Madaga… Bejofo…    0.76 3- B,S… cocoa,…   3.75
##  4  2542 5150     U.S.A.     2021 Fiji    Matasa…    0.68 3- B,S… chewy,…   3   
##  5  2546 5150     U.S.A.     2021 Venezu… Sur de…    0.72 3- B,S… fatty,…   3   
##  6  2546 5150     U.S.A.     2021 Uganda  Semuli…    0.8  3- B,S… mildly…   3.25
##  7  2542 5150     U.S.A.     2021 India   Anamal…    0.68 3- B,S… milk b…   3.5 
##  8  2808 20N | 2… France     2022 Venezu… Chuao,…    0.78 2- B,S  sandy,…   2.75
##  9  2808 20N | 2… France     2022 Venezu… Chuao,…    0.78 2- B,S  sl. dr…   3   
## 10   797 A. Morin France     2012 Bolivia Bolivia    0.7  4- B,S… vegeta…   3.5 
## # … with 2,647 more rows, 1 more variable: resid <dbl>, and abbreviated
## #   variable names ¹Company.Manufacturer, ²Company.Location, ³Review.Date,
## #   ⁴Country.of.Bean.Origin, ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent,
## #   ⁷Ingredients, ⁸Most.Memorable.Characteristics

# Scatter plot of Review.Date (y) against Rating (x), one point per row.
ggplot(data = data, mapping = aes(x = Rating, y = Review.Date)) +
    geom_point()

# Box plots of Rating for each review year. Review.Date is converted to a
# factor so every year gets its own box on a discrete x-axis.
data %>%
    ggplot(aes(x = as.factor(Review.Date), y = Rating)) +
    geom_boxplot() +

    # Rotate only the x-axis (year) tick labels so they do not overlap;
    # `axis.text` would also turn the y-axis rating labels sideways.
    # vjust/hjust keep the rotated labels aligned with their tick marks.
    theme(axis.text.x = element_text(angle = 90, vjust = 0.5, hjust = 1))

Module 7: Apply it to your data 6

Spencer Murrin

2023-10-11

Import Data

Introduction

Questions

Variation

Visualizing distributions

Typical values

Unusual values

Missing Values

Covariation

A categorical and continuous variable

Two categorical variables

Two continuous variables

Patterns and models