Module 11: Apply it to your data 10

Import your data

# Read the Excel workbook into a tibble, then print it to preview the columns
data <- read_excel(path = "../01_module4/data/MyData.xlsx")
data

## # A tibble: 2,657 × 10
##      REF Compan…¹ Compa…² Revie…³ Count…⁴ Speci…⁵ Cocoa…⁶ Ingre…⁷ Most.…⁸ Rating
##    <dbl> <chr>    <chr>     <dbl> <chr>   <chr>     <dbl> <chr>   <chr>    <dbl>
##  1  2454 5150     U.S.A.     2019 Tanzan… Kokoa …    0.76 3- B,S… rich c…   3.25
##  2  2458 5150     U.S.A.     2019 Domini… Zorzal…    0.76 3- B,S… cocoa,…   3.5 
##  3  2454 5150     U.S.A.     2019 Madaga… Bejofo…    0.76 3- B,S… cocoa,…   3.75
##  4  2542 5150     U.S.A.     2021 Fiji    Matasa…    0.68 3- B,S… chewy,…   3   
##  5  2546 5150     U.S.A.     2021 Venezu… Sur de…    0.72 3- B,S… fatty,…   3   
##  6  2546 5150     U.S.A.     2021 Uganda  Semuli…    0.8  3- B,S… mildly…   3.25
##  7  2542 5150     U.S.A.     2021 India   Anamal…    0.68 3- B,S… milk b…   3.5 
##  8  2808 20N | 2… France     2022 Venezu… Chuao,…    0.78 2- B,S  sandy,…   2.75
##  9  2808 20N | 2… France     2022 Venezu… Chuao,…    0.78 2- B,S  sl. dr…   3   
## 10   797 A. Morin France     2012 Bolivia Bolivia    0.7  4- B,S… vegeta…   3.5 
## # … with 2,647 more rows, and abbreviated variable names ¹Company.Manufacturer,
## #   ²Company.Location, ³Review.Date, ⁴Country.of.Bean.Origin,
## #   ⁵Specific.Bean.Origin.or.Bar.Name, ⁶Cocoa.Percent, ⁷Ingredients,
## #   ⁸Most.Memorable.Characteristics

Chapter 15

Create a factor

Modify factor order

Unordered factor levels

# Mean Rating per Company.Location (NA ratings are dropped before averaging)
rating_by_location <- data %>%
    group_by(Company.Location) %>%
    summarise(avg_rating = mean(Rating, na.rm = TRUE))

# Print the summary: one row per company location
rating_by_location

## # A tibble: 69 × 2
##    Company.Location avg_rating
##    <chr>                 <dbl>
##  1 Argentina              3.31
##  2 Australia              3.36
##  3 Austria                3.26
##  4 Belgium                3.07
##  5 Bolivia                3.25
##  6 Brazil                 3.27
##  7 Canada                 3.31
##  8 Chile                  3.75
##  9 Colombia               3.21
## 10 Costa Rica             3.14
## # … with 59 more rows

# Dot plot of mean rating for each company location.
# Company.Location is still a character column here, so the y axis is not
# sorted by avg_rating.
ggplot(data = rating_by_location,
       mapping = aes(x = avg_rating, y = Company.Location)) +
    geom_point()

Ordered factor levels

# Same dot plot, with the locations ordered along the y axis by mean rating
rating_by_location %>%
    # Turn the location into a factor whose level order follows avg_rating
    mutate(location_by_rating = fct_reorder(Company.Location, avg_rating)) %>%
    ggplot(aes(x = avg_rating, y = location_by_rating)) +
    geom_point() +
    # Drop the y-axis title; the x-axis title describes the plot
    labs(x = "Mean Rating By Company Location", y = NULL)

Moving a single level to the front

# Ordered dot plot again, but with "U.S.A." pulled out of the rating order
rating_by_location %>%
    mutate(
        # Step 1: order the locations by their mean rating
        location = fct_reorder(Company.Location, avg_rating),
        # Step 2: move the single level "U.S.A." to the front of the levels
        location = fct_relevel(location, "U.S.A.")
    ) %>%
    ggplot(aes(x = avg_rating, y = location)) +
    geom_point() +
    # Drop the y-axis title; the x-axis title describes the plot
    labs(x = "Mean Rating By Company Location", y = NULL)

Modify factor levels

Show examples of three functions:

* fct_recode
* fct_collapse
* fct_lump

# List the unique bean-origin values that the examples below modify
distinct(data, Country.of.Bean.Origin)

## # A tibble: 64 × 1
##    Country.of.Bean.Origin
##    <chr>                 
##  1 Tanzania              
##  2 Dominican Republic    
##  3 Madagascar            
##  4 Fiji                  
##  5 Venezuela             
##  6 Uganda                
##  7 India                 
##  8 Bolivia               
##  9 Peru                  
## 10 Panama                
## # … with 54 more rows

# Recode: relabel a single level ("Peru" becomes "South America")
data %>%
    mutate(
        Country.of.Bean.Origin_rev = fct_recode(
            Country.of.Bean.Origin,
            "South America" = "Peru"
        )
    ) %>%
    # Keep the original and the recoded column side by side for comparison
    select(Country.of.Bean.Origin, Country.of.Bean.Origin_rev) %>%
    # Show only the rows affected by the recode
    filter(Country.of.Bean.Origin == "Peru")

## # A tibble: 256 × 2
##    Country.of.Bean.Origin Country.of.Bean.Origin_rev
##    <chr>                  <fct>                     
##  1 Peru                   South America             
##  2 Peru                   South America             
##  3 Peru                   South America             
##  4 Peru                   South America             
##  5 Peru                   South America             
##  6 Peru                   South America             
##  7 Peru                   South America             
##  8 Peru                   South America             
##  9 Peru                   South America             
## 10 Peru                   South America             
## # … with 246 more rows

# Collapse: merge several levels ("Peru" and "Venezuela") into one new level
# NOTE(review): Venezuela is not on South America's west coast; consider a
# more accurate label than "West Coast".
data %>%
    mutate(
        Country.of.Bean.Origin_col = fct_collapse(
            Country.of.Bean.Origin,
            "West Coast" = c("Peru", "Venezuela")
        )
    ) %>%
    # Keep the original and the collapsed column side by side for comparison
    select(Country.of.Bean.Origin, Country.of.Bean.Origin_col) %>%
    # Rows whose origin is "Brazil" are left out of the displayed result
    filter(Country.of.Bean.Origin != "Brazil")

## # A tibble: 2,575 × 2
##    Country.of.Bean.Origin Country.of.Bean.Origin_col
##    <chr>                  <fct>                     
##  1 Tanzania               Tanzania                  
##  2 Dominican Republic     Dominican Republic        
##  3 Madagascar             Madagascar                
##  4 Fiji                   Fiji                      
##  5 Venezuela              West Coast                
##  6 Uganda                 Uganda                    
##  7 India                  India                     
##  8 Venezuela              West Coast                
##  9 Venezuela              West Coast                
## 10 Bolivia                Bolivia                   
## # … with 2,565 more rows

# Lump small levels into other levels
# First, count the rows per origin to see how frequent each level is
count(data, Country.of.Bean.Origin)

## # A tibble: 64 × 2
##    Country.of.Bean.Origin     n
##    <chr>                  <int>
##  1 Australia                  3
##  2 Bali                       1
##  3 Belize                    80
##  4 Blend                    157
##  5 Bolivia                   83
##  6 Brazil                    82
##  7 Burma                      1
##  8 Cameroon                   3
##  9 China                      1
## 10 Colombia                  82
## # … with 54 more rows

# fct_lump() called with neither `n` nor `prop` dispatches to
# fct_lump_lowfreq(); call it directly so the lumping rule is explicit.
# NOTE(review): the result still has 64 distinct values, the same number as
# the unlumped column. Consider fct_lump_n() or fct_lump_prop() if a visible
# "Other" group is wanted.
data %>%
    mutate(
        Country.of.Bean.Origin_lump = fct_lump_lowfreq(Country.of.Bean.Origin)
    ) %>%
    distinct(Country.of.Bean.Origin_lump)

## # A tibble: 64 × 1
##    Country.of.Bean.Origin_lump
##    <fct>                      
##  1 Tanzania                   
##  2 Dominican Republic         
##  3 Madagascar                 
##  4 Fiji                       
##  5 Venezuela                  
##  6 Uganda                     
##  7 India                      
##  8 Bolivia                    
##  9 Peru                       
## 10 Panama                     
## # … with 54 more rows

Module 11: Apply it to your data 10

Spencer Murrin

Import your data

Chapter 15

Create a factor

Modify factor order

Modify factor levels

Chapter 16