Module 11: Apply it to your data 10

Import your data

MyData <- readr::read_csv('https://raw.githubusercontent.com/rfordatascience/tidytuesday/main/data/2024/2024-11-26/cbp_resp.csv')

## Rows: 68815 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (10): month_grouping, month_abbv, component, land_border_region, area_of...
## dbl  (2): fiscal_year, encounter_count
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Chapter 15

Create a factor

# Transform Data: calculate average encounter count hours by land_border_region
encountercount_by_region <- MyData %>%
   
    group_by(land_border_region) %>%
    summarise(
        avg_encounter_count = mean(encounter_count, na.rm = TRUE))

encountercount_by_region

## # A tibble: 3 × 2
##   land_border_region    avg_encounter_count
##   <chr>                               <dbl>
## 1 Northern Land Border                 46.8
## 2 Other                                73.2
## 3 Southwest Land Border               276.

# Plot

encountercount_by_region %>%
    
    ggplot(aes(x = avg_encounter_count, y = land_border_region)) + 
    geom_point()

Ordered factor levels

encountercount_by_region %>%
    
    ggplot(aes(x = avg_encounter_count, y = fct_reorder(.f = land_border_region, .x = avg_encounter_count))) + 
    geom_point() + 
    
    
    # Labeling 
    labs(y = NULL, x = "Mean Encounter Count at Land Border Region")

Moving a single level to the front

encountercount_by_region %>%
    
    ggplot(aes(x = avg_encounter_count,
               y = fct_reorder(.f = land_border_region, .x = avg_encounter_count) %>%
                   fct_relevel("Don't know"))) + 
    geom_point() + 
    
    
    # Labeling 
    labs(y = NULL, x = "Mean Encounter Count at Land Border Region")

## Warning: 1 unknown level in `f`: Don't know

Modify factor order

Make two bar charts here - one before ordering another after

# Before Ordering 
MyData %>%
    ggplot(aes(x = encounter_count, y = land_border_region)) + 
    geom_point()

# After Ordering 
encountercount_by_region %>%
    
    ggplot(aes(x = avg_encounter_count, y = land_border_region)) + 
    geom_point()

Modify factor levels

Show examples of three functions:

fct_recode

MyData %>%
    
    mutate(month_abbv_rev = fct_recode(month_abbv, "April" = "APR")) %>%
    select(month_abbv, month_abbv_rev) %>%
    filter(month_abbv == "April")

## # A tibble: 0 × 2
## # ℹ 2 variables: month_abbv <chr>, month_abbv_rev <fct>

fct_collapse

MyData %>% 
    
    mutate(citizenship_col = fct_collapse(citizenship, "Southern America" = c("MEXICO"))) %>%
    select(citizenship, citizenship_col) %>%
    filter(citizenship != "China")

## # A tibble: 68,815 × 2
##    citizenship                citizenship_col           
##    <chr>                      <fct>                     
##  1 BRAZIL                     BRAZIL                    
##  2 CANADA                     CANADA                    
##  3 CANADA                     CANADA                    
##  4 CANADA                     CANADA                    
##  5 CHINA, PEOPLES REPUBLIC OF CHINA, PEOPLES REPUBLIC OF
##  6 CHINA, PEOPLES REPUBLIC OF CHINA, PEOPLES REPUBLIC OF
##  7 OTHER                      OTHER                     
##  8 OTHER                      OTHER                     
##  9 PHILIPPINES                PHILIPPINES               
## 10 RUSSIA                     RUSSIA                    
## # ℹ 68,805 more rows

fct_lump

MyData %>% count(citizenship)

## # A tibble: 22 × 2
##    citizenship                    n
##    <chr>                      <int>
##  1 BRAZIL                      3040
##  2 CANADA                      2527
##  3 CHINA, PEOPLES REPUBLIC OF  2661
##  4 COLOMBIA                    3620
##  5 CUBA                        3147
##  6 ECUADOR                     3262
##  7 EL SALVADOR                 3787
##  8 GUATEMALA                   4394
##  9 HAITI                       2565
## 10 HONDURAS                    4253
## # ℹ 12 more rows

MyData %>% 
    mutate(citizenship_lump = fct_lump(citizenship, prop = 0.03)) %>% 
    distinct(citizenship_lump)

## # A tibble: 19 × 1
##    citizenship_lump          
##    <fct>                     
##  1 BRAZIL                    
##  2 CANADA                    
##  3 CHINA, PEOPLES REPUBLIC OF
##  4 OTHER                     
##  5 Other                     
##  6 RUSSIA                    
##  7 COLOMBIA                  
##  8 GUATEMALA                 
##  9 HAITI                     
## 10 INDIA                     
## 11 MEXICO                    
## 12 CUBA                      
## 13 EL SALVADOR               
## 14 UKRAINE                   
## 15 VENEZUELA                 
## 16 HONDURAS                  
## 17 PERU                      
## 18 ECUADOR                   
## 19 NICARAGUA

Chapter 16

No need to do anything here.

Module 11: Apply it to your data 10

Madeleine Lorenz

Import your data

Chapter 15

Create a factor

Modify factor order

Modify factor levels

Chapter 16