Module 11: Apply it to your data 10

Import your data

library(readxl)
# excel file
data <- read_excel("../00_data/data/myData.xlsx")
data

## # A tibble: 9,355 × 12
##    work_year job_title    job_category      salary_currency salary salary_in_usd
##        <dbl> <chr>        <chr>             <chr>            <dbl>         <dbl>
##  1      2023 AI Architect Machine Learning… USD             305100        305100
##  2      2023 AI Architect Machine Learning… USD             146900        146900
##  3      2023 AI Architect Machine Learning… USD             330000        330000
##  4      2023 AI Architect Machine Learning… USD             204000        204000
##  5      2023 AI Architect Machine Learning… USD             330000        330000
##  6      2023 AI Architect Machine Learning… USD             204000        204000
##  7      2023 AI Architect Machine Learning… EUR             200000        215936
##  8      2023 AI Architect Machine Learning… USD             330000        330000
##  9      2023 AI Architect Machine Learning… USD             204000        204000
## 10      2023 AI Architect Machine Learning… USD             200000        200000
## # ℹ 9,345 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## #   employment_type <chr>, work_setting <chr>, company_location <chr>,
## #   company_size <chr>

# Make data smaller

data_analysis <- data %>% filter(job_category == "Data Analysis")
data_analysis

## # A tibble: 1,457 × 12
##    work_year job_title       job_category  salary_currency salary salary_in_usd
##        <dbl> <chr>           <chr>         <chr>            <dbl>         <dbl>
##  1      2023 BI Data Analyst Data Analysis USD              25000         25000
##  2      2023 BI Data Analyst Data Analysis USD              50000         50000
##  3      2023 BI Data Analyst Data Analysis USD              85000         85000
##  4      2023 BI Data Analyst Data Analysis USD              70000         70000
##  5      2023 BI Data Analyst Data Analysis USD              60000         60000
##  6      2023 BI Data Analyst Data Analysis EUR              67000         72338
##  7      2022 BI Data Analyst Data Analysis EUR              58000         60938
##  8      2022 BI Data Analyst Data Analysis EUR             100000        105066
##  9      2022 BI Data Analyst Data Analysis USD              57000         57000
## 10      2022 BI Data Analyst Data Analysis AUD              65000         45050
## # ℹ 1,447 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## #   employment_type <chr>, work_setting <chr>, company_location <chr>,
## #   company_size <chr>

Chapter 15

Create a factor

x <- factor(c("AUD", "BRL", "CAD", "CHF", "DKK", "EUR", "GBP", "PLN", "SGD", "TRY", "USD"))

Modify factor order

Make two bar charts here - one before ordering another after

Unordered facto levels

salary_by_currency <- data_analysis %>%
    
    group_by(salary_currency) %>%
    summarise(avg_salary = mean(salary_in_usd, na.rm = TRUE))

salary_by_currency

## # A tibble: 7 × 2
##   salary_currency avg_salary
##   <chr>                <dbl>
## 1 AUD                 46815.
## 2 CAD                 55764.
## 3 EUR                 53516.
## 4 GBP                 74269.
## 5 PLN                 18160 
## 6 SGD                 65257 
## 7 USD                112935.

# Plot
salary_by_currency %>%
    
    ggplot(aes(x = salary_currency, y = avg_salary)) +
    geom_col()

Ordered Factor Levels

salary_by_currency %>%
    
    ggplot(aes(x = fct_reorder(.f = salary_currency, .x = avg_salary), y = avg_salary)) +
    geom_col() +

# Labeling
labs(y = NULL, x = "Mean payment in different currencies (in USD)")

Modify factor levels

Show examples of three functions:

fct_recode
fct_collapse
fct_lump

data_analysis %>% distinct(employee_residence)

## # A tibble: 35 × 1
##    employee_residence
##    <chr>             
##  1 Ukraine           
##  2 Kenya             
##  3 Canada            
##  4 Egypt             
##  5 Argentina         
##  6 Germany           
##  7 France            
##  8 United States     
##  9 Australia         
## 10 India             
## # ℹ 25 more rows

# Recode
data_analysis %>%
    
    #Rename levels
    mutate(country_rec = fct_recode(employee_residence, "UK" = "United Kingdom")) %>%
    select(employee_residence, country_rec) %>%
    filter(employee_residence == "United Kingdom")

## # A tibble: 93 × 2
##    employee_residence country_rec
##    <chr>              <fct>      
##  1 United Kingdom     UK         
##  2 United Kingdom     UK         
##  3 United Kingdom     UK         
##  4 United Kingdom     UK         
##  5 United Kingdom     UK         
##  6 United Kingdom     UK         
##  7 United Kingdom     UK         
##  8 United Kingdom     UK         
##  9 United Kingdom     UK         
## 10 United Kingdom     UK         
## # ℹ 83 more rows

# Collapse multiple levels into one
data_analysis %>%
    
    mutate(country_col = fct_collapse(employee_residence, "North America" = c("United States", "Canada"))) %>%
    select(employee_residence, country_col) %>%
    filter(employee_residence == "United States")

## # A tibble: 1,239 × 2
##    employee_residence country_col  
##    <chr>              <fct>        
##  1 United States      North America
##  2 United States      North America
##  3 United States      North America
##  4 United States      North America
##  5 United States      North America
##  6 United States      North America
##  7 United States      North America
##  8 United States      North America
##  9 United States      North America
## 10 United States      North America
## # ℹ 1,229 more rows

# Lump small levels into one
data_analysis %>% count(experience_level)

## # A tibble: 4 × 2
##   experience_level     n
##   <chr>            <int>
## 1 Entry-level        140
## 2 Executive           16
## 3 Mid-level          397
## 4 Senior             904

data_analysis %>% mutate(exp_level_lump = fct_lump(experience_level)) %>%
    distinct(exp_level_lump)

## # A tibble: 2 × 1
##   exp_level_lump
##   <fct>         
## 1 Other         
## 2 Senior

Chapter 16

No need to do anything here.

Module 11: Apply it to your data 10

Daniel Lee

Import your data

Chapter 15

Create a factor

Modify factor order

Modify factor levels

Chapter 16