library(readxl)
# excel file
data <- read_excel("../00_data/data/myData.xlsx")
data
## # A tibble: 9,355 × 12
## work_year job_title job_category salary_currency salary salary_in_usd
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2023 AI Architect Machine Learning… USD 305100 305100
## 2 2023 AI Architect Machine Learning… USD 146900 146900
## 3 2023 AI Architect Machine Learning… USD 330000 330000
## 4 2023 AI Architect Machine Learning… USD 204000 204000
## 5 2023 AI Architect Machine Learning… USD 330000 330000
## 6 2023 AI Architect Machine Learning… USD 204000 204000
## 7 2023 AI Architect Machine Learning… EUR 200000 215936
## 8 2023 AI Architect Machine Learning… USD 330000 330000
## 9 2023 AI Architect Machine Learning… USD 204000 204000
## 10 2023 AI Architect Machine Learning… USD 200000 200000
## # ℹ 9,345 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## # employment_type <chr>, work_setting <chr>, company_location <chr>,
## # company_size <chr>
# Make data smaller
data_analysis <- data %>% filter(job_category == "Data Analysis")
data_analysis
## # A tibble: 1,457 × 12
## work_year job_title job_category salary_currency salary salary_in_usd
## <dbl> <chr> <chr> <chr> <dbl> <dbl>
## 1 2023 BI Data Analyst Data Analysis USD 25000 25000
## 2 2023 BI Data Analyst Data Analysis USD 50000 50000
## 3 2023 BI Data Analyst Data Analysis USD 85000 85000
## 4 2023 BI Data Analyst Data Analysis USD 70000 70000
## 5 2023 BI Data Analyst Data Analysis USD 60000 60000
## 6 2023 BI Data Analyst Data Analysis EUR 67000 72338
## 7 2022 BI Data Analyst Data Analysis EUR 58000 60938
## 8 2022 BI Data Analyst Data Analysis EUR 100000 105066
## 9 2022 BI Data Analyst Data Analysis USD 57000 57000
## 10 2022 BI Data Analyst Data Analysis AUD 65000 45050
## # ℹ 1,447 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## # employment_type <chr>, work_setting <chr>, company_location <chr>,
## # company_size <chr>
x <- factor(c("AUD", "BRL", "CAD", "CHF", "DKK", "EUR", "GBP", "PLN", "SGD", "TRY", "USD"))
Make two bar charts here - one before ordering another after
Unordered facto levels
salary_by_currency <- data_analysis %>%
group_by(salary_currency) %>%
summarise(avg_salary = mean(salary_in_usd, na.rm = TRUE))
salary_by_currency
## # A tibble: 7 × 2
## salary_currency avg_salary
## <chr> <dbl>
## 1 AUD 46815.
## 2 CAD 55764.
## 3 EUR 53516.
## 4 GBP 74269.
## 5 PLN 18160
## 6 SGD 65257
## 7 USD 112935.
# Plot
salary_by_currency %>%
ggplot(aes(x = salary_currency, y = avg_salary)) +
geom_col()
Ordered Factor Levels
salary_by_currency %>%
ggplot(aes(x = fct_reorder(.f = salary_currency, .x = avg_salary), y = avg_salary)) +
geom_col() +
# Labeling
labs(y = NULL, x = "Mean payment in different currencies (in USD)")
Show examples of three functions:
data_analysis %>% distinct(employee_residence)
## # A tibble: 35 × 1
## employee_residence
## <chr>
## 1 Ukraine
## 2 Kenya
## 3 Canada
## 4 Egypt
## 5 Argentina
## 6 Germany
## 7 France
## 8 United States
## 9 Australia
## 10 India
## # ℹ 25 more rows
# Recode
data_analysis %>%
#Rename levels
mutate(country_rec = fct_recode(employee_residence, "UK" = "United Kingdom")) %>%
select(employee_residence, country_rec) %>%
filter(employee_residence == "United Kingdom")
## # A tibble: 93 × 2
## employee_residence country_rec
## <chr> <fct>
## 1 United Kingdom UK
## 2 United Kingdom UK
## 3 United Kingdom UK
## 4 United Kingdom UK
## 5 United Kingdom UK
## 6 United Kingdom UK
## 7 United Kingdom UK
## 8 United Kingdom UK
## 9 United Kingdom UK
## 10 United Kingdom UK
## # ℹ 83 more rows
# Collapse multiple levels into one
data_analysis %>%
mutate(country_col = fct_collapse(employee_residence, "North America" = c("United States", "Canada"))) %>%
select(employee_residence, country_col) %>%
filter(employee_residence == "United States")
## # A tibble: 1,239 × 2
## employee_residence country_col
## <chr> <fct>
## 1 United States North America
## 2 United States North America
## 3 United States North America
## 4 United States North America
## 5 United States North America
## 6 United States North America
## 7 United States North America
## 8 United States North America
## 9 United States North America
## 10 United States North America
## # ℹ 1,229 more rows
# Lump small levels into one
data_analysis %>% count(experience_level)
## # A tibble: 4 × 2
## experience_level n
## <chr> <int>
## 1 Entry-level 140
## 2 Executive 16
## 3 Mid-level 397
## 4 Senior 904
data_analysis %>% mutate(exp_level_lump = fct_lump(experience_level)) %>%
distinct(exp_level_lump)
## # A tibble: 2 × 1
## exp_level_lump
## <fct>
## 1 Other
## 2 Senior
No need to do anything here.