Import data

library(readxl)
# Read the salary records from the Excel workbook, then print the tibble
# so the knitted document shows a preview of its columns.
data <- read_excel(path = "../00_data/data/myData.xlsx")
data
## # A tibble: 9,355 × 12
##    work_year job_title    job_category      salary_currency salary salary_in_usd
##        <dbl> <chr>        <chr>             <chr>            <dbl>         <dbl>
##  1      2023 AI Architect Machine Learning… USD             305100        305100
##  2      2023 AI Architect Machine Learning… USD             146900        146900
##  3      2023 AI Architect Machine Learning… USD             330000        330000
##  4      2023 AI Architect Machine Learning… USD             204000        204000
##  5      2023 AI Architect Machine Learning… USD             330000        330000
##  6      2023 AI Architect Machine Learning… USD             204000        204000
##  7      2023 AI Architect Machine Learning… EUR             200000        215936
##  8      2023 AI Architect Machine Learning… USD             330000        330000
##  9      2023 AI Architect Machine Learning… USD             204000        204000
## 10      2023 AI Architect Machine Learning… USD             200000        200000
## # ℹ 9,345 more rows
## # ℹ 6 more variables: employee_residence <chr>, experience_level <chr>,
## #   employment_type <chr>, work_setting <chr>, company_location <chr>,
## #   company_size <chr>

Introduction

Questions

Variation

# Bar chart: number of rows in each job category.
ggplot(data, aes(x = job_category)) +
    geom_bar()

# Histogram of salaries expressed in USD, using the default binning.
ggplot(data, aes(x = salary_in_usd)) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Frequency polygons of USD salary, one coloured line per job category.
data %>%
    ggplot(aes(x = salary_in_usd, colour = job_category)) +
    geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Visualizing distributions

# Count of rows per job category, drawn as bars.
ggplot(data) +
    geom_bar(aes(x = job_category))

# Histogram of work years for rows where the employee resides in the
# United States and the job category is Data Analysis.
# Comma-separated conditions in filter() are combined with AND.
data %>%
    filter(
        employee_residence == "United States",
        job_category == "Data Analysis"
    ) %>%
    ggplot(aes(x = work_year)) +
    geom_histogram(binwidth = 0.5)

Typical values

# Distribution of work years among rows whose salary currency is EUR.
ggplot(
    data = filter(data, salary_currency == "EUR"),
    mapping = aes(x = work_year)
) +
    geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Unusual values

# Histogram of the raw salary column. coord_cartesian() zooms the y-axis
# to counts between 0 and 1200 so that low-count bins become visible.
ggplot(data, aes(x = salary)) +
    geom_histogram() +
    coord_cartesian(ylim = c(0, 1200))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

# Replace the currency label with NA for USD and CAD rows, then draw a
# scatter plot of USD salary against work year.
# NOTE(review): the recoded salary_currency column is not mapped to any
# aesthetic below, so the recode does not change the plot - confirm intent.
data %>%
    mutate(
        salary_currency = ifelse(
            salary_currency %in% c("USD", "CAD"),
            NA,
            salary_currency
        )
    ) %>%
    ggplot(aes(x = work_year, y = salary_in_usd)) +
    geom_point()

## Covariation

A categorical and continuous variable

# Box plots of USD salary, one box per work year.
# work_year is stored as a number, so it is converted to a factor here:
# with a continuous x, geom_boxplot() draws a single box and warns
# "Continuous x aesthetic".
data %>%
    ggplot(aes(x = factor(work_year), y = salary_in_usd)) +
    geom_boxplot() +
    # Keep the plain column name as the axis title instead of the
    # "factor(work_year)" expression.
    labs(x = "work_year")

### Two categorical variables

# Heat map of how often each salary currency / employee residence pair
# occurs; the tile fill encodes the row count n.
count(data, salary_currency, employee_residence) %>%
    ggplot() +
    geom_tile(aes(x = salary_currency, y = employee_residence, fill = n))

### Two continuous variables

library(hexbin)
# Hexagonal 2-D binning of salary against its USD equivalent.
# geom_hex() bins two continuous variables, so x is the numeric salary
# column rather than the character salary_currency column.
data %>%
    ggplot(aes(x = salary, y = salary_in_usd)) +
    geom_hex()

# Box plot of USD salary restricted to the Data Analysis job category
# (a single box, because only one category remains after filtering).
ggplot(
    data = filter(data, job_category == "Data Analysis"),
    mapping = aes(x = job_category, y = salary_in_usd)
) +
    geom_boxplot()

## Patterns and models