Import Data

# Read the TidyTuesday (2019-03-05) jobs_gender CSV directly from GitHub.
jobs_gender <- readr::read_csv(
    file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv"
)

## Rows: 2088 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): occupation, major_category, minor_category
## dbl (9): year, total_workers, workers_male, workers_female, percent_female, ...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Introduction

Questions

Variation

Visualizing distributions

# Bar chart counting the rows recorded for each year.
ggplot(data = jobs_gender, mapping = aes(x = year)) +
    geom_bar()

# Histogram of year using narrow, 0.05-wide bins.
ggplot(data = jobs_gender, aes(x = year)) +
    geom_histogram(binwidth = 0.05)

# Frequency polygons of year, one coloured line per major_category.
# A numeric column such as total_earnings cannot be used for colour here:
# stat_bin() needs a discrete variable to split the data into groups, and a
# continuous colour mapping is dropped during binning, leaving a single
# uncoloured line.
jobs_gender %>%
    ggplot(aes(x = year, color = major_category)) +
    geom_freqpoly()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values

# Histogram of year restricted to the rows whose workers_male value is
# below 5. The comparison is on the workers_male column itself (presumably a
# head-count -- confirm against the data dictionary), not on a number of years.
ggplot(
    data = filter(jobs_gender, workers_male < 5),
    mapping = aes(x = year)
) +
    geom_histogram(binwidth = 0.1)

Unusual values

# Histogram of total_workers with the variable mapped to the y axis, so the
# bars run horizontally and the counts are on the x axis.
ggplot(data = jobs_gender, mapping = aes(y = total_workers)) +
    geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same horizontal histogram, zoomed in on the count axis so that sparsely
# populated bins (candidate unusual values) become visible.
# total_workers is mapped to y, which puts the counts on the x axis, so the
# zoom has to be applied with xlim; ylim would instead crop the total_workers
# axis to 0-500 and hide almost all of the distribution.
jobs_gender %>%
    ggplot(aes(y = total_workers)) +
    geom_histogram() +
    coord_cartesian(xlim = c(0, 500))

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values

Covariation

A categorical and continuous variable

# Box plot pairing the numeric year column with the categorical occupation
# column.
ggplot(data = jobs_gender, mapping = aes(x = year, y = occupation)) +
    geom_boxplot()

Two categorical variables

# Heat map of how many rows exist for every year/occupation combination;
# count() stores that tally in the column n, which drives the tile fill.
ggplot(
    data = count(jobs_gender, year, occupation),
    mapping = aes(x = year, y = occupation, fill = n)
) +
    geom_tile()

Two continuous variables

# geom_hex() relies on the hexbin package for its binning.
library(hexbin)

# Hexagonal binning of year against occupation.
# NOTE(review): occupation is a character column, so this is not a plot of
# two continuous variables -- consider two numeric columns instead.
ggplot(data = jobs_gender, mapping = aes(x = year, y = occupation)) +
    geom_hex()

# Box plots of occupation drawn per 0.1-wide slice of year; cut_width()
# assigns each row to its slice and that slice is used as the group.
# NOTE(review): occupation is a character column -- confirm that a numeric
# y variable was not intended here.
ggplot(data = jobs_gender, mapping = aes(x = year, y = occupation)) +
    geom_boxplot(mapping = aes(group = cut_width(year, width = 0.1)))

Week 7: Apply it to your data 6

Jason Zink

2023-10-16

Import Data

Introduction

Questions

Variation

Visualizing distributions

Typical values

Unusual values

Missing Values

Covariation

A categorical and continuous variable

Two categorical variables

Two continuous variables

Patterns and models