Import Data
# Load the TidyTuesday (2019-03-05) jobs_gender data set from its CSV on GitHub.
# readr::read_csv() guesses the column types and reports them on load.
jobs_gender <- readr::read_csv(
  file = "https://raw.githubusercontent.com/rfordatascience/tidytuesday/master/data/2019/2019-03-05/jobs_gender.csv"
)
## Rows: 2088 Columns: 12
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (3): occupation, major_category, minor_category
## dbl (9): year, total_workers, workers_male, workers_female, percent_female, ...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
Introduction
Questions
Variation
Visualizing distributions
# Bar chart: number of rows in jobs_gender for each year.
ggplot(data = jobs_gender, mapping = aes(x = year)) +
  geom_bar()

# Histogram of year drawn with narrow, 0.05-wide bins.
ggplot(data = jobs_gender, mapping = aes(x = year)) +
  geom_histogram(binwidth = 0.05)

# Frequency polygons of year, one coloured line per major job category.
# Colour has to be mapped to a categorical column here: a continuous column
# such as total_earnings does not split the data into groups, so it cannot
# produce one line per group.
jobs_gender %>%
  ggplot(aes(x = year, color = major_category)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Typical values
# Histogram of year, restricted to rows with fewer than 5 male workers.
ggplot(
  data = filter(jobs_gender, workers_male < 5),
  mapping = aes(x = year)
) +
  geom_histogram(binwidth = 0.1)

Unusual values
# Horizontal histogram: total_workers is binned along the y axis, so the
# counts run along the x axis. No binwidth is given, so the default bins apply.
ggplot(data = jobs_gender, mapping = aes(y = total_workers)) +
  geom_histogram()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

# Same horizontal histogram, zoomed in on small counts so that sparsely
# populated (unusual) bins become visible.
# total_workers is mapped to y, so the counts are on the x axis; the zoom
# therefore has to be applied with xlim, not ylim. coord_cartesian() only
# changes the visible window and does not drop any data.
jobs_gender %>%
  ggplot(aes(y = total_workers)) +
  geom_histogram() +
  coord_cartesian(xlim = c(0, 500))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

Missing Values
Covariation
A categorical and continuous variable
# Box plots relating year (numeric) to occupation (categorical).
ggplot(data = jobs_gender, mapping = aes(x = year, y = occupation)) +
  geom_boxplot()

Two categorical variables
# Tile plot of how many rows exist for each year/occupation combination;
# count() adds the tally as column n, which drives the fill colour.
ggplot(
  data = count(jobs_gender, year, occupation),
  mapping = aes(x = year, y = occupation, fill = n)
) +
  geom_tile()

Two continuous variables
# geom_hex() relies on the hexbin package.
library(hexbin)

# Hexagonal 2-D binning of two numeric variables: year against total_earnings.
# occupation is a character (categorical) column, so it does not belong on a
# hex-binned axis; total_earnings is numeric.
jobs_gender %>%
  ggplot(aes(x = year, y = total_earnings)) +
  geom_hex()

# Box plots of total_earnings within bins of year: cut_width(year, 0.1) cuts
# year into 0.1-wide intervals and each interval gets its own box.
# The summarised variable has to be numeric; with the categorical occupation
# on y, the boxes would summarise year inside bins of year and show no spread.
jobs_gender %>%
  ggplot(aes(x = year, y = total_earnings)) +
  geom_boxplot(aes(group = cut_width(year, 0.1)))

Patterns and models