Import data

# Read the salary records from the Excel workbook and display them
data <- read_excel(path = "Salaries.xlsx")
print(data)
## # A tibble: 397 × 6
##    rank      discipline yrs.since.phd yrs.service sex    salary
##    <chr>     <chr>              <dbl>       <dbl> <chr>   <dbl>
##  1 Prof      B                     19          18 Male   139750
##  2 Prof      B                     20          16 Male   173200
##  3 AsstProf  B                      4           3 Male    79750
##  4 Prof      B                     45          39 Male   115000
##  5 Prof      B                     40          41 Male   141500
##  6 AssocProf B                      6           6 Male    97000
##  7 Prof      B                     30          23 Male   175000
##  8 Prof      B                     45          45 Male   147765
##  9 Prof      B                     21          20 Male   119250
## 10 Prof      B                     18          18 Female 129000
## # … with 387 more rows

Introduction

Questions

What occupational industry makes up the majority of those who are considered wealthy?

Variation

Visualizing distributions

# Scatter plot of salary against years since PhD; the aesthetics are
# declared once on the plot and inherited by the point layer.
ggplot(data, aes(x = yrs.since.phd, y = salary)) +
    geom_point()

Typical values

# Histogram of years since PhD using bins that are one unit wide
ggplot(data = data, mapping = aes(x = yrs.since.phd)) +
    geom_histogram(binwidth = 1)

Unusual values

# Histogram of years of service. The bin width is set explicitly instead of
# relying on the implicit 30-bin default (which makes stat_bin() emit a
# "Pick better value" message); one unit per bin matches the
# yrs.since.phd histogram.
data %>%
    ggplot(aes(x = yrs.service)) +
    geom_histogram(binwidth = 1)

Missing values

data %>%
    # To inspect the out-of-range rows instead of blanking them:
    # filter(salary < 100000 | salary > 300000)
    # Keep salaries inside [100000, 300000]; everything else becomes NA
    mutate(
        salary = if_else(
            between(salary, 100000, 300000),
            salary,
            NA_real_
        )
    )
## # A tibble: 397 × 6
##    rank      discipline yrs.since.phd yrs.service sex    salary
##    <chr>     <chr>              <dbl>       <dbl> <chr>   <dbl>
##  1 Prof      B                     19          18 Male   139750
##  2 Prof      B                     20          16 Male   173200
##  3 AsstProf  B                      4           3 Male       NA
##  4 Prof      B                     45          39 Male   115000
##  5 Prof      B                     40          41 Male   141500
##  6 AssocProf B                      6           6 Male       NA
##  7 Prof      B                     30          23 Male   175000
##  8 Prof      B                     45          45 Male   147765
##  9 Prof      B                     21          20 Male   119250
## 10 Prof      B                     18          18 Female 129000
## # … with 387 more rows

Covariation

A categorical and continuous variable

# Salary distribution per academic rank: a categorical x (rank) against a
# continuous y (salary), so geom_boxplot() draws one box per rank instead of
# a single box over a continuous x axis.
data %>%
    ggplot(aes(x = rank, y = salary)) +
    geom_boxplot()

Two categorical variables

# Count the observations in each rank/discipline combination and show the
# counts as a heat map; both variables are categorical.
data %>%
    count(rank, discipline) %>%
    ggplot(aes(x = rank, y = discipline, fill = n)) +
    geom_tile()

Two continuous variables

# Hexagonal-bin plot of salary against years of service
library(hexbin)
ggplot(data, aes(x = yrs.service, y = salary)) +
    geom_hex()