# Import data

library(tidyverse)
library(readxl)

# Read the salaries workbook from ../00_data into `data`, then print a
# per-column skim summary of it.
data <- read_excel(path = "../00_data/Salaries.xlsx")

skimr::skim(data)
Data summary
Name data
Number of rows 397
Number of columns 6
_______________________
Column type frequency:
character 3
numeric 3
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
rank 0 1 4 9 0 3 0
discipline 0 1 1 1 0 2 0
sex 0 1 4 6 0 2 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100 hist
yrs.since.phd 0 1 22.31 12.89 1 12 21 32 56 ▇▇▆▅▁
yrs.service 0 1 17.61 13.01 0 7 16 27 60 ▇▅▃▂▁
salary 0 1 113706.46 30289.04 57800 91000 107300 134185 231545 ▅▇▅▂▁
library(skimr)

Visualizing distributions

# Bar chart of `data` with one bar per `rank` value.
ggplot(data = data, mapping = aes(x = rank)) +
  geom_bar()

# Histogram of diamond carat in bins of width 0.5.
# The x aesthetic is mapped by name with `=`. Writing `aes(x + carat)` would
# not fail, because `diamonds` also has a numeric column called `x`; it would
# silently plot the sum of the `x` and `carat` columns instead of `carat`.
diamonds %>%
  ggplot(mapping = aes(x = carat)) +
  geom_histogram(binwidth = 0.5)

# Same carat histogram (bins of width 0.5), restricted to rows with
# carat below 3.
ggplot(
  data = filter(diamonds, carat < 3),
  mapping = aes(x = carat)
) +
  geom_histogram(binwidth = 0.5)

# Frequency polygons of carat, with line colour mapped to `cut`.
# No binwidth is passed to geom_freqpoly(), so binning is left at its default.
ggplot(data = diamonds, mapping = aes(x = carat, color = cut)) +
  geom_freqpoly()
## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values

Unusual values

Missing Values

Covariation

A categorical and continuous variable

Two categorical variables

Two continuous variables

Patterns and models