# Load library for Excel reading
library(readxl)
library(tidyverse)
# Import Excel data
salary <- read_excel("../00_data/Salaries.xlsx")
Define variable names
Check structure
Introduction
Questions
# The guiding questions for this analysis, one per row of a single
# `question` column.
tribble(
  ~question,
  "How does salary vary?",
  "How does salary differ by rank/sex?",
  "How does salary relate to years since PhD?"
)
## # A tibble: 3 × 1
## question
## <chr>
## 1 How does salary vary?
## 2 How does salary differ by rank/sex?
## 3 How does salary relate to years since PhD?
Variation
# Bar chart: number of rows in `salary` at each value of `rank`.
salary %>%
  ggplot(aes(x = rank)) +
  geom_bar()

Visualizing distributions
# Distribution of the categorical variable `rank`, drawn as bar heights
# (geom_bar() counts the rows per rank).
ggplot(salary, aes(rank)) +
  geom_bar()

# Same counts that the bar chart displays, as a table: one row per rank.
count(salary, rank)
## # A tibble: 3 × 2
## rank n
## <chr> <int>
## 1 AssocProf 64
## 2 AsstProf 67
## 3 Prof 266
# Histogram of the continuous variable `salary`.
# The bin width is set explicitly instead of relying on stat_bin()'s default
# of 30 bins, which ggplot2 itself flags as a placeholder. 10000 is roughly a
# quarter of the salary IQR; try other widths to see how the shape changes.
ggplot(data = salary) +
  geom_histogram(mapping = aes(x = salary), binwidth = 10000)

# Overlaid frequency polygons of `salary`, one coloured line per `rank`.
# An explicit bin width replaces stat_bin()'s default of 30 bins so the lines
# are binned on a deliberate, readable scale. 10000 is roughly a quarter of
# the salary IQR; adjust it to explore finer or coarser structure.
ggplot(data = salary, mapping = aes(x = salary, colour = rank)) +
  geom_freqpoly(binwidth = 10000)

Typical values
# One-row summary of the `salary` column: row count plus centre (mean,
# median) and spread (sd, IQR). `.names = "{.fn}"` names each result column
# after its function, so the columns are n, mean, median, sd, IQR.
salary %>%
  summarise(
    n = n(),
    across(
      salary,
      list(
        mean = ~ mean(.x, na.rm = TRUE),
        median = ~ median(.x, na.rm = TRUE),
        sd = ~ sd(.x, na.rm = TRUE),
        IQR = ~ IQR(.x, na.rm = TRUE)
      ),
      .names = "{.fn}"
    )
  )
## # A tibble: 1 × 5
## n mean median sd IQR
## <int> <dbl> <dbl> <dbl> <dbl>
## 1 397 113706. 107300 30289. 43185
Unusual values
# Flag unusual salaries with the 1.5 * IQR rule.
# q holds the first and third quartiles (named "25%" and "75%").
q <- quantile(salary$salary, probs = c(0.25, 0.75), na.rm = TRUE)

# Interquartile range: third quartile minus first quartile.
iqr <- diff(q)

# Lower and upper fences, 1.5 IQRs beyond each quartile.
lo <- q["25%"] - 1.5 * iqr
hi <- q["75%"] + 1.5 * iqr

# Keep only the rows whose salary falls outside the [lo, hi] fences.
filter(salary, !(salary >= lo & salary <= hi))
## # A tibble: 3 × 6
## rank discipline yrs.since.phd yrs.service sex salary
## <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 Prof B 38 38 Male 231545
## 2 Prof A 29 7 Male 204000
## 3 Prof A 43 43 Male 205500
Missing Values
# Number of missing (NA) entries in each column of `salary`.
salary %>%
  is.na() %>%
  colSums()
## rank discipline yrs.since.phd yrs.service sex
## 0 0 0 0 0
## salary
## 0
Covariation
A categorical and continuous variable
Two categorical variables
# Rows per rank/sex combination, largest group first.
count(salary, rank, sex, sort = TRUE)
## # A tibble: 6 × 3
## rank sex n
## <chr> <chr> <int>
## 1 Prof Male 248
## 2 AsstProf Male 56
## 3 AssocProf Male 54
## 4 Prof Female 18
## 5 AsstProf Female 11
## 6 AssocProf Female 10
Two continuous variables
# Scatter plot of salary against years since PhD, with a straight
# least-squares line on top (method = "lm") and no confidence band
# (se = FALSE).
salary %>%
  ggplot(aes(yrs.since.phd, salary)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)
## `geom_smooth()` using formula = 'y ~ x'

Patterns and models
# Fit a linear model of salary on years since PhD, rank and sex, using the
# `salary` data frame. The formula is written inline so that the `Call:` line
# printed by summary() shows the model terms directly.
# In the printed coefficients, rank appears as rankAsstProf and rankProf
# (AssocProf is the baseline) and sex as sexMale (Female is the baseline).
m1 <- lm(salary ~ `yrs.since.phd` + rank + sex, data = salary)
# Print residual quantiles, the coefficient table and overall fit statistics.
summary(m1)
##
## Call:
## lm(formula = salary ~ yrs.since.phd + rank + sex, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67230 -15338 -1530 12163 105318
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 90957.3 4836.0 18.808 < 2e-16 ***
## yrs.since.phd -92.1 129.7 -0.710 0.47801
## rankAsstProf -14012.8 4342.8 -3.227 0.00136 **
## rankProf 33623.1 3694.1 9.102 < 2e-16 ***
## sexMale 5146.6 4038.9 1.274 0.20332
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23630 on 392 degrees of freedom
## Multiple R-squared: 0.3973, Adjusted R-squared: 0.3912
## F-statistic: 64.61 on 4 and 392 DF, p-value: < 2.2e-16