Check structure

Introduction

Questions

# The three questions that guide this exploration, stored one per row in a
# single character column so they print as a small table.
tibble(
  question = c(
    "How does salary vary?",
    "How does salary differ by rank/sex?",
    "How does salary relate to years since PhD?"
  )
)

## # A tibble: 3 × 1
##   question                                  
##   <chr>                                     
## 1 How does salary vary?                     
## 2 How does salary differ by rank/sex?       
## 3 How does salary relate to years since PhD?

Variation

# Bar chart with one bar per rank; bar height is the number of rows.
ggplot(salary, aes(x = rank)) +
  geom_bar()

Visualizing distributions

# Distribution of the categorical variable rank, drawn as row counts per level.
salary %>%
  ggplot(aes(x = rank)) +
  geom_bar()

# The same per-rank counts as the bar chart, shown as a table.
count(salary, rank)

## # A tibble: 3 × 2
##   rank          n
##   <chr>     <int>
## 1 AssocProf    64
## 2 AsstProf     67
## 3 Prof        266

# Histogram of the continuous variable salary. No bins/binwidth is given, so
# stat_bin() falls back to its default of 30 bins and says so in a message.
salary %>%
  ggplot(aes(x = salary)) +
  geom_histogram()

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

# Frequency polygons of salary, one coloured line per rank, so the three
# distributions can be overlaid. Binning is again left at the stat_bin() default.
ggplot(salary) +
  geom_freqpoly(aes(x = salary, colour = rank))

## `stat_bin()` using `bins = 30`. Pick better value `binwidth`.

Typical values

# One-row summary of salary: row count, centre (mean, median) and spread
# (standard deviation, interquartile range). The four statistics skip NA
# values; n counts every row.
summarise(
  salary,
  n = n(),
  mean = mean(salary, na.rm = TRUE),
  median = median(salary, na.rm = TRUE),
  sd = sd(salary, na.rm = TRUE),
  IQR = IQR(salary, na.rm = TRUE)
)

## # A tibble: 1 × 5
##       n    mean median     sd   IQR
##   <int>   <dbl>  <dbl>  <dbl> <dbl>
## 1   397 113706. 107300 30289. 43185

Unusual values

# Outlier fences: a salary is "unusual" when it lies more than 1.5 * IQR
# below the first quartile or above the third quartile.
#
# names = FALSE makes quantile() return a plain numeric vector. Without it the
# "25%"/"75%" labels propagate through the arithmetic, leaving iqr and hi
# labelled "75%" and lo labelled "25%", which is misleading when the values
# are printed or reused.
q <- quantile(salary$salary, c(0.25, 0.75), na.rm = TRUE, names = FALSE)
iqr <- q[2] - q[1]       # interquartile range (Q3 - Q1)
lo <- q[1] - 1.5 * iqr   # lower fence
hi <- q[2] + 1.5 * iqr   # upper fence

# Keep only the rows whose salary falls outside the [lo, hi] fences.
# Inside filter(), `salary` refers to the column, not the data frame.
filter(
  salary,
  salary < lo | salary > hi
)

## # A tibble: 3 × 6
##   rank  discipline yrs.since.phd yrs.service sex   salary
##   <chr> <chr>              <dbl>       <dbl> <chr>  <dbl>
## 1 Prof  B                     38          38 Male  231545
## 2 Prof  A                     29           7 Male  204000
## 3 Prof  A                     43          43 Male  205500

Missing Values

# Number of missing values in each column: flag every NA cell, then total
# the flags column by column.
salary %>%
  is.na() %>%
  colSums()

##          rank    discipline yrs.since.phd   yrs.service           sex 
##             0             0             0             0             0 
##        salary 
##             0

Covariation

A categorical and continuous variable

Two categorical variables

# Cross-tabulate rank and sex in long form, largest group first.
# sort = TRUE orders the result by descending n.
count(salary, rank, sex, sort = TRUE)

## # A tibble: 6 × 3
##   rank      sex        n
##   <chr>     <chr>  <int>
## 1 Prof      Male     248
## 2 AsstProf  Male      56
## 3 AssocProf Male      54
## 4 Prof      Female    18
## 5 AsstProf  Female    11
## 6 AssocProf Female    10

Two continuous variables

# Scatter plot of salary against years since PhD, overlaid with a
# least-squares straight line (method = "lm") drawn without its confidence band.
salary %>%
  ggplot(aes(x = yrs.since.phd, y = salary)) +
  geom_point() +
  geom_smooth(method = "lm", se = FALSE)

## `geom_smooth()` using formula = 'y ~ x'

Patterns and models

# Linear model of salary on years since PhD, rank and sex.
# rank and sex are character columns, so lm() converts them to indicator
# terms; the fitted coefficients (rankAsstProf, rankProf, sexMale) are
# therefore contrasts against the AssocProf and Female baselines.
m1 <- lm(salary ~ yrs.since.phd + rank + sex, data = salary)

# Coefficient table, residual summary and overall fit statistics.
summary(m1)

## 
## Call:
## lm(formula = salary ~ yrs.since.phd + rank + sex, data = salary)
## 
## Residuals:
##    Min     1Q Median     3Q    Max 
## -67230 -15338  -1530  12163 105318 
## 
## Coefficients:
##               Estimate Std. Error t value Pr(>|t|)    
## (Intercept)    90957.3     4836.0  18.808  < 2e-16 ***
## yrs.since.phd    -92.1      129.7  -0.710  0.47801    
## rankAsstProf  -14012.8     4342.8  -3.227  0.00136 ** 
## rankProf       33623.1     3694.1   9.102  < 2e-16 ***
## sexMale         5146.6     4038.9   1.274  0.20332    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 23630 on 392 degrees of freedom
## Multiple R-squared:  0.3973, Adjusted R-squared:  0.3912 
## F-statistic: 64.61 on 4 and 392 DF,  p-value: < 2.2e-16

Week 7: Apply it to your data 6

Matthew Plutzner

2025-11-11

Define variable names

Check structure

Introduction

Questions

Variation

Visualizing distributions

Typical values

Unusual values

Missing Values

Covariation

A categorical and continuous variable

Two categorical variables

Two continuous variables

Patterns and models