library(carData);
library(dplyr);
## 
## Attaching package: 'dplyr'
## The following objects are masked from 'package:stats':
## 
##     filter, lag
## The following objects are masked from 'package:base':
## 
##     intersect, setdiff, setequal, union
library(rstatix)
## 
## Attaching package: 'rstatix'
## The following object is masked from 'package:stats':
## 
##     filter
salaries <- carData::Salaries;
df1 <- group_by(salaries, sex, rank, discipline)

# lets assign ID to each row
df1$id <- seq.int(nrow(df1))

summarise(df1, mean = round(mean(salary), 2))
## `summarise()` has grouped output by 'sex', 'rank'. You can override using the `.groups` argument.
## # A tibble: 12 x 4
## # Groups:   sex, rank [6]
##    sex    rank      discipline    mean
##    <fct>  <fct>     <fct>        <dbl>
##  1 Female AsstProf  A           72933.
##  2 Female AsstProf  B           84190.
##  3 Female AssocProf A           72128.
##  4 Female AssocProf B           99436.
##  5 Female Prof      A          109632.
##  6 Female Prof      B          131836.
##  7 Male   AsstProf  A           74270.
##  8 Male   AsstProf  B           84647.
##  9 Male   AssocProf A           85049.
## 10 Male   AssocProf B          101622.
## 11 Male   Prof      A          120619.
## 12 Male   Prof      B          133518.
#%>% print.data.frame()

OUTLIERS

df1 %>%
  group_by(rank, discipline, sex) %>%
  identify_outliers(salary) -> df1_outliers

There are 18 outliers (including 3 extreme ones). So let’s get rid of them:

df1 <- anti_join(df1, df1_outliers, by = "id")

NORMALITY To check normality, let’s just use Shapiro-Wilk test

shapiro_test(df1, salary)
## # A tibble: 12 x 6
##    rank      discipline sex    variable statistic       p
##    <fct>     <fct>      <fct>  <chr>        <dbl>   <dbl>
##  1 AsstProf  A          Female salary       0.813 0.103  
##  2 AsstProf  B          Female salary       0.889 0.354  
##  3 AssocProf A          Female salary       0.976 0.703  
##  4 AssocProf B          Female salary       0.916 0.514  
##  5 Prof      A          Female salary       0.934 0.549  
##  6 Prof      B          Female salary       0.974 0.923  
##  7 AsstProf  A          Male   salary       0.953 0.581  
##  8 AsstProf  B          Male   salary       0.941 0.0458 
##  9 AssocProf A          Male   salary       0.899 0.0787 
## 10 AssocProf B          Male   salary       0.976 0.698  
## 11 Prof      A          Male   salary       0.967 0.00457
## 12 Prof      B          Male   salary       0.986 0.218

The results are in general normally distributed except for 2 groups, where salary is lower than 0.05 (in one group just by 0.005p - so almost normal)

HOMOGENITY

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:dplyr':
## 
##     recode
leveneTest(data = df1, df1$salary ~ sex * rank * discipline)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group  11  10.911 < 2.2e-16 ***
##       367                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

P-value here is almost 0, so homogenity is not good here.

We can try to improve those results for ANOVA assumption by taking for example log of salary:

df1_log <- df1
df1_log$salary <- log(df1$salary)
shapiro_test(df1_log, salary)
## # A tibble: 12 x 6
##    rank      discipline sex    variable statistic      p
##    <fct>     <fct>      <fct>  <chr>        <dbl>  <dbl>
##  1 AsstProf  A          Female salary       0.813 0.104 
##  2 AsstProf  B          Female salary       0.896 0.387 
##  3 AssocProf A          Female salary       0.978 0.717 
##  4 AssocProf B          Female salary       0.916 0.517 
##  5 Prof      A          Female salary       0.936 0.575 
##  6 Prof      B          Female salary       0.976 0.939 
##  7 AsstProf  A          Male   salary       0.952 0.560 
##  8 AsstProf  B          Male   salary       0.931 0.0218
##  9 AssocProf A          Male   salary       0.891 0.0587
## 10 AssocProf B          Male   salary       0.981 0.840 
## 11 Prof      A          Male   salary       0.985 0.212 
## 12 Prof      B          Male   salary       0.986 0.231
leveneTest(data = df1_log, df1_log$salary ~ sex * rank * discipline)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group  11  8.9954 3.053e-14 ***
##       367                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As we can see, log helped only slightly - we can assume that normality assumption is not that violated but still we have problem with homogenity.

df1_log_triple <- df1
df1_log_triple$salary <- log(log(log(df1$salary)))
leveneTest(data = df1_log, df1_log$salary ~ sex * rank * discipline)
## Levene's Test for Homogeneity of Variance (center = median)
##        Df F value    Pr(>F)    
## group  11  8.9954 3.053e-14 ***
##       367                      
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

Triple log also did not fix the problem. So let’s try do ANOVA anyway.

Three-way ANOVA: 1. Sex: Male/Female 2. Rank: AssocProf/AsstProf/Prof 3. Discipline: A/B

an <- aov(data = df1_log, salary ~ sex * rank * discipline)
anova(an)
## Analysis of Variance Table
## 
## Response: salary
##                      Df  Sum Sq Mean Sq  F value    Pr(>F)    
## sex                   1  0.2989  0.2989   9.9901  0.001705 ** 
## rank                  2 10.3354  5.1677 172.7471 < 2.2e-16 ***
## discipline            1  1.6916  1.6916  56.5484 4.253e-13 ***
## sex:rank              2  0.0058  0.0029   0.0964  0.908136    
## sex:discipline        1  0.0287  0.0287   0.9605  0.327706    
## rank:discipline       2  0.0939  0.0469   1.5690  0.209649    
## sex:rank:discipline   2  0.0190  0.0095   0.3175  0.728131    
## Residuals           367 10.9787  0.0299                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

From that ANOVA, we can see that rank very significantly influences salary. Discipline also quite solidly influences salary, while sex is not that important factor. However, p-value of each interaction (sex-ranko, sex-discipline, rank-discipline, sex-rank-discipline) is quite high (for sure greater than our significance level alpha = 0.05).

[Q3] Is there any significant difference in years since PhD (yrs.since.phd) and seniority (yrs.service) of different rank professors?

In this case we should use ANCOVA test. Let’s use log(salary) as well to better fit in normality/homogenity assumptions. so let’s proceed:

anova(aov(data = df1_log, salary ~ yrs.since.phd + rank * discipline * sex))
## Analysis of Variance Table
## 
## Response: salary
##                      Df  Sum Sq Mean Sq  F value    Pr(>F)    
## yrs.since.phd         1  3.9755  3.9755 132.7644 < 2.2e-16 ***
## rank                  2  6.7859  3.3930 113.3097 < 2.2e-16 ***
## discipline            1  1.5701  1.5701  52.4333 2.645e-12 ***
## sex                   1  0.0127  0.0127   0.4230    0.5158    
## rank:discipline       2  0.1012  0.0506   1.6906    0.1858    
## rank:sex              2  0.0061  0.0031   0.1024    0.9027    
## discipline:sex        1  0.0226  0.0226   0.7564    0.3850    
## rank:discipline:sex   2  0.0182  0.0091   0.3044    0.7378    
## Residuals           366 10.9595  0.0299                       
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1

As we can see, yrs.since.phd singificantly affects salary. It affects even more, than discipline.

.