#data()
#data(package= .packages(all.available=TRUE))
#install.packages("carData")
library(carData)

Reference: Fox J. and Weisberg, S. (2019) An R Companion to Applied Regression, Third Edition, Sage

mydata <- data.frame(Salaries)
head(mydata)
##        rank discipline yrs.since.phd yrs.service  sex salary
## 1      Prof          B            19          18 Male 139750
## 2      Prof          B            20          16 Male 173200
## 3  AsstProf          B             4           3 Male  79750
## 4      Prof          B            45          39 Male 115000
## 5      Prof          B            40          41 Male 141500
## 6 AssocProf          B             6           6 Male  97000
nrow(mydata)
## [1] 397

Explanation of a data set:

We have a data set of salaries for professors and we have 397 observations (units) with 6 variables.

Unit of observation: A professor in the U.S. Sample size = 397

Variables:

colnames(mydata) <- c("Position", "Discipline", "YearsSincePhD", "YearsOfService", "Gender", "Salary")   
#Change of names of the variables
mydata$DisciplineF <- factor(mydata$Discipline,
                             levels = c("A", "B"),
                             labels = c("A", "B"))

mydata$GenderF <- factor(mydata$Gender,                 #Convert categorical variables into factors
                         levels = c("Male", "Female"),
                         labels = c("Male", "Female"))


mydata <- mydata[, -7, drop = FALSE]
head(mydata)
##    Position Discipline YearsSincePhD YearsOfService Gender Salary GenderF
## 1      Prof          B            19             18   Male 139750    Male
## 2      Prof          B            20             16   Male 173200    Male
## 3  AsstProf          B             4              3   Male  79750    Male
## 4      Prof          B            45             39   Male 115000    Male
## 5      Prof          B            40             41   Male 141500    Male
## 6 AssocProf          B             6              6   Male  97000    Male
any_missing <- any(is.na(mydata))   #There is no missing values as it shows "FALSE"
summary(mydata)
##       Position   Discipline YearsSincePhD   YearsOfService     Gender   
##  AsstProf : 67   A:181      Min.   : 1.00   Min.   : 0.00   Female: 39  
##  AssocProf: 64   B:216      1st Qu.:12.00   1st Qu.: 7.00   Male  :358  
##  Prof     :266              Median :21.00   Median :16.00               
##                             Mean   :22.31   Mean   :17.61               
##                             3rd Qu.:32.00   3rd Qu.:27.00               
##                             Max.   :56.00   Max.   :60.00               
##      Salary         GenderF   
##  Min.   : 57800   Male  :358  
##  1st Qu.: 91000   Female: 39  
##  Median :107300               
##  Mean   :113706               
##  3rd Qu.:134185               
##  Max.   :231545

Descriptive statistics:

library(psych)                                       #For the variables in RQ 1
psych::describe(mydata[ , c("YearsSincePhD", "Salary")])
##               vars   n      mean       sd median   trimmed      mad   min
## YearsSincePhD    1 397     22.31    12.89     21     21.83    14.83     1
## Salary           2 397 113706.46 30289.04 107300 111401.61 29355.48 57800
##                  max  range skew kurtosis      se
## YearsSincePhD     56     55 0.30    -0.81    0.65
## Salary        231545 173745 0.71     0.18 1520.16

Correlation analysis

Research question 1: Is there a correlation between the salary and the number of years since PhD?

H0: There is no correlation between the nine-month salary of the professors and the number of years since PhD.

  • rho = 0

H1: There is correlation between the nine-month salary of the professors and the number of years since PhD.

  • rho =/= 0

Assumptions:

  1. Both variables are numeric.

This assumption is met.

  1. Errors are normally distributed.

We assume this assumption is not violated, because our sample size is sufficiently large (n = 397).

  1. Linear relationship between the chosen variables.

We have to check that with the following procedure (scatter plot).

Normality:

  • H0: The variable Salary is normally distributed.

  • H1: The variable Salary is not normally distributed.

shapiro.test(mydata$Salary)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$Salary
## W = 0.95988, p-value = 6.076e-09

We can reject H0 with p < 0.001, and conclude the variable Salary is not normally distributed.

Let’s check for the other variable:

  • H0: Variable YearsSincePhD is normally distributed.

  • H1: Variable YearsSincePhD is not normally distributed.

shapiro.test(mydata$YearsSincePhD)
## 
##  Shapiro-Wilk normality test
## 
## data:  mydata$YearsSincePhD
## W = 0.96957, p-value = 2.328e-07

We can reject H0 at p < 0.001, and conclude that the variable YearsSincePhD is not normally distributed.

Since normality is violated, we will do Spearman’s correlation test (even though our sample is sufficiently large). But first, let’s check the linearity assumption:

library(car)
## 
## Attaching package: 'car'
## The following object is masked from 'package:psych':
## 
##     logit
scatterplot(mydata$Salary, mydata$YearsSincePhD,
            smooth = TRUE,
            boxplot = FALSE,
            main = "Relationship between the Salary and Years since PhD",
            xlab = "Years since PhD",
            ylab = "Salary")    

From the scatter plot, we can see it does not look like a linear relationship of the two variables.

library(car)                            #Scatter plot matrix
scatterplotMatrix(mydata[ , c(-1,-2,-4,-5,-7,-8, -9)], smooth=FALSE)

  • The relationship between professor’s nine-month salaries and years from their PhD studies is POSITIVE.

  • We can see from the scatter plot matrix that both the Years since PhD and Salary are not normally distributed, and therefore skewed to the right.

fit <- lm(Salary ~ YearsSincePhD, data = mydata)    #A histogram of standardized residuals

mydata$StdResid <-round(rstandard(fit), 3)
mydata$CooksD <- round(cooks.distance(fit), 3)

hist(mydata$StdResid,
     xlab = "Standardized residuals",
     ylab = "Frequency",
     main = "Histogram of standardized residuals")

We can see the outliers, where units are not included in the [-3,3] interval. However, since our sample size is quite large, by removing them, it would not make the function linear.

hist(mydata$CooksD,
     xlab = "Cooks distance",
     ylab = "Frequency",
     main = "Histogram of Cooks distances")

As we can see from Cooks distance histogram, there are no units with high impact.

library(Hmisc)                     #Pearson correlation coefficient
## 
## Attaching package: 'Hmisc'
## The following object is masked from 'package:psych':
## 
##     describe
## The following objects are masked from 'package:base':
## 
##     format.pval, units
rcorr(as.matrix(mydata[ , c(-1,-2,-4,-5,-7,-8,-9,-10)]), 
      type = "pearson")
##               YearsSincePhD Salary
## YearsSincePhD          1.00   0.42
## Salary                 0.42   1.00
## 
## n= 397 
## 
## 
## P
##               YearsSincePhD Salary
## YearsSincePhD                0    
## Salary         0
  • If years since PhD increase by 1 year, the nine-month salary, on average increases by 0.42 USD. Assuming all other variables remain unchanged. - There is a positive and semi-strong relationship between the nine-month salary and years since PhD.

**However, this test was made only for educational purposes - we shall not use it with a data base like this one, so let’s continue the testing with Spearman.

rcorr(as.matrix(mydata[ , c(-1,-2,-4,-5,-7,-8,-9,-10)]),   
      type = "spearman")    #Spearman correlation coefficient
##               YearsSincePhD Salary
## YearsSincePhD          1.00   0.48
## Salary                 0.48   1.00
## 
## n= 397 
## 
## 
## P
##               YearsSincePhD Salary
## YearsSincePhD                0    
## Salary         0
  • If years since PhD increase by 1 year, nine-month salaries, on average increase by 0.48 USD, assuming all other values remain unchanged.

  • The relationship between these two variables is positive and semi-strong.

cor.test(mydata$Salary, mydata$YearsSincePhD,
         method = "spearman",
         use = "complete.obs",
         exact = FALSE)
## 
##  Spearman's rank correlation rho
## 
## data:  mydata$Salary and mydata$YearsSincePhD
## S = 5468989, p-value < 2.2e-16
## alternative hypothesis: true rho is not equal to 0
## sample estimates:
##       rho 
## 0.4755676

H0: There is no correlation between the nine-month salary of the professors and the number of years since PhD.

  • rho = 0

H1: There is a correlation between the nine-month salary of the professors and the number of years since PhD.

  • rho =/= 0

  • Conclusion: We reject H0 at p < 0.001, and conclude that there is a correlation between a nine-month Salary and Years since PhD. It is a positive and semi-strong relationship.

Association

Research question 2: Is there an association between the gender and the position of the professors?

H0: There is no association between the two categorical variables (between Gender and position of the professor).

H1: There is an association between these two categorical variables (between the Gender and the Position).

Assumptions:

  1. Observations need to be independent.

This assumption is met (we can see it from the variable Gender).

  1. All expected frequencies must be greater than 1.

  2. Up to 20% of the expected frequencies is allowed to be in between 1 and 5, however, any expected frequency that is lower than 5 will reduce the power of the test.

results <- chisq.test(mydata$GenderF, mydata$Position,
                      correct = FALSE)
results
## 
##  Pearson's Chi-squared test
## 
## data:  mydata$GenderF and mydata$Position
## X-squared = 8.5259, df = 2, p-value = 0.01408

The p-value = 0.014, which tells us to reject the null hypothesis (p value is lower than 0.05). Based on the data we got, we showed that there is the association between the Gender and Position of professors.

addmargins(results$observed)   #Observed or empirical frequencies
##               mydata$Position
## mydata$GenderF AsstProf AssocProf Prof Sum
##         Male         56        54  248 358
##         Female       11        10   18  39
##         Sum          67        64  266 397
  • In the position of an Associate Professor, out of 64 of them, 54 were males and 10 of them were females.
addmargins(round(results$expected, 2))  #Expected of theoretical frequencies
##               mydata$Position
## mydata$GenderF AsstProf AssocProf   Prof Sum
##         Male      60.42     57.71 239.87 358
##         Female     6.58      6.29  26.13  39
##         Sum       67.00     64.00 266.00 397

Explanation:

  • The second assumption is met, as ALL of the expected frequencies are greater than 1.

  • As well as the third assumption is met, as ALL of the expected frequencies are also greater than 5.

round(results$res, 2)  #Standardized residuals
##               mydata$Position
## mydata$GenderF AsstProf AssocProf  Prof
##         Male      -0.57     -0.49  0.52
##         Female     1.72      1.48 -1.59

Standardized residuals tell us whether the discrepancies between observed and expected frequencies are statistically significant.

  • There are no significant discrepancies between actual and expected values, for all categories, because all values of standardized residuals are below 1.96 and this differences are not statistically significant at 5%.

For educational purposes, I will explain one number, if it was statistically significant.

  • -0.57 : There is less than expected number of male professors in the category Assistant Professors.
addmargins(round(prop.table(results$observed), 3))  #Proportional table
##               mydata$Position
## mydata$GenderF AsstProf AssocProf  Prof   Sum
##         Male      0.141     0.136 0.625 0.902
##         Female    0.028     0.025 0.045 0.098
##         Sum       0.169     0.161 0.670 1.000
  • 0.141 –> Out of 397 professors, there is 14.1% of professors, who are males and at the same time on an Assisting Professor position.

  • Out of 397, there are 62.5% of professors that are males and are on the Professor’s position.

addmargins(round(prop.table(results$observed, 1), 3),2)  
##               mydata$Position
## mydata$GenderF AsstProf AssocProf  Prof   Sum
##         Male      0.156     0.151 0.693 1.000
##         Female    0.282     0.256 0.462 1.000
  • Out of all professors in the sample that are females, 46.2% of them hold a position of a Professor.
addmargins(round(prop.table(results$observed, 2), 3),1)
##               mydata$Position
## mydata$GenderF AsstProf AssocProf  Prof
##         Male      0.836     0.844 0.932
##         Female    0.164     0.156 0.068
##         Sum       1.000     1.000 1.000
  • Out of all Associate Professors in the sample, 15.6% of them are females.
library(effectsize)
## 
## Attaching package: 'effectsize'
## The following object is masked from 'package:psych':
## 
##     phi
effectsize::cramers_v(mydata$GenderF, mydata$Position)
## Cramer's V (adj.) |       95% CI
## --------------------------------
## 0.13              | [0.00, 1.00]
## 
## - One-sided CIs: upper bound fixed at [1.00].
interpret_cramers_v(0.13)
## [1] "small"
## (Rules: funder2019)
  • Conclusion: Based on the data from our sample, we reject our null hypothesis at p < 0.001, and can conclude that there is an association between the gender of professors and the position professors have. The relationship is positive and the effect is small.

  • There is no Odds ratio calculation, as my table is 2 x 3.