library(ggplot2)
library(grid)
library(gridExtra)
library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(lmboot)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::combine() masks gridExtra::combine()
## ✖ dplyr::filter()  masks stats::filter()
## ✖ dplyr::lag()     masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

Task 1:

Task 1.1:

  • Study Design: cross-sectional investigation of 396 collegiate professors in the US from 2008 to 2009 was conducted to determine whether salaries significantly differed among professors’ ranks.
  • Null Hypothesis: There are no difference in salaries among professors’ ranks.
  • Alternative Hypothesis: There are difference in salaries among professors’ ranks.

Task 1.2: Import the dataset

salary = read.csv("D:\\OneDrive\\ANDREAS\\ACADEMICS_UNIVERSITY\\Year_4_2024\\Autumn\\32931\\Classes\\R_Basic\\Professorial_Salaries.csv")
dim(salary)
## [1] 397   9
head(salary)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000

Task 1.3: Describe characteristics of the study sample by professors’ ranks

table1(~ Sex + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Rank, data = salary)
AssocProf
(N=64)
AsstProf
(N=67)
Prof
(N=266)
Overall
(N=397)
Sex
Female 10 (15.6%) 11 (16.4%) 18 (6.8%) 39 (9.8%)
Male 54 (84.4%) 56 (83.6%) 248 (93.2%) 358 (90.2%)
Discipline
A 26 (40.6%) 24 (35.8%) 131 (49.2%) 181 (45.6%)
B 38 (59.4%) 43 (64.2%) 135 (50.8%) 216 (54.4%)
Yrs.since.phd
Mean (SD) 15.5 (9.65) 5.10 (2.54) 28.3 (10.1) 22.3 (12.9)
Median [Min, Max] 12.0 [6.00, 49.0] 4.00 [1.00, 11.0] 28.0 [11.0, 56.0] 21.0 [1.00, 56.0]
Yrs.service
Mean (SD) 12.0 (10.1) 2.37 (1.50) 22.8 (11.6) 17.6 (13.0)
Median [Min, Max] 8.00 [1.00, 53.0] 3.00 [0, 6.00] 21.0 [0, 60.0] 16.0 [0, 60.0]
NPubs
Mean (SD) 18.7 (13.5) 15.9 (11.2) 18.6 (14.7) 18.2 (14.0)
Median [Min, Max] 16.0 [1.00, 50.0] 12.0 [2.00, 50.0] 13.0 [1.00, 69.0] 13.0 [1.00, 69.0]
Ncits
Mean (SD) 42.0 (15.6) 36.3 (16.7) 40.8 (17.2) 40.2 (16.9)
Median [Min, Max] 36.5 [14.0, 83.0] 34.0 [1.00, 83.0] 35.0 [1.00, 90.0] 35.0 [1.00, 90.0]
Salary
Mean (SD) 93900 (13800) 80800 (8170) 127000 (27700) 114000 (30300)
Median [Min, Max] 95600 [62900, 126000] 79800 [63100, 97000] 123000 [57800, 232000] 107000 [57800, 232000]

Task 1.4: Develop a graph to check the distribution of professors’ salaries. Write a sentence to describe the graph

p = ggplot(data = salary, aes(x = Salary))
p1 = p + geom_histogram(aes(y = after_stat(density)), color = "black", fill = "lightblue")
p1 = p1 + geom_density(col="blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

- The distribution is skewed right.

Task 1.5: Develop a box plot to describe the differences in salaries among professors’ ranks. Write a sentence to describe the graph

p = ggplot(data = salary, aes(x = Rank,  y = Salary, fill = Rank, col = Rank))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05) 
p1 + labs(x = "Rank", y = "Salaries (USD)") + ggtitle("Professors' salaries by rank") + theme_bw()

Task 1.6: Determine whether salaries significantly differed among professors’ ranks

sal.rank = aov(Salary ~ Rank, data = salary)
summary(sal.rank)
##              Df    Sum Sq   Mean Sq F value Pr(>F)    
## Rank          2 1.432e+11 7.162e+10   128.2 <2e-16 ***
## Residuals   394 2.201e+11 5.586e+08                   
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
  • There is evidence that (P < 0.0001) that professors’ salaries significantly differed among their ranks.

Task 1.7: Check the assumptions

par(mfrow=c(2,2))
plot(sal.rank)

Task 1.8: Conduct post-hoc analyses to determine which particular groups significantly differ from each other. Interpret the findings

tukey.sal.rank = TukeyHSD(sal.rank)
tukey.sal.rank
##   Tukey multiple comparisons of means
##     95% family-wise confidence level
## 
## Fit: aov(formula = Salary ~ Rank, data = salary)
## 
## $Rank
##                         diff       lwr       upr     p adj
## AsstProf-AssocProf -13100.45 -22818.71 -3382.195 0.0046514
## Prof-AssocProf      32895.67  25154.51 40636.836 0.0000000
## Prof-AsstProf       45996.12  38395.94 53596.307 0.0000000

Task 2:

Task 2.1:

  • Study Design: An exploratory analysis was conducted in a subgroup of 21 female professors in the Applied discipline to determine whether the average number of publications differed among professors’ ranks.
  • Null Hypothesis: The average number of publications is not different among female professors in the Applied discipline’s ranks.
  • Alternative Hypothesis: The average number of publications is different among female professors in the Applied discipline’s ranks.

Task 2.2: Select a subgroup of female professors in the Applied discipline

Fem.P = subset(salary, Sex == "Female" & Discipline == "B")
dim(Fem.P)
## [1] 21  9
head(Fem.P)
##    ID     Rank Discipline Yrs.since.phd Yrs.service    Sex NPubs Ncits Salary
## 10 10     Prof          B            18          18 Female    22    29 129000
## 35 35 AsstProf          B             4           2 Female    19    69  80225
## 36 36 AsstProf          B             5           0 Female    11    69  77000
## 48 48     Prof          B            23          19 Female    50    55 151768
## 49 49     Prof          B            25          25 Female    18    33 140096
## 53 53 AsstProf          B            11           3 Female    50    31  74692

Task 2.3: Describe characteristics of the study sample by professors’ ranks

table1(~ Sex + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Rank, data = Fem.P)
AssocProf
(N=6)
AsstProf
(N=5)
Prof
(N=10)
Overall
(N=21)
Sex
Female 6 (100%) 5 (100%) 10 (100%) 21 (100%)
Discipline
B 6 (100%) 5 (100%) 10 (100%) 21 (100%)
Yrs.since.phd
Mean (SD) 13.5 (2.88) 6.60 (3.65) 21.5 (5.95) 15.7 (7.72)
Median [Min, Max] 12.5 [11.0, 19.0] 5.00 [3.00, 11.0] 19.0 [17.0, 36.0] 17.0 [3.00, 36.0]
Yrs.service
Mean (SD) 8.83 (1.94) 2.60 (1.82) 17.9 (4.77) 11.7 (7.36)
Median [Min, Max] 9.50 [6.00, 11.0] 3.00 [0, 5.00] 17.5 [10.0, 26.0] 10.0 [0, 26.0]
NPubs
Mean (SD) 16.3 (14.3) 18.8 (18.3) 21.3 (14.2) 19.3 (14.6)
Median [Min, Max] 14.5 [1.00, 38.0] 11.0 [3.00, 50.0] 20.0 [6.00, 50.0] 18.0 [1.00, 50.0]
Ncits
Mean (SD) 45.8 (12.9) 46.4 (24.1) 38.0 (15.7) 42.2 (16.9)
Median [Min, Max] 49.0 [26.0, 60.0] 49.0 [14.0, 69.0] 36.5 [18.0, 60.0] 48.0 [14.0, 69.0]
Salary
Mean (SD) 99400 (14100) 84200 (9790) 132000 (17500) 111000 (25400)
Median [Min, Max] 104000 [71100, 110000] 80200 [74700, 97000] 128000 [105000, 161000] 105000 [71100, 161000]

Task 2.4: Develop a graph to check the distribution of the number of publications. Write a sentence to describe the graph

p = ggplot(data = Fem.P, aes(x = NPubs))
p1 = p + geom_histogram(aes(y = after_stat(density)), color = "black", fill = "lightblue")
p1 = p1 + geom_density(col="blue")
p1
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

  • The distribution is skewed right.

Task 2.5: Conduct a non-parametric test to determine whether the number of publications differed among professors’ ranks. Interpret the findings

kruskal.test(NPubs ~ Rank, data = Fem.P)
## 
##  Kruskal-Wallis rank sum test
## 
## data:  NPubs by Rank
## Kruskal-Wallis chi-squared = 0.81015, df = 2, p-value = 0.6669
  • There is no evidence that the number of publications differed among professors’ ranks.

Task 2.6: Carry out a bootstrap to determine whether the mean number of publications differed among professors’ ranks. Interpret the findings

boot = ANOVA.boot(NPubs ~ Rank, B = 1000, seed = 1234, data = Fem.P)
boot$'p-values'
## [1] 0.821
  • The bootstrapping method indicates there is no evidence (P= 0.82) that the number of publications differed among ranks in a group of female professors in Applied discipline.