salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv")
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex, data = salary)
Female (N=39) |
Male (N=358) |
Overall (N=397) |
|
---|---|---|---|
Rank | |||
AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
Discipline | |||
A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
Yrs.since.phd | |||
Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
Yrs.service | |||
Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
NPubs | |||
Mean (SD) | 20.2 (14.4) | 17.9 (13.9) | 18.2 (14.0) |
Median [Min, Max] | 18.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | |||
Mean (SD) | 40.7 (16.2) | 40.2 (17.0) | 40.2 (16.9) |
Median [Min, Max] | 36.0 [14.0, 70.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | |||
Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
library(ggplot2)
p = ggplot(data = salary, aes(x = Salary))
p1 = p + geom_histogram(aes(y = ..density..), color = "white", fill = "blue")
p2 = p1 + geom_density(col="red")
p2 + ggtitle("Distribution of professors' salaries") + theme_bw()
## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = salary, aes(x = Sex, y = Salary, fill = Sex, col = Sex))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05)
p1 + labs(x = "Sex", y = "Professors' salaries (USD)") + ggtitle("Professors' salaries by sex") + theme_bw()
t.test(Salary ~ Sex, data = salary)
##
## Welch Two Sample t-test
##
## data: Salary by Sex
## t = -3.1615, df = 50.122, p-value = 0.002664
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -23037.916 -5138.102
## sample estimates:
## mean in group Female mean in group Male
## 101002.4 115090.4
Interpretation: There is evidence (P= 0.003) that the salaries of male professors were $14,088 higher than female professors, ranging from $5,138 to $23,037.
Assoc.A = subset(salary, Rank == "AssocProf" & Discipline == "A")
dim(Assoc.A)
## [1] 26 9
library(table1)
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex, data = Assoc.A)
Female (N=4) |
Male (N=22) |
Overall (N=26) |
|
---|---|---|---|
Rank | |||
AssocProf | 4 (100%) | 22 (100%) | 26 (100%) |
Discipline | |||
A | 4 (100%) | 22 (100%) | 26 (100%) |
Yrs.since.phd | |||
Mean (SD) | 18.5 (8.19) | 17.7 (12.2) | 17.8 (11.5) |
Median [Min, Max] | 19.0 [10.0, 26.0] | 12.5 [8.00, 49.0] | 13.0 [8.00, 49.0] |
Yrs.service | |||
Mean (SD) | 15.5 (8.70) | 13.1 (12.3) | 13.5 (11.7) |
Median [Min, Max] | 15.0 [8.00, 24.0] | 8.00 [1.00, 49.0] | 8.00 [1.00, 49.0] |
NPubs | |||
Mean (SD) | 10.0 (4.97) | 21.6 (14.2) | 19.8 (13.8) |
Median [Min, Max] | 10.0 [4.00, 16.0] | 16.0 [3.00, 48.0] | 16.0 [3.00, 48.0] |
Ncits | |||
Mean (SD) | 38.5 (18.5) | 44.3 (15.2) | 43.4 (15.5) |
Median [Min, Max] | 37.5 [19.0, 60.0] | 47.0 [24.0, 69.0] | 47.0 [19.0, 69.0] |
Salary | |||
Mean (SD) | 72100 (6400) | 85000 (10600) | 83100 (11100) |
Median [Min, Max] | 74100 [62900, 77500] | 82400 [70000, 108000] | 81900 [62900, 108000] |
p = ggplot(data = Assoc.A, aes(x = NPubs))
p1 = p + geom_histogram(aes(y = ..density..), color = "white", fill = "blue")
p2 = p1 + geom_density(col="red")
p2 + ggtitle("Distribution of number of publications among associate professors in Theoretical discipline") + theme_bw()
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
p = ggplot(data = Assoc.A, aes(x = Sex, y = NPubs, fill = Sex, col = Sex))
p1 = p + geom_boxplot(col = "black") + geom_jitter(alpha = 0.05)
p1 + labs(x = "Sex", y = "Number of publications") + ggtitle("Number of publications by sex") + theme_bw()
wilcox.test(NPubs ~ Sex, data = Assoc.A)
## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties
##
## Wilcoxon rank sum test with continuity correction
##
## data: NPubs by Sex
## W = 21.5, p-value = 0.1168
## alternative hypothesis: true location shift is not equal to 0
library(simpleboot)
## Warning: package 'simpleboot' was built under R version 4.3.2
## Simple Bootstrap Routines (1.1-7)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.3 ✔ tibble 3.2.1
## ✔ purrr 1.0.2 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
male = Assoc.A %>% filter(Sex == "Male")
female = Assoc.A %>% filter(Sex == "Female")
set.seed(1234)
b.means = two.boot(male$NPubs, female$NPubs, mean, R = 1000)
hist (b.means$t, breaks = 20)
quantile(b.means$t, probs=c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## 4.976136 11.568182 18.273295
set.seed(1234)
b.medians = two.boot(male$NPubs, female$NPubs, median, R = 1000)
hist (b.medians$t, breaks = 20)
quantile(b.medians$t, probs=c(0.025, 0.50, 0.975))
## 2.5% 50% 97.5%
## -0.5000 7.0000 19.5125