TRM Practical Data Analysis - Basic level Lecture 4. Analysis of difference

# TRM Practical Data Analysis - Basic Level

## Task 1. Read the “Professorial Salaries” and name this dataset “salary”

# Load the "Professorial Salaries" CSV file into `salary4`.
# NOTE(review): the path is absolute and machine-specific; it has to exist on
# the machine that runs this script.
salary4 <- read.csv(
  file = "C:/Users/24544355/OneDrive - UTS/Desktop/Assignment/32931/Module10/Professorial Salaries.csv"
)

##Task 2. Describe characteristics of the study sample by sex

library(table1)

## 
## Attaching package: 'table1'

## The following objects are masked from 'package:base':
## 
##     units, units<-

# Descriptive summary of the listed variables, with one column per sex and an
# overall column.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = salary4
)

	Female (N=39)	Male (N=358)	Overall (N=397)
Rank
AssocProf	10 (25.6%)	54 (15.1%)	64 (16.1%)
AsstProf	11 (28.2%)	56 (15.6%)	67 (16.9%)
Prof	18 (46.2%)	248 (69.3%)	266 (67.0%)
Discipline
A	18 (46.2%)	163 (45.5%)	181 (45.6%)
B	21 (53.8%)	195 (54.5%)	216 (54.4%)
Yrs.since.phd
Mean (SD)	16.5 (9.78)	22.9 (13.0)	22.3 (12.9)
Median [Min, Max]	17.0 [2.00, 39.0]	22.0 [1.00, 56.0]	21.0 [1.00, 56.0]
Yrs.service
Mean (SD)	11.6 (8.81)	18.3 (13.2)	17.6 (13.0)
Median [Min, Max]	10.0 [0, 36.0]	18.0 [0, 60.0]	16.0 [0, 60.0]
NPubs
Mean (SD)	20.2 (14.4)	17.9 (13.9)	18.2 (14.0)
Median [Min, Max]	18.0 [1.00, 50.0]	13.0 [1.00, 69.0]	13.0 [1.00, 69.0]
Ncits
Mean (SD)	40.7 (16.2)	40.2 (17.0)	40.2 (16.9)
Median [Min, Max]	36.0 [14.0, 70.0]	35.0 [1.00, 90.0]	35.0 [1.00, 90.0]
Salary
Mean (SD)	101000 (26000)	115000 (30400)	114000 (30300)
Median [Min, Max]	104000 [62900, 161000]	108000 [57800, 232000]	107000 [57800, 232000]

## Task 3. Comparison of salaries between male and female professors

### 3a. Check the distribution of professors’ salaries

library(ggplot2)

# Histogram of salary on the density scale with a kernel density curve on top.
# `after_stat(density)` replaces the `..density..` notation, which was
# deprecated in ggplot2 3.4.0. `bins = 30` writes out the value stat_bin()
# was already using, so the plot is unchanged but the "Pick better value"
# message is no longer emitted.
p <- ggplot(data = salary4, aes(x = Salary))
p1 <- p +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "white",
    fill = "blue"
  )
p2 <- p1 + geom_density(col = "red")
p2 + ggtitle("Distribution of professors' salaries") + theme_bw()

## Warning: The dot-dot notation (`..density..`) was deprecated in ggplot2 3.4.0.
## ℹ Please use `after_stat(density)` instead.
## This warning is displayed once every 8 hours.
## Call `lifecycle::last_lifecycle_warnings()` to see where this warning was
## generated.

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###3b. Student’s t-test to determine whether salaries were different between male and female professors

# Two-sided comparison of mean salary between the two sexes. The defaults are
# written out: with var.equal = FALSE this is Welch's unequal-variance t-test.
t.test(
  Salary ~ Sex,
  data = salary4,
  alternative = "two.sided",
  var.equal = FALSE
)

## 
##  Welch Two Sample t-test
## 
## data:  Salary by Sex
## t = -3.1615, df = 50.122, p-value = 0.002664
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
##  -23037.916  -5138.102
## sample estimates:
## mean in group Female   mean in group Male 
##             101002.4             115090.4

##Task 4. Select a subgroup of associate professors in Theoretical discipline

###4a

# Keep only the associate professors in discipline "A". which() discards rows
# where the condition is NA, matching what subset() does.
Assoc.A <- salary4[
  which(salary4[["Rank"]] == "AssocProf" & salary4[["Discipline"]] == "A"),
  ,
  drop = FALSE
]

# Number of rows and columns in the subgroup.
dim(Assoc.A)

## [1] 26  9

###4b. Describe characteristics of associate professors in Theoretical discipline by sex

# table1 is already attached earlier in the script; the library() call is kept
# so this section can also be run on its own.
library(table1)

# Same descriptive summary as for the full sample, restricted to Assoc.A.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = Assoc.A
)

	Female (N=4)	Male (N=22)	Overall (N=26)
Rank
AssocProf	4 (100%)	22 (100%)	26 (100%)
Discipline
A	4 (100%)	22 (100%)	26 (100%)
Yrs.since.phd
Mean (SD)	18.5 (8.19)	17.7 (12.2)	17.8 (11.5)
Median [Min, Max]	19.0 [10.0, 26.0]	12.5 [8.00, 49.0]	13.0 [8.00, 49.0]
Yrs.service
Mean (SD)	15.5 (8.70)	13.1 (12.3)	13.5 (11.7)
Median [Min, Max]	15.0 [8.00, 24.0]	8.00 [1.00, 49.0]	8.00 [1.00, 49.0]
NPubs
Mean (SD)	10.0 (4.97)	21.6 (14.2)	19.8 (13.8)
Median [Min, Max]	10.0 [4.00, 16.0]	16.0 [3.00, 48.0]	16.0 [3.00, 48.0]
Ncits
Mean (SD)	38.5 (18.5)	44.3 (15.2)	43.4 (15.5)
Median [Min, Max]	37.5 [19.0, 60.0]	47.0 [24.0, 69.0]	47.0 [19.0, 69.0]
Salary
Mean (SD)	72100 (6400)	85000 (10600)	83100 (11100)
Median [Min, Max]	74100 [62900, 77500]	82400 [70000, 108000]	81900 [62900, 108000]

## Task 5. Comparison of number of publications between male and female associate professors in Theoretical discipline

### 5a. Check the distribution of number of publications among associate professors in Theoretical discipline

# Histogram of NPubs on the density scale with a kernel density curve on top.
# `after_stat(density)` replaces the deprecated `..density..` notation.
# `bins = 30` writes out the value stat_bin() was already using, so the plot
# is unchanged but the "Pick better value" message is no longer emitted.
p <- ggplot(data = Assoc.A, aes(x = NPubs))
p1 <- p +
  geom_histogram(
    aes(y = after_stat(density)),
    bins = 30,
    color = "white",
    fill = "blue"
  )
p2 <- p1 + geom_density(col = "red")
p2 +
  ggtitle("Distribution of number of publications among associate professors in Theoretical discipline") +
  theme_bw()

## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.

###5b. Describe the differences in number of publications among associate professors in Theoretical discipline

# Boxplot of publication counts per sex, with the individual observations
# overlaid as faint jittered points.
p <- ggplot(
  data = Assoc.A,
  mapping = aes(x = Sex, y = NPubs, fill = Sex, colour = Sex)
)
p1 <- p +
  geom_boxplot(colour = "black") +
  geom_jitter(alpha = 0.05)
p1 +
  labs(x = "Sex", y = "Number of publications") +
  ggtitle("Number of publications by sex") +
  theme_bw()

###5c. Mann-Whitney non-parametric test to determine whether number of publications differed between male and female associate professors in Theoretical discipline

# Mann-Whitney (Wilcoxon rank-sum) test of NPubs between the two sexes.
# NPubs contains tied values, so an exact p-value cannot be computed and R
# falls back to the normal approximation with a warning. Requesting the
# approximation explicitly (exact = FALSE) yields the same W and p-value
# without the warning.
wilcox.test(NPubs ~ Sex, data = Assoc.A, exact = FALSE)

## Warning in wilcox.test.default(x = DATA[[1L]], y = DATA[[2L]], ...): cannot
## compute exact p-value with ties

## 
##  Wilcoxon rank sum test with continuity correction
## 
## data:  NPubs by Sex
## W = 21.5, p-value = 0.1168
## alternative hypothesis: true location shift is not equal to 0

### 5d. Bootstrap to determine whether number of publications differed between male and female associate professors in Theoretical discipline

#### Differences in the MEAN number of publications

library(simpleboot)

## Simple Bootstrap Routines (1.1-7)

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1

## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# Split the subgroup by sex. filter() is called by its qualified name because
# dplyr::filter() masks stats::filter().
male <- dplyr::filter(Assoc.A, Sex == "Male")
female <- dplyr::filter(Assoc.A, Sex == "Female")

# Fixed seed so the resampling below is reproducible.
set.seed(1234)

# Bootstrap (1000 resamples) of the difference in mean NPubs, male minus female.
b.means <- two.boot(male$NPubs, female$NPubs, FUN = mean, R = 1000)
hist(b.means$t, breaks = 20)

# 2.5%, 50% and 97.5% percentiles of the bootstrap distribution.
quantile(b.means$t, probs = c(0.025, 0.5, 0.975))

##      2.5%       50%     97.5% 
##  4.976136 11.568182 18.273295

####Differences in the MEDIAN number of publications

# Reset the seed so the median bootstrap is reproducible by itself.
set.seed(1234)

# Bootstrap (1000 resamples) of the difference in median NPubs between the
# `male` and `female` subsets.
b.medians <- two.boot(male$NPubs, female$NPubs, FUN = median, R = 1000)
hist(b.medians$t, breaks = 20)

# 2.5%, 50% and 97.5% percentiles of the bootstrap distribution.
quantile(b.medians$t, probs = c(0.025, 0.5, 0.975))

##    2.5%     50%   97.5% 
## -0.5000  7.0000 19.5125

TRM Practical Data Analysis - Basic level Lecture 4. Analysis of difference - Numeric data

Shuaiwen Feng

2024-03-25