#install.packages(c("readxl", "tidyverse", "dplyr", "ggplot2", "gridExtra", "GGally", "DescTools", "table1", "compareGroups", "simpleboot", "epiDisplay", "Publish", "rms"), dependencies = T)
# Load the professorial salary data from its CSV file, then report the
# number of rows and columns that were read.
# NOTE(review): the path below is absolute and machine-specific; the script
# only runs where this exact file location exists.
salary <- read.csv(
  file = "C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2023_Spring semester\\Data\\Professorial Salaries.csv"
)
dim(salary)
## [1] 397 9
# Preview the first six rows (six is head()'s default, spelled out here).
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
# Preview the last six rows (six is tail()'s default, spelled out here).
tail(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 392 392 Prof A 30 19 Male 6 27 151292
## 393 393 Prof A 33 30 Male 19 83 103106
## 394 394 Prof A 31 19 Male 11 49 150564
## 395 395 Prof A 42 25 Male 13 14 101738
## 396 396 Prof A 25 15 Male 3 36 95329
## 397 397 AsstProf A 8 4 Male 8 34 81035
# Cách đơn giản:
# Encode academic rank as a numeric code: AsstProf = 1, AssocProf = 2, Prof = 3.
# match() returns each rank's position in the ordered vector below (NA for any
# value not listed); as.numeric() keeps the column double rather than integer.
salary$rank.num <- as.numeric(
  match(salary$Rank, c("AsstProf", "AssocProf", "Prof"))
)
# Cross-tabulate the code against the original label to confirm the mapping.
table(salary$rank.num, salary$Rank)
##
## AssocProf AsstProf Prof
## 1 0 67 0
## 2 64 0 0
## 3 0 0 266
# Sử dụng gói tidyverse:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)
# tidyverse approach: the same rank-to-number coding, built with
# dplyr::case_when(). The three conditions are mutually exclusive, so their
# order does not matter; rows matching none of them get NA.
salary <- dplyr::mutate(
  salary,
  rank.sum2 = dplyr::case_when(
    Rank == "Prof" ~ 3,
    Rank == "AssocProf" ~ 2,
    Rank == "AsstProf" ~ 1
  )
)
# Check the coding against the original labels.
table(salary$rank.sum2, salary$Rank)
##
## AssocProf AsstProf Prof
## 1 0 67 0
## 2 64 0 0
## 3 0 0 266
# Simple approach (base R): flag salaries of 130000 or more as "Yes".
# Variant 1 -- fill the column piecewise through logical masks. The two masks
# are disjoint, so the order of these assignments is irrelevant.
salary$salary.high[salary$Salary >= 130000] <- "Yes"
salary$salary.high[salary$Salary < 130000] <- "No"
# Variant 2 -- the same result in a single vectorised call; this overwrites
# the column produced by variant 1.
salary$salary.high <- ifelse(salary$Salary >= 130000, "Yes", "No")
# Using the tidyverse: build the same flag with dplyr::case_when().
salary <- dplyr::mutate(
  salary,
  salary.high2 = dplyr::case_when(
    Salary >= 130000 ~ "Yes",
    Salary < 130000 ~ "No"
  )
)
# Cross-tabulate the base-R flag against the tidyverse flag.
table(salary$salary.high, salary$salary.high2)
##
## No Yes
## No 287 0
## Yes 0 110
# Keep only the rows for full professors (all columns retained).
# %in% yields FALSE, never NA, for a missing Rank, so such rows are dropped.
salary.prof <- salary[salary$Rank %in% "Prof", , drop = FALSE]
dim(salary.prof)
## [1] 266 13
# Confirm that "Prof" is the only rank left in the subset.
table(salary.prof[["Rank"]])
##
## Prof
## 266
# Same professor-only subset, built with dplyr. The namespace is spelled out
# because dplyr::filter() masks stats::filter().
salary.prof2 <- dplyr::filter(salary, Rank == "Prof")
dim(salary.prof2)
## [1] 266 13
# Female staff only, restricted to the four columns listed below.
# %in% yields FALSE, never NA, for a missing Sex, so such rows are dropped.
salary.women <- salary[
  salary$Sex %in% "Female",
  c("Yrs.since.phd", "Yrs.service", "Sex", "Salary"),
  drop = FALSE
]
dim(salary.women)
## [1] 39 4
# Preview the first six rows of the female-only subset.
head(salary.women, n = 6L)
## Yrs.since.phd Yrs.service Sex Salary
## 10 18 18 Female 129000
## 20 39 36 Female 137000
## 25 13 8 Female 74830
## 35 4 2 Female 80225
## 36 5 0 Female 77000
## 48 23 19 Female 151768
# Per-column summary of the full data set, including the derived columns.
summary(object = salary)
## ID Rank Discipline Yrs.since.phd
## Min. : 1 Length:397 Length:397 Min. : 1.00
## 1st Qu.:100 Class :character Class :character 1st Qu.:12.00
## Median :199 Mode :character Mode :character Median :21.00
## Mean :199 Mean :22.31
## 3rd Qu.:298 3rd Qu.:32.00
## Max. :397 Max. :56.00
## Yrs.service Sex NPubs Ncits
## Min. : 0.00 Length:397 Min. : 1.00 Min. : 1.00
## 1st Qu.: 7.00 Class :character 1st Qu.: 8.00 1st Qu.:28.00
## Median :16.00 Mode :character Median :13.00 Median :35.00
## Mean :17.61 Mean :18.15 Mean :40.22
## 3rd Qu.:27.00 3rd Qu.:26.00 3rd Qu.:50.00
## Max. :60.00 Max. :69.00 Max. :90.00
## Salary rank.num rank.sum2 salary.high
## Min. : 57800 Min. :1.000 Min. :1.000 Length:397
## 1st Qu.: 91000 1st Qu.:2.000 1st Qu.:2.000 Class :character
## Median :107300 Median :3.000 Median :3.000 Mode :character
## Mean :113706 Mean :2.501 Mean :2.501
## 3rd Qu.:134185 3rd Qu.:3.000 3rd Qu.:3.000
## Max. :231545 Max. :3.000 Max. :3.000
## salary.high2
## Length:397
## Class :character
## Mode :character
##
##
##
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive table of rank, discipline, experience, output and salary,
# split by the variable named after `|` (Sex).
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary | Sex,
  data = salary
)
| Female (N=39) | Male (N=358) | Overall (N=397) |
---|---|---|---|
Rank | |||
AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
Discipline | |||
A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
Yrs.since.phd | |||
Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
Yrs.service | |||
Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
NPubs | |||
Mean (SD) | 20.2 (14.4) | 17.9 (13.9) | 18.2 (14.0) |
Median [Min, Max] | 18.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | |||
Mean (SD) | 40.7 (16.2) | 40.2 (17.0) | 40.2 (16.9) |
Median [Min, Max] | 36.0 [14.0, 70.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | |||
Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
# Same descriptive table by Sex, now including the derived columns rank.num
# and salary.high.
# NOTE(review): rank.num is numeric, so it is summarised by mean/SD and
# median rather than by counts per category.
table1(
  ~ Rank + rank.num + Discipline + Yrs.since.phd + Yrs.service + NPubs +
    Ncits + Salary + salary.high | Sex,
  data = salary
)
| Female (N=39) | Male (N=358) | Overall (N=397) |
---|---|---|---|
Rank | |||
AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
rank.num | |||
Mean (SD) | 2.18 (0.854) | 2.54 (0.750) | 2.50 (0.767) |
Median [Min, Max] | 2.00 [1.00, 3.00] | 3.00 [1.00, 3.00] | 3.00 [1.00, 3.00] |
Discipline | |||
A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
Yrs.since.phd | |||
Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
Yrs.service | |||
Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
NPubs | |||
Mean (SD) | 20.2 (14.4) | 17.9 (13.9) | 18.2 (14.0) |
Median [Min, Max] | 18.0 [1.00, 50.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | |||
Mean (SD) | 40.7 (16.2) | 40.2 (17.0) | 40.2 (16.9) |
Median [Min, Max] | 36.0 [14.0, 70.0] | 35.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | |||
Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
salary.high | |||
No | 34 (87.2%) | 253 (70.7%) | 287 (72.3%) |
Yes | 5 (12.8%) | 105 (29.3%) | 110 (27.7%) |
library(compareGroups)
# Compare the listed variables between the levels of Sex with compareGroups()
# and render the comparison as a descriptive table.
createTable(
  compareGroups(
    Sex ~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary,
    data = salary
  )
)
##
## --------Summary descriptives table by 'Sex'---------
##
## _____________________________________________________
## Female Male p.overall
## N=39 N=358
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Rank: 0.014
## AssocProf 10 (25.6%) 54 (15.1%)
## AsstProf 11 (28.2%) 56 (15.6%)
## Prof 18 (46.2%) 248 (69.3%)
## Discipline: 1.000
## A 18 (46.2%) 163 (45.5%)
## B 21 (53.8%) 195 (54.5%)
## Yrs.since.phd 16.5 (9.78) 22.9 (13.0) <0.001
## Yrs.service 11.6 (8.81) 18.3 (13.2) <0.001
## NPubs 20.2 (14.4) 17.9 (13.9) 0.352
## Ncits 40.7 (16.2) 40.2 (17.0) 0.851
## Salary 101002 (25952) 115090 (30437) 0.003
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯