Basic data analysis workshop - BTH (8-10/12/2023)

Ngày 1 (8/12/2023) - Giới thiệu R

Việc 2. Cài đặt các gói phân tích (packages cần thiết)

#install.packages(c("readxl", "tidyverse", "dplyr", "ggplot2", "gridExtra", "GGally", "DescTools", "table1", "compareGroups", "simpleboot", "epiDisplay", "Publish", "rms"), dependencies = TRUE)

Việc 3. Đọc dữ liệu vào R

# Import the professorial-salary CSV into a data frame. The path is an absolute
# Windows path on the author's machine; adjust it when running elsewhere.
salary <- read.csv(
  file = "C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2023_Spring semester\\Data\\Professorial Salaries.csv"
)

# Number of rows and columns that were read.
dim(salary)
## [1] 397   9
# Preview the first rows of the data.
head(salary)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000
# Show the last rows of the data frame to check the end of the file was read.
tail(salary)
##      ID     Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 392 392     Prof          A            30          19 Male     6    27 151292
## 393 393     Prof          A            33          30 Male    19    83 103106
## 394 394     Prof          A            31          19 Male    11    49 150564
## 395 395     Prof          A            42          25 Male    13    14 101738
## 396 396     Prof          A            25          15 Male     3    36  95329
## 397 397 AsstProf          A             8           4 Male     8    34  81035

Việc 4. Tạo biến số mới

4.1 Biến số rank.num

Cách đơn giản

# Encode academic rank as a number: AsstProf = 1, AssocProf = 2, Prof = 3.
# match() returns the position of each Rank value in the ordered vector below;
# a Rank value not listed there (or a missing one) yields NA.
salary$rank.num <- as.numeric(
  match(salary$Rank, c("AsstProf", "AssocProf", "Prof"))
)

# Cross-tabulate the numeric code against the original labels to verify it.
table(salary$rank.num, salary$Rank)
##    
##     AssocProf AsstProf Prof
##   1         0       67    0
##   2        64        0    0
##   3         0        0  266

Sử dụng gói tidyverse

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(dplyr)

# Same rank coding done with dplyr: case_when() evaluates the conditions per
# row, and a row matching none of them gets NA.
# NOTE(review): the column name `rank.sum2` looks like a typo for `rank.num2`
# (compare `salary.high` / `salary.high2`) -- confirm before renaming.
salary <- mutate(
  salary,
  rank.sum2 = case_when(
    Rank == "AsstProf" ~ 1,
    Rank == "AssocProf" ~ 2,
    Rank == "Prof" ~ 3
  )
)

# Cross-tabulate the new code against the original labels to verify it.
table(salary$rank.sum2, salary$Rank)
##    
##     AssocProf AsstProf Prof
##   1         0       67    0
##   2        64        0    0
##   3         0        0  266

4.2 Biến số salary.high

# Cách đơn giản:

# Flag salaries of at least 130000 as "Yes" and lower salaries as "No".
# Two equivalent base-R approaches are shown; (b) overwrites the result of (a).

# (a) Logical-index assignment, one category at a time.
salary$salary.high[salary$Salary >= 130000] <- "Yes"
salary$salary.high[salary$Salary < 130000] <- "No"

# (b) Vectorised ifelse() in a single step.
salary$salary.high <- ifelse(salary$Salary < 130000, "No", "Yes")

# Sử dụng tidyverse

# dplyr version of the high-salary flag, using the same 130000 cut-off.
# The two conditions are mutually exclusive, so their order does not matter.
salary <- mutate(
  salary,
  salary.high2 = case_when(
    Salary >= 130000 ~ "Yes",
    Salary < 130000 ~ "No"
  )
)

# The base-R and dplyr flags should agree: all counts on the diagonal.
table(salary$salary.high, salary$salary.high2)
##      
##        No Yes
##   No  287   0
##   Yes   0 110

Việc 5. Chọn tập dữ liệu con

5.1 Tập dữ liệu chỉ gồm GS

# Keep only the rows for full professors (base R). which() drops rows whose
# comparison is NA, and drop = FALSE keeps the result a data frame.
salary.prof <- salary[which(salary$Rank == "Prof"), , drop = FALSE]
dim(salary.prof)
## [1] 266  13
# Confirm that only the "Prof" rank remains in the subset.
table(salary.prof$Rank)
## 
## Prof 
##  266
# Same professor-only subset using dplyr. The explicit namespace makes clear
# this is dplyr's filter(), not the stats::filter() it masks.
salary.prof2 <- dplyr::filter(salary, Rank == "Prof")
dim(salary.prof2)
## [1] 266  13

5.2 Tập dữ liệu gồm Nữ và 1 vài biến số

# Subset to female staff and keep only four columns of interest.
# which() drops rows whose Sex comparison is NA.
salary.women <- salary[
  which(salary$Sex == "Female"),
  c("Yrs.since.phd", "Yrs.service", "Sex", "Salary"),
  drop = FALSE
]
dim(salary.women)
## [1] 39  4
# Preview the first rows of the subset.
head(salary.women)
##    Yrs.since.phd Yrs.service    Sex Salary
## 10            18          18 Female 129000
## 20            39          36 Female 137000
## 25            13           8 Female  74830
## 35             4           2 Female  80225
## 36             5           0 Female  77000
## 48            23          19 Female 151768

Việc 6. Tóm tắt dữ liệu

# Per-column summary of the whole data frame, including the derived columns
# (rank.num, rank.sum2, salary.high, salary.high2) created above.
summary(salary)
##        ID          Rank            Discipline        Yrs.since.phd  
##  Min.   :  1   Length:397         Length:397         Min.   : 1.00  
##  1st Qu.:100   Class :character   Class :character   1st Qu.:12.00  
##  Median :199   Mode  :character   Mode  :character   Median :21.00  
##  Mean   :199                                         Mean   :22.31  
##  3rd Qu.:298                                         3rd Qu.:32.00  
##  Max.   :397                                         Max.   :56.00  
##   Yrs.service        Sex                NPubs           Ncits      
##  Min.   : 0.00   Length:397         Min.   : 1.00   Min.   : 1.00  
##  1st Qu.: 7.00   Class :character   1st Qu.: 8.00   1st Qu.:28.00  
##  Median :16.00   Mode  :character   Median :13.00   Median :35.00  
##  Mean   :17.61                      Mean   :18.15   Mean   :40.22  
##  3rd Qu.:27.00                      3rd Qu.:26.00   3rd Qu.:50.00  
##  Max.   :60.00                      Max.   :69.00   Max.   :90.00  
##      Salary          rank.num       rank.sum2     salary.high       
##  Min.   : 57800   Min.   :1.000   Min.   :1.000   Length:397        
##  1st Qu.: 91000   1st Qu.:2.000   1st Qu.:2.000   Class :character  
##  Median :107300   Median :3.000   Median :3.000   Mode  :character  
##  Mean   :113706   Mean   :2.501   Mean   :2.501                     
##  3rd Qu.:134185   3rd Qu.:3.000   3rd Qu.:3.000                     
##  Max.   :231545   Max.   :3.000   Max.   :3.000                     
##  salary.high2      
##  Length:397        
##  Class :character  
##  Mode  :character  
##                    
##                    
## 

Việc 7. Mô tả dữ liệu theo giới tính

7.1 Các biến số trong tập dữ liệu gốc:

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive table of the original variables, stratified by Sex.
# Variables left of `|` are summarised; the variable right of it defines strata.
table1(
  ~ Rank + Discipline + Yrs.since.phd + Yrs.service +
    NPubs + Ncits + Salary | Sex,
  data = salary
)
Female
(N=39)
Male
(N=358)
Overall
(N=397)
Rank
AssocProf 10 (25.6%) 54 (15.1%) 64 (16.1%)
AsstProf 11 (28.2%) 56 (15.6%) 67 (16.9%)
Prof 18 (46.2%) 248 (69.3%) 266 (67.0%)
Discipline
A 18 (46.2%) 163 (45.5%) 181 (45.6%)
B 21 (53.8%) 195 (54.5%) 216 (54.4%)
Yrs.since.phd
Mean (SD) 16.5 (9.78) 22.9 (13.0) 22.3 (12.9)
Median [Min, Max] 17.0 [2.00, 39.0] 22.0 [1.00, 56.0] 21.0 [1.00, 56.0]
Yrs.service
Mean (SD) 11.6 (8.81) 18.3 (13.2) 17.6 (13.0)
Median [Min, Max] 10.0 [0, 36.0] 18.0 [0, 60.0] 16.0 [0, 60.0]
NPubs
Mean (SD) 20.2 (14.4) 17.9 (13.9) 18.2 (14.0)
Median [Min, Max] 18.0 [1.00, 50.0] 13.0 [1.00, 69.0] 13.0 [1.00, 69.0]
Ncits
Mean (SD) 40.7 (16.2) 40.2 (17.0) 40.2 (16.9)
Median [Min, Max] 36.0 [14.0, 70.0] 35.0 [1.00, 90.0] 35.0 [1.00, 90.0]
Salary
Mean (SD) 101000 (26000) 115000 (30400) 114000 (30300)
Median [Min, Max] 104000 [62900, 161000] 108000 [57800, 232000] 107000 [57800, 232000]

7.2 Thêm các biến số mới tạo:

# Same descriptive table by Sex, now including the derived variables
# rank.num and salary.high.
table1(
  ~ Rank + rank.num + Discipline + Yrs.since.phd + Yrs.service +
    NPubs + Ncits + Salary + salary.high | Sex,
  data = salary
)
Female
(N=39)
Male
(N=358)
Overall
(N=397)
Rank
AssocProf 10 (25.6%) 54 (15.1%) 64 (16.1%)
AsstProf 11 (28.2%) 56 (15.6%) 67 (16.9%)
Prof 18 (46.2%) 248 (69.3%) 266 (67.0%)
rank.num
Mean (SD) 2.18 (0.854) 2.54 (0.750) 2.50 (0.767)
Median [Min, Max] 2.00 [1.00, 3.00] 3.00 [1.00, 3.00] 3.00 [1.00, 3.00]
Discipline
A 18 (46.2%) 163 (45.5%) 181 (45.6%)
B 21 (53.8%) 195 (54.5%) 216 (54.4%)
Yrs.since.phd
Mean (SD) 16.5 (9.78) 22.9 (13.0) 22.3 (12.9)
Median [Min, Max] 17.0 [2.00, 39.0] 22.0 [1.00, 56.0] 21.0 [1.00, 56.0]
Yrs.service
Mean (SD) 11.6 (8.81) 18.3 (13.2) 17.6 (13.0)
Median [Min, Max] 10.0 [0, 36.0] 18.0 [0, 60.0] 16.0 [0, 60.0]
NPubs
Mean (SD) 20.2 (14.4) 17.9 (13.9) 18.2 (14.0)
Median [Min, Max] 18.0 [1.00, 50.0] 13.0 [1.00, 69.0] 13.0 [1.00, 69.0]
Ncits
Mean (SD) 40.7 (16.2) 40.2 (17.0) 40.2 (16.9)
Median [Min, Max] 36.0 [14.0, 70.0] 35.0 [1.00, 90.0] 35.0 [1.00, 90.0]
Salary
Mean (SD) 101000 (26000) 115000 (30400) 114000 (30300)
Median [Min, Max] 104000 [62900, 161000] 108000 [57800, 232000] 107000 [57800, 232000]
salary.high
No 34 (87.2%) 253 (70.7%) 287 (72.3%)
Yes 5 (12.8%) 105 (29.3%) 110 (27.7%)

Việc 8. So sánh đặc điểm mẫu nghiên cứu giữa nam và nữ GS

library(compareGroups)
# Compare characteristics between female and male staff: compareGroups()
# builds the by-Sex comparisons and createTable() formats them into a table
# with an overall p-value column.
createTable(
  compareGroups(
    Sex ~ Rank + Discipline + Yrs.since.phd + Yrs.service +
      NPubs + Ncits + Salary,
    data = salary
  )
)
## 
## --------Summary descriptives table by 'Sex'---------
## 
## _____________________________________________________ 
##                   Female          Male      p.overall 
##                    N=39          N=358                
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯ 
## Rank:                                         0.014   
##     AssocProf   10 (25.6%)     54 (15.1%)             
##     AsstProf    11 (28.2%)     56 (15.6%)             
##     Prof        18 (46.2%)    248 (69.3%)             
## Discipline:                                   1.000   
##     A           18 (46.2%)    163 (45.5%)             
##     B           21 (53.8%)    195 (54.5%)             
## Yrs.since.phd  16.5 (9.78)    22.9 (13.0)    <0.001   
## Yrs.service    11.6 (8.81)    18.3 (13.2)    <0.001   
## NPubs          20.2 (14.4)    17.9 (13.9)     0.352   
## Ncits          40.7 (16.2)    40.2 (17.0)     0.851   
## Salary        101002 (25952) 115090 (30437)   0.003   
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯

Việc 9. Ghi lại các lệnh trong R markdown và chia sẻ trên mạng rpubs.com (https://rpubs.com/ThachTran/1116772)