Các hàm phổ biến với tidyverse

Nhập data: Salary

# Load the salaries data set from a local CSV file whose first row holds the
# column names. `TRUE` is spelled out because `T` is an ordinary variable that
# can be reassigned, whereas `TRUE` is a reserved word.
# NOTE(review): the path is absolute and machine-specific; it is kept as-is,
# so this line only works where that exact file exists.
datt <- read.csv(
  "E:/CONG VIEC/Ky nang ngoai/Xu ly so lieu Bang R/Van Lang R and Machine Learning 2023/Thuc hanh ngay 1/Salaries.csv",
  header = TRUE
)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.2     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ lubridate 1.9.2     ✔ tibble    3.2.1
## ✔ purrr     1.0.1     ✔ tidyr     1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(table1)
## 
## Attaching package: 'table1'
## 
## The following objects are masked from 'package:base':
## 
##     units, units<-
# Descriptive summary of rank, discipline, years of service, years since PhD
# and salary, with one column per level of Sex.
table1(
  ~ Rank + Discipline + Yrs.service + Yrs.since.phd + Salary | Sex,
  data = datt
)
| | Female (N=39) | Male (N=358) | Overall (N=397) |
|---|---|---|---|
| **Rank** | | | |
| AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
| AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
| Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
| **Discipline** | | | |
| A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
| B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
| **Yrs.service** | | | |
| Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
| Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
| **Yrs.since.phd** | | | |
| Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
| Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
| **Salary** | | | |
| Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
| Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |

Chọn biến (variable) bằng select

# Keep only five columns of `datt`, then preview the first rows.
sele <- select(datt, Rank, Discipline, Yrs.since.phd, Sex, Salary)
head(sele)
##        Rank Discipline Yrs.since.phd  Sex Salary
## 1      Prof          B            19 Male 139750
## 2      Prof          B            20 Male 173200
## 3  AsstProf          B             4 Male  79750
## 4      Prof          B            45 Male 115000
## 5      Prof          B            40 Male 141500
## 6 AssocProf          B             6 Male  97000

Chọn dòng (observations) bằng filter

# Rows of `datt` whose Rank equals "Prof"; preview the first rows.
fil1 <- filter(datt, Rank == "Prof")
head(fil1)
##   ID Rank Discipline Yrs.since.phd Yrs.service  Sex Salary
## 1  1 Prof          B            19          18 Male 139750
## 2  2 Prof          B            20          16 Male 173200
## 3  4 Prof          B            45          39 Male 115000
## 4  5 Prof          B            40          41 Male 141500
## 5  7 Prof          B            30          23 Male 175000
## 6  8 Prof          B            45          45 Male 147765
# Rows of `datt` whose Rank equals "AsstProf"; preview the first rows.
fil2 <- filter(datt, Rank == "AsstProf")
head(fil2)
##   ID     Rank Discipline Yrs.since.phd Yrs.service  Sex Salary
## 1  3 AsstProf          B             4           3 Male  79750
## 2 12 AsstProf          B             7           2 Male  79800
## 3 13 AsstProf          B             1           1 Male  77700
## 4 14 AsstProf          B             2           0 Male  78000
## 5 28 AsstProf          B             5           3 Male  82379
## 6 29 AsstProf          B            11           0 Male  77000

Nối data bằng rbind

# Stack the two filtered subsets row-wise: fil1's rows first, then fil2's.
fil <- rbind(fil1, fil2)
head(fil)
##   ID Rank Discipline Yrs.since.phd Yrs.service  Sex Salary
## 1  1 Prof          B            19          18 Male 139750
## 2  2 Prof          B            20          16 Male 173200
## 3  4 Prof          B            45          39 Male 115000
## 4  5 Prof          B            40          41 Male 141500
## 5  7 Prof          B            30          23 Male 175000
## 6  8 Prof          B            45          45 Male 147765

Sắp xếp thứ tự bằng arrange

# Sort ascending by years since PhD; ties are ordered by salary.
fil <- fil %>%
  arrange(Yrs.since.phd, Salary)
head(fil)
##    ID     Rank Discipline Yrs.since.phd Yrs.service    Sex Salary
## 1  50 AsstProf          B             1           1   Male  70768
## 2  13 AsstProf          B             1           1   Male  77700
## 3 158 AsstProf          B             1           0   Male  88000
## 4 165 AsstProf          B             1           0   Male  88795
## 5 128 AsstProf          A             2           0 Female  72500
## 6  14 AsstProf          B             2           0   Male  78000

Tạo biến mới: tạo biến SumSalary trong dataset mới tên “datnew”

# Copy `fil` into `datnew` with an extra column SumSalary, computed row by row
# as Salary multiplied by Yrs.service.
datnew <- fil %>%
  mutate(SumSalary = Salary * Yrs.service)
head(datnew)
##    ID     Rank Discipline Yrs.since.phd Yrs.service    Sex Salary SumSalary
## 1  50 AsstProf          B             1           1   Male  70768     70768
## 2  13 AsstProf          B             1           1   Male  77700     77700
## 3 158 AsstProf          B             1           0   Male  88000         0
## 4 165 AsstProf          B             1           0   Male  88795         0
## 5 128 AsstProf          A             2           0 Female  72500         0
## 6  14 AsstProf          B             2           0   Male  78000         0

Dùng group_by và summarise

# Summary per (Rank, Discipline) combination: number of rows, mean salary and
# mean years of service, ignoring missing values in both means.
chongroup <- group_by(datt, Rank, Discipline)
t <- summarise(
  chongroup,
  count = n(),
  # `TRUE` rather than `T`: `T` is a plain variable that can be reassigned.
  mean.Salary = mean(Salary, na.rm = TRUE),
  mean.yearsv = mean(Yrs.service, na.rm = TRUE),
  # Make the grouping of the result explicit. "drop_last" keeps the result
  # grouped by Rank, exactly as the implicit default did, but without the
  # "`summarise()` has grouped output by 'Rank'" message.
  .groups = "drop_last"
)
# NOTE(review): the name `t` is also the name of base R's transpose function;
# it is kept here because other code may refer to this object.
t
## # A tibble: 6 × 5
## # Groups:   Rank [3]
##   Rank      Discipline count mean.Salary mean.yearsv
##   <chr>     <chr>      <int>       <dbl>       <dbl>
## 1 AssocProf A             26      83061.       13.5 
## 2 AssocProf B             38     101276.       10.9 
## 3 AsstProf  A             24      73936.        2.42
## 4 AsstProf  B             43      84594.        2.35
## 5 Prof      A            131     119948.       24.4 
## 6 Prof      B            135     133394.       21.2

Lấy mẫu theo số lượng

# Draw 30 rows of `datt` at random and preview them.
# NOTE(review): no set.seed() call is visible in this part of the script, so
# the sampled rows presumably differ between runs; confirm.
nsample <- datt %>%
  sample_n(size = 30)
head(nsample)
##    ID      Rank Discipline Yrs.since.phd Yrs.service    Sex Salary
## 1 201  AsstProf          B             4           4   Male  92700
## 2 199      Prof          B            34          33   Male 189409
## 3  99      Prof          B            30          14   Male 102235
## 4 229      Prof          A            16          11   Male  88175
## 5 133 AssocProf          A            10           8 Female  77500
## 6  83      Prof          B            22          20   Male 144640

Lấy mẫu theo % (0.4 = 40%)

# Draw a random 40% of the rows of `datt` and preview them.
# NOTE(review): no set.seed() call is visible in this part of the script, so
# the sampled rows presumably differ between runs; confirm.
samplefrc <- datt %>%
  sample_frac(size = 0.4)
head(samplefrc)
##    ID     Rank Discipline Yrs.since.phd Yrs.service    Sex Salary
## 1  86     Prof          B            15          14   Male 132825
## 2  36 AsstProf          B             5           0 Female  77000
## 3 358     Prof          A            39          35   Male 107309
## 4  51     Prof          B            28          28   Male 126621
## 5 262     Prof          A            45          45   Male 107550
## 6 318     Prof          B            46          45   Male  67559