# Load the salaries data set from a hard-coded absolute path on drive E:;
# the first line of the CSV file supplies the column names.
# NOTE(review): this path only resolves on a machine with the same folder
# layout -- consider a project-relative path.
datt <- read.csv(
  "E:/CONG VIEC/Ky nang ngoai/Xu ly so lieu Bang R/Van Lang R and Machine Learning 2023/Thuc hanh ngay 1/Salaries.csv",
  header = TRUE
)
library(ggplot2)
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ lubridate 1.9.2 ✔ tibble 3.2.1
## ✔ purrr 1.0.1 ✔ tidyr 1.3.0
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(table1)
##
## Attaching package: 'table1'
##
## The following objects are masked from 'package:base':
##
## units, units<-
# Descriptive summary of Rank, Discipline, Yrs.service, Yrs.since.phd and
# Salary, with one column per level of Sex (right-hand side of `|`).
table1(
  ~ Rank + Discipline + Yrs.service + Yrs.since.phd + Salary | Sex,
  data = datt
)
| | Female (N=39) | Male (N=358) | Overall (N=397) |
|---|---|---|---|
| Rank | |||
| AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
| AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
| Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
| Discipline | |||
| A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
| B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
| Yrs.service | |||
| Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
| Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
| Yrs.since.phd | |||
| Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
| Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
| Salary | |||
| Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
| Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
# Keep only five columns of `datt`, then preview the first rows.
sele <- select(datt, Rank, Discipline, Yrs.since.phd, Sex, Salary)
head(sele)
## Rank Discipline Yrs.since.phd Sex Salary
## 1 Prof B 19 Male 139750
## 2 Prof B 20 Male 173200
## 3 AsstProf B 4 Male 79750
## 4 Prof B 45 Male 115000
## 5 Prof B 40 Male 141500
## 6 AssocProf B 6 Male 97000
# Rows of `datt` whose Rank is exactly "Prof"; preview the first rows.
fil1 <- filter(datt, Rank == "Prof")
head(fil1)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 4 Prof B 45 39 Male 115000
## 4 5 Prof B 40 41 Male 141500
## 5 7 Prof B 30 23 Male 175000
## 6 8 Prof B 45 45 Male 147765
# Rows of `datt` whose Rank is exactly "AsstProf"; preview the first rows.
fil2 <- filter(datt, Rank == "AsstProf")
head(fil2)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 3 AsstProf B 4 3 Male 79750
## 2 12 AsstProf B 7 2 Male 79800
## 3 13 AsstProf B 1 1 Male 77700
## 4 14 AsstProf B 2 0 Male 78000
## 5 28 AsstProf B 5 3 Male 82379
## 6 29 AsstProf B 11 0 Male 77000
# Stack the two filtered subsets: all rows of `fil1` first, then all rows of
# `fil2`. Preview the first rows of the combined data frame.
fil <- bind_rows(fil1, fil2)
head(fil)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 4 Prof B 45 39 Male 115000
## 4 5 Prof B 40 41 Male 141500
## 5 7 Prof B 30 23 Male 175000
## 6 8 Prof B 45 45 Male 147765
# Sort ascending by Yrs.since.phd, breaking ties by ascending Salary, and
# overwrite `fil` with the sorted result.
fil <- fil %>%
  arrange(Yrs.since.phd, Salary)
head(fil)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 50 AsstProf B 1 1 Male 70768
## 2 13 AsstProf B 1 1 Male 77700
## 3 158 AsstProf B 1 0 Male 88000
## 4 165 AsstProf B 1 0 Male 88795
## 5 128 AsstProf A 2 0 Female 72500
## 6 14 AsstProf B 2 0 Male 78000
# Add a SumSalary column, the row-wise product Salary * Yrs.service, to the
# sorted data and store the result as `datnew`.
datnew <- fil %>%
  mutate(SumSalary = Salary * Yrs.service)
head(datnew)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary SumSalary
## 1 50 AsstProf B 1 1 Male 70768 70768
## 2 13 AsstProf B 1 1 Male 77700 77700
## 3 158 AsstProf B 1 0 Male 88000 0
## 4 165 AsstProf B 1 0 Male 88795 0
## 5 128 AsstProf A 2 0 Female 72500 0
## 6 14 AsstProf B 2 0 Male 78000 0
# Group the full data set by Rank and Discipline, then compute for each group
# the row count, the mean Salary and the mean Yrs.service (NAs ignored).
chongroup <- group_by(datt, Rank, Discipline)
# NOTE(review): `t` shares its name with base::t(); the name is kept because
# other code may refer to it.
t <- summarise(
  chongroup,
  count = n(),
  mean.Salary = mean(Salary, na.rm = TRUE),
  mean.yearsv = mean(Yrs.service, na.rm = TRUE),
  # State the result grouping explicitly (still grouped by Rank) so that
  # summarise() does not emit its "grouped output" message.
  .groups = "drop_last"
)
t
## # A tibble: 6 × 5
## # Groups: Rank [3]
## Rank Discipline count mean.Salary mean.yearsv
## <chr> <chr> <int> <dbl> <dbl>
## 1 AssocProf A 26 83061. 13.5
## 2 AssocProf B 38 101276. 10.9
## 3 AsstProf A 24 73936. 2.42
## 4 AsstProf B 43 84594. 2.35
## 5 Prof A 131 119948. 24.4
## 6 Prof B 135 133394. 21.2
# Draw 30 rows of `datt` at random. No seed is set in the visible code, so
# the selected rows differ from run to run.
nsample <- datt %>%
  sample_n(size = 30)
head(nsample)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 201 AsstProf B 4 4 Male 92700
## 2 199 Prof B 34 33 Male 189409
## 3 99 Prof B 30 14 Male 102235
## 4 229 Prof A 16 11 Male 88175
## 5 133 AssocProf A 10 8 Female 77500
## 6 83 Prof B 22 20 Male 144640
# Draw a random 40% of the rows of `datt`. No seed is set in the visible
# code, so the selected rows differ from run to run.
samplefrc <- datt %>%
  sample_frac(size = 0.4)
head(samplefrc)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 86 Prof B 15 14 Male 132825
## 2 36 AsstProf B 5 0 Female 77000
## 3 358 Prof A 39 35 Male 107309
## 4 51 Prof B 28 28 Male 126621
## 5 262 Prof A 45 45 Male 107550
## 6 318 Prof B 46 45 Male 67559