# 1/ Load the "Salary" data set and preview its first rows.
# NOTE(review): the working directory below is an absolute path on one specific
# machine; adjust it before running the script elsewhere.
setwd("E:/CONG VIEC/Ky nang ngoai/Xu ly so lieu Bang R")
datt <- read.csv(
  "Van Lang R and Machine Learning 2023/Thuc hanh ngay 1/Salaries.csv",
  header = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  na.strings = "NA"
)
head(datt)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 3 AsstProf B 4 3 Male 79750
## 4 4 Prof B 45 39 Male 115000
## 5 5 Prof B 40 41 Male 141500
## 6 6 AssocProf B 6 6 Male 97000
# Put the columns of `datt` on the search path so they can be used by bare name.
# NOTE(review): the calls visible in this script already pass `data = datt` or
# use `datt$...`, and an attached copy does not follow later changes to `datt`;
# consider dropping attach() if no bare column names are needed.
attach(datt)
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
library(ggplot2)
# Descriptive summary of rank, discipline, experience and salary, split by Sex
table1(
  ~ Rank + Discipline + Yrs.service + Yrs.since.phd + Salary | Sex,
  data = datt
)
| | Female (N=39) | Male (N=358) | Overall (N=397) |
|---|---|---|---|
| Rank | |||
| AssocProf | 10 (25.6%) | 54 (15.1%) | 64 (16.1%) |
| AsstProf | 11 (28.2%) | 56 (15.6%) | 67 (16.9%) |
| Prof | 18 (46.2%) | 248 (69.3%) | 266 (67.0%) |
| Discipline | |||
| A | 18 (46.2%) | 163 (45.5%) | 181 (45.6%) |
| B | 21 (53.8%) | 195 (54.5%) | 216 (54.4%) |
| Yrs.service | |||
| Mean (SD) | 11.6 (8.81) | 18.3 (13.2) | 17.6 (13.0) |
| Median [Min, Max] | 10.0 [0, 36.0] | 18.0 [0, 60.0] | 16.0 [0, 60.0] |
| Yrs.since.phd | |||
| Mean (SD) | 16.5 (9.78) | 22.9 (13.0) | 22.3 (12.9) |
| Median [Min, Max] | 17.0 [2.00, 39.0] | 22.0 [1.00, 56.0] | 21.0 [1.00, 56.0] |
| Salary | |||
| Mean (SD) | 101000 (26000) | 115000 (30400) | 114000 (30300) |
| Median [Min, Max] | 104000 [62900, 161000] | 108000 [57800, 232000] | 107000 [57800, 232000] |
# Boxplot of Salary for each Sex, with outliers drawn in red
ggplot(datt, aes(x = Sex, y = Salary, fill = Sex)) +
  geom_boxplot(outlier.colour = "red")
# 3/ Draw the plot with jittered points added; pch > 20 so the points are filled with a different colour per group
# Same boxplot with the individual observations jittered on top.
# Shape 22 is a fillable square, so each point picks up the fill of its Sex
# group while the outline stays purple.
ggplot(datt, aes(x = Sex, y = Salary, fill = Sex)) +
  geom_boxplot(outlier.colour = "red") +
  geom_jitter(shape = 22, colour = "purple", alpha = 0.6) +
  theme_classic()
# Polish the plot: larger outliers and points, plus axis label and title
ggplot(datt, aes(x = Sex, y = Salary, fill = Sex)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 2) +
  geom_jitter(shape = 21, size = 2.2, colour = "purple", alpha = 0.6) +
  labs(
    x = "",
    y = "Mức lương",
    title = "So sánh lương theo giới tính"
  ) +
  theme_classic()
library(beanplot)
## Warning: package 'beanplot' was built under R version 4.3.1
# Bean plot of Salary for each Sex; `names` supplies the group labels
beanplot(
  datt$Salary ~ datt$Sex,
  names = c("Nữ", "Nam"),
  main = "So sanh luong theo gioi tinh",
  ylab = "Muc Luong",
  col = c("khaki1", "coral", "lightgreen"),
  border = "cyan"
)
## log="y" selected
# Plot Salary by Rank with the legend removed
ggplot(datt, aes(x = Rank, y = Salary, fill = Rank)) +
  geom_boxplot(outlier.colour = "red", outlier.shape = 19) +
  geom_jitter(shape = 21, size = 2.5, alpha = 0.3) +
  theme_classic() +
  theme(legend.position = "none")

# Same plot, with the box outlines drawn in dark blue
ggplot(datt, aes(x = Rank, y = Salary, fill = Rank)) +
  geom_boxplot(
    colour = "darkblue",
    outlier.colour = "red",
    outlier.shape = 19
  ) +
  geom_jitter(shape = 21, size = 2.5, alpha = 0.3) +
  theme_classic() +
  theme(legend.position = "none")
# Vietnamese display label for each academic rank, listed from junior to senior.
rank_labels <- c(
  AsstProf  = "Tro ly",
  AssocProf = "Pho giao su",
  Prof      = "Giao su"
)

# Translate every row's Rank through the lookup table in one step; a Rank that
# is missing from the table becomes NA.
# The factor levels follow the junior-to-senior order of the table, so plots
# show Tro ly, Pho giao su, Giao su (the previous level order placed Giao su
# before Pho giao su).
datt$Loai <- factor(
  unname(rank_labels[as.character(datt$Rank)]),
  levels = unname(rank_labels)
)
head(datt, 13)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary Loai
## 1 1 Prof B 19 18 Male 139750 Giao su
## 2 2 Prof B 20 16 Male 173200 Giao su
## 3 3 AsstProf B 4 3 Male 79750 Tro ly
## 4 4 Prof B 45 39 Male 115000 Giao su
## 5 5 Prof B 40 41 Male 141500 Giao su
## 6 6 AssocProf B 6 6 Male 97000 Pho giao su
## 7 7 Prof B 30 23 Male 175000 Giao su
## 8 8 Prof B 45 45 Male 147765 Giao su
## 9 9 Prof B 21 20 Male 119250 Giao su
## 10 10 Prof B 18 18 Female 129000 Giao su
## 11 11 AssocProf B 12 8 Male 119800 Pho giao su
## 12 12 AsstProf B 7 2 Male 79800 Tro ly
## 13 13 AsstProf B 1 1 Male 77700 Tro ly
# Boxplot of Salary by the translated rank label (Loai), legend hidden
ggplot(datt, aes(x = Loai, y = Salary, fill = Loai)) +
  geom_boxplot(
    colour = "darkblue",
    outlier.colour = "red",
    outlier.size = 2,
    outlier.alpha = 0.3
  ) +
  geom_jitter(shape = 22, size = 2, alpha = 0.5) +
  theme_minimal() +
  theme(legend.position = "none")
# Ve bieu do violin co so sanh cac nhom
library(ggstatsplot)
## Warning: package 'ggstatsplot' was built under R version 4.3.1
## You can cite this package as:
## Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
## Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Salary compared across the academic ranks
ggbetweenstats(
  title = "Distribution of Salary across Rank",
  data = datt, x = Rank, y = Salary
)

# Salary compared between the sexes
ggbetweenstats(
  title = "Distribution of Salary across Sex",
  data = datt, x = Sex, y = Salary
)

# Years of service compared between the sexes
ggbetweenstats(
  title = "Distribution of Year of service across Sex",
  data = datt, x = Sex, y = Yrs.service
)
library(afex)
## Warning: package 'afex' was built under R version 4.3.1
## Loading required package: lme4
## Loading required package: Matrix
## ************
## Welcome to afex. For support visit: http://afex.singmann.science/
## - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
## - Methods for calculating p-values with mixed(): 'S', 'KR', 'LRT', and 'PB'
## - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
## - Get and set global package options with: afex_options()
## - Set sum-to-zero contrasts globally: set_sum_contrasts()
## - For example analyses see: browseVignettes("afex")
## ************
##
## Attaching package: 'afex'
## The following object is masked from 'package:lme4':
##
## lmer
library(WRS2)
## Warning: package 'WRS2' was built under R version 4.3.1
library(ggstatsplot)
# Histogram of Salary for each Sex with a nonparametric one-sample test.
grouped_gghistostats(
  data = datt,
  x = Salary,
  # Reference value for the one-sample test. The previous value of 50 lies far
  # below every salary in the data, so the test carried no information; each
  # group is now compared with the overall sample median.
  # NOTE(review): replace with a substantive reference salary if one is intended.
  test.value = median(datt$Salary, na.rm = TRUE),
  type = "nonparametric",
  # Salary is recorded in plain US dollars, not millions
  xlab = "Salary (in US$)",
  grouping.var = Sex,
  normal.curve = TRUE,
  normal.curve.args = list(color = "red", size = 1),
  ggtheme = ggthemes::theme_tufte(),
  ## modify the defaults from `{ggstatsplot}` for each plot
  plotgrid.args = list(nrow = 1),
  annotation.args = list(title = "Salary for different Sex")
)
# Scatter plot of Salary against years of service
ggscatterstats(
  data = datt,
  x = Yrs.service,
  y = Salary,
  xlab = "Time in Service (in years)", # fixed the "Sevice" typo in the axis label
  ylab = "Amount of Salary (in USD)",
  title = "Understanding Salary and time in service"
)
## Registered S3 method overwritten by 'ggside':
## method from
## +.gg ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.