#1/ Input data “Salary”, xem data bang table1

# Load the salary data set that every table and plot below reads from `datt`.
# NOTE(review): setwd() with an absolute path ties this script to one machine;
# it is kept because the relative CSV path below depends on it.
setwd("E:/CONG VIEC/Ky nang ngoai/Xu ly so lieu Bang R")
datt <- read.csv(
  "Van Lang R and Machine Learning 2023/Thuc hanh ngay 1/Salaries.csv",
  header = TRUE, # spelled out: `T` is an ordinary variable and can be reassigned
  na.strings = "NA"
)
# Preview the first rows to confirm the columns were parsed as expected.
head(datt)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex Salary
## 1  1      Prof          B            19          18 Male 139750
## 2  2      Prof          B            20          16 Male 173200
## 3  3  AsstProf          B             4           3 Male  79750
## 4  4      Prof          B            45          39 Male 115000
## 5  5      Prof          B            40          41 Male 141500
## 6  6 AssocProf          B             6           6 Male  97000
# NOTE(review): attach() copies the columns onto the search path as of this
# point, so columns added to `datt` later are not visible through it. The code
# visible here always passes `data = datt` or uses `datt$...`, so this call
# looks unnecessary - confirm nothing else relies on it before removing.
attach(datt)

Xem data bang table1

library(table1)
## 
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
## 
##     units, units<-
library(ggplot2)
# Descriptive table of rank, discipline, years of service, years since PhD and
# salary, split into columns by Sex.
table1(
  ~ Rank + Discipline + Yrs.service + Yrs.since.phd + Salary | Sex,
  data = datt
)
Female
(N=39)
Male
(N=358)
Overall
(N=397)
Rank
AssocProf 10 (25.6%) 54 (15.1%) 64 (16.1%)
AsstProf 11 (28.2%) 56 (15.6%) 67 (16.9%)
Prof 18 (46.2%) 248 (69.3%) 266 (67.0%)
Discipline
A 18 (46.2%) 163 (45.5%) 181 (45.6%)
B 21 (53.8%) 195 (54.5%) 216 (54.4%)
Yrs.service
Mean (SD) 11.6 (8.81) 18.3 (13.2) 17.6 (13.0)
Median [Min, Max] 10.0 [0, 36.0] 18.0 [0, 60.0] 16.0 [0, 60.0]
Yrs.since.phd
Mean (SD) 16.5 (9.78) 22.9 (13.0) 22.3 (12.9)
Median [Min, Max] 17.0 [2.00, 39.0] 22.0 [1.00, 56.0] 21.0 [1.00, 56.0]
Salary
Mean (SD) 101000 (26000) 115000 (30400) 114000 (30300)
Median [Min, Max] 104000 [62900, 161000] 108000 [57800, 232000] 107000 [57800, 232000]

# 2/ Ve bieu do hop don gian luong theo gioi tinh

# Box plot of Salary for each Sex; boxes are filled by Sex and outlier points
# are drawn in red.
ggplot(datt, aes(Sex, Salary, fill = Sex)) +
  geom_boxplot(outlier.colour = "red")

# 3/ Ve bieu do them cham jitter, pch>20, cac diem phan theo mau khac nhau

# Same box plot with the individual observations overlaid as jittered points.
# Shape 22 is one of the fillable point shapes (21-25), so each point takes its
# fill from Sex while its outline is purple.
ggplot(datt, aes(Sex, Salary, fill = Sex)) +
  geom_boxplot(outlier.colour = "red") +
  theme_classic() +
  geom_jitter(shape = 22, colour = "purple", alpha = 0.6)

# Trang tri, them ten truc va tieu de

# Decorated version: larger outliers and jitter points, an empty x-axis label,
# and a Vietnamese y-axis label and title.
ggplot(datt, aes(Sex, Salary, fill = Sex)) +
  geom_boxplot(outlier.colour = "red", outlier.size = 2) +
  theme_classic() +
  geom_jitter(shape = 21, size = 2.2, colour = "purple", alpha = 0.6) +
  labs(
    x = "",
    y = "Mức lương",
    title = "So sánh lương theo giới tính"
  )

Ve bieu do Beanplot

library(beanplot)
## Warning: package 'beanplot' was built under R version 4.3.1
# Bean plot of Salary grouped by Sex.
# `names` relabels the two groups in the order the groups are plotted.
# NOTE(review): the labels assume the groups come out as Female, Male - confirm.
beanplot(
  datt$Salary ~ datt$Sex,
  col    = c("khaki1", "coral", "lightgreen"),
  border = "cyan",
  main   = "So sanh luong theo gioi tinh",
  ylab   = "Muc Luong",
  names  = c("Nữ", "Nam")
)
## log="y" selected

# Ve bieu do luong theo Rank, bo chu giai legend

# Box plot of Salary by Rank with jittered observations on top.
# The legend is hidden because the x axis already names each rank.
ggplot(datt, aes(Rank, Salary, fill = Rank)) +
  geom_boxplot(outlier.shape = 19, outlier.colour = "red") +
  theme_classic() +
  geom_jitter(shape = 21, size = 2.5, alpha = 0.3) +
  theme(legend.position = "none")

thay mau cho duong vien border trong geom_boxplot: colour="darkblue"

# Salary by Rank again, this time with the box outlines drawn in dark blue.
ggplot(datt, aes(Rank, Salary, fill = Rank)) +
  geom_boxplot(
    colour = "darkblue",
    outlier.shape = 19,
    outlier.colour = "red"
  ) +
  theme_classic() +
  geom_jitter(shape = 21, size = 2.5, alpha = 0.3) +
  theme(legend.position = "none")

thay chu thich tieng Viet, xep cot theo thu tu mong muon: Tro ly, giao su, Pho giao su

# Add `Loai`: the Vietnamese label for each Rank, stored as a factor.
# `levels` lists the Rank codes and `labels` gives the matching label for each;
# the level order (Tro ly, Giao su, Pho giao su) is the order in which groups
# appear when `Loai` is used on a plot axis. Any Rank value outside these three
# codes becomes NA.
datt$Loai <- factor(
  datt$Rank,
  levels = c("AsstProf", "Prof", "AssocProf"),
  labels = c("Tro ly", "Giao su", "Pho giao su")
)
# Show the first 13 rows to check the new column against Rank.
head(datt, n = 13)
##    ID      Rank Discipline Yrs.since.phd Yrs.service    Sex Salary        Loai
## 1   1      Prof          B            19          18   Male 139750     Giao su
## 2   2      Prof          B            20          16   Male 173200     Giao su
## 3   3  AsstProf          B             4           3   Male  79750      Tro ly
## 4   4      Prof          B            45          39   Male 115000     Giao su
## 5   5      Prof          B            40          41   Male 141500     Giao su
## 6   6 AssocProf          B             6           6   Male  97000 Pho giao su
## 7   7      Prof          B            30          23   Male 175000     Giao su
## 8   8      Prof          B            45          45   Male 147765     Giao su
## 9   9      Prof          B            21          20   Male 119250     Giao su
## 10 10      Prof          B            18          18 Female 129000     Giao su
## 11 11 AssocProf          B            12           8   Male 119800 Pho giao su
## 12 12  AsstProf          B             7           2   Male  79800      Tro ly
## 13 13  AsstProf          B             1           1   Male  77700      Tro ly
# Salary by the Vietnamese rank label `Loai`; groups follow the factor's level
# order. Outliers are red and semi-transparent, box outlines are dark blue,
# and the legend is hidden.
ggplot(datt, aes(Loai, Salary, fill = Loai)) +
  geom_boxplot(
    colour = "darkblue",
    outlier.colour = "red",
    outlier.size = 2,
    outlier.alpha = 0.3
  ) +
  geom_jitter(shape = 22, size = 2, alpha = 0.5) +
  theme_minimal() +
  theme(legend.position = "none")

# Ve bieu do violin co so sanh cac nhom

library(ggstatsplot)
## Warning: package 'ggstatsplot' was built under R version 4.3.1
## You can cite this package as:
##      Patil, I. (2021). Visualizations with statistical details: The 'ggstatsplot' approach.
##      Journal of Open Source Software, 6(61), 3167, doi:10.21105/joss.03167
# Between-group comparison plot (ggstatsplot) of Salary across Rank.
ggbetweenstats(
  datt,
  x = Rank, y = Salary,
  title = "Distribution of Salary across Rank"
)

# Between-group comparison plot (ggstatsplot) of Salary across Sex.
ggbetweenstats(
  datt,
  x = Sex, y = Salary,
  title = "Distribution of Salary across Sex"
)

# Between-group comparison plot (ggstatsplot) of years of service across Sex.
ggbetweenstats(
  datt,
  x = Sex, y = Yrs.service,
  title = "Distribution of Year of service across Sex"
)

library(afex)
## Warning: package 'afex' was built under R version 4.3.1
## Loading required package: lme4
## Loading required package: Matrix
## ************
## Welcome to afex. For support visit: http://afex.singmann.science/
## - Functions for ANOVAs: aov_car(), aov_ez(), and aov_4()
## - Methods for calculating p-values with mixed(): 'S', 'KR', 'LRT', and 'PB'
## - 'afex_aov' and 'mixed' objects can be passed to emmeans() for follow-up tests
## - Get and set global package options with: afex_options()
## - Set sum-to-zero contrasts globally: set_sum_contrasts()
## - For example analyses see: browseVignettes("afex")
## ************
## 
## Attaching package: 'afex'
## The following object is masked from 'package:lme4':
## 
##     lmer
library(WRS2)
## Warning: package 'WRS2' was built under R version 4.3.1
library(ggstatsplot)
# Histogram of Salary drawn separately for each Sex, with the panels placed
# side by side in one row.
grouped_gghistostats(
  data              = datt,
  x                 = Salary,
  # NOTE(review): 50 is the reference value for the one-sample test; the
  # salaries in this data are in the tens of thousands, so confirm that 50 is
  # really the intended comparison value.
  test.value        = 50,
  type              = "nonparametric",
  # Salary is recorded in plain dollars (e.g. 139750), not in millions.
  xlab              = "Salary (in US$)",
  grouping.var      = Sex,
  normal.curve      = TRUE,
  # NOTE(review): recent ggplot2 releases use `linewidth` rather than `size`
  # for line thickness - check for a deprecation warning on your version.
  normal.curve.args = list(color = "red", size = 1),
  ggtheme           = ggthemes::theme_tufte(),
  ## modify the defaults from `{ggstatsplot}` for each plot
  plotgrid.args     = list(nrow = 1),
  annotation.args   = list(title = "Salary for different Sex")
)

# Scatter plot (ggstatsplot) of Salary against years of service.
ggscatterstats(
  data  = datt,
  x     = Yrs.service,
  y     = Salary,
  xlab  = "Time in Service (in years)", # fixed spelling of the axis label
  ylab  = "Amount of Salary (in USD)",
  title = "Understanding Salary and time in service"
)
## Registered S3 method overwritten by 'ggside':
##   method from   
##   +.gg   ggplot2
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.