# Download R (CRAN mirrors): https://cran.r-project.org/mirrors.html
# Download RStudio: https://www.rstudio.com/products/rstudio/download
# One-off setup: uncomment and run once to install the packages listed below.
# install.packages(
#   c("readxl", "tidyverse", "table1", "compareGroups", "ggplot2", "GGally",
#     "DescTools", "simpleboot"),
#   dependencies = TRUE
# )

# Read the professorial salaries data from a local CSV file.
# NOTE(review): absolute, machine-specific Windows path - adjust before running
# on another computer.
salary <- read.csv(
  file = "C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv"
)

# Number of rows and columns in the data frame.
c(nrow(salary), ncol(salary))
## [1] 397 9

# Column names.
colnames(salary)
## [1] "ID" "Rank" "Discipline" "Yrs.since.phd"
## [5] "Yrs.service" "Sex" "NPubs" "Ncits"
## [9] "Salary"
# List the first 10 observations with head(salary, n = 10), or the last 6 observations with tail(salary).
# Print the first six rows (six is the default for head(); spelled out here).
head(x = salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
# Base R: three-level salary band stored as a character column.
#   "Low"    : Salary <  100000
#   "Medium" : 100000 <= Salary < 130000
#   "High"   : Salary >= 130000
# findInterval() counts how many cut points are <= Salary (0, 1 or 2); adding 1
# turns that count into a position in the label vector. A missing Salary gives
# a missing band.
salary$salary.level1 <- c("Low", "Medium", "High")[
  findInterval(salary$Salary, c(100000, 130000)) + 1L
]
table(salary[["salary.level1"]])
##
## High Low Medium
## 110 140 147
# tidyverse: the same three-level salary band, built with dplyr.
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.3 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# case_when() tries the conditions from top to bottom and uses the first one
# that is TRUE, so the "Medium" clause is only reached when Salary < 130000.
salary <- mutate(
  salary,
  salary.level2 = case_when(
    Salary >= 130000 ~ "High",
    Salary >= 100000 ~ "Medium",
    Salary < 100000 ~ "Low"
  )
)
table(salary[["salary.level2"]])
##
## High Low Medium
## 110 140 147
## Cross-check: the base R and tidyverse bands should agree, i.e. every count
## sits on the diagonal of the two-way table.
table(salary[["salary.level1"]], salary[["salary.level2"]])
##
## High Low Medium
## High 110 0 0
## Low 0 140 0
## Medium 0 0 147
# Base R: binary indicator, 1 when Salary >= 130000 and 0 when Salary < 130000.
# The comparison gives TRUE/FALSE; as.numeric() converts these to 1/0.
salary$high.salary1 <- as.numeric(salary$Salary >= 130000)
table(salary[["salary.level1"]], salary[["high.salary1"]])
##
## 0 1
## High 0 110
## Low 140 0
## Medium 147 0

# Base R alternative: the same indicator built with ifelse().
salary$high.salary2 <- with(salary, ifelse(Salary >= 130000, 1, 0))
# tidyverse: the same binary high-salary indicator via mutate() + case_when().
salary <- mutate(
  salary,
  high.salary3 = case_when(
    Salary >= 130000 ~ 1,
    Salary < 130000 ~ 0
  )
)

## Cross-check: all three versions of the indicator should agree
## (off-diagonal counts are zero).
table(salary[["high.salary1"]], salary[["high.salary2"]])
##
## 0 1
## 0 287 0
## 1 0 110
table(salary[["high.salary1"]], salary[["high.salary3"]])
##
## 0 1
## 0 287 0
## 1 0 110
# Derive a new column as a multiple of Salary.
# NOTE(review): 1.53 is presumably a currency conversion rate to AUD - confirm.

# Base R:
salary[["salary.aud"]] <- 1.53 * salary[["Salary"]]

# tidyverse (overwrites the column above with the same values):
salary <- mutate(salary, salary.aud = 1.53 * Salary)
# Keep only the rows for men in discipline "A" with a "High" salary band.

# Base R: which() returns the positions where the condition is TRUE, so rows
# where the condition is missing are left out.
men.A.high <- salary[
  which(with(salary, Sex == "Male" & Discipline == "A" & salary.level1 == "High")),
  ,
  drop = FALSE
]
c(nrow(men.A.high), ncol(men.A.high))
## [1] 40 15

# tidyverse: filter() keeps the rows where the condition is TRUE.
Men.A.High <- filter(
  salary,
  Sex == "Male" & Discipline == "A" & salary.level1 == "High"
)
c(nrow(Men.A.High), ncol(Men.A.High))
## [1] 40 15
# Keep a subset of the columns; the columns come out in the order requested.

# Base R: index the data frame by column name.
var.select1 <- salary[, c("ID", "Rank", "Sex", "Salary"), drop = FALSE]
colnames(var.select1)
## [1] "ID" "Rank" "Sex" "Salary"

# tidyverse: select() with unquoted column names.
var.select2 <- select(salary, ID, Rank, Salary, Sex)
colnames(var.select2)
## [1] "ID" "Rank" "Salary" "Sex"