Read the file
# Load the professorial salaries data set from its CSV file into `salary`.
# NOTE(review): the path is absolute and machine-specific; confirm it exists
# on the machine running this script.
salary <- read.csv(
  file = "C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Basic\\Professorial Salaries.csv"
)
View the dimensions of salary
# Number of rows and number of columns in the salary data frame.
c(nrow(salary), ncol(salary))
## [1] 397 9
List the variable names
# Column (variable) names of the salary data frame.
colnames(salary)
## [1] "ID" "Rank" "Discipline" "Yrs.since.phd"
## [5] "Yrs.service" "Sex" "NPubs" "Ncits"
## [9] "Salary"
Show the first six rows
# Preview the first six rows of the data.
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
Create variable ‘salary.level’
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Band each salary into a new column, salary.level:
#   "Low"    for Salary < 100000
#   "Medium" for 100000 <= Salary < 130000
#   "High"   for Salary >= 130000
# case_when() uses the first condition that is TRUE, so the "Medium" line
# only needs an upper bound: anything below 100000 was already labelled "Low".
salary <- mutate(
  salary,
  salary.level = case_when(
    Salary < 100000 ~ "Low",
    Salary < 130000 ~ "Medium",
    Salary >= 130000 ~ "High"
  )
)
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
## salary.level
## 1 High
## 2 High
## 3 Low
## 4 Medium
## 5 High
## 6 Low
Create variable ‘high_salary’
# Flag high earners in a new column, high_salary: "1" when Salary >= 130000
# and "0" when Salary < 130000. The flag is stored as a character string,
# not as a number.
salary <- mutate(
  salary,
  high_salary = if_else(Salary >= 130000, "1", "0")
)
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
## salary.level high_salary
## 1 High 1
## 2 High 1
## 3 Low 0
## 4 Medium 0
## 5 High 1
## 6 Low 0
Create ‘salary.aud’
# Add salary.aud: Salary multiplied by a fixed factor of 1.53.
# NOTE(review): presumably a conversion rate into Australian dollars; confirm
# the source currency and the rate.
salary <- mutate(salary, salary.aud = Salary * 1.53)
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
## salary.level high_salary salary.aud
## 1 High 1 213817.5
## 2 High 1 264996.0
## 3 Low 0 122017.5
## 4 Medium 0 175950.0
## 5 High 1 216495.0
## 6 Low 0 148410.0
Subset of participants: males in Discipline A who had high
salaries
# Keep only the rows where Sex is "Male", Discipline is "A" and
# salary.level is "High", then report the subset's row and column counts.
Men.A.High <- filter(
  salary,
  Sex == "Male" & Discipline == "A" & salary.level == "High"
)
c(nrow(Men.A.High), ncol(Men.A.High))
## [1] 40 12
New data set including 4 variables
# Build a narrower data set holding only the ID, Rank, Salary and Sex
# columns, then list its column names.
var.select1 <- select(salary, ID, Rank, Salary, Sex)
colnames(var.select1)
## [1] "ID" "Rank" "Salary" "Sex"