Read the file

# Load the professorial salaries CSV into a data frame named `salary`.
# NOTE(review): the path is absolute and machine-specific; the script will
# only run where this exact file location exists.
salary <- read.csv(
  file = "C:\\Users\\User\\Documents\\UTS\\AUTUMN 2024\\TRM\\Data Analyst R Basic\\Professorial Salaries.csv"
)

View the dimensions of salary

# Print the number of rows and columns of the data frame.
dim(x = salary)
## [1] 397   9

List the variable names

# Print the column (variable) names of the data frame.
colnames(salary)
## [1] "ID"            "Rank"          "Discipline"    "Yrs.since.phd"
## [5] "Yrs.service"   "Sex"           "NPubs"         "Ncits"        
## [9] "Salary"

List the first six rows

# Preview the first six rows (the default for head()) of the data frame.
head(salary, n = 6L)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000

Create the variable ‘salary.level’

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Bucket each salary into a three-level band stored in `salary.level`:
#   "High"   : Salary >= 130000
#   "Medium" : 100000 <= Salary < 130000
#   "Low"    : Salary < 100000
# case_when() tests the rules top to bottom, so "Medium" only needs its
# lower bound. A missing Salary matches no rule and is left as NA.
salary <- mutate(
  salary,
  salary.level = case_when(
    Salary >= 130000 ~ "High",
    Salary >= 100000 ~ "Medium",
    Salary < 100000 ~ "Low"
  )
)
head(salary)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000
##   salary.level
## 1         High
## 2         High
## 3          Low
## 4       Medium
## 5         High
## 6          Low

Create the variable ‘high_salary’

# Flag high earners in `high_salary`: "1" when Salary >= 130000, else "0".
# The flag is stored as a character string, and a missing Salary yields NA.
salary <- mutate(
  salary,
  high_salary = if_else(Salary >= 130000, "1", "0")
)
head(salary)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000
##   salary.level high_salary
## 1         High           1
## 2         High           1
## 3          Low           0
## 4       Medium           0
## 5         High           1
## 6          Low           0

Create ‘salary.aud’

# Add `salary.aud`, the Salary column scaled by a fixed factor of 1.53.
# NOTE(review): 1.53 is presumably a USD-to-AUD exchange rate; confirm the
# source and date of the rate.
salary <- mutate(salary, salary.aud = 1.53 * Salary)
head(salary)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000
##   salary.level high_salary salary.aud
## 1         High           1   213817.5
## 2         High           1   264996.0
## 3          Low           0   122017.5
## 4       Medium           0   175950.0
## 5         High           1   216495.0
## 6          Low           0   148410.0

Subset of male participants in Discipline A who had high salaries

# Keep only male staff in Discipline "A" whose salary.level is "High",
# then print the number of rows and columns of that subset.
Men.A.High <- filter(
  salary,
  Sex == "Male" & Discipline == "A" & salary.level == "High"
)
dim(Men.A.High)
## [1] 40 12

New data set including 4 variables

# Build a reduced data set holding only ID, Rank, Salary and Sex,
# then print its column names to confirm the selection.
var.select1 <- select(salary, c(ID, Rank, Salary, Sex))
names(var.select1)
## [1] "ID"     "Rank"   "Salary" "Sex"