#TRM Practical Data Analysis - Basic Level ## Day 1. Introduction to R ### Task 2: Install the following packages
#install.packages(c("readxl", "tidyverse", "dplyr", "table1", "compareGroups", "ggplot2", "grid", "gridExtra", "GGally", "ggthemes", "DescTools", "simpleboot", "lmboot"), dependencies = TRUE)
###Task 3: Import the csv data set “Professorial Salaries” into R
# Load the Professorial Salaries CSV into the `salary` data frame.
# NOTE(review): absolute, user-specific path -- presumably only resolves on
# the original author's machine; confirm before sharing the script.
salary <- read.csv(file = "C:/Users/24544355/OneDrive - UTS/Desktop/Assignment/32931/Module10/Professorial Salaries.csv")
###Task 4: Describe the salary data set ####4.1 How many participants/variables are there in the “salary” data set?
# Number of rows (participants) followed by number of columns (variables).
c(nrow(salary), ncol(salary))
## [1] 397 9
####4.2 List the variable names
# Column names of the data frame.
colnames(salary)
## [1] "ID" "Rank" "Discipline" "Yrs.since.phd"
## [5] "Yrs.service" "Sex" "NPubs" "Ncits"
## [9] "Salary"
####4.3 List the first 6 observations
# Row count spelled out explicitly; 6 is also head()'s default.
head(salary, n = 6L)
## ID Rank Discipline Yrs.since.phd Yrs.service Sex NPubs Ncits Salary
## 1 1 Prof B 19 18 Male 18 50 139750
## 2 2 Prof B 20 16 Male 3 26 173200
## 3 3 AsstProf B 4 3 Male 2 50 79750
## 4 4 Prof B 45 39 Male 17 34 115000
## 5 5 Prof B 40 41 Male 11 41 141500
## 6 6 AssocProf B 6 6 Male 6 37 97000
###optional: Describe the dataset
# Attach Hmisc for describe(); the lines below are the recorded attach messages.
library(Hmisc)
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
# Per-variable summary of the data frame; the recorded output follows.
# Namespace-qualified so the call does not depend on later masking.
Hmisc::describe(salary)
## salary
##
## 9 Variables 397 Observations
## --------------------------------------------------------------------------------
## ID
## n missing distinct Info Mean Gmd .05 .10
## 397 0 397 1 199 132.7 20.8 40.6
## .25 .50 .75 .90 .95
## 100.0 199.0 298.0 357.4 377.2
##
## lowest : 1 2 3 4 5, highest: 393 394 395 396 397
## --------------------------------------------------------------------------------
## Rank
## n missing distinct
## 397 0 3
##
## Value AssocProf AsstProf Prof
## Frequency 64 67 266
## Proportion 0.161 0.169 0.670
## --------------------------------------------------------------------------------
## Discipline
## n missing distinct
## 397 0 2
##
## Value A B
## Frequency 181 216
## Proportion 0.456 0.544
## --------------------------------------------------------------------------------
## Yrs.since.phd
## n missing distinct Info Mean Gmd .05 .10
## 397 0 53 0.999 22.31 14.77 4 5
## .25 .50 .75 .90 .95
## 12 21 32 40 45
##
## lowest : 1 2 3 4 5, highest: 49 51 52 54 56
## --------------------------------------------------------------------------------
## Yrs.service
## n missing distinct Info Mean Gmd .05 .10
## 397 0 52 0.999 17.61 14.63 1.0 3.0
## .25 .50 .75 .90 .95
## 7.0 16.0 27.0 37.0 41.4
##
## lowest : 0 1 2 3 4, highest: 49 51 53 57 60
## --------------------------------------------------------------------------------
## Sex
## n missing distinct
## 397 0 2
##
## Value Female Male
## Frequency 39 358
## Proportion 0.098 0.902
## --------------------------------------------------------------------------------
## NPubs
## n missing distinct Info Mean Gmd .05 .10
## 397 0 36 0.998 18.15 14.76 3.0 5.0
## .25 .50 .75 .90 .95
## 8.0 13.0 26.0 38.4 48.4
##
## lowest : 1 2 3 4 5, highest: 39 41 48 50 69
## --------------------------------------------------------------------------------
## Ncits
## n missing distinct Info Mean Gmd .05 .10
## 397 0 36 0.998 40.22 18.59 18 25
## .25 .50 .75 .90 .95
## 28 35 50 69 69
##
## lowest : 1 14 18 19 24, highest: 61 69 70 83 90
## --------------------------------------------------------------------------------
## Salary
## n missing distinct Info Mean Gmd .05 .10
## 397 0 371 1 113706 33872 73293 77381
## .25 .50 .75 .90 .95
## 91000 107300 134185 152946 168908
##
## lowest : 57800 62884 63100 63900 67559, highest: 193000 194800 204000 205500 231545
## --------------------------------------------------------------------------------
###Task 5: Create a new variable “salary.level”:
# Base R version: label each salary as "High" (>= 130000),
# "Medium" (>= 100000 and < 130000) or "Low" (< 100000).
# The nested ifelse() checks the bands from the top down, so every row gets
# exactly one label and a missing Salary stays NA.
salary$salary.level1 <- with(
  salary,
  ifelse(
    Salary >= 130000,
    "High",
    ifelse(Salary >= 100000, "Medium", "Low")
  )
)
# Frequency of each level (table() orders the labels alphabetically).
table(salary[["salary.level1"]])
##
## High Low Medium
## 110 140 147
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.4 ✔ readr 2.1.5
## ✔ forcats 1.0.0 ✔ stringr 1.5.1
## ✔ ggplot2 3.5.0 ✔ tibble 3.2.1
## ✔ lubridate 1.9.3 ✔ tidyr 1.3.1
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ✖ dplyr::src() masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Tidyverse version of the same three-level variable.
# case_when() uses the first matching branch, so testing the bands from the
# bottom up makes an explicit lower bound on "Medium" unnecessary; rows with a
# missing Salary match no branch and stay NA.
salary <- mutate(
  salary,
  salary.level2 = case_when(
    Salary < 100000 ~ "Low",
    Salary < 130000 ~ "Medium",
    Salary >= 130000 ~ "High"
  )
)
table(salary[["salary.level2"]])
##
## High Low Medium
## 110 140 147
###Task 6: Create a new variable “high.salary”:
# Base R version: numeric indicator that is 1 when Salary >= 130000,
# 0 when Salary < 130000, and NA when Salary is missing.
salary[["high.salary1"]] <- as.numeric(salary[["Salary"]] >= 130000)
# Cross-tabulate against the three-level variable as a consistency check.
table(salary[["salary.level1"]], salary[["high.salary1"]])
##
## 0 1
## High 0 110
## Low 140 0
## Medium 147 0
# Two further ways of building the same 0/1 indicator, each checked against
# high.salary1 with a cross-tabulation.
library(tidyverse)
# Variant 2: vectorised ifelse().
salary[["high.salary2"]] <- ifelse(salary[["Salary"]] >= 130000, 1, 0)
# Variant 3: dplyr mutate() + case_when().
salary <- mutate(
  salary,
  high.salary3 = case_when(
    Salary >= 130000 ~ 1,
    Salary < 130000 ~ 0
  )
)
table(salary[["high.salary1"]], salary[["high.salary2"]])
##
## 0 1
## 0 287 0
## 1 0 110
table(salary[["high.salary1"]], salary[["high.salary3"]])
##
## 0 1
## 0 287 0
## 1 0 110
###Task 7: Create a new variable “salary.aud” as professor’s salaries in AUD (USD/AUD ratio = 1.53).
# Convert Salary to AUD with the fixed factor 1.53.
# Base R version:
salary[["salary.aud"]] <- 1.53 * salary[["Salary"]]
library(tidyverse)
# Tidyverse version; writes the same column name, so it overwrites the
# base R result with the same values.
salary <- mutate(salary, salary.aud = 1.53 * Salary)
###Task 8: Select a subset of participants that includes male professors in the theoretical department who had high salaries.
# Base R version: keep rows where Sex is "Male", Discipline is "A" and
# salary.level1 is "High". which() drops rows where the test is NA, which is
# also what subset() does.
men.A.high <- salary[
  with(
    salary,
    which(Sex == "Male" & Discipline == "A" & salary.level1 == "High")
  ), ,
  drop = FALSE
]
dim(men.A.high)
## [1] 40 15
library(tidyverse)
# Tidyverse version of the same row selection. The three conditions are joined
# with `&`, which is how filter() combines comma-separated conditions.
Men.A.High <- dplyr::filter(
  salary,
  Sex == "Male" & Discipline == "A" & salary.level1 == "High"
)
dim(Men.A.High)
## [1] 40 15
###Task 9. Select 4 variables (ID, Rank, Sex, Salary)
# Base R version: keep the four requested columns, in the requested order.
# drop = FALSE guarantees the result stays a data frame.
var.select1 <- salary[, c("ID", "Rank", "Sex", "Salary"), drop = FALSE]
colnames(var.select1)
## [1] "ID" "Rank" "Sex" "Salary"
library(tidyverse)
# Tidyverse version of the four-column selection.
# Columns are listed in the order the task asks for (ID, Rank, Sex, Salary) so
# that var.select2 has the same layout as the base R result var.select1;
# select() returns columns in the order they are named.
# dplyr:: is spelled out so the call still works if another attached package
# also exports a select() function.
var.select2 <- salary %>% dplyr::select(ID, Rank, Sex, Salary)
names(var.select2)
## [1] "ID" "Rank" "Sex" "Salary"