TRM Practical Data Analysis - Basic level

Lecture 1. Introduction to R

Task 2. Install packages needed for hands-on sessions

# One-off setup: uncomment and run once to install the packages used in the
# hands-on sessions. `TRUE` is spelled out because `T` is an ordinary variable
# that can be reassigned, whereas `TRUE` is a reserved word.
#install.packages(c("readxl", "tidyverse", "table1", "compareGroups", "ggplot2", "GGally", "DescTools", "simpleboot"), dependencies = TRUE)

Task 3. Read the “Professorial Salaries” and name this dataset “salary”

# Load the "Professorial Salaries" CSV into the data frame `salary`.
salary <- read.csv(
  file = "C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv"
)

Task 4. Describe the dataset

4.1 Determine the number of observations and variables in the dataset:

# Number of rows (observations) followed by number of columns (variables).
c(nrow(salary), ncol(salary))
## [1] 397   9

4.2 List the variable names:

# Column (variable) names of the data frame, in their stored order.
colnames(salary)
## [1] "ID"            "Rank"          "Discipline"    "Yrs.since.phd"
## [5] "Yrs.service"   "Sex"           "NPubs"         "Ncits"        
## [9] "Salary"

4.3 List the first 6 observations:

Tip: to list the first 10 observations, use head(salary, n = 10); to list the last 6 observations, use tail(salary).

# Show the first six rows; the row count is written out explicitly.
head(salary, n = 6L)
##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000

Task 5. Create a new categorical variable “salary.level” with 3 levels (Low, Medium, High)

# Base package:
# Create the column up front, filled with NA, before labelling by logical
# indexing. Without this step, re-running the chunk after changing a cut-off
# would leave labels from the previous run on rows that no longer match any
# condition. Rows with a missing Salary stay NA.
salary$salary.level1 <- rep(NA_character_, nrow(salary))
salary$salary.level1[salary$Salary >= 130000] <- "High"
salary$salary.level1[salary$Salary >= 100000 & salary$Salary < 130000] <- "Medium"
salary$salary.level1[salary$Salary < 100000] <- "Low"
table(salary$salary.level1)
## 
##   High    Low Medium 
##    110    140    147
# tidyverse package:
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.3     ✔ readr     2.1.4
## ✔ forcats   1.0.0     ✔ stringr   1.5.0
## ✔ ggplot2   3.4.3     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.0
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# The three conditions are mutually exclusive; a row matching none of them
# (e.g. a missing Salary) is left as NA by case_when().
salary <- salary %>%
  mutate(
    salary.level2 = case_when(
      Salary >= 130000 ~ "High",
      Salary >= 100000 & Salary < 130000 ~ "Medium",
      Salary < 100000 ~ "Low"
    )
  )
table(salary$salary.level2)
## 
##   High    Low Medium 
##    110    140    147
## Cross-check:
# All counts fall on the diagonal, so the two approaches agree.
table(salary$salary.level1, salary$salary.level2)
##         
##          High Low Medium
##   High    110   0      0
##   Low       0 140      0
##   Medium    0   0    147

Task 6. Create a new binary variable “high.salary” with 2 levels (0 = Low, 1 = High)

# Base package:
# Create the column up front, filled with NA, before coding by logical
# indexing. Without this step, re-running the chunk after changing the
# cut-off would leave codes from the previous run behind. Rows with a
# missing Salary stay NA.
salary$high.salary1 <- rep(NA_real_, nrow(salary))
salary$high.salary1[salary$Salary >= 130000] <- 1
salary$high.salary1[salary$Salary < 130000] <- 0
table(salary$salary.level1, salary$high.salary1)
##         
##            0   1
##   High     0 110
##   Low    140   0
##   Medium 147   0
# Same coding in a single vectorised step.
salary$high.salary2 <- ifelse(salary$Salary >= 130000, 1, 0)

# tidyverse package:
salary <- salary %>%
  mutate(
    high.salary3 = case_when(
      Salary >= 130000 ~ 1,
      Salary < 130000 ~ 0
    )
  )

## Cross-check:
# All counts fall on the diagonal, so the three versions agree.
table(salary$high.salary1, salary$high.salary2)
##    
##       0   1
##   0 287   0
##   1   0 110
table(salary$high.salary1, salary$high.salary3)
##    
##       0   1
##   0 287   0
##   1   0 110

Task 7. Create a new continuous variable “salary.aud” (1 USD = 1.53 AUD)

# Base package: multiply the Salary column by the 1.53 conversion factor.
salary[["salary.aud"]] <- 1.53 * salary[["Salary"]]

# tidyverse package: same conversion; this overwrites the column created above.
salary <- mutate(salary, salary.aud = Salary * 1.53)

Task 8. Select the subset of male professors in Discipline A who had a high salary

# Base package: keep the rows where all three conditions hold; which() drops
# rows where the combined condition is NA.
men.A.high <- salary[
  which(with(salary, Sex == "Male" & Discipline == "A" & salary.level1 == "High")),
]
dim(men.A.high)
## [1] 40 15
# tidyverse package: the same three conditions combined into one predicate.
Men.A.High <- salary %>%
  filter(Sex == "Male" & Discipline == "A" & salary.level1 == "High")
dim(Men.A.High)
## [1] 40 15

Task 9. Select 4 variables (ID, Rank, Sex, Salary)

# Base package:
var.select1 <- subset(salary, select = c(ID, Rank, Sex, Salary))
names(var.select1)
## [1] "ID"     "Rank"   "Sex"    "Salary"
# tidyverse package:
# Columns are listed in the order given in the task statement (ID, Rank, Sex,
# Salary), matching the base version, so both approaches produce the same
# column layout.
var.select2 <- salary %>% select(ID, Rank, Sex, Salary)
names(var.select2)
## [1] "ID"     "Rank"   "Sex"    "Salary"

Task 10. Publish the exercise to your RPubs account