TRM Practical Data Analysis

#TRM Practical Data Analysis - Basic Level
## Day 1. Introduction to R
### Task 2: Install the following packages

#install.packages(c("readxl", "tidyverse", "dplyr", "table1", "compareGroups", "ggplot2", "grid", "gridExtra", "GGally", "ggthemes", "DescTools", "simpleboot", "lmboot"), dependencies = TRUE)

###Task 3: Import the csv data set “Professorial Salaries” into R

# Read the "Professorial Salaries" CSV into the data frame `salary`.
# NOTE(review): the path is absolute and machine-specific; adjust before running elsewhere.
salary <- read.csv(
  file = "C:/Users/24544355/OneDrive - UTS/Desktop/Assignment/32931/Module10/Professorial Salaries.csv"
)

###Task 4: Describe the salary data set
####4.1 How many participants/variables are there in the “salary” data set?

# Row count (participants) followed by column count (variables).
c(nrow(salary), ncol(salary))

## [1] 397   9

####4.2 List the variable names

# Column (variable) names of the data frame.
colnames(salary)

## [1] "ID"            "Rank"          "Discipline"    "Yrs.since.phd"
## [5] "Yrs.service"   "Sex"           "NPubs"         "Ncits"        
## [9] "Salary"

####4.3 List the first 6 observations

# Show the first six rows (six is also head()'s default).
head(salary, n = 6L)

##   ID      Rank Discipline Yrs.since.phd Yrs.service  Sex NPubs Ncits Salary
## 1  1      Prof          B            19          18 Male    18    50 139750
## 2  2      Prof          B            20          16 Male     3    26 173200
## 3  3  AsstProf          B             4           3 Male     2    50  79750
## 4  4      Prof          B            45          39 Male    17    34 115000
## 5  5      Prof          B            40          41 Male    11    41 141500
## 6  6 AssocProf          B             6           6 Male     6    37  97000

###Optional: Describe the dataset

library (Hmisc)

## 
## Attaching package: 'Hmisc'

## The following objects are masked from 'package:base':
## 
##     format.pval, units

# Per-variable summary (n, missing, distinct values, quantiles or frequencies).
Hmisc::describe(salary)

## salary 
## 
##  9  Variables      397  Observations
## --------------------------------------------------------------------------------
## ID 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      397        0      397        1      199    132.7     20.8     40.6 
##      .25      .50      .75      .90      .95 
##    100.0    199.0    298.0    357.4    377.2 
## 
## lowest :   1   2   3   4   5, highest: 393 394 395 396 397
## --------------------------------------------------------------------------------
## Rank 
##        n  missing distinct 
##      397        0        3 
##                                         
## Value      AssocProf  AsstProf      Prof
## Frequency         64        67       266
## Proportion     0.161     0.169     0.670
## --------------------------------------------------------------------------------
## Discipline 
##        n  missing distinct 
##      397        0        2 
##                       
## Value          A     B
## Frequency    181   216
## Proportion 0.456 0.544
## --------------------------------------------------------------------------------
## Yrs.since.phd 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      397        0       53    0.999    22.31    14.77        4        5 
##      .25      .50      .75      .90      .95 
##       12       21       32       40       45 
## 
## lowest :  1  2  3  4  5, highest: 49 51 52 54 56
## --------------------------------------------------------------------------------
## Yrs.service 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      397        0       52    0.999    17.61    14.63      1.0      3.0 
##      .25      .50      .75      .90      .95 
##      7.0     16.0     27.0     37.0     41.4 
## 
## lowest :  0  1  2  3  4, highest: 49 51 53 57 60
## --------------------------------------------------------------------------------
## Sex 
##        n  missing distinct 
##      397        0        2 
##                         
## Value      Female   Male
## Frequency      39    358
## Proportion  0.098  0.902
## --------------------------------------------------------------------------------
## NPubs 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      397        0       36    0.998    18.15    14.76      3.0      5.0 
##      .25      .50      .75      .90      .95 
##      8.0     13.0     26.0     38.4     48.4 
## 
## lowest :  1  2  3  4  5, highest: 39 41 48 50 69
## --------------------------------------------------------------------------------
## Ncits 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      397        0       36    0.998    40.22    18.59       18       25 
##      .25      .50      .75      .90      .95 
##       28       35       50       69       69 
## 
## lowest :  1 14 18 19 24, highest: 61 69 70 83 90
## --------------------------------------------------------------------------------
## Salary 
##        n  missing distinct     Info     Mean      Gmd      .05      .10 
##      397        0      371        1   113706    33872    73293    77381 
##      .25      .50      .75      .90      .95 
##    91000   107300   134185   152946   168908 
## 
## lowest :  57800  62884  63100  63900  67559, highest: 193000 194800 204000 205500 231545
## --------------------------------------------------------------------------------

###Task 5: Create a new variable “salary.level”:

# Base-R approach: band Salary into "High" (>= 130000),
# "Medium" (>= 100000 and < 130000) and "Low" (< 100000).
salary$salary.level1 <- ifelse(
  salary$Salary >= 130000,
  "High",
  ifelse(salary$Salary >= 100000, "Medium", "Low")
)
table(salary$salary.level1)

## 
##   High    Low Medium 
##    110    140    147

library(tidyverse)

## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.0     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter()    masks stats::filter()
## ✖ dplyr::lag()       masks stats::lag()
## ✖ dplyr::src()       masks Hmisc::src()
## ✖ dplyr::summarize() masks Hmisc::summarize()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors

# dplyr approach to the same banding. case_when() uses the first condition
# that is TRUE, so each band only needs its lower bound once the higher
# bands have been tested.
salary <- salary %>%
  mutate(
    salary.level2 = case_when(
      Salary >= 130000 ~ "High",
      Salary >= 100000 ~ "Medium",
      Salary < 100000 ~ "Low"
    )
  )
table(salary$salary.level2)

## 
##   High    Low Medium 
##    110    140    147

###Task 6: Create a new variable “high.salary”:

# Indicator: 1 when Salary >= 130000, otherwise 0.
salary$high.salary1 <- as.numeric(salary$Salary >= 130000)
# Cross-check the indicator against the three-level banding.
table(salary$salary.level1, salary$high.salary1)

##         
##            0   1
##   High     0 110
##   Low    140   0
##   Medium 147   0

library(tidyverse)
# Alternative 2: vectorised ifelse().
salary[["high.salary2"]] <- ifelse(
  test = salary$Salary >= 130000,
  yes = 1,
  no = 0
)

# Alternative 3: dplyr mutate() with case_when().
salary <- mutate(
  salary,
  high.salary3 = case_when(
    Salary >= 130000 ~ 1,
    Salary < 130000 ~ 0
  )
)

# The first two versions of the indicator should agree on every row.
table(salary[["high.salary1"]], salary[["high.salary2"]])

##    
##       0   1
##   0 287   0
##   1   0 110

# Compare the bracket-assignment indicator with the case_when() version.
table(salary[["high.salary1"]], salary[["high.salary3"]])

##    
##       0   1
##   0 287   0
##   1   0 110

###Task 7: Create a new variable “salary.aud” as professor’s salaries in AUD (USD/AUD ratio = 1.53).

# Base-R approach: convert Salary to AUD using the ratio 1.53.
salary[["salary.aud"]] <- salary[["Salary"]] * 1.53

library(tidyverse)
# dplyr approach: same conversion, overwriting the column with identical values.
salary <- mutate(salary, salary.aud = Salary * 1.53)

###Task 8: Select a subset of participants that includes male professors in the theoretical department who had high salaries.

# Base-R approach: keep rows for men in discipline "A" whose salary band is "High".
# which() discards rows where the condition is NA, as subset() does.
keep_rows <- which(
  salary$Sex == "Male" &
    salary$Discipline == "A" &
    salary$salary.level1 == "High"
)
men.A.high <- salary[keep_rows, , drop = FALSE]
rm(keep_rows)
dim(men.A.high)

## [1] 40 15

library(tidyverse)
# dplyr approach: same row selection, with the conditions joined by `&`.
Men.A.High <- salary %>%
  filter(Sex == "Male" & Discipline == "A" & salary.level1 == "High")
dim(Men.A.High)

## [1] 40 15

###Task 9. Select 4 variables (ID, Rank, Sex, Salary)

# Base-R approach: keep four columns by name; drop = FALSE guarantees a data frame.
var.select1 <- salary[, c("ID", "Rank", "Sex", "Salary"), drop = FALSE]
names(var.select1)

## [1] "ID"     "Rank"   "Sex"    "Salary"

library(tidyverse)
# dplyr approach: columns come back in the order listed here.
var.select2 <- select(salary, ID, Rank, Salary, Sex)
names(var.select2)

## [1] "ID"     "Rank"   "Salary" "Sex"

TRM Practical Data Analysis - Basic Level

Shuaiwen Feng

2024-03-11