# Download R: https://cran.r-project.org/mirrors.html
# Download RStudio: https://www.rstudio.com/products/rstudio/download
# install.packages(c("readxl","tidyverse","ggplot2","GGally","DescTools","table1","compareGroups","caret","MASS","pROC","factoextra","devtools","datasets","ape","ggdendro","NbClust","dendextend","cluster","randomForest"), dependencies = TRUE)
library(readxl)
# Import the professorial salary workbook into `salary` and report its
# dimensions (rows, columns).
# NOTE(review): the path is absolute and machine-specific, so the script only
# runs where this exact file location exists.
salary <- read_excel(
  path = "C:\\Thach\\VLU workshop (Jun2023)\\Datasets\\Professorial Salaries.xlsx"
)
dim(salary)
## [1] 397 7
# Preview the first six rows of the imported data.
head(salary, n = 6L)
## # A tibble: 6 × 7
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 3 AsstProf B 4 3 Male 79750
## 4 4 Prof B 45 39 Male 115000
## 5 5 Prof B 40 41 Male 141500
## 6 6 AssocProf B 6 6 Male 97000
# Preview the last six rows of the imported data.
tail(salary, n = 6L)
## # A tibble: 6 × 7
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 392 Prof A 30 19 Male 151292
## 2 393 Prof A 33 30 Male 103106
## 3 394 Prof A 31 19 Male 150564
## 4 395 Prof A 42 25 Male 101738
## 5 396 Prof A 25 15 Male 95329
## 6 397 AsstProf A 8 4 Male 81035
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.0 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.1 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Derive two columns from Salary, then preview the result:
#   salary.level - a three-band label ("High" / "Medium" / "Low")
#   salary.vn    - Salary multiplied by a fixed factor
# case_when() returns the first matching branch, so the "Medium" branch only
# needs its lower bound: any value >= 150000 has already been labelled "High".
# NOTE(review): 23481532 is presumably a USD-to-VND conversion factor;
# confirm its magnitude and units.
salary <- mutate(
  salary,
  salary.level = case_when(
    Salary >= 150000 ~ "High",
    Salary > 100000 ~ "Medium",
    Salary <= 100000 ~ "Low"
  ),
  salary.vn = Salary * 23481532
)
head(salary)
## # A tibble: 6 × 9
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary salary.level
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>
## 1 1 Prof B 19 18 Male 139750 Medium
## 2 2 Prof B 20 16 Male 173200 High
## 3 3 AsstProf B 4 3 Male 79750 Low
## 4 4 Prof B 45 39 Male 115000 Medium
## 5 5 Prof B 40 41 Male 141500 Medium
## 6 6 AssocProf B 6 6 Male 97000 Low
## # ℹ 1 more variable: salary.vn <dbl>
# Keep only male staff in discipline "A" whose salary band is "High",
# then preview the first rows of that subset.
Men.A.high <- filter(
  salary,
  Sex == "Male" & Discipline == "A" & salary.level == "High"
)
head(Men.A.high)
## # A tibble: 6 × 9
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary salary.level
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl> <chr>
## 1 127 Prof A 28 26 Male 155500 High
## 2 135 Prof A 35 25 Male 168635 High
## 3 140 Prof A 21 18 Male 152664 High
## 4 250 Prof A 29 7 Male 204000 High
## 5 272 Prof A 42 18 Male 194800 High
## 6 278 Prof A 31 27 Male 163200 High
## # ℹ 1 more variable: salary.vn <dbl>
# Reduce the data to four columns (ID, Rank, Salary, Sex) and preview it.
Subset <- select(salary, ID, Rank, Salary, Sex)
head(Subset)
## # A tibble: 6 × 4
## ID Rank Salary Sex
## <dbl> <chr> <dbl> <chr>
## 1 1 Prof 139750 Male
## 2 2 Prof 173200 Male
## 3 3 AsstProf 79750 Male
## 4 4 Prof 115000 Male
## 5 5 Prof 141500 Male
## 6 6 AssocProf 97000 Male
library(ggplot2)
# Histogram of Salary. No bin count is supplied, so ggplot2 falls back to its
# default and prints the `stat_bin()` message.
p <- ggplot(salary, aes(x = Salary))
p +
  geom_histogram(fill = "blue", colour = "white") +
  labs(
    title = "Distribution of salary in USD",
    x = "Salary (USD)",
    y = "Number of people"
  )
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
# Box plot of Salary for each Sex, coloured by Sex, with the individual
# observations overlaid as semi-transparent jittered points.
p <- ggplot(salary, aes(x = Sex, y = Salary, colour = Sex))
p +
  geom_boxplot() +
  geom_jitter(alpha = 0.2) +
  labs(
    title = "Distribution of salary in USD by sexes",
    x = "Sex",
    y = "Salary (USD)"
  )
# Scatter plot of Salary against years since PhD with a loess smoother.
ggplot(salary, aes(x = Yrs.since.phd, y = Salary)) +
  geom_point() +
  geom_smooth(method = "loess") +
  labs(
    x = "Time since PhD (years)",
    y = "Salary (dollars)"
  )
## `geom_smooth()` using formula = 'y ~ x'
# (Not sure whether it works)
# Scatter plot of Salary against years since PhD, coloured by Sex. The colour
# aesthetic is set in ggplot(), so the loess smoother is drawn per Sex.
ggplot(salary, aes(x = Yrs.since.phd, y = Salary, colour = Sex)) +
  geom_point() +
  geom_smooth(method = "loess") +
  labs(
    x = "Time since PhD (years)",
    y = "Salary (dollars)"
  )
## `geom_smooth()` using formula = 'y ~ x'