Đọc dữ liệu “Professorial Salaries” vào R
library(readxl)
salary = read_excel("C:\\Thach\\VLU workshop (Jun2023)\\Datasets\\Professorial Salaries.xlsx")
head(salary)
## # A tibble: 6 × 7
## ID Rank Discipline Yrs.since.phd Yrs.service Sex Salary
## <dbl> <chr> <chr> <dbl> <dbl> <chr> <dbl>
## 1 1 Prof B 19 18 Male 139750
## 2 2 Prof B 20 16 Male 173200
## 3 3 AsstProf B 4 3 Male 79750
## 4 4 Prof B 45 39 Male 115000
## 5 5 Prof B 40 41 Male 141500
## 6 6 AssocProf B 6 6 Male 97000
So sánh đặc điểm mẫu nghiên cứu
library(compareGroups)
createTable(compareGroups(Sex ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Salary, data = salary))
##
## --------Summary descriptives table by 'Sex'---------
##
## _____________________________________________________
## Female Male p.overall
## N=39 N=358
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
## Rank: 0.014
## AssocProf 10 (25.6%) 54 (15.1%)
## AsstProf 11 (28.2%) 56 (15.6%)
## Prof 18 (46.2%) 248 (69.3%)
## Discipline: 1.000
## A 18 (46.2%) 163 (45.5%)
## B 21 (53.8%) 195 (54.5%)
## Yrs.since.phd 16.5 (9.78) 22.9 (13.0) <0.001
## Yrs.service 11.6 (8.81) 18.3 (13.2) <0.001
## Salary 101002 (25952) 115090 (30437) 0.003
## ¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯¯
library(GGally)
## Loading required package: ggplot2
## Registered S3 method overwritten by 'GGally':
## method from
## +.gg ggplot2
vars = salary[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "Salary")]
ggpairs(data = vars, mapping = aes(color = Sex))
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
## `stat_bin()` using `bins = 30`. Pick better value with `binwidth`.
m1 = lm(Salary ~ Sex, data = salary)
summary(m1)
##
## Call:
## lm(formula = Salary ~ Sex, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101002 4809 21.001 < 2e-16 ***
## SexMale 14088 5065 2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
par(mfrow = c(2, 2))
plot(m1)
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
m1.v = ols(Salary ~ Sex, data = salary, x = TRUE, y = TRUE)
validate = validate(m1.v, B = 100)
validate
## index.orig training test optimism index.corrected
## R-square 1.920000e-02 2.190000e-02 1.440000e-02 0.0075 1.170000e-02
## MSE 8.975331e+08 8.954530e+08 9.019175e+08 -6464537.1892 9.039976e+08
## g 2.502313e+03 2.559476e+03 2.502313e+03 57.1639 2.445149e+03
## Intercept 0.000000e+00 0.000000e+00 -1.290136e+04 12901.3608 -1.290136e+04
## Slope 1.000000e+00 1.000000e+00 1.113300e+00 -0.1133 1.113300e+00
## n
## R-square 100
## MSE 100
## g 100
## Intercept 100
## Slope 100
Thời gian sau tiến sĩ rất khác nhau giữa nam và nữ GS nên có thể ảnh hưởng đến mối liến quan giữa lương và giới tính.
m2 = lm(Salary ~ Sex + Yrs.since.phd, data = salary)
summary(m2)
##
## Call:
## lm(formula = Salary ~ Sex + Yrs.since.phd, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84167 -19735 -2551 15427 102033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85181.8 4748.3 17.939 <2e-16 ***
## SexMale 7923.6 4684.1 1.692 0.0915 .
## Yrs.since.phd 958.1 108.3 8.845 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared: 0.1817, Adjusted R-squared: 0.1775
## F-statistic: 43.74 on 2 and 394 DF, p-value: < 2.2e-16
m2.v = ols(Salary ~ Sex + Yrs.since.phd, data = salary, x = TRUE, y = TRUE)
validate = validate(m2.v, B = 100)
validate
## index.orig training test optimism index.corrected n
## R-square 1.817000e-01 1.893000e-01 1.745000e-01 0.0148 1.669000e-01 100
## MSE 7.488405e+08 7.553878e+08 7.553888e+08 -1032.3432 7.488416e+08 100
## g 1.480181e+04 1.515046e+04 1.472677e+04 423.6947 1.437811e+04 100
## Intercept 0.000000e+00 0.000000e+00 1.870635e+03 -1870.6347 1.870635e+03 100
## Slope 1.000000e+00 1.000000e+00 9.805000e-01 0.0195 9.805000e-01 100
Chia dữ liệu thành 2 tập dữ liệu nhò
library(caret)
## Loading required package: lattice
set.seed(234)
sample = createDataPartition(y = salary$Salary, p = 0.6, list = FALSE)
train = salary[sample,]
dim(train)
## [1] 240 7
test = salary[-sample,]
dim(test)
## [1] 157 7
Thực hiện mô hình trên tập dữ liệu train
m3 = train(Salary ~ Sex + Rank + Discipline + Yrs.since.phd, data = train, method = "lm", metric = "Rsquared")
summary(m3)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67696 -14677 -500 11519 83456
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 83567.33 6683.26 12.504 < 2e-16 ***
## SexMale 3288.41 5301.46 0.620 0.5357
## RankAsstProf -13476.54 5378.20 -2.506 0.0129 *
## RankProf 31859.51 4553.46 6.997 2.73e-11 ***
## DisciplineB 12978.84 3035.31 4.276 2.77e-05 ***
## Yrs.since.phd 77.42 167.46 0.462 0.6443
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22980 on 234 degrees of freedom
## Multiple R-squared: 0.4388, Adjusted R-squared: 0.4268
## F-statistic: 36.59 on 5 and 234 DF, p-value: < 2.2e-16
Kiểm tra mô hình trên tập dữ liệu test
m3.pred = predict(m3, test)
test.data = data.frame(obs = test$Salary, pred = m3.pred)
defaultSummary(test.data)
## RMSE Rsquared MAE
## 2.223733e+04 4.618031e-01 1.634904e+04
m3.v = ols(Salary ~ Sex + Rank + Discipline + Yrs.since.phd, data = salary, x = TRUE, y = TRUE)
validate = validate(m3.v, B = 100)
validate
## index.orig training test optimism index.corrected
## R-square 4.472000e-01 4.511000e-01 4.409000e-01 1.020000e-02 4.370000e-01
## MSE 5.058598e+08 5.014979e+08 5.116585e+08 -1.016055e+07 5.160204e+08
## g 2.202228e+04 2.212248e+04 2.201399e+04 1.084880e+02 2.191379e+04
## Intercept 0.000000e+00 0.000000e+00 5.726126e+02 -5.726126e+02 5.726126e+02
## Slope 1.000000e+00 1.000000e+00 9.952000e-01 4.800000e-03 9.952000e-01
## n
## R-square 100
## MSE 100
## g 100
## Intercept 100
## Slope 100
plot(varImp(m3))