Đọc dữ liệu vào R
salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2023_Spring semester\\Data\\Professorial Salaries.csv")
dim(salary)
## [1] 397 9
t.test(Salary ~ Sex, data = salary)
##
## Welch Two Sample t-test
##
## data: Salary by Sex
## t = -3.1615, df = 50.122, p-value = 0.002664
## alternative hypothesis: true difference in means between group Female and group Male is not equal to 0
## 95 percent confidence interval:
## -23037.916 -5138.102
## sample estimates:
## mean in group Female mean in group Male
## 101002.4 115090.4
Kết quả:
m.1 = lm(Salary ~ Sex, data = salary)
summary(m.1)
##
## Call:
## lm(formula = Salary ~ Sex, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -57290 -23502 -6828 19710 116455
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 101002 4809 21.001 < 2e-16 ***
## SexMale 14088 5065 2.782 0.00567 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 30030 on 395 degrees of freedom
## Multiple R-squared: 0.01921, Adjusted R-squared: 0.01673
## F-statistic: 7.738 on 1 and 395 DF, p-value: 0.005667
Kiểm tra giả định:
par(mfrow = c(2,2))
plot(m.1)
library(ggfortify)
## Warning: package 'ggfortify' was built under R version 4.3.2
## Loading required package: ggplot2
autoplot(m.1)
m.2 = lm(Salary ~ Rank, data = salary)
summary(m.2)
##
## Call:
## lm(formula = Salary ~ Rank, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68972 -16376 -1580 11755 104773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 93876 2954 31.777 < 2e-16 ***
## RankAsstProf -13100 4131 -3.171 0.00164 **
## RankProf 32896 3290 9.997 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23630 on 394 degrees of freedom
## Multiple R-squared: 0.3943, Adjusted R-squared: 0.3912
## F-statistic: 128.2 on 2 and 394 DF, p-value: < 2.2e-16
salary$rank.order = factor(salary$Rank, levels = c("AsstProf", "AssocProf", "Prof"))
m.2b = lm(Salary ~ rank.order, data = salary)
summary(m.2b)
##
## Call:
## lm(formula = Salary ~ rank.order, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -68972 -16376 -1580 11755 104773
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 80776 2887 27.976 < 2e-16 ***
## rank.orderAssocProf 13100 4131 3.171 0.00164 **
## rank.orderProf 45996 3230 14.238 < 2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 23630 on 394 degrees of freedom
## Multiple R-squared: 0.3943, Adjusted R-squared: 0.3912
## F-statistic: 128.2 on 2 and 394 DF, p-value: < 2.2e-16
m.3 = lm(Salary ~ Yrs.since.phd, data = salary)
summary(m.3)
##
## Call:
## lm(formula = Salary ~ Yrs.since.phd, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84171 -19432 -2858 16086 102383
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 91718.7 2765.8 33.162 <2e-16 ***
## Yrs.since.phd 985.3 107.4 9.177 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27530 on 395 degrees of freedom
## Multiple R-squared: 0.1758, Adjusted R-squared: 0.1737
## F-statistic: 84.23 on 1 and 395 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.3)
m.4 = lm(Salary ~ Sex + Yrs.since.phd, data = salary)
summary(m.4)
##
## Call:
## lm(formula = Salary ~ Sex + Yrs.since.phd, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -84167 -19735 -2551 15427 102033
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85181.8 4748.3 17.939 <2e-16 ***
## SexMale 7923.6 4684.1 1.692 0.0915 .
## Yrs.since.phd 958.1 108.3 8.845 <2e-16 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 27470 on 394 degrees of freedom
## Multiple R-squared: 0.1817, Adjusted R-squared: 0.1775
## F-statistic: 43.74 on 2 and 394 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.4)
m.6 = lm(Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex + NPubs + Ncits, data = salary)
step = step(m.6)
## Start: AIC=7965.36
## Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex +
## NPubs + Ncits
##
## Df Sum of Sq RSS AIC
## - NPubs 1 4.5771e+07 1.9626e+11 7963.5
## - Sex 1 7.9652e+08 1.9701e+11 7965.0
## <none> 1.9622e+11 7965.4
## - Ncits 1 1.8373e+09 1.9805e+11 7967.1
## - Yrs.since.phd 1 2.3730e+09 1.9859e+11 7968.1
## - Yrs.service 1 2.5825e+09 1.9880e+11 7968.6
## - Discipline 1 1.9267e+10 2.1548e+11 8000.5
## - Rank 2 6.8891e+10 2.6511e+11 8080.8
##
## Step: AIC=7963.46
## Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex +
## Ncits
##
## Df Sum of Sq RSS AIC
## - Sex 1 8.2020e+08 1.9708e+11 7963.1
## <none> 1.9626e+11 7963.5
## - Ncits 1 1.8538e+09 1.9812e+11 7965.2
## - Yrs.since.phd 1 2.3748e+09 1.9864e+11 7966.2
## - Yrs.service 1 2.5877e+09 1.9885e+11 7966.7
## - Discipline 1 1.9221e+10 2.1548e+11 7998.5
## - Rank 2 6.8845e+10 2.6511e+11 8078.8
##
## Step: AIC=7963.11
## Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Ncits
##
## Df Sum of Sq RSS AIC
## <none> 1.9708e+11 7963.1
## - Ncits 1 1.8143e+09 1.9890e+11 7964.8
## - Yrs.since.phd 1 2.3721e+09 1.9945e+11 7965.9
## - Yrs.service 1 2.4548e+09 1.9954e+11 7966.0
## - Discipline 1 1.9479e+10 2.1656e+11 7998.5
## - Rank 2 7.0053e+10 2.6714e+11 8079.9
summary(step)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service +
## Ncits, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67185 -13280 -1513 9905 101141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77435.68 4791.69 16.160 < 2e-16 ***
## RankAsstProf -12140.21 4150.07 -2.925 0.00364 **
## RankProf 32672.39 3525.11 9.268 < 2e-16 ***
## DisciplineB 14501.42 2335.70 6.209 1.37e-09 ***
## Yrs.since.phd 521.01 240.47 2.167 0.03087 *
## Yrs.service -465.52 211.22 -2.204 0.02811 *
## Ncits 127.09 67.07 1.895 0.05886 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22480 on 390 degrees of freedom
## Multiple R-squared: 0.4575, Adjusted R-squared: 0.4492
## F-statistic: 54.82 on 6 and 390 DF, p-value: < 2.2e-16
m.7 = lm(Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Ncits, data = salary)
summary(m.7)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline + Yrs.since.phd + Yrs.service +
## Ncits, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -67185 -13280 -1513 9905 101141
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 77435.68 4791.69 16.160 < 2e-16 ***
## RankAsstProf -12140.21 4150.07 -2.925 0.00364 **
## RankProf 32672.39 3525.11 9.268 < 2e-16 ***
## DisciplineB 14501.42 2335.70 6.209 1.37e-09 ***
## Yrs.since.phd 521.01 240.47 2.167 0.03087 *
## Yrs.service -465.52 211.22 -2.204 0.02811 *
## Ncits 127.09 67.07 1.895 0.05886 .
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22480 on 390 degrees of freedom
## Multiple R-squared: 0.4575, Adjusted R-squared: 0.4492
## F-statistic: 54.82 on 6 and 390 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.7)
library(car)
## Warning: package 'car' was built under R version 4.3.2
## Loading required package: carData
vif(m.7)
## GVIF Df GVIF^(1/(2*Df))
## Rank 2.019189 2 1.192049
## Discipline 1.063139 1 1.031086
## Yrs.since.phd 7.525650 1 2.743292
## Yrs.service 5.913613 1 2.431792
## Ncits 1.012499 1 1.006230
# VIF> 5 indicates an increased risk of multicollinearity
library(BMA)
## Loading required package: survival
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
xvars = salary[,c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "NPubs", "Ncits")]
bma.1 = bicreg(xvars, salary$Salary, strict = FALSE, OR = 20)
summary(bma.1)
##
## Call:
## bicreg(x = xvars, y = salary$Salary, strict = FALSE, OR = 20)
##
##
## 6 models were selected
## Best 5 models (cumulative posterior probability = 0.9676 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 8.428e+04 4246.08 85705.87 80159.33 81946.95
## RankAsstProf 100.0 -1.360e+04 3992.00 -13761.54 -13011.52 -13723.42
## RankProf 100.0 3.409e+04 3193.57 34082.30 34252.88 33679.90
## DisciplineB 100.0 1.376e+04 2298.00 13760.96 13779.31 13708.69
## Yrs.since.phd 3.7 2.628e+00 27.72 . . .
## Yrs.service 3.9 -3.007e+00 26.61 . . .
## SexMale 6.1 2.760e+02 1441.87 . . 4491.80
## NPubs 3.2 7.712e-01 15.32 . . .
## Ncits 21.3 2.801e+01 62.18 . 131.65 .
##
## nVar 3 4 4
## r2 0.445 0.450 0.447
## BIC -215.78 -213.65 -211.17
## post prob 0.618 0.213 0.061
## model 4 model 5
## Intercept 86736.76 84435.56
## RankAsstProf -14483.23 -13030.16
## RankProf 34894.27 33181.41
## DisciplineB 13561.43 14028.68
## Yrs.since.phd . 71.92
## Yrs.service -76.33 .
## SexMale . .
## NPubs . .
## Ncits . .
##
## nVar 4 4
## r2 0.446 0.445
## BIC -210.28 -210.13
## post prob 0.039 0.037
imageplot.bma(bma.1)
m.8 = lm(Salary ~ Rank + Discipline, data = salary)
summary(m.8)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65990 -14049 -1288 10760 97996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85706 3142 27.273 < 2e-16 ***
## RankAsstProf -13762 3961 -3.475 0.000569 ***
## RankProf 34082 3160 10.786 < 2e-16 ***
## DisciplineB 13761 2296 5.993 4.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22650 on 393 degrees of freedom
## Multiple R-squared: 0.445, Adjusted R-squared: 0.4407
## F-statistic: 105 on 3 and 393 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m.8)
vif(m.8)
## GVIF Df GVIF^(1/(2*Df))
## Rank 1.011848 2 1.002949
## Discipline 1.011848 1 1.005907
salary$salary.high = ifelse(salary$Salary>= 130000, 1, 0)
m.9 = glm(salary.high ~ Sex, family = binomial, data = salary)
summary(m.9)
##
## Call:
## glm(formula = salary.high ~ Sex, family = binomial, data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -1.9169 0.4790 -4.002 6.28e-05 ***
## SexMale 1.0375 0.4928 2.105 0.0353 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 468.60 on 396 degrees of freedom
## Residual deviance: 463.11 on 395 degrees of freedom
## AIC: 467.11
##
## Number of Fisher Scoring iterations: 4
library(epiDisplay)
## Warning: package 'epiDisplay' was built under R version 4.3.2
## Loading required package: foreign
## Loading required package: MASS
## Loading required package: nnet
##
## Attaching package: 'epiDisplay'
## The following object is masked from 'package:ggplot2':
##
## alpha
logistic.display(m.9)
##
## Logistic regression predicting salary.high
##
## OR(95%CI) P(Wald's test) P(LR-test)
## Sex (cont. var.) 2.82 (1.07,7.41) 0.035 0.019
##
## Log-likelihood = -231.5529
## No. of observations = 397
## AIC value = 467.1058
library(Publish)
## Loading required package: prodlim
publish(m.9)
## Variable Units OddsRatio CI.95 p-value
## Sex Female Ref
## Male 2.82 [1.07;7.41] 0.03528
m.10 = glm(salary.high ~ Yrs.since.phd, family = binomial, data = salary)
summary(m.10)
##
## Call:
## glm(formula = salary.high ~ Yrs.since.phd, family = binomial,
## data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.429837 0.279482 -8.694 < 2e-16 ***
## Yrs.since.phd 0.060400 0.009683 6.238 4.43e-10 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 468.60 on 396 degrees of freedom
## Residual deviance: 424.64 on 395 degrees of freedom
## AIC: 428.64
##
## Number of Fisher Scoring iterations: 4
m.11 = glm(salary.high ~ Sex + Yrs.since.phd, family = binomial, data = salary)
summary(m.11)
##
## Call:
## glm(formula = salary.high ~ Sex + Yrs.since.phd, family = binomial,
## data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -3.001198 0.528377 -5.680 1.35e-08 ***
## SexMale 0.670864 0.507551 1.322 0.186
## Yrs.since.phd 0.058448 0.009746 5.997 2.00e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 468.60 on 396 degrees of freedom
## Residual deviance: 422.66 on 394 degrees of freedom
## AIC: 428.66
##
## Number of Fisher Scoring iterations: 4
library(BMA)
salary.1 = na.omit(salary)
bma.2 = bic.glm(salary.high ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex + NPubs + Ncits, data = salary.1, strict = F, OR = 20, glm.family = "binomial")
summary(bma.2)
##
## Call:
## bic.glm.formula(f = salary.high ~ Rank + Discipline + Yrs.since.phd + Yrs.service + Sex + NPubs + Ncits, data = salary.1, glm.family = "binomial", strict = F, OR = 20)
##
##
## 10 models were selected
## Best 5 models (cumulative posterior probability = 0.8525 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 -2.088e+01 9.253e+02 -2.101e+01 -2.012e+01 -2.130e+01
## RankAsstProf 2.3 1.639e-03 2.776e+02 . . .
## RankProf 100.0 1.925e+01 9.253e+02 1.933e+01 1.933e+01 1.892e+01
## DisciplineB 96.3 8.374e-01 3.072e-01 8.515e-01 8.307e-01 9.653e-01
## Yrs.since.phd 17.5 4.306e-03 1.087e-02 . . 2.401e-02
## Yrs.service 3.7 4.081e-04 3.013e-03 . . .
## SexMale 5.9 4.292e-02 2.182e-01 . . .
## NPubs 5.0 4.356e-04 2.951e-03 . . .
## Ncits 76.2 1.608e-02 1.118e-02 2.130e-02 . 2.048e-02
##
## nVar 3 2 4
## BIC -2.010e+03 -2.008e+03 -2.007e+03
## post prob 0.461 0.157 0.120
## model 4 model 5
## Intercept -2.166e+01 -2.048e+01
## RankAsstProf . .
## RankProf 1.929e+01 1.890e+01
## DisciplineB 8.643e-01 9.554e-01
## Yrs.since.phd . 2.574e-02
## Yrs.service . .
## SexMale 7.303e-01 .
## NPubs . .
## Ncits 2.147e-02 .
##
## nVar 4 3
## BIC -2.006e+03 -2.006e+03
## post prob 0.059 0.056
##
## 1 observations deleted due to missingness.
imageplot.bma(bma.2)
m.12 = glm(salary.high ~ Rank + Discipline + Ncits, family = binomial, data = salary)
summary(m.12)
##
## Call:
## glm(formula = salary.high ~ Rank + Discipline + Ncits, family = binomial,
## data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -2.104e+01 1.307e+03 -0.016 0.98716
## RankAsstProf 7.081e-02 1.825e+03 0.000 0.99997
## RankProf 1.937e+01 1.307e+03 0.015 0.98818
## DisciplineB 8.515e-01 2.597e-01 3.278 0.00104 **
## Ncits 2.130e-02 7.611e-03 2.799 0.00513 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 468.60 on 396 degrees of freedom
## Residual deviance: 341.77 on 392 degrees of freedom
## AIC: 351.77
##
## Number of Fisher Scoring iterations: 18
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:base':
##
## format.pval, units
## Registered S3 method overwritten by 'rms':
## method from
## print.lrtest epiDisplay
##
## Attaching package: 'rms'
## The following object is masked from 'package:epiDisplay':
##
## lrtest
## The following objects are masked from 'package:car':
##
## Predict, vif
ddist = datadist(salary)
options(datadist = 'ddist')
m = lrm(salary.high ~ Rank + Discipline + Ncits, data = salary)
summary(m)
## Effects Response : salary.high
##
## Factor Low High Diff. Effect S.E. Lower 0.95
## Ncits 28 50 22 4.6858e-01 0.16743 1.4041e-01
## Odds Ratio 28 50 22 1.5977e+00 NA 1.1508e+00
## Rank - AssocProf:Prof 3 1 NA -1.1291e+01 37.99300 -8.5755e+01
## Odds Ratio 3 1 NA 1.2486e-05 NA 5.7132e-38
## Rank - AsstProf:Prof 3 2 NA -1.1220e+01 37.00400 -8.3747e+01
## Odds Ratio 3 2 NA 1.3400e-05 NA 4.2561e-37
## Discipline - A:B 2 1 NA -8.5147e-01 0.25973 -1.3605e+00
## Odds Ratio 2 1 NA 4.2679e-01 NA 2.5652e-01
## Upper 0.95
## 7.9675e-01
## 2.2183e+00
## 6.3174e+01
## 2.7287e+27
## 6.1307e+01
## 4.2190e+26
## -3.4241e-01
## 7.1006e-01
nom.m = nomogram(m, fun = function(x)1/(1+exp(-x)), fun.at = c(.001, .01, .10, seq(.2, .8, by = .2), .90, .99, .999),
funlabel = "Prediction of the likelihood of getting a high salary")
plot(nom.m)