salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv")
library(caret)
## Loading required package: ggplot2
## Loading required package: lattice
set.seed(1234)
index = createDataPartition(salary$Salary, p = 0.75, list = FALSE)
training = salary[index, ]
dim(training)
## [1] 300 9
testing = salary[-index, ]
dim(testing)
## [1] 97 9
library(table1)
##
## Attaching package: 'table1'
## The following objects are masked from 'package:base':
##
## units, units<-
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex, data = training)
Overall (N=300) |
|
---|---|
Rank | |
AssocProf | 50 (16.7%) |
AsstProf | 50 (16.7%) |
Prof | 200 (66.7%) |
Discipline | |
A | 137 (45.7%) |
B | 163 (54.3%) |
Yrs.since.phd | |
Mean (SD) | 22.0 (12.9) |
Median [Min, Max] | 20.5 [1.00, 56.0] |
Yrs.service | |
Mean (SD) | 17.7 (12.9) |
Median [Min, Max] | 17.0 [0, 57.0] |
NPubs | |
Mean (SD) | 18.0 (14.0) |
Median [Min, Max] | 13.0 [1.00, 69.0] |
Ncits | |
Mean (SD) | 39.9 (17.6) |
Median [Min, Max] | 35.0 [1.00, 90.0] |
Salary | |
Mean (SD) | 114000 (29800) |
Median [Min, Max] | 107000 [57800, 232000] |
Sex | |
Female | 24 (8.0%) |
Male | 276 (92.0%) |
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex, data = testing)
Overall (N=97) |
|
---|---|
Rank | |
AssocProf | 14 (14.4%) |
AsstProf | 17 (17.5%) |
Prof | 66 (68.0%) |
Discipline | |
A | 44 (45.4%) |
B | 53 (54.6%) |
Yrs.since.phd | |
Mean (SD) | 23.2 (13.0) |
Median [Min, Max] | 22.0 [3.00, 56.0] |
Yrs.service | |
Mean (SD) | 17.2 (13.3) |
Median [Min, Max] | 14.0 [0, 60.0] |
NPubs | |
Mean (SD) | 18.7 (13.9) |
Median [Min, Max] | 14.0 [1.00, 50.0] |
Ncits | |
Mean (SD) | 41.2 (14.6) |
Median [Min, Max] | 37.0 [14.0, 83.0] |
Salary | |
Mean (SD) | 113000 (31900) |
Median [Min, Max] | 107000 [62900, 206000] |
Sex | |
Female | 15 (15.5%) |
Male | 82 (84.5%) |
library(BMA)
## Loading required package: survival
##
## Attaching package: 'survival'
## The following object is masked from 'package:caret':
##
## cluster
## Loading required package: leaps
## Loading required package: robustbase
##
## Attaching package: 'robustbase'
## The following object is masked from 'package:survival':
##
## heart
## Loading required package: inline
## Loading required package: rrcov
## Scalable Robust Estimators with High Breakdown Point (version 1.7-4)
xvars = training[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "NPubs", "Ncits")]
yvar = training[,c("Salary")]
bma = bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
summary(bma)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 7 models were selected
## Best 5 models (cumulative posterior probability = 0.9347 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 8.833e+04 4376.20 88094.11 91372.51 86697.82
## RankAsstProf 100.0 -1.653e+04 4677.07 -16064.89 -18293.10 -16414.39
## RankProf 100.0 3.256e+04 3896.77 31938.49 34788.76 32340.00
## DisciplineB 100.0 1.315e+04 2697.84 13227.64 12527.42 13644.18
## Yrs.since.phd 17.8 9.058e+01 254.77 . . 648.48
## Yrs.service 33.1 -1.546e+02 285.53 . -249.58 -751.30
## SexMale 3.8 1.281e+02 1153.12 . . .
## NPubs 3.1 -7.687e-01 17.20 . . .
## Ncits 4.5 3.017e+00 21.15 . . .
##
## nVar 3 4 5
## r2 0.427 0.434 0.443
## BIC -149.79 -147.75 -147.22
## post prob 0.521 0.187 0.144
## model 4 model 5
## Intercept 85234.76 85203.55
## RankAsstProf -15556.44 -15926.67
## RankProf 32013.72 31688.22
## DisciplineB 13319.46 13128.83
## Yrs.since.phd . .
## Yrs.service . .
## SexMale . 3356.59
## NPubs . .
## Ncits 67.03 .
##
## nVar 4 4
## r2 0.428 0.428
## BIC -144.90 -144.57
## post prob 0.045 0.038
imageplot.bma(bma)
m1 = lm(Salary ~ Rank + Discipline, data = training)
summary(m1)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline, data = training)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65701 -13823 -778 11321 98285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88094 3632 24.258 < 2e-16 ***
## RankAsstProf -16065 4535 -3.542 0.000461 ***
## RankProf 31939 3608 8.852 < 2e-16 ***
## DisciplineB 13228 2665 4.963 1.17e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22670 on 296 degrees of freedom
## Multiple R-squared: 0.4267, Adjusted R-squared: 0.4209
## F-statistic: 73.44 on 3 and 296 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m1)
fit.m1 = train(Salary ~ Rank + Discipline, data = training, method = "lm", metric = "Rsquared")
summary(fit.m1)
##
## Call:
## lm(formula = .outcome ~ ., data = dat)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65701 -13823 -778 11321 98285
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 88094 3632 24.258 < 2e-16 ***
## RankAsstProf -16065 4535 -3.542 0.000461 ***
## RankProf 31939 3608 8.852 < 2e-16 ***
## DisciplineB 13228 2665 4.963 1.17e-06 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22670 on 296 degrees of freedom
## Multiple R-squared: 0.4267, Adjusted R-squared: 0.4209
## F-statistic: 73.44 on 3 and 296 DF, p-value: < 2.2e-16
pred.m1 = predict(fit.m1, testing)
testing.data = data.frame(obs = testing$Salary, pred = pred.m1)
defaultSummary(testing.data)
## RMSE Rsquared MAE
## 2.266292e+04 4.940016e-01 1.672117e+04
table1(~ Rank + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex, data = salary)
Overall (N=397) |
|
---|---|
Rank | |
AssocProf | 64 (16.1%) |
AsstProf | 67 (16.9%) |
Prof | 266 (67.0%) |
Discipline | |
A | 181 (45.6%) |
B | 216 (54.4%) |
Yrs.since.phd | |
Mean (SD) | 22.3 (12.9) |
Median [Min, Max] | 21.0 [1.00, 56.0] |
Yrs.service | |
Mean (SD) | 17.6 (13.0) |
Median [Min, Max] | 16.0 [0, 60.0] |
NPubs | |
Mean (SD) | 18.2 (14.0) |
Median [Min, Max] | 13.0 [1.00, 69.0] |
Ncits | |
Mean (SD) | 40.2 (16.9) |
Median [Min, Max] | 35.0 [1.00, 90.0] |
Salary | |
Mean (SD) | 114000 (30300) |
Median [Min, Max] | 107000 [57800, 232000] |
Sex | |
Female | 39 (9.8%) |
Male | 358 (90.2%) |
library(BMA)
xvars = salary[, c("Rank", "Discipline", "Yrs.since.phd", "Yrs.service", "Sex", "NPubs", "Ncits")]
yvar = salary[,c("Salary")]
bma.2 = bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
summary(bma.2)
##
## Call:
## bicreg(x = xvars, y = yvar, strict = FALSE, OR = 20)
##
##
## 6 models were selected
## Best 5 models (cumulative posterior probability = 0.9676 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100.0 8.428e+04 4246.08 85705.87 80159.33 81946.95
## RankAsstProf 100.0 -1.360e+04 3992.00 -13761.54 -13011.52 -13723.42
## RankProf 100.0 3.409e+04 3193.57 34082.30 34252.88 33679.90
## DisciplineB 100.0 1.376e+04 2298.00 13760.96 13779.31 13708.69
## Yrs.since.phd 3.7 2.628e+00 27.72 . . .
## Yrs.service 3.9 -3.007e+00 26.61 . . .
## SexMale 6.1 2.760e+02 1441.87 . . 4491.80
## NPubs 3.2 7.712e-01 15.32 . . .
## Ncits 21.3 2.801e+01 62.18 . 131.65 .
##
## nVar 3 4 4
## r2 0.445 0.450 0.447
## BIC -215.78 -213.65 -211.17
## post prob 0.618 0.213 0.061
## model 4 model 5
## Intercept 86736.76 84435.56
## RankAsstProf -14483.23 -13030.16
## RankProf 34894.27 33181.41
## DisciplineB 13561.43 14028.68
## Yrs.since.phd . 71.92
## Yrs.service -76.33 .
## SexMale . .
## NPubs . .
## Ncits . .
##
## nVar 4 4
## r2 0.446 0.445
## BIC -210.28 -210.13
## post prob 0.039 0.037
imageplot.bma(bma.2)
m2 = lm(Salary ~ Rank + Discipline, data = salary)
summary(m2)
##
## Call:
## lm(formula = Salary ~ Rank + Discipline, data = salary)
##
## Residuals:
## Min 1Q Median 3Q Max
## -65990 -14049 -1288 10760 97996
##
## Coefficients:
## Estimate Std. Error t value Pr(>|t|)
## (Intercept) 85706 3142 27.273 < 2e-16 ***
## RankAsstProf -13762 3961 -3.475 0.000569 ***
## RankProf 34082 3160 10.786 < 2e-16 ***
## DisciplineB 13761 2296 5.993 4.65e-09 ***
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## Residual standard error: 22650 on 393 degrees of freedom
## Multiple R-squared: 0.445, Adjusted R-squared: 0.4407
## F-statistic: 105 on 3 and 393 DF, p-value: < 2.2e-16
par(mfrow = c(2,2))
plot(m2)
library(rms)
## Loading required package: Hmisc
##
## Attaching package: 'Hmisc'
## The following objects are masked from 'package:table1':
##
## label, label<-, units
## The following objects are masked from 'package:base':
##
## format.pval, units
fit.m2 = ols(Salary ~ Rank + Discipline, data = salary, x = TRUE, y = TRUE)
fit.m2
## Linear Regression Model
##
## ols(formula = Salary ~ Rank + Discipline, data = salary, x = TRUE,
## y = TRUE)
##
## Model Likelihood Discrimination
## Ratio Test Indexes
## Obs 397 LR chi2 233.73 R2 0.445
## sigma22651.1854 d.f. 3 R2 adj 0.441
## d.f. 393 Pr(> chi2) 0.0000 g 21717.086
##
## Residuals
##
## Min 1Q Median 3Q Max
## -65990 -14049 -1288 10760 97996
##
##
## Coef S.E. t Pr(>|t|)
## Intercept 85705.8693 3142.5016 27.27 <0.0001
## Rank=AsstProf -13761.5432 3960.6611 -3.47 0.0006
## Rank=Prof 34082.2954 3159.8850 10.79 <0.0001
## Discipline=B 13760.9570 2296.0309 5.99 <0.0001
set.seed(1234)
v.m2 = validate(fit.m2, B = 500)
v.m2
## index.orig training test optimism index.corrected
## R-square 4.450000e-01 4.467000e-01 4.412000e-01 0.0055 4.395000e-01
## MSE 5.079067e+08 5.039997e+08 5.114001e+08 -7400384.7194 5.153071e+08
## g 2.171709e+04 2.162071e+04 2.167298e+04 -52.2655 2.176935e+04
## Intercept 0.000000e+00 0.000000e+00 9.012950e+01 -90.1295 9.012950e+01
## Slope 1.000000e+00 1.000000e+00 9.998000e-01 0.0002 9.998000e-01
## n
## R-square 500
## MSE 500
## g 500
## Intercept 500
## Slope 500
salary = read.csv("C:\\Thach\\UTS\\Teaching\\TRM\\Practical Data Analysis\\2024_Autumn semester\\Data\\Professorial Salaries.csv")
salary$high.salary = ifelse(salary$Salary> 120000, 1, 0)
salary$Prof[salary$Rank == "Prof"] = "Professor"
salary$Prof[salary$Rank != "Prof"] = "AssProfessor"
library(caret)
set.seed(1234)
index = createDataPartition(salary$Salary, p = 0.75, list = FALSE)
training = salary[index, ]
dim(training)
## [1] 300 11
testing = salary[-index, ]
dim(testing)
## [1] 97 11
library(table1)
table1(~ Rank + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex | as.factor(high.salary), data = training)
0 (N=193) |
1 (N=107) |
Overall (N=300) |
|
---|---|---|---|
Rank | |||
AssocProf | 49 (25.4%) | 1 (0.9%) | 50 (16.7%) |
AsstProf | 50 (25.9%) | 0 (0%) | 50 (16.7%) |
Prof | 94 (48.7%) | 106 (99.1%) | 200 (66.7%) |
Prof | |||
AssProfessor | 99 (51.3%) | 1 (0.9%) | 100 (33.3%) |
Professor | 94 (48.7%) | 106 (99.1%) | 200 (66.7%) |
Discipline | |||
A | 92 (47.7%) | 45 (42.1%) | 137 (45.7%) |
B | 101 (52.3%) | 62 (57.9%) | 163 (54.3%) |
Yrs.since.phd | |||
Mean (SD) | 19.0 (13.7) | 27.6 (8.98) | 22.0 (12.9) |
Median [Min, Max] | 15.0 [1.00, 56.0] | 28.0 [11.0, 46.0] | 20.5 [1.00, 56.0] |
Yrs.service | |||
Mean (SD) | 15.1 (13.7) | 22.5 (9.87) | 17.7 (12.9) |
Median [Min, Max] | 9.00 [0, 57.0] | 20.0 [2.00, 45.0] | 17.0 [0, 57.0] |
NPubs | |||
Mean (SD) | 17.5 (12.8) | 18.9 (15.9) | 18.0 (14.0) |
Median [Min, Max] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | |||
Mean (SD) | 38.3 (16.4) | 42.8 (19.4) | 39.9 (17.6) |
Median [Min, Max] | 35.0 [1.00, 90.0] | 36.0 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | |||
Mean (SD) | 95600 (14400) | 147000 (20300) | 114000 (29800) |
Median [Min, Max] | 96600 [57800, 120000] | 144000 [121000, 232000] | 107000 [57800, 232000] |
Sex | |||
Female | 19 (9.8%) | 5 (4.7%) | 24 (8.0%) |
Male | 174 (90.2%) | 102 (95.3%) | 276 (92.0%) |
table1(~ Rank + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex | as.factor(high.salary), data = testing)
0 (N=62) |
1 (N=35) |
Overall (N=97) |
|
---|---|---|---|
Rank | |||
AssocProf | 14 (22.6%) | 0 (0%) | 14 (14.4%) |
AsstProf | 17 (27.4%) | 0 (0%) | 17 (17.5%) |
Prof | 31 (50.0%) | 35 (100%) | 66 (68.0%) |
Prof | |||
AssProfessor | 31 (50.0%) | 0 (0%) | 31 (32.0%) |
Professor | 31 (50.0%) | 35 (100%) | 66 (68.0%) |
Discipline | |||
A | 35 (56.5%) | 9 (25.7%) | 44 (45.4%) |
B | 27 (43.5%) | 26 (74.3%) | 53 (54.6%) |
Yrs.since.phd | |||
Mean (SD) | 19.5 (13.0) | 29.8 (10.2) | 23.2 (13.0) |
Median [Min, Max] | 16.5 [3.00, 46.0] | 31.0 [13.0, 56.0] | 22.0 [3.00, 56.0] |
Yrs.service | |||
Mean (SD) | 14.3 (12.6) | 22.4 (13.0) | 17.2 (13.3) |
Median [Min, Max] | 9.00 [0, 46.0] | 20.0 [2.00, 60.0] | 14.0 [0, 60.0] |
NPubs | |||
Mean (SD) | 17.9 (14.4) | 20.1 (13.2) | 18.7 (13.9) |
Median [Min, Max] | 13.0 [1.00, 50.0] | 18.0 [3.00, 50.0] | 14.0 [1.00, 50.0] |
Ncits | |||
Mean (SD) | 39.5 (13.6) | 44.3 (16.0) | 41.2 (14.6) |
Median [Min, Max] | 36.0 [14.0, 70.0] | 41.0 [14.0, 83.0] | 37.0 [14.0, 83.0] |
Salary | |||
Mean (SD) | 93100 (15300) | 149000 (21000) | 113000 (31900) |
Median [Min, Max] | 95500 [62900, 120000] | 145000 [121000, 206000] | 107000 [62900, 206000] |
Sex | |||
Female | 11 (17.7%) | 4 (11.4%) | 15 (15.5%) |
Male | 51 (82.3%) | 31 (88.6%) | 82 (84.5%) |
library(BMA)
bma.bin1 = bic.glm(high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits, strict = F, OR = 20, glm.family = "binomial", data = training)
summary(bma.bin1)
##
## Call:
## bic.glm.formula(f = high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits, data = training, glm.family = "binomial", strict = F, OR = 20)
##
##
## 8 models were selected
## Best 5 models (cumulative posterior probability = 0.9053 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 -5.139e+00 1.085239 -5.184e+00 -4.595e+00 -5.700e+00
## SexMale 3.2 6.160e-03 0.121368 . . .
## ProfProfessor 100.0 4.875e+00 1.024675 4.923e+00 4.715e+00 4.903e+00
## DisciplineB 77.9 6.213e-01 0.417734 7.970e-01 . 8.054e-01
## Yrs.since.phd 3.1 6.003e-05 0.002609 . . .
## Yrs.service 3.3 -1.357e-04 0.002464 . . .
## NPubs 3.1 4.455e-05 0.001748 . . .
## Ncits 15.4 2.003e-03 0.005665 . . 1.307e-02
##
## nVar 2 1 3
## BIC -1.414e+03 -1.412e+03 -1.411e+03
## post prob 0.535 0.183 0.116
## model 4 model 5
## Intercept -5.098e+00 -5.144e+00
## SexMale . .
## ProfProfessor 4.703e+00 4.987e+00
## DisciplineB . 7.814e-01
## Yrs.since.phd . .
## Yrs.service . -4.171e-03
## NPubs . .
## Ncits 1.266e-02 .
##
## nVar 2 3
## BIC -1.409e+03 -1.409e+03
## post prob 0.038 0.033
##
## 1 observations deleted due to missingness.
imageplot.bma(bma.bin1)
bin.m1 = glm(high.salary ~ Prof + Discipline, family = "binomial", data = training)
summary(bin.m1)
##
## Call:
## glm(formula = high.salary ~ Prof + Discipline, family = "binomial",
## data = training)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.1843 1.0321 -5.023 5.09e-07 ***
## ProfProfessor 4.9231 1.0205 4.824 1.41e-06 ***
## DisciplineB 0.7970 0.2876 2.771 0.00558 **
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 390.89 on 299 degrees of freedom
## Residual deviance: 279.89 on 297 degrees of freedom
## AIC: 285.89
##
## Number of Fisher Scoring iterations: 7
fit.bin.m1 = train(factor(high.salary) ~ Prof + Discipline + Ncits, data = training, method = "glm", family = "binomial")
summary(fit.bin.m1)
##
## Call:
## NULL
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -5.700254 1.086959 -5.244 1.57e-07 ***
## ProfProfessor 4.903412 1.020676 4.804 1.55e-06 ***
## DisciplineB 0.805372 0.289619 2.781 0.00542 **
## Ncits 0.013071 0.008133 1.607 0.10800
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 390.89 on 299 degrees of freedom
## Residual deviance: 277.24 on 296 degrees of freedom
## AIC: 285.24
##
## Number of Fisher Scoring iterations: 7
pred.bin.m1 = predict(fit.bin.m1, testing, type = "raw")
confusionMatrix(factor(testing$high.salary), pred.bin.m1, positive = "1")
## Confusion Matrix and Statistics
##
## Reference
## Prediction 0 1
## 0 50 12
## 1 6 29
##
## Accuracy : 0.8144
## 95% CI : (0.7227, 0.8862)
## No Information Rate : 0.5773
## P-Value [Acc > NIR] : 6.302e-07
##
## Kappa : 0.6122
##
## Mcnemar's Test P-Value : 0.2386
##
## Sensitivity : 0.7073
## Specificity : 0.8929
## Pos Pred Value : 0.8286
## Neg Pred Value : 0.8065
## Prevalence : 0.4227
## Detection Rate : 0.2990
## Detection Prevalence : 0.3608
## Balanced Accuracy : 0.8001
##
## 'Positive' Class : 1
##
library(pROC)
## Type 'citation("pROC")' for a citation.
##
## Attaching package: 'pROC'
## The following objects are masked from 'package:stats':
##
## cov, smooth, var
prob = predict(fit.bin.m1, newdata = testing, type = "prob")
pred.data = data.frame(prob, testing$high.salary)
head(pred.data)
## X0 X1 testing.high.salary
## 5 0.3671548 0.63284524 1
## 6 0.9880057 0.01199434 0
## 16 0.3948900 0.60511003 0
## 22 0.5840340 0.41596600 0
## 27 0.6091936 0.39080637 1
## 31 0.3402721 0.65972788 1
roc(pred.data$testing.high.salary, pred.data$X1)
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
##
## Call:
## roc.default(response = pred.data$testing.high.salary, predictor = pred.data$X1)
##
## Data: pred.data$X1 in 62 controls (pred.data$testing.high.salary 0) < 35 cases (pred.data$testing.high.salary 1).
## Area under the curve: 0.8776
auc = smooth(roc(pred.data$testing.high.salary, pred.data$X1))
## Setting levels: control = 0, case = 1
## Setting direction: controls < cases
plot(auc)
table1(~ Rank + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits + Salary + Sex | as.factor(high.salary), data = salary)
0 (N=255) |
1 (N=142) |
Overall (N=397) |
|
---|---|---|---|
Rank | |||
AssocProf | 63 (24.7%) | 1 (0.7%) | 64 (16.1%) |
AsstProf | 67 (26.3%) | 0 (0%) | 67 (16.9%) |
Prof | 125 (49.0%) | 141 (99.3%) | 266 (67.0%) |
Prof | |||
AssProfessor | 130 (51.0%) | 1 (0.7%) | 131 (33.0%) |
Professor | 125 (49.0%) | 141 (99.3%) | 266 (67.0%) |
Discipline | |||
A | 127 (49.8%) | 54 (38.0%) | 181 (45.6%) |
B | 128 (50.2%) | 88 (62.0%) | 216 (54.4%) |
Yrs.since.phd | |||
Mean (SD) | 19.1 (13.5) | 28.1 (9.29) | 22.3 (12.9) |
Median [Min, Max] | 15.0 [1.00, 56.0] | 28.0 [11.0, 56.0] | 21.0 [1.00, 56.0] |
Yrs.service | |||
Mean (SD) | 14.9 (13.4) | 22.5 (10.7) | 17.6 (13.0) |
Median [Min, Max] | 9.00 [0, 57.0] | 20.0 [2.00, 60.0] | 16.0 [0, 60.0] |
NPubs | |||
Mean (SD) | 17.6 (13.2) | 19.2 (15.2) | 18.2 (14.0) |
Median [Min, Max] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] | 13.0 [1.00, 69.0] |
Ncits | |||
Mean (SD) | 38.6 (15.8) | 43.1 (18.6) | 40.2 (16.9) |
Median [Min, Max] | 35.0 [1.00, 90.0] | 38.5 [1.00, 90.0] | 35.0 [1.00, 90.0] |
Salary | |||
Mean (SD) | 95000 (14600) | 147000 (20400) | 114000 (30300) |
Median [Min, Max] | 96200 [57800, 120000] | 144000 [121000, 232000] | 107000 [57800, 232000] |
Sex | |||
Female | 30 (11.8%) | 9 (6.3%) | 39 (9.8%) |
Male | 225 (88.2%) | 133 (93.7%) | 358 (90.2%) |
bma.bin2 = bic.glm(high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits, strict = F, OR = 20, glm.family = "binomial", data = salary)
summary(bma.bin2)
##
## Call:
## bic.glm.formula(f = high.salary ~ Sex + Prof + Discipline + Yrs.since.phd + Yrs.service + NPubs + Ncits, data = salary, glm.family = "binomial", strict = F, OR = 20)
##
##
## 9 models were selected
## Best 5 models (cumulative posterior probability = 0.9093 ):
##
## p!=0 EV SD model 1 model 2 model 3
## Intercept 100 -5.978e+00 1.129453 -6.336e+00 -5.559e+00 -5.691e+00
## SexMale 4.6 1.053e-02 0.117232 . . .
## ProfProfessor 100.0 5.202e+00 1.018200 5.222e+00 5.198e+00 5.033e+00
## DisciplineB 100.0 9.733e-01 0.254090 9.797e-01 9.618e-01 1.006e+00
## Yrs.since.phd 5.3 4.565e-04 0.003544 . . 9.694e-03
## Yrs.service 2.1 1.670e-05 0.001626 . . .
## NPubs 4.7 1.025e-05 0.002167 . . .
## Ncits 51.8 9.540e-03 0.010718 1.837e-02 . .
##
## nVar 3 2 3
## BIC -1.993e+03 -1.993e+03 -1.988e+03
## post prob 0.425 0.408 0.027
## model 4 model 5
## Intercept -6.421e+00 -6.542e+00
## SexMale . 2.341e-01
## ProfProfessor 5.090e+00 5.206e+00
## DisciplineB 1.013e+00 9.837e-01
## Yrs.since.phd 7.581e-03 .
## Yrs.service . .
## NPubs . .
## Ncits 1.802e-02 1.843e-02
##
## nVar 4 4
## BIC -1.988e+03 -1.988e+03
## post prob 0.025 0.024
##
## 1 observations deleted due to missingness.
imageplot.bma(bma.bin2)
bin.m2 = glm(high.salary ~ Prof + Discipline + Ncits, family = "binomial", data = salary)
summary(bin.m2)
##
## Call:
## glm(formula = high.salary ~ Prof + Discipline + Ncits, family = "binomial",
## data = salary)
##
## Coefficients:
## Estimate Std. Error z value Pr(>|z|)
## (Intercept) -6.335608 1.083334 -5.848 4.97e-09 ***
## ProfProfessor 5.221741 1.016653 5.136 2.80e-07 ***
## DisciplineB 0.979729 0.254899 3.844 0.000121 ***
## Ncits 0.018368 0.007614 2.412 0.015853 *
## ---
## Signif. codes: 0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
##
## (Dispersion parameter for binomial family taken to be 1)
##
## Null deviance: 517.75 on 396 degrees of freedom
## Residual deviance: 358.41 on 393 degrees of freedom
## AIC: 366.41
##
## Number of Fisher Scoring iterations: 7
library(rms)
fit.bin.m2 = lrm(high.salary ~ Prof + Discipline + Ncits, data = salary, x = TRUE, y = TRUE)
fit.bin.m2
## Logistic Regression Model
##
## lrm(formula = high.salary ~ Prof + Discipline + Ncits, data = salary,
## x = TRUE, y = TRUE)
##
## Model Likelihood Discrimination Rank Discrim.
## Ratio Test Indexes Indexes
## Obs 397 LR chi2 159.34 R2 0.454 C 0.824
## 0 255 d.f. 3 R2(3,397)0.326 Dxy 0.648
## 1 142 Pr(> chi2) <0.0001 R2(3,273.6)0.435 gamma 0.653
## max |deriv| 1e-05 Brier 0.157 tau-a 0.298
##
## Coef S.E. Wald Z Pr(>|Z|)
## Intercept -6.3356 1.0834 -5.85 <0.0001
## Prof=Professor 5.2217 1.0168 5.14 <0.0001
## Discipline=B 0.9797 0.2549 3.84 0.0001
## Ncits 0.0184 0.0076 2.41 0.0159
set.seed(1234)
val.bin.m2 = validate(fit.bin.m2, B = 500)
val.bin.m2
## index.orig training test optimism index.corrected n
## Dxy 0.6478 0.6494 0.6464 0.0030 0.6449 500
## R2 0.4537 0.4590 0.4462 0.0128 0.4409 500
## Intercept 0.0000 0.0000 0.0175 -0.0175 0.0175 500
## Slope 1.0000 1.0000 0.8564 0.1436 0.8564 500
## Emax 0.0000 0.0000 0.0374 0.0374 0.0374 500
## D 0.3988 0.4056 0.3907 0.0148 0.3840 500
## U -0.0050 -0.0050 0.0050 -0.0100 0.0050 500
## Q 0.4039 0.4106 0.3857 0.0249 0.3790 500
## B 0.1567 0.1555 0.1584 -0.0029 0.1596 500
## g 2.6489 3.5705 2.7251 0.8454 1.8036 500
## gp 0.3031 0.3031 0.2935 0.0096 0.2935 500
pred.logit = predict(fit.bin.m2)
phat = 1/(1 + exp(-pred.logit))
val.prob(phat, salary$high.salary, m = 20, cex = .6)
## Dxy C (ROC) R2 D D:Chi-sq
## 6.478597e-01 8.239299e-01 4.537261e-01 3.988291e-01 1.593352e+02
## D:p U U:Chi-sq U:p Q
## NA -5.037783e-03 -2.273737e-13 1.000000e+00 4.038669e-01
## Brier Intercept Slope Emax E90
## 1.566567e-01 -6.466591e-09 1.000000e+00 7.602105e-02 1.641977e-02
## Eavg S:z S:p
## 8.148218e-03 2.475030e-03 9.980252e-01