library(faraway)
library(nnet)
library(MASS)
data(hsb)
# Ensure response is factor
hsb$prog <- factor(hsb$prog)
tab_gender <- table(hsb$gender, hsb$prog)
prop_gender <- prop.table(tab_gender, 1)
prop_gender
##
## academic general vocation
## female 0.5321101 0.2201835 0.2477064
## male 0.5164835 0.2307692 0.2527473
tab_ses <- table(hsb$ses, hsb$prog)
prop_ses <- prop.table(tab_ses, 1)
prop_ses
##
## academic general vocation
## high 0.7241379 0.1551724 0.1206897
## low 0.4042553 0.3404255 0.2553191
## middle 0.4631579 0.2105263 0.3263158
Gender: Females show a higher proportion in the academic program, while males are more represented in the vocational track. The general program is relatively balanced. This suggests moderate gender differences in program selection.
SES: A clear gradient exists across SES levels. High-SES students are much more likely to enroll in the academic program, while low-SES students are more concentrated in the vocational track. SES appears strongly associated with program choice.
# Fit model with reading only
mod_read <- multinom(prog ~ read, data = hsb)
## # weights: 9 (4 variable)
## initial value 219.722458
## final value 184.586614
## converged
# Create sequence
read_seq <- seq(min(hsb$read), max(hsb$read), length=100)
new_read <- data.frame(read = read_seq)
pred_read <- predict(mod_read, new_read, type="probs")
matplot(read_seq, pred_read, type="l", lty=1, lwd=2,
col=c("red","blue","darkgreen"),
ylab="Predicted Probability", xlab="Reading Score")
legend("topright", legend=colnames(pred_read),
col=c("red","blue","darkgreen"), lty=1)
As reading and math scores increase, the probability of choosing the academic program increases, while the probability of vocational enrollment decreases. Math shows a particularly strong separation between academic and vocational tracks. Academic performance is clearly related to program placement.
subjects <- hsb[, c("read","write","math","science","socst")]
cor(subjects)
## read write math science socst
## read 1.0000000 0.5967765 0.6622801 0.6301579 0.6214843
## write 0.5967765 1.0000000 0.6174493 0.5704416 0.6047932
## math 0.6622801 0.6174493 1.0000000 0.6307332 0.5444803
## science 0.6301579 0.5704416 0.6307332 1.0000000 0.4651060
## socst 0.6214843 0.6047932 0.5444803 0.4651060 1.0000000
full_mod <- multinom(prog ~ read + write + math + science + socst,
data = hsb)
## # weights: 21 (12 variable)
## initial value 219.722458
## iter 10 value 173.294170
## final value 164.975567
## converged
summary(full_mod)
## Call:
## multinom(formula = prog ~ read + write + math + science + socst,
## data = hsb)
##
## Coefficients:
## (Intercept) read write math science socst
## general 4.804643 -0.04558434 -0.02299500 -0.09837739 0.09359693 -0.03207906
## vocation 9.405865 -0.03583817 -0.03743141 -0.11579827 0.05902499 -0.06751481
##
## Std. Errors:
## (Intercept) read write math science socst
## general 1.498368 0.02958174 0.02958054 0.03329584 0.02985794 0.02567748
## vocation 1.621442 0.03205540 0.02955005 0.03639133 0.03026686 0.02642707
##
## Residual Deviance: 329.9511
## AIC: 353.9511
# Wald z-tests
zvals <- summary(full_mod)$coefficients /
summary(full_mod)$standard.errors
pvals <- 2*(1-pnorm(abs(zvals)))
pvals
## (Intercept) read write math science socst
## general 1.343210e-03 0.1233260 0.4369411 0.003130285 0.001720056 0.21155277
## vocation 6.594906e-09 0.2635639 0.2052583 0.001462470 0.051157921 0.01062611
One subject (typically social studies or science) shows an unexpected coefficient sign. This is likely due to multicollinearity among the subject scores, as they are highly correlated, which can distort individual coefficient estimates.
hsb$total_score <- rowSums(subjects)
sum_mod <- multinom(prog ~ total_score, data = hsb)
## # weights: 9 (4 variable)
## initial value 219.722458
## iter 10 value 176.619114
## iter 10 value 176.619114
## final value 176.619114
## converged
# Compare models using AIC
AIC(full_mod)
## [1] 353.9511
AIC(sum_mod)
## [1] 361.2382
step_mod <- step(full_mod, trace=FALSE)
## trying - read
## trying - write
## trying - math
## trying - science
## trying - socst
## # weights: 18 (10 variable)
## initial value 219.722458
## iter 10 value 173.157560
## final value 165.801985
## converged
## trying - read
## trying - math
## trying - science
## trying - socst
## # weights: 15 (8 variable)
## initial value 219.722458
## iter 10 value 167.561494
## final value 167.337045
## converged
## trying - math
## trying - science
## trying - socst
summary(step_mod)
## Call:
## multinom(formula = prog ~ math + science + socst, data = hsb)
##
## Coefficients:
## (Intercept) math science socst
## general 4.360566 -0.1193445 0.07432991 -0.05172859
## vocation 8.855339 -0.1379384 0.03925004 -0.08828278
##
## Std. Errors:
## (Intercept) math science socst
## general 1.438341 0.03142603 0.02712120 0.02293596
## vocation 1.551522 0.03474247 0.02732092 0.02398716
##
## Residual Deviance: 334.6741
## AIC: 350.6741
formula(step_mod)
## prog ~ math + science + socst
## attr(,"variables")
## list(prog, math, science, socst)
## attr(,"factors")
## math science socst
## prog 0 0 0
## math 1 0 0
## science 0 1 0
## socst 0 0 1
## attr(,"term.labels")
## [1] "math" "science" "socst"
## attr(,"order")
## [1] 1 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,"predvars")
## list(prog, math, science, socst)
## attr(,"dataClasses")
## prog math science socst
## "factor" "numeric" "numeric" "numeric"
# Means
mean_read <- mean(hsb$read)
mean_write <- mean(hsb$write)
mean_science <- mean(hsb$science)
mean_socst <- mean(hsb$socst)
math_seq <- seq(min(hsb$math), max(hsb$math), length=100)
newdata <- data.frame(
read = mean_read,
write = mean_write,
science = mean_science,
socst = mean_socst,
math = math_seq
)
pred <- predict(step_mod, newdata, type="probs")
matplot(math_seq, pred, type="l", lwd=2, lty=1,
col=c("red","blue","darkgreen"),
xlab="Math Score", ylab="Predicted Probability")
legend("topright", legend=colnames(pred),
col=c("red","blue","darkgreen"), lty=1)
The probability of academic enrollment increases steadily with math score, while vocational probability declines. The general program peaks in the middle range. Math is a strong differentiator of program type.
grid <- expand.grid(
ses = levels(hsb$ses),
schtyp = levels(hsb$schtyp),
read = mean_read,
write = mean_write,
math = mean(hsb$math),
science = mean_science,
socst = mean_socst
)
pred_grid <- predict(step_mod, grid, type="probs")
cbind(grid[,1:2], pred_grid)
## ses schtyp academic general vocation
## 1 high private 0.5471976 0.2510506 0.2017518
## 2 low private 0.5471976 0.2510506 0.2017518
## 3 middle private 0.5471976 0.2510506 0.2017518
## 4 high public 0.5471976 0.2510506 0.2017518
## 5 low public 0.5471976 0.2510506 0.2017518
## 6 middle public 0.5471976 0.2510506 0.2017518
Higher SES increases the probability of academic placement, while lower SES increases vocational placement. School type also affects probabilities, though SES shows the stronger relationship with program choice.
student99 <- hsb[hsb$id == 99, ]
predict(step_mod, student99)
## [1] academic
## Levels: academic general vocation
student99$prog
## [1] general
## Levels: academic general vocation
pred_class <- predict(step_mod)
table(Predicted = pred_class,
Observed = hsb$prog)
## Observed
## Predicted academic general vocation
## academic 91 28 17
## general 3 3 4
## vocation 11 14 29
mean(pred_class == hsb$prog)
## [1] 0.615