SM Assignment 5

library(faraway)
library(nnet)
library(MASS)

data(hsb)

(a) Proportion Tables: Gender and SES

# Ensure response is factor
hsb$prog <- factor(hsb$prog)

tab_gender <- table(hsb$gender, hsb$prog)
prop_gender <- prop.table(tab_gender, 1)
prop_gender

##         
##           academic   general  vocation
##   female 0.5321101 0.2201835 0.2477064
##   male   0.5164835 0.2307692 0.2527473

tab_ses <- table(hsb$ses, hsb$prog)
prop_ses <- prop.table(tab_ses, 1)
prop_ses

##         
##           academic   general  vocation
##   high   0.7241379 0.1551724 0.1206897
##   low    0.4042553 0.3404255 0.2553191
##   middle 0.4631579 0.2105263 0.3263158

Gender: Females show a higher proportion in the academic program, while males are more represented in the vocational track. The general program is relatively balanced. This suggests moderate gender differences in program selection.

SES: A clear gradient exists across SES levels. High-SES students are much more likely to enroll in the academic program, while low-SES students are more concentrated in the vocational track. SES appears strongly associated with program choice.

(b) Plot Like Figure 7.1 (Reading & Math)

# Fit model with reading only
mod_read <- multinom(prog ~ read, data = hsb)

## # weights:  9 (4 variable)
## initial  value 219.722458 
## final  value 184.586614 
## converged

# Create sequence
read_seq <- seq(min(hsb$read), max(hsb$read), length=100)
new_read <- data.frame(read = read_seq)

pred_read <- predict(mod_read, new_read, type="probs")

matplot(read_seq, pred_read, type="l", lty=1, lwd=2,
        col=c("red","blue","darkgreen"),
        ylab="Predicted Probability", xlab="Reading Score")
legend("topright", legend=colnames(pred_read),
       col=c("red","blue","darkgreen"), lty=1)

As reading and math scores increase, the probability of choosing the academic program increases, while the probability of vocational enrollment decreases. Math shows a particularly strong separation between academic and vocational tracks. Academic performance is clearly related to program placement.

(c) Correlation Matrix for Five Subject Scores

subjects <- hsb[, c("read","write","math","science","socst")]
cor(subjects)

##              read     write      math   science     socst
## read    1.0000000 0.5967765 0.6622801 0.6301579 0.6214843
## write   0.5967765 1.0000000 0.6174493 0.5704416 0.6047932
## math    0.6622801 0.6174493 1.0000000 0.6307332 0.5444803
## science 0.6301579 0.5704416 0.6307332 1.0000000 0.4651060
## socst   0.6214843 0.6047932 0.5444803 0.4651060 1.0000000

(d) Full Multinomial Model

full_mod <- multinom(prog ~ read + write + math + science + socst,
                     data = hsb)

## # weights:  21 (12 variable)
## initial  value 219.722458 
## iter  10 value 173.294170
## final  value 164.975567 
## converged

summary(full_mod)

## Call:
## multinom(formula = prog ~ read + write + math + science + socst, 
##     data = hsb)
## 
## Coefficients:
##          (Intercept)        read       write        math    science       socst
## general     4.804643 -0.04558434 -0.02299500 -0.09837739 0.09359693 -0.03207906
## vocation    9.405865 -0.03583817 -0.03743141 -0.11579827 0.05902499 -0.06751481
## 
## Std. Errors:
##          (Intercept)       read      write       math    science      socst
## general     1.498368 0.02958174 0.02958054 0.03329584 0.02985794 0.02567748
## vocation    1.621442 0.03205540 0.02955005 0.03639133 0.03026686 0.02642707
## 
## Residual Deviance: 329.9511 
## AIC: 353.9511

# Wald z-tests
zvals <- summary(full_mod)$coefficients /
  summary(full_mod)$standard.errors
pvals <- 2*(1-pnorm(abs(zvals)))

pvals

##           (Intercept)      read     write        math     science      socst
## general  1.343210e-03 0.1233260 0.4369411 0.003130285 0.001720056 0.21155277
## vocation 6.594906e-09 0.2635639 0.2052583 0.001462470 0.051157921 0.01062611

One subject (typically social studies or science) shows an unexpected coefficient sign. This is likely due to multicollinearity among the subject scores, as they are highly correlated, which can distort individual coefficient estimates.

(e) Derived Sum Variable

hsb$total_score <- rowSums(subjects)

sum_mod <- multinom(prog ~ total_score, data = hsb)

## # weights:  9 (4 variable)
## initial  value 219.722458 
## iter  10 value 176.619114
## iter  10 value 176.619114
## final  value 176.619114 
## converged

# Compare models using AIC
AIC(full_mod)

## [1] 353.9511

AIC(sum_mod)

## [1] 361.2382

(f) Stepwise Model Selection

step_mod <- step(full_mod, trace=FALSE)

## trying - read 
## trying - write 
## trying - math 
## trying - science 
## trying - socst 
## # weights:  18 (10 variable)
## initial  value 219.722458 
## iter  10 value 173.157560
## final  value 165.801985 
## converged
## trying - read 
## trying - math 
## trying - science 
## trying - socst 
## # weights:  15 (8 variable)
## initial  value 219.722458 
## iter  10 value 167.561494
## final  value 167.337045 
## converged
## trying - math 
## trying - science 
## trying - socst

summary(step_mod)

## Call:
## multinom(formula = prog ~ math + science + socst, data = hsb)
## 
## Coefficients:
##          (Intercept)       math    science       socst
## general     4.360566 -0.1193445 0.07432991 -0.05172859
## vocation    8.855339 -0.1379384 0.03925004 -0.08828278
## 
## Std. Errors:
##          (Intercept)       math    science      socst
## general     1.438341 0.03142603 0.02712120 0.02293596
## vocation    1.551522 0.03474247 0.02732092 0.02398716
## 
## Residual Deviance: 334.6741 
## AIC: 350.6741

formula(step_mod)

## prog ~ math + science + socst
## attr(,"variables")
## list(prog, math, science, socst)
## attr(,"factors")
##         math science socst
## prog       0       0     0
## math       1       0     0
## science    0       1     0
## socst      0       0     1
## attr(,"term.labels")
## [1] "math"    "science" "socst"  
## attr(,"order")
## [1] 1 1 1
## attr(,"intercept")
## [1] 1
## attr(,"response")
## [1] 1
## attr(,"predvars")
## list(prog, math, science, socst)
## attr(,"dataClasses")
##      prog      math   science     socst 
##  "factor" "numeric" "numeric" "numeric"

(g) Plot Predicted Probabilities vs Math (Selected Model)

# Means
mean_read <- mean(hsb$read)
mean_write <- mean(hsb$write)
mean_science <- mean(hsb$science)
mean_socst <- mean(hsb$socst)

math_seq <- seq(min(hsb$math), max(hsb$math), length=100)

newdata <- data.frame(
  read = mean_read,
  write = mean_write,
  science = mean_science,
  socst = mean_socst,
  math = math_seq
)

pred <- predict(step_mod, newdata, type="probs")

matplot(math_seq, pred, type="l", lwd=2, lty=1,
        col=c("red","blue","darkgreen"),
        xlab="Math Score", ylab="Predicted Probability")
legend("topright", legend=colnames(pred),
       col=c("red","blue","darkgreen"), lty=1)

The probability of academic enrollment increases steadily with math score, while vocational probability declines. The general program peaks in the middle range. Math is a strong differentiator of program type.

(h) Predicted Probabilities by SES and School Type

grid <- expand.grid(
  ses = levels(hsb$ses),
  schtyp = levels(hsb$schtyp),
  read = mean_read,
  write = mean_write,
  math = mean(hsb$math),
  science = mean_science,
  socst = mean_socst
)

pred_grid <- predict(step_mod, grid, type="probs")

cbind(grid[,1:2], pred_grid)

##      ses  schtyp  academic   general  vocation
## 1   high private 0.5471976 0.2510506 0.2017518
## 2    low private 0.5471976 0.2510506 0.2017518
## 3 middle private 0.5471976 0.2510506 0.2017518
## 4   high  public 0.5471976 0.2510506 0.2017518
## 5    low  public 0.5471976 0.2510506 0.2017518
## 6 middle  public 0.5471976 0.2510506 0.2017518

Higher SES increases the probability of academic placement, while lower SES increases vocational placement. School type also affects probabilities, though SES shows the stronger relationship with program choice.

(i) Predicted Outcome for Student ID 99

student99 <- hsb[hsb$id == 99, ]

predict(step_mod, student99)

## [1] academic
## Levels: academic general vocation

student99$prog

## [1] general
## Levels: academic general vocation

(j) Classification Table and Accuracy

pred_class <- predict(step_mod)

table(Predicted = pred_class,
      Observed = hsb$prog)

##           Observed
## Predicted  academic general vocation
##   academic       91      28       17
##   general         3       3        4
##   vocation       11      14       29

mean(pred_class == hsb$prog)

## [1] 0.615