The hsb ("High School and Beyond") dataset is part of the faraway package.

library(faraway)
library(nnet)
library(gridExtra)
library(grid)
library(ggplot2)
library(lattice)

tabletheme <- gridExtra::ttheme_default(
    core = list(fg_params=list(cex = 0.65)),
    colhead = list(fg_params=list(cex = 0.65)),
    rowhead = list(fg_params=list(cex = 0.65)),
    padding = unit(c(3, 3), "mm"))

data(hsb)
hsb.df <- hsb

#Set "low" as the reference category for socioeconomic status (ses)
hsb.df$ses <- relevel(hsb.df$ses, ref = "low")
summary(hsb.df)
##        id            gender              race         ses    
##  Min.   :  1.00   female:109   african-amer: 20   low   :47  
##  1st Qu.: 50.75   male  : 91   asian       : 11   high  :58  
##  Median :100.50                hispanic    : 24   middle:95  
##  Mean   :100.50                white       :145              
##  3rd Qu.:150.25                                              
##  Max.   :200.00                                              
##      schtyp          prog          read           write      
##  private: 32   academic:105   Min.   :28.00   Min.   :31.00  
##  public :168   general : 45   1st Qu.:44.00   1st Qu.:45.75  
##                vocation: 50   Median :50.00   Median :54.00  
##                               Mean   :52.23   Mean   :52.77  
##                               3rd Qu.:60.00   3rd Qu.:60.00  
##                               Max.   :76.00   Max.   :67.00  
##       math          science          socst      
##  Min.   :33.00   Min.   :26.00   Min.   :26.00  
##  1st Qu.:45.00   1st Qu.:44.00   1st Qu.:46.00  
##  Median :52.00   Median :53.00   Median :52.00  
##  Mean   :52.65   Mean   :51.85   Mean   :52.41  
##  3rd Qu.:59.00   3rd Qu.:58.00   3rd Qu.:61.00  
##  Max.   :75.00   Max.   :74.00   Max.   :71.00

The summary of the dataset shows that observations are not equally distributed across the categories:

sProg <- hsb.df$prog
summary(sProg)
## academic  general vocation 
##      105       45       50
cutRead <- cut(hsb.df$read, 7)
cutWrite <- cut(hsb.df$write, 7)
cutMath <- cut(hsb.df$math, 7)
cutScience <- cut(hsb.df$science, 7)
cutSocst <- cut(hsb.df$socst, 7)

scoreRange <- c(20,30,40,50,60,70,80)

t1 <- tableGrob(cbind(Read = row.names(table(cutRead, hsb.df$prog)), table(cutRead, hsb.df$prog)), rows = NULL, theme = tabletheme)
t2 <- tableGrob(cbind(Write = row.names(table(cutWrite, hsb.df$prog)), table(cutWrite, hsb.df$prog)), rows = NULL, theme = tabletheme)
t3 <- tableGrob(cbind(Math = row.names(table(cutMath, hsb.df$prog)), table(cutMath, hsb.df$prog)), rows = NULL, theme = tabletheme)
t4 <- tableGrob(cbind(Science = row.names(table(cutScience, hsb.df$prog)), table(cutScience, hsb.df$prog)), rows = NULL, theme = tabletheme)
t5 <- tableGrob(cbind(`Social(Socst)` = row.names(table(cutSocst, hsb.df$prog)), table(cutSocst, hsb.df$prog)), rows = NULL, theme = tabletheme)

t6 <- tableGrob(cbind(Gender = row.names(table(hsb.df$gender,hsb.df$prog)), table(hsb.df$gender,hsb.df$prog)), rows = NULL, theme = tabletheme)
t7 <- tableGrob(cbind(Race = row.names(table(hsb.df$race,hsb.df$prog)), table(hsb.df$race,hsb.df$prog)), rows = NULL, theme = tabletheme)
t8 <- tableGrob(cbind(`Socioeconomic(ses)` = row.names(table(hsb.df$ses,hsb.df$prog)), table(hsb.df$ses,hsb.df$prog)), rows = NULL, theme = tabletheme)
t9 <- tableGrob(cbind(`School Type(schtyp)` = row.names(table(hsb.df$schtyp,hsb.df$prog)), table(hsb.df$schtyp,hsb.df$prog)), rows = NULL, theme = tabletheme)

grid.arrange(t1, t2, t4, t5, t3, ncol=2, nrow = 3)

I have cut the predictors read, write, math, science, and socst into seven levels each and used the approximate midpoints of the ranges to label the groups. We can see that as the score in each subject increases, participation in the academic program increases while participation in the general and vocation programs decreases.
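
The scoreRange labels (20 through 80) used for the plots below are only rough stand-ins; the actual bin midpoints can be recovered from the interval labels that cut produced (a minimal sketch; binBounds and mids are hypothetical helper names):

#Recover approximate midpoints of the seven read bins created by cut()
binBounds <- strsplit(gsub("[][()]", "", levels(cutRead)), ",")
mids <- sapply(binBounds, function(b) mean(as.numeric(b)))
round(mids, 1)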

grid.arrange(t6, t9, t7, t8, ncol=1, nrow = 4)

Gender does not reveal much information about program participation: females and males are distributed roughly equally among the programs. The schtyp table shows that public-school students participate more in the academic program than private-school students, but this largely reflects the number of observations, since most students in the sample attend public schools. The same caution applies to race: because this is observational data, the white category contains far more observations than the others. Socioeconomic status (ses) does give some insight: students from the high and middle socioeconomic classes have higher participation in the academic program than students from the low class.
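
Because the three ses classes have different sizes (low 47, middle 95, high 58), row proportions make this comparison fairer than raw counts; a minimal sketch:

#Within-class program shares: each ses row sums to 1
round(prop.table(table(hsb.df$ses, hsb.df$prog), 1), 2)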

par(mfrow=c(2, 3), lwd=2, font=2, font.lab=2, font.axis=2, oma=c(0,0,2,0))
matplot(scoreRange, prop.table(table(cutRead,hsb.df$prog),1),lty=c(1,2,5), type="l",ylab="Proportion",xlab="Read")
matplot(scoreRange, prop.table(table(cutWrite,hsb.df$prog),1),lty=c(1,2,5), type="l",ylab="Proportion",xlab="Write")
matplot(scoreRange, prop.table(table(cutMath,hsb.df$prog),1),lty=c(1,2,5), type="l",ylab="Proportion",xlab="Math")
matplot(scoreRange, prop.table(table(cutScience,hsb.df$prog),1),lty=c(1,2,5), type="l",ylab="Proportion",xlab="Science")
matplot(scoreRange, prop.table(table(cutSocst,hsb.df$prog),1),lty=c(1,2,5), type="l",ylab="Proportion",xlab="Social Studies")
plot(1, type = "n", axes=FALSE, xlab="", ylab="")
legend("bottom", inset=0.01, legend=colnames(prop.table(table(cutRead,sProg),1))[1:3], lty=c(1,2,5), col=c(1:3),bg= ("white"), horiz=F, box.lty=1)

The plots above suggest that as students receive higher scores in the subjects, they are more likely to choose the academic program. The change in preference happens approximately between scores of 40 and 50.

Question(a) Fit a trinomial response model with the other relevant variables as predictors (untransformed).

To build a multinomial model, I will use the multinom function from the nnet library. Since the response variable is trinomial, we set one of its categories as the baseline. We can check which category is the baseline using the contrasts function.

#set baseline category
hsb.df$prog <- relevel(hsb.df$prog, ref = "academic")

#check if category is set as baseline
contrasts(hsb.df$prog)
##          general vocation
## academic       0        0
## general        1        0
## vocation       0        1

The output of contrasts shows that academic has all zero entries, which means academic is the baseline: it will not appear in the model output, and the coefficients generated by multinom compare each other category to academic.

hsb.mm <- multinom(prog ~ gender + race + ses + schtyp + read + write + math + science + socst, data = hsb.df)
## # weights:  42 (26 variable)
## initial  value 219.722458 
## iter  10 value 173.352632
## iter  20 value 153.271418
## iter  30 value 152.935259
## iter  30 value 152.935257
## iter  30 value 152.935257
## final  value 152.935257 
## converged
summary(hsb.mm)
## Call:
## multinom(formula = prog ~ gender + race + ses + schtyp + read + 
##     write + math + science + socst, data = hsb.df)
## 
## Coefficients:
##          (Intercept)  gendermale  raceasian racehispanic racewhite
## general     4.730187 -0.09265779  1.3521578   -0.6323758 0.2962486
## vocation    7.529231 -0.32108944 -0.7009121   -0.1996227 0.3355866
##              seshigh  sesmiddle schtyppublic        read       write
## general  -1.09846294 -0.3955846    0.5845205 -0.04417864 -0.03627145
## vocation -0.04727141  1.1342286    2.0551041 -0.03480958 -0.03166060
##                math   science       socst
## general  -0.1092784 0.1019299 -0.01976958
## vocation -0.1139836 0.0522927 -0.08040055
## 
## Std. Errors:
##          (Intercept) gendermale raceasian racehispanic racewhite   seshigh
## general     1.715629  0.4548686  1.058737    0.8935398 0.7354735 0.6066647
## vocation    1.986980  0.5021110  1.470261    0.8393640 0.7480493 0.7045770
##          sesmiddle schtyppublic       read      write       math
## general  0.5346586    0.5642847 0.03103633 0.03381269 0.03522350
## vocation 0.5920155    0.8347685 0.03422389 0.03585703 0.03885101
##             science      socst
## general  0.03273958 0.02712550
## vocation 0.03424719 0.02938178
## 
## Residual Deviance: 305.8705 
## AIC: 357.8705

The model output shows the coefficients, standard errors, residual deviance, and AIC. There are two sets of coefficients: the first compares prog = general to the baseline prog = academic, and the second compares prog = vocation to the baseline prog = academic.

First equation,

\[ln\bigg(\frac{P(prog=general)}{P(prog=academic)}\bigg)=4.73 -0.09(gender=male) \\ +1.35(race=asian) -0.63(race=hispanic) + 0.30(race=white) \\ -1.10(ses=high) - 0.40(ses=middle) + 0.58(schtyp=public) \\ -0.04read - 0.04write -0.11math + 0.10science - 0.02socst\]

Second equation,

\[ln\bigg(\frac{P(prog=vocation)}{P(prog=academic)}\bigg)=7.53 -0.32(gender=male) \\ -0.70(race=asian) -0.20(race=hispanic) + 0.34(race=white) \\ -0.05(ses=high) + 1.13(ses=middle) + 2.06(schtyp=public) \\ -0.03read - 0.03write -0.11math + 0.05science - 0.08socst\]
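
Given the two linear predictors above (call them \(\eta_{general}\) and \(\eta_{vocation}\)), the three category probabilities follow from the multinomial-logit inverse link:

\[P(prog=academic)=\frac{1}{1+e^{\eta_{general}}+e^{\eta_{vocation}}},\quad P(prog=general)=\frac{e^{\eta_{general}}}{1+e^{\eta_{general}}+e^{\eta_{vocation}}},\quad P(prog=vocation)=\frac{e^{\eta_{vocation}}}{1+e^{\eta_{general}}+e^{\eta_{vocation}}}\]

These are the quantities the predict function returns in part (c).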

Question(b) Use backward elimination to reduce the model to one where all predictors are statistically significant. Give an interpretation of the resulting model.

I will use the step function to find the best reduced model. Backward elimination uses the AIC: at each step it compares the current model against the candidates obtained by removing one variable at a time, and the model with the lowest AIC is retained.

The step function identified the variables race, gender, write, and read as not statistically significant: removing any of them from the full model lowers the AIC. The full model's AIC is 357.8705; removing race reduces it to 351.5522. Similarly, in the first pass, removing gender instead would give an AIC of 354.2857, removing read would give 356.1305, and removing write would give 355.2524.
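
The first of these drops can be verified by hand before running step (a minimal sketch; hsb.norace is a hypothetical name, and trace = FALSE merely suppresses the fitting log):

#Refit without race and compare AIC against the full model
hsb.norace <- update(hsb.mm, . ~ . - race, trace = FALSE)
AIC(hsb.mm, hsb.norace)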

step(hsb.mm, direction = "backward")
## Start:  AIC=357.87
## prog ~ gender + race + ses + schtyp + read + write + math + science + 
##     socst
## 
## trying - gender 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 173.053191
## iter  20 value 153.384543
## final  value 153.142827 
## converged
## trying - race 
## # weights:  33 (20 variable)
## initial  value 219.722458 
## iter  10 value 174.298263
## iter  20 value 155.955741
## final  value 155.776076 
## converged
## trying - ses 
## # weights:  36 (22 variable)
## initial  value 219.722458 
## iter  10 value 173.415148
## iter  20 value 159.270550
## final  value 159.018173 
## converged
## trying - schtyp 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 180.237051
## iter  20 value 157.353858
## final  value 157.125206 
## converged
## trying - read 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 187.280604
## iter  20 value 154.113586
## final  value 154.065249 
## converged
## trying - write 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 176.544847
## iter  20 value 153.775216
## final  value 153.626207 
## converged
## trying - math 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 183.027567
## iter  20 value 160.082718
## final  value 159.929539 
## converged
## trying - science 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 185.604114
## iter  20 value 158.427914
## iter  30 value 158.243167
## iter  30 value 158.243167
## iter  30 value 158.243167
## final  value 158.243167 
## converged
## trying - socst 
## # weights:  39 (24 variable)
## initial  value 219.722458 
## iter  10 value 180.756032
## iter  20 value 157.315601
## final  value 157.146736 
## converged
##           Df      AIC
## - race    20 351.5522
## - gender  24 354.2857
## - write   24 355.2524
## - read    24 356.1305
## <none>    26 357.8705
## - ses     22 362.0363
## - schtyp  24 362.2504
## - socst   24 362.2935
## - science 24 364.4863
## - math    24 367.8591
## # weights:  33 (20 variable)
## initial  value 219.722458 
## iter  10 value 174.298263
## iter  20 value 155.955741
## final  value 155.776076 
## converged
## 
## Step:  AIC=351.55
## prog ~ gender + ses + schtyp + read + write + math + science + 
##     socst
## 
## trying - gender 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 174.064055
## iter  20 value 156.042483
## final  value 156.032828 
## converged
## trying - ses 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 174.614066
## iter  20 value 161.475590
## final  value 161.472216 
## converged
## trying - schtyp 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 181.583997
## iter  20 value 159.698526
## final  value 159.649518 
## converged
## trying - read 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 186.186083
## iter  20 value 157.051181
## final  value 156.905034 
## converged
## trying - write 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 177.314303
## iter  20 value 156.401745
## final  value 156.325078 
## converged
## trying - math 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 184.204849
## iter  20 value 162.683071
## final  value 162.533639 
## converged
## trying - science 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 186.697272
## iter  20 value 162.012056
## final  value 161.818793 
## converged
## trying - socst 
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 181.705379
## iter  20 value 159.690011
## final  value 159.589251 
## converged
##           Df      AIC
## - gender  18 348.0657
## - write   18 348.6502
## - read    18 349.8101
## <none>    20 351.5522
## - ses     16 354.9444
## - socst   18 355.1785
## - schtyp  18 355.2990
## - science 18 359.6376
## - math    18 361.0673
## # weights:  30 (18 variable)
## initial  value 219.722458 
## iter  10 value 174.064055
## iter  20 value 156.042483
## final  value 156.032828 
## converged
## 
## Step:  AIC=348.07
## prog ~ ses + schtyp + read + write + math + science + socst
## 
## trying - ses 
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 174.143433
## iter  20 value 161.751125
## iter  20 value 161.751124
## iter  20 value 161.751124
## final  value 161.751124 
## converged
## trying - schtyp 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 181.523639
## iter  20 value 159.901319
## final  value 159.901300 
## converged
## trying - read 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 185.082756
## iter  20 value 157.256538
## final  value 157.255365 
## converged
## trying - write 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 177.307341
## iter  20 value 156.408389
## final  value 156.406678 
## converged
## trying - math 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 184.045510
## iter  20 value 162.998503
## final  value 162.998232 
## converged
## trying - science 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 186.937918
## iter  20 value 162.117593
## final  value 162.117077 
## converged
## trying - socst 
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 181.713098
## iter  20 value 159.842303
## final  value 159.841915 
## converged
##           Df      AIC
## - write   16 344.8134
## - read    16 346.5107
## <none>    18 348.0657
## - ses     14 351.5022
## - socst   16 351.6838
## - schtyp  16 351.8026
## - science 16 356.2342
## - math    16 357.9965
## # weights:  27 (16 variable)
## initial  value 219.722458 
## iter  10 value 177.307341
## iter  20 value 156.408389
## final  value 156.406678 
## converged
## 
## Step:  AIC=344.81
## prog ~ ses + schtyp + read + math + science + socst
## 
## trying - ses 
## # weights:  21 (12 variable)
## initial  value 219.722458 
## iter  10 value 175.697433
## final  value 162.312774 
## converged
## trying - schtyp 
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 184.270328
## iter  20 value 160.624525
## final  value 160.624514 
## converged
## trying - read 
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 171.057596
## iter  20 value 157.775568
## final  value 157.775540 
## converged
## trying - math 
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 175.416572
## iter  20 value 164.774180
## final  value 164.774168 
## converged
## trying - science 
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 174.140609
## iter  20 value 162.188206
## final  value 162.188190 
## converged
## trying - socst 
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 171.471593
## iter  20 value 161.495964
## final  value 161.495961 
## converged
##           Df      AIC
## - read    14 343.5511
## <none>    16 344.8134
## - ses     12 348.6255
## - schtyp  14 349.2490
## - socst   14 350.9919
## - science 14 352.3764
## - math    14 357.5483
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 171.057596
## iter  20 value 157.775568
## final  value 157.775540 
## converged
## 
## Step:  AIC=343.55
## prog ~ ses + schtyp + math + science + socst
## 
## trying - ses 
## # weights:  18 (10 variable)
## initial  value 219.722458 
## iter  10 value 166.035430
## final  value 163.818866 
## converged
## trying - schtyp 
## # weights:  21 (12 variable)
## initial  value 219.722458 
## iter  10 value 171.569774
## final  value 162.019070 
## converged
## trying - math 
## # weights:  21 (12 variable)
## initial  value 219.722458 
## iter  10 value 170.382701
## final  value 169.805302 
## converged
## trying - science 
## # weights:  21 (12 variable)
## initial  value 219.722458 
## iter  10 value 165.342936
## final  value 162.495650 
## converged
## trying - socst 
## # weights:  21 (12 variable)
## initial  value 219.722458 
## iter  10 value 166.298973
## final  value 165.099779 
## converged
##           Df      AIC
## <none>    14 343.5511
## - ses     10 347.6377
## - schtyp  12 348.0381
## - science 12 348.9913
## - socst   12 354.1996
## - math    12 363.6106
## Call:
## multinom(formula = prog ~ ses + schtyp + math + science + socst, 
##     data = hsb.df)
## 
## Coefficients:
##          (Intercept)     seshigh  sesmiddle schtyppublic      math
## general     3.463436 -0.87607679 -0.1781425    0.6469675 -0.121237
## vocation    6.671789  0.01571742  1.2222511    1.9958920 -0.136975
##             science       socst
## general  0.08210394 -0.04441418
## vocation 0.03941548 -0.09363788
## 
## Residual Deviance: 315.5511 
## AIC: 343.5511

Examining the output of the step function, it suggests that the model built with the variables ses, schtyp, science, socst, and math yields the lowest AIC. In other words, these are the predictors that remain statistically significant in the model.

The last part of the step output gives the coefficients, residual deviance, and AIC of the best model selected by the AIC criterion.

#Reduced model
hsb.rmm <- multinom(formula = prog ~ ses + schtyp + math + science + socst, data = hsb.df)
## # weights:  24 (14 variable)
## initial  value 219.722458 
## iter  10 value 171.057596
## iter  20 value 157.775568
## final  value 157.775540 
## converged
#summary(hsb.rmm)
#z-values
z_val <- summary(hsb.rmm)$coefficients/summary(hsb.rmm)$standard.errors

z_val
##          (Intercept)    seshigh  sesmiddle schtyppublic      math  science
## general     2.261578 -1.5212603 -0.3550735     1.185771 -3.772784 2.945165
## vocation    3.694773  0.0234905  2.1796872     2.455075 -3.813516 1.375763
##              socst
## general  -1.894076
## vocation -3.619858

Above are the z-values (coefficient estimates divided by their standard errors) for the reduced model.

#p-values from a two-tailed z-test
p_val <- (1 - pnorm(abs(z_val), 0, 1)) * 2 

p_val
##           (Intercept)   seshigh  sesmiddle schtyppublic         math
## general  0.0237234617 0.1281945 0.72253450   0.23571289 0.0001614358
## vocation 0.0002200831 0.9812590 0.02928065   0.01408551 0.0001370039
##              science        socst
## general  0.003227821 0.0582148598
## vocation 0.168895015 0.0002947645
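
As a complementary check on the reduction, a likelihood-ratio test can compare the full and reduced models (a sketch, using the deviance and effective degrees of freedom stored on each multinom fit); here the deviance difference is 315.55 - 305.87 ≈ 9.68 on 26 - 14 = 12 degrees of freedom, a p-value well above 0.05, supporting the reduced model:

#Deviance difference is approximately chi-squared with df equal to the
#difference in parameter counts
pchisq(deviance(hsb.rmm) - deviance(hsb.mm),
       df = hsb.mm$edf - hsb.rmm$edf, lower.tail = FALSE)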

Interpretation of coefficients for the general program compared to the baseline academic program: holding the other predictors fixed, each additional math point lowers the log-odds of choosing general over academic by 0.121 (p ≈ 0.0002), each additional science point raises them by 0.082 (p ≈ 0.003), and each additional socst point lowers them by 0.044 (borderline, p ≈ 0.058). The ses and schtyp effects are not significant for this comparison.

Interpretation of coefficients for the vocation program compared to the baseline academic program: each additional math point lowers the log-odds of choosing vocation over academic by 0.137 (p ≈ 0.0001), and each additional socst point lowers them by 0.094 (p ≈ 0.0003). Students from the middle ses class have log-odds 1.222 higher than the low class (p ≈ 0.029), and attending a public school raises the log-odds by 1.996 (p ≈ 0.014).
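
Exponentiating the coefficients turns these log-odds effects into relative risk ratios, which some readers find easier to interpret (a minimal sketch):

#exp(beta): multiplicative change in the odds of the given program
#(relative to academic) per one-unit change in the predictor
exp(coef(hsb.rmm))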

Question(c) For the student with id 99, compute the predicted probabilities of the three possible choices.

The student in row 99 has ses low and below-average scores in math, science, and socst, so there is a good chance the model favors the vocation program. I will use the predict function to compute the probabilities.

Student details,

hsb.df[99,]
##    id gender     race ses schtyp     prog read write math science socst
## 99  1 female hispanic low public vocation   34    44   40      39    41
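
Note that hsb.df[99, ] selects the 99th row by position, and the printed id is 1, so row position and the id column do not coincide in this dataset. If the intent is the student whose id column equals 99, subsetting on id would be the safer selection (a sketch; the probabilities below are computed for the 99th row as selected above):

#Hypothetical alternative: select by the id column rather than row position
student_id99 <- subset(hsb.df, id == 99)
predict(hsb.rmm, newdata = student_id99, type = "probs")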

Predicted probabilities,

predict(hsb.rmm, newdata = hsb.df[99, ], type = "probs")
##  academic   general  vocation 
## 0.1877069 0.3566938 0.4555993
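
The class prediction is simply the category with the largest probability, here vocation, which matches this student's actual program:

#Default type = "class" returns the most probable category
predict(hsb.rmm, newdata = hsb.df[99, ])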
