BANA7042_Assignment5_naikvr

Multinomial & Ordinal Regression

library(faraway)
library(nnet)
library(MASS)
library(dplyr)

Attaching package: 'dplyr'
The following object is masked from 'package:MASS':

    select
The following objects are masked from 'package:stats':

    filter, lag
The following objects are masked from 'package:base':

    intersect, setdiff, setequal, union
library(ggplot2)

data(hsb)
str(hsb)
'data.frame':   200 obs. of  11 variables:
 $ id     : int  70 121 86 141 172 113 50 11 84 48 ...
 $ gender : Factor w/ 2 levels "female","male": 2 1 2 2 2 2 2 2 2 2 ...
 $ race   : Factor w/ 4 levels "african-amer",..: 4 4 4 4 4 4 1 3 4 1 ...
 $ ses    : Factor w/ 3 levels "high","low","middle": 2 3 1 1 3 3 3 3 3 3 ...
 $ schtyp : Factor w/ 2 levels "private","public": 2 2 2 2 2 2 2 2 2 2 ...
 $ prog   : Factor w/ 3 levels "academic","general",..: 2 3 2 3 1 1 2 1 2 1 ...
 $ read   : int  57 68 44 63 47 44 50 34 63 57 ...
 $ write  : int  52 59 33 44 52 52 59 46 57 55 ...
 $ math   : int  41 53 54 47 57 51 42 45 54 52 ...
 $ science: int  47 63 58 53 53 63 53 39 58 50 ...
 $ socst  : int  57 61 31 56 61 61 61 36 51 51 ...
# Question 1.A
prop.table(table(hsb$gender, hsb$prog), 1)
        
          academic   general  vocation
  female 0.5321101 0.2201835 0.2477064
  male   0.5164835 0.2307692 0.2527473
prop.table(table(hsb$ses, hsb$prog), 1)
        
          academic   general  vocation
  high   0.7241379 0.1551724 0.1206897
  low    0.4042553 0.3404255 0.2553191
  middle 0.4631579 0.2105263 0.3263158
# Question 1.B

ggplot(hsb, aes(read, fill = prog)) +
  geom_density(alpha = 0.4)

ggplot(hsb, aes(math, fill = prog)) +
  geom_density(alpha = 0.4)

# Question 1.C

subjects <- hsb[, c("read","write","math","science","socst")]
cor(subjects)
             read     write      math   science     socst
read    1.0000000 0.5967765 0.6622801 0.6301579 0.6214843
write   0.5967765 1.0000000 0.6174493 0.5704416 0.6047932
math    0.6622801 0.6174493 1.0000000 0.6307332 0.5444803
science 0.6301579 0.5704416 0.6307332 1.0000000 0.4651060
socst   0.6214843 0.6047932 0.5444803 0.4651060 1.0000000
# Question 1.D

hsb$prog <- relevel(hsb$prog, ref = "general")

m_full <- multinom(prog ~ read + write + math + science + socst, data = hsb)
# weights:  21 (12 variable)
initial  value 219.722458 
iter  10 value 183.838126
iter  20 value 164.975570
final  value 164.975567 
converged
summary(m_full)
Call:
multinom(formula = prog ~ read + write + math + science + socst, 
    data = hsb)

Coefficients:
         (Intercept)        read       write        math     science
academic   -4.804712 0.045583547  0.02299571  0.09837651 -0.09359544
vocation    4.601228 0.009746056 -0.01443702 -0.01742029 -0.03457200
               socst
academic  0.03207963
vocation -0.03543572

Std. Errors:
         (Intercept)       read      write       math    science      socst
academic    1.498370 0.02958171 0.02958053 0.03329577 0.02985787 0.02567747
vocation    1.589176 0.03223947 0.03010542 0.03585979 0.02993956 0.02548155

Residual Deviance: 329.9511 
AIC: 353.9511 
# Question 1.E

hsb$total <- rowSums(subjects)

m_sum <- multinom(prog ~ total, data = hsb)
# weights:  9 (4 variable)
initial  value 219.722458 
iter  10 value 176.619134
final  value 176.619114 
converged
AIC(m_full, m_sum)
       df      AIC
m_full 12 353.9511
m_sum   4 361.2382
# Question 1.F

m_step <- step(m_full)
Start:  AIC=353.95
prog ~ read + write + math + science + socst

trying - read 
# weights:  18 (10 variable)
initial  value 219.722458 
iter  10 value 173.569762
final  value 166.302302 
converged
trying - write 
# weights:  18 (10 variable)
initial  value 219.722458 
iter  10 value 172.475666
final  value 165.801985 
converged
trying - math 
# weights:  18 (10 variable)
initial  value 219.722458 
iter  10 value 176.746544
final  value 172.243618 
converged
trying - science 
# weights:  18 (10 variable)
initial  value 219.722458 
iter  10 value 183.815852
final  value 170.511980 
converged
trying - socst 
# weights:  18 (10 variable)
initial  value 219.722458 
iter  10 value 170.569129
final  value 168.411657 
converged
          Df      AIC
- write   10 351.6040
- read    10 352.6046
<none>    12 353.9511
- socst   10 356.8233
- science 10 361.0240
- math    10 364.4872
# weights:  18 (10 variable)
initial  value 219.722458 
iter  10 value 172.475666
final  value 165.801985 
converged

Step:  AIC=351.6
prog ~ read + math + science + socst

trying - read 
# weights:  15 (8 variable)
initial  value 219.722458 
iter  10 value 167.367707
final  value 167.337045 
converged
trying - math 
# weights:  15 (8 variable)
initial  value 219.722458 
iter  10 value 174.812772
final  value 174.789964 
converged
trying - science 
# weights:  15 (8 variable)
initial  value 219.722458 
iter  10 value 171.013834
final  value 170.980400 
converged
trying - socst 
# weights:  15 (8 variable)
initial  value 219.722458 
iter  10 value 170.888131
final  value 170.868694 
converged
          Df      AIC
- read     8 350.6741
<none>    10 351.6040
- socst    8 357.7374
- science  8 357.9608
- math     8 365.5799
# weights:  15 (8 variable)
initial  value 219.722458 
iter  10 value 167.367707
final  value 167.337045 
converged

Step:  AIC=350.67
prog ~ math + science + socst

trying - math 
# weights:  12 (6 variable)
initial  value 219.722458 
iter  10 value 180.004791
final  value 180.004124 
converged
trying - science 
# weights:  12 (6 variable)
initial  value 219.722458 
iter  10 value 171.369370
final  value 171.367391 
converged
trying - socst 
# weights:  12 (6 variable)
initial  value 219.722458 
iter  10 value 175.075743
final  value 175.074336 
converged
          Df      AIC
<none>     8 350.6741
- science  6 354.7348
- socst    6 362.1487
- math     6 372.0082
summary(m_step)
Call:
multinom(formula = prog ~ math + science + socst, data = hsb)

Coefficients:
         (Intercept)        math     science       socst
academic   -4.360615  0.11934098 -0.07432424  0.05172746
vocation    4.494800 -0.01859258 -0.03508102 -0.03655492

Std. Errors:
         (Intercept)       math    science      socst
academic    1.438341 0.03142578 0.02712094 0.02293584
vocation    1.525430 0.03457054 0.02669371 0.02283319

Residual Deviance: 334.6741 
AIC: 350.6741 
# Question 1.G

math_seq <- seq(min(hsb$math), max(hsb$math), length = 100)

newdata <- data.frame(
  read = mean(hsb$read),
  write = mean(hsb$write),
  math = math_seq,
  science = mean(hsb$science),
  socst = mean(hsb$socst)
)

pred <- predict(m_step, newdata, type="probs")
plotdata <- cbind(newdata, pred)


matplot(math_seq, pred, type="l", lty=1,
        col=1:3, xlab="Math", ylab="Probability")
legend("topright", legend=colnames(pred), col=1:3, lty=1)

# Question 1.H

grid <- expand.grid(
  ses = levels(hsb$ses),
  schtyp = levels(hsb$schtyp)
)

grid$read = mean(hsb$read)
grid$write = mean(hsb$write)
grid$math = mean(hsb$math)
grid$science = mean(hsb$science)
grid$socst = mean(hsb$socst)

predict(m_step, grid, type="probs")
    general academic  vocation
1 0.2510505 0.547198 0.2017514
2 0.2510505 0.547198 0.2017514
3 0.2510505 0.547198 0.2017514
4 0.2510505 0.547198 0.2017514
5 0.2510505 0.547198 0.2017514
6 0.2510505 0.547198 0.2017514
# Question 1.I
student99 <- hsb[99,]
predict(m_step, student99, type="class")
[1] vocation
Levels: general academic vocation
student99$prog
[1] vocation
Levels: general academic vocation
# Question 1.J

pred_class <- predict(m_step, type="class")

table(pred_class, hsb$prog)
          
pred_class general academic vocation
  general        3        3        4
  academic      28       91       17
  vocation      14       11       29
mean(pred_class == hsb$prog)
[1] 0.615