Load the packages

library(tidymodels)
library(tidyverse)
library(ggformula)
library(GGally)
library(skimr)
library(discrim)
library(dplyr)

Import the datasets

pfi_2016 <- openxlsx::read.xlsx("pfi-data.xlsx", sheet = "curated 2016")
pfi_2019 <- openxlsx::read.xlsx("pfi-data.xlsx", sheet = "curated 2019")

Check for missing values in both datasets

pfi_2016 |>
  skim() |>
  dplyr::select(-numeric.hist)
Data summary
Name pfi_2016
Number of rows 13463
Number of columns 69
_______________________
Column type frequency:
character 1
numeric 68
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ALLGRADEX 0 1 1 2 0 13 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
BASMID 0 1 2.016111e+10 65733.15 20161000013 20161056265 20161112330 20161169876 20161226588
EDCPUB 0 1 1.110000e+00 0.32 1 1 1 1 2
SPUBCHOIX 0 1 1.940000e+00 0.77 1 1 2 3 3
SCONSIDR 0 1 1.690000e+00 0.46 1 1 2 2 2
SEENJOY 0 1 1.750000e+00 0.70 1 1 2 2 4
SEGRADES 0 1 2.050000e+00 1.31 1 1 2 2 5
SEABSNT 0 1 4.210000e+00 6.89 0 1 3 5 364
SEFUTUREX 0 1 4.960000e+00 1.22 1 4 5 6 6
SEGRADEQ 0 1 2.070000e+00 0.91 1 1 2 3 5
FSSPORTX 0 1 1.200000e+00 0.40 1 1 1 1 2
FSVOL 0 1 1.580000e+00 0.49 1 1 2 2 2
FSMTNG 0 1 1.140000e+00 0.35 1 1 1 1 2
FSPTMTNG 0 1 1.550000e+00 0.50 1 1 2 2 2
FSATCNFN 0 1 1.260000e+00 0.44 1 1 1 2 2
FSFUNDRS 0 1 1.390000e+00 0.49 1 1 1 2 2
FSCOMMTE 0 1 1.870000e+00 0.34 1 2 2 2 2
FSCOUNSLR 0 1 1.640000e+00 0.48 1 1 2 2 2
FSFREQ 0 1 7.930000e+00 9.00 0 3 5 10 99
FSNOTESX 0 1 1.370000e+00 0.48 1 1 1 2 2
FSMEMO 0 1 1.100000e+00 0.29 1 1 1 1 2
FCSCHOOL 0 1 1.490000e+00 0.70 1 1 1 2 4
FCTEACHR 0 1 1.500000e+00 0.68 1 1 1 2 4
FCSTDS 0 1 1.500000e+00 0.70 1 1 1 2 4
FCORDER 0 1 1.530000e+00 0.75 1 1 1 2 4
FCSUPPRT 0 1 1.620000e+00 0.77 1 1 1 2 4
FHHOME 0 1 3.220000e+00 1.01 1 3 3 4 6
FHWKHRS 0 1 5.530000e+00 5.43 -1 2 4 8 75
FHAMOUNT 0 1 1.250000e+00 0.85 -1 1 1 1 3
FHCAMT 0 1 1.270000e+00 0.76 -1 1 1 2 3
FHPLACE 0 1 1.050000e+00 0.62 -1 1 1 1 3
FHCHECKX 0 1 3.170000e+00 1.29 -1 3 4 4 4
FHHELP 0 1 2.330000e+00 1.50 -1 1 2 3 5
FOSTORY2X 0 1 1.430000e+00 0.50 1 1 1 2 2
FOCRAFTS 0 1 1.590000e+00 0.49 1 1 2 2 2
FOGAMES 0 1 1.520000e+00 0.50 1 1 2 2 2
FOBUILDX 0 1 1.430000e+00 0.50 1 1 1 2 2
FOSPORT 0 1 1.270000e+00 0.44 1 1 1 2 2
FORESPON 0 1 1.290000e+00 0.45 1 1 1 2 2
FOHISTX 0 1 1.460000e+00 0.50 1 1 1 2 2
FODINNERX 0 1 4.750000e+00 2.07 0 3 5 7 7
FOLIBRAYX 0 1 1.670000e+00 0.47 1 1 2 2 2
FOBOOKSTX 0 1 1.640000e+00 0.48 1 1 2 2 2
HDHEALTH 0 1 1.570000e+00 0.77 1 1 1 2 5
CDOBMM 0 1 6.530000e+00 3.41 1 4 7 9 12
CDOBYY 0 1 2.002950e+03 3.79 1995 2000 2003 2006 2012
CSEX 0 1 1.490000e+00 0.50 1 1 1 2 2
CSPEAKX 0 1 2.330000e+00 0.96 1 2 2 2 6
HHTOTALXX 0 1 4.030000e+00 1.22 2 3 4 5 10
RELATION 0 1 1.640000e+00 1.28 1 1 1 2 9
P1REL 0 1 1.290000e+00 0.97 1 1 1 1 6
P1SEX 0 1 1.700000e+00 0.46 1 1 2 2 2
P1MRSTA 0 1 1.780000e+00 1.34 1 1 1 3 5
P1EMPL 0 1 1.980000e+00 1.74 1 1 1 2 7
P1HRSWK 0 1 3.116000e+01 19.68 -1 16 40 40 80
P1MTHSWRK 0 1 9.370000e+00 4.59 0 9 12 12 12
P1AGE 0 1 4.488000e+01 8.72 15 39 45 50 90
P2GUARD 0 1 1.260000e+00 0.44 1 1 1 2 2
TTLHHINC 0 1 6.620000e+00 2.84 1 4 7 9 10
OWNRNTHB 0 1 1.260000e+00 0.48 1 1 1 1 3
DSBLTY 0 1 1.830000e+00 0.38 1 2 2 2 2
HHPARN16X 0 1 1.460000e+00 0.80 1 1 1 2 4
HHPARN16_BRD 0 1 1.260000e+00 0.44 1 1 1 2 2
NUMSIBSX 0 1 1.020000e+00 0.95 0 0 1 1 7
PARGRADEX 0 1 3.550000e+00 1.15 1 3 4 4 5
RACEETHN 0 1 1.890000e+00 1.13 1 1 1 3 4
INTACC 0 1 1.200000e+00 0.61 1 1 1 1 4
CENREG 0 1 2.520000e+00 1.04 1 2 2 3 4
ZIPLOCL 0 1 2.270000e+01 10.27 11 13 21 23 43

pfi_2019 |>
  skim() |>
  dplyr::select(-numeric.hist)
Data summary
Name pfi_2019
Number of rows 15500
Number of columns 75
_______________________
Column type frequency:
numeric 75
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
BASMID 0 1 2.019111e+10 65000.56 20191000012 2.019106e+10 20191113174 20191169241 20191225500
ALLGRADEX 0 1 9.610000e+00 3.86 2 6.750000e+00 10 13 15
EDCPUB 0 1 1.110000e+00 0.31 1 1.000000e+00 1 1 2
SCCHOICE 0 1 1.370000e+00 0.48 -1 1.000000e+00 1 2 2
SPUBCHOIX 0 1 1.900000e+00 0.78 -1 1.000000e+00 2 3 3
SCONSIDR 0 1 1.620000e+00 0.49 -1 1.000000e+00 2 2 2
SCHLHRSWK 0 1 3.770000e+00 0.60 -1 4.000000e+00 4 4 4
EINTNET 0 1 3.920000e+00 0.35 -1 4.000000e+00 4 4 4
MOSTIMPT 0 1 -5.200000e-01 2.22 -1 -1.000000e+00 -1 -1 14
INTNUM 0 1 -8.200000e-01 0.83 -1 -1.000000e+00 -1 -1 16
SEENJOY 0 1 1.770000e+00 0.70 -1 1.000000e+00 2 2 4
SEGRADES 0 1 2.000000e+00 1.33 -1 1.000000e+00 2 2 5
SEABSNT 0 1 1.230000e+00 0.55 -1 1.000000e+00 1 1 4
SEGRADEQ 0 1 2.040000e+00 0.91 -1 1.000000e+00 2 3 5
FSSPORTX 0 1 1.180000e+00 0.45 -1 1.000000e+00 1 1 2
FSVOL 0 1 1.560000e+00 0.54 -1 1.000000e+00 2 2 2
FSMTNG 0 1 1.130000e+00 0.40 -1 1.000000e+00 1 1 2
FSPTMTNG 0 1 1.520000e+00 0.55 -1 1.000000e+00 2 2 2
FSATCNFN 0 1 1.250000e+00 0.49 -1 1.000000e+00 1 2 2
FSFUNDRS 0 1 1.400000e+00 0.54 -1 1.000000e+00 1 2 2
FSCOMMTE 0 1 1.850000e+00 0.42 -1 2.000000e+00 2 2 2
FSCOUNSLR 0 1 1.620000e+00 0.53 -1 1.000000e+00 2 2 2
FSFREQ 0 1 6.950000e+00 9.23 -1 2.000000e+00 4 8 99
FSNOTESX 0 1 1.330000e+00 0.49 -1 1.000000e+00 1 2 2
FSMEMO 0 1 1.090000e+00 0.32 -1 1.000000e+00 1 1 2
FCSCHOOL 0 1 1.440000e+00 0.67 -1 1.000000e+00 1 2 4
FCTEACHR 0 1 1.460000e+00 0.67 -1 1.000000e+00 1 2 4
FCSTDS 0 1 1.460000e+00 0.68 -1 1.000000e+00 1 2 4
FCORDER 0 1 1.520000e+00 0.75 -1 1.000000e+00 1 2 4
FCSUPPRT 0 1 1.590000e+00 0.77 -1 1.000000e+00 1 2 4
FHHOME 0 1 3.160000e+00 1.11 -1 3.000000e+00 3 4 6
FHWKHRS 0 1 5.360000e+00 5.39 -1 2.000000e+00 4 7 75
FHAMOUNT 0 1 1.200000e+00 0.88 -1 1.000000e+00 1 1 3
FHCAMT 0 1 1.230000e+00 0.80 -1 1.000000e+00 1 2 3
FHPLACE 0 1 1.010000e+00 0.66 -1 1.000000e+00 1 1 3
FHCHECKX 0 1 3.080000e+00 1.38 -1 3.000000e+00 4 4 4
FHHELP 0 1 2.190000e+00 1.55 -1 1.000000e+00 2 3 5
FOSTORY2X 0 1 1.410000e+00 0.49 1 1.000000e+00 1 2 2
FOCRAFTS 0 1 1.560000e+00 0.50 1 1.000000e+00 2 2 2
FOGAMES 0 1 1.460000e+00 0.50 1 1.000000e+00 1 2 2
FOBUILDX 0 1 1.450000e+00 0.50 1 1.000000e+00 1 2 2
FOSPORT 0 1 1.320000e+00 0.47 1 1.000000e+00 1 2 2
FORESPON 0 1 1.300000e+00 0.46 1 1.000000e+00 1 2 2
FOHISTX 0 1 1.460000e+00 0.50 1 1.000000e+00 1 2 2
FODINNERX 0 1 4.780000e+00 2.05 0 3.000000e+00 5 7 7
FOLIBRAYX 0 1 1.680000e+00 0.47 1 1.000000e+00 2 2 2
FOBOOKSTX 0 1 1.670000e+00 0.47 1 1.000000e+00 2 2 2
HDHEALTH 0 1 1.580000e+00 0.77 1 1.000000e+00 1 2 5
CDOBMM 0 1 6.560000e+00 3.42 1 4.000000e+00 7 9 12
CDOBYY 0 1 2.006010e+03 3.82 1998 2.003000e+03 2006 2009 2015
CSEX 0 1 1.480000e+00 0.50 1 1.000000e+00 1 2 2
CSPEAKX 0 1 2.290000e+00 0.89 1 2.000000e+00 2 2 6
HHTOTALXX 0 1 4.050000e+00 1.23 2 3.000000e+00 4 5 10
RELATION 0 1 1.790000e+00 1.49 1 1.000000e+00 1 2 11
P1REL 0 1 1.290000e+00 0.97 1 1.000000e+00 1 1 6
P1SEX 0 1 1.660000e+00 0.47 1 1.000000e+00 2 2 2
P1MRSTA 0 1 1.820000e+00 1.37 1 1.000000e+00 1 3 5
P1EMPL 0 1 1.890000e+00 1.70 1 1.000000e+00 1 2 7
P1HRSWK 0 1 3.251000e+01 19.04 -1 2.300000e+01 40 43 80
P1MTHSWRK 0 1 9.680000e+00 4.38 0 1.000000e+01 12 12 12
P1AGE 0 1 4.480000e+01 8.70 15 3.900000e+01 44 50 90
P2GUARD 0 1 1.260000e+00 0.44 1 1.000000e+00 1 2 2
TTLHHINC 0 1 7.230000e+00 3.05 1 5.000000e+00 8 9 12
OWNRNTHB 0 1 1.260000e+00 0.47 1 1.000000e+00 1 1 3
CHLDNT 0 1 1.740000e+00 1.01 1 1.000000e+00 1 2 5
SEFUTUREX 0 1 4.920000e+00 1.20 1 4.000000e+00 5 6 6
DSBLTY 0 1 1.760000e+00 0.43 1 2.000000e+00 2 2 2
HHPARN19X 0 1 1.450000e+00 0.79 1 1.000000e+00 1 2 4
HHPARN19_BRD 0 1 1.260000e+00 0.44 1 1.000000e+00 1 2 2
NUMSIBSX 0 1 1.030000e+00 0.96 0 0.000000e+00 1 1 7
PARGRADEX 0 1 3.620000e+00 1.15 1 3.000000e+00 4 5 5
RACEETH 0 1 2.000000e+00 1.29 1 1.000000e+00 1 3 5
INTACC 0 1 1.140000e+00 0.53 1 1.000000e+00 1 1 4
CENREG 0 1 2.530000e+00 1.03 1 2.000000e+00 2 3 4
ZIPLOCL 0 1 2.281000e+01 10.31 11 1.300000e+01 21 31 43
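
Several columns (especially in the 2019 file) have a minimum of -1, which appears to be a reserved survey code (e.g., a valid skip) rather than a measured value. A minimal sketch for counting those codes per column before any recoding; the column name n_minus_one is just an illustrative label:

# Count how many -1 (reserved/skip) codes each numeric column contains
pfi_2019 |>
  summarise(across(where(is.numeric), ~ sum(.x == -1))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_minus_one") |>
  arrange(desc(n_minus_one))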

Check the relationship between the potential predictor variables and the response variable

pfi_2016 |>
  dplyr::select(SEGRADES, HHPARN16X, PARGRADEX, EDCPUB, INTACC, SEABSNT, SEFUTUREX, TTLHHINC) |>
  ggpairs()

pfi_2019 |>
  dplyr::select(SEGRADES, HHPARN19X, PARGRADEX, EDCPUB, INTACC, SEABSNT, SEFUTUREX, TTLHHINC) |>
  ggpairs()

Concatenate the two datasets

pfi_data <- rbind(pfi_2016 |>
                    mutate(HHPARNX = HHPARN16X) |>
                    dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC),
                  pfi_2019 |>
                    mutate(HHPARNX = HHPARN19X) |>
                    dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC)
                  )
pfi_data |>
  glimpse()
## Rows: 28,963
## Columns: 7
## $ SEGRADES  <dbl> 1, 2, 5, 1, 2, 1, 3, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, …
## $ HHPARNX   <dbl> 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 4, 2, 4, 2, 1, 1, 2, 3, 3, 1, …
## $ PARGRADEX <dbl> 4, 3, 4, 4, 3, 1, 3, 5, 2, 3, 3, 2, 2, 1, 4, 4, 3, 3, 2, 5, …
## $ EDCPUB    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
## $ INTACC    <dbl> 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, …
## $ SEABSNT   <dbl> 0, 0, 5, 1, 2, 3, 0, 4, 3, 8, 0, 4, 2, 2, 0, 8, 0, 6, 3, 10,…
## $ TTLHHINC  <dbl> 10, 6, 6, 9, 8, 4, 5, 9, 3, 7, 8, 3, 5, 2, 10, 8, 5, 8, 9, 9…
pfi_data |>
  dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC) |>
  ggpairs()

Encode all the variables and convert them to factors

pfi <- pfi_data |>
  mutate(GRADE = case_match(SEGRADES,
                    1 ~ "A",
                    2 ~ "B",
                    3 ~ "C",
                    4 ~ "D",
                    c(5,-1) ~ "Other"),
         PARENT = case_match(HHPARNX,
                             1 ~ "Both",
                             2 ~ "Mother",
                             3 ~ "Father",
                             4 ~ "None"),
         HAS_POST_SEC = case_match(PARGRADEX,
                                 c(1,2) ~ "No",
                                 c(3,4,5) ~ "Yes"),
         PUBLIC_SCHOOL = case_match(EDCPUB,
                                    1 ~ "Yes",
                                    2 ~ "No"),
         INTERNET_ACCESS = case_match(INTACC,
                                     c(1,2,3) ~ "Yes",
                                     4 ~ "No"),
         NO_OF_ABSENT = case_match(SEABSNT,
                                   c(1,2) ~ "0-10 days",
                                   c(3,4) ~ "11-20 days",
                                   .default = "Other"),
         INCOME_CAT = case_match(TTLHHINC,
                                 c(1:8) ~ "$0-100K",
                                 c(9:10) ~ "$100-200K",
                                 c(11:12) ~ "$200K or more"),
  GRADE = factor(GRADE, levels = c("Other", "D", "C", "B", "A")),
  PARENT = factor(PARENT, levels = c("None", "Father", "Mother", "Both")),
  HAS_POST_SEC = factor(HAS_POST_SEC, levels = c("No", "Yes")),
  PUBLIC_SCHOOL = factor(PUBLIC_SCHOOL, levels = c("No", "Yes")),
  INTERNET_ACCESS = factor(INTERNET_ACCESS, levels = c("No", "Yes")),
  NO_OF_ABSENT = factor(NO_OF_ABSENT, levels = c("0-10 days","11-20 days","Other")),
  INCOME_CAT = factor(INCOME_CAT, levels = c("$0-100K", "$100-200K", "$200K or more"))
  ) |>
  dplyr::select(GRADE, PARENT, HAS_POST_SEC, PUBLIC_SCHOOL, INTERNET_ACCESS, NO_OF_ABSENT, INCOME_CAT)
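
As a quick sanity check on the recoding, summary() reports the counts per factor level and flags any NA values produced by codes that did not match a case_match() rule (a minimal sketch):

# Verify the recoded factor levels and look for unintended NAs
pfi |>
  summary()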

Let’s check the relationship between GRADE and the variables PUBLIC_SCHOOL and INCOME_CAT

pfi |>
  ggbivariate("GRADE", c("PUBLIC_SCHOOL", "INCOME_CAT"), title = "Student Grade outcome by School and Parents Income Category")

Train and test split

set.seed(631)

# Split the dataset into training (75%) and testing (25%)
pfi_split <- initial_split(pfi, prop = 0.75, strata = GRADE)
pfi_train <- training(pfi_split)
pfi_test <- testing(pfi_split)
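
Because the split is stratified on GRADE, the class proportions should be nearly identical in the training and testing sets; a quick sketch to confirm:

# Compare GRADE proportions between the training and testing sets
pfi_train |>
  count(GRADE) |>
  mutate(prop = n / sum(n))

pfi_test |>
  count(GRADE) |>
  mutate(prop = n / sum(n))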

Define k-fold cross-validation with 5 folds

pfi.folds <- vfold_cv(pfi_train, v=5, strata = GRADE)
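
Each resample holds out roughly one fifth of the training data for assessment; the resample object and the fold sizes can be inspected directly (a brief sketch):

pfi.folds

# Analysis/assessment dimensions of the first fold
dim(analysis(pfi.folds$splits[[1]]))
dim(assessment(pfi.folds$splits[[1]]))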

Multinomial Regression

multi_spec <- multinom_reg() |>
  set_engine("nnet")

multi_wf <- workflow() |>
  add_model(multi_spec) |>
  add_formula(GRADE ~ .)

fit_multi <- multi_wf |>
  fit_resamples(pfi.folds, control = control_resamples(save_pred = TRUE, save_workflow = TRUE, extract = extract_fit_engine))
multi_metrics_train <- collect_metrics(fit_multi) |>
  mutate(Model = "Multinomial Regression", Data = "Train")

multi_metrics_train

Confusion matrix for the multinomial model on the cross-validation predictions

cv_predictions_multi <- collect_predictions(fit_multi)

# Confusion matrix
conf_mat_multi <- cv_predictions_multi %>% 
  count(GRADE, .pred_class, .drop = FALSE)

conf_mat_multi %>% 
  pivot_wider(
    names_from = .pred_class,
    values_from = n
  )
conf_mat_multi %>% 
  ggplot(aes(x = GRADE, y = n, fill = .pred_class)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Predicted vs Actual GRADE"
    ) +
  theme_minimal() + 
  theme(legend.position = "top")

Fit the model on the full training data

multi_model <- multinom_reg() |>
  set_engine("nnet") |>
  fit(GRADE ~ ., data = pfi_train)

tidy(multi_model) %>% 
  print(n = Inf) # This will display all rows of the tibble
## # A tibble: 44 × 6
##    y.level term                    estimate std.error statistic  p.value
##    <chr>   <chr>                      <dbl>     <dbl>     <dbl>    <dbl>
##  1 D       (Intercept)              -1.36      0.501     -2.72  6.59e- 3
##  2 D       PARENTFather             -0.388     0.249     -1.56  1.20e- 1
##  3 D       PARENTMother             -0.831     0.198     -4.19  2.77e- 5
##  4 D       PARENTBoth               -1.39      0.191     -7.27  3.60e-13
##  5 D       HAS_POST_SECYes          -0.664     0.123     -5.38  7.46e- 8
##  6 D       PUBLIC_SCHOOLYes          1.40      0.345      4.06  4.96e- 5
##  7 D       INTERNET_ACCESSYes       -0.342     0.348     -0.984 3.25e- 1
##  8 D       NO_OF_ABSENT11-20 days    0.814     0.145      5.60  2.15e- 8
##  9 D       NO_OF_ABSENTOther         0.813     0.122      6.64  3.14e-11
## 10 D       INCOME_CAT$100-200K      -1.06      0.171     -6.23  4.65e-10
## 11 D       INCOME_CAT$200K or more  -0.916     0.374     -2.45  1.43e- 2
## 12 C       (Intercept)               0.822     0.299      2.75  5.91e- 3
## 13 C       PARENTFather             -0.176     0.176     -1.00  3.17e- 1
## 14 C       PARENTMother             -0.583     0.143     -4.08  4.55e- 5
## 15 C       PARENTBoth               -0.998     0.136     -7.33  2.28e-13
## 16 C       HAS_POST_SECYes          -0.518     0.0795    -6.52  7.22e-11
## 17 C       PUBLIC_SCHOOLYes          0.287     0.111      2.59  9.69e- 3
## 18 C       INTERNET_ACCESSYes       -0.110     0.261     -0.423 6.72e- 1
## 19 C       NO_OF_ABSENT11-20 days    0.401     0.0871     4.60  4.18e- 6
## 20 C       NO_OF_ABSENTOther         0.359     0.0723     4.97  6.85e- 7
## 21 C       INCOME_CAT$100-200K      -0.669     0.0774    -8.64  5.50e-18
## 22 C       INCOME_CAT$200K or more  -1.23      0.182     -6.74  1.63e-11
## 23 B       (Intercept)               1.85      0.255      7.27  3.58e-13
## 24 B       PARENTFather              0.115     0.158      0.725 4.68e- 1
## 25 B       PARENTMother             -0.192     0.129     -1.49  1.37e- 1
## 26 B       PARENTBoth               -0.431     0.123     -3.50  4.63e- 4
## 27 B       HAS_POST_SECYes          -0.376     0.0675    -5.57  2.54e- 8
## 28 B       PUBLIC_SCHOOLYes          0.0370    0.0760     0.487 6.26e- 1
## 29 B       INTERNET_ACCESSYes       -0.315     0.225     -1.40  1.62e- 1
## 30 B       NO_OF_ABSENT11-20 days    0.0911    0.0705     1.29  1.96e- 1
## 31 B       NO_OF_ABSENTOther         0.0657    0.0580     1.13  2.58e- 1
## 32 B       INCOME_CAT$100-200K      -0.261     0.0548    -4.76  1.97e- 6
## 33 B       INCOME_CAT$200K or more  -0.528     0.0997    -5.30  1.16e- 7
## 34 A       (Intercept)               1.10      0.261      4.22  2.41e- 5
## 35 A       PARENTFather              0.462     0.160      2.89  3.84e- 3
## 36 A       PARENTMother              0.162     0.132      1.23  2.20e- 1
## 37 A       PARENTBoth                0.272     0.126      2.15  3.12e- 2
## 38 A       HAS_POST_SECYes           0.109     0.0673     1.62  1.06e- 1
## 39 A       PUBLIC_SCHOOLYes         -0.152     0.0686    -2.21  2.68e- 2
## 40 A       INTERNET_ACCESSYes        0.0324    0.232      0.140 8.89e- 1
## 41 A       NO_OF_ABSENT11-20 days   -0.159     0.0674    -2.36  1.84e- 2
## 42 A       NO_OF_ABSENTOther        -0.0751    0.0550    -1.37  1.72e- 1
## 43 A       INCOME_CAT$100-200K       0.0672    0.0504     1.33  1.83e- 1
## 44 A       INCOME_CAT$200K or more   0.0836    0.0853     0.979 3.27e- 1

The log-odds of a student receiving an “A” grade versus the reference category “Other”.

tidy(multi_model) %>%
  filter(y.level=='A') %>%
  dplyr::select(estimate,term)

\[
\begin{aligned}
\log\left(\frac{\hat{p}_{\texttt{"A"}}}{\hat{p}_{\texttt{"Other"}}}\right) = & \ 1.1039 \\
& + 0.46164431 \, \text{PARENTFather} \\
& + 0.16219859 \, \text{PARENTMother} \\
& + 0.27154298 \, \text{PARENTBoth} \\
& + 0.10870736 \, \text{HAS\_POST\_SECYes} \\
& - 0.15197641 \, \text{PUBLIC\_SCHOOLYes} \\
& + 0.03235633 \, \text{INTERNET\_ACCESSYes} \\
& - 0.15881349 \, \text{NO\_OF\_ABSENT11-20 days} \\
& - 0.07506882 \, \text{NO\_OF\_ABSENTOther} \\
& + 0.06715767 \, \text{INCOME\_CAT\$100-200K} \\
& + 0.08357023 \, \text{INCOME\_CAT\$200K or more}
\end{aligned}
\]
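
Exponentiating a coefficient converts the log-odds change into an odds ratio relative to the reference category “Other”; for example, exp(0.462) ≈ 1.59 for PARENTFather. A minimal sketch:

# Odds ratios for the "A" vs "Other" comparison
tidy(multi_model) |>
  filter(y.level == "A") |>
  mutate(odds_ratio = exp(estimate)) |>
  dplyr::select(term, estimate, odds_ratio)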

Predict using the test data

pfi_multi_aug <- augment(multi_model, new_data = pfi_test) 
pfi_multi_aug

Check accuracy and ROC_AUC

multi_test_metrics <- rbind(
pfi_multi_aug |>
  metrics(truth = GRADE, estimate = .pred_class) |>
  dplyr::filter(.metric == "accuracy") |>
  mutate(Model = "Multinomial Regression", Data = "Test"),
pfi_multi_aug |>
  roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
  mutate(Model = "Multinomial Regression", Data = "Test")
)

multi_test_metrics
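
Beyond the aggregate metrics, a confusion matrix on the held-out test set shows where the predictions go wrong class by class; a minimal sketch using yardstick (the same call works for the LDA predictions below):

pfi_multi_aug |>
  conf_mat(truth = GRADE, estimate = .pred_class)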

Linear Discriminant Analysis

lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

lda_wf <- workflow() |>
  add_model(lda_spec) |>
  add_formula(GRADE ~ .)

fit_lda <- lda_wf |>
  fit_resamples(pfi.folds, control = control_resamples(save_pred = TRUE, save_workflow = TRUE, extract = extract_fit_engine))

lda_metrics_train <- collect_metrics(fit_lda) |>
  mutate(Model = "LDA", Data = "Train")

lda_metrics_train

Confusion matrix for the LDA model on the cross-validation predictions

cv_predictions_lda <- collect_predictions(fit_lda)

# Confusion matrix
conf_mat_lda <- cv_predictions_lda %>% 
  count(GRADE, .pred_class, .drop = FALSE)

conf_mat_lda %>% 
  pivot_wider(
    names_from = .pred_class,
    values_from = n
  )
conf_mat_lda %>% 
  ggplot(aes(x = GRADE, y = n, fill = .pred_class)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Predicted vs Actual GRADE"
    ) +
  theme_minimal() + 
  theme(legend.position = "top")

Fit the model on the full training data

lda_model <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS") |>
  fit(GRADE ~ ., data = pfi_train)

lda_model
## parsnip model object
## 
## Call:
## lda(GRADE ~ ., data = data)
## 
## Prior probabilities of groups:
##      Other          D          C          B          A 
## 0.12374551 0.01928920 0.09128994 0.28956818 0.47610717 
## 
## Group means:
##       PARENTFather PARENTMother PARENTBoth HAS_POST_SECYes PUBLIC_SCHOOLYes
## Other   0.04278274    0.1856399  0.7377232       0.8638393        0.8876488
## D       0.10023866    0.3054893  0.4582339       0.6563246        0.9785203
## C       0.09026727    0.2793747  0.5350479       0.7196167        0.9329299
## B       0.07249603    0.2383148  0.6343402       0.7817170        0.9044515
## A       0.05134403    0.1633146  0.7602011       0.8826146        0.8680139
##       INTERNET_ACCESSYes NO_OF_ABSENT11-20 days NO_OF_ABSENTOther
## Other          0.9906994              0.1283482         0.2139137
## D              0.9665871              0.2052506         0.3556086
## C              0.9793243              0.1704488         0.2813918
## B              0.9815580              0.1400636         0.2311606
## A              0.9919745              0.1128408         0.2035390
##       INCOME_CAT$100-200K INCOME_CAT$200K or more
## Other           0.3459821              0.08072917
## D               0.1097852              0.01909308
## C               0.1790217              0.01966717
## B               0.2662957              0.04546900
## A               0.3684974              0.09205183
## 
## Coefficients of linear discriminants:
##                                LD1        LD2          LD3        LD4
## PARENTFather            -1.0088962  2.7053003  2.184326680 -1.6417640
## PARENTMother            -1.1936981  2.6679281 -1.437210029 -1.1835795
## PARENTBoth              -1.9371515  2.2149478 -1.001298286 -1.1784287
## HAS_POST_SECYes         -1.0089877 -0.6518897 -0.004238791  1.0228292
## PUBLIC_SCHOOLYes         0.5019837 -0.7976498 -1.503719481 -0.7528578
## INTERNET_ACCESSYes      -0.4456721 -1.0044516  0.608645609  6.5555885
## NO_OF_ABSENT11-20 days   0.7352946 -1.1543041 -0.702487947  0.1486981
## NO_OF_ABSENTOther        0.5772406 -1.5110239  0.205484055 -0.3271538
## INCOME_CAT$100-200K     -0.8225327 -0.1387578 -0.143245660 -0.7863718
## INCOME_CAT$200K or more -1.1055775 -2.2070822  0.521825832 -1.9155322
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4 
## 0.9650 0.0259 0.0061 0.0029
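
The proportion of trace indicates that LD1 carries about 96.5% of the between-class separation. One way to visualize this is to project the training data onto the first discriminant; a minimal sketch, assuming the underlying MASS fit can be pulled out with extract_fit_engine() (lda_engine and lda_scores are illustrative names):

# Project the training data onto the linear discriminants
lda_engine <- extract_fit_engine(lda_model)
lda_scores <- predict(lda_engine, newdata = pfi_train)

tibble(GRADE = pfi_train$GRADE, LD1 = lda_scores$x[, 1]) |>
  ggplot(aes(x = GRADE, y = LD1)) +
  geom_boxplot() +
  theme_minimal()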

Predict using the test data

pfi_lda_aug <- augment(lda_model, new_data = pfi_test) 
pfi_lda_aug

Check accuracy and ROC_AUC

lda_test_metrics <- rbind(
pfi_lda_aug |>
  metrics(truth = GRADE, estimate = .pred_class) |>
  dplyr::filter(.metric == "accuracy") |>
  mutate(Model = "LDA", Data = "Test"),
pfi_lda_aug |>
  roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
  mutate(Model = "LDA", Data = "Test")
)

lda_test_metrics

Model Evaluation & Comparison

Comparison of model performance on the training data

train_plot <- rbind(multi_metrics_train, lda_metrics_train) |>
  dplyr::filter(.metric %in% c("accuracy", "roc_auc")) |>
  ggplot(aes(x = Model, y = mean, fill = .metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Model Performance Comparison on Train Data (Multinomial vs LDA)", 
       y = "Value", 
       x = "Model", 
       fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
  theme_minimal()

train_plot

Comparison of model performance on the test data

test_plot <- rbind(multi_test_metrics, lda_test_metrics) |>
  ggplot(aes(x = Model, y = .estimate, fill = .metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Model Performance Comparison on Test Data (Multinomial vs LDA)", 
       y = "Value", 
       x = "Model", 
       fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
  theme_minimal()

test_plot

# Combine train and test metrics
all_metrics <- bind_rows(multi_metrics_train |>
                           mutate(.estimate = mean) |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         lda_metrics_train  |>
                           mutate(.estimate = mean) |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         multi_test_metrics |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         lda_test_metrics |>
                           dplyr::select(.metric, .estimate, Model, Data)) |>
  filter(.metric %in% c("accuracy", "roc_auc")) 

ggplot(all_metrics, aes(x = Model, y = .estimate, fill = .metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(. ~ Data) +
  labs(title = "Model Performance Comparison (Train vs Test)", 
       y = "Metric Value", x = "Model", fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
  theme_minimal()
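
The same comparison can be read as a table rather than a plot; a minimal sketch reshaping all_metrics:

# One row per model/data combination, one column per metric
all_metrics |>
  pivot_wider(names_from = .metric, values_from = .estimate) |>
  arrange(Data, Model)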