Load the packages

library(tidymodels)
library(tidyverse)
library(ggformula)
library(GGally)
library(skimr)
library(discrim)
library(dplyr)

Import the datasets

pfi_2016 <- openxlsx::read.xlsx("pfi-data.xlsx", sheet = "curated 2016")
pfi_2019 <- openxlsx::read.xlsx("pfi-data.xlsx", sheet = "curated 2019")

Check for missing values in both datasets

pfi_2016 |>
  skim() |>
  dplyr::select(-numeric.hist)
Data summary
Name pfi_2016
Number of rows 13463
Number of columns 69
_______________________
Column type frequency:
character 1
numeric 68
________________________
Group variables None

Variable type: character

skim_variable n_missing complete_rate min max empty n_unique whitespace
ALLGRADEX 0 1 1 2 0 13 0

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
BASMID 0 1 2.016111e+10 65733.15 20161000013 20161056265 20161112330 20161169876 20161226588
EDCPUB 0 1 1.110000e+00 0.32 1 1 1 1 2
SPUBCHOIX 0 1 1.940000e+00 0.77 1 1 2 3 3
SCONSIDR 0 1 1.690000e+00 0.46 1 1 2 2 2
SEENJOY 0 1 1.750000e+00 0.70 1 1 2 2 4
SEGRADES 0 1 2.050000e+00 1.31 1 1 2 2 5
SEABSNT 0 1 4.210000e+00 6.89 0 1 3 5 364
SEFUTUREX 0 1 4.960000e+00 1.22 1 4 5 6 6
SEGRADEQ 0 1 2.070000e+00 0.91 1 1 2 3 5
FSSPORTX 0 1 1.200000e+00 0.40 1 1 1 1 2
FSVOL 0 1 1.580000e+00 0.49 1 1 2 2 2
FSMTNG 0 1 1.140000e+00 0.35 1 1 1 1 2
FSPTMTNG 0 1 1.550000e+00 0.50 1 1 2 2 2
FSATCNFN 0 1 1.260000e+00 0.44 1 1 1 2 2
FSFUNDRS 0 1 1.390000e+00 0.49 1 1 1 2 2
FSCOMMTE 0 1 1.870000e+00 0.34 1 2 2 2 2
FSCOUNSLR 0 1 1.640000e+00 0.48 1 1 2 2 2
FSFREQ 0 1 7.930000e+00 9.00 0 3 5 10 99
FSNOTESX 0 1 1.370000e+00 0.48 1 1 1 2 2
FSMEMO 0 1 1.100000e+00 0.29 1 1 1 1 2
FCSCHOOL 0 1 1.490000e+00 0.70 1 1 1 2 4
FCTEACHR 0 1 1.500000e+00 0.68 1 1 1 2 4
FCSTDS 0 1 1.500000e+00 0.70 1 1 1 2 4
FCORDER 0 1 1.530000e+00 0.75 1 1 1 2 4
FCSUPPRT 0 1 1.620000e+00 0.77 1 1 1 2 4
FHHOME 0 1 3.220000e+00 1.01 1 3 3 4 6
FHWKHRS 0 1 5.530000e+00 5.43 -1 2 4 8 75
FHAMOUNT 0 1 1.250000e+00 0.85 -1 1 1 1 3
FHCAMT 0 1 1.270000e+00 0.76 -1 1 1 2 3
FHPLACE 0 1 1.050000e+00 0.62 -1 1 1 1 3
FHCHECKX 0 1 3.170000e+00 1.29 -1 3 4 4 4
FHHELP 0 1 2.330000e+00 1.50 -1 1 2 3 5
FOSTORY2X 0 1 1.430000e+00 0.50 1 1 1 2 2
FOCRAFTS 0 1 1.590000e+00 0.49 1 1 2 2 2
FOGAMES 0 1 1.520000e+00 0.50 1 1 2 2 2
FOBUILDX 0 1 1.430000e+00 0.50 1 1 1 2 2
FOSPORT 0 1 1.270000e+00 0.44 1 1 1 2 2
FORESPON 0 1 1.290000e+00 0.45 1 1 1 2 2
FOHISTX 0 1 1.460000e+00 0.50 1 1 1 2 2
FODINNERX 0 1 4.750000e+00 2.07 0 3 5 7 7
FOLIBRAYX 0 1 1.670000e+00 0.47 1 1 2 2 2
FOBOOKSTX 0 1 1.640000e+00 0.48 1 1 2 2 2
HDHEALTH 0 1 1.570000e+00 0.77 1 1 1 2 5
CDOBMM 0 1 6.530000e+00 3.41 1 4 7 9 12
CDOBYY 0 1 2.002950e+03 3.79 1995 2000 2003 2006 2012
CSEX 0 1 1.490000e+00 0.50 1 1 1 2 2
CSPEAKX 0 1 2.330000e+00 0.96 1 2 2 2 6
HHTOTALXX 0 1 4.030000e+00 1.22 2 3 4 5 10
RELATION 0 1 1.640000e+00 1.28 1 1 1 2 9
P1REL 0 1 1.290000e+00 0.97 1 1 1 1 6
P1SEX 0 1 1.700000e+00 0.46 1 1 2 2 2
P1MRSTA 0 1 1.780000e+00 1.34 1 1 1 3 5
P1EMPL 0 1 1.980000e+00 1.74 1 1 1 2 7
P1HRSWK 0 1 3.116000e+01 19.68 -1 16 40 40 80
P1MTHSWRK 0 1 9.370000e+00 4.59 0 9 12 12 12
P1AGE 0 1 4.488000e+01 8.72 15 39 45 50 90
P2GUARD 0 1 1.260000e+00 0.44 1 1 1 2 2
TTLHHINC 0 1 6.620000e+00 2.84 1 4 7 9 10
OWNRNTHB 0 1 1.260000e+00 0.48 1 1 1 1 3
DSBLTY 0 1 1.830000e+00 0.38 1 2 2 2 2
HHPARN16X 0 1 1.460000e+00 0.80 1 1 1 2 4
HHPARN16_BRD 0 1 1.260000e+00 0.44 1 1 1 2 2
NUMSIBSX 0 1 1.020000e+00 0.95 0 0 1 1 7
PARGRADEX 0 1 3.550000e+00 1.15 1 3 4 4 5
RACEETHN 0 1 1.890000e+00 1.13 1 1 1 3 4
INTACC 0 1 1.200000e+00 0.61 1 1 1 1 4
CENREG 0 1 2.520000e+00 1.04 1 2 2 3 4
ZIPLOCL 0 1 2.270000e+01 10.27 11 13 21 23 43

pfi_2019 |>
  skim() |>
  dplyr::select(-numeric.hist)
Data summary
Name pfi_2019
Number of rows 15500
Number of columns 75
_______________________
Column type frequency:
numeric 75
________________________
Group variables None

Variable type: numeric

skim_variable n_missing complete_rate mean sd p0 p25 p50 p75 p100
BASMID 0 1 2.019111e+10 65000.56 20191000012 2.019106e+10 20191113174 20191169241 20191225500
ALLGRADEX 0 1 9.610000e+00 3.86 2 6.750000e+00 10 13 15
EDCPUB 0 1 1.110000e+00 0.31 1 1.000000e+00 1 1 2
SCCHOICE 0 1 1.370000e+00 0.48 -1 1.000000e+00 1 2 2
SPUBCHOIX 0 1 1.900000e+00 0.78 -1 1.000000e+00 2 3 3
SCONSIDR 0 1 1.620000e+00 0.49 -1 1.000000e+00 2 2 2
SCHLHRSWK 0 1 3.770000e+00 0.60 -1 4.000000e+00 4 4 4
EINTNET 0 1 3.920000e+00 0.35 -1 4.000000e+00 4 4 4
MOSTIMPT 0 1 -5.200000e-01 2.22 -1 -1.000000e+00 -1 -1 14
INTNUM 0 1 -8.200000e-01 0.83 -1 -1.000000e+00 -1 -1 16
SEENJOY 0 1 1.770000e+00 0.70 -1 1.000000e+00 2 2 4
SEGRADES 0 1 2.000000e+00 1.33 -1 1.000000e+00 2 2 5
SEABSNT 0 1 1.230000e+00 0.55 -1 1.000000e+00 1 1 4
SEGRADEQ 0 1 2.040000e+00 0.91 -1 1.000000e+00 2 3 5
FSSPORTX 0 1 1.180000e+00 0.45 -1 1.000000e+00 1 1 2
FSVOL 0 1 1.560000e+00 0.54 -1 1.000000e+00 2 2 2
FSMTNG 0 1 1.130000e+00 0.40 -1 1.000000e+00 1 1 2
FSPTMTNG 0 1 1.520000e+00 0.55 -1 1.000000e+00 2 2 2
FSATCNFN 0 1 1.250000e+00 0.49 -1 1.000000e+00 1 2 2
FSFUNDRS 0 1 1.400000e+00 0.54 -1 1.000000e+00 1 2 2
FSCOMMTE 0 1 1.850000e+00 0.42 -1 2.000000e+00 2 2 2
FSCOUNSLR 0 1 1.620000e+00 0.53 -1 1.000000e+00 2 2 2
FSFREQ 0 1 6.950000e+00 9.23 -1 2.000000e+00 4 8 99
FSNOTESX 0 1 1.330000e+00 0.49 -1 1.000000e+00 1 2 2
FSMEMO 0 1 1.090000e+00 0.32 -1 1.000000e+00 1 1 2
FCSCHOOL 0 1 1.440000e+00 0.67 -1 1.000000e+00 1 2 4
FCTEACHR 0 1 1.460000e+00 0.67 -1 1.000000e+00 1 2 4
FCSTDS 0 1 1.460000e+00 0.68 -1 1.000000e+00 1 2 4
FCORDER 0 1 1.520000e+00 0.75 -1 1.000000e+00 1 2 4
FCSUPPRT 0 1 1.590000e+00 0.77 -1 1.000000e+00 1 2 4
FHHOME 0 1 3.160000e+00 1.11 -1 3.000000e+00 3 4 6
FHWKHRS 0 1 5.360000e+00 5.39 -1 2.000000e+00 4 7 75
FHAMOUNT 0 1 1.200000e+00 0.88 -1 1.000000e+00 1 1 3
FHCAMT 0 1 1.230000e+00 0.80 -1 1.000000e+00 1 2 3
FHPLACE 0 1 1.010000e+00 0.66 -1 1.000000e+00 1 1 3
FHCHECKX 0 1 3.080000e+00 1.38 -1 3.000000e+00 4 4 4
FHHELP 0 1 2.190000e+00 1.55 -1 1.000000e+00 2 3 5
FOSTORY2X 0 1 1.410000e+00 0.49 1 1.000000e+00 1 2 2
FOCRAFTS 0 1 1.560000e+00 0.50 1 1.000000e+00 2 2 2
FOGAMES 0 1 1.460000e+00 0.50 1 1.000000e+00 1 2 2
FOBUILDX 0 1 1.450000e+00 0.50 1 1.000000e+00 1 2 2
FOSPORT 0 1 1.320000e+00 0.47 1 1.000000e+00 1 2 2
FORESPON 0 1 1.300000e+00 0.46 1 1.000000e+00 1 2 2
FOHISTX 0 1 1.460000e+00 0.50 1 1.000000e+00 1 2 2
FODINNERX 0 1 4.780000e+00 2.05 0 3.000000e+00 5 7 7
FOLIBRAYX 0 1 1.680000e+00 0.47 1 1.000000e+00 2 2 2
FOBOOKSTX 0 1 1.670000e+00 0.47 1 1.000000e+00 2 2 2
HDHEALTH 0 1 1.580000e+00 0.77 1 1.000000e+00 1 2 5
CDOBMM 0 1 6.560000e+00 3.42 1 4.000000e+00 7 9 12
CDOBYY 0 1 2.006010e+03 3.82 1998 2.003000e+03 2006 2009 2015
CSEX 0 1 1.480000e+00 0.50 1 1.000000e+00 1 2 2
CSPEAKX 0 1 2.290000e+00 0.89 1 2.000000e+00 2 2 6
HHTOTALXX 0 1 4.050000e+00 1.23 2 3.000000e+00 4 5 10
RELATION 0 1 1.790000e+00 1.49 1 1.000000e+00 1 2 11
P1REL 0 1 1.290000e+00 0.97 1 1.000000e+00 1 1 6
P1SEX 0 1 1.660000e+00 0.47 1 1.000000e+00 2 2 2
P1MRSTA 0 1 1.820000e+00 1.37 1 1.000000e+00 1 3 5
P1EMPL 0 1 1.890000e+00 1.70 1 1.000000e+00 1 2 7
P1HRSWK 0 1 3.251000e+01 19.04 -1 2.300000e+01 40 43 80
P1MTHSWRK 0 1 9.680000e+00 4.38 0 1.000000e+01 12 12 12
P1AGE 0 1 4.480000e+01 8.70 15 3.900000e+01 44 50 90
P2GUARD 0 1 1.260000e+00 0.44 1 1.000000e+00 1 2 2
TTLHHINC 0 1 7.230000e+00 3.05 1 5.000000e+00 8 9 12
OWNRNTHB 0 1 1.260000e+00 0.47 1 1.000000e+00 1 1 3
CHLDNT 0 1 1.740000e+00 1.01 1 1.000000e+00 1 2 5
SEFUTUREX 0 1 4.920000e+00 1.20 1 4.000000e+00 5 6 6
DSBLTY 0 1 1.760000e+00 0.43 1 2.000000e+00 2 2 2
HHPARN19X 0 1 1.450000e+00 0.79 1 1.000000e+00 1 2 4
HHPARN19_BRD 0 1 1.260000e+00 0.44 1 1.000000e+00 1 2 2
NUMSIBSX 0 1 1.030000e+00 0.96 0 0.000000e+00 1 1 7
PARGRADEX 0 1 3.620000e+00 1.15 1 3.000000e+00 4 5 5
RACEETH 0 1 2.000000e+00 1.29 1 1.000000e+00 1 3 5
INTACC 0 1 1.140000e+00 0.53 1 1.000000e+00 1 1 4
CENREG 0 1 2.530000e+00 1.03 1 2.000000e+00 2 3 4
ZIPLOCL 0 1 2.281000e+01 10.31 11 1.300000e+01 21 31 43
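
Several columns (especially in the 2019 file) have a minimum of -1, which appears to be a reserved survey code (e.g., a valid skip) rather than a measured value. A minimal sketch for counting those codes per column before any recoding; the column name n_minus_one is just an illustrative label:

# Count how many -1 (reserved/skip) codes each numeric column contains
pfi_2019 |>
  summarise(across(where(is.numeric), ~ sum(.x == -1))) |>
  pivot_longer(everything(), names_to = "variable", values_to = "n_minus_one") |>
  arrange(desc(n_minus_one))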

Check the relationship between the potential predictor variables and the response variable

pfi_2016 |>
  dplyr::select(SEGRADES, HHPARN16X, PARGRADEX, EDCPUB, INTACC, SEABSNT, SEFUTUREX, TTLHHINC) |>
  ggpairs()

pfi_2019 |>
  dplyr::select(SEGRADES, HHPARN19X, PARGRADEX, EDCPUB, INTACC, SEABSNT, SEFUTUREX, TTLHHINC) |>
  ggpairs()

Concatenate the two datasets

pfi_data <- rbind(pfi_2016 |>
                    mutate(HHPARNX = HHPARN16X) |>
                    dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC),
                  pfi_2019 |>
                    mutate(HHPARNX = HHPARN19X) |>
                    dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC)
                  )
pfi_data |>
  glimpse()
## Rows: 28,963
## Columns: 7
## $ SEGRADES  <dbl> 1, 2, 5, 1, 2, 1, 3, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, …
## $ HHPARNX   <dbl> 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 4, 2, 4, 2, 1, 1, 2, 3, 3, 1, …
## $ PARGRADEX <dbl> 4, 3, 4, 4, 3, 1, 3, 5, 2, 3, 3, 2, 2, 1, 4, 4, 3, 3, 2, 5, …
## $ EDCPUB    <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
## $ INTACC    <dbl> 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, …
## $ SEABSNT   <dbl> 0, 0, 5, 1, 2, 3, 0, 4, 3, 8, 0, 4, 2, 2, 0, 8, 0, 6, 3, 10,…
## $ TTLHHINC  <dbl> 10, 6, 6, 9, 8, 4, 5, 9, 3, 7, 8, 3, 5, 2, 10, 8, 5, 8, 9, 9…
pfi_data |>
  dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC) |>
  ggpairs()

Encode all the variables and convert them to factors

pfi <- pfi_data |>
  mutate(GRADE = case_match(SEGRADES,
                    1 ~ "A",
                    2 ~ "B",
                    3 ~ "C",
                    4 ~ "D",
                    c(5,-1) ~ "Other"),
         PARENT = case_match(HHPARNX,
                             1 ~ "Both",
                             2 ~ "Mother",
                             3 ~ "Father",
                             4 ~ "None"),
         HAS_POST_SEC = case_match(PARGRADEX,
                                 c(1,2) ~ "No",
                                 c(3,4,5) ~ "Yes"),
         PUBLIC_SCHOOL = case_match(EDCPUB,
                                    1 ~ "Yes",
                                    2 ~ "No"),
         INTERNET_ACCESS = case_match(INTACC,
                                     c(1,2,3) ~ "Yes",
                                     4 ~ "No"),
         NO_OF_ABSENT = case_match(SEABSNT,
                                   c(1,2) ~ "0-10 days",
                                   c(3,4) ~ "11-20 days",
                                   .default = "Other"),
         INCOME_CAT = case_match(TTLHHINC,
                                 c(1:8) ~ "$0-100K",
                                 c(9:10) ~ "$100-200K",
                                 c(11:12) ~ "$200K or more"),
  GRADE = factor(GRADE, levels = c("Other", "D", "C", "B", "A")),
  PARENT = factor(PARENT, levels = c("None", "Father", "Mother", "Both")),
  HAS_POST_SEC = factor(HAS_POST_SEC, levels = c("No", "Yes")),
  PUBLIC_SCHOOL = factor(PUBLIC_SCHOOL, levels = c("No", "Yes")),
  INTERNET_ACCESS = factor(INTERNET_ACCESS, levels = c("No", "Yes")),
  NO_OF_ABSENT = factor(NO_OF_ABSENT, levels = c("0-10 days","11-20 days","Other")),
  INCOME_CAT = factor(INCOME_CAT, levels = c("$0-100K", "$100-200K", "$200K or more"))
  ) |>
  dplyr::select(GRADE, PARENT, HAS_POST_SEC, PUBLIC_SCHOOL, INTERNET_ACCESS, NO_OF_ABSENT, INCOME_CAT)
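
As a quick sanity check on the recoding, summary() reports the counts per factor level and flags any NA values produced by codes that did not match a case_match() rule (a minimal sketch):

# Verify the recoded factor levels and look for unintended NAs
pfi |>
  summary()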

Let’s check the relationship between GRADE and the variables PUBLIC_SCHOOL and INCOME_CAT

pfi |>
  ggbivariate("GRADE", c("PUBLIC_SCHOOL", "INCOME_CAT"), title = "Student Grade outcome by School and Parents Income Category")

Train and test split

set.seed(631)

# Split the dataset into training (75%) and testing (25%)
pfi_split <- initial_split(pfi, prop = 0.75, strata = GRADE)
pfi_train <- training(pfi_split)
pfi_test <- testing(pfi_split)
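
Because the split is stratified on GRADE, the class proportions should be nearly identical in the training and testing sets; a quick sketch to confirm:

# Compare GRADE proportions between the training and testing sets
pfi_train |>
  count(GRADE) |>
  mutate(prop = n / sum(n))

pfi_test |>
  count(GRADE) |>
  mutate(prop = n / sum(n))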

Define k-fold cross-validation with 5 folds

pfi.folds <- vfold_cv(pfi_train, v=5, strata = GRADE)
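
Each resample holds out roughly one fifth of the training data for assessment; the resample object and the fold sizes can be inspected directly (a brief sketch):

pfi.folds

# Analysis/assessment dimensions of the first fold
dim(analysis(pfi.folds$splits[[1]]))
dim(assessment(pfi.folds$splits[[1]]))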

Multinomial Regression

multi_spec <- multinom_reg() |>
  set_engine("nnet")

multi_wf <- workflow() |>
  add_model(multi_spec) |>
  add_formula(GRADE ~ .)

fit_multi <- multi_wf |>
  fit_resamples(pfi.folds, control = control_resamples(save_pred = TRUE, save_workflow = TRUE, extract = extract_fit_engine))
multi_metrics_train <- collect_metrics(fit_multi) |>
  mutate(Model = "Multinomial Regression", Data = "Train")

multi_metrics_train

Confusion matrix for the multinomial model on the cross-validation predictions

cv_predictions_multi <- collect_predictions(fit_multi)

# Confusion matrix
conf_mat_multi <- cv_predictions_multi %>% 
  count(GRADE, .pred_class, .drop = FALSE)

conf_mat_multi %>% 
  pivot_wider(
    names_from = .pred_class,
    values_from = n
  )
conf_mat_multi %>% 
  ggplot(aes(x = GRADE, y = n, fill = .pred_class)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Predicted vs Actual GRADE"
    ) +
  theme_minimal() + 
  theme(legend.position = "top")

Fit the model on the full training data

multi_model <- multinom_reg() |>
  set_engine("nnet") |>
  fit(GRADE ~ ., data = pfi_train)

tidy(multi_model) %>% 
  print(n = Inf) # This will display all rows of the tibble
## # A tibble: 44 × 6
##    y.level term                    estimate std.error statistic  p.value
##    <chr>   <chr>                      <dbl>     <dbl>     <dbl>    <dbl>
##  1 D       (Intercept)              -1.36      0.501     -2.72  6.59e- 3
##  2 D       PARENTFather             -0.388     0.249     -1.56  1.20e- 1
##  3 D       PARENTMother             -0.831     0.198     -4.19  2.77e- 5
##  4 D       PARENTBoth               -1.39      0.191     -7.27  3.60e-13
##  5 D       HAS_POST_SECYes          -0.664     0.123     -5.38  7.46e- 8
##  6 D       PUBLIC_SCHOOLYes          1.40      0.345      4.06  4.96e- 5
##  7 D       INTERNET_ACCESSYes       -0.342     0.348     -0.984 3.25e- 1
##  8 D       NO_OF_ABSENT11-20 days    0.814     0.145      5.60  2.15e- 8
##  9 D       NO_OF_ABSENTOther         0.813     0.122      6.64  3.14e-11
## 10 D       INCOME_CAT$100-200K      -1.06      0.171     -6.23  4.65e-10
## 11 D       INCOME_CAT$200K or more  -0.916     0.374     -2.45  1.43e- 2
## 12 C       (Intercept)               0.822     0.299      2.75  5.91e- 3
## 13 C       PARENTFather             -0.176     0.176     -1.00  3.17e- 1
## 14 C       PARENTMother             -0.583     0.143     -4.08  4.55e- 5
## 15 C       PARENTBoth               -0.998     0.136     -7.33  2.28e-13
## 16 C       HAS_POST_SECYes          -0.518     0.0795    -6.52  7.22e-11
## 17 C       PUBLIC_SCHOOLYes          0.287     0.111      2.59  9.69e- 3
## 18 C       INTERNET_ACCESSYes       -0.110     0.261     -0.423 6.72e- 1
## 19 C       NO_OF_ABSENT11-20 days    0.401     0.0871     4.60  4.18e- 6
## 20 C       NO_OF_ABSENTOther         0.359     0.0723     4.97  6.85e- 7
## 21 C       INCOME_CAT$100-200K      -0.669     0.0774    -8.64  5.50e-18
## 22 C       INCOME_CAT$200K or more  -1.23      0.182     -6.74  1.63e-11
## 23 B       (Intercept)               1.85      0.255      7.27  3.58e-13
## 24 B       PARENTFather              0.115     0.158      0.725 4.68e- 1
## 25 B       PARENTMother             -0.192     0.129     -1.49  1.37e- 1
## 26 B       PARENTBoth               -0.431     0.123     -3.50  4.63e- 4
## 27 B       HAS_POST_SECYes          -0.376     0.0675    -5.57  2.54e- 8
## 28 B       PUBLIC_SCHOOLYes          0.0370    0.0760     0.487 6.26e- 1
## 29 B       INTERNET_ACCESSYes       -0.315     0.225     -1.40  1.62e- 1
## 30 B       NO_OF_ABSENT11-20 days    0.0911    0.0705     1.29  1.96e- 1
## 31 B       NO_OF_ABSENTOther         0.0657    0.0580     1.13  2.58e- 1
## 32 B       INCOME_CAT$100-200K      -0.261     0.0548    -4.76  1.97e- 6
## 33 B       INCOME_CAT$200K or more  -0.528     0.0997    -5.30  1.16e- 7
## 34 A       (Intercept)               1.10      0.261      4.22  2.41e- 5
## 35 A       PARENTFather              0.462     0.160      2.89  3.84e- 3
## 36 A       PARENTMother              0.162     0.132      1.23  2.20e- 1
## 37 A       PARENTBoth                0.272     0.126      2.15  3.12e- 2
## 38 A       HAS_POST_SECYes           0.109     0.0673     1.62  1.06e- 1
## 39 A       PUBLIC_SCHOOLYes         -0.152     0.0686    -2.21  2.68e- 2
## 40 A       INTERNET_ACCESSYes        0.0324    0.232      0.140 8.89e- 1
## 41 A       NO_OF_ABSENT11-20 days   -0.159     0.0674    -2.36  1.84e- 2
## 42 A       NO_OF_ABSENTOther        -0.0751    0.0550    -1.37  1.72e- 1
## 43 A       INCOME_CAT$100-200K       0.0672    0.0504     1.33  1.83e- 1
## 44 A       INCOME_CAT$200K or more   0.0836    0.0853     0.979 3.27e- 1

The log-odds of a student receiving an “A” grade versus the reference category “Other”.

tidy(multi_model) %>%
  filter(y.level=='A') %>%
  dplyr::select(estimate,term)

\[
\begin{aligned}
\log\left(\frac{\hat{p}_{\texttt{"A"}}}{\hat{p}_{\texttt{"Other"}}}\right) = & \ 1.1039 \\
& + 0.46164431 \, \text{PARENTFather} \\
& + 0.16219859 \, \text{PARENTMother} \\
& + 0.27154298 \, \text{PARENTBoth} \\
& + 0.10870736 \, \text{HAS\_POST\_SECYes} \\
& - 0.15197641 \, \text{PUBLIC\_SCHOOLYes} \\
& + 0.03235633 \, \text{INTERNET\_ACCESSYes} \\
& - 0.15881349 \, \text{NO\_OF\_ABSENT11-20 days} \\
& - 0.07506882 \, \text{NO\_OF\_ABSENTOther} \\
& + 0.06715767 \, \text{INCOME\_CAT\$100-200K} \\
& + 0.08357023 \, \text{INCOME\_CAT\$200K or more}
\end{aligned}
\]
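
Exponentiating a coefficient converts the log-odds change into an odds ratio relative to the reference category “Other”; for example, exp(0.462) ≈ 1.59 for PARENTFather. A minimal sketch:

# Odds ratios for the "A" vs "Other" comparison
tidy(multi_model) |>
  filter(y.level == "A") |>
  mutate(odds_ratio = exp(estimate)) |>
  dplyr::select(term, estimate, odds_ratio)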

Predict using the test data

pfi_multi_aug <- augment(multi_model, new_data = pfi_test) 
pfi_multi_aug

Check accuracy and ROC_AUC

multi_test_metrics <- rbind(
pfi_multi_aug |>
  metrics(truth = GRADE, estimate = .pred_class) |>
  dplyr::filter(.metric == "accuracy") |>
  mutate(Model = "Multinomial Regression", Data = "Test"),
pfi_multi_aug |>
  roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
  mutate(Model = "Multinomial Regression", Data = "Test")
)

multi_test_metrics
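
Beyond the aggregate metrics, a confusion matrix on the held-out test set shows where the predictions go wrong class by class; a minimal sketch using yardstick (the same call works for the LDA predictions below):

pfi_multi_aug |>
  conf_mat(truth = GRADE, estimate = .pred_class)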

Linear Discriminant Analysis

lda_spec <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS")

lda_wf <- workflow() |>
  add_model(lda_spec) |>
  add_formula(GRADE ~ .)

fit_lda <- lda_wf |>
  fit_resamples(pfi.folds, control = control_resamples(save_pred = TRUE, save_workflow = TRUE, extract = extract_fit_engine))

lda_metrics_train <- collect_metrics(fit_lda) |>
  mutate(Model = "LDA", Data = "Train")

lda_metrics_train

Confusion matrix for the LDA model on the cross-validation predictions

cv_predictions_lda <- collect_predictions(fit_lda)

# Confusion matrix
conf_mat_lda <- cv_predictions_lda %>% 
  count(GRADE, .pred_class, .drop = FALSE)

conf_mat_lda %>% 
  pivot_wider(
    names_from = .pred_class,
    values_from = n
  )
conf_mat_lda %>% 
  ggplot(aes(x = GRADE, y = n, fill = .pred_class)) +
  geom_bar(stat = "identity") +
  scale_fill_brewer(palette = "Set2") +
  labs(
    title = "Predicted vs Actual GRADE"
    ) +
  theme_minimal() + 
  theme(legend.position = "top")

Fit the model on the full training data

lda_model <- discrim_linear() %>%
  set_mode("classification") %>%
  set_engine("MASS") |>
  fit(GRADE ~ ., data = pfi_train)

lda_model
## parsnip model object
## 
## Call:
## lda(GRADE ~ ., data = data)
## 
## Prior probabilities of groups:
##      Other          D          C          B          A 
## 0.12374551 0.01928920 0.09128994 0.28956818 0.47610717 
## 
## Group means:
##       PARENTFather PARENTMother PARENTBoth HAS_POST_SECYes PUBLIC_SCHOOLYes
## Other   0.04278274    0.1856399  0.7377232       0.8638393        0.8876488
## D       0.10023866    0.3054893  0.4582339       0.6563246        0.9785203
## C       0.09026727    0.2793747  0.5350479       0.7196167        0.9329299
## B       0.07249603    0.2383148  0.6343402       0.7817170        0.9044515
## A       0.05134403    0.1633146  0.7602011       0.8826146        0.8680139
##       INTERNET_ACCESSYes NO_OF_ABSENT11-20 days NO_OF_ABSENTOther
## Other          0.9906994              0.1283482         0.2139137
## D              0.9665871              0.2052506         0.3556086
## C              0.9793243              0.1704488         0.2813918
## B              0.9815580              0.1400636         0.2311606
## A              0.9919745              0.1128408         0.2035390
##       INCOME_CAT$100-200K INCOME_CAT$200K or more
## Other           0.3459821              0.08072917
## D               0.1097852              0.01909308
## C               0.1790217              0.01966717
## B               0.2662957              0.04546900
## A               0.3684974              0.09205183
## 
## Coefficients of linear discriminants:
##                                LD1        LD2          LD3        LD4
## PARENTFather            -1.0088962  2.7053003  2.184326680 -1.6417640
## PARENTMother            -1.1936981  2.6679281 -1.437210029 -1.1835795
## PARENTBoth              -1.9371515  2.2149478 -1.001298286 -1.1784287
## HAS_POST_SECYes         -1.0089877 -0.6518897 -0.004238791  1.0228292
## PUBLIC_SCHOOLYes         0.5019837 -0.7976498 -1.503719481 -0.7528578
## INTERNET_ACCESSYes      -0.4456721 -1.0044516  0.608645609  6.5555885
## NO_OF_ABSENT11-20 days   0.7352946 -1.1543041 -0.702487947  0.1486981
## NO_OF_ABSENTOther        0.5772406 -1.5110239  0.205484055 -0.3271538
## INCOME_CAT$100-200K     -0.8225327 -0.1387578 -0.143245660 -0.7863718
## INCOME_CAT$200K or more -1.1055775 -2.2070822  0.521825832 -1.9155322
## 
## Proportion of trace:
##    LD1    LD2    LD3    LD4 
## 0.9650 0.0259 0.0061 0.0029
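
The proportion of trace indicates that LD1 carries about 96.5% of the between-class separation. One way to visualize this is to project the training data onto the first discriminant; a minimal sketch, assuming the underlying MASS fit can be pulled out with extract_fit_engine() (lda_engine and lda_scores are illustrative names):

# Project the training data onto the linear discriminants
lda_engine <- extract_fit_engine(lda_model)
lda_scores <- predict(lda_engine, newdata = pfi_train)

tibble(GRADE = pfi_train$GRADE, LD1 = lda_scores$x[, 1]) |>
  ggplot(aes(x = GRADE, y = LD1)) +
  geom_boxplot() +
  theme_minimal()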

Predict using the test data

pfi_lda_aug <- augment(lda_model, new_data = pfi_test) 
pfi_lda_aug

Check accuracy and ROC_AUC

lda_test_metrics <- rbind(
pfi_lda_aug |>
  metrics(truth = GRADE, estimate = .pred_class) |>
  dplyr::filter(.metric == "accuracy") |>
  mutate(Model = "LDA", Data = "Test"),
pfi_lda_aug |>
  roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
  mutate(Model = "LDA", Data = "Test")
)

lda_test_metrics

Model Evaluation & Comparison

Comparison of model performance on the training data

train_plot <- rbind(multi_metrics_train, lda_metrics_train) |>
  dplyr::filter(.metric %in% c("accuracy", "roc_auc")) |>
  ggplot(aes(x = Model, y = mean, fill = .metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Model Performance Comparison on Train Data (Multinomial vs LDA)", 
       y = "Value", 
       x = "Model", 
       fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
  theme_minimal()

train_plot

Comparison of model performance on the test data

test_plot <- rbind(multi_test_metrics, lda_test_metrics) |>
  ggplot(aes(x = Model, y = .estimate, fill = .metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  labs(title = "Model Performance Comparison on Test Data (Multinomial vs LDA)", 
       y = "Value", 
       x = "Model", 
       fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
  theme_minimal()

test_plot

# Combine train and test metrics
all_metrics <- bind_rows(multi_metrics_train |>
                           mutate(.estimate = mean) |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         lda_metrics_train  |>
                           mutate(.estimate = mean) |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         multi_test_metrics |>
                           dplyr::select(.metric, .estimate, Model, Data), 
                         lda_test_metrics |>
                           dplyr::select(.metric, .estimate, Model, Data)) |>
  filter(.metric %in% c("accuracy", "roc_auc")) 

ggplot(all_metrics, aes(x = Model, y = .estimate, fill = .metric)) +
  geom_bar(stat = "identity", position = "dodge") +
  facet_wrap(. ~ Data) +
  labs(title = "Model Performance Comparison (Train vs Test)", 
       y = "Metric Value", x = "Model", fill = "Metric") +
  scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
  theme_minimal()
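
The same comparison can be read as a table rather than a plot; a minimal sketch reshaping all_metrics:

# One row per model/data combination, one column per metric
all_metrics |>
  pivot_wider(names_from = .metric, values_from = .estimate) |>
  arrange(Data, Model)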