pfi_2016 <- openxlsx::read.xlsx("pfi-data.xlsx", sheet = "curated 2016")
pfi_2019 <- openxlsx::read.xlsx("pfi-data.xlsx", sheet = "curated 2019")
Name | pfi_2016 |
Number of rows | 13463 |
Number of columns | 69 |
_______________________ | |
Column type frequency: | |
character | 1 |
numeric | 68 |
________________________ | |
Group variables | None |
Variable type: character
skim_variable | n_missing | complete_rate | min | max | empty | n_unique | whitespace |
---|---|---|---|---|---|---|---|
ALLGRADEX | 0 | 1 | 1 | 2 | 0 | 13 | 0 |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
BASMID | 0 | 1 | 2.016111e+10 | 65733.15 | 20161000013 | 20161056265 | 20161112330 | 20161169876 | 20161226588 |
EDCPUB | 0 | 1 | 1.110000e+00 | 0.32 | 1 | 1 | 1 | 1 | 2 |
SPUBCHOIX | 0 | 1 | 1.940000e+00 | 0.77 | 1 | 1 | 2 | 3 | 3 |
SCONSIDR | 0 | 1 | 1.690000e+00 | 0.46 | 1 | 1 | 2 | 2 | 2 |
SEENJOY | 0 | 1 | 1.750000e+00 | 0.70 | 1 | 1 | 2 | 2 | 4 |
SEGRADES | 0 | 1 | 2.050000e+00 | 1.31 | 1 | 1 | 2 | 2 | 5 |
SEABSNT | 0 | 1 | 4.210000e+00 | 6.89 | 0 | 1 | 3 | 5 | 364 |
SEFUTUREX | 0 | 1 | 4.960000e+00 | 1.22 | 1 | 4 | 5 | 6 | 6 |
SEGRADEQ | 0 | 1 | 2.070000e+00 | 0.91 | 1 | 1 | 2 | 3 | 5 |
FSSPORTX | 0 | 1 | 1.200000e+00 | 0.40 | 1 | 1 | 1 | 1 | 2 |
FSVOL | 0 | 1 | 1.580000e+00 | 0.49 | 1 | 1 | 2 | 2 | 2 |
FSMTNG | 0 | 1 | 1.140000e+00 | 0.35 | 1 | 1 | 1 | 1 | 2 |
FSPTMTNG | 0 | 1 | 1.550000e+00 | 0.50 | 1 | 1 | 2 | 2 | 2 |
FSATCNFN | 0 | 1 | 1.260000e+00 | 0.44 | 1 | 1 | 1 | 2 | 2 |
FSFUNDRS | 0 | 1 | 1.390000e+00 | 0.49 | 1 | 1 | 1 | 2 | 2 |
FSCOMMTE | 0 | 1 | 1.870000e+00 | 0.34 | 1 | 2 | 2 | 2 | 2 |
FSCOUNSLR | 0 | 1 | 1.640000e+00 | 0.48 | 1 | 1 | 2 | 2 | 2 |
FSFREQ | 0 | 1 | 7.930000e+00 | 9.00 | 0 | 3 | 5 | 10 | 99 |
FSNOTESX | 0 | 1 | 1.370000e+00 | 0.48 | 1 | 1 | 1 | 2 | 2 |
FSMEMO | 0 | 1 | 1.100000e+00 | 0.29 | 1 | 1 | 1 | 1 | 2 |
FCSCHOOL | 0 | 1 | 1.490000e+00 | 0.70 | 1 | 1 | 1 | 2 | 4 |
FCTEACHR | 0 | 1 | 1.500000e+00 | 0.68 | 1 | 1 | 1 | 2 | 4 |
FCSTDS | 0 | 1 | 1.500000e+00 | 0.70 | 1 | 1 | 1 | 2 | 4 |
FCORDER | 0 | 1 | 1.530000e+00 | 0.75 | 1 | 1 | 1 | 2 | 4 |
FCSUPPRT | 0 | 1 | 1.620000e+00 | 0.77 | 1 | 1 | 1 | 2 | 4 |
FHHOME | 0 | 1 | 3.220000e+00 | 1.01 | 1 | 3 | 3 | 4 | 6 |
FHWKHRS | 0 | 1 | 5.530000e+00 | 5.43 | -1 | 2 | 4 | 8 | 75 |
FHAMOUNT | 0 | 1 | 1.250000e+00 | 0.85 | -1 | 1 | 1 | 1 | 3 |
FHCAMT | 0 | 1 | 1.270000e+00 | 0.76 | -1 | 1 | 1 | 2 | 3 |
FHPLACE | 0 | 1 | 1.050000e+00 | 0.62 | -1 | 1 | 1 | 1 | 3 |
FHCHECKX | 0 | 1 | 3.170000e+00 | 1.29 | -1 | 3 | 4 | 4 | 4 |
FHHELP | 0 | 1 | 2.330000e+00 | 1.50 | -1 | 1 | 2 | 3 | 5 |
FOSTORY2X | 0 | 1 | 1.430000e+00 | 0.50 | 1 | 1 | 1 | 2 | 2 |
FOCRAFTS | 0 | 1 | 1.590000e+00 | 0.49 | 1 | 1 | 2 | 2 | 2 |
FOGAMES | 0 | 1 | 1.520000e+00 | 0.50 | 1 | 1 | 2 | 2 | 2 |
FOBUILDX | 0 | 1 | 1.430000e+00 | 0.50 | 1 | 1 | 1 | 2 | 2 |
FOSPORT | 0 | 1 | 1.270000e+00 | 0.44 | 1 | 1 | 1 | 2 | 2 |
FORESPON | 0 | 1 | 1.290000e+00 | 0.45 | 1 | 1 | 1 | 2 | 2 |
FOHISTX | 0 | 1 | 1.460000e+00 | 0.50 | 1 | 1 | 1 | 2 | 2 |
FODINNERX | 0 | 1 | 4.750000e+00 | 2.07 | 0 | 3 | 5 | 7 | 7 |
FOLIBRAYX | 0 | 1 | 1.670000e+00 | 0.47 | 1 | 1 | 2 | 2 | 2 |
FOBOOKSTX | 0 | 1 | 1.640000e+00 | 0.48 | 1 | 1 | 2 | 2 | 2 |
HDHEALTH | 0 | 1 | 1.570000e+00 | 0.77 | 1 | 1 | 1 | 2 | 5 |
CDOBMM | 0 | 1 | 6.530000e+00 | 3.41 | 1 | 4 | 7 | 9 | 12 |
CDOBYY | 0 | 1 | 2.002950e+03 | 3.79 | 1995 | 2000 | 2003 | 2006 | 2012 |
CSEX | 0 | 1 | 1.490000e+00 | 0.50 | 1 | 1 | 1 | 2 | 2 |
CSPEAKX | 0 | 1 | 2.330000e+00 | 0.96 | 1 | 2 | 2 | 2 | 6 |
HHTOTALXX | 0 | 1 | 4.030000e+00 | 1.22 | 2 | 3 | 4 | 5 | 10 |
RELATION | 0 | 1 | 1.640000e+00 | 1.28 | 1 | 1 | 1 | 2 | 9 |
P1REL | 0 | 1 | 1.290000e+00 | 0.97 | 1 | 1 | 1 | 1 | 6 |
P1SEX | 0 | 1 | 1.700000e+00 | 0.46 | 1 | 1 | 2 | 2 | 2 |
P1MRSTA | 0 | 1 | 1.780000e+00 | 1.34 | 1 | 1 | 1 | 3 | 5 |
P1EMPL | 0 | 1 | 1.980000e+00 | 1.74 | 1 | 1 | 1 | 2 | 7 |
P1HRSWK | 0 | 1 | 3.116000e+01 | 19.68 | -1 | 16 | 40 | 40 | 80 |
P1MTHSWRK | 0 | 1 | 9.370000e+00 | 4.59 | 0 | 9 | 12 | 12 | 12 |
P1AGE | 0 | 1 | 4.488000e+01 | 8.72 | 15 | 39 | 45 | 50 | 90 |
P2GUARD | 0 | 1 | 1.260000e+00 | 0.44 | 1 | 1 | 1 | 2 | 2 |
TTLHHINC | 0 | 1 | 6.620000e+00 | 2.84 | 1 | 4 | 7 | 9 | 10 |
OWNRNTHB | 0 | 1 | 1.260000e+00 | 0.48 | 1 | 1 | 1 | 1 | 3 |
DSBLTY | 0 | 1 | 1.830000e+00 | 0.38 | 1 | 2 | 2 | 2 | 2 |
HHPARN16X | 0 | 1 | 1.460000e+00 | 0.80 | 1 | 1 | 1 | 2 | 4 |
HHPARN16_BRD | 0 | 1 | 1.260000e+00 | 0.44 | 1 | 1 | 1 | 2 | 2 |
NUMSIBSX | 0 | 1 | 1.020000e+00 | 0.95 | 0 | 0 | 1 | 1 | 7 |
PARGRADEX | 0 | 1 | 3.550000e+00 | 1.15 | 1 | 3 | 4 | 4 | 5 |
RACEETHN | 0 | 1 | 1.890000e+00 | 1.13 | 1 | 1 | 1 | 3 | 4 |
INTACC | 0 | 1 | 1.200000e+00 | 0.61 | 1 | 1 | 1 | 1 | 4 |
CENREG | 0 | 1 | 2.520000e+00 | 1.04 | 1 | 2 | 2 | 3 | 4 |
ZIPLOCL | 0 | 1 | 2.270000e+01 | 10.27 | 11 | 13 | 21 | 23 | 43 |
Name | pfi_2019 |
Number of rows | 15500 |
Number of columns | 75 |
_______________________ | |
Column type frequency: | |
numeric | 75 |
________________________ | |
Group variables | None |
Variable type: numeric
skim_variable | n_missing | complete_rate | mean | sd | p0 | p25 | p50 | p75 | p100 |
---|---|---|---|---|---|---|---|---|---|
BASMID | 0 | 1 | 2.019111e+10 | 65000.56 | 20191000012 | 2.019106e+10 | 20191113174 | 20191169241 | 20191225500 |
ALLGRADEX | 0 | 1 | 9.610000e+00 | 3.86 | 2 | 6.750000e+00 | 10 | 13 | 15 |
EDCPUB | 0 | 1 | 1.110000e+00 | 0.31 | 1 | 1.000000e+00 | 1 | 1 | 2 |
SCCHOICE | 0 | 1 | 1.370000e+00 | 0.48 | -1 | 1.000000e+00 | 1 | 2 | 2 |
SPUBCHOIX | 0 | 1 | 1.900000e+00 | 0.78 | -1 | 1.000000e+00 | 2 | 3 | 3 |
SCONSIDR | 0 | 1 | 1.620000e+00 | 0.49 | -1 | 1.000000e+00 | 2 | 2 | 2 |
SCHLHRSWK | 0 | 1 | 3.770000e+00 | 0.60 | -1 | 4.000000e+00 | 4 | 4 | 4 |
EINTNET | 0 | 1 | 3.920000e+00 | 0.35 | -1 | 4.000000e+00 | 4 | 4 | 4 |
MOSTIMPT | 0 | 1 | -5.200000e-01 | 2.22 | -1 | -1.000000e+00 | -1 | -1 | 14 |
INTNUM | 0 | 1 | -8.200000e-01 | 0.83 | -1 | -1.000000e+00 | -1 | -1 | 16 |
SEENJOY | 0 | 1 | 1.770000e+00 | 0.70 | -1 | 1.000000e+00 | 2 | 2 | 4 |
SEGRADES | 0 | 1 | 2.000000e+00 | 1.33 | -1 | 1.000000e+00 | 2 | 2 | 5 |
SEABSNT | 0 | 1 | 1.230000e+00 | 0.55 | -1 | 1.000000e+00 | 1 | 1 | 4 |
SEGRADEQ | 0 | 1 | 2.040000e+00 | 0.91 | -1 | 1.000000e+00 | 2 | 3 | 5 |
FSSPORTX | 0 | 1 | 1.180000e+00 | 0.45 | -1 | 1.000000e+00 | 1 | 1 | 2 |
FSVOL | 0 | 1 | 1.560000e+00 | 0.54 | -1 | 1.000000e+00 | 2 | 2 | 2 |
FSMTNG | 0 | 1 | 1.130000e+00 | 0.40 | -1 | 1.000000e+00 | 1 | 1 | 2 |
FSPTMTNG | 0 | 1 | 1.520000e+00 | 0.55 | -1 | 1.000000e+00 | 2 | 2 | 2 |
FSATCNFN | 0 | 1 | 1.250000e+00 | 0.49 | -1 | 1.000000e+00 | 1 | 2 | 2 |
FSFUNDRS | 0 | 1 | 1.400000e+00 | 0.54 | -1 | 1.000000e+00 | 1 | 2 | 2 |
FSCOMMTE | 0 | 1 | 1.850000e+00 | 0.42 | -1 | 2.000000e+00 | 2 | 2 | 2 |
FSCOUNSLR | 0 | 1 | 1.620000e+00 | 0.53 | -1 | 1.000000e+00 | 2 | 2 | 2 |
FSFREQ | 0 | 1 | 6.950000e+00 | 9.23 | -1 | 2.000000e+00 | 4 | 8 | 99 |
FSNOTESX | 0 | 1 | 1.330000e+00 | 0.49 | -1 | 1.000000e+00 | 1 | 2 | 2 |
FSMEMO | 0 | 1 | 1.090000e+00 | 0.32 | -1 | 1.000000e+00 | 1 | 1 | 2 |
FCSCHOOL | 0 | 1 | 1.440000e+00 | 0.67 | -1 | 1.000000e+00 | 1 | 2 | 4 |
FCTEACHR | 0 | 1 | 1.460000e+00 | 0.67 | -1 | 1.000000e+00 | 1 | 2 | 4 |
FCSTDS | 0 | 1 | 1.460000e+00 | 0.68 | -1 | 1.000000e+00 | 1 | 2 | 4 |
FCORDER | 0 | 1 | 1.520000e+00 | 0.75 | -1 | 1.000000e+00 | 1 | 2 | 4 |
FCSUPPRT | 0 | 1 | 1.590000e+00 | 0.77 | -1 | 1.000000e+00 | 1 | 2 | 4 |
FHHOME | 0 | 1 | 3.160000e+00 | 1.11 | -1 | 3.000000e+00 | 3 | 4 | 6 |
FHWKHRS | 0 | 1 | 5.360000e+00 | 5.39 | -1 | 2.000000e+00 | 4 | 7 | 75 |
FHAMOUNT | 0 | 1 | 1.200000e+00 | 0.88 | -1 | 1.000000e+00 | 1 | 1 | 3 |
FHCAMT | 0 | 1 | 1.230000e+00 | 0.80 | -1 | 1.000000e+00 | 1 | 2 | 3 |
FHPLACE | 0 | 1 | 1.010000e+00 | 0.66 | -1 | 1.000000e+00 | 1 | 1 | 3 |
FHCHECKX | 0 | 1 | 3.080000e+00 | 1.38 | -1 | 3.000000e+00 | 4 | 4 | 4 |
FHHELP | 0 | 1 | 2.190000e+00 | 1.55 | -1 | 1.000000e+00 | 2 | 3 | 5 |
FOSTORY2X | 0 | 1 | 1.410000e+00 | 0.49 | 1 | 1.000000e+00 | 1 | 2 | 2 |
FOCRAFTS | 0 | 1 | 1.560000e+00 | 0.50 | 1 | 1.000000e+00 | 2 | 2 | 2 |
FOGAMES | 0 | 1 | 1.460000e+00 | 0.50 | 1 | 1.000000e+00 | 1 | 2 | 2 |
FOBUILDX | 0 | 1 | 1.450000e+00 | 0.50 | 1 | 1.000000e+00 | 1 | 2 | 2 |
FOSPORT | 0 | 1 | 1.320000e+00 | 0.47 | 1 | 1.000000e+00 | 1 | 2 | 2 |
FORESPON | 0 | 1 | 1.300000e+00 | 0.46 | 1 | 1.000000e+00 | 1 | 2 | 2 |
FOHISTX | 0 | 1 | 1.460000e+00 | 0.50 | 1 | 1.000000e+00 | 1 | 2 | 2 |
FODINNERX | 0 | 1 | 4.780000e+00 | 2.05 | 0 | 3.000000e+00 | 5 | 7 | 7 |
FOLIBRAYX | 0 | 1 | 1.680000e+00 | 0.47 | 1 | 1.000000e+00 | 2 | 2 | 2 |
FOBOOKSTX | 0 | 1 | 1.670000e+00 | 0.47 | 1 | 1.000000e+00 | 2 | 2 | 2 |
HDHEALTH | 0 | 1 | 1.580000e+00 | 0.77 | 1 | 1.000000e+00 | 1 | 2 | 5 |
CDOBMM | 0 | 1 | 6.560000e+00 | 3.42 | 1 | 4.000000e+00 | 7 | 9 | 12 |
CDOBYY | 0 | 1 | 2.006010e+03 | 3.82 | 1998 | 2.003000e+03 | 2006 | 2009 | 2015 |
CSEX | 0 | 1 | 1.480000e+00 | 0.50 | 1 | 1.000000e+00 | 1 | 2 | 2 |
CSPEAKX | 0 | 1 | 2.290000e+00 | 0.89 | 1 | 2.000000e+00 | 2 | 2 | 6 |
HHTOTALXX | 0 | 1 | 4.050000e+00 | 1.23 | 2 | 3.000000e+00 | 4 | 5 | 10 |
RELATION | 0 | 1 | 1.790000e+00 | 1.49 | 1 | 1.000000e+00 | 1 | 2 | 11 |
P1REL | 0 | 1 | 1.290000e+00 | 0.97 | 1 | 1.000000e+00 | 1 | 1 | 6 |
P1SEX | 0 | 1 | 1.660000e+00 | 0.47 | 1 | 1.000000e+00 | 2 | 2 | 2 |
P1MRSTA | 0 | 1 | 1.820000e+00 | 1.37 | 1 | 1.000000e+00 | 1 | 3 | 5 |
P1EMPL | 0 | 1 | 1.890000e+00 | 1.70 | 1 | 1.000000e+00 | 1 | 2 | 7 |
P1HRSWK | 0 | 1 | 3.251000e+01 | 19.04 | -1 | 2.300000e+01 | 40 | 43 | 80 |
P1MTHSWRK | 0 | 1 | 9.680000e+00 | 4.38 | 0 | 1.000000e+01 | 12 | 12 | 12 |
P1AGE | 0 | 1 | 4.480000e+01 | 8.70 | 15 | 3.900000e+01 | 44 | 50 | 90 |
P2GUARD | 0 | 1 | 1.260000e+00 | 0.44 | 1 | 1.000000e+00 | 1 | 2 | 2 |
TTLHHINC | 0 | 1 | 7.230000e+00 | 3.05 | 1 | 5.000000e+00 | 8 | 9 | 12 |
OWNRNTHB | 0 | 1 | 1.260000e+00 | 0.47 | 1 | 1.000000e+00 | 1 | 1 | 3 |
CHLDNT | 0 | 1 | 1.740000e+00 | 1.01 | 1 | 1.000000e+00 | 1 | 2 | 5 |
SEFUTUREX | 0 | 1 | 4.920000e+00 | 1.20 | 1 | 4.000000e+00 | 5 | 6 | 6 |
DSBLTY | 0 | 1 | 1.760000e+00 | 0.43 | 1 | 2.000000e+00 | 2 | 2 | 2 |
HHPARN19X | 0 | 1 | 1.450000e+00 | 0.79 | 1 | 1.000000e+00 | 1 | 2 | 4 |
HHPARN19_BRD | 0 | 1 | 1.260000e+00 | 0.44 | 1 | 1.000000e+00 | 1 | 2 | 2 |
NUMSIBSX | 0 | 1 | 1.030000e+00 | 0.96 | 0 | 0.000000e+00 | 1 | 1 | 7 |
PARGRADEX | 0 | 1 | 3.620000e+00 | 1.15 | 1 | 3.000000e+00 | 4 | 5 | 5 |
RACEETH | 0 | 1 | 2.000000e+00 | 1.29 | 1 | 1.000000e+00 | 1 | 3 | 5 |
INTACC | 0 | 1 | 1.140000e+00 | 0.53 | 1 | 1.000000e+00 | 1 | 1 | 4 |
CENREG | 0 | 1 | 2.530000e+00 | 1.03 | 1 | 2.000000e+00 | 2 | 3 | 4 |
ZIPLOCL | 0 | 1 | 2.281000e+01 | 10.31 | 11 | 1.300000e+01 | 21 | 31 | 43 |
pfi_2016 |>
dplyr::select(SEGRADES, HHPARN16X, PARGRADEX, EDCPUB, INTACC, SEABSNT, SEFUTUREX, TTLHHINC) |>
ggpairs()
pfi_2019 |>
dplyr::select(SEGRADES, HHPARN19X, PARGRADEX, EDCPUB, INTACC, SEABSNT, SEFUTUREX, TTLHHINC) |>
ggpairs()
pfi_data <- rbind(pfi_2016 |>
mutate(HHPARNX = HHPARN16X) |>
dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC),
pfi_2019 |>
mutate(HHPARNX = HHPARN19X) |>
dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC)
)
## Rows: 28,963
## Columns: 7
## $ SEGRADES <dbl> 1, 2, 5, 1, 2, 1, 3, 1, 3, 2, 1, 2, 2, 2, 1, 1, 1, 2, 2, 2, …
## $ HHPARNX <dbl> 1, 1, 1, 1, 1, 3, 2, 1, 1, 2, 4, 2, 4, 2, 1, 1, 2, 3, 3, 1, …
## $ PARGRADEX <dbl> 4, 3, 4, 4, 3, 1, 3, 5, 2, 3, 3, 2, 2, 1, 4, 4, 3, 3, 2, 5, …
## $ EDCPUB <dbl> 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 1, 2, 1, 1, 1, 1, 1, …
## $ INTACC <dbl> 1, 1, 1, 1, 1, 3, 1, 1, 3, 1, 2, 1, 1, 3, 1, 1, 1, 1, 1, 1, …
## $ SEABSNT <dbl> 0, 0, 5, 1, 2, 3, 0, 4, 3, 8, 0, 4, 2, 2, 0, 8, 0, 6, 3, 10,…
## $ TTLHHINC <dbl> 10, 6, 6, 9, 8, 4, 5, 9, 3, 7, 8, 3, 5, 2, 10, 8, 5, 8, 9, 9…
pfi_data |>
dplyr::select(SEGRADES, HHPARNX, PARGRADEX, EDCPUB, INTACC, SEABSNT, TTLHHINC) |>
ggpairs()
pfi <- pfi_data |>
mutate(GRADE = case_match(SEGRADES,
1 ~ "A",
2 ~ "B",
3 ~ "C",
4 ~ "D",
c(5,-1) ~ "Other"),
PARENT = case_match(HHPARNX,
1 ~ "Both",
2 ~ "Mother",
3 ~ "Father",
4 ~ "None"),
HAS_POST_SEC = case_match(PARGRADEX,
c(1,2) ~ "No",
c(3,4,5) ~ "Yes"),
PUBLIC_SCHOOL = case_match(EDCPUB,
1 ~ "Yes",
2 ~ "No"),
INTERNET_ACCESS = case_match(INTACC,
c(1,2,3) ~ "Yes",
4 ~ "No"),
NO_OF_ABSENT = case_match(SEABSNT,
c(1,2) ~ "0-10 days",
c(3,4) ~ "11-20 days",
.default = "Other"),
INCOME_CAT = case_match(TTLHHINC,
c(1:8) ~ "$0-100K",
c(9:10) ~ "$100-200K",
c(11:12) ~ "$200K or more"),
GRADE = factor(GRADE, levels = c("Other", "D", "C", "B", "A")),
PARENT = factor(PARENT, levels = c("None", "Father", "Mother", "Both")),
HAS_POST_SEC = factor(HAS_POST_SEC, levels = c("No", "Yes")),
PUBLIC_SCHOOL = factor(PUBLIC_SCHOOL, levels = c("No", "Yes")),
INTERNET_ACCESS = factor(INTERNET_ACCESS, levels = c("No", "Yes")),
NO_OF_ABSENT = factor(NO_OF_ABSENT, levels = c("0-10 days","11-20 days","Other")),
INCOME_CAT = factor(INCOME_CAT, levels = c("$0-100K", "$100-200K", "$200K or more"))
) |>
dplyr::select(GRADE, PARENT, HAS_POST_SEC, PUBLIC_SCHOOL, INTERNET_ACCESS, NO_OF_ABSENT, INCOME_CAT)
Let’s check the relationship between GRADE and the variables NO_OF_ABSENT and INCOME_CAT
pfi |>
ggbivariate("GRADE", c("PUBLIC_SCHOOL", "INCOME_CAT"), title = "Student Grade outcome by School and Parents Income Category")
multi_spec <- multinom_reg() |>
set_engine("nnet")
multi_wf <- workflow() |>
add_model(multi_spec) |>
add_formula(as.formula("GRADE ~ ."))
fit_multi <- multi_wf |>
fit_resamples(pfi.folds, control=control_resamples(save_pred=TRUE, save_workflow=TRUE, extract=extract_model))
## → A | warning: `extract_model()` was deprecated in tune 0.1.6.
## ℹ Please use `extract_fit_engine()` instead.
## ℹ The deprecated feature was likely used in the tune package.
## Please report the issue at <https://github.com/tidymodels/tune/issues>.
## There were issues with some computations A: x1There were issues with some computations A: x1
multi_metrics_train <- collect_metrics(fit_multi) |>
mutate(Model = "Multinomial Regression", Data = "Train")
multi_metrics_train
cv_predictions_multi <- collect_predictions(fit_multi)
# Confusion matrix
conf_mat_multi <- cv_predictions_multi %>%
count(GRADE, .pred_class, .drop = FALSE)
conf_mat_multi %>%
pivot_wider(
names_from = .pred_class,
values_from = n
)
conf_mat_multi %>%
ggplot(aes(x = GRADE, y = n, fill = .pred_class)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set2") +
labs(
main = "Predicted vs Actual GRADE"
) +
theme_minimal() +
theme(legend.position = "top")
multi_model <- multinom_reg() |>
set_engine("nnet") |>
fit(GRADE ~ ., data = pfi_train)
tidy(multi_model) %>%
print(n = Inf) # This will display all rows of the tibble
## # A tibble: 44 × 6
## y.level term estimate std.error statistic p.value
## <chr> <chr> <dbl> <dbl> <dbl> <dbl>
## 1 D (Intercept) -1.36 0.501 -2.72 6.59e- 3
## 2 D PARENTFather -0.388 0.249 -1.56 1.20e- 1
## 3 D PARENTMother -0.831 0.198 -4.19 2.77e- 5
## 4 D PARENTBoth -1.39 0.191 -7.27 3.60e-13
## 5 D HAS_POST_SECYes -0.664 0.123 -5.38 7.46e- 8
## 6 D PUBLIC_SCHOOLYes 1.40 0.345 4.06 4.96e- 5
## 7 D INTERNET_ACCESSYes -0.342 0.348 -0.984 3.25e- 1
## 8 D NO_OF_ABSENT11-20 days 0.814 0.145 5.60 2.15e- 8
## 9 D NO_OF_ABSENTOther 0.813 0.122 6.64 3.14e-11
## 10 D INCOME_CAT$100-200K -1.06 0.171 -6.23 4.65e-10
## 11 D INCOME_CAT$200K or more -0.916 0.374 -2.45 1.43e- 2
## 12 C (Intercept) 0.822 0.299 2.75 5.91e- 3
## 13 C PARENTFather -0.176 0.176 -1.00 3.17e- 1
## 14 C PARENTMother -0.583 0.143 -4.08 4.55e- 5
## 15 C PARENTBoth -0.998 0.136 -7.33 2.28e-13
## 16 C HAS_POST_SECYes -0.518 0.0795 -6.52 7.22e-11
## 17 C PUBLIC_SCHOOLYes 0.287 0.111 2.59 9.69e- 3
## 18 C INTERNET_ACCESSYes -0.110 0.261 -0.423 6.72e- 1
## 19 C NO_OF_ABSENT11-20 days 0.401 0.0871 4.60 4.18e- 6
## 20 C NO_OF_ABSENTOther 0.359 0.0723 4.97 6.85e- 7
## 21 C INCOME_CAT$100-200K -0.669 0.0774 -8.64 5.50e-18
## 22 C INCOME_CAT$200K or more -1.23 0.182 -6.74 1.63e-11
## 23 B (Intercept) 1.85 0.255 7.27 3.58e-13
## 24 B PARENTFather 0.115 0.158 0.725 4.68e- 1
## 25 B PARENTMother -0.192 0.129 -1.49 1.37e- 1
## 26 B PARENTBoth -0.431 0.123 -3.50 4.63e- 4
## 27 B HAS_POST_SECYes -0.376 0.0675 -5.57 2.54e- 8
## 28 B PUBLIC_SCHOOLYes 0.0370 0.0760 0.487 6.26e- 1
## 29 B INTERNET_ACCESSYes -0.315 0.225 -1.40 1.62e- 1
## 30 B NO_OF_ABSENT11-20 days 0.0911 0.0705 1.29 1.96e- 1
## 31 B NO_OF_ABSENTOther 0.0657 0.0580 1.13 2.58e- 1
## 32 B INCOME_CAT$100-200K -0.261 0.0548 -4.76 1.97e- 6
## 33 B INCOME_CAT$200K or more -0.528 0.0997 -5.30 1.16e- 7
## 34 A (Intercept) 1.10 0.261 4.22 2.41e- 5
## 35 A PARENTFather 0.462 0.160 2.89 3.84e- 3
## 36 A PARENTMother 0.162 0.132 1.23 2.20e- 1
## 37 A PARENTBoth 0.272 0.126 2.15 3.12e- 2
## 38 A HAS_POST_SECYes 0.109 0.0673 1.62 1.06e- 1
## 39 A PUBLIC_SCHOOLYes -0.152 0.0686 -2.21 2.68e- 2
## 40 A INTERNET_ACCESSYes 0.0324 0.232 0.140 8.89e- 1
## 41 A NO_OF_ABSENT11-20 days -0.159 0.0674 -2.36 1.84e- 2
## 42 A NO_OF_ABSENTOther -0.0751 0.0550 -1.37 1.72e- 1
## 43 A INCOME_CAT$100-200K 0.0672 0.0504 1.33 1.83e- 1
## 44 A INCOME_CAT$200K or more 0.0836 0.0853 0.979 3.27e- 1
The log-odds of a student who got “Other/No” grade vs “A” grade.
\[ \begin{aligned} \log\left(\frac{\hat{p}_{\texttt{"A"}}}{\hat{p}_{\texttt{"Other"}}}\right) = & -1.1039 \\ & + 0.46164431{\space}PARENTFather \\ & + 0.16219859{\space}PARENTMother \\ & + 0.27154298{\space}PARENTBoth \\ & + 0.10870736{\space}HAS\_POST\_SECYes \\ & - 0.15197641{\space}PUBLIC\_SCHOOLYes \\ & + 0.03235633{\space}INTERNET\_ACCESSYes \\ & - 0.15881349{\space}NO\_OF\_ABSENT11-20 days \\ & - 0.07506882{\space}NO\_OF\_ABSENTOther \\ & + 0.06715767{\space}INCOME\_CAT100-200K \\ & + 0.08357023{\space}INCOME\_CAT200K or more \end{aligned} \]
multi_test_metrics <- rbind(
pfi_multi_aug |>
metrics(truth = GRADE, estimate = .pred_class) |>
dplyr::filter(.metric == "accuracy") |>
mutate(Model = "Multinomial Regression", Data = "Test"),
pfi_multi_aug |>
roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
mutate(Model = "Multinomial Regression", Data = "Test")
)
multi_test_metrics
lda_spec <- discrim_linear() %>%
set_mode("classification") %>%
set_engine("MASS")
lda_wf <- workflow() |>
add_model(lda_spec) |>
add_formula(as.formula("GRADE ~ ."))
fit_lda <- lda_wf |>
fit_resamples(pfi.folds, control=control_resamples(save_pred=TRUE, save_workflow=TRUE, extract=extract_model))
lda_metrics_train <- collect_metrics(fit_lda) |>
mutate(Model = "LDA", Data = "Train")
lda_metrics_train
cv_predictions_lda <- collect_predictions(fit_lda)
# Confusion matrix
conf_mat_lda <- cv_predictions_lda %>%
count(GRADE, .pred_class, .drop = FALSE)
conf_mat_lda %>%
pivot_wider(
names_from = .pred_class,
values_from = n
)
conf_mat_lda %>%
ggplot(aes(x = GRADE, y = n, fill = .pred_class)) +
geom_bar(stat = "identity") +
scale_fill_brewer(palette = "Set2") +
labs(
main = "Predicted vs Actual GRADE"
) +
theme_minimal() +
theme(legend.position = "top")
lda_model <- discrim_linear() %>%
set_mode("classification") %>%
set_engine("MASS") |>
fit(GRADE ~ ., data = pfi_train)
lda_model
## parsnip model object
##
## Call:
## lda(GRADE ~ ., data = data)
##
## Prior probabilities of groups:
## Other D C B A
## 0.12374551 0.01928920 0.09128994 0.28956818 0.47610717
##
## Group means:
## PARENTFather PARENTMother PARENTBoth HAS_POST_SECYes PUBLIC_SCHOOLYes
## Other 0.04278274 0.1856399 0.7377232 0.8638393 0.8876488
## D 0.10023866 0.3054893 0.4582339 0.6563246 0.9785203
## C 0.09026727 0.2793747 0.5350479 0.7196167 0.9329299
## B 0.07249603 0.2383148 0.6343402 0.7817170 0.9044515
## A 0.05134403 0.1633146 0.7602011 0.8826146 0.8680139
## INTERNET_ACCESSYes NO_OF_ABSENT11-20 days NO_OF_ABSENTOther
## Other 0.9906994 0.1283482 0.2139137
## D 0.9665871 0.2052506 0.3556086
## C 0.9793243 0.1704488 0.2813918
## B 0.9815580 0.1400636 0.2311606
## A 0.9919745 0.1128408 0.2035390
## INCOME_CAT$100-200K INCOME_CAT$200K or more
## Other 0.3459821 0.08072917
## D 0.1097852 0.01909308
## C 0.1790217 0.01966717
## B 0.2662957 0.04546900
## A 0.3684974 0.09205183
##
## Coefficients of linear discriminants:
## LD1 LD2 LD3 LD4
## PARENTFather -1.0088962 2.7053003 2.184326680 -1.6417640
## PARENTMother -1.1936981 2.6679281 -1.437210029 -1.1835795
## PARENTBoth -1.9371515 2.2149478 -1.001298286 -1.1784287
## HAS_POST_SECYes -1.0089877 -0.6518897 -0.004238791 1.0228292
## PUBLIC_SCHOOLYes 0.5019837 -0.7976498 -1.503719481 -0.7528578
## INTERNET_ACCESSYes -0.4456721 -1.0044516 0.608645609 6.5555885
## NO_OF_ABSENT11-20 days 0.7352946 -1.1543041 -0.702487947 0.1486981
## NO_OF_ABSENTOther 0.5772406 -1.5110239 0.205484055 -0.3271538
## INCOME_CAT$100-200K -0.8225327 -0.1387578 -0.143245660 -0.7863718
## INCOME_CAT$200K or more -1.1055775 -2.2070822 0.521825832 -1.9155322
##
## Proportion of trace:
## LD1 LD2 LD3 LD4
## 0.9650 0.0259 0.0061 0.0029
lda_test_metrics <- rbind(
pfi_lda_aug |>
metrics(truth = GRADE, estimate = .pred_class) |>
dplyr::filter(.metric == "accuracy") |>
mutate(Model = "LDA", Data = "Test"),
pfi_lda_aug |>
roc_auc(truth = GRADE, .pred_A, .pred_B, .pred_C, .pred_D, .pred_Other) |>
mutate(Model = "LDA", Data = "Test")
)
lda_test_metrics
Comparison on Model Performance for train data
train_plot <- rbind(multi_metrics_train, lda_metrics_train) |>
dplyr::filter(.metric %in% c("accuracy", "roc_auc")) |>
ggplot(aes(x = Model, y = mean, fill = .metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Model Performance Comparison on Train Data (Multinomial vs LDA)",
y = "Value",
x = "Model",
fill = "Metric") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
theme_minimal()
train_plot
Comparison on Model Performance for test data
test_plot <- rbind(multi_test_metrics, lda_test_metrics) |>
ggplot(aes(x = Model, y = .estimate, fill = .metric)) +
geom_bar(stat = "identity", position = "dodge") +
labs(title = "Model Performance Comparison on Test Data (Multinomial vs LDA)",
y = "Value",
x = "Model",
fill = "Metric") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
theme_minimal()
test_plot
# Combine train and test metrics
all_metrics <- bind_rows(multi_metrics_train |>
mutate(.estimate = mean) |>
dplyr::select(.metric, .estimate, Model, Data),
lda_metrics_train |>
mutate(.estimate = mean) |>
dplyr::select(.metric, .estimate, Model, Data),
multi_test_metrics |>
dplyr::select(.metric, .estimate, Model, Data),
lda_test_metrics |>
dplyr::select(.metric, .estimate, Model, Data)) |>
filter(.metric %in% c("accuracy", "roc_auc"))
ggplot(all_metrics, aes(x = Model, y = .estimate, fill = .metric)) +
geom_bar(stat = "identity", position = "dodge") +
facet_wrap(. ~ Data) +
labs(title = "Model Performance Comparison (Train vs Test)",
y = "Metric Value", x = "Model", fill = "Metric") +
scale_y_continuous(limits = c(0, 1), breaks = seq(0, 1, by = 0.2)) +
theme_minimal()