Data Cleaning

library(car)
## Loading required package: carData
library(ResourceSelection)
## ResourceSelection 0.3-6   2023-06-27
library(MASS)

mat <- read.csv("~/Documents/Regression Analysis/final/student-mat.csv",sep=";")

# Define a binary y-variable (failing:0 vs. passing:1)
mat$y_pass <- ifelse(mat$G3 >= 0 & mat$G3 <= 11, 0, 1)

# Exclude these data fields because we want to exclusively look at behavior and environment
mat <- mat[, !names(mat) %in% c("G1", "G2", "G3","failures","school","age","sex","health")]
names(mat) #response (y_pass) & predictor variables
##  [1] "address"    "famsize"    "Pstatus"    "Medu"       "Fedu"      
##  [6] "Mjob"       "Fjob"       "reason"     "guardian"   "traveltime"
## [11] "studytime"  "schoolsup"  "famsup"     "paid"       "activities"
## [16] "nursery"    "higher"     "internet"   "romantic"   "famrel"    
## [21] "freetime"   "goout"      "Dalc"       "Walc"       "absences"  
## [26] "y_pass"

Model 1: Logit Creation

# Full logistic regression model
model_full <- glm(y_pass ~ as.factor(address) + as.factor(famsize) + 
                    as.factor(Pstatus) + Medu + Fedu + as.factor(Mjob) + as.factor(Fjob) + as.factor(reason) + as.factor(guardian) + traveltime + studytime + as.factor(schoolsup) + as.factor(famsup) + as.factor(paid) + as.factor(activities) + as.factor(nursery) + as.factor(higher) + as.factor(internet)  + as.factor(romantic) + famrel + freetime + goout + Dalc + Walc + absences, data = mat, family=binomial)
summary(model_full) # AIC: 528.19
## 
## Call:
## glm(formula = y_pass ~ as.factor(address) + as.factor(famsize) + 
##     as.factor(Pstatus) + Medu + Fedu + as.factor(Mjob) + as.factor(Fjob) + 
##     as.factor(reason) + as.factor(guardian) + traveltime + studytime + 
##     as.factor(schoolsup) + as.factor(famsup) + as.factor(paid) + 
##     as.factor(activities) + as.factor(nursery) + as.factor(higher) + 
##     as.factor(internet) + as.factor(romantic) + famrel + freetime + 
##     goout + Dalc + Walc + absences, family = binomial, data = mat)
## 
## Coefficients:
##                             Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -1.62824    1.29867  -1.254  0.20992    
## as.factor(address)U          0.41813    0.31935   1.309  0.19042    
## as.factor(famsize)LE3        0.51155    0.26480   1.932  0.05338 .  
## as.factor(Pstatus)T          0.12638    0.39124   0.323  0.74667    
## Medu                         0.35127    0.17570   1.999  0.04558 *  
## Fedu                         0.15232    0.14745   1.033  0.30159    
## as.factor(Mjob)health        0.31019    0.60235   0.515  0.60658    
## as.factor(Mjob)other        -0.15904    0.39877  -0.399  0.69002    
## as.factor(Mjob)services      0.01389    0.43019   0.032  0.97424    
## as.factor(Mjob)teacher      -1.00198    0.56576  -1.771  0.07656 .  
## as.factor(Fjob)health       -0.27181    0.77631  -0.350  0.72624    
## as.factor(Fjob)other        -0.22016    0.54541  -0.404  0.68646    
## as.factor(Fjob)services     -0.26782    0.56135  -0.477  0.63329    
## as.factor(Fjob)teacher       0.56540    0.69821   0.810  0.41806    
## as.factor(reason)home       -0.03841    0.30212  -0.127  0.89885    
## as.factor(reason)other       0.22625    0.44095   0.513  0.60788    
## as.factor(reason)reputation  0.42878    0.31192   1.375  0.16924    
## as.factor(guardian)mother    0.04406    0.29542   0.149  0.88144    
## as.factor(guardian)other    -0.47516    0.51284  -0.927  0.35417    
## traveltime                  -0.16422    0.19281  -0.852  0.39439    
## studytime                    0.17517    0.15272   1.147  0.25138    
## as.factor(schoolsup)yes     -1.60860    0.42428  -3.791  0.00015 ***
## as.factor(famsup)yes        -0.47704    0.25702  -1.856  0.06345 .  
## as.factor(paid)yes          -0.09770    0.25828  -0.378  0.70522    
## as.factor(activities)yes    -0.06395    0.24070  -0.266  0.79049    
## as.factor(nursery)yes       -0.19930    0.29953  -0.665  0.50580    
## as.factor(higher)yes         0.71212    0.63570   1.120  0.26263    
## as.factor(internet)yes       0.28561    0.33903   0.842  0.39954    
## as.factor(romantic)yes      -0.17692    0.25512  -0.693  0.48801    
## famrel                       0.01100    0.13038   0.084  0.93273    
## freetime                     0.16716    0.12967   1.289  0.19738    
## goout                       -0.35497    0.12712  -2.792  0.00523 ** 
## Dalc                         0.06428    0.18688   0.344  0.73088    
## Walc                        -0.07395    0.13805  -0.536  0.59216    
## absences                    -0.03270    0.01851  -1.767  0.07724 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 458.19  on 360  degrees of freedom
## AIC: 528.19
## 
## Number of Fisher Scoring iterations: 4
# Null model
model_null <- glm(y_pass ~ 1, data = mat, family=binomial)

# Automated Stepwise Regression AIC 499.91
step_model1 <- stepAIC(model_null,
                      scope = list(lower = model_null, upper = model_full),
                      direction = "both", family=binomial)
## Start:  AIC=536.75
## y_pass ~ 1
## 
##                         Df Deviance    AIC
## + as.factor(schoolsup)   1   520.21 524.21
## + Medu                   1   521.47 525.47
## + Fedu                   1   526.45 530.45
## + goout                  1   528.24 532.24
## + absences               1   529.27 533.27
## + traveltime             1   530.01 534.01
## + as.factor(Mjob)        4   524.13 534.13
## + Walc                   1   530.37 534.37
## + as.factor(higher)      1   530.56 534.56
## + as.factor(address)     1   530.71 534.71
## <none>                       534.75 536.75
## + as.factor(famsize)     1   532.78 536.78
## + as.factor(internet)    1   532.79 536.79
## + studytime              1   532.87 536.87
## + Dalc                   1   533.14 537.14
## + as.factor(famsup)      1   533.54 537.54
## + as.factor(guardian)    2   531.69 537.69
## + as.factor(romantic)    1   534.54 538.54
## + freetime               1   534.60 538.60
## + as.factor(activities)  1   534.65 538.65
## + as.factor(nursery)     1   534.66 538.66
## + famrel                 1   534.70 538.70
## + as.factor(reason)      3   530.71 538.71
## + as.factor(Pstatus)     1   534.75 538.75
## + as.factor(paid)        1   534.75 538.75
## + as.factor(Fjob)        4   530.34 540.34
## 
## Step:  AIC=524.21
## y_pass ~ as.factor(schoolsup)
## 
##                         Df Deviance    AIC
## + Medu                   1   507.52 513.52
## + Fedu                   1   510.77 516.77
## + goout                  1   512.64 518.64
## + Walc                   1   514.12 520.12
## + absences               1   514.92 520.92
## + traveltime             1   515.10 521.10
## + as.factor(higher)      1   515.13 521.13
## + as.factor(address)     1   515.66 521.66
## + as.factor(Mjob)        4   510.55 522.55
## + studytime              1   517.85 523.85
## + as.factor(guardian)    2   516.14 524.14
## <none>                       520.21 524.21
## + as.factor(internet)    1   518.27 524.27
## + Dalc                   1   518.31 524.31
## + as.factor(famsize)     1   518.46 524.46
## + as.factor(romantic)    1   519.62 525.62
## + as.factor(famsup)      1   519.67 525.67
## + as.factor(reason)      3   515.74 525.74
## + as.factor(Fjob)        4   513.81 525.81
## + as.factor(activities)  1   519.96 525.96
## + as.factor(nursery)     1   519.98 525.98
## + freetime               1   520.16 526.16
## + famrel                 1   520.16 526.16
## + as.factor(Pstatus)     1   520.16 526.16
## + as.factor(paid)        1   520.20 526.20
## - as.factor(schoolsup)   1   534.75 536.75
## 
## Step:  AIC=513.52
## y_pass ~ as.factor(schoolsup) + Medu
## 
##                         Df Deviance    AIC
## + goout                  1   498.63 506.63
## + absences               1   500.04 508.04
## + Walc                   1   502.06 510.06
## + as.factor(higher)      1   504.62 512.62
## + traveltime             1   504.62 512.62
## + as.factor(address)     1   504.69 512.69
## + Dalc                   1   505.30 513.30
## + as.factor(famsize)     1   505.37 513.37
## + as.factor(famsup)      1   505.40 513.40
## <none>                       507.52 513.52
## + studytime              1   505.77 513.77
## + as.factor(guardian)    2   504.18 514.18
## + Fedu                   1   506.29 514.29
## + as.factor(romantic)    1   506.70 514.70
## + as.factor(paid)        1   506.95 514.95
## + as.factor(internet)    1   507.11 515.11
## + as.factor(Mjob)        4   501.24 515.24
## + famrel                 1   507.46 515.46
## + as.factor(nursery)     1   507.47 515.47
## + as.factor(Pstatus)     1   507.48 515.48
## + freetime               1   507.50 515.50
## + as.factor(activities)  1   507.51 515.51
## + as.factor(reason)      3   504.41 516.41
## + as.factor(Fjob)        4   504.54 518.54
## - Medu                   1   520.21 524.21
## - as.factor(schoolsup)   1   521.47 525.47
## 
## Step:  AIC=506.63
## y_pass ~ as.factor(schoolsup) + Medu + goout
## 
##                         Df Deviance    AIC
## + absences               1   491.95 501.95
## + as.factor(address)     1   494.97 504.97
## + traveltime             1   495.85 505.85
## + as.factor(famsize)     1   496.10 506.10
## + as.factor(higher)      1   496.11 506.11
## + as.factor(famsup)      1   496.23 506.23
## <none>                       498.63 506.63
## + as.factor(Mjob)        4   491.15 507.15
## + Walc                   1   497.20 507.20
## + Fedu                   1   497.28 507.28
## + studytime              1   497.30 507.30
## + freetime               1   497.63 507.63
## + as.factor(guardian)    2   495.69 507.69
## + as.factor(romantic)    1   497.73 507.73
## + as.factor(internet)    1   497.90 507.90
## + Dalc                   1   498.06 508.06
## + as.factor(paid)        1   498.10 508.10
## + famrel                 1   498.45 508.45
## + as.factor(Pstatus)     1   498.57 508.57
## + as.factor(activities)  1   498.58 508.58
## + as.factor(nursery)     1   498.58 508.58
## + as.factor(reason)      3   495.91 509.91
## + as.factor(Fjob)        4   496.13 512.13
## - goout                  1   507.52 513.52
## - Medu                   1   512.64 518.64
## - as.factor(schoolsup)   1   513.42 519.42
## 
## Step:  AIC=501.95
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences
## 
##                         Df Deviance    AIC
## + as.factor(address)     1   488.78 500.78
## + as.factor(famsize)     1   488.96 500.96
## + traveltime             1   489.13 501.13
## + as.factor(famsup)      1   489.47 501.47
## <none>                       491.95 501.95
## + as.factor(higher)      1   490.04 502.04
## + as.factor(Mjob)        4   484.18 502.18
## + as.factor(internet)    1   490.85 502.85
## + Fedu                   1   490.89 502.89
## + studytime              1   491.03 503.03
## + Walc                   1   491.23 503.23
## + freetime               1   491.30 503.30
## + as.factor(paid)        1   491.34 503.34
## + as.factor(romantic)    1   491.68 503.68
## + Dalc                   1   491.71 503.71
## + famrel                 1   491.87 503.87
## + as.factor(nursery)     1   491.90 503.90
## + as.factor(Pstatus)     1   491.93 503.93
## + as.factor(activities)  1   491.94 503.94
## + as.factor(guardian)    2   490.34 504.34
## + as.factor(reason)      3   488.70 504.70
## - absences               1   498.63 506.63
## + as.factor(Fjob)        4   489.85 507.85
## - goout                  1   500.04 508.04
## - as.factor(schoolsup)   1   506.73 514.73
## - Medu                   1   507.92 515.92
## 
## Step:  AIC=500.78
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address)
## 
##                         Df Deviance    AIC
## + as.factor(famsize)     1   486.14 500.14
## + as.factor(famsup)      1   486.23 500.23
## <none>                       488.78 500.78
## + as.factor(higher)      1   486.90 500.90
## + traveltime             1   487.43 501.43
## + as.factor(Mjob)        4   481.61 501.61
## + Fedu                   1   487.65 501.65
## + studytime              1   487.74 501.74
## - as.factor(address)     1   491.95 501.95
## + freetime               1   488.07 502.07
## + as.factor(paid)        1   488.08 502.08
## + as.factor(internet)    1   488.28 502.28
## + Walc                   1   488.41 502.41
## + as.factor(romantic)    1   488.50 502.50
## + as.factor(reason)      3   484.62 502.62
## + as.factor(nursery)     1   488.71 502.71
## + famrel                 1   488.71 502.71
## + Dalc                   1   488.73 502.73
## + as.factor(activities)  1   488.73 502.73
## + as.factor(Pstatus)     1   488.77 502.77
## + as.factor(guardian)    2   487.12 503.12
## - absences               1   494.97 504.97
## + as.factor(Fjob)        4   486.58 506.58
## - goout                  1   497.58 507.58
## - Medu                   1   502.84 512.84
## - as.factor(schoolsup)   1   503.93 513.93
## 
## Step:  AIC=500.14
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize)
## 
##                         Df Deviance    AIC
## + as.factor(famsup)      1   484.07 500.07
## <none>                       486.14 500.14
## + as.factor(Mjob)        4   478.25 500.25
## + as.factor(higher)      1   484.31 500.31
## + traveltime             1   484.42 500.42
## - as.factor(famsize)     1   488.78 500.78
## + studytime              1   484.88 500.88
## + Fedu                   1   484.88 500.88
## - as.factor(address)     1   488.96 500.96
## + freetime               1   485.44 501.44
## + as.factor(paid)        1   485.45 501.45
## + Walc                   1   485.58 501.58
## + as.factor(internet)    1   485.61 501.61
## + as.factor(romantic)    1   485.82 501.82
## + as.factor(reason)      3   481.88 501.88
## + as.factor(nursery)     1   485.94 501.94
## + Dalc                   1   485.97 501.97
## + famrel                 1   486.05 502.05
## + as.factor(activities)  1   486.09 502.09
## + as.factor(Pstatus)     1   486.10 502.10
## + as.factor(guardian)    2   484.57 502.57
## - absences               1   492.80 504.80
## + as.factor(Fjob)        4   483.73 505.73
## - goout                  1   495.26 507.26
## - Medu                   1   500.86 512.86
## - as.factor(schoolsup)   1   500.98 512.98
## 
## Step:  AIC=500.07
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize) + as.factor(famsup)
## 
##                         Df Deviance    AIC
## + as.factor(Mjob)        4   475.91 499.91
## + as.factor(higher)      1   482.05 500.05
## <none>                       484.07 500.07
## - as.factor(famsup)      1   486.14 500.14
## - as.factor(famsize)     1   486.23 500.23
## + studytime              1   482.26 500.26
## + traveltime             1   482.45 500.45
## + Fedu                   1   482.46 500.46
## - as.factor(address)     1   486.98 500.98
## + freetime               1   483.31 501.31
## + Walc                   1   483.31 501.31
## + as.factor(internet)    1   483.39 501.39
## + as.factor(reason)      3   479.49 501.49
## + as.factor(romantic)    1   483.77 501.77
## + Dalc                   1   483.87 501.87
## + as.factor(paid)        1   483.88 501.88
## + as.factor(nursery)     1   483.93 501.93
## + famrel                 1   484.00 502.00
## + as.factor(Pstatus)     1   484.03 502.03
## + as.factor(activities)  1   484.05 502.05
## + as.factor(guardian)    2   482.57 502.57
## - absences               1   490.70 504.70
## + as.factor(Fjob)        4   481.42 505.42
## - goout                  1   493.59 507.59
## - as.factor(schoolsup)   1   497.81 511.81
## - Medu                   1   500.41 514.41
## 
## Step:  AIC=499.91
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize) + as.factor(famsup) + as.factor(Mjob)
## 
##                         Df Deviance    AIC
## <none>                       475.91 499.91
## + Fedu                   1   474.00 500.00
## + as.factor(higher)      1   474.03 500.03
## - as.factor(Mjob)        4   484.07 500.07
## + studytime              1   474.23 500.23
## - as.factor(address)     1   478.24 500.24
## - as.factor(famsup)      1   478.25 500.25
## + traveltime             1   474.47 500.47
## + freetime               1   474.66 500.66
## - as.factor(famsize)     1   478.67 500.67
## + as.factor(internet)    1   475.00 501.00
## + Walc                   1   475.21 501.21
## + as.factor(romantic)    1   475.26 501.26
## + as.factor(Pstatus)     1   475.73 501.73
## + as.factor(nursery)     1   475.76 501.76
## + as.factor(activities)  1   475.83 501.83
## + famrel                 1   475.84 501.84
## + Dalc                   1   475.84 501.84
## + as.factor(paid)        1   475.84 501.84
## + as.factor(guardian)    2   474.51 502.51
## + as.factor(reason)      3   472.72 502.72
## + as.factor(Fjob)        4   472.10 504.10
## - absences               1   482.99 504.99
## - goout                  1   486.79 508.79
## - as.factor(schoolsup)   1   490.92 512.92
## - Medu                   1   491.10 513.10
# Automated Stepwise Regression AIC 499.91
step_model2 <- stepAIC(model_null,
                      scope = list(lower = model_null, upper = model_full),
                      direction = "forward", family=binomial) 
## Start:  AIC=536.75
## y_pass ~ 1
## 
##                         Df Deviance    AIC
## + as.factor(schoolsup)   1   520.21 524.21
## + Medu                   1   521.47 525.47
## + Fedu                   1   526.45 530.45
## + goout                  1   528.24 532.24
## + absences               1   529.27 533.27
## + traveltime             1   530.01 534.01
## + as.factor(Mjob)        4   524.13 534.13
## + Walc                   1   530.37 534.37
## + as.factor(higher)      1   530.56 534.56
## + as.factor(address)     1   530.71 534.71
## <none>                       534.75 536.75
## + as.factor(famsize)     1   532.78 536.78
## + as.factor(internet)    1   532.79 536.79
## + studytime              1   532.87 536.87
## + Dalc                   1   533.14 537.14
## + as.factor(famsup)      1   533.54 537.54
## + as.factor(guardian)    2   531.69 537.69
## + as.factor(romantic)    1   534.54 538.54
## + freetime               1   534.60 538.60
## + as.factor(activities)  1   534.65 538.65
## + as.factor(nursery)     1   534.66 538.66
## + famrel                 1   534.70 538.70
## + as.factor(reason)      3   530.71 538.71
## + as.factor(Pstatus)     1   534.75 538.75
## + as.factor(paid)        1   534.75 538.75
## + as.factor(Fjob)        4   530.34 540.34
## 
## Step:  AIC=524.21
## y_pass ~ as.factor(schoolsup)
## 
##                         Df Deviance    AIC
## + Medu                   1   507.52 513.52
## + Fedu                   1   510.77 516.77
## + goout                  1   512.64 518.64
## + Walc                   1   514.12 520.12
## + absences               1   514.92 520.92
## + traveltime             1   515.10 521.10
## + as.factor(higher)      1   515.13 521.13
## + as.factor(address)     1   515.66 521.66
## + as.factor(Mjob)        4   510.55 522.55
## + studytime              1   517.85 523.85
## + as.factor(guardian)    2   516.14 524.14
## <none>                       520.21 524.21
## + as.factor(internet)    1   518.27 524.27
## + Dalc                   1   518.31 524.31
## + as.factor(famsize)     1   518.46 524.46
## + as.factor(romantic)    1   519.62 525.62
## + as.factor(famsup)      1   519.67 525.67
## + as.factor(reason)      3   515.74 525.74
## + as.factor(Fjob)        4   513.81 525.81
## + as.factor(activities)  1   519.96 525.96
## + as.factor(nursery)     1   519.98 525.98
## + freetime               1   520.16 526.16
## + famrel                 1   520.16 526.16
## + as.factor(Pstatus)     1   520.16 526.16
## + as.factor(paid)        1   520.20 526.20
## 
## Step:  AIC=513.52
## y_pass ~ as.factor(schoolsup) + Medu
## 
##                         Df Deviance    AIC
## + goout                  1   498.63 506.63
## + absences               1   500.04 508.04
## + Walc                   1   502.06 510.06
## + as.factor(higher)      1   504.62 512.62
## + traveltime             1   504.62 512.62
## + as.factor(address)     1   504.69 512.69
## + Dalc                   1   505.30 513.30
## + as.factor(famsize)     1   505.37 513.37
## + as.factor(famsup)      1   505.40 513.40
## <none>                       507.52 513.52
## + studytime              1   505.77 513.77
## + as.factor(guardian)    2   504.18 514.18
## + Fedu                   1   506.29 514.29
## + as.factor(romantic)    1   506.70 514.70
## + as.factor(paid)        1   506.95 514.95
## + as.factor(internet)    1   507.11 515.11
## + as.factor(Mjob)        4   501.24 515.24
## + famrel                 1   507.46 515.46
## + as.factor(nursery)     1   507.47 515.47
## + as.factor(Pstatus)     1   507.48 515.48
## + freetime               1   507.50 515.50
## + as.factor(activities)  1   507.51 515.51
## + as.factor(reason)      3   504.41 516.41
## + as.factor(Fjob)        4   504.54 518.54
## 
## Step:  AIC=506.63
## y_pass ~ as.factor(schoolsup) + Medu + goout
## 
##                         Df Deviance    AIC
## + absences               1   491.95 501.95
## + as.factor(address)     1   494.97 504.97
## + traveltime             1   495.85 505.85
## + as.factor(famsize)     1   496.10 506.10
## + as.factor(higher)      1   496.11 506.11
## + as.factor(famsup)      1   496.23 506.23
## <none>                       498.63 506.63
## + as.factor(Mjob)        4   491.15 507.15
## + Walc                   1   497.20 507.20
## + Fedu                   1   497.28 507.28
## + studytime              1   497.30 507.30
## + freetime               1   497.63 507.63
## + as.factor(guardian)    2   495.69 507.69
## + as.factor(romantic)    1   497.73 507.73
## + as.factor(internet)    1   497.90 507.90
## + Dalc                   1   498.06 508.06
## + as.factor(paid)        1   498.10 508.10
## + famrel                 1   498.45 508.45
## + as.factor(Pstatus)     1   498.57 508.57
## + as.factor(activities)  1   498.58 508.58
## + as.factor(nursery)     1   498.58 508.58
## + as.factor(reason)      3   495.91 509.91
## + as.factor(Fjob)        4   496.13 512.13
## 
## Step:  AIC=501.95
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences
## 
##                         Df Deviance    AIC
## + as.factor(address)     1   488.78 500.78
## + as.factor(famsize)     1   488.96 500.96
## + traveltime             1   489.13 501.13
## + as.factor(famsup)      1   489.47 501.47
## <none>                       491.95 501.95
## + as.factor(higher)      1   490.04 502.04
## + as.factor(Mjob)        4   484.18 502.18
## + as.factor(internet)    1   490.85 502.85
## + Fedu                   1   490.89 502.89
## + studytime              1   491.03 503.03
## + Walc                   1   491.23 503.23
## + freetime               1   491.30 503.30
## + as.factor(paid)        1   491.34 503.34
## + as.factor(romantic)    1   491.68 503.68
## + Dalc                   1   491.71 503.71
## + famrel                 1   491.87 503.87
## + as.factor(nursery)     1   491.90 503.90
## + as.factor(Pstatus)     1   491.93 503.93
## + as.factor(activities)  1   491.94 503.94
## + as.factor(guardian)    2   490.34 504.34
## + as.factor(reason)      3   488.70 504.70
## + as.factor(Fjob)        4   489.85 507.85
## 
## Step:  AIC=500.78
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address)
## 
##                         Df Deviance    AIC
## + as.factor(famsize)     1   486.14 500.14
## + as.factor(famsup)      1   486.23 500.23
## <none>                       488.78 500.78
## + as.factor(higher)      1   486.90 500.90
## + traveltime             1   487.43 501.43
## + as.factor(Mjob)        4   481.61 501.61
## + Fedu                   1   487.65 501.65
## + studytime              1   487.74 501.74
## + freetime               1   488.07 502.07
## + as.factor(paid)        1   488.08 502.08
## + as.factor(internet)    1   488.28 502.28
## + Walc                   1   488.41 502.41
## + as.factor(romantic)    1   488.50 502.50
## + as.factor(reason)      3   484.62 502.62
## + as.factor(nursery)     1   488.71 502.71
## + famrel                 1   488.71 502.71
## + Dalc                   1   488.73 502.73
## + as.factor(activities)  1   488.73 502.73
## + as.factor(Pstatus)     1   488.77 502.77
## + as.factor(guardian)    2   487.12 503.12
## + as.factor(Fjob)        4   486.58 506.58
## 
## Step:  AIC=500.14
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize)
## 
##                         Df Deviance    AIC
## + as.factor(famsup)      1   484.07 500.07
## <none>                       486.14 500.14
## + as.factor(Mjob)        4   478.25 500.25
## + as.factor(higher)      1   484.31 500.31
## + traveltime             1   484.42 500.42
## + studytime              1   484.88 500.88
## + Fedu                   1   484.88 500.88
## + freetime               1   485.44 501.44
## + as.factor(paid)        1   485.45 501.45
## + Walc                   1   485.58 501.58
## + as.factor(internet)    1   485.61 501.61
## + as.factor(romantic)    1   485.82 501.82
## + as.factor(reason)      3   481.88 501.88
## + as.factor(nursery)     1   485.94 501.94
## + Dalc                   1   485.97 501.97
## + famrel                 1   486.05 502.05
## + as.factor(activities)  1   486.09 502.09
## + as.factor(Pstatus)     1   486.10 502.10
## + as.factor(guardian)    2   484.57 502.57
## + as.factor(Fjob)        4   483.73 505.73
## 
## Step:  AIC=500.07
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize) + as.factor(famsup)
## 
##                         Df Deviance    AIC
## + as.factor(Mjob)        4   475.91 499.91
## + as.factor(higher)      1   482.05 500.05
## <none>                       484.07 500.07
## + studytime              1   482.26 500.26
## + traveltime             1   482.45 500.45
## + Fedu                   1   482.46 500.46
## + freetime               1   483.31 501.31
## + Walc                   1   483.31 501.31
## + as.factor(internet)    1   483.39 501.39
## + as.factor(reason)      3   479.49 501.49
## + as.factor(romantic)    1   483.77 501.77
## + Dalc                   1   483.87 501.87
## + as.factor(paid)        1   483.88 501.88
## + as.factor(nursery)     1   483.93 501.93
## + famrel                 1   484.00 502.00
## + as.factor(Pstatus)     1   484.03 502.03
## + as.factor(activities)  1   484.05 502.05
## + as.factor(guardian)    2   482.57 502.57
## + as.factor(Fjob)        4   481.42 505.42
## 
## Step:  AIC=499.91
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize) + as.factor(famsup) + as.factor(Mjob)
## 
##                         Df Deviance    AIC
## <none>                       475.91 499.91
## + Fedu                   1   474.00 500.00
## + as.factor(higher)      1   474.03 500.03
## + studytime              1   474.23 500.23
## + traveltime             1   474.47 500.47
## + freetime               1   474.66 500.66
## + as.factor(internet)    1   475.00 501.00
## + Walc                   1   475.21 501.21
## + as.factor(romantic)    1   475.26 501.26
## + as.factor(Pstatus)     1   475.73 501.73
## + as.factor(nursery)     1   475.76 501.76
## + as.factor(activities)  1   475.83 501.83
## + famrel                 1   475.84 501.84
## + Dalc                   1   475.84 501.84
## + as.factor(paid)        1   475.84 501.84
## + as.factor(guardian)    2   474.51 502.51
## + as.factor(reason)      3   472.72 502.72
## + as.factor(Fjob)        4   472.10 504.10
# Automated Stepwise Regression AIC 536.75
step_model3 <- stepAIC(model_null,
                      scope = list(lower = model_null, upper = model_full),
                      direction = "back", family=binomial) 
## Start:  AIC=536.75
## y_pass ~ 1
# Automated Stepwise Regression Using BIC Criterion: AIC 521.84
n <- nrow(mat)
step_model4 <- stepAIC(model_null,
                       scope = list(lower = model_null, upper = model_full),
                       direction = "both", family=binomial,
                       k = log(n))
## Start:  AIC=540.73
## y_pass ~ 1
## 
##                         Df Deviance    AIC
## + as.factor(schoolsup)   1   520.21 532.17
## + Medu                   1   521.47 533.43
## + Fedu                   1   526.45 538.40
## + goout                  1   528.24 540.19
## <none>                       534.75 540.73
## + absences               1   529.27 541.23
## + traveltime             1   530.01 541.97
## + Walc                   1   530.37 542.33
## + as.factor(higher)      1   530.56 542.51
## + as.factor(address)     1   530.71 542.67
## + as.factor(famsize)     1   532.78 544.74
## + as.factor(internet)    1   532.79 544.74
## + studytime              1   532.87 544.83
## + Dalc                   1   533.14 545.10
## + as.factor(famsup)      1   533.54 545.50
## + as.factor(romantic)    1   534.54 546.50
## + freetime               1   534.60 546.56
## + as.factor(activities)  1   534.65 546.61
## + as.factor(nursery)     1   534.66 546.62
## + famrel                 1   534.70 546.66
## + as.factor(Pstatus)     1   534.75 546.71
## + as.factor(paid)        1   534.75 546.71
## + as.factor(guardian)    2   531.69 549.63
## + as.factor(Mjob)        4   524.13 554.03
## + as.factor(reason)      3   530.71 554.63
## + as.factor(Fjob)        4   530.34 560.23
## 
## Step:  AIC=532.17
## y_pass ~ as.factor(schoolsup)
## 
##                         Df Deviance    AIC
## + Medu                   1   507.52 525.45
## + Fedu                   1   510.77 528.71
## + goout                  1   512.64 530.58
## + Walc                   1   514.12 532.05
## <none>                       520.21 532.17
## + absences               1   514.92 532.86
## + traveltime             1   515.10 533.04
## + as.factor(higher)      1   515.13 533.06
## + as.factor(address)     1   515.66 533.60
## + studytime              1   517.85 535.79
## + as.factor(internet)    1   518.27 536.21
## + Dalc                   1   518.31 536.25
## + as.factor(famsize)     1   518.46 536.40
## + as.factor(romantic)    1   519.62 537.56
## + as.factor(famsup)      1   519.67 537.61
## + as.factor(activities)  1   519.96 537.90
## + as.factor(nursery)     1   519.98 537.91
## + freetime               1   520.16 538.09
## + famrel                 1   520.16 538.10
## + as.factor(Pstatus)     1   520.16 538.10
## + as.factor(paid)        1   520.20 538.13
## + as.factor(guardian)    2   516.14 540.05
## - as.factor(schoolsup)   1   534.75 540.73
## + as.factor(reason)      3   515.74 545.64
## + as.factor(Mjob)        4   510.55 546.43
## + as.factor(Fjob)        4   513.81 549.68
## 
## Step:  AIC=525.45
## y_pass ~ as.factor(schoolsup) + Medu
## 
##                         Df Deviance    AIC
## + goout                  1   498.63 522.54
## + absences               1   500.04 523.95
## <none>                       507.52 525.45
## + Walc                   1   502.06 525.97
## + as.factor(higher)      1   504.62 528.54
## + traveltime             1   504.62 528.54
## + as.factor(address)     1   504.69 528.61
## + Dalc                   1   505.30 529.22
## + as.factor(famsize)     1   505.37 529.28
## + as.factor(famsup)      1   505.40 529.32
## + studytime              1   505.77 529.69
## + Fedu                   1   506.29 530.20
## + as.factor(romantic)    1   506.70 530.62
## + as.factor(paid)        1   506.95 530.86
## + as.factor(internet)    1   507.11 531.03
## + famrel                 1   507.46 531.37
## + as.factor(nursery)     1   507.47 531.39
## + as.factor(Pstatus)     1   507.48 531.40
## + freetime               1   507.50 531.42
## + as.factor(activities)  1   507.51 531.43
## - Medu                   1   520.21 532.17
## - as.factor(schoolsup)   1   521.47 533.43
## + as.factor(guardian)    2   504.18 534.07
## + as.factor(reason)      3   504.41 540.29
## + as.factor(Mjob)        4   501.24 543.09
## + as.factor(Fjob)        4   504.54 546.39
## 
## Step:  AIC=522.54
## y_pass ~ as.factor(schoolsup) + Medu + goout
## 
##                         Df Deviance    AIC
## + absences               1   491.95 521.84
## <none>                       498.63 522.54
## + as.factor(address)     1   494.97 524.87
## - goout                  1   507.52 525.45
## + traveltime             1   495.85 525.74
## + as.factor(famsize)     1   496.10 526.00
## + as.factor(higher)      1   496.11 526.01
## + as.factor(famsup)      1   496.23 526.12
## + Walc                   1   497.20 527.10
## + Fedu                   1   497.28 527.18
## + studytime              1   497.30 527.20
## + freetime               1   497.63 527.52
## + as.factor(romantic)    1   497.73 527.62
## + as.factor(internet)    1   497.90 527.79
## + Dalc                   1   498.06 527.96
## + as.factor(paid)        1   498.10 528.00
## + famrel                 1   498.45 528.34
## + as.factor(Pstatus)     1   498.57 528.47
## + as.factor(activities)  1   498.58 528.48
## + as.factor(nursery)     1   498.58 528.48
## - Medu                   1   512.64 530.58
## - as.factor(schoolsup)   1   513.42 531.35
## + as.factor(guardian)    2   495.69 531.56
## + as.factor(reason)      3   495.91 537.77
## + as.factor(Mjob)        4   491.15 538.99
## + as.factor(Fjob)        4   496.13 543.97
## 
## Step:  AIC=521.84
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences
## 
##                         Df Deviance    AIC
## <none>                       491.95 521.84
## - absences               1   498.63 522.54
## - goout                  1   500.04 523.95
## + as.factor(address)     1   488.78 524.65
## + as.factor(famsize)     1   488.96 524.83
## + traveltime             1   489.13 525.00
## + as.factor(famsup)      1   489.47 525.34
## + as.factor(higher)      1   490.04 525.91
## + as.factor(internet)    1   490.85 526.72
## + Fedu                   1   490.89 526.77
## + studytime              1   491.03 526.90
## + Walc                   1   491.23 527.10
## + freetime               1   491.30 527.17
## + as.factor(paid)        1   491.34 527.21
## + as.factor(romantic)    1   491.68 527.55
## + Dalc                   1   491.71 527.58
## + famrel                 1   491.87 527.75
## + as.factor(nursery)     1   491.90 527.78
## + as.factor(Pstatus)     1   491.93 527.80
## + as.factor(activities)  1   491.94 527.81
## - as.factor(schoolsup)   1   506.73 530.64
## - Medu                   1   507.92 531.84
## + as.factor(guardian)    2   490.34 532.19
## + as.factor(reason)      3   488.70 536.53
## + as.factor(Mjob)        4   484.18 537.99
## + as.factor(Fjob)        4   489.85 543.66
# Using the BIC Criterion creates a model with a higher AIC

## From stepwise regression, step_model1 seems to be the best fit.
# Significant predictors (p-val<0.05): extra_edu_supp, Mother_edu, goout_with_friends, absences
summary(step_model1)
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(address) + as.factor(famsize) + as.factor(famsup) + 
##     as.factor(Mjob), family = binomial, data = mat)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.55606    0.51017  -1.090 0.275730    
## as.factor(schoolsup)yes -1.42606    0.40451  -3.525 0.000423 ***
## Medu                     0.52897    0.13909   3.803 0.000143 ***
## goout                   -0.33774    0.10466  -3.227 0.001250 ** 
## absences                -0.04096    0.01695  -2.416 0.015675 *  
## as.factor(address)U      0.42107    0.27870   1.511 0.130824    
## as.factor(famsize)LE3    0.41056    0.24752   1.659 0.097172 .  
## as.factor(famsup)yes    -0.35670    0.23398  -1.524 0.127391    
## as.factor(Mjob)health    0.29872    0.54631   0.547 0.584522    
## as.factor(Mjob)other    -0.07399    0.36371  -0.203 0.838797    
## as.factor(Mjob)services  0.10627    0.39554   0.269 0.788182    
## as.factor(Mjob)teacher  -0.84124    0.51252  -1.641 0.100716    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 475.91  on 383  degrees of freedom
## AIC: 499.91
## 
## Number of Fisher Scoring iterations: 4

Model 1: Determining Predictor Significance & Goodness of Fit

#### Significant Predictors:

### Full model likelihood
model1.logit <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + 
                      absences, data = mat, family = binomial)
summary(model1.logit)
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences, family = binomial, data = mat)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.24899    0.41479  -0.600 0.548324    
## as.factor(schoolsup)yes -1.37040    0.39287  -3.488 0.000486 ***
## Medu                     0.39679    0.10160   3.906  9.4e-05 ***
## goout                   -0.27963    0.09961  -2.807 0.004996 ** 
## absences                -0.03828    0.01622  -2.360 0.018282 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 491.95  on 390  degrees of freedom
## AIC: 501.95
## 
## Number of Fisher Scoring iterations: 4
linear_pred_full <- predict.glm(model1.logit, data=mat)
full_log_lik <- sum(mat$y_pass*linear_pred_full - log(1 + exp(linear_pred_full)))

### LR: schoolsup (should we drop schoolsup from the model?)
model_with_schoolsup <- glm(y_pass ~ Medu + goout + absences, 
                          family = binomial, data = mat)

pred_with_schoolsup <- predict.glm(model_with_schoolsup, mat)

ll_with_schoolsup <- sum(mat$y_pass * pred_with_schoolsup - log(1 + exp(pred_with_schoolsup)))

lrt_schoolsup <- -2*(ll_with_schoolsup - full_log_lik)

crit_val_schoolsup <- qchisq(0.95, 1)  # df = 1 for 1 added parameter
p_val_schoolsup <- pchisq(lrt_schoolsup, 1, lower.tail = FALSE)

lrt_schoolsup
## [1] 14.78016
crit_val_schoolsup
## [1] 3.841459
p_val_schoolsup ## p-val:0.00012<0.05 --> schoolsup is significant
## [1] 0.0001207997
### LR: Medu (should we drop Medu from the model?)
model_with_Medu <- glm(y_pass ~ as.factor(schoolsup) + goout + absences, 
                            family = binomial, data = mat)
pred_with_Medu <- predict.glm(model_with_Medu, mat)
ll_with_Medu <- sum(mat$y_pass * pred_with_Medu - log(1 + exp(pred_with_Medu)))
lrt_Medu <- -2*(ll_with_Medu - full_log_lik)
crit_val_Medu <- qchisq(0.95, 1)  # df = 1 for 1 added parameter
p_val_Medu <- pchisq(lrt_Medu, 1, lower.tail = FALSE)

lrt_Medu
## [1] 15.9776
crit_val_Medu
## [1] 3.841459
p_val_Medu ## p-val: 6.4e-05<0.05 --> Medu is significant
## [1] 6.409626e-05
### LR: goout (should we drop goout from the model?)
model_with_goout <- glm(y_pass ~ as.factor(schoolsup) + Medu + absences, 
                        family = binomial, data = mat)
pred_with_goout <- predict.glm(model_with_goout, mat)
ll_with_goout <- sum(mat$y_pass * pred_with_goout - log(1 + exp(pred_with_goout)))
lrt_goout <- -2 * (ll_with_goout - full_log_lik)
crit_val_goout <- qchisq(0.95, 1)  # df = 1 for 1 added parameter
p_val_goout <- pchisq(lrt_goout, 1, lower.tail = FALSE)

lrt_goout
## [1] 8.0913
crit_val_goout
## [1] 3.841459
p_val_goout ## p-val: 0.004<0.05 --> goout is significant
## [1] 0.004447824
### LR: absences (should we drop absences from the model?)
model_with_absences <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout, 
                           family = binomial, data = mat)
pred_with_absences <- predict.glm(model_with_absences, mat)
ll_with_absences <- sum(mat$y_pass * pred_with_absences - log(1 + exp(pred_with_absences)))
lrt_absences <- -2 * (ll_with_absences - full_log_lik)
crit_val_absences <- qchisq(0.95, 1)  # df = 1 for 1 added parameter
p_val_absences <- pchisq(lrt_absences, 1, lower.tail = FALSE)

lrt_absences
## [1] 6.681073
crit_val_absences
## [1] 3.841459
p_val_absences ## p-val:0.0097<0.05 --> absences is significant
## [1] 0.009744195
############# Pearson GOF test for model1.logit
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.

# Pearson GOF test for model1.logit
pearson_chi1 <- sum(residuals(model1.logit, type = "pearson")^2)
df1 <- model1.logit$df.residual
p_value1 <- pchisq(pearson_chi1, df1, lower.tail = FALSE)
crit_val1 <- qchisq(.95,df=df1-2)
cat("Model 1 - Pearson Chi-square:", pearson_chi1, "critical value:", crit_val1, "df:", df1, "p-value:", p_value1, "\n") #p-val=0.2325391>0.05 --> Can't reject null that model is fitting
## Model 1 - Pearson Chi-square: 392.5875 critical value: 434.9289 df: 390 p-value: 0.4537058
pi_hats1 <- predict.glm(model1.logit)

############## Hosmer-Lemeshow Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
hos_test1 <- hoslem.test(model1.logit$y,predict.glm(model1.logit, mat,type="response"),5)
obs5 <- hos_test1$observed
exp5 <- hos_test1$expected

crit_val1 <- qchisq(.95,4-2)
crit_val1
## [1] 5.991465
p_val1 <- pchisq(hos_test1$statistic, 4-2,lower.tail=F)
p_val1 ##p_val = 0.30>0.05 --> fail to reject the null hypothesis that model is fitting
## X-squared 
## 0.2996385

Model 2: Reassessing Insignificant Predictors & Goodness of Fit in Logit Model

###### Insignificant Predictors from Stepwise Model: 
## Likelihood Ratio Test
#H0: smaller number of parameters are sufficient

### LRT: address
model_with_address <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address), family = binomial, data = mat)
pred_with_address <- predict(model_with_address)
ll_with_address <- sum(mat$y_pass * pred_with_address - log(1 + exp(pred_with_address)))
model_bic <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences, 
                 family = binomial, data = mat)
pred_bic <- predict(model_bic)
ll_bic <- sum(mat$y_pass * pred_bic - log(1 + exp(pred_bic)))
lrt_address <- -2 * (ll_bic - ll_with_address)
crit_val_address <- qchisq(0.95, 1)
p_val_address <- pchisq(lrt_address, 1, lower.tail = FALSE)
p_val_address #p-val: 0.075>0.05 --> address  remains insignificant
## [1] 0.07522347
### LRT: famsize
model_with_famsize <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(famsize), 
                          family = binomial, data = mat)
pred_with_famsize <- predict(model_with_famsize)
ll_with_famsize <- sum(mat$y_pass * pred_with_famsize - log(1 + exp(pred_with_famsize)))
model_bic <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences, 
                 family = binomial, data = mat)
pred_bic <- predict(model_bic)
ll_bic <- sum(mat$y_pass * pred_bic - log(1 + exp(pred_bic)))
lrt_famsize <- -2 * (ll_bic - ll_with_famsize)
crit_val_famsize <- qchisq(0.95, 1)
p_val_famsize <- pchisq(lrt_famsize, 1, lower.tail = FALSE)
p_val_famsize #p-val: 0.084>0.05 --> famsize remains insignificant
## [1] 0.08400694
### LRT: famsup
model_with_famsup <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(famsup), 
                         family = binomial, data = mat)
pred_with_famsup <- predict(model_with_famsup)
ll_with_famsup <- sum(mat$y_pass * pred_with_famsup - log(1 + exp(pred_with_famsup)))
model_bic <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences, 
                 family = binomial, data = mat)
pred_bic <- predict(model_bic)
ll_bic <- sum(mat$y_pass * pred_bic - log(1 + exp(pred_bic)))
lrt_famsup <- -2 * (ll_bic - ll_with_famsup)
crit_val_famsup <- qchisq(0.95, 1)
p_val_famsup <- pchisq(lrt_famsup, 1, lower.tail = FALSE)
p_val_famsup #p-val: 0.12>0.05 --> famsup remains insignificant
## [1] 0.1153993
### LRT: MJob
model_with_Mjob <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(Mjob), 
                       family = binomial, data = mat)
pred_with_Mjob <- predict(model_with_Mjob)
ll_with_Mjob <- sum(mat$y_pass * pred_with_Mjob - log(1 + exp(pred_with_Mjob)))
model_bic <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences, 
                 family = binomial, data = mat)
pred_bic <- predict(model_bic)
ll_bic <- sum(mat$y_pass * pred_bic - log(1 + exp(pred_bic)))
lrt_Mjob <- -2 * (ll_bic - ll_with_Mjob)
crit_val_Mjob <- qchisq(0.95, 1)
p_val_Mjob <- pchisq(lrt_Mjob, 1, lower.tail = FALSE)
p_val_Mjob #p-val:0.00532<0.05 --> MJob significantly contributes to the model
## [1] 0.005324737
###### After adding Mjob from LRT: model2.logit
model2.logit <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + 
                     absences + as.factor(Mjob), data = mat, family = binomial)

############# Pearson GOF test for model2.logit
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
pearson_chi1 <- sum(residuals(model2.logit, type = "pearson")^2)
df2 <- model2.logit$df.residual
p_value2 <- pchisq(pearson_chi1, df2, lower.tail = FALSE)
crit_val2 <- qchisq(.95,df=df2-2)
cat("Model 2 - Pearson Chi-square:", pearson_chi1, "critical value:", crit_val2, "df:", df2, "p-value:", p_value2, "\n") #p-val=0.416>0.05 --> Can't reject null that model is fitting
## Model 2 - Pearson Chi-square: 391.2732 critical value: 430.6919 df: 386 p-value: 0.4157033
pi_hats1 <- predict.glm(model2.logit)

############## Hosmer-Lemeshow Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
hos_test2 <- hoslem.test(model2.logit$y,predict.glm(model2.logit, mat,type="response"),5)
obs2 <- hos_test2$observed
exp2 <- hos_test2$expected

crit.val2 <- qchisq(.95,4-2)
crit.val2
## [1] 5.991465
p_val2 <- pchisq(hos_test2$statistic, 4-2,lower.tail=F)
p_val2 ##p_val = 0.739>0.05 --> fail to reject the null hypothesis that model is fitting
## X-squared 
## 0.7390676
## Checking model fits
BIC(model2.logit) #BIC 537.99
## [1] 537.9896
summary(model2.logit) #502.18
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(Mjob), family = binomial, data = mat)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.39219    0.46708  -0.840 0.401097    
## as.factor(schoolsup)yes -1.45179    0.39989  -3.630 0.000283 ***
## Medu                     0.47128    0.13458   3.502 0.000462 ***
## goout                   -0.30321    0.10124  -2.995 0.002745 ** 
## absences                -0.03993    0.01663  -2.401 0.016360 *  
## as.factor(Mjob)health    0.45347    0.53529   0.847 0.396912    
## as.factor(Mjob)other     0.03785    0.35816   0.106 0.915828    
## as.factor(Mjob)services  0.25473    0.38720   0.658 0.510622    
## as.factor(Mjob)teacher  -0.63241    0.49895  -1.267 0.204984    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 484.18  on 386  degrees of freedom
## AIC: 502.18
## 
## Number of Fisher Scoring iterations: 4

Model 3: Probit Model on Model 1

model_full_probit <- glm(y_pass ~ as.factor(address) + as.factor(famsize) + 
                    as.factor(Pstatus) + Medu + Fedu + as.factor(Mjob) + as.factor(Fjob) + as.factor(reason) + as.factor(guardian) + traveltime + studytime + as.factor(schoolsup) + as.factor(famsup) + as.factor(paid) + as.factor(activities) + as.factor(nursery) + as.factor(higher) + as.factor(internet)  + as.factor(romantic) + famrel + freetime + goout + Dalc + Walc + absences, data = mat, family=binomial(link=probit))
summary(model_full_probit) # AIC: 528.74
## 
## Call:
## glm(formula = y_pass ~ as.factor(address) + as.factor(famsize) + 
##     as.factor(Pstatus) + Medu + Fedu + as.factor(Mjob) + as.factor(Fjob) + 
##     as.factor(reason) + as.factor(guardian) + traveltime + studytime + 
##     as.factor(schoolsup) + as.factor(famsup) + as.factor(paid) + 
##     as.factor(activities) + as.factor(nursery) + as.factor(higher) + 
##     as.factor(internet) + as.factor(romantic) + famrel + freetime + 
##     goout + Dalc + Walc + absences, family = binomial(link = probit), 
##     data = mat)
## 
## Coefficients:
##                              Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                 -0.940794   0.772905  -1.217 0.223521    
## as.factor(address)U          0.244100   0.189707   1.287 0.198193    
## as.factor(famsize)LE3        0.301113   0.158926   1.895 0.058135 .  
## as.factor(Pstatus)T          0.061230   0.235129   0.260 0.794549    
## Medu                         0.209102   0.105272   1.986 0.046999 *  
## Fedu                         0.096796   0.088866   1.089 0.276047    
## as.factor(Mjob)health        0.178180   0.361834   0.492 0.622412    
## as.factor(Mjob)other        -0.118577   0.236950  -0.500 0.616772    
## as.factor(Mjob)services     -0.012823   0.257203  -0.050 0.960236    
## as.factor(Mjob)teacher      -0.623989   0.338174  -1.845 0.065013 .  
## as.factor(Fjob)health       -0.136863   0.465759  -0.294 0.768873    
## as.factor(Fjob)other        -0.080812   0.329126  -0.246 0.806042    
## as.factor(Fjob)services     -0.118509   0.339401  -0.349 0.726961    
## as.factor(Fjob)teacher       0.335185   0.419208   0.800 0.423961    
## as.factor(reason)home       -0.025078   0.181334  -0.138 0.890006    
## as.factor(reason)other       0.156775   0.264164   0.593 0.552864    
## as.factor(reason)reputation  0.266692   0.186776   1.428 0.153329    
## as.factor(guardian)mother    0.013118   0.177637   0.074 0.941131    
## as.factor(guardian)other    -0.299083   0.306020  -0.977 0.328405    
## traveltime                  -0.096902   0.114283  -0.848 0.396485    
## studytime                    0.099212   0.091436   1.085 0.277904    
## as.factor(schoolsup)yes     -0.923133   0.238118  -3.877 0.000106 ***
## as.factor(famsup)yes        -0.280173   0.154300  -1.816 0.069407 .  
## as.factor(paid)yes          -0.050471   0.154958  -0.326 0.744644    
## as.factor(activities)yes    -0.025373   0.144195  -0.176 0.860321    
## as.factor(nursery)yes       -0.126675   0.180160  -0.703 0.481976    
## as.factor(higher)yes         0.403901   0.365616   1.105 0.269283    
## as.factor(internet)yes       0.179272   0.202669   0.885 0.376397    
## as.factor(romantic)yes      -0.084178   0.153128  -0.550 0.582508    
## famrel                      -0.003099   0.078593  -0.039 0.968542    
## freetime                     0.103452   0.077444   1.336 0.181606    
## goout                       -0.210082   0.074734  -2.811 0.004938 ** 
## Dalc                         0.042901   0.111406   0.385 0.700177    
## Walc                        -0.054071   0.082638  -0.654 0.512916    
## absences                    -0.020114   0.010838  -1.856 0.063463 .  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 458.74  on 360  degrees of freedom
## AIC: 528.74
## 
## Number of Fisher Scoring iterations: 5
# Null model
model_null_probit <- glm(y_pass ~ 1, data = mat, family=binomial(link=probit))

# Automated Stepwise Regression AIC 500.05
step_model1_probit <- stepAIC(model_null_probit,
                      scope = list(lower = model_null_probit, upper = model_full_probit),
                      direction = "both", family=binomial(link=probit))
## Start:  AIC=536.75
## y_pass ~ 1
## 
##                         Df Deviance    AIC
## + as.factor(schoolsup)   1   520.21 524.21
## + Medu                   1   521.53 525.53
## + Fedu                   1   526.47 530.47
## + goout                  1   528.20 532.20
## + absences               1   529.14 533.14
## + traveltime             1   529.99 533.99
## + as.factor(Mjob)        4   524.13 534.13
## + Walc                   1   530.38 534.38
## + as.factor(higher)      1   530.56 534.56
## + as.factor(address)     1   530.71 534.71
## <none>                       534.75 536.75
## + as.factor(famsize)     1   532.78 536.78
## + as.factor(internet)    1   532.79 536.79
## + studytime              1   532.88 536.88
## + Dalc                   1   533.16 537.16
## + as.factor(famsup)      1   533.54 537.54
## + as.factor(guardian)    2   531.69 537.69
## + as.factor(romantic)    1   534.54 538.54
## + freetime               1   534.60 538.60
## + as.factor(activities)  1   534.65 538.65
## + as.factor(nursery)     1   534.66 538.66
## + famrel                 1   534.70 538.70
## + as.factor(reason)      3   530.71 538.71
## + as.factor(Pstatus)     1   534.75 538.75
## + as.factor(paid)        1   534.75 538.75
## + as.factor(Fjob)        4   530.34 540.34
## 
## Step:  AIC=524.21
## y_pass ~ as.factor(schoolsup)
## 
##                         Df Deviance    AIC
## + Medu                   1   507.80 513.80
## + Fedu                   1   510.82 516.82
## + goout                  1   512.62 518.62
## + Walc                   1   514.03 520.03
## + absences               1   514.67 520.67
## + traveltime             1   515.02 521.02
## + as.factor(higher)      1   515.10 521.10
## + as.factor(address)     1   515.79 521.79
## + as.factor(Mjob)        4   510.68 522.68
## + as.factor(guardian)    2   516.14 524.14
## + studytime              1   518.15 524.15
## <none>                       520.21 524.21
## + Dalc                   1   518.33 524.33
## + as.factor(internet)    1   518.36 524.36
## + as.factor(famsize)     1   518.57 524.57
## + as.factor(famsup)      1   519.66 525.66
## + as.factor(romantic)    1   519.66 525.66
## + as.factor(reason)      3   515.77 525.77
## + as.factor(Fjob)        4   513.95 525.95
## + as.factor(activities)  1   519.99 525.99
## + as.factor(nursery)     1   520.00 526.00
## + as.factor(Pstatus)     1   520.17 526.17
## + freetime               1   520.17 526.17
## + famrel                 1   520.18 526.18
## + as.factor(paid)        1   520.20 526.20
## - as.factor(schoolsup)   1   534.75 536.75
## 
## Step:  AIC=513.8
## y_pass ~ as.factor(schoolsup) + Medu
## 
##                         Df Deviance    AIC
## + goout                  1   498.76 506.76
## + absences               1   500.09 508.09
## + Walc                   1   502.29 510.29
## + traveltime             1   504.80 512.80
## + as.factor(higher)      1   504.92 512.92
## + as.factor(address)     1   505.04 513.04
## + as.factor(famsup)      1   505.64 513.64
## + Dalc                   1   505.69 513.69
## + as.factor(famsize)     1   505.69 513.69
## <none>                       507.80 513.80
## + studytime              1   506.25 514.25
## + as.factor(guardian)    2   504.45 514.45
## + Fedu                   1   506.51 514.51
## + as.factor(romantic)    1   507.06 515.06
## + as.factor(paid)        1   507.31 515.31
## + as.factor(internet)    1   507.39 515.39
## + as.factor(Mjob)        4   501.53 515.53
## + as.factor(nursery)     1   507.76 515.76
## + as.factor(Pstatus)     1   507.76 515.76
## + famrel                 1   507.77 515.77
## + freetime               1   507.79 515.79
## + as.factor(activities)  1   507.79 515.79
## + as.factor(reason)      3   504.63 516.63
## + as.factor(Fjob)        4   504.82 518.82
## - Medu                   1   520.21 524.21
## - as.factor(schoolsup)   1   521.53 525.53
## 
## Step:  AIC=506.76
## y_pass ~ as.factor(schoolsup) + Medu + goout
## 
##                         Df Deviance    AIC
## + absences               1   491.81 501.81
## + as.factor(address)     1   495.12 505.12
## + traveltime             1   495.94 505.94
## + as.factor(higher)      1   496.17 506.17
## + as.factor(famsize)     1   496.26 506.26
## + as.factor(famsup)      1   496.32 506.32
## <none>                       498.76 506.76
## + as.factor(Mjob)        4   491.25 507.25
## + Walc                   1   497.34 507.34
## + Fedu                   1   497.35 507.35
## + studytime              1   497.56 507.56
## + as.factor(guardian)    2   495.71 507.71
## + freetime               1   497.73 507.73
## + as.factor(internet)    1   497.97 507.97
## + as.factor(romantic)    1   497.98 507.98
## + Dalc                   1   498.28 508.28
## + as.factor(paid)        1   498.32 508.32
## + famrel                 1   498.64 508.64
## + as.factor(Pstatus)     1   498.72 508.72
## + as.factor(nursery)     1   498.73 508.73
## + as.factor(activities)  1   498.73 508.73
## + as.factor(reason)      3   496.03 510.03
## + as.factor(Fjob)        4   496.46 512.46
## - goout                  1   507.80 513.80
## - Medu                   1   512.62 518.62
## - as.factor(schoolsup)   1   513.36 519.36
## 
## Step:  AIC=501.81
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences
## 
##                         Df Deviance    AIC
## + as.factor(address)     1   488.66 500.66
## + as.factor(famsize)     1   488.91 500.91
## + traveltime             1   488.94 500.94
## + as.factor(famsup)      1   489.33 501.33
## <none>                       491.81 501.81
## + as.factor(higher)      1   489.85 501.85
## + as.factor(Mjob)        4   483.97 501.97
## + as.factor(internet)    1   490.64 502.64
## + Fedu                   1   490.74 502.74
## + studytime              1   490.98 502.98
## + Walc                   1   491.08 503.08
## + freetime               1   491.15 503.15
## + as.factor(paid)        1   491.27 503.27
## + as.factor(romantic)    1   491.60 503.60
## + Dalc                   1   491.60 503.60
## + famrel                 1   491.77 503.77
## + as.factor(Pstatus)     1   491.79 503.79
## + as.factor(nursery)     1   491.79 503.79
## + as.factor(activities)  1   491.81 503.81
## + as.factor(guardian)    2   490.19 504.19
## + as.factor(reason)      3   488.59 504.59
## - absences               1   498.76 506.76
## + as.factor(Fjob)        4   489.92 507.92
## - goout                  1   500.09 508.09
## - as.factor(schoolsup)   1   506.47 514.47
## - Medu                   1   507.69 515.69
## 
## Step:  AIC=500.66
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address)
## 
##                         Df Deviance    AIC
## + as.factor(famsize)     1   486.05 500.05
## + as.factor(famsup)      1   486.19 500.19
## <none>                       488.66 500.66
## + as.factor(higher)      1   486.82 500.82
## + traveltime             1   487.28 501.28
## + as.factor(Mjob)        4   481.40 501.40
## + Fedu                   1   487.48 501.48
## + studytime              1   487.76 501.76
## - as.factor(address)     1   491.81 501.81
## + freetime               1   487.94 501.94
## + as.factor(paid)        1   488.06 502.06
## + as.factor(internet)    1   488.16 502.16
## + Walc                   1   488.29 502.29
## + as.factor(romantic)    1   488.43 502.43
## + as.factor(reason)      3   484.48 502.48
## + as.factor(nursery)     1   488.60 502.60
## + as.factor(activities)  1   488.62 502.62
## + Dalc                   1   488.63 502.63
## + famrel                 1   488.63 502.63
## + as.factor(Pstatus)     1   488.65 502.65
## + as.factor(guardian)    2   486.98 502.98
## - absences               1   495.12 505.12
## + as.factor(Fjob)        4   486.72 506.72
## - goout                  1   497.68 507.68
## - Medu                   1   502.72 512.72
## - as.factor(schoolsup)   1   503.72 513.72
## 
## Step:  AIC=500.05
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize)
## 
##                         Df Deviance    AIC
## <none>                       486.05 500.05
## + as.factor(famsup)      1   484.06 500.06
## + as.factor(higher)      1   484.19 500.19
## + as.factor(Mjob)        4   478.22 500.22
## + traveltime             1   484.25 500.25
## - as.factor(famsize)     1   488.66 500.66
## + Fedu                   1   484.75 500.75
## - as.factor(address)     1   488.91 500.91
## + studytime              1   484.93 500.93
## + freetime               1   485.34 501.34
## + as.factor(paid)        1   485.44 501.44
## + Walc                   1   485.44 501.44
## + as.factor(internet)    1   485.49 501.49
## + as.factor(romantic)    1   485.78 501.78
## + as.factor(nursery)     1   485.86 501.86
## + as.factor(reason)      3   481.90 501.90
## + Dalc                   1   485.90 501.90
## + as.factor(activities)  1   486.00 502.00
## + famrel                 1   486.01 502.01
## + as.factor(Pstatus)     1   486.03 502.03
## + as.factor(guardian)    2   484.51 502.51
## - absences               1   492.94 504.94
## + as.factor(Fjob)        4   484.00 506.00
## - goout                  1   495.42 507.42
## - as.factor(schoolsup)   1   500.67 512.67
## - Medu                   1   500.82 512.82
# Automated Stepwise Regression AIC 500.05
step_model2_probit <- stepAIC(model_null_probit,
                      scope = list(lower = model_null_probit, upper = model_full_probit),
                      direction = "forward", family=binomial(link=probit)) 
## Start:  AIC=536.75
## y_pass ~ 1
## 
##                         Df Deviance    AIC
## + as.factor(schoolsup)   1   520.21 524.21
## + Medu                   1   521.53 525.53
## + Fedu                   1   526.47 530.47
## + goout                  1   528.20 532.20
## + absences               1   529.14 533.14
## + traveltime             1   529.99 533.99
## + as.factor(Mjob)        4   524.13 534.13
## + Walc                   1   530.38 534.38
## + as.factor(higher)      1   530.56 534.56
## + as.factor(address)     1   530.71 534.71
## <none>                       534.75 536.75
## + as.factor(famsize)     1   532.78 536.78
## + as.factor(internet)    1   532.79 536.79
## + studytime              1   532.88 536.88
## + Dalc                   1   533.16 537.16
## + as.factor(famsup)      1   533.54 537.54
## + as.factor(guardian)    2   531.69 537.69
## + as.factor(romantic)    1   534.54 538.54
## + freetime               1   534.60 538.60
## + as.factor(activities)  1   534.65 538.65
## + as.factor(nursery)     1   534.66 538.66
## + famrel                 1   534.70 538.70
## + as.factor(reason)      3   530.71 538.71
## + as.factor(Pstatus)     1   534.75 538.75
## + as.factor(paid)        1   534.75 538.75
## + as.factor(Fjob)        4   530.34 540.34
## 
## Step:  AIC=524.21
## y_pass ~ as.factor(schoolsup)
## 
##                         Df Deviance    AIC
## + Medu                   1   507.80 513.80
## + Fedu                   1   510.82 516.82
## + goout                  1   512.62 518.62
## + Walc                   1   514.03 520.03
## + absences               1   514.67 520.67
## + traveltime             1   515.02 521.02
## + as.factor(higher)      1   515.10 521.10
## + as.factor(address)     1   515.79 521.79
## + as.factor(Mjob)        4   510.68 522.68
## + as.factor(guardian)    2   516.14 524.14
## + studytime              1   518.15 524.15
## <none>                       520.21 524.21
## + Dalc                   1   518.33 524.33
## + as.factor(internet)    1   518.36 524.36
## + as.factor(famsize)     1   518.57 524.57
## + as.factor(famsup)      1   519.66 525.66
## + as.factor(romantic)    1   519.66 525.66
## + as.factor(reason)      3   515.77 525.77
## + as.factor(Fjob)        4   513.95 525.95
## + as.factor(activities)  1   519.99 525.99
## + as.factor(nursery)     1   520.00 526.00
## + as.factor(Pstatus)     1   520.17 526.17
## + freetime               1   520.17 526.17
## + famrel                 1   520.18 526.18
## + as.factor(paid)        1   520.20 526.20
## 
## Step:  AIC=513.8
## y_pass ~ as.factor(schoolsup) + Medu
## 
##                         Df Deviance    AIC
## + goout                  1   498.76 506.76
## + absences               1   500.09 508.09
## + Walc                   1   502.29 510.29
## + traveltime             1   504.80 512.80
## + as.factor(higher)      1   504.92 512.92
## + as.factor(address)     1   505.04 513.04
## + as.factor(famsup)      1   505.64 513.64
## + Dalc                   1   505.69 513.69
## + as.factor(famsize)     1   505.69 513.69
## <none>                       507.80 513.80
## + studytime              1   506.25 514.25
## + as.factor(guardian)    2   504.45 514.45
## + Fedu                   1   506.51 514.51
## + as.factor(romantic)    1   507.06 515.06
## + as.factor(paid)        1   507.31 515.31
## + as.factor(internet)    1   507.39 515.39
## + as.factor(Mjob)        4   501.53 515.53
## + as.factor(nursery)     1   507.76 515.76
## + as.factor(Pstatus)     1   507.76 515.76
## + famrel                 1   507.77 515.77
## + freetime               1   507.79 515.79
## + as.factor(activities)  1   507.79 515.79
## + as.factor(reason)      3   504.63 516.63
## + as.factor(Fjob)        4   504.82 518.82
## 
## Step:  AIC=506.76
## y_pass ~ as.factor(schoolsup) + Medu + goout
## 
##                         Df Deviance    AIC
## + absences               1   491.81 501.81
## + as.factor(address)     1   495.12 505.12
## + traveltime             1   495.94 505.94
## + as.factor(higher)      1   496.17 506.17
## + as.factor(famsize)     1   496.26 506.26
## + as.factor(famsup)      1   496.32 506.32
## <none>                       498.76 506.76
## + as.factor(Mjob)        4   491.25 507.25
## + Walc                   1   497.34 507.34
## + Fedu                   1   497.35 507.35
## + studytime              1   497.56 507.56
## + as.factor(guardian)    2   495.71 507.71
## + freetime               1   497.73 507.73
## + as.factor(internet)    1   497.97 507.97
## + as.factor(romantic)    1   497.98 507.98
## + Dalc                   1   498.28 508.28
## + as.factor(paid)        1   498.32 508.32
## + famrel                 1   498.64 508.64
## + as.factor(Pstatus)     1   498.72 508.72
## + as.factor(nursery)     1   498.73 508.73
## + as.factor(activities)  1   498.73 508.73
## + as.factor(reason)      3   496.03 510.03
## + as.factor(Fjob)        4   496.46 512.46
## 
## Step:  AIC=501.81
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences
## 
##                         Df Deviance    AIC
## + as.factor(address)     1   488.66 500.66
## + as.factor(famsize)     1   488.91 500.91
## + traveltime             1   488.94 500.94
## + as.factor(famsup)      1   489.33 501.33
## <none>                       491.81 501.81
## + as.factor(higher)      1   489.85 501.85
## + as.factor(Mjob)        4   483.97 501.97
## + as.factor(internet)    1   490.64 502.64
## + Fedu                   1   490.74 502.74
## + studytime              1   490.98 502.98
## + Walc                   1   491.08 503.08
## + freetime               1   491.15 503.15
## + as.factor(paid)        1   491.27 503.27
## + as.factor(romantic)    1   491.60 503.60
## + Dalc                   1   491.60 503.60
## + famrel                 1   491.77 503.77
## + as.factor(Pstatus)     1   491.79 503.79
## + as.factor(nursery)     1   491.79 503.79
## + as.factor(activities)  1   491.81 503.81
## + as.factor(guardian)    2   490.19 504.19
## + as.factor(reason)      3   488.59 504.59
## + as.factor(Fjob)        4   489.92 507.92
## 
## Step:  AIC=500.66
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address)
## 
##                         Df Deviance    AIC
## + as.factor(famsize)     1   486.05 500.05
## + as.factor(famsup)      1   486.19 500.19
## <none>                       488.66 500.66
## + as.factor(higher)      1   486.82 500.82
## + traveltime             1   487.28 501.28
## + as.factor(Mjob)        4   481.40 501.40
## + Fedu                   1   487.48 501.48
## + studytime              1   487.76 501.76
## + freetime               1   487.94 501.94
## + as.factor(paid)        1   488.06 502.06
## + as.factor(internet)    1   488.16 502.16
## + Walc                   1   488.29 502.29
## + as.factor(romantic)    1   488.43 502.43
## + as.factor(reason)      3   484.48 502.48
## + as.factor(nursery)     1   488.60 502.60
## + as.factor(activities)  1   488.62 502.62
## + Dalc                   1   488.63 502.63
## + famrel                 1   488.63 502.63
## + as.factor(Pstatus)     1   488.65 502.65
## + as.factor(guardian)    2   486.98 502.98
## + as.factor(Fjob)        4   486.72 506.72
## 
## Step:  AIC=500.05
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(address) + 
##     as.factor(famsize)
## 
##                         Df Deviance    AIC
## <none>                       486.05 500.05
## + as.factor(famsup)      1   484.06 500.06
## + as.factor(higher)      1   484.19 500.19
## + as.factor(Mjob)        4   478.22 500.22
## + traveltime             1   484.25 500.25
## + Fedu                   1   484.75 500.75
## + studytime              1   484.93 500.93
## + freetime               1   485.34 501.34
## + as.factor(paid)        1   485.44 501.44
## + Walc                   1   485.44 501.44
## + as.factor(internet)    1   485.49 501.49
## + as.factor(romantic)    1   485.78 501.78
## + as.factor(nursery)     1   485.86 501.86
## + as.factor(reason)      3   481.90 501.90
## + Dalc                   1   485.90 501.90
## + as.factor(activities)  1   486.00 502.00
## + famrel                 1   486.01 502.01
## + as.factor(Pstatus)     1   486.03 502.03
## + as.factor(guardian)    2   484.51 502.51
## + as.factor(Fjob)        4   484.00 506.00
# Automated Stepwise Regression AIC 536.75
step_model3_probit <- stepAIC(model_null_probit,
                      scope = list(lower = model_null_probit, upper = model_full_probit),
                      direction = "back", family=binomial(link=probit)) 
## Start:  AIC=536.75
## y_pass ~ 1
# Automated Stepwise Regression Using BIC Criterion: AIC 521.71
step_model4_probit <- stepAIC(model_null_probit,
                       scope = list(lower = model_null_probit, upper = model_full_probit),
                       direction = "both", family=binomial(link=probit),
                       k = log(n)) 
## Start:  AIC=540.73
## y_pass ~ 1
## 
##                         Df Deviance    AIC
## + as.factor(schoolsup)   1   520.21 532.17
## + Medu                   1   521.53 533.49
## + Fedu                   1   526.47 538.43
## + goout                  1   528.20 540.16
## <none>                       534.75 540.73
## + absences               1   529.14 541.10
## + traveltime             1   529.99 541.95
## + Walc                   1   530.38 542.34
## + as.factor(higher)      1   530.56 542.51
## + as.factor(address)     1   530.71 542.67
## + as.factor(famsize)     1   532.78 544.74
## + as.factor(internet)    1   532.79 544.74
## + studytime              1   532.88 544.84
## + Dalc                   1   533.16 545.12
## + as.factor(famsup)      1   533.54 545.50
## + as.factor(romantic)    1   534.54 546.50
## + freetime               1   534.60 546.56
## + as.factor(activities)  1   534.65 546.61
## + as.factor(nursery)     1   534.66 546.62
## + famrel                 1   534.70 546.66
## + as.factor(Pstatus)     1   534.75 546.71
## + as.factor(paid)        1   534.75 546.71
## + as.factor(guardian)    2   531.69 549.63
## + as.factor(Mjob)        4   524.13 554.03
## + as.factor(reason)      3   530.71 554.63
## + as.factor(Fjob)        4   530.34 560.23
## 
## Step:  AIC=532.17
## y_pass ~ as.factor(schoolsup)
## 
##                         Df Deviance    AIC
## + Medu                   1   507.80 525.73
## + Fedu                   1   510.82 528.75
## + goout                  1   512.62 530.56
## + Walc                   1   514.03 531.97
## <none>                       520.21 532.17
## + absences               1   514.67 532.60
## + traveltime             1   515.02 532.96
## + as.factor(higher)      1   515.10 533.04
## + as.factor(address)     1   515.79 533.72
## + studytime              1   518.15 536.09
## + Dalc                   1   518.33 536.27
## + as.factor(internet)    1   518.36 536.29
## + as.factor(famsize)     1   518.57 536.51
## + as.factor(famsup)      1   519.66 537.59
## + as.factor(romantic)    1   519.66 537.60
## + as.factor(activities)  1   519.99 537.93
## + as.factor(nursery)     1   520.00 537.93
## + as.factor(Pstatus)     1   520.17 538.10
## + freetime               1   520.17 538.10
## + famrel                 1   520.18 538.12
## + as.factor(paid)        1   520.20 538.13
## + as.factor(guardian)    2   516.14 540.06
## - as.factor(schoolsup)   1   534.75 540.73
## + as.factor(reason)      3   515.77 545.67
## + as.factor(Mjob)        4   510.68 546.56
## + as.factor(Fjob)        4   513.95 549.82
## 
## Step:  AIC=525.73
## y_pass ~ as.factor(schoolsup) + Medu
## 
##                         Df Deviance    AIC
## + goout                  1   498.76 522.68
## + absences               1   500.09 524.01
## <none>                       507.80 525.73
## + Walc                   1   502.29 526.21
## + traveltime             1   504.80 528.71
## + as.factor(higher)      1   504.92 528.84
## + as.factor(address)     1   505.04 528.95
## + as.factor(famsup)      1   505.64 529.56
## + Dalc                   1   505.69 529.60
## + as.factor(famsize)     1   505.69 529.61
## + studytime              1   506.25 530.16
## + Fedu                   1   506.51 530.42
## + as.factor(romantic)    1   507.06 530.98
## + as.factor(paid)        1   507.31 531.23
## + as.factor(internet)    1   507.39 531.30
## + as.factor(nursery)     1   507.76 531.67
## + as.factor(Pstatus)     1   507.76 531.68
## + famrel                 1   507.77 531.69
## + freetime               1   507.79 531.70
## + as.factor(activities)  1   507.79 531.71
## - Medu                   1   520.21 532.17
## - as.factor(schoolsup)   1   521.53 533.49
## + as.factor(guardian)    2   504.45 534.34
## + as.factor(reason)      3   504.63 540.51
## + as.factor(Mjob)        4   501.53 543.38
## + as.factor(Fjob)        4   504.82 546.68
## 
## Step:  AIC=522.68
## y_pass ~ as.factor(schoolsup) + Medu + goout
## 
##                         Df Deviance    AIC
## + absences               1   491.81 521.71
## <none>                       498.76 522.68
## + as.factor(address)     1   495.12 525.01
## - goout                  1   507.80 525.73
## + traveltime             1   495.94 525.83
## + as.factor(higher)      1   496.17 526.06
## + as.factor(famsize)     1   496.26 526.16
## + as.factor(famsup)      1   496.32 526.22
## + Walc                   1   497.34 527.23
## + Fedu                   1   497.35 527.25
## + studytime              1   497.56 527.46
## + freetime               1   497.73 527.63
## + as.factor(internet)    1   497.97 527.86
## + as.factor(romantic)    1   497.98 527.87
## + Dalc                   1   498.28 528.17
## + as.factor(paid)        1   498.32 528.21
## + famrel                 1   498.64 528.53
## + as.factor(Pstatus)     1   498.72 528.62
## + as.factor(nursery)     1   498.73 528.63
## + as.factor(activities)  1   498.73 528.63
## - Medu                   1   512.62 530.56
## - as.factor(schoolsup)   1   513.36 531.30
## + as.factor(guardian)    2   495.71 531.58
## + as.factor(reason)      3   496.03 537.88
## + as.factor(Mjob)        4   491.25 539.08
## + as.factor(Fjob)        4   496.46 544.29
## 
## Step:  AIC=521.71
## y_pass ~ as.factor(schoolsup) + Medu + goout + absences
## 
##                         Df Deviance    AIC
## <none>                       491.81 521.71
## - absences               1   498.76 522.68
## - goout                  1   500.09 524.01
## + as.factor(address)     1   488.66 524.54
## + as.factor(famsize)     1   488.91 524.79
## + traveltime             1   488.94 524.82
## + as.factor(famsup)      1   489.33 525.20
## + as.factor(higher)      1   489.85 525.72
## + as.factor(internet)    1   490.64 526.51
## + Fedu                   1   490.74 526.61
## + studytime              1   490.98 526.85
## + Walc                   1   491.08 526.95
## + freetime               1   491.15 527.03
## + as.factor(paid)        1   491.27 527.15
## + as.factor(romantic)    1   491.60 527.48
## + Dalc                   1   491.60 527.48
## + famrel                 1   491.77 527.64
## + as.factor(Pstatus)     1   491.79 527.66
## + as.factor(nursery)     1   491.79 527.66
## + as.factor(activities)  1   491.81 527.68
## - as.factor(schoolsup)   1   506.47 530.39
## - Medu                   1   507.69 531.61
## + as.factor(guardian)    2   490.19 532.05
## + as.factor(reason)      3   488.59 536.42
## + as.factor(Mjob)        4   483.97 537.78
## + as.factor(Fjob)        4   489.92 543.73
# Using the BIC Criterion creates a model with a higher AIC, predictors: schoolsup, Medu, goout, absences

## From stepwise regression, step_model1_probit seems to be the best fit of probit models
# Significant predictors (p-val<0.05): school_supp, Medu, goout, absences
summary(step_model1_probit)
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(address) + as.factor(famsize), family = binomial(link = probit), 
##     data = mat)
## 
## Coefficients:
##                          Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.376979   0.274384  -1.374 0.169470    
## as.factor(schoolsup)yes -0.811741   0.223711  -3.629 0.000285 ***
## Medu                     0.237768   0.062342   3.814 0.000137 ***
## goout                   -0.185231   0.061324  -3.021 0.002523 ** 
## absences                -0.024455   0.009761  -2.505 0.012232 *  
## as.factor(address)U      0.276286   0.164708   1.677 0.093459 .  
## as.factor(famsize)LE3    0.235701   0.146219   1.612 0.106967    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 486.05  on 388  degrees of freedom
## AIC: 500.05
## 
## Number of Fisher Scoring iterations: 4
##### Supports significant predictors found in logit model 1: schoolsup, Medu, goout, absences
model3.probit <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences, data = mat, family = binomial(link=probit))

######## Pearson Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
pearson_chi3 <- sum(residuals(model3.probit, type = "pearson")^2)
df3 <- model3.probit$df.residual
p_value3 <- pchisq(pearson_chi3, df3, lower.tail = FALSE)
crit_val3 <- qchisq(.95,df=df3-2)
cat("Model 3 - Pearson Chi-square:", pearson_chi3, "critical value:", crit_val3, "df:", df3, "p-value:", p_value3, "\n") #p-val: 0.453 --> Can't reject null that model is fitting
## Model 3 - Pearson Chi-square: 392.6587 critical value: 434.9289 df: 390 p-value: 0.4527007
pi_hats3 <- predict.glm(model3.probit)

######## Hosmer-Lemeshow Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
hos_test3 <- hoslem.test(model3.probit$y,predict.glm(model3.probit, mat,type="response"),5)
obs3 <- hos_test3$observed
exp3 <- hos_test3$expected

crit_val3 <- qchisq(.95,4-2)
crit_val3
## [1] 5.991465
p_val3 <- pchisq(hos_test3$statistic, 4-2,lower.tail=F)
p_val3 ##p_val = 0.207 >0.05 --> fail to reject the null hypothesis that model is fitting
## X-squared 
## 0.2070925

Model 4: Probit Model on Model 2

# Probit model on Model 2
model4.probit <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences + 
                           as.factor(Mjob) + Medu * as.factor(Mjob), 
                         data = mat, family = binomial(link = "probit"))

######## Pearson Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
pearson_chi4 <- sum(residuals(model4.probit, type = "pearson")^2)
df4 <- model4.probit$df.residual
p_value4 <- pchisq(pearson_chi4, df4, lower.tail = FALSE)
crit_val4 <- qchisq(.95,df=df4-2)
cat("Model 4 - Pearson Chi-square:", pearson_chi4, "critical value:", crit_val4, "df:", df4, "p-value:", p_value4, "\n") #p-val: 0.453 --> Can't reject null that model is fitting
## Model 4 - Pearson Chi-square: 385.0345 critical value: 426.4537 df: 382 p-value: 0.4468596
pi_hats4 <- predict.glm(model4.probit)

Comparing Logit and Probit Models

#Hosmer-Lemeshow Goodness-of-Fit to compare probit and logit
# Hosmer-Lemeshow GOF for logit model (with 5 groups)
hoslem_test_logit <- hoslem.test(mat$y_pass,
                                 predict(model2.logit, type = "response"),
                                 g = 5)
obs_logit <- hoslem_test_logit$observed
exp_logit <- hoslem_test_logit$expected
HL_stat_logit <- sum((obs_logit - exp_logit)^2 / exp_logit)
crit_val_logit <- qchisq(0.95, 5 - 2)
p_val_logit <- pchisq(HL_stat_logit, 5 - 2, lower.tail = FALSE)
cat("Logit HL stat =", HL_stat_logit, "\nCritical value =", crit_val_logit,
    "\nP-value =", p_val_logit, "\n")
## Logit HL stat = 0.6047318 
## Critical value = 7.814728 
## P-value = 0.8953483
# Hosmer-Lemeshow GOF for probit model (with 5 groups)
hoslem_test_probit <- hoslem.test(mat$y_pass,
                                  predict(model4.probit, type = "response"),
                                  g = 5)
obs_probit <- hoslem_test_probit$observed
exp_probit <- hoslem_test_probit$expected
HL_stat_probit <- sum((obs_probit - exp_probit)^2 / exp_probit)
crit_val_probit <- qchisq(0.95, 5 - 2)
p_val_probit <- pchisq(HL_stat_probit, 5 - 2, lower.tail = FALSE)
cat("Probit HL stat =", HL_stat_probit, "\nCritical value =", crit_val_probit,
    "\nP-value =", p_val_probit, "\n")
## Probit HL stat = 1.681302 
## Critical value = 7.814728 
## P-value = 0.6410988
### Both models are good, AIC and Hosmer-Lemeshow are very close factors. Thus, for simplicity, 
#choose the logistic model. With the model, school supplies, Mother education, going out,
# absences, and Mother job explained final performance the best. 

Logit and probit models are comparable, but logit models have a consistently lower AIC and thus likely a better fit to the data. From this point on, only the logit models will be considered.

Model 5: Checking Interaction Terms in Logit Model 2

######################### Manually Checking Interaction Terms:##################

############################ For Model 2 Logit: #########################
# Checking goout * absences from Model 2
model2_gooutxabsences <- glm(y_pass ~ as.factor(schoolsup) * as.factor(Mjob) + Medu + 
                                      goout + absences +  goout*absences, family = binomial, data = mat)
summary(model2_gooutxabsences) #AIC 509.3 
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) * as.factor(Mjob) + 
##     Medu + goout + absences + goout * absences, family = binomial, 
##     data = mat)
## 
## Coefficients:
##                                                   Estimate Std. Error z value
## (Intercept)                                     -3.931e-01  5.341e-01  -0.736
## as.factor(schoolsup)yes                         -6.775e-01  8.762e-01  -0.773
## as.factor(Mjob)health                            6.045e-01  5.558e-01   1.088
## as.factor(Mjob)other                             1.411e-01  3.772e-01   0.374
## as.factor(Mjob)services                          3.194e-01  4.085e-01   0.782
## as.factor(Mjob)teacher                          -4.998e-01  5.121e-01  -0.976
## Medu                                             4.594e-01  1.348e-01   3.408
## goout                                           -3.224e-01  1.277e-01  -2.525
## absences                                        -5.196e-02  4.493e-02  -1.157
## as.factor(schoolsup)yes:as.factor(Mjob)health   -1.504e+01  1.027e+03  -0.015
## as.factor(schoolsup)yes:as.factor(Mjob)other    -9.879e-01  1.101e+00  -0.897
## as.factor(schoolsup)yes:as.factor(Mjob)services -5.753e-01  1.084e+00  -0.531
## as.factor(schoolsup)yes:as.factor(Mjob)teacher  -1.454e+01  7.992e+02  -0.018
## goout:absences                                   4.119e-03  1.406e-02   0.293
##                                                 Pr(>|z|)    
## (Intercept)                                     0.461685    
## as.factor(schoolsup)yes                         0.439380    
## as.factor(Mjob)health                           0.276751    
## as.factor(Mjob)other                            0.708334    
## as.factor(Mjob)services                         0.434370    
## as.factor(Mjob)teacher                          0.329115    
## Medu                                            0.000654 ***
## goout                                           0.011572 *  
## absences                                        0.247476    
## as.factor(schoolsup)yes:as.factor(Mjob)health   0.988318    
## as.factor(schoolsup)yes:as.factor(Mjob)other    0.369724    
## as.factor(schoolsup)yes:as.factor(Mjob)services 0.595730    
## as.factor(schoolsup)yes:as.factor(Mjob)teacher  0.985485    
## goout:absences                                  0.769652    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 481.30  on 381  degrees of freedom
## AIC: 509.3
## 
## Number of Fisher Scoring iterations: 14
# Checking Medu * go out from Model 2
model2_Meduxgoout <- glm(y_pass ~ as.factor(schoolsup) * as.factor(Mjob) + Medu + 
                                      goout + absences +  Medu*goout, family = binomial, data = mat)
summary(model2_Meduxgoout) #AIC 509.35 
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) * as.factor(Mjob) + 
##     Medu + goout + absences + Medu * goout, family = binomial, 
##     data = mat)
## 
## Coefficients:
##                                                   Estimate Std. Error z value
## (Intercept)                                       -0.30007    0.95106  -0.316
## as.factor(schoolsup)yes                           -0.69487    0.87842  -0.791
## as.factor(Mjob)health                              0.60595    0.55596   1.090
## as.factor(Mjob)other                               0.14514    0.37725   0.385
## as.factor(Mjob)services                            0.32772    0.40971   0.800
## as.factor(Mjob)teacher                            -0.48836    0.51271  -0.953
## Medu                                               0.40081    0.32806   1.222
## goout                                             -0.35477    0.29541  -1.201
## absences                                          -0.04027    0.01683  -2.393
## as.factor(schoolsup)yes:as.factor(Mjob)health    -15.01217 1027.95470  -0.015
## as.factor(schoolsup)yes:as.factor(Mjob)other      -0.96499    1.10291  -0.875
## as.factor(schoolsup)yes:as.factor(Mjob)services   -0.56313    1.08356  -0.520
## as.factor(schoolsup)yes:as.factor(Mjob)teacher   -14.50877  797.80575  -0.018
## Medu:goout                                         0.01974    0.09923   0.199
##                                                 Pr(>|z|)  
## (Intercept)                                       0.7524  
## as.factor(schoolsup)yes                           0.4289  
## as.factor(Mjob)health                             0.2758  
## as.factor(Mjob)other                              0.7004  
## as.factor(Mjob)services                           0.4238  
## as.factor(Mjob)teacher                            0.3408  
## Medu                                              0.2218  
## goout                                             0.2298  
## absences                                          0.0167 *
## as.factor(schoolsup)yes:as.factor(Mjob)health     0.9883  
## as.factor(schoolsup)yes:as.factor(Mjob)other      0.3816  
## as.factor(schoolsup)yes:as.factor(Mjob)services   0.6033  
## as.factor(schoolsup)yes:as.factor(Mjob)teacher    0.9855  
## Medu:goout                                        0.8423  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 481.35  on 381  degrees of freedom
## AIC: 509.35
## 
## Number of Fisher Scoring iterations: 14
# Checking as.factor(schoolsup) * Medu from Model 2
model2_schoolsupxMedu <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + 
                     absences + as.factor(Mjob) + as.factor(schoolsup) * Medu, data = mat, family = binomial)
summary(model2_schoolsupxMedu) #AIC 502.33
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(Mjob) + as.factor(schoolsup) * Medu, 
##     family = binomial, data = mat)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                  -0.543044   0.482244  -1.126 0.260133    
## as.factor(schoolsup)yes       0.001982   1.083571   0.002 0.998541    
## Medu                          0.526274   0.141776   3.712 0.000206 ***
## goout                        -0.293673   0.101701  -2.888 0.003882 ** 
## absences                     -0.041513   0.016898  -2.457 0.014022 *  
## as.factor(Mjob)health         0.383367   0.539408   0.711 0.477259    
## as.factor(Mjob)other          0.026291   0.357392   0.074 0.941358    
## as.factor(Mjob)services       0.245506   0.386966   0.634 0.525795    
## as.factor(Mjob)teacher       -0.704687   0.503694  -1.399 0.161801    
## as.factor(schoolsup)yes:Medu -0.535402   0.391995  -1.366 0.171989    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 482.33  on 385  degrees of freedom
## AIC: 502.33
## 
## Number of Fisher Scoring iterations: 4
# Checking Medu * goout from Model 2
model2_Meduxgoout <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + 
                              absences + as.factor(Mjob) + Medu * goout, data = mat, family = binomial)
summary(model2_Meduxgoout) #AIC 504.1 
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(Mjob) + Medu * goout, family = binomial, 
##     data = mat)
## 
## Coefficients:
##                         Estimate Std. Error z value Pr(>|z|)    
## (Intercept)             -0.16312    0.94387  -0.173 0.862796    
## as.factor(schoolsup)yes -1.46104    0.40121  -3.642 0.000271 ***
## Medu                     0.38825    0.32601   1.191 0.233694    
## goout                   -0.38045    0.29534  -1.288 0.197692    
## absences                -0.04032    0.01672  -2.411 0.015919 *  
## as.factor(Mjob)health    0.44638    0.53599   0.833 0.404955    
## as.factor(Mjob)other     0.04017    0.35876   0.112 0.910857    
## as.factor(Mjob)services  0.26198    0.38857   0.674 0.500180    
## as.factor(Mjob)teacher  -0.62334    0.49991  -1.247 0.212432    
## Medu:goout               0.02752    0.09864   0.279 0.780272    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 484.10  on 385  degrees of freedom
## AIC: 504.1
## 
## Number of Fisher Scoring iterations: 4
# Checking as.factor(schoolsup) * goout
model2_schoolsupxgoout <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + 
                               absences + as.factor(Mjob) + as.factor(schoolsup) * goout, data = mat, family = binomial)
summary(model2_schoolsupxgoout) #AIC: 503.8
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(Mjob) + as.factor(schoolsup) * goout, 
##     family = binomial, data = mat)
## 
## Coefficients:
##                               Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                   -0.46518    0.48208  -0.965 0.334575    
## as.factor(schoolsup)yes       -0.86552    1.02396  -0.845 0.397959    
## Medu                           0.47590    0.13518   3.521 0.000431 ***
## goout                         -0.28342    0.10601  -2.674 0.007505 ** 
## absences                      -0.03976    0.01668  -2.383 0.017166 *  
## as.factor(Mjob)health          0.44775    0.53624   0.835 0.403726    
## as.factor(Mjob)other           0.03327    0.35814   0.093 0.925980    
## as.factor(Mjob)services        0.25633    0.38710   0.662 0.507857    
## as.factor(Mjob)teacher        -0.63558    0.49949  -1.272 0.203207    
## as.factor(schoolsup)yes:goout -0.21166    0.34800  -0.608 0.543048    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 483.80  on 385  degrees of freedom
## AIC: 503.8
## 
## Number of Fisher Scoring iterations: 4
# Checking as.factor(Mjob) * Medu
model2_MjobxMedu <- glm(y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(Mjob) + as.factor(Mjob) * Medu, data = mat, family = binomial)
summary(model2_MjobxMedu) #AIC:498.84 --> LOWERS AIC
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(Mjob) + as.factor(Mjob) * Medu, family = binomial, 
##     data = mat)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.89124    0.71316   1.250 0.211408    
## as.factor(schoolsup)yes        -1.41320    0.40377  -3.500 0.000465 ***
## Medu                           -0.29139    0.37251  -0.782 0.434073    
## goout                          -0.30763    0.10289  -2.990 0.002790 ** 
## absences                       -0.03884    0.01671  -2.324 0.020100 *  
## as.factor(Mjob)health          -6.59602    3.50604  -1.881 0.059927 .  
## as.factor(Mjob)other           -1.55322    0.83184  -1.867 0.061874 .  
## as.factor(Mjob)services        -0.75082    0.98677  -0.761 0.446724    
## as.factor(Mjob)teacher        -59.68389 3319.80671  -0.018 0.985656    
## Medu:as.factor(Mjob)health      2.31714    0.97873   2.367 0.017909 *  
## Medu:as.factor(Mjob)other       0.88843    0.41794   2.126 0.033527 *  
## Medu:as.factor(Mjob)services    0.66606    0.44724   1.489 0.136422    
## Medu:as.factor(Mjob)teacher    15.22040  829.95177   0.018 0.985368    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 472.84  on 382  degrees of freedom
## AIC: 498.84
## 
## Number of Fisher Scoring iterations: 14
######## Determining whether the significant interaction term adds to Model 2 #######
## Likelihood Ratio Test: Does adding Mjob * Medu significantly improve the model fit?
# H0: The simpler model without the interaction term is sufficient.
# Ha: The simpler model is not sufficient without the interaction term.
anova(model2.logit, model2_MjobxMedu, test = "LRT") #p-val: 0.02297<0.05
## Analysis of Deviance Table
## 
## Model 1: y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(Mjob)
## Model 2: y_pass ~ as.factor(schoolsup) + Medu + goout + absences + as.factor(Mjob) + 
##     as.factor(Mjob) * Medu
##   Resid. Df Resid. Dev Df Deviance Pr(>Chi)  
## 1       386     484.18                       
## 2       382     472.84  4   11.343  0.02297 *
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
# Defining model2_MjobxMedu as model5.interaction
model5.interaction <- model2_MjobxMedu
summary(model5.interaction)
## 
## Call:
## glm(formula = y_pass ~ as.factor(schoolsup) + Medu + goout + 
##     absences + as.factor(Mjob) + as.factor(Mjob) * Medu, family = binomial, 
##     data = mat)
## 
## Coefficients:
##                                Estimate Std. Error z value Pr(>|z|)    
## (Intercept)                     0.89124    0.71316   1.250 0.211408    
## as.factor(schoolsup)yes        -1.41320    0.40377  -3.500 0.000465 ***
## Medu                           -0.29139    0.37251  -0.782 0.434073    
## goout                          -0.30763    0.10289  -2.990 0.002790 ** 
## absences                       -0.03884    0.01671  -2.324 0.020100 *  
## as.factor(Mjob)health          -6.59602    3.50604  -1.881 0.059927 .  
## as.factor(Mjob)other           -1.55322    0.83184  -1.867 0.061874 .  
## as.factor(Mjob)services        -0.75082    0.98677  -0.761 0.446724    
## as.factor(Mjob)teacher        -59.68389 3319.80671  -0.018 0.985656    
## Medu:as.factor(Mjob)health      2.31714    0.97873   2.367 0.017909 *  
## Medu:as.factor(Mjob)other       0.88843    0.41794   2.126 0.033527 *  
## Medu:as.factor(Mjob)services    0.66606    0.44724   1.489 0.136422    
## Medu:as.factor(Mjob)teacher    15.22040  829.95177   0.018 0.985368    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 534.75  on 394  degrees of freedom
## Residual deviance: 472.84  on 382  degrees of freedom
## AIC: 498.84
## 
## Number of Fisher Scoring iterations: 14
######## Pearson Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
pearson_chi5 <- sum(residuals(model5.interaction, type = "pearson")^2)
df5 <- model5.interaction$df.residual
p_value5 <- pchisq(pearson_chi5, df5, lower.tail = FALSE)
crit_val5 <- qchisq(.95,df=df5-2)
cat("Model 5 - Pearson Chi-square:", pearson_chi5, "critical value:", crit_val5, "df:", df5, "p-value:", p_value5, "\n") #p-val: 0.435 --> Can't reject null that model is fitting
## Model 5 - Pearson Chi-square: 385.8941 critical value: 426.4537 df: 382 p-value: 0.4346656
pi_hats5 <- predict.glm(model5.interaction)

######## Hosmer-Lemeshow Goodness of Fit Test
# H0: Model accurately fits observed data; Ha: Model does not accurately fit observed data.
hos_test5 <- hoslem.test(model5.interaction$y,predict.glm(model5.interaction, mat,type="response"),5)
obs5 <- hos_test5$observed
exp5 <- hos_test5$expected

crit_val5 <- qchisq(.95,5-2)
crit_val5
## [1] 7.814728
p_val5 <- pchisq(hos_test5$statistic, 5-2,lower.tail=F)
p_val5 ##p_val = 0.656 >0.05 --> fail to reject the null hypothesis that model is fitting
## X-squared 
##  0.655625

Checking Model Assumptions

##### Logistic Model Assumptions:

#1. Dependent variable is binary (pass(1) vs. fail(0)).

#2. Observations are collected independently of each other.

#3. No to minimal multicollinearity among independent variables. 
vif(model1.logit) # all are around 1, which is less than 5 --> minimal multicollinearity
## as.factor(schoolsup)                 Medu                goout 
##             1.006362             1.024805             1.017622 
##             absences 
##             1.020360
vif(model2.logit) # all are around 1, which is less than 5 --> minimal multicollinearity
##                          GVIF Df GVIF^(1/(2*Df))
## as.factor(schoolsup) 1.029432  1        1.014609
## Medu                 1.736719  1        1.317846
## goout                1.032023  1        1.015885
## absences             1.033095  1        1.016413
## as.factor(Mjob)      1.746609  4        1.072197
vif(model3.probit) # all are around 1, which is less than 5 --> minimal multicollinearity
## as.factor(schoolsup)                 Medu                goout 
##             1.004869             1.020436             1.014400 
##             absences 
##             1.017817
#4. A linear relationship exists between the independent variables and the log-odds of the dependent variable
###Box-Tidwell Test: determines whether transformations are needed on continuous variables

#Ho: There is a linear relationship btwn predictor and log(odds of response)
#Ha: There is no linear relationship btwn predictor and log(odds of response)
mat$Medu_adj <- mat$Medu + 1
mat$absences_adj <- mat$absences + 1
boxTidwell(y_pass ~ Medu_adj + goout + absences_adj, data = mat,max.iter=50)
## Warning in boxTidwell.default(y, X1, X2, max.iter = max.iter, tol = tol, :
## maximum iterations exceeded
##              MLE of lambda Score Statistic (t) Pr(>|t|)
## Medu_adj            2.1059              0.7864   0.4321
## goout               2.6089             -1.5654   0.1183
## absences_adj        1.3278             -0.6437   0.5201
## 
## iterations =  51 
## 
## Score test for null hypothesis that all lambdas = 1:
## F = 1.2868, df = 3 and 388, Pr(>F) = 0.2785
#p-val:0.2785 --> can't reject that there is a linear relationship btwn predictor and log(odds of response)

#5. Sample size is 395, which is sufficiently large.


### Converting log odds to normal odds
odds_ratios <- exp(coef(model5.interaction))
odds_ratios
##                  (Intercept)      as.factor(schoolsup)yes 
##                 2.438142e+00                 2.433626e-01 
##                         Medu                        goout 
##                 7.472218e-01                 7.351837e-01 
##                     absences        as.factor(Mjob)health 
##                 9.619015e-01                 1.365791e-03 
##         as.factor(Mjob)other      as.factor(Mjob)services 
##                 2.115662e-01                 4.719771e-01 
##       as.factor(Mjob)teacher   Medu:as.factor(Mjob)health 
##                 1.201203e-26                 1.014660e+01 
##    Medu:as.factor(Mjob)other Medu:as.factor(Mjob)services 
##                 2.431309e+00                 1.946545e+00 
##  Medu:as.factor(Mjob)teacher 
##                 4.075092e+06

Identifying Influential Points or Outliers

n <- nrow(mat)

# 1. Leverage points
length(coef(model5.interaction))
## [1] 13
leverage <- hatvalues(model5.interaction)
high_leverage_obs <- which(leverage > 0.0962) #0.0962 = 2*9/395
mat[high_leverage_obs, ]# shows the details of the 277th observation
##     address famsize Pstatus Medu Fedu    Mjob     Fjob     reason guardian
## 115       R     GT3       T    2    1  health services reputation   mother
## 128       U     GT3       T    0    1 at_home    other     course    other
## 147       U     GT3       T    3    2  health services       home   father
## 189       U     GT3       A    3    3  health    other reputation   mother
## 198       R     LE3       T    3    3 teacher    other       home   father
## 277       R     GT3       A    3    2   other services       home   mother
## 282       U     LE3       A    3    2 teacher services       home   mother
## 296       U     GT3       T    3    3  health    other       home   mother
## 304       U     GT3       T    3    2  health   health reputation   father
## 325       U     LE3       T    0    2 at_home  at_home       home   father
## 352       U     GT3       T    3    3  health    other     course   mother
## 364       U     LE3       T    4    4 at_home  at_home     course   mother
## 389       U     LE3       T    3    1 teacher services     course   mother
##     traveltime studytime schoolsup famsup paid activities nursery higher
## 115          1         2        no     no   no        yes     yes    yes
## 128          1         2        no    yes   no         no      no     no
## 147          1         2        no    yes   no         no     yes    yes
## 189          1         2        no    yes   no         no      no    yes
## 198          3         1        no    yes  yes        yes     yes    yes
## 277          2         2        no     no   no         no      no     no
## 282          1         1        no     no   no         no     yes    yes
## 296          1         1        no    yes  yes         no     yes    yes
## 304          1         4        no    yes  yes        yes      no    yes
## 325          2         3        no     no   no         no     yes    yes
## 352          2         2        no    yes  yes         no     yes    yes
## 364          1         2        no    yes  yes        yes     yes    yes
## 389          1         2        no    yes  yes         no     yes    yes
##     internet romantic famrel freetime goout Dalc Walc absences y_pass Medu_adj
## 115      yes      yes      5        4     2    1    1        8      0        3
## 128       no       no      3        4     2    1    1        2      0        1
## 147      yes       no      3        3     2    1    1        0      0        4
## 189      yes      yes      3        3     3    1    3        6      0        4
## 198      yes       no      3        3     4    3    5        8      0        4
## 277      yes      yes      4        1     1    1    1       75      0        4
## 282      yes       no      4        4     4    3    4       19      0        4
## 296      yes       no      4        4     3    1    3        4      0        4
## 304      yes       no      5        2     2    1    2        0      1        4
## 325      yes       no      3        3     3    2    3        0      1        1
## 352      yes       no      4        5     4    2    3        2      1        4
## 364      yes      yes      2        3     4    1    1        0      1        5
## 389      yes       no      4        3     4    1    1        0      0        4
##     absences_adj
## 115            9
## 128            3
## 147            1
## 189            7
## 198            9
## 277           76
## 282           20
## 296            5
## 304            1
## 325            1
## 352            3
## 364            1
## 389            1
leverage #the 277th student is an outlier
##          1          2          3          4          5          6          7 
## 0.04058564 0.03146899 0.03557356 0.03153174 0.01508999 0.03274560 0.01390466 
##          8          9         10         11         12         13         14 
## 0.04467583 0.01595798 0.02231262 0.02147394 0.02550955 0.03464832 0.02030136 
##         15         16         17         18         19         20         21 
## 0.01523081 0.04137353 0.03083871 0.03428446 0.02218808 0.03578240 0.02822080 
##         22         23         24         25         26         27         28 
## 0.03045839 0.02793080 0.01390466 0.04034667 0.03147361 0.01418677 0.04137353 
##         29         30         31         32         33         34         35 
## 0.02997921 0.02674738 0.03045839 0.03172352 0.02303019 0.01640835 0.01388250 
##         36         37         38         39         40         41         42 
## 0.02270297 0.02030136 0.02872982 0.03770903 0.03656291 0.02578816 0.01977585 
##         43         44         45         46         47         48         49 
## 0.03149156 0.05460326 0.01675191 0.05687633 0.01858116 0.03288500 0.02030136 
##         50         51         52         53         54         55         56 
## 0.05009569 0.02265836 0.03464832 0.05174569 0.05332741 0.01292071 0.01132162 
##         57         58         59         60         61         62         63 
## 0.03088153 0.02188963 0.02429605 0.03149156 0.04263602 0.02731213 0.01775091 
##         64         65         66         67         68         69         70 
## 0.03082777 0.05332741 0.02030136 0.02824696 0.02997921 0.04834435 0.01840030 
##         71         72         73         74         75         76         77 
## 0.01640835 0.02901431 0.01573455 0.01551868 0.02901731 0.01945813 0.01977585 
##         78         79         80         81         82         83         84 
## 0.01234008 0.03590729 0.06125915 0.02630992 0.02491016 0.01422332 0.02206417 
##         85         86         87         88         89         90         91 
## 0.02693931 0.03446910 0.02320960 0.02824696 0.02937081 0.02776942 0.01388250 
##         92         93         94         95         96         97         98 
## 0.04301042 0.02780567 0.02147394 0.02233481 0.04129094 0.05380555 0.01707140 
##         99        100        101        102        103        104        105 
## 0.03082136 0.05015605 0.03580046 0.03238523 0.03096424 0.01985795 0.01682303 
##        106        107        108        109        110        111        112 
## 0.02508992 0.02303421 0.01292442 0.03714906 0.05077757 0.01945813 0.03979163 
##        113        114        115        116        117        118        119 
## 0.02766625 0.02382242 0.18065701 0.02258948 0.02844396 0.01640835 0.02010766 
##        120        121        122        123        124        125        126 
## 0.01171907 0.03209062 0.02233481 0.01418677 0.05945944 0.01390466 0.03043952 
##        127        128        129        130        131        132        133 
## 0.03789789 0.10778684 0.02374719 0.02542531 0.01595798 0.03317372 0.01247278 
##        134        135        136        137        138        139        140 
## 0.03041949 0.07728853 0.03238523 0.06089592 0.01640835 0.05835832 0.02303019 
##        141        142        143        144        145        146        147 
## 0.04455135 0.02233494 0.02222378 0.03251891 0.01867795 0.02830424 0.16542873 
##        148        149        150        151        152        153        154 
## 0.03523749 0.02303019 0.03001262 0.02421348 0.02236204 0.02185017 0.01682303 
##        155        156        157        158        159        160        161 
## 0.07202144 0.02007680 0.02544349 0.03286600 0.03003641 0.01896572 0.03130619 
##        162        163        164        165        166        167        168 
## 0.02335466 0.02308935 0.03209062 0.02421348 0.02225267 0.01592756 0.03381810 
##        169        170        171        172        173        174        175 
## 0.01867795 0.03045839 0.02200656 0.02693931 0.02408725 0.03739936 0.01896572 
##        176        177        178        179        180        181        182 
## 0.02159540 0.02370980 0.01335668 0.02056374 0.02824696 0.02056374 0.01292442 
##        183        184        185        186        187        188        189 
## 0.02642953 0.08081027 0.02051010 0.01519888 0.05994138 0.01234008 0.14289925 
##        190        191        192        193        194        195        196 
## 0.03146899 0.02316543 0.03317372 0.03114580 0.01507436 0.01234008 0.02642953 
##        197        198        199        200        201        202        203 
## 0.03096424 0.33148519 0.04861653 0.02303019 0.05017386 0.01862283 0.02031865 
##        204        205        206        207        208        209        210 
## 0.02287444 0.02600320 0.05361743 0.01193356 0.02382242 0.03530327 0.02204244 
##        211        212        213        214        215        216        217 
## 0.01286689 0.04143996 0.01958877 0.02537555 0.03827509 0.01431408 0.04674601 
##        218        219        220        221        222        223        224 
## 0.01585573 0.02475875 0.02429883 0.02616543 0.03400816 0.05242466 0.01234008 
##        225        226        227        228        229        230        231 
## 0.02303019 0.01824394 0.01354668 0.02265836 0.02430560 0.01129968 0.04738735 
##        232        233        234        235        236        237        238 
## 0.01366350 0.02083724 0.04047347 0.02919148 0.06308034 0.01366350 0.02557650 
##        239        240        241        242        243        244        245 
## 0.03906454 0.01390466 0.05969428 0.02030136 0.03019470 0.03088153 0.01390466 
##        246        247        248        249        250        251        252 
## 0.01028043 0.01366350 0.02218808 0.01203543 0.04559726 0.02388165 0.03328578 
##        253        254        255        256        257        258        259 
## 0.02688068 0.01523081 0.01523081 0.04403309 0.01945813 0.03827509 0.01132162 
##        260        261        262        263        264        265        266 
## 0.03256298 0.04787231 0.02222378 0.01590788 0.01192479 0.02684646 0.01898362 
##        267        268        269        270        271        272        273 
## 0.01527299 0.02104306 0.05487436 0.01867795 0.02002982 0.01592756 0.02250498 
##        274        275        276        277        278        279        280 
## 0.03523749 0.02533425 0.02523864 0.14274079 0.03043901 0.08264064 0.01977585 
##        281        282        283        284        285        286        287 
## 0.05898731 0.21622243 0.03550255 0.01468082 0.01592756 0.02250498 0.02395040 
##        288        289        290        291        292        293        294 
## 0.05789746 0.02233481 0.02011255 0.02456944 0.03045839 0.02442295 0.01457937 
##        295        296        297        298        299        300        301 
## 0.01507436 0.14649063 0.03989813 0.03718365 0.02901431 0.02190460 0.05194822 
##        302        303        304        305        306        307        308 
## 0.03031780 0.02901431 0.16542873 0.02229233 0.02233494 0.01419883 0.04775829 
##        309        310        311        312        313        314        315 
## 0.01388250 0.02562496 0.05835832 0.04920252 0.02643007 0.03283363 0.04038634 
##        316        317        318        319        320        321        322 
## 0.05552421 0.02374719 0.03139330 0.06796615 0.02258948 0.05220875 0.01648860 
##        323        324        325        326        327        328        329 
## 0.02556350 0.01350234 0.10798765 0.02829640 0.01953287 0.02553888 0.02100967 
##        330        331        332        333        334        335        336 
## 0.01962983 0.01707140 0.02357010 0.01682303 0.01234008 0.02619318 0.02218808 
##        337        338        339        340        341        342        343 
## 0.01414171 0.01640835 0.01191394 0.01192479 0.02278338 0.02147394 0.01343905 
##        344        345        346        347        348        349        350 
## 0.03992941 0.02429883 0.01181220 0.02319461 0.03019470 0.03381810 0.01804595 
##        351        352        353        354        355        356        357 
## 0.01896895 0.13693823 0.03135592 0.02031865 0.04301042 0.01682303 0.01962983 
##        358        359        360        361        362        363        364 
## 0.01292442 0.02603095 0.03588096 0.03400816 0.03209062 0.01595798 0.13657549 
##        365        366        367        368        369        370        371 
## 0.02830424 0.03148012 0.02303019 0.03709216 0.02684646 0.03211269 0.01458060 
##        372        373        374        375        376        377        378 
## 0.03172320 0.01057403 0.01815535 0.03266719 0.02693931 0.03578240 0.01962983 
##        379        380        381        382        383        384        385 
## 0.01388250 0.04724616 0.02159540 0.01030776 0.01113530 0.02830424 0.03392743 
##        386        387        388        389        390        391        392 
## 0.02533425 0.01955682 0.02642953 0.45229240 0.03709216 0.02311323 0.02127747 
##        393        394        395 
## 0.02193733 0.02091195 0.02110802
par(mfrow = c(2, 2))
plot(leverage, main = "Leverage", ylab = "Hat Values")
abline(h = 2 * length(coef(model5.interaction)) / n, col = "red", lty = 2)

# 2. Cook's Distance
cooks_d <- cooks.distance(model5.interaction)
n <- nrow(mat)
threshold <- 4 / n
influential_obs <- which(cooks_d > threshold)
mat[influential_obs, ] #number 40,81,113,126,213,223,276,325 are influential observations
##     address famsize Pstatus Medu Fedu     Mjob     Fjob     reason guardian
## 20        U     LE3       T    4    3   health    other       home   father
## 40        R     GT3       T    2    2  at_home    other reputation   mother
## 81        U     GT3       T    2    3    other services     course   father
## 113       U     GT3       T    2    2  at_home    other       home   mother
## 126       U     GT3       T    3    4 services services       home   father
## 128       U     GT3       T    0    1  at_home    other     course    other
## 139       U     LE3       T    1    1 services    other     course   mother
## 147       U     GT3       T    3    2   health services       home   father
## 213       U     GT3       A    2    2    other    other reputation   mother
## 223       U     GT3       T    2    3 services  teacher      other   mother
## 250       U     GT3       T    0    2    other    other      other   mother
## 276       U     LE3       T    2    2 services    other     course   mother
## 297       U     GT3       T    4    4   health    other reputation    other
## 302       U     LE3       T    4    4    other  teacher       home   father
## 304       U     GT3       T    3    2   health   health reputation   father
## 325       U     LE3       T    0    2  at_home  at_home       home   father
## 352       U     GT3       T    3    3   health    other     course   mother
## 364       U     LE3       T    4    4  at_home  at_home     course   mother
##     traveltime studytime schoolsup famsup paid activities nursery higher
## 20           1         1        no     no  yes        yes     yes    yes
## 40           1         1       yes    yes  yes        yes     yes    yes
## 81           1         1       yes    yes  yes        yes      no    yes
## 113          1         2       yes     no   no        yes     yes    yes
## 126          1         1       yes     no   no         no     yes    yes
## 128          1         2        no    yes   no         no      no     no
## 139          1         2        no     no   no         no     yes    yes
## 147          1         2        no    yes   no         no     yes    yes
## 213          1         2       yes    yes  yes         no     yes    yes
## 223          1         2       yes     no   no         no     yes    yes
## 250          1         1        no     no  yes         no      no    yes
## 276          2         2       yes    yes  yes         no     yes    yes
## 297          2         2        no    yes  yes        yes     yes    yes
## 302          2         1        no     no  yes         no     yes    yes
## 304          1         4        no    yes  yes        yes      no    yes
## 325          2         3        no     no   no         no     yes    yes
## 352          2         2        no    yes  yes         no     yes    yes
## 364          1         2        no    yes  yes        yes     yes    yes
##     internet romantic famrel freetime goout Dalc Walc absences y_pass Medu_adj
## 20       yes       no      3        1     3    1    3        4      0        5
## 40        no       no      4        3     1    1    1        8      1        3
## 81       yes      yes      3        2     2    1    3        2      1        3
## 113      yes       no      3        1     2    1    1        6      1        3
## 126      yes       no      5        5     5    3    2        0      1        4
## 128       no       no      3        4     2    1    1        2      0        1
## 139       no      yes      4        4     4    1    3        0      1        2
## 147      yes       no      3        3     2    1    1        0      0        4
## 213      yes       no      3        3     4    1    1        0      1        3
## 223      yes       no      2        3     1    1    1        2      1        3
## 250      yes       no      4        3     2    2    4        0      1        1
## 276      yes      yes      4        4     4    2    3        6      1        3
## 297      yes       no      2        3     4    2    3        0      0        5
## 302      yes       no      4        1     1    2    2        0      0        5
## 304      yes       no      5        2     2    1    2        0      1        4
## 325      yes       no      3        3     3    2    3        0      1        1
## 352      yes       no      4        5     4    2    3        2      1        4
## 364      yes      yes      2        3     4    1    1        0      1        5
##     absences_adj
## 20             5
## 40             9
## 81             3
## 113            7
## 126            1
## 128            3
## 139            1
## 147            1
## 213            1
## 223            3
## 250            1
## 276            7
## 297            1
## 302            1
## 304            1
## 325            1
## 352            3
## 364            1
plot(cooks_d, main = "Cook's Distance", ylab = "Cook's D")
abline(h = 4 / n, col = "red", lty = 2)

# 3. Standardized (Pearson) residuals
std_resid <- rstandard(model5.interaction)
which(abs(std_resid) > 2) #potential outlier
## 113 213 276 
## 113 213 276
mat[which(abs(std_resid) > 2), ] #213, 276 are potential outliers
##     address famsize Pstatus Medu Fedu     Mjob  Fjob     reason guardian
## 113       U     GT3       T    2    2  at_home other       home   mother
## 213       U     GT3       A    2    2    other other reputation   mother
## 276       U     LE3       T    2    2 services other     course   mother
##     traveltime studytime schoolsup famsup paid activities nursery higher
## 113          1         2       yes     no   no        yes     yes    yes
## 213          1         2       yes    yes  yes         no     yes    yes
## 276          2         2       yes    yes  yes         no     yes    yes
##     internet romantic famrel freetime goout Dalc Walc absences y_pass Medu_adj
## 113      yes       no      3        1     2    1    1        6      1        3
## 213      yes       no      3        3     4    1    1        0      1        3
## 276      yes      yes      4        4     4    2    3        6      1        3
##     absences_adj
## 113            7
## 213            1
## 276            7
which(abs(std_resid) > 3) #likely outlier
## named integer(0)
mat[which(abs(std_resid) > 3), ] #no likely outlier
##  [1] address      famsize      Pstatus      Medu         Fedu        
##  [6] Mjob         Fjob         reason       guardian     traveltime  
## [11] studytime    schoolsup    famsup       paid         activities  
## [16] nursery      higher       internet     romantic     famrel      
## [21] freetime     goout        Dalc         Walc         absences    
## [26] y_pass       Medu_adj     absences_adj
## <0 rows> (or 0-length row.names)
plot(std_resid, main = "Standardized Residuals")
abline(h = c(-2, 2), col = "red", lty = 2)

# 4. Deviance residuals
dev_resid <- residuals(model5.interaction, type = "deviance")
which(abs(dev_resid) > 2) #potential outlier
## 113 213 276 
## 113 213 276
mat[which(abs(dev_resid) > 2), ] #213, 276 are potential outliers
##     address famsize Pstatus Medu Fedu     Mjob  Fjob     reason guardian
## 113       U     GT3       T    2    2  at_home other       home   mother
## 213       U     GT3       A    2    2    other other reputation   mother
## 276       U     LE3       T    2    2 services other     course   mother
##     traveltime studytime schoolsup famsup paid activities nursery higher
## 113          1         2       yes     no   no        yes     yes    yes
## 213          1         2       yes    yes  yes         no     yes    yes
## 276          2         2       yes    yes  yes         no     yes    yes
##     internet romantic famrel freetime goout Dalc Walc absences y_pass Medu_adj
## 113      yes       no      3        1     2    1    1        6      1        3
## 213      yes       no      3        3     4    1    1        0      1        3
## 276      yes      yes      4        4     4    2    3        6      1        3
##     absences_adj
## 113            7
## 213            1
## 276            7
which(abs(dev_resid) > 3) #likely outlier
## named integer(0)
mat[which(abs(dev_resid) > 3), ] #no likely outlier
##  [1] address      famsize      Pstatus      Medu         Fedu        
##  [6] Mjob         Fjob         reason       guardian     traveltime  
## [11] studytime    schoolsup    famsup       paid         activities  
## [16] nursery      higher       internet     romantic     famrel      
## [21] freetime     goout        Dalc         Walc         absences    
## [26] y_pass       Medu_adj     absences_adj
## <0 rows> (or 0-length row.names)
plot(dev_resid, main = "Deviance Residuals")

high_leverage <- which(leverage > (2 * length(coef(model5.interaction)) / n))
high_cooks <- which(cooks_d > (4 / n))
high_resid <- which(abs(std_resid) > 2)

Plotting Prediction Variables Vs. Response Variable

#option 1: plot y_pass against each predictor using the model1.logit while controlling for all other predictor. Flaw: great deviation because we're making so many assumptions on the baseline value
#1.0.1 Medu
plot(mat$Medu, mat$y_pass,
     ylim=c(-.4,1.4),
     xlab="Mother Education",
     ylab="G3 Pass/Fail")
abline(h=1,lty=2,col="darkred")
text(2, 1.15, cex = 0.8, "G3 Passed")
abline(h=0,lty=2,col="darkblue")
text(1,-0.15,cex=0.8,"G3 Failed")

x <- seq(0,4,.01)
y <- predict.glm(model1.logit, data.frame(Medu=x, schoolsup = "yes",goout = 2,absences = mean(mat$absences, na.rm = TRUE)), type="response")
lines(x,y,lwd=1.5,col="purple")
agg <- aggregate(y_pass~Medu,mean,data=mat)
points(c(0,1,2,3,4),agg$y_pass)

#1.0.1.1 Medu different controls
plot(mat$Medu, mat$y_pass,
     ylim=c(-.4,1.4),
     xlab="Mother Education",
     ylab="G3 Pass/Fail")
abline(h=1,lty=2,col="darkred")
text(2, 1.15, cex = 0.8, "G3 Passed")
abline(h=0,lty=2,col="darkblue")
text(1,-0.15,cex=0.8,"G3 Failed")

x <- seq(0,4,.01)
y <- predict.glm(model1.logit, data.frame(Medu=x, schoolsup = "no",goout = 2,absences = mean(mat$absences, na.rm = TRUE)), type="response")
lines(x,y,lwd=1.5,col="purple")
agg <- aggregate(y_pass~Medu,mean,data=mat)
points(c(0,1,2,3,4),agg$y_pass)

# option 1.1: plot every predictor against y using a simple logistic regression using only that predictor as the regressor. Flaw: doesn't make the most sense statistically and what we're trying to achieve.
#1.1.1 Medu
plot(mat$Medu, mat$y_pass,
     ylim=c(-.4,1.4),
     xlab="Mother Education",
     ylab="G3 Pass/Fail")
abline(h=1,lty=2,col="darkred")
text(1,1.15,cex=0.8,"G3 Passed")
abline(h=0,lty=2,col="darkblue")
text(1,-0.15,cex=0.8,"G3 Failed")

x <- seq(0,4,.01)
model_Medu.logit <- glm(y_pass~ Medu, data = mat, family = binomial)
y <- predict.glm(model_Medu.logit, data.frame(Medu=x), type="response")
lines(x,y,lwd=1.5,col="purple")
agg <- aggregate(y_pass~Medu,mean,data=mat)
points(agg$Medu,agg$y_pass)

# 1.1.2 goout
plot(mat$goout, mat$y_pass,
     ylim = c(-.4, 1.4),
     xlab = "Going Out Frequency",
     ylab = "G3 Pass/Fail")
abline(h = 1, lty = 2, col = "darkred")
text(2, 1.15, cex = 0.8, "G3 Passed")
abline(h = 0, lty = 2, col = "darkblue")
text(2, -0.15, cex = 0.8, "G3 Failed")

x <- seq(1, 5, .01)
model_goout.logit <- glm(y_pass ~ goout, data = mat, family = binomial)
y <- predict.glm(model_goout.logit, data.frame(goout = x), type = "response")
lines(x, y, lwd = 1.5, col = "purple")

agg <- aggregate(y_pass ~ goout, mean, data = mat)
points(agg$goout, agg$y_pass)

# 1.1.3 absences (continuous)
plot(mat$absences, mat$y_pass,
     ylim = c(-.4, 1.4),
     xlab = "Absences",
     ylab = "G3 Pass/Fail")
abline(h = 1, lty = 2, col = "darkred")
text(10, 1.15, cex = 0.8, "G3 Passed")
abline(h = 0, lty = 2, col = "darkblue")
text(10, -0.15, cex = 0.8, "G3 Failed")

x <- seq(0, 80, .01)
model_absences.logit <- glm(y_pass ~ absences, data = mat, family = binomial)
y <- predict.glm(model_absences.logit, data.frame(absences = x), type = "response")
lines(x, y, lwd = 1.5, col = "purple")

agg <- aggregate(y_pass ~ absences, mean, data = mat)
points(agg$absences, agg$y_pass)