library(ggplot2)
library(car)
## Loading required package: carData
library(MASS)
# Set CRAN mirror
options(repos = c(CRAN = "https://cran.r-project.org"))

# Install the ResourceSelection package
if (!requireNamespace("ResourceSelection", quietly = TRUE)) {
    install.packages("ResourceSelection")
}
library(ResourceSelection)
## Warning: package 'ResourceSelection' was built under R version 4.4.2
## ResourceSelection 0.3-6   2023-06-27

Loading my data set:

library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ lubridate 1.9.3     ✔ tibble    3.2.1
## ✔ purrr     1.0.2     ✔ tidyr     1.3.1
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ dplyr::select() masks MASS::select()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
dataset <-read_delim("C:/Users/MSKR/MASTERS_ADS/STATISTICS_SEM1/DATA_SET_1.csv", delim = ",")
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Target
## dbl (36): Marital status, Application mode, Application order, Course, Dayti...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.

Creating a custom table with mutating necessary categorical columns:

dataset_1<-dataset
dataset_1<-mutate(dataset_1, day_eve_class= ifelse(dataset_1$`Daytime/evening attendance    ` == 1, "day","evening"))

Adding a “sem_results” column which has the mean values of semester1 and semester2 results for a student.

dataset_1<-mutate(dataset_1, sem_results= rowMeans(select(dataset_1,`Curricular units 1st sem (grade)`, `Curricular units 2nd sem (grade)`)))

Mutating a column which assigns numeric values to the Target values:

dataset_1<-mutate(dataset_1, target = ifelse(dataset$Target == "Graduate",1,
                    ifelse(Target == "Enrolled",2,
                    ifelse(Target == "Dropout", 0, "no"))))

Adding a column “prev_perf” which has the mean values of Previous_grades and Admission_grades of a student.

dataset_1<-mutate(dataset_1, prev_perf= rowMeans(select(dataset_1,`Previous qualification (grade)`,`Admission grade`)))

Generalized Linear Model:

# Ensuring 'day_eve_class' is treated as a factor
dataset_1$day_eve_class <- as.factor(dataset_1$day_eve_class)

For a binomial regression model, we shall have the target to be either 0 or 1. In our case we can drop the students who fall under “Enrolled” (2) category as they do not affect our analysis on understanding about the “Droupout” and “Graduate” students.

df_bin<-filter(dataset_1,target!=2)
drops <- c("Target")
df_bin<-df_bin[ , !(names(df_bin) %in% drops)]
df_bin$target<-as.numeric(df_bin$target)
# Full model with all available predictors
full_model <- glm(target ~ ., data = df_bin, family = binomial)
summary(full_model)
## 
## Call:
## glm(formula = target ~ ., family = binomial, data = df_bin)
## 
## Coefficients: (3 not defined because of singularities)
##                                                    Estimate Std. Error z value
## (Intercept)                                      -5.499e-01  9.972e-01  -0.551
## `Marital status`                                  1.983e-01  1.405e-01   1.412
## `Application mode`                               -7.523e-03  4.965e-03  -1.515
## `Application order`                              -3.502e-02  5.378e-02  -0.651
## Course                                           -1.924e-04  6.405e-05  -3.005
## `Daytime/evening attendance\\t`                  -1.567e-01  2.481e-01  -0.631
## `Previous qualification`                          2.073e-02  7.563e-03   2.741
## `Previous qualification (grade)`                 -2.674e-03  6.039e-03  -0.443
## Nacionality                                      -5.468e-02  1.415e-02  -3.864
## `Mother's qualification`                         -4.868e-03  5.181e-03  -0.940
## `Father's qualification`                          7.924e-03  5.125e-03   1.546
## `Mother's occupation`                             1.227e-02  6.634e-03   1.850
## `Father's occupation`                            -4.372e-03  6.983e-03  -0.626
## `Admission grade`                                 2.291e-03  5.724e-03   0.400
## Displaced                                        -3.384e-01  1.541e-01  -2.196
## `Educational special needs`                      -2.861e-01  5.272e-01  -0.543
## Debtor                                           -1.259e+00  2.369e-01  -5.315
## `Tuition fees up to date`                         2.640e+00  2.801e-01   9.426
## Gender                                           -4.017e-01  1.391e-01  -2.888
## `Scholarship holder`                              7.980e-01  1.658e-01   4.814
## `Age at enrollment`                              -3.506e-02  1.263e-02  -2.776
## International                                     3.306e+00  8.127e-01   4.068
## `Curricular units 1st sem (credited)`            -2.703e-01  1.024e-01  -2.641
## `Curricular units 1st sem (enrolled)`            -2.041e-01  1.494e-01  -1.366
## `Curricular units 1st sem (evaluations)`          9.880e-03  3.768e-02   0.262
## `Curricular units 1st sem (approved)`             6.420e-01  8.155e-02   7.872
## `Curricular units 1st sem (grade)`               -7.253e-02  5.123e-02  -1.416
## `Curricular units 1st sem (without evaluations)`  2.327e-01  1.678e-01   1.387
## `Curricular units 2nd sem (credited)`            -1.226e-01  1.063e-01  -1.153
## `Curricular units 2nd sem (enrolled)`            -7.813e-01  1.441e-01  -5.423
## `Curricular units 2nd sem (evaluations)`         -1.791e-02  3.589e-02  -0.499
## `Curricular units 2nd sem (approved)`             9.767e-01  7.403e-02  13.193
## `Curricular units 2nd sem (grade)`                1.273e-01  5.356e-02   2.376
## `Curricular units 2nd sem (without evaluations)`  2.003e-01  1.364e-01   1.469
## `Unemployment rate`                              -5.714e-02  2.770e-02  -2.063
## `Inflation rate`                                  2.614e-02  4.743e-02   0.551
## GDP                                              -2.967e-02  3.276e-02  -0.906
## day_eve_classevening                                     NA         NA      NA
## sem_results                                              NA         NA      NA
## prev_perf                                                NA         NA      NA
##                                                  Pr(>|z|)    
## (Intercept)                                      0.581319    
## `Marital status`                                 0.158029    
## `Application mode`                               0.129697    
## `Application order`                              0.514875    
## Course                                           0.002659 ** 
## `Daytime/evening attendance\\t`                  0.527780    
## `Previous qualification`                         0.006123 ** 
## `Previous qualification (grade)`                 0.657871    
## Nacionality                                      0.000111 ***
## `Mother's qualification`                         0.347445    
## `Father's qualification`                         0.122037    
## `Mother's occupation`                            0.064355 .  
## `Father's occupation`                            0.531194    
## `Admission grade`                                0.689005    
## Displaced                                        0.028110 *  
## `Educational special needs`                      0.587289    
## Debtor                                           1.07e-07 ***
## `Tuition fees up to date`                         < 2e-16 ***
## Gender                                           0.003875 ** 
## `Scholarship holder`                             1.48e-06 ***
## `Age at enrollment`                              0.005507 ** 
## International                                    4.74e-05 ***
## `Curricular units 1st sem (credited)`            0.008259 ** 
## `Curricular units 1st sem (enrolled)`            0.171875    
## `Curricular units 1st sem (evaluations)`         0.793183    
## `Curricular units 1st sem (approved)`            3.48e-15 ***
## `Curricular units 1st sem (grade)`               0.156902    
## `Curricular units 1st sem (without evaluations)` 0.165520    
## `Curricular units 2nd sem (credited)`            0.248974    
## `Curricular units 2nd sem (enrolled)`            5.85e-08 ***
## `Curricular units 2nd sem (evaluations)`         0.617794    
## `Curricular units 2nd sem (approved)`             < 2e-16 ***
## `Curricular units 2nd sem (grade)`               0.017485 *  
## `Curricular units 2nd sem (without evaluations)` 0.141868    
## `Unemployment rate`                              0.039097 *  
## `Inflation rate`                                 0.581567    
## GDP                                              0.365084    
## day_eve_classevening                                   NA    
## sem_results                                            NA    
## prev_perf                                              NA    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4859.8  on 3629  degrees of freedom
## Residual deviance: 1647.4  on 3593  degrees of freedom
## AIC: 1721.4
## 
## Number of Fisher Scoring iterations: 8

Summary analysis:

To overcome the above fall outs, we can find the best model that fits the data nearly favorable in the estimations.

Akaike Information Criterion (AIC) is a metric used to evaluate and compare the quality of statistical models, particularly in the context of regression and time series analysis.

# Stepwise selection using AIC
best_model <- stepAIC(full_model, direction = "both", trace = FALSE)
summary(best_model)
## 
## Call:
## glm(formula = target ~ `Marital status` + Course + `Previous qualification` + 
##     Nacionality + `Mother's occupation` + Displaced + Debtor + 
##     `Tuition fees up to date` + Gender + `Scholarship holder` + 
##     `Age at enrollment` + International + `Curricular units 1st sem (credited)` + 
##     `Curricular units 1st sem (approved)` + `Curricular units 1st sem (without evaluations)` + 
##     `Curricular units 2nd sem (enrolled)` + `Curricular units 2nd sem (approved)` + 
##     `Curricular units 2nd sem (grade)` + `Curricular units 2nd sem (without evaluations)` + 
##     `Unemployment rate`, family = binomial, data = df_bin)
## 
## Coefficients:
##                                                    Estimate Std. Error z value
## (Intercept)                                      -8.107e-01  5.170e-01  -1.568
## `Marital status`                                  2.013e-01  1.410e-01   1.428
## Course                                           -2.299e-04  5.703e-05  -4.031
## `Previous qualification`                          1.496e-02  6.704e-03   2.232
## Nacionality                                      -5.434e-02  1.421e-02  -3.824
## `Mother's occupation`                             8.116e-03  3.353e-03   2.420
## Displaced                                        -3.477e-01  1.466e-01  -2.371
## Debtor                                           -1.256e+00  2.343e-01  -5.359
## `Tuition fees up to date`                         2.672e+00  2.814e-01   9.495
## Gender                                           -4.183e-01  1.369e-01  -3.056
## `Scholarship holder`                              8.036e-01  1.633e-01   4.921
## `Age at enrollment`                              -3.742e-02  1.089e-02  -3.438
## International                                     3.194e+00  8.120e-01   3.934
## `Curricular units 1st sem (credited)`            -3.872e-01  5.041e-02  -7.682
## `Curricular units 1st sem (approved)`             6.033e-01  6.714e-02   8.986
## `Curricular units 1st sem (without evaluations)`  2.371e-01  1.602e-01   1.481
## `Curricular units 2nd sem (enrolled)`            -9.836e-01  7.708e-02 -12.760
## `Curricular units 2nd sem (approved)`             9.894e-01  6.559e-02  15.084
## `Curricular units 2nd sem (grade)`                8.543e-02  4.239e-02   2.016
## `Curricular units 2nd sem (without evaluations)`  2.058e-01  1.330e-01   1.547
## `Unemployment rate`                              -5.288e-02  2.486e-02  -2.128
##                                                  Pr(>|z|)    
## (Intercept)                                      0.116898    
## `Marital status`                                 0.153423    
## Course                                           5.54e-05 ***
## `Previous qualification`                         0.025646 *  
## Nacionality                                      0.000131 ***
## `Mother's occupation`                            0.015513 *  
## Displaced                                        0.017742 *  
## Debtor                                           8.37e-08 ***
## `Tuition fees up to date`                         < 2e-16 ***
## Gender                                           0.002242 ** 
## `Scholarship holder`                             8.60e-07 ***
## `Age at enrollment`                              0.000586 ***
## International                                    8.37e-05 ***
## `Curricular units 1st sem (credited)`            1.56e-14 ***
## `Curricular units 1st sem (approved)`             < 2e-16 ***
## `Curricular units 1st sem (without evaluations)` 0.138730    
## `Curricular units 2nd sem (enrolled)`             < 2e-16 ***
## `Curricular units 2nd sem (approved)`             < 2e-16 ***
## `Curricular units 2nd sem (grade)`               0.043841 *  
## `Curricular units 2nd sem (without evaluations)` 0.121835    
## `Unemployment rate`                              0.033375 *  
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## (Dispersion parameter for binomial family taken to be 1)
## 
##     Null deviance: 4859.8  on 3629  degrees of freedom
## Residual deviance: 1659.1  on 3609  degrees of freedom
## AIC: 1701.1
## 
## Number of Fisher Scoring iterations: 7
# 1. Residuals vs Fitted Plot
plot(best_model, which = 1, main = "Residuals vs Fitted")

Interpretation:

# 2. Cook's Distance
plot(best_model, which = 4, main = "Cook's Distance")

Interpretation:

# 3. Leverage Plot
plot(hatvalues(best_model), main = "Leverage Values", ylab = "Leverage")
abline(h = 2 * mean(hatvalues(best_model)), col = "red", lty = 2)

Interpretation:

# 4. Variance Inflation Factor (VIF) for multicollinearity
vif(best_model)
##                                 `Marital status` 
##                                         1.505745 
##                                           Course 
##                                         7.655553 
##                         `Previous qualification` 
##                                         1.111836 
##                                      Nacionality 
##                                         3.262373 
##                            `Mother's occupation` 
##                                         1.042082 
##                                        Displaced 
##                                         1.250714 
##                                           Debtor 
##                                         1.127495 
##                        `Tuition fees up to date` 
##                                         1.132134 
##                                           Gender 
##                                         1.090516 
##                             `Scholarship holder` 
##                                         1.049656 
##                              `Age at enrollment` 
##                                         1.752143 
##                                    International 
##                                         3.349127 
##            `Curricular units 1st sem (credited)` 
##                                         4.299866 
##            `Curricular units 1st sem (approved)` 
##                                         8.776707 
## `Curricular units 1st sem (without evaluations)` 
##                                         1.168276 
##            `Curricular units 2nd sem (enrolled)` 
##                                        11.197613 
##            `Curricular units 2nd sem (approved)` 
##                                         6.949030 
##               `Curricular units 2nd sem (grade)` 
##                                         8.472960 
## `Curricular units 2nd sem (without evaluations)` 
##                                         1.132925 
##                              `Unemployment rate` 
##                                         1.095878