# Use the main CRAN mirror for any package installation below.
options(repos = c(CRAN = "https://cran.r-project.org"))

# Install car only when it is not already available, then attach it.
if (isFALSE(requireNamespace("car", quietly = TRUE))) {
  install.packages("car")
}
library(car)
## Loading required package: carData
library(tidyverse)
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr     1.1.4     ✔ readr     2.1.5
## ✔ forcats   1.0.0     ✔ stringr   1.5.1
## ✔ ggplot2   3.5.1     ✔ tibble    3.2.1
## ✔ lubridate 1.9.3     ✔ tidyr     1.3.1
## ✔ purrr     1.0.2     
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag()    masks stats::lag()
## ✖ dplyr::recode() masks car::recode()
## ✖ purrr::some()   masks car::some()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
# Read the comma-delimited raw data.
# NOTE(review): absolute, machine-specific path; the script only runs where
# this file exists at exactly this location.
dataset <- read_delim(
  file = "C:/Users/MSKR/MASTERS_ADS/STATISTICS_SEM1/DATA_SET_1.csv",
  delim = ","
)
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr  (1): Target
## dbl (36): Marital status, Application mode, Application order, Course, Dayti...
## 
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Build the analysis table from the raw data in a single mutate() so every
# derived column is computed from the same rows; `dataset` itself is untouched.
dataset_1 <- mutate(
  dataset,
  # Attendance label: a value of 1 becomes "day", any other value "evening".
  # The column name (including its trailing whitespace) is kept exactly as
  # it was written originally.
  day_eve_class = ifelse(
    `Daytime/evening attendance    ` == 1, "day", "evening"
  ),
  # Numeric outcome code: Graduate = 2, Enrolled = 1, Dropout = 0.
  # Unknown labels become NA instead of the string "no", which used to turn
  # the whole column into character. All comparisons now read the same
  # `Target` column rather than mixing `dataset$Target` with `Target`.
  target = case_when(
    Target == "Graduate" ~ 2,
    Target == "Enrolled" ~ 1,
    Target == "Dropout" ~ 0,
    TRUE ~ NA_real_
  ),
  # Row-wise mean of the 1st and 2nd semester grades.
  sem_results = rowMeans(cbind(
    `Curricular units 1st sem (grade)`,
    `Curricular units 2nd sem (grade)`
  )),
  # Row-wise mean of the previous qualification grade and admission grade.
  prev_perf = rowMeans(cbind(
    `Previous qualification (grade)`,
    `Admission grade`
  ))
)

Previous model:

Last week, we built a simple linear regression model that used the mean of the admission grade and the previous qualification grade as the predictor of a student's mean semester grade. The model was as follows:

# Baseline model: mean semester grade regressed on mean prior performance.
lm_model <- lm(
  formula = sem_results ~ prev_perf,
  data = dataset_1
)
summary(lm_model)
## 
## Call:
## lm(formula = sem_results ~ prev_perf, data = dataset_1)
## 
## Residuals:
##      Min       1Q   Median       3Q      Max 
## -12.0442   0.5619   1.8408   2.8242   7.8444 
## 
## Coefficients:
##             Estimate Std. Error t value Pr(>|t|)    
## (Intercept) 6.522300   0.765659   8.519  < 2e-16 ***
## prev_perf   0.030150   0.005873   5.134 2.96e-07 ***
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 4.805 on 4422 degrees of freedom
## Multiple R-squared:  0.005925,   Adjusted R-squared:  0.0057 
## F-statistic: 26.36 on 1 and 4422 DF,  p-value: 2.96e-07

Now, we will enhance this model by adding more variables.

# Make sure the outcome code is numeric before correlating it.
dataset_1[["target"]] <- as.numeric(dataset_1[["target"]])

# Pairwise correlations between the grade variables, the macro-economic
# indicators and the outcome code.
cor(dataset_1[, c(
  "Admission grade",
  "Previous qualification (grade)",
  "Curricular units 1st sem (grade)",
  "Curricular units 2nd sem (grade)",
  "GDP",
  "Inflation rate",
  "Unemployment rate",
  "target"
)])
##                                  Admission grade Previous qualification (grade)
## Admission grade                       1.00000000                     0.58044420
## Previous qualification (grade)        0.58044420                     1.00000000
## Curricular units 1st sem (grade)      0.07386623                     0.05944356
## Curricular units 2nd sem (grade)      0.07440153                     0.05323897
## GDP                                  -0.01951948                    -0.05262040
## Inflation rate                       -0.02162358                     0.01871038
## Unemployment rate                     0.03875569                     0.04522227
## target                                0.12088916                     0.10376370
##                                  Curricular units 1st sem (grade)
## Admission grade                                        0.07386623
## Previous qualification (grade)                         0.05944356
## Curricular units 1st sem (grade)                       1.00000000
## Curricular units 2nd sem (grade)                       0.83717620
## GDP                                                    0.05480897
## Inflation rate                                        -0.03391130
## Unemployment rate                                      0.01481629
## target                                                 0.48521304
##                                  Curricular units 2nd sem (grade)         GDP
## Admission grade                                       0.074401530 -0.01951948
## Previous qualification (grade)                        0.053238974 -0.05262040
## Curricular units 1st sem (grade)                      0.837176195  0.05480897
## Curricular units 2nd sem (grade)                      1.000000000  0.07126950
## GDP                                                   0.071269496  1.00000000
## Inflation rate                                       -0.038166042 -0.11229464
## Unemployment rate                                     0.001461858 -0.33517812
## target                                                0.566827280  0.04413469
##                                  Inflation rate Unemployment rate       target
## Admission grade                     -0.02162358       0.038755687  0.120889157
## Previous qualification (grade)       0.01871038       0.045222268  0.103763697
## Curricular units 1st sem (grade)    -0.03391130       0.014816291  0.485213038
## Curricular units 2nd sem (grade)    -0.03816604       0.001461858  0.566827280
## GDP                                 -0.11229464      -0.335178119  0.044134690
## Inflation rate                       1.00000000      -0.028884663 -0.026874065
## Unemployment rate                   -0.02888466       1.000000000  0.008626681
## target                              -0.02687406       0.008626681  1.000000000
# Extended model: outcome code regressed on the four grade variables plus the
# interaction between admission grade and previous qualification grade.
new_model <- lm(
  target ~
    `Admission grade` +
    `Previous qualification (grade)` +
    `Curricular units 1st sem (grade)` +
    `Curricular units 2nd sem (grade)` +
    `Admission grade`:`Previous qualification (grade)`,
  data = dataset_1
)
summary(new_model)
## 
## Call:
## lm(formula = target ~ `Admission grade` + `Previous qualification (grade)` + 
##     `Curricular units 1st sem (grade)` + `Curricular units 2nd sem (grade)` + 
##     `Admission grade`:`Previous qualification (grade)`, data = dataset_1)
## 
## Residuals:
##     Min      1Q  Median      3Q     Max 
## -2.0264 -0.3406  0.2060  0.5704  1.9246 
## 
## Coefficients:
##                                                      Estimate Std. Error
## (Intercept)                                         2.211e-01  7.366e-01
## `Admission grade`                                  -3.102e-03  5.697e-03
## `Previous qualification (grade)`                   -3.340e-03  5.459e-03
## `Curricular units 1st sem (grade)`                  5.790e-03  4.133e-03
## `Curricular units 2nd sem (grade)`                  9.110e-02  3.842e-03
## `Admission grade`:`Previous qualification (grade)`  4.721e-05  4.115e-05
##                                                    t value Pr(>|t|)    
## (Intercept)                                          0.300    0.764    
## `Admission grade`                                   -0.544    0.586    
## `Previous qualification (grade)`                    -0.612    0.541    
## `Curricular units 1st sem (grade)`                   1.401    0.161    
## `Curricular units 2nd sem (grade)`                  23.713   <2e-16 ***
## `Admission grade`:`Previous qualification (grade)`   1.147    0.251    
## ---
## Signif. codes:  0 '***' 0.001 '**' 0.01 '*' 0.05 '.' 0.1 ' ' 1
## 
## Residual standard error: 0.7279 on 4418 degrees of freedom
## Multiple R-squared:  0.3292, Adjusted R-squared:  0.3284 
## F-statistic: 433.6 on 5 and 4418 DF,  p-value: < 2.2e-16
# Multicollinearity check: generalized VIFs computed per predictor.
# NOTE(review): the printed result shows Df = 0, GVIF^(1/(2*Df)) = Inf and the
# same GVIF for every predictor, which looks degenerate; confirm whether
# type = "predictor" copes with these back-quoted column names, e.g. by
# comparing against type = "terms".
vif_values <- car::vif(new_model, type = "predictor")
vif_values
##                                      GVIF Df GVIF^(1/(2*Df))
## Admission grade                  780.6209  0             Inf
## Previous qualification (grade)   780.6209  0             Inf
## Curricular units 1st sem (grade) 780.6209  0             Inf
## Curricular units 2nd sem (grade) 780.6209  0             Inf
##                                                    Interacts With
## Admission grade                  `Previous qualification (grade)`
## Previous qualification (grade)                  `Admission grade`
## Curricular units 1st sem (grade)                             --  
## Curricular units 2nd sem (grade)                             --  
##                                                                                                                                     Other Predictors
## Admission grade                  Admission grade, Previous qualification (grade), Curricular units 1st sem (grade), Curricular units 2nd sem (grade)
## Previous qualification (grade)   Admission grade, Previous qualification (grade), Curricular units 1st sem (grade), Curricular units 2nd sem (grade)
## Curricular units 1st sem (grade) Admission grade, Previous qualification (grade), Curricular units 1st sem (grade), Curricular units 2nd sem (grade)
## Curricular units 2nd sem (grade) Admission grade, Previous qualification (grade), Curricular units 1st sem (grade), Curricular units 2nd sem (grade)
# Draw the standard lm diagnostic plots in a 2x2 grid, then restore the
# previous graphics layout so the Cook's distance plot gets its own panel
# instead of landing in the first cell of a new 2x2 page.
old_par <- par(mfrow = c(2, 2))
plot(new_model)
par(old_par)

# Cook's distance per observation with the 4 / n rule-of-thumb cutoff.
# n has to be the number of observations used in the fit: length() of a data
# frame is its number of columns, which placed the cutoff line far too high.
cooks_d <- cooks.distance(new_model)
plot(cooks_d, main = "Cook's Distance", ylab = "Distance", xlab = "Index")
abline(h = 4 / nobs(new_model), col = "red")  # Add a cutoff line

Interpretation of Diagnostic Plots:

Because several of the diagnostic plots show clear patterns or significant deviations, they contradict my initial assumption that the relationship is linear. I need to consider refining the model through transformations, additional interaction terms, or robust regression techniques. This can be analysed in our data dive sessions.