library(tidyverse)
## Warning: package 'tidyverse' was built under R version 4.3.1
## Warning: package 'ggplot2' was built under R version 4.3.1
## Warning: package 'purrr' was built under R version 4.3.1
## ── Attaching core tidyverse packages ──────────────────────── tidyverse 2.0.0 ──
## ✔ dplyr 1.1.2 ✔ readr 2.1.4
## ✔ forcats 1.0.0 ✔ stringr 1.5.0
## ✔ ggplot2 3.4.3 ✔ tibble 3.2.1
## ✔ lubridate 1.9.2 ✔ tidyr 1.3.0
## ✔ purrr 1.0.2
## ── Conflicts ────────────────────────────────────────── tidyverse_conflicts() ──
## ✖ dplyr::filter() masks stats::filter()
## ✖ dplyr::lag() masks stats::lag()
## ℹ Use the conflicted package (<http://conflicted.r-lib.org/>) to force all conflicts to become errors
library(tidymodels)
## Warning: package 'tidymodels' was built under R version 4.3.1
## ── Attaching packages ────────────────────────────────────── tidymodels 1.1.1 ──
## ✔ broom 1.0.5 ✔ rsample 1.2.0
## ✔ dials 1.2.0 ✔ tune 1.1.2
## ✔ infer 1.0.5 ✔ workflows 1.1.3
## ✔ modeldata 1.2.0 ✔ workflowsets 1.0.1
## ✔ parsnip 1.1.1 ✔ yardstick 1.2.0
## ✔ recipes 1.0.8
## Warning: package 'dials' was built under R version 4.3.1
## Warning: package 'modeldata' was built under R version 4.3.1
## Warning: package 'parsnip' was built under R version 4.3.1
## Warning: package 'recipes' was built under R version 4.3.1
## Warning: package 'rsample' was built under R version 4.3.1
## Warning: package 'tune' was built under R version 4.3.1
## Warning: package 'workflows' was built under R version 4.3.1
## Warning: package 'workflowsets' was built under R version 4.3.1
## Warning: package 'yardstick' was built under R version 4.3.1
## ── Conflicts ───────────────────────────────────────── tidymodels_conflicts() ──
## ✖ scales::discard() masks purrr::discard()
## ✖ dplyr::filter() masks stats::filter()
## ✖ recipes::fixed() masks stringr::fixed()
## ✖ dplyr::lag() masks stats::lag()
## ✖ yardstick::spec() masks readr::spec()
## ✖ recipes::step() masks stats::step()
## • Learn how to get started at https://www.tidymodels.org/start/
students <- read_csv("data/oulad-students.csv")
## Rows: 32593 Columns: 15
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (9): code_module, code_presentation, gender, region, highest_education, ...
## dbl (6): id_student, num_of_prev_attempts, studied_credits, module_presentat...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
glimpse(students)
## Rows: 32,593
## Columns: 15
## $ code_module <chr> "AAA", "AAA", "AAA", "AAA", "AAA", "AAA", "…
## $ code_presentation <chr> "2013J", "2013J", "2013J", "2013J", "2013J"…
## $ id_student <dbl> 11391, 28400, 30268, 31604, 32885, 38053, 4…
## $ gender <chr> "M", "F", "F", "F", "F", "M", "M", "F", "F"…
## $ region <chr> "East Anglian Region", "Scotland", "North W…
## $ highest_education <chr> "HE Qualification", "HE Qualification", "A …
## $ imd_band <chr> "90-100%", "20-30%", "30-40%", "50-60%", "5…
## $ age_band <chr> "55<=", "35-55", "35-55", "35-55", "0-35", …
## $ num_of_prev_attempts <dbl> 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0, 0…
## $ studied_credits <dbl> 240, 60, 60, 60, 60, 60, 60, 120, 90, 60, 6…
## $ disability <chr> "N", "N", "Y", "N", "N", "N", "N", "N", "N"…
## $ final_result <chr> "Pass", "Pass", "Withdrawn", "Pass", "Pass"…
## $ module_presentation_length <dbl> 268, 268, 268, 268, 268, 268, 268, 268, 268…
## $ date_registration <dbl> -159, -53, -92, -52, -176, -110, -67, -29, …
## $ date_unregistration <dbl> NA, NA, 12, NA, NA, NA, NA, NA, NA, NA, NA,…
students <- students %>%
mutate(pass = ifelse(final_result == "Pass", 1, 0)) %>% # creates a new variable named "pass" and a dummy code of 1 if value of final_result equals "pass" and 0 if not
mutate(pass = as.factor(pass)) # makes the variable a factor, helping later steps
students <- students %>%
mutate(disability = as.factor(disability))
students <- students %>%
mutate(gender = as.factor(gender))
students %>%
count(id_student) # to find how many students
## # A tibble: 28,785 × 2
## id_student n
## <dbl> <int>
## 1 3733 1
## 2 6516 1
## 3 8462 2
## 4 11391 1
## 5 23629 1
## 6 23632 1
## 7 23698 1
## 8 23798 1
## 9 24186 1
## 10 24213 2
## # ℹ 28,775 more rows
students %>%
count(code_module, code_presentation)
## # A tibble: 22 × 3
## code_module code_presentation n
## <chr> <chr> <int>
## 1 AAA 2013J 383
## 2 AAA 2014J 365
## 3 BBB 2013B 1767
## 4 BBB 2013J 2237
## 5 BBB 2014B 1613
## 6 BBB 2014J 2292
## 7 CCC 2014B 1936
## 8 CCC 2014J 2498
## 9 DDD 2013B 1303
## 10 DDD 2013J 1938
## # ℹ 12 more rows
students <- students %>%
mutate(imd_band = factor(imd_band, levels = c("0-10%", "10-20%", "20-30%", "30-40%", "40-50%", "50-60%", "60-70%", "70-80%", "80-90%", "90-100%"))) %>%
mutate(imd_band = as.integer(imd_band)) # Convert factor levels to integers based on order
students
## # A tibble: 32,593 × 16
## code_module code_presentation id_student gender region highest_education
## <chr> <chr> <dbl> <fct> <chr> <chr>
## 1 AAA 2013J 11391 M East Angli… HE Qualification
## 2 AAA 2013J 28400 F Scotland HE Qualification
## 3 AAA 2013J 30268 F North West… A Level or Equiv…
## 4 AAA 2013J 31604 F South East… A Level or Equiv…
## 5 AAA 2013J 32885 F West Midla… Lower Than A Lev…
## 6 AAA 2013J 38053 M Wales A Level or Equiv…
## 7 AAA 2013J 45462 M Scotland HE Qualification
## 8 AAA 2013J 45642 F North West… A Level or Equiv…
## 9 AAA 2013J 52130 F East Angli… A Level or Equiv…
## 10 AAA 2013J 53025 M North Regi… Post Graduate Qu…
## # ℹ 32,583 more rows
## # ℹ 10 more variables: imd_band <int>, age_band <chr>,
## # num_of_prev_attempts <dbl>, studied_credits <dbl>, disability <fct>,
## # final_result <chr>, module_presentation_length <dbl>,
## # date_registration <dbl>, date_unregistration <dbl>, pass <fct>
set.seed(20230712)
train_test_split <- initial_split(students, prop = .80)
data_train <- training(train_test_split)
data_test <- testing(train_test_split)
data_train
## # A tibble: 26,074 × 16
## code_module code_presentation id_student gender region highest_education
## <chr> <chr> <dbl> <fct> <chr> <chr>
## 1 FFF 2014B 595186 M South Regi… Lower Than A Lev…
## 2 BBB 2014J 504066 F East Midla… Lower Than A Lev…
## 3 BBB 2013J 585790 F South East… HE Qualification
## 4 CCC 2014J 278413 M London Reg… HE Qualification
## 5 GGG 2014B 634933 F South Regi… Lower Than A Lev…
## 6 CCC 2014J 608577 M North Regi… HE Qualification
## 7 BBB 2014B 612120 F East Midla… Lower Than A Lev…
## 8 FFF 2013J 530852 M Wales Lower Than A Lev…
## 9 CCC 2014J 2555596 M South Regi… A Level or Equiv…
## 10 DDD 2013B 556575 M North Regi… HE Qualification
## # ℹ 26,064 more rows
## # ℹ 10 more variables: imd_band <int>, age_band <chr>,
## # num_of_prev_attempts <dbl>, studied_credits <dbl>, disability <fct>,
## # final_result <chr>, module_presentation_length <dbl>,
## # date_registration <dbl>, date_unregistration <dbl>, pass <fct>
data_test
## # A tibble: 6,519 × 16
## code_module code_presentation id_student gender region highest_education
## <chr> <chr> <dbl> <fct> <chr> <chr>
## 1 AAA 2013J 28400 F Scotland HE Qualification
## 2 AAA 2013J 31604 F South East… A Level or Equiv…
## 3 AAA 2013J 45462 M Scotland HE Qualification
## 4 AAA 2013J 53025 M North Regi… Post Graduate Qu…
## 5 AAA 2013J 65002 F East Angli… A Level or Equiv…
## 6 AAA 2013J 71361 M Ireland HE Qualification
## 7 AAA 2013J 77367 M East Midla… A Level or Equiv…
## 8 AAA 2013J 98094 M Wales Lower Than A Lev…
## 9 AAA 2013J 111717 F East Angli… HE Qualification
## 10 AAA 2013J 114017 F North Regi… Post Graduate Qu…
## # ℹ 6,509 more rows
## # ℹ 10 more variables: imd_band <int>, age_band <chr>,
## # num_of_prev_attempts <dbl>, studied_credits <dbl>, disability <fct>,
## # final_result <chr>, module_presentation_length <dbl>,
## # date_registration <dbl>, date_unregistration <dbl>, pass <fct>
my_rec <- recipe(pass ~ gender + highest_education + imd_band + age_band + num_of_prev_attempts + studied_credits + disability, data = data_train)
my_rec
##
## ── Recipe ──────────────────────────────────────────────────────────────────────
##
## ── Inputs
## Number of variables by role
## outcome: 1
## predictor: 7
my_mod <-
logistic_reg()
my_mod <-
logistic_reg() %>%
set_engine("glm") %>% # generalized linear model
set_mode("classification") # since we are predicting a dichotomous outcome, specify classification; for a number, specify regression
my_mod
## Logistic Regression Model Specification (classification)
##
## Computational engine: glm
# Assuming you have already loaded the required libraries and created 'my_rec' and 'my_mod' as mentioned in your previous code
# Fit the logistic regression model
logistic_model <- workflow() %>%
add_recipe(my_rec) %>%
add_model(my_mod) %>%
fit(data_train)
# Extract model coefficients
coefficients_table <- tidy(logistic_model, number = Inf)
# Order the coefficients by absolute values
coefficients_table_ordered <- coefficients_table %>%
arrange(desc(abs(estimate)))
# Set options to print numeric values without scientific notation
options(scipen = 999)
# Print the ordered coefficients
print(coefficients_table_ordered)
## # A tibble: 12 × 5
## term estimate std.error statistic p.value
## <chr> <dbl> <dbl> <dbl> <dbl>
## 1 highest_educationPost Graduate Qualifi… -0.797 0.196 -4.07 4.73e- 5
## 2 highest_educationNo Formal quals -0.665 0.157 -4.23 2.32e- 5
## 3 highest_educationLower Than A Level -0.334 0.0308 -10.8 2.16e-27
## 4 num_of_prev_attempts -0.233 0.0325 -7.16 8.23e-13
## 5 disabilityY -0.229 0.0491 -4.66 3.20e- 6
## 6 (Intercept) -0.170 0.0503 -3.38 7.28e- 4
## 7 age_band35-55 0.136 0.0311 4.38 1.19e- 5
## 8 age_band55<= 0.131 0.170 0.770 4.41e- 1
## 9 genderM -0.119 0.0282 -4.20 2.64e- 5
## 10 highest_educationHE Qualification -0.0638 0.0422 -1.51 1.30e- 1
## 11 imd_band 0.0519 0.00520 9.96 2.24e-23
## 12 studied_credits -0.00469 0.000374 -12.5 4.31e-36