Function to clean student drop out data
# Import the data
student_data <- read_csv(file = 'predict_students_dropout_and_academic_success_final.csv')
## Rows: 4424 Columns: 37
## ── Column specification ────────────────────────────────────────────────────────
## Delimiter: ","
## chr (1): Target
## dbl (36): Marital_status, Application_mode, Application_order, Course, Dayti...
##
## ℹ Use `spec()` to retrieve the full column specification for this data.
## ℹ Specify the column types or set `show_col_types = FALSE` to quiet this message.
# Remove "Enrolled" status
student_data_filtered <- student_data[student_data$Target != "Enrolled", ]
student_data_filtered <- student_data_filtered %>%
mutate(Marital_status = case_when(
Marital_status == 1 ~ "single",
Marital_status == 2 ~ "married",
Marital_status == 3 ~ "widower",
Marital_status == 4 ~ "divorced",
Marital_status == 5 ~ "facto union",
Marital_status == 6 ~ "legally separated",
TRUE ~ as.character(Marital_status)
))
student_data_filtered <- student_data_filtered %>%
mutate(Application_mode = case_when(
Application_mode == 1 ~ "1st phase general contingent",
Application_mode == 2 ~ "Ordinance No. 612/93",
Application_mode == 5 ~ "1st phase special contingent (Azores Island)",
Application_mode == 7 ~ "Holders of other higher courses",
Application_mode == 10 ~ "Ordinance No. 854-B/99",
Application_mode == 15 ~ "International student (bachelor)",
Application_mode == 16 ~ "1st phase special contingent (Madeira Island)",
Application_mode == 17 ~ "2nd phase general contingent",
Application_mode == 18 ~ "3rd phase general contingent",
Application_mode == 26 ~ "Ordinance No. 533 A 99, item b2) (Different Plan)",
Application_mode == 27 ~ "Ordinance No. 533 A 99, item b3 (Other Institution)",
Application_mode == 39 ~ "Over 23 years old",
Application_mode == 42 ~ "Transfer",
Application_mode == 43 ~ "Change of course",
Application_mode == 44 ~ "Technological specialization diploma holders",
Application_mode == 51 ~ "Change of institution/course",
Application_mode == 53 ~ "Short cycle diploma holders",
Application_mode == 57 ~ "Change of institution/course (International)",
TRUE ~ as.character(Application_mode)
))
student_data_filtered <- student_data_filtered %>%
mutate(Course = case_when(
Course == 33 ~ "Biofuel Production Technologies",
Course == 171 ~ "Animation and Multimedia Design",
Course == 8014 ~ "Social Service (evening attendance)",
Course == 9003 ~ "Agronomy",
Course == 9070 ~ "Communication Design",
Course == 9085 ~ "Veterinary Nursing",
Course == 9119 ~ "Informatics Engineering",
Course == 9130 ~ "Equinculture",
Course == 9147 ~ "Management",
Course == 9238 ~ "Social Service",
Course == 9254 ~ "Tourism",
Course == 9500 ~ "Nursing",
Course == 9556 ~ "Oral Hygiene",
Course == 9670 ~ "Advertising and Marketing Management",
Course == 9773 ~ "Journalism and Communication",
Course == 9853 ~ "Basic Education",
Course == 9991 ~ "Management (evening attendance)",
TRUE ~ as.character(Course)
))
student_data_filtered <- student_data_filtered %>%
mutate(Daytime_evening_attendance = case_when(
Daytime_evening_attendance == 1 ~ "daytime",
Daytime_evening_attendance == 0 ~ "evening",
TRUE ~ as.character(Daytime_evening_attendance)
))
student_data_filtered <- student_data_filtered %>%
mutate(Previous_qualification = case_when(
Previous_qualification == 1 ~ "Secondary education",
Previous_qualification == 2 ~ "Higher education bachelors degree",
Previous_qualification == 3 ~ "Higher education degree",
Previous_qualification == 4 ~ "Higher education masters",
Previous_qualification == 5 ~ "Higher education doctorate",
Previous_qualification == 6 ~ "Frequency of higher education",
Previous_qualification == 9 ~ "12th year of schooling not completed",
Previous_qualification == 10 ~ "11th year of schooling not completed",
Previous_qualification == 12 ~ "Other 11th year of schooling",
Previous_qualification == 14 ~ "10th year of schooling",
Previous_qualification == 15 ~ "10th year of schooling not completed",
Previous_qualification == 19 ~ "Basic education 3rd cycle (9th 10th 11th year) or equiv.",
Previous_qualification == 38 ~ "Basic education 2nd cycle (6th 7th 8th year) or equiv.",
Previous_qualification == 39 ~ "Technological specialization course",
Previous_qualification == 40 ~ "Higher education degree (1st cycle)",
Previous_qualification == 42 ~ "Professional higher technical course",
Previous_qualification == 43 ~ "Higher education master (2nd cycle)",
TRUE ~ as.character(Previous_qualification)
))
student_data_filtered <- student_data_filtered %>%
mutate(Nationality = case_when(
Nationality == 1 ~ "Portuguese",
Nationality == 2 ~ "German",
Nationality == 6 ~ "Spanish",
Nationality == 11 ~ "Italian",
Nationality == 13 ~ "Dutch",
Nationality == 14 ~ "English",
Nationality == 17 ~ "Lithuanian",
Nationality == 21 ~ "Angolan",
Nationality == 22 ~ "Cape Verdean",
Nationality == 24 ~ "Guinean",
Nationality == 25 ~ "Mozambican",
Nationality == 26 ~ "Santomean",
Nationality == 32 ~ "Turkish",
Nationality == 41 ~ "Brazilian",
Nationality == 62 ~ "Romanian",
Nationality == 100 ~ "Moldova (Republic of)",
Nationality == 101 ~ "Mexican",
Nationality == 103 ~ "Ukrainian",
Nationality == 105 ~ "Russian",
Nationality == 108 ~ "Cuban",
Nationality == 109 ~ "Colombian",
TRUE ~ as.character(Nationality)
))
student_data_filtered <- student_data_filtered %>%
mutate(Mothers_qualification = case_when(
Mothers_qualification == 1 ~ "Secondary Education 12th Year of Schooling or Eq.",
Mothers_qualification == 2 ~ "Higher Education Bachelors Degree",
Mothers_qualification == 3 ~ "Higher Education Degree",
Mothers_qualification == 4 ~ "Higher Education Masters",
Mothers_qualification == 5 ~ "Higher Education Doctorate",
Mothers_qualification == 6 ~ "Frequency of Higher Education",
Mothers_qualification == 9 ~ "12th Year of Schooling Not Completed",
Mothers_qualification == 10 ~ "11th Year of Schooling Not Completed",
Mothers_qualification == 11 ~ "7th Year (Old)",
Mothers_qualification == 12 ~ "Other 11th Year of Schooling",
Mothers_qualification == 14 ~ "10th Year of Schooling",
Mothers_qualification == 18 ~ "General Commerce Course",
Mothers_qualification == 19 ~ "Basic Education 3rd Cycle (9th 10th 11th Year) or Equiv.",
Mothers_qualification == 22 ~ "Technical-Professional Course",
Mothers_qualification == 26 ~ "7th Year of Schooling",
Mothers_qualification == 27 ~ "2nd Cycle of the General High School Course",
Mothers_qualification == 29 ~ "9th Year of Schooling Not Completed",
Mothers_qualification == 30 ~ "8th Year of Schooling",
Mothers_qualification == 34 ~ "Unknown",
Mothers_qualification == 35 ~ "Cant Read or Write",
Mothers_qualification == 36 ~ "Can Read Without Having a 4th Year of Schooling",
Mothers_qualification == 37 ~ "Basic Education 1st Cycle (4th 5th Year) or Equiv.",
Mothers_qualification == 38 ~ "Basic Education 2nd Cycle (6th 7th 8th Year) or Equiv.",
Mothers_qualification == 39 ~ "Technological Specialization Course",
Mothers_qualification == 40 ~ "Higher Education Degree (1st Cycle)",
Mothers_qualification == 41 ~ "Specialized Higher Studies Course",
Mothers_qualification == 42 ~ "Professional Higher Technical Course",
Mothers_qualification == 43 ~ "Higher Education Master (2nd Cycle)",
Mothers_qualification == 44 ~ "Higher Education Doctorate (3rd Cycle)",
TRUE ~ as.character(Mothers_qualification)
))
student_data_filtered <- student_data_filtered %>%
mutate(Fathers_qualification = case_when(
Fathers_qualification == 1 ~ "Secondary Education 12th Year of Schooling or Eq",
Fathers_qualification == 2 ~ "Higher Education Bachelors Degree",
Fathers_qualification == 3 ~ "Higher Education Degree",
Fathers_qualification == 4 ~ "Higher Education Masters",
Fathers_qualification == 5 ~ "Higher Education Doctorate",
Fathers_qualification == 6 ~ "Frequency of Higher Education",
Fathers_qualification == 9 ~ "12th Year of Schooling Not Completed",
Fathers_qualification == 10 ~ "11th Year of Schooling Not Completed",
Fathers_qualification == 11 ~ "7th Year (Old)",
Fathers_qualification == 12 ~ "Other 11th Year of Schooling",
Fathers_qualification == 13 ~ "2nd Year Complementary High School Course",
Fathers_qualification == 14 ~ "10th Year of Schooling",
Fathers_qualification == 18 ~ "General Commerce Course",
Fathers_qualification == 19 ~ "Basic Education 3rd Cycle (9th 10th 11th Year) or Equiv.",
Fathers_qualification == 20 ~ "Complementary High School Course",
Fathers_qualification == 22 ~ "Technical Professional Course",
Fathers_qualification == 25 ~ "Complementary High School Course Not Concluded",
Fathers_qualification == 26 ~ "7th Year of Schooling",
Fathers_qualification == 27 ~ "2nd Cycle of the General High School Course",
Fathers_qualification == 29 ~ "9th Year of Schooling Not Completed",
Fathers_qualification == 30 ~ "8th Year of Schooling",
Fathers_qualification == 31 ~ "General Course of Administration and Commerce",
Fathers_qualification == 33 ~ "Supplementary Accounting and Administration",
Fathers_qualification == 34 ~ "Unknown",
Fathers_qualification == 35 ~ "Cant Read or Write",
Fathers_qualification == 36 ~ "Can Read Without Having a 4th Year of Schooling",
Fathers_qualification == 37 ~ "Basic Education 1st Cycle (4th 5th Year) or Equiv",
Fathers_qualification == 38 ~ "Basic Education 2nd Cycle (6th 7th 8th Year) or Equiv",
Fathers_qualification == 39 ~ "Technological Specialization Course",
Fathers_qualification == 40 ~ "Higher Education Degree (1st Cycle)",
Fathers_qualification == 41 ~ "Specialized Higher Studies Course",
Fathers_qualification == 42 ~ "Professional Higher Technical Course",
Fathers_qualification == 43 ~ "Higher Education Master (2nd Cycle)",
Fathers_qualification == 44 ~ "Higher Education Doctorate (3rd Cycle)",
TRUE ~ as.character(Fathers_qualification)
))
student_data_filtered <- student_data_filtered %>%
mutate(Mothers_occupation = case_when(
Mothers_occupation == 0 ~ "Student",
Mothers_occupation == 1 ~ "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
Mothers_occupation == 2 ~ "Specialists in Intellectual and Scientific Activities",
Mothers_occupation == 3 ~ "Intermediate Level Technicians and Professions",
Mothers_occupation == 4 ~ "Administrative staff",
Mothers_occupation == 5 ~ "Personal Services, Security and Safety Workers and Sellers",
Mothers_occupation == 6 ~ "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
Mothers_occupation == 7 ~ "Skilled Workers in Industry, Construction and Craftsmen",
Mothers_occupation == 8 ~ "Installation and Machine Operators and Assembly Workers",
Mothers_occupation == 9 ~ "Unskilled Workers",
Mothers_occupation == 10 ~ "Armed Forces Professions",
Mothers_occupation == 90 ~ "Other Situation",
Mothers_occupation == 99 ~ "(blank)",
Mothers_occupation == 122 ~ "Health professionals",
Mothers_occupation == 123 ~ "Teachers",
Mothers_occupation == 125 ~ "Specialists in Information and Communication Technologies (ICT)",
Mothers_occupation == 131 ~ "Intermediate Level Science and Engineering Technicians and Professions",
Mothers_occupation == 132 ~ "Technicians and Professionals of Intermediate Level of Health",
Mothers_occupation == 134 ~ "Intermediate Level Technicians from Legal, Social, Sports, Cultural and Similar Services",
Mothers_occupation == 141 ~ "Office Workers, Secretaries in General and Data Processing Operators",
Mothers_occupation == 143 ~ "Data, Accounting, Statistical, Financial Services and Registry-Related Operators",
Mothers_occupation == 144 ~ "Other Administrative Support Staff",
Mothers_occupation == 151 ~ "Personal Service Workers",
Mothers_occupation == 152 ~ "Sellers",
Mothers_occupation == 153 ~ "Personal Care Workers and the Like",
Mothers_occupation == 171 ~ "Skilled Construction Workers and the Like, Except Electricians",
Mothers_occupation == 173 ~ "Skilled Workers in Printing, Precision Instrument Manufacturing, Jewelers, Artisans and the Like",
Mothers_occupation == 175 ~ "Workers in Food Processing, Woodworking, Clothing and Other Industries and Crafts",
Mothers_occupation == 191 ~ "Cleaning Workers",
Mothers_occupation == 192 ~ "Unskilled Workers in Agriculture, Animal Production, Fisheries and Forestry",
Mothers_occupation == 193 ~ "Unskilled Workers in Extractive Industry, Construction, Manufacturing and Transport",
Mothers_occupation == 194 ~ "Meal Preparation Assistants",
TRUE ~ as.character(Mothers_occupation)
))
student_data_filtered <- student_data_filtered %>%
mutate(Fathers_occupation = case_when(
Fathers_occupation == 0 ~ "Student",
Fathers_occupation == 1 ~ "Representatives of the Legislative Power and Executive Bodies, Directors, Directors and Executive Managers",
Fathers_occupation == 2 ~ "Specialists in Intellectual and Scientific Activities",
Fathers_occupation == 3 ~ "Intermediate Level Technicians and Professions",
Fathers_occupation == 4 ~ "Administrative staff",
Fathers_occupation == 5 ~ "Personal Services, Security and Safety Workers and Sellers",
Fathers_occupation == 6 ~ "Farmers and Skilled Workers in Agriculture, Fisheries and Forestry",
Fathers_occupation == 7 ~ "Skilled Workers in Industry, Construction and Craftsmen",
Fathers_occupation == 8 ~ "Installation and Machine Operators and Assembly Workers",
Fathers_occupation == 9 ~ "Unskilled Workers",
Fathers_occupation == 10 ~ "Armed Forces Professions",
Fathers_occupation == 90 ~ "Other Situation",
Fathers_occupation == 99 ~ "(blank)",
Fathers_occupation == 101 ~ "Armed Forces Officers",
Fathers_occupation == 102 ~ "Armed Forces Sergeants",
Fathers_occupation == 103 ~ "Other Armed Forces personnel",
Fathers_occupation == 112 ~ "Directors of Administrative and Commercial Services",
Fathers_occupation == 114 ~ "Hotel, Catering, Trade, and Other Services Directors",
Fathers_occupation == 121 ~ "Specialists in the Physical Sciences, Mathematics, Engineering, and Related Techniques",
Fathers_occupation == 122 ~ "Health Professionals",
Fathers_occupation == 123 ~ "Teachers",
Fathers_occupation == 124 ~ "Specialists in Finance, Accounting, Administrative Organization, Public and Commercial Relations",
Fathers_occupation == 131 ~ "Intermediate Level Science and Engineering Technicians and Professions",
Fathers_occupation == 132 ~ "Technicians and Professionals of Intermediate Level of Health",
Fathers_occupation == 134 ~ "Intermediate Level Technicians from Legal, Social, Sports, Cultural, and Similar Services",
Fathers_occupation == 135 ~ "Information and Communication Technology Technicians",
Fathers_occupation == 141 ~ "Office Workers, Secretaries in General and Data Processing Operators",
Fathers_occupation == 143 ~ "Data, Accounting, Statistical, Financial Services, and Registry-Related Operators",
Fathers_occupation == 144 ~ "Other Administrative Support Staff",
Fathers_occupation == 151 ~ "Personal Service Workers",
Fathers_occupation == 152 ~ "Sellers",
Fathers_occupation == 153 ~ "Personal Care Workers and the Like",
Fathers_occupation == 154 ~ "Protection and Security Services Personnel",
Fathers_occupation == 161 ~ "Market-Oriented Farmers and Skilled Agricultural and Animal Production Workers",
Fathers_occupation == 163 ~ "Farmers, Livestock Keepers, Fishermen, Hunters, and Gatherers, Subsistence",
Fathers_occupation == 171 ~ "Skilled Construction Workers and the Like, Except Electricians",
Fathers_occupation == 172 ~ "Skilled Workers in Metallurgy, Metalworking, and Similar",
Fathers_occupation == 174 ~ "Skilled Workers in Electricity and Electronics",
Fathers_occupation == 175 ~ "Workers in Food Processing, Woodworking, Clothing, and Other Industries and Crafts",
Fathers_occupation == 181 ~ "Fixed Plant and Machine Operators",
Fathers_occupation == 182 ~ "Assembly Workers",
Fathers_occupation == 183 ~ "Vehicle Drivers and Mobile Equipment Operators",
Fathers_occupation == 192 ~ "Unskilled Workers in Agriculture, Animal Production, Fisheries, and Forestry",
Fathers_occupation == 193 ~ "Unskilled Workers in Extractive Industry, Construction, Manufacturing, and Transport",
Fathers_occupation == 194 ~ "Meal Preparation Assistants",
Fathers_occupation == 195 ~ "Street Vendors (Except Food) and Street Service Providers",
TRUE ~ as.character(Fathers_occupation)
))
student_data_filtered <- student_data_filtered %>%
mutate(Displaced = case_when(
Displaced == 1 ~ "yes",
Displaced == 0 ~ "no",
TRUE ~ as.character(Displaced)
))
student_data_filtered <- student_data_filtered %>%
mutate(Educational_special_needs = case_when(
Educational_special_needs == 1 ~ "yes",
Educational_special_needs == 0 ~ "no",
TRUE ~ as.character(Educational_special_needs)
))
student_data_filtered <- student_data_filtered %>%
mutate(Debtor = case_when(
Debtor == 1 ~ "yes",
Debtor == 0 ~ "no",
TRUE ~ as.character(Debtor)
))
student_data_filtered <- student_data_filtered %>%
mutate(Tuition_fees_up_to_date = case_when(
Tuition_fees_up_to_date == 1 ~ "yes",
Tuition_fees_up_to_date == 0 ~ "no",
TRUE ~ as.character(Tuition_fees_up_to_date)
))
student_data_filtered <- student_data_filtered %>%
mutate(Gender = case_when(
Gender == 1 ~ "male",
Gender == 0 ~ "female",
TRUE ~ as.character(Gender)
))
student_data_filtered <- student_data_filtered %>%
mutate(Scholarship_holder = case_when(
Scholarship_holder == 1 ~ "yes",
Scholarship_holder == 0 ~ "no",
TRUE ~ as.character(Scholarship_holder)
))
student_data_filtered <- student_data_filtered %>%
mutate(International = case_when(
International == 1 ~ "yes",
International == 0 ~ "no",
TRUE ~ as.character(International)
))
write_xlsx(student_data_filtered, "student_data_filtered_final.xlsx")